---
title: "Benchmarking fuzzystring against fuzzyjoin"
author: "Paul Efren Santos Andrade"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Benchmarking fuzzystring against fuzzyjoin}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 12,
  fig.height = 8
)
```

```{r setup}
library(dplyr)
library(tidyr)
library(ggplot2)

find_benchmark_csv <- function() {
  candidates <- c(
    file.path("inst", "extdata", "rbase_string_benchmark.csv"),
    file.path("..", "inst", "extdata", "rbase_string_benchmark.csv"),
    system.file("extdata", "rbase_string_benchmark.csv", package = "fuzzystring")
  )

  hit <- candidates[file.exists(candidates)][1]

  if (is.na(hit)) {
    stop(
      "Could not find the precomputed benchmark CSV. Expected it in ",
      paste(candidates, collapse = " or "),
      call. = FALSE
    )
  }

  hit
}

results_raw <- utils::read.csv(find_benchmark_csv(), stringsAsFactors = FALSE)

method_levels <- c(
  "osa", "lv", "dl", "hamming", "lcs",
  "qgram", "cosine", "jaccard", "jw", "soundex"
)

method_labels <- c(
  osa = "OSA",
  lv = "Levenshtein",
  dl = "Damerau-Levenshtein",
  hamming = "Hamming",
  lcs = "LCS",
  qgram = "Q-gram",
  cosine = "Cosine",
  jaccard = "Jaccard",
  jw = "Jaro-Winkler",
  soundex = "Soundex"
)

comparison_levels <- sprintf(
  "%.2f",
  sort(unique(results_raw$n_comps / 1e6))
)

summary_raw <- results_raw %>%
  group_by(expr, method, n_comps) %>%
  summarise(mean_time_ns = mean(time), .groups = "drop") %>%
  mutate(mean_time_ms = mean_time_ns / 1e6)

wide_summary <- summary_raw %>%
  select(expr, method, n_comps, mean_time_ms) %>%
  pivot_wider(names_from = expr, values_from = mean_time_ms) %>%
  mutate(
    runtime_ratio = fuzzyjoin / fuzzystring,
    method_label = factor(
      recode(method, !!!method_labels),
      levels = unname(method_labels[method_levels])
    ),
    comparisons_millions = n_comps / 1e6,
    comparisons_label = factor(
      sprintf("%.2f", comparisons_millions),
      levels = comparison_levels
    )
  ) %>%
  arrange(method_label, n_comps)

absolute_plot_data <- summary_raw %>%
  mutate(
    time_seconds = mean_time_ms / 1000,
    method_label = factor(
      recode(method, !!!method_labels),
      levels = unname(method_labels[method_levels])
    ),
    comparisons_label = factor(
      sprintf("%.2f", n_comps / 1e6),
      levels = comparison_levels
    ),
    implementation = factor(expr, levels = c("fuzzyjoin", "fuzzystring"))
  )

method_ranking <- wide_summary %>%
  group_by(method_label) %>%
  summarise(avg_runtime_ratio = mean(runtime_ratio), .groups = "drop") %>%
  arrange(desc(avg_runtime_ratio))

summary_table <- wide_summary %>%
  transmute(
    Method = as.character(method_label),
    `Candidate comparisons (M)` = sprintf("%.2f", comparisons_millions),
    `Mean time: fuzzyjoin (ms)` = round(fuzzyjoin, 2),
    `Mean time: fuzzystring (ms)` = round(fuzzystring, 2),
    `Runtime ratio (fuzzyjoin / fuzzystring)` = round(runtime_ratio, 2)
  )

ranking_table <- method_ranking %>%
  transmute(
    Method = as.character(method_label),
    `Average runtime ratio` = round(avg_runtime_ratio, 2)
  )

overall_ratio <- mean(wide_summary$runtime_ratio)
```

## Overview

This vignette documents the benchmark used to compare **fuzzyjoin** with the
`fuzzystring_join()` reimplementation in **fuzzystring**. The benchmark code is
shown for reproducibility, but it is **not executed** when this vignette is
built. Instead, the analysis below reads the precomputed CSV snapshot bundled in
`inst/extdata/rbase_string_benchmark.csv`.

The benchmark covers 10 distance methods, 3 sample sizes (`250`, `500`, and
`750`), and 10 repetitions per implementation. Across
`r nrow(wide_summary)` method-size combinations, **fuzzystring** is faster in
every case. The mean runtime ratio is `r sprintf("%.2f", overall_ratio)`x,
meaning that, on average, `fuzzyjoin` takes almost four times as long as the
reimplemented `fuzzystring` path on this benchmark snapshot.

## Benchmark Script

The script below is the benchmark used to generate the CSV analyzed in this
document. It is displayed as reference only.

```{r benchmark-script, eval = FALSE}
library(microbenchmark)
library(fuzzyjoin)
library(fuzzystring)
library(qdapDictionaries)
library(tibble)

samp_sizes <- c(250, 500, 750)
seed <- 2016

params <- list(
  list(method = "osa",     mode = "inner", max_dist = 1,   q = 0),
  list(method = "lv",      mode = "inner", max_dist = 1,   q = 0),
  list(method = "dl",      mode = "inner", max_dist = 1,   q = 0),
  list(method = "hamming", mode = "inner", max_dist = 1,   q = 0),
  list(method = "lcs",     mode = "inner", max_dist = 1,   q = 0),
  list(method = "qgram",   mode = "inner", max_dist = 2,   q = 2),
  list(method = "cosine",  mode = "inner", max_dist = 0.5, q = 2),
  list(method = "jaccard", mode = "inner", max_dist = 0.5, q = 2),
  list(method = "jw",      mode = "inner", max_dist = 0.5, q = 0),
  list(method = "soundex", mode = "inner", max_dist = 0.5, q = 0)
)

args <- commandArgs(trailingOnly = TRUE)
if (length(args) > 0) {
  params <- Filter(function(p) p$method %in% args, params)
}

data(misspellings)
words <- as.data.frame(DICTIONARY)

results <- data.frame()

for (p in params) {
  for (nsamp in samp_sizes) {
    cat(sprintf("Running %s with %d samples\n", p$method, nsamp))

    set.seed(seed)
    sub_misspellings <- misspellings[sample(seq_len(nrow(misspellings)), nsamp), ]

    bench <- microbenchmark(
      fuzzyjoin = {
        fuzzy_res <- stringdist_join(
          sub_misspellings, words,
          by         = c(misspelling = "word"),
          method     = p$method,
          mode       = p$mode,
          max_dist   = p$max_dist,
          q          = p$q
        )
      },
      fuzzystring = {
        fuzzystring_res <- fuzzystring_join(
          sub_misspellings, words,
          by       = c(misspelling = "word"),
          method   = p$method,
          mode     = p$mode,
          max_dist = p$max_dist,
          q        = p$q
        )
      },
      times = 10
    )

    # Verify that both implementations return the same result.
    fuzzy_res <- stringdist_join(
      sub_misspellings, words,
      by       = c(misspelling = "word"),
      method   = p$method,
      mode     = p$mode,
      max_dist = p$max_dist,
      q        = p$q
    )

    fuzzystring_res <- fuzzystring_join(
      sub_misspellings, words,
      by       = c(misspelling = "word"),
      method   = p$method,
      mode     = p$mode,
      max_dist = p$max_dist,
      q        = p$q
    )

    if (!isTRUE(all.equal(fuzzy_res, fuzzystring_res)) && p$method != "soundex") {
      message("Mismatch detected: fuzzyjoin vs fuzzystring for method: ", p$method)
    }

    df <- as.data.frame(bench)
    df$method <- p$method
    df$n_comps <- nrow(sub_misspellings) * nrow(words)
    df$os <- unname(Sys.info()["sysname"])

    results <- rbind(results, df)
  }
}

```

## Results

The benchmark spans 10 string distance methods and 3 sample sizes, producing 30
aggregated method-size combinations. The table below reports mean runtime in
milliseconds and the relative runtime ratio used throughout the figures.

```{r summary-table, include=FALSE}
knitr::kable(summary_table, align = "lrrrr")
```

The ranking below summarizes the average speedup by method. Larger values mean
that **fuzzystring** is faster by a wider margin.

```{r ranking-table, include=FALSE}
knitr::kable(ranking_table, align = "lr")
```

### Absolute Runtime

```{r absolute-runtime-plot}
ggplot(
  absolute_plot_data,
  aes(x = comparisons_label, y = time_seconds, fill = implementation)
) +
  geom_col(position = position_dodge(width = 0.75), width = 0.65) +
  facet_wrap(~ method_label, scales = "free_y") +
  labs(
    title = "Mean runtime by distance method",
    x = "Candidate comparisons (millions)",
    y = "Mean runtime (seconds)",
    fill = "Implementation"
  ) +
  scale_fill_manual(
    values = c("fuzzyjoin" = "#D95F02", "fuzzystring" = "#1B9E77"),
    labels = c("fuzzyjoin", "fuzzystring")
  ) +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  theme_minimal(base_size = 14) +
  theme(
    strip.text = element_text(face = "bold"),
    legend.position = "bottom",
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 30, hjust = 1)
  )
```

### Relative Runtime

Values above 1 indicate that **fuzzystring** is faster than **fuzzyjoin** for
the same method and workload size.

```{r relative-runtime-plot}
ggplot(
  wide_summary,
  aes(x = comparisons_label, y = runtime_ratio)
) +
  geom_col(fill = "#D95F02", width = 0.55) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray40") +
  facet_wrap(~ method_label, scales = "free_y") +
  labs(
    title = "Relative runtime against fuzzystring",
    subtitle = "Values above 1 mean fuzzystring is faster than fuzzyjoin",
    x = "Candidate comparisons (millions)",
    y = "Runtime ratio (fuzzyjoin / fuzzystring)"
  ) +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  theme_minimal(base_size = 14) +
  theme(
    strip.text = element_text(face = "bold"),
    legend.position = "none",
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 30, hjust = 1)
  )
```

## Interpretation

This benchmark snapshot supports the same conclusion across all methods tested:
the **fuzzystring** reimplementation is consistently faster than **fuzzyjoin**
for the workloads covered here.

The largest average gains appear in Hamming, Levenshtein, OSA, and
Damerau-Levenshtein, while cosine, q-gram, jaccard, and soundex still show a
clear but smaller improvement. In other words, the reimplemented path does not
only match `fuzzyjoin` functionally; it also improves runtime across every
benchmarked method-size combination in this snapshot.