inst/doc/benchmarks.R

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, include=F---------------------------------------------------------
library(tidyverse)

## ----echo=F-------------------------------------------------------------------
sim_data <- read_csv("sim_data.csv")
sim_data %>%
  mutate(
    name = ifelse(name == "time", "Time Usage (s)", "Memory Usage (MB)"),
    join_type = ifelse(join_type == "Jaccard Distance",
      "Jaccard Distance Join",
      "Euclidean Distance Joins"
    ),
  ) %>%
  ggplot(aes(x = as.numeric(n), y = value, col = package, linetype = package)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ join_type + name, scales = "free") +
  scale_y_continuous("Time (s) / memory (MB)")

## ----echo=T, eval=F-----------------------------------------------------------
#  library(zoomerjoin)
#  library(fuzzyjoin)
#  library(tidyverse)
#  library(microbenchmark)
#  library(profmem)
#  
#  
#  # Sample million rows from DIME dataset
#  data_1 <- as.data.frame(sample_n(dime_data, 10^6))
#  names(data_1) <- c("id_1", "name")
#  data_2 <- as.data.frame(sample_n(dime_data, 10^6))
#  names(data_2) <- c("id_2", "name")
#  
#  # Generate datasets for euclidean join benchmarking
#  n <- 10^5
#  p <- 50
#  X <- matrix(rnorm(n * p), n, p)
#  X_1 <- as.data.frame(X)
#  X_2 <- as.data.frame(X + .000000001)
#  
#  # Get time and memory use statistics for fuzzyjoin when performing jaccard join
#  fuzzy_jaccard_bench <- function(n) {
#    time <- microbenchmark(
#      stringdist_inner_join(data_1[1:n, ],
#        data_2[1:n, ],
#        method = "jaccard",
#        max_dist = .6,
#        q = 4
#      ),
#      times = 10
#    )$time %>%
#      median()
#  
#    mem <- profmem(stringdist_inner_join(data_1[1:n, ],
#      data_2[1:n, ],
#      method = "jaccard",
#      max_dist = .6,
#      q = 4
#    )) %>%
#      total()
#  
#    return(c(time = time, memory = mem))
#  }
#  
#  
#  # Get time and memory use statistics for zoomerjoin when performing jaccard join
#  zoomer_jaccard_bench <- function(n) {
#    time <- microbenchmark(
#      jaccard_inner_join(data_1[1:n, ], data_2[1:n, ],
#        by = "name", band_width = 11,
#        n_bands = 350, threshold = .7,
#        n_gram_width = 4
#      ),
#      times = 50
#    )$time %>%
#      median()
#  
#    mem <- profmem(
#      jaccard_inner_join(data_1[1:n, ], data_2[1:n, ],
#        by = "name", band_width = 11,
#        n_bands = 350, threshold = .7,
#        n_gram_width = 4
#      )
#    ) %>%
#      total()
#  
#    return(c(time = time, memory = mem))
#  }
#  
#  # Get time and memory use statistics for fuzzyjoin when performing Euclidean join
#  fuzzy_euclid_bench <- function(n) {
#    time <- microbenchmark(
#      distance_join(X_1[1:n, ], X_2[1:n, ], max_dist = .1, method = "euclidean"),
#      times = 10
#    )$time %>%
#      median()
#  
#    mem <- total(profmem(
#      distance_join(X_1[1:n, ], X_2[1:n, ], max_dist = .1, method = "euclidean")
#    ))
#  
#    return(c(time = time, memory = mem))
#  }
#  
#  # Get time and memory use statistics for zoomerjoin when performing Euclidean join
#  zoomer_euclid_bench <- function(n) {
#    time <- microbenchmark(
#      euclidean_inner_join(X_1[1:n, ], X_2[1:n, ],
#        threshold = .1, n_bands = 90,
#        band_width = 2, r = .1
#      ),
#      times = 50
#    )$time %>%
#      median()
#  
#    mem <- profmem(euclidean_inner_join(X_1[1:n, ], X_2[1:n, ],
#      threshold = .1, n_bands = 90,
#      band_width = 2, r = .1
#    )) %>%
#      total()
#  
#    return(c(time = time, memory = mem))
#  }
#  
#  
#  # Run Grid of Jaccard Benchmarks, Collect results into DF
#  n <- seq(500, 4000, 250)
#  names(n) <- n
#  fuzzy_jacard_benches <- map_df(n, fuzzy_jaccard_bench, .id = "n")
#  zoomer_jacard_benches <- map_df(n, zoomer_jaccard_bench, .id = "n")
#  fuzzy_jacard_benches$package <- "fuzzyjoin"
#  zoomer_jacard_benches$package <- "zoomerjoin"
#  jaccard_benches <- bind_rows(fuzzy_jacard_benches, zoomer_jacard_benches)
#  jaccard_benches$join_type <- "Jaccard Distance"
#  
#  # Run Grid of Euclidean Benchmarks, Collect results into DF
#  n <- seq(250, 4000, 250)
#  names(n) <- n
#  fuzzy_euclid_benches <- map_df(n, fuzzy_euclid_bench, .id = "n")
#  zoomer_euclid_benches <- map_df(n, zoomer_euclid_bench, .id = "n")
#  fuzzy_euclid_benches$package <- "fuzzyjoin"
#  zoomer_euclid_benches$package <- "zoomerjoin"
#  euclid_benches <- bind_rows(fuzzy_euclid_benches, zoomer_euclid_benches)
#  euclid_benches$join_type <- "Euclidean Distance"
#  
#  sim_data <- bind_rows(euclid_benches, jaccard_benches) %>%
#    pivot_longer(c(time, memory)) %>%
#    mutate(value = ifelse(name == "time", value / 10^9, value / 10^6)) # convert ns to s and bytes to Gb.
#  
#  write_csv(sim_data, "sim_data.csv")

Try the zoomerjoin package in your browser

Any scripts or data that you put into this service are public.

zoomerjoin documentation built on April 13, 2025, 9:08 a.m.