Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----setup, include=F---------------------------------------------------------
library(tidyverse)
## ----echo=F-------------------------------------------------------------------
sim_data <- read_csv("sim_data.csv")
sim_data %>%
mutate(
name = ifelse(name == "time", "Time Usage (s)", "Memory Usage (MB)"),
join_type = ifelse(join_type == "Jaccard Distance",
"Jaccard Distance Join",
"Euclidean Distance Joins"
),
) %>%
ggplot(aes(x = as.numeric(n), y = value, col = package, linetype = package)) +
geom_point() +
geom_line() +
facet_wrap(~ join_type + name, scales = "free") +
scale_y_continuous("Time (s) / memory (MB)")
## ----echo=T, eval=F-----------------------------------------------------------
# library(zoomerjoin)
# library(fuzzyjoin)
# library(tidyverse)
# library(microbenchmark)
# library(profmem)
#
#
# # Sample million rows from DIME dataset
# data_1 <- as.data.frame(sample_n(dime_data, 10^6))
# names(data_1) <- c("id_1", "name")
# data_2 <- as.data.frame(sample_n(dime_data, 10^6))
# names(data_2) <- c("id_2", "name")
#
# # Generate datasets for euclidean join benchmarking
# n <- 10^5
# p <- 50
# X <- matrix(rnorm(n * p), n, p)
# X_1 <- as.data.frame(X)
# X_2 <- as.data.frame(X + .000000001)
#
# # Get time and memory use statistics for fuzzyjoin when performing jaccard join
# fuzzy_jaccard_bench <- function(n) {
# time <- microbenchmark(
# stringdist_inner_join(data_1[1:n, ],
# data_2[1:n, ],
# method = "jaccard",
# max_dist = .6,
# q = 4
# ),
# times = 10
# )$time %>%
# median()
#
# mem <- profmem(stringdist_inner_join(data_1[1:n, ],
# data_2[1:n, ],
# method = "jaccard",
# max_dist = .6,
# q = 4
# )) %>%
# total()
#
# return(c(time = time, memory = mem))
# }
#
#
# # Get time and memory use statistics for zoomerjoin when performing jaccard join
# zoomer_jaccard_bench <- function(n) {
# time <- microbenchmark(
# jaccard_inner_join(data_1[1:n, ], data_2[1:n, ],
# by = "name", band_width = 11,
# n_bands = 350, threshold = .7,
# n_gram_width = 4
# ),
# times = 50
# )$time %>%
# median()
#
# mem <- profmem(
# jaccard_inner_join(data_1[1:n, ], data_2[1:n, ],
# by = "name", band_width = 11,
# n_bands = 350, threshold = .7,
# n_gram_width = 4
# )
# ) %>%
# total()
#
# return(c(time = time, memory = mem))
# }
#
# # Get time and memory use statistics for fuzzyjoin when performing Euclidean join
# fuzzy_euclid_bench <- function(n) {
# time <- microbenchmark(
# distance_join(X_1[1:n, ], X_2[1:n, ], max_dist = .1, method = "euclidean"),
# times = 10
# )$time %>%
# median()
#
# mem <- total(profmem(
# distance_join(X_1[1:n, ], X_2[1:n, ], max_dist = .1, method = "euclidean")
# ))
#
# return(c(time = time, memory = mem))
# }
#
# # Get time and memory use statistics for zoomerjoin when performing Euclidean join
# zoomer_euclid_bench <- function(n) {
# time <- microbenchmark(
# euclidean_inner_join(X_1[1:n, ], X_2[1:n, ],
# threshold = .1, n_bands = 90,
# band_width = 2, r = .1
# ),
# times = 50
# )$time %>%
# median()
#
# mem <- profmem(euclidean_inner_join(X_1[1:n, ], X_2[1:n, ],
# threshold = .1, n_bands = 90,
# band_width = 2, r = .1
# )) %>%
# total()
#
# return(c(time = time, memory = mem))
# }
#
#
# # Run Grid of Jaccard Benchmarks, Collect results into DF
# n <- seq(500, 4000, 250)
# names(n) <- n
# fuzzy_jacard_benches <- map_df(n, fuzzy_jaccard_bench, .id = "n")
# zoomer_jacard_benches <- map_df(n, zoomer_jaccard_bench, .id = "n")
# fuzzy_jacard_benches$package <- "fuzzyjoin"
# zoomer_jacard_benches$package <- "zoomerjoin"
# jaccard_benches <- bind_rows(fuzzy_jacard_benches, zoomer_jacard_benches)
# jaccard_benches$join_type <- "Jaccard Distance"
#
# # Run Grid of Euclidean Benchmarks, Collect results into DF
# n <- seq(250, 4000, 250)
# names(n) <- n
# fuzzy_euclid_benches <- map_df(n, fuzzy_euclid_bench, .id = "n")
# zoomer_euclid_benches <- map_df(n, zoomer_euclid_bench, .id = "n")
# fuzzy_euclid_benches$package <- "fuzzyjoin"
# zoomer_euclid_benches$package <- "zoomerjoin"
# euclid_benches <- bind_rows(fuzzy_euclid_benches, zoomer_euclid_benches)
# euclid_benches$join_type <- "Euclidean Distance"
#
# sim_data <- bind_rows(euclid_benches, jaccard_benches) %>%
# pivot_longer(c(time, memory)) %>%
# mutate(value = ifelse(name == "time", value / 10^9, value / 10^6)) # convert ns to s and bytes to Gb.
#
# write_csv(sim_data, "sim_data.csv")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.