The landmark package is spun off from the Mapper package, which implemented two versions of the maxmin procedure: a C++ function for the Euclidean case, and an R function calling the proxy package for other cases. Four new functions are under development, two each for the maxmin and lastfirst procedures: definition-based (non-optimized) C++ functions for the Euclidean case, and optimized R functions, again calling the proxy package, for all cases. While the package is in development, this vignette will conduct benchmarkings of the various functions at common tasks. Note that the vignette invokes the currently installed version of landmark.
library(bench) library(dplyr) library(ggplot2) library(cowplot) # installed version of landmark package library(landmark) knitr::opts_chunk$set(echo = TRUE, fig.width = 7)
These two functions automate the benchmarking process for different engines of the same procedure.
maxmin_benchmark()
requires a list xs
of data sets (intended to be of a common type and of increasing size). It accepts a distance dist_method
as passed to any of the functions, either one or length(xs)
assignments to num
(as a vector), and an assignment to radius
.
lastfirst_benchmark()
also requires xs
and accepts the same type of argument to num
and an integer assignment to cardinality
. (Only the R implementation calls the proxy package to handle non-Euclidean distances, so benchmarking is only performed for the Euclidean case.)
# print both numerical results and plots maxmin_benchmark <- function( datasets, dist_method = "euclidean", num = NULL, radius = NULL ) { if (! is.null(num) && length(num) == 1L) { num <- rep(num, 3L) } marks <- NULL for (i in seq_along(datasets)) { x <- xs[[i]] mark <- mark( original = landmarks_maxmin( x, num = num[i], radius = radius, engine = "original" ), `C++` = landmarks_maxmin( x, num = num[i], radius = radius, engine = "C++" ), R = landmarks_maxmin( x, num = num[i], radius = radius, engine = "R" ), check = FALSE ) mark <- mutate( mark, implementation = factor(expression, levels = c("original", "C++", "R")), n = nrow(x) ) mark <- select(mark, n, implementation, median, mem_alloc) #if (dist_method != "euclidean") mark <- filter(mark, implementation != "C++") marks <- bind_rows(marks, mark) } run_size <- ggplot(marks, aes(x = n, y = median, color = implementation)) + theme_bw() + geom_point(alpha = .5) + geom_line(alpha = .5) + ggtitle("Benchmark runtimes") mem_size <- ggplot(marks, aes(x = n, y = mem_alloc, color = implementation)) + theme_bw() + geom_point(alpha = .5) + geom_line(alpha = .5) + ggtitle("Benchmark memory allocation") run_mem_size <- plot_grid(run_size + theme(legend.position = "none"), mem_size + theme(legend.position = "none"), nrow = 1) run_mem_size_legend <- get_legend(run_size) print(plot_grid(run_mem_size, run_mem_size_legend, rel_widths = c(3, .75))) marks }
# print both numerical results and plots lastfirst_benchmark <- function( datasets, num = NULL, cardinality = NULL ) { if (! is.null(num) && length(num) == 1L) { num <- rep(num, 3L) } marks <- NULL for (i in seq_along(datasets)) { x <- xs[[i]] mark <- mark( `C++` = landmarks_lastfirst( x, num = num[i], cardinality = cardinality, engine = "C++" ), R = landmarks_lastfirst( x, num = num[i], cardinality = cardinality, engine = "R" ) ) mark <- mutate( mark, implementation = factor(expression, levels = c("original", "C++", "R")), n = nrow(x) ) mark <- select(mark, n, implementation, median, mem_alloc) marks <- bind_rows(marks, mark) } run_size <- ggplot(marks, aes(x = n, y = median, color = implementation)) + theme_bw() + geom_point(alpha = .5) + geom_line(alpha = .5) + ggtitle("Benchmark runtimes") mem_size <- ggplot(marks, aes(x = n, y = mem_alloc, color = implementation)) + theme_bw() + geom_point(alpha = .5) + geom_line(alpha = .5) + ggtitle("Benchmark memory allocation") run_mem_size <- plot_grid(run_size + theme(legend.position = "none"), mem_size + theme(legend.position = "none"), nrow = 1) run_mem_size_legend <- get_legend(run_size) print(plot_grid(run_mem_size, run_mem_size_legend, rel_widths = c(3, .75))) marks }
The following benchmark compares the several maxmin procedures on an artificial "noisy circle" data set of varying sizes.
set.seed(0) # circle samples xs <- lapply(c(60L, 360L, 1680L, 10080L), tdaunif::sample_circle, sd = .5) # euclidean, defaults maxmin_benchmark(xs, dist_method = "euclidean") # cosine, defaults maxmin_benchmark(xs, dist_method = "cosine") # euclidean, twice binary log maxmin_benchmark(xs, dist_method = "euclidean", num = 2 * log(vapply(xs, nrow, 1L), 2)) # cosine, twice binary log maxmin_benchmark(xs, dist_method = "cosine", num = 2 * log(vapply(xs, nrow, 1L), 2))
The following benchmark compares the several maxmin procedures on artificial integer samples of varying sizes and multiplicities.
set.seed(0) # integer samples replacement_sample <- function(n) { values <- seq(n / 12) matrix(sample(values, size = n, replace = TRUE)) } xs <- lapply(c(60L, 360L, 1680L, 10080L), replacement_sample) # euclidean, defaults maxmin_benchmark(xs, dist_method = "euclidean") # cosine, defaults maxmin_benchmark(xs, dist_method = "cosine") # euclidean, twice binary log maxmin_benchmark(xs, dist_method = "euclidean", num = 2 * log(vapply(xs, nrow, 1L), 2)) # cosine, twice binary log maxmin_benchmark(xs, dist_method = "cosine", num = 2 * log(vapply(xs, nrow, 1L), 2))
The following benchmark compares both lastfirst procedures on artificial integer samples of varying sizes and multiplicities.
set.seed(0) # integer samples replacement_sample <- function(n) { values <- seq(n / 6) matrix(sample(values, size = n, replace = TRUE)) } xs <- lapply(c(12L, 36L, 60L), replacement_sample) # euclidean, defaults lastfirst_benchmark(xs) # euclidean, twice binary log lastfirst_benchmark(xs, num = 2 * log(vapply(xs, nrow, 1L), 2))
This function automates a benchmarking comparison of the similarly-implemented R engines of the maxmin and lastfirst procedures.
# print both numerical results and plots procedure_benchmark <- function( datasets, num = NULL ) { if (! is.null(num) && length(num) == 1L) { num <- rep(num, 3L) } marks <- NULL for (i in seq_along(datasets)) { x <- xs[[i]] mark <- mark( maxmin = landmarks_maxmin(x, num = num[i], engine = "R"), lastfirst = landmarks_lastfirst(x, num = num[i], engine = "R"), check = FALSE ) mark <- mutate( mark, procedure = factor(expression, levels = c("maxmin", "lastfirst")), n = nrow(x) ) mark <- select(mark, n, procedure, median, mem_alloc) marks <- bind_rows(marks, mark) } run_size <- ggplot(marks, aes(x = n, y = median, color = procedure)) + theme_bw() + geom_point(alpha = .5) + geom_line(alpha = .5) + ggtitle("Benchmark runtimes") mem_size <- ggplot(marks, aes(x = n, y = mem_alloc, color = procedure)) + theme_bw() + geom_point(alpha = .5) + geom_line(alpha = .5) + ggtitle("Benchmark memory allocation") run_mem_size <- plot_grid(run_size + theme(legend.position = "none"), mem_size + theme(legend.position = "none"), nrow = 1) run_mem_size_legend <- get_legend(run_size) print(plot_grid(run_mem_size, run_mem_size_legend, rel_widths = c(3, .75))) marks }
set.seed(0) # circle samples with duplicates replacement_sample <- function(n) { x <- tdaunif::sample_circle(n = n) x <- rbind(x, x[sample(nrow(x), n, replace = TRUE), , drop = FALSE]) rbind(x, x[sample(nrow(x), n, replace = TRUE), , drop = FALSE]) } xs <- lapply(c(12L, 36L, 60L, 180L), replacement_sample) # euclidean, defaults procedure_benchmark(xs) # euclidean, twice binary log procedure_benchmark(xs, num = 2 * log(vapply(xs, nrow, 1L), 2))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.