The landmark package is spun off from the Mapper package, which implemented two versions of the maxmin procedure: a C++ function for the Euclidean case, and an R function calling the proxy package for other cases. Four new functions are under development, two each for the maxmin and lastfirst procedures: definition-based (non-optimized) C++ functions for the Euclidean case, and optimized R functions, again calling the proxy package, for all cases. While the package is in development, this vignette will conduct benchmarkings of the various functions at common tasks. Note that the vignette invokes the currently installed version of landmark.

library(bench)
library(dplyr)
library(ggplot2)
library(cowplot)
# installed version of landmark package
library(landmark)
knitr::opts_chunk$set(echo = TRUE, fig.width = 7)

Benchmarking R versus C++ implementations

These two functions automate the benchmarking process for different engines of the same procedure. maxmin_benchmark() requires a list xs of data sets (intended to be of a common type and of increasing size). It accepts a distance dist_method as passed to any of the functions, either one or length(xs) assignments to num (as a vector), and an assignment to radius. lastfirst_benchmark() also requires xs and accepts the same type of argument to num and an integer assignment to cardinality. (Only the R implementation calls the proxy package to handle non-Euclidean distances, so benchmarking is only performed for the Euclidean case.)

# print both numerical results and plots
maxmin_benchmark <- function(
  datasets, dist_method = "euclidean", num = NULL, radius = NULL
) {
  if (! is.null(num) && length(num) == 1L) {
    num <- rep(num, 3L)
  }
  marks <- NULL
  for (i in seq_along(datasets)) {
    x <- xs[[i]]
    mark <- mark(
      original = landmarks_maxmin(
        x, num = num[i], radius = radius, engine = "original"
      ),
      `C++` = landmarks_maxmin(
        x, num = num[i], radius = radius, engine = "C++"
      ),
      R = landmarks_maxmin(
        x, num = num[i], radius = radius, engine = "R"
      ),
      check = FALSE
    )
    mark <- mutate(
      mark,
      implementation = factor(expression, levels = c("original", "C++", "R")),
      n = nrow(x)
    )
    mark <- select(mark, n, implementation, median, mem_alloc)
    #if (dist_method != "euclidean") mark <- filter(mark, implementation != "C++")
    marks <- bind_rows(marks, mark)
  }
  run_size <- ggplot(marks, aes(x = n, y = median, color = implementation)) +
    theme_bw() +
    geom_point(alpha = .5) + geom_line(alpha = .5) +
    ggtitle("Benchmark runtimes")
  mem_size <- ggplot(marks, aes(x = n, y = mem_alloc, color = implementation)) +
    theme_bw() +
    geom_point(alpha = .5) + geom_line(alpha = .5) +
    ggtitle("Benchmark memory allocation")
  run_mem_size <- plot_grid(run_size + theme(legend.position = "none"),
                            mem_size + theme(legend.position = "none"),
                            nrow = 1)
  run_mem_size_legend <- get_legend(run_size)
  print(plot_grid(run_mem_size, run_mem_size_legend, rel_widths = c(3, .75)))
  marks
}
# print both numerical results and plots
lastfirst_benchmark <- function(
  datasets, num = NULL, cardinality = NULL
) {
  if (! is.null(num) && length(num) == 1L) {
    num <- rep(num, 3L)
  }
  marks <- NULL
  for (i in seq_along(datasets)) {
    x <- xs[[i]]
    mark <- mark(
      `C++` = landmarks_lastfirst(
        x, num = num[i], cardinality = cardinality, engine = "C++"
      ),
      R = landmarks_lastfirst(
        x, num = num[i], cardinality = cardinality, engine = "R"
      )
    )
    mark <- mutate(
      mark,
      implementation = factor(expression, levels = c("original", "C++", "R")),
      n = nrow(x)
    )
    mark <- select(mark, n, implementation, median, mem_alloc)
    marks <- bind_rows(marks, mark)
  }
  run_size <- ggplot(marks, aes(x = n, y = median, color = implementation)) +
    theme_bw() +
    geom_point(alpha = .5) + geom_line(alpha = .5) +
    ggtitle("Benchmark runtimes")
  mem_size <- ggplot(marks, aes(x = n, y = mem_alloc, color = implementation)) +
    theme_bw() +
    geom_point(alpha = .5) + geom_line(alpha = .5) +
    ggtitle("Benchmark memory allocation")
  run_mem_size <- plot_grid(run_size + theme(legend.position = "none"),
                            mem_size + theme(legend.position = "none"),
                            nrow = 1)
  run_mem_size_legend <- get_legend(run_size)
  print(plot_grid(run_mem_size, run_mem_size_legend, rel_widths = c(3, .75)))
  marks
}

The following benchmark compares the several maxmin procedures on an artificial "noisy circle" data set of varying sizes.

set.seed(0)
# circle samples
xs <- lapply(c(60L, 360L, 1680L, 10080L), tdaunif::sample_circle, sd = .5)
# euclidean, defaults
maxmin_benchmark(xs, dist_method = "euclidean")
# cosine, defaults
maxmin_benchmark(xs, dist_method = "cosine")
# euclidean, twice binary log
maxmin_benchmark(xs, dist_method = "euclidean",
                 num = 2 * log(vapply(xs, nrow, 1L), 2))
# cosine, twice binary log
maxmin_benchmark(xs, dist_method = "cosine",
                 num = 2 * log(vapply(xs, nrow, 1L), 2))

The following benchmark compares the several maxmin procedures on artificial integer samples of varying sizes and multiplicities.

set.seed(0)
# integer samples
replacement_sample <- function(n) {
  values <- seq(n / 12)
  matrix(sample(values, size = n, replace = TRUE))
}
xs <- lapply(c(60L, 360L, 1680L, 10080L), replacement_sample)
# euclidean, defaults
maxmin_benchmark(xs, dist_method = "euclidean")
# cosine, defaults
maxmin_benchmark(xs, dist_method = "cosine")
# euclidean, twice binary log
maxmin_benchmark(xs, dist_method = "euclidean",
                 num = 2 * log(vapply(xs, nrow, 1L), 2))
# cosine, twice binary log
maxmin_benchmark(xs, dist_method = "cosine",
                 num = 2 * log(vapply(xs, nrow, 1L), 2))

The following benchmark compares both lastfirst procedures on artificial integer samples of varying sizes and multiplicities.

set.seed(0)
# integer samples
replacement_sample <- function(n) {
  values <- seq(n / 6)
  matrix(sample(values, size = n, replace = TRUE))
}
xs <- lapply(c(12L, 36L, 60L), replacement_sample)
# euclidean, defaults
lastfirst_benchmark(xs)
# euclidean, twice binary log
lastfirst_benchmark(xs, num = 2 * log(vapply(xs, nrow, 1L), 2))

Benchmarking lastfirst against maxmin

This function automates a benchmarking comparison of the similarly-implemented R engines of the maxmin and lastfirst procedures.

# print both numerical results and plots
procedure_benchmark <- function(
  datasets, num = NULL
) {
  if (! is.null(num) && length(num) == 1L) {
    num <- rep(num, 3L)
  }
  marks <- NULL
  for (i in seq_along(datasets)) {
    x <- xs[[i]]
    mark <- mark(
      maxmin = landmarks_maxmin(x, num = num[i], engine = "R"),
      lastfirst = landmarks_lastfirst(x, num = num[i], engine = "R"),
      check = FALSE
    )
    mark <- mutate(
      mark,
      procedure = factor(expression, levels = c("maxmin", "lastfirst")),
      n = nrow(x)
    )
    mark <- select(mark, n, procedure, median, mem_alloc)
    marks <- bind_rows(marks, mark)
  }
  run_size <- ggplot(marks, aes(x = n, y = median, color = procedure)) +
    theme_bw() +
    geom_point(alpha = .5) + geom_line(alpha = .5) +
    ggtitle("Benchmark runtimes")
  mem_size <- ggplot(marks, aes(x = n, y = mem_alloc, color = procedure)) +
    theme_bw() +
    geom_point(alpha = .5) + geom_line(alpha = .5) +
    ggtitle("Benchmark memory allocation")
  run_mem_size <- plot_grid(run_size + theme(legend.position = "none"),
                            mem_size + theme(legend.position = "none"),
                            nrow = 1)
  run_mem_size_legend <- get_legend(run_size)
  print(plot_grid(run_mem_size, run_mem_size_legend, rel_widths = c(3, .75)))
  marks
}
set.seed(0)
# circle samples with duplicates
replacement_sample <- function(n) {
  x <- tdaunif::sample_circle(n = n)
  x <- rbind(x, x[sample(nrow(x), n, replace = TRUE), , drop = FALSE])
  rbind(x, x[sample(nrow(x), n, replace = TRUE), , drop = FALSE])
}
xs <- lapply(c(12L, 36L, 60L, 180L), replacement_sample)
# euclidean, defaults
procedure_benchmark(xs)
# euclidean, twice binary log
procedure_benchmark(xs, num = 2 * log(vapply(xs, nrow, 1L), 2))


corybrunson/maxmin documentation built on Feb. 3, 2022, 1:58 a.m.