Nothing
# Guard for the optional 'bigmemory' dependency.
#
# Returns NULL invisibly when the package namespace can be loaded; otherwise
# raises an error (without the call in the message).
benchmark_require_bigmemory <- function() {
  has_bigmemory <- requireNamespace("bigmemory", quietly = TRUE)
  if (has_bigmemory) {
    return(invisible(NULL))
  }
  stop("The benchmark interface requires the 'bigmemory' package.", call. = FALSE)
}
# Validate and normalise a benchmark working directory.
#
# `path_dir` must be a single, non-missing, non-empty string. The path is
# tilde-expanded and normalised with forward slashes, then returned if it
# names an existing directory; otherwise an error is raised.
benchmark_existing_dir <- function(path_dir) {
  is_single_string <- is.character(path_dir) &&
    length(path_dir) == 1L &&
    !is.na(path_dir) &&
    nzchar(path_dir)
  if (!is_single_string) {
    stop("`path_dir` must be a single non-empty character string", call. = FALSE)
  }
  resolved <- normalizePath(path.expand(path_dir), winslash = "/", mustWork = FALSE)
  if (dir.exists(resolved)) {
    return(resolved)
  }
  stop(sprintf("Benchmark directory does not exist: %s", resolved), call. = FALSE)
}
# Validate an optional summary output path.
#
# `NULL` is passed through unchanged. Any other value must be a single,
# non-missing, non-empty string; it is tilde-expanded and normalised with
# forward slashes. The file itself need not exist, but its parent directory
# must, otherwise an error is raised.
benchmark_output_path <- function(output_path) {
  if (is.null(output_path)) {
    return(NULL)
  }
  is_single_string <- is.character(output_path) &&
    length(output_path) == 1L &&
    !is.na(output_path) &&
    nzchar(output_path)
  if (!is_single_string) {
    stop("`output_path` must be NULL or a single non-empty character string", call. = FALSE)
  }
  resolved <- normalizePath(path.expand(output_path), winslash = "/", mustWork = FALSE)
  target_dir <- dirname(resolved)
  if (dir.exists(target_dir)) {
    return(resolved)
  }
  stop(sprintf("Benchmark output directory does not exist: %s", target_dir), call. = FALSE)
}
# Generate a fresh `.ann` file name inside `path_dir`.
#
# The name starts with `prefix` followed by the random suffix produced by
# `tempfile()`. Only the name is generated; no file is created here.
benchmark_make_index_path <- function(path_dir, prefix = "bigannoy-benchmark") {
  index_file <- tempfile(pattern = prefix, tmpdir = path_dir, fileext = ".ann")
  index_file
}
# Look up the per-element size associated with a matrix type code.
#
# Codes 1, 2 and 8 map to themselves, codes 4 and 6 both map to 4, and any
# other code yields NA_real_. benchmark_big_bytes() multiplies this value by
# the element count.
# NOTE(review): the codes presumably follow bigmemory's element type codes;
# confirm against big_type().
benchmark_type_size <- function(type_code) {
  code <- as.character(type_code)
  switch(
    code,
    "1" = 1,
    "2" = 2,
    "4" = ,
    "6" = 4,
    "8" = 8,
    NA_real_
  )
}
# Size estimate for a dense input: 8 per element, 0 for NULL.
#
# The element count is converted to double before multiplying, so the result
# is always a double.
benchmark_dense_bytes <- function(x) {
  if (is.null(x)) {
    return(0)
  }
  n_elements <- as.numeric(length(x))
  n_elements * 8
}
# Size estimate for a big-matrix pointer: rows * columns * per-element size.
#
# Row and column counts come from big_nrow()/big_ncol() and are converted to
# double before multiplying. The per-element size comes from
# benchmark_type_size(big_type(xp)) and may be NA for an unrecognised type
# code, in which case the result is NA as well.
benchmark_big_bytes <- function(xp) {
  n_cells <- as.numeric(big_nrow(xp)) * as.numeric(big_ncol(xp))
  n_cells * benchmark_type_size(big_type(xp))
}
# Size on disk of `path` as a double.
#
# Returns 0 when `path` is NULL, zero-length, NA, an empty string, or does
# not exist, so callers can sum artifact sizes without special-casing
# missing files.
benchmark_file_bytes <- function(path) {
  usable <- !is.null(path) &&
    length(path) > 0L &&
    !is.na(path) &&
    nzchar(path) &&
    file.exists(path)
  if (!usable) {
    return(0)
  }
  as.numeric(file.size(path)[[1L]])
}
# Throughput in rows per second.
#
# Returns `n / elapsed` as a double when `elapsed` is finite and strictly
# positive; otherwise NA_real_ (covers NA, infinite, zero and negative
# timings).
benchmark_rows_per_second <- function(n, elapsed) {
  if (is.finite(elapsed) && elapsed > 0) {
    return(as.numeric(n) / elapsed)
  }
  NA_real_
}
# Convert a dense matrix into a bigmemory matrix for use as the reference.
#
# With `filebacked = TRUE` a file-backed double matrix of the same shape is
# created under `path_dir`, using `<prefix>.bin` as backing file and
# `<prefix>.desc` as descriptor file, and `values` is copied into it.
# Otherwise `values` is converted with bigmemory's as.big.matrix().
# Errors if the 'bigmemory' package is unavailable.
benchmark_make_reference <- function(values, filebacked, path_dir, prefix) {
  benchmark_require_bigmemory()
  if (isTRUE(filebacked)) {
    make_filebacked <- utils::getFromNamespace("filebacked.big.matrix", "bigmemory")
    reference <- make_filebacked(
      nrow = nrow(values),
      ncol = ncol(values),
      type = "double",
      backingfile = sprintf("%s.bin", prefix),
      descriptorfile = sprintf("%s.desc", prefix),
      backingpath = path_dir
    )
    reference[, ] <- values
    return(reference)
  }
  as_big_matrix <- utils::getFromNamespace("as.big.matrix", "bigmemory")
  as_big_matrix(values)
}
# TRUE when `x` is a base R matrix holding numeric (integer or double) data.
benchmark_is_dense_matrix <- function(x) {
  if (!is.matrix(x)) {
    return(FALSE)
  }
  is.numeric(x)
}
# Normalise one benchmark input (`x` or `query`) into a descriptor list.
#
# The returned list always has the fields
#   annoy   - object handed to the bigANNOY build/search functions
#   exact   - object handed to the exact baseline
#   dense   - dense copy of the values (NULL when the input is absent)
#   bytes   - size estimate of the input
#   storage - label describing how the input is stored
#   nrow, ncol - dimensions (NA_integer_ when the input is absent)
#
# Accepted inputs, checked in this order:
#   * NULL, only when `allow_null` is TRUE;
#   * a numeric matrix (coerced to double; non-finite values are an error).
#     When `arg` is "x" it is additionally converted to a big.matrix,
#     file-backed if `filebacked` is TRUE;
#   * a big.matrix;
#   * an external pointer;
#   * anything accepted by is_big_descriptor_input().
# Any other input raises an error naming `arg`.
benchmark_prepare_input <- function(x, arg, path_dir, prefix, filebacked = FALSE, allow_null = FALSE) {
  # Absent input: only legal when the caller opted in.
  if (allow_null && is.null(x)) {
    return(list(
      annoy = NULL,
      exact = NULL,
      dense = NULL,
      bytes = 0,
      storage = "none",
      nrow = NA_integer_,
      ncol = NA_integer_
    ))
  }

  # Dense numeric matrix.
  if (benchmark_is_dense_matrix(x)) {
    storage.mode(x) <- "double"
    if (!all(is.finite(x))) {
      stop(sprintf("`%s` contains non-finite values", arg), call. = FALSE)
    }
    if (!identical(arg, "x")) {
      # Non-reference inputs stay dense.
      return(list(
        annoy = x,
        exact = x,
        dense = x,
        bytes = benchmark_dense_bytes(x),
        storage = "dense_matrix",
        nrow = nrow(x),
        ncol = ncol(x)
      ))
    }
    # The reference input is always promoted to a big.matrix.
    reference <- benchmark_make_reference(
      x,
      filebacked = filebacked,
      path_dir = path_dir,
      prefix = prefix
    )
    return(list(
      annoy = reference,
      exact = reference,
      dense = x,
      bytes = benchmark_dense_bytes(x),
      storage = if (isTRUE(filebacked)) "filebacked_bigmatrix" else "bigmatrix",
      nrow = nrow(x),
      ncol = ncol(x)
    ))
  }

  # Existing big.matrix object.
  if (methods::is(x, "big.matrix")) {
    pointer <- resolve_big_pointer(x, arg)
    dense_copy <- read_big_rows(pointer, seq_len(big_nrow(pointer)), arg)
    return(list(
      annoy = x,
      exact = x,
      dense = dense_copy,
      bytes = benchmark_big_bytes(pointer),
      storage = "bigmatrix",
      nrow = nrow(x),
      ncol = ncol(x)
    ))
  }

  # Raw external pointer.
  if (identical(typeof(x), "externalptr")) {
    dense_copy <- read_big_rows(x, seq_len(big_nrow(x)), arg)
    return(list(
      annoy = x,
      exact = x,
      dense = dense_copy,
      bytes = benchmark_big_bytes(x),
      storage = "bigmatrix_pointer",
      nrow = big_nrow(x),
      ncol = big_ncol(x)
    ))
  }

  # Descriptor-style input: attach first, then treat like a big.matrix.
  if (is_big_descriptor_input(x)) {
    attached <- attach_big_descriptor(x, arg)
    pointer <- resolve_big_pointer(attached, arg)
    dense_copy <- read_big_rows(pointer, seq_len(big_nrow(pointer)), arg)
    return(list(
      annoy = attached,
      exact = attached,
      dense = dense_copy,
      bytes = benchmark_big_bytes(pointer),
      storage = "bigmatrix_descriptor",
      nrow = nrow(attached),
      ncol = ncol(attached)
    ))
  }

  stop(sprintf("`%s` must be NULL, a numeric matrix, a big.matrix, a descriptor, a descriptor path, or an external pointer", arg), call. = FALSE)
}
# Assemble the reference/query pair used by a benchmark run.
#
# When `x` is NULL a synthetic standard-normal reference matrix of size
# `n_ref` x `n_dim` is drawn after seeding the RNG with `seed`. In that case
# a synthetic `n_query` x `n_dim` query is drawn too if `query_missing` is
# TRUE; otherwise the supplied `query` (possibly NULL) is used. When `x` is
# supplied, `x` and `query` are used as given.
#
# Both inputs are then normalised with benchmark_prepare_input(). A NULL
# query means self-search, in which case the reported query row count is the
# reference row count. A query whose column count differs from the reference
# is an error.
benchmark_prepare_data <- function(x,
                                   query,
                                   query_missing,
                                   n_ref,
                                   n_query,
                                   n_dim,
                                   seed,
                                   filebacked,
                                   path_dir,
                                   prefix) {
  if (is.null(x)) {
    # Synthetic data: reference values are drawn first, then query values.
    set.seed(seed)
    reference_values <- matrix(stats::rnorm(n_ref * n_dim), nrow = n_ref, ncol = n_dim)
    if (isTRUE(query_missing)) {
      query <- matrix(stats::rnorm(n_query * n_dim), nrow = n_query, ncol = n_dim)
    } else {
      # Evaluate the caller-supplied query before the reference is prepared.
      force(query)
    }
  } else {
    reference_values <- x
  }

  ref_info <- benchmark_prepare_input(
    reference_values,
    "x",
    path_dir = path_dir,
    prefix = prefix,
    filebacked = filebacked
  )
  query_info <- benchmark_prepare_input(
    query,
    "query",
    path_dir = path_dir,
    prefix = prefix,
    allow_null = TRUE
  )

  self_search <- is.null(query_info$annoy)
  if (!self_search && !identical(ref_info$ncol, query_info$ncol)) {
    stop("`query` must have the same number of columns as the benchmark reference input", call. = FALSE)
  }

  list(
    ref_annoy = ref_info$annoy,
    ref_exact = ref_info$exact,
    ref_dense = ref_info$dense,
    ref_bytes = ref_info$bytes,
    ref_storage = ref_info$storage,
    query_annoy = query_info$annoy,
    query_exact = query_info$exact,
    query_dense = query_info$dense,
    query_bytes = query_info$bytes,
    query_storage = query_info$storage,
    self_search = self_search,
    n_ref = ref_info$nrow,
    n_query = if (self_search) ref_info$nrow else query_info$nrow,
    n_dim = ref_info$ncol
  )
}
# Extract the "elapsed" entry of a timing object as a bare, unnamed double.
benchmark_elapsed <- function(timing) {
  elapsed_seconds <- timing[["elapsed"]]
  as.numeric(unname(elapsed_seconds))
}
# Mean recall of approximate neighbours against exact neighbours.
#
# For every row of `approx_index`, the number of distinct ids that also
# appear in the same row of `exact_index` is divided by `k`; the result is
# the mean of these per-row fractions.
benchmark_recall_at_k <- function(approx_index, exact_index, k) {
  row_ids <- seq_len(nrow(approx_index))
  per_row <- numeric(length(row_ids))
  for (row in row_ids) {
    shared <- intersect(approx_index[row, ], exact_index[row, ])
    per_row[row] <- length(shared) / k
  }
  mean(per_row)
}
# Decide whether the exact baseline can be run.
#
# Returns FALSE silently when `exact` is not TRUE. Returns FALSE with a
# warning when the metric is not "euclidean" or when the 'bigKNN' package
# cannot be loaded. Returns TRUE otherwise.
benchmark_exact_available <- function(metric, exact) {
  if (!isTRUE(exact)) {
    return(FALSE)
  }
  # Emit the explanatory warning and report the baseline as unavailable.
  skip_baseline <- function(reason) {
    warning(reason, call. = FALSE)
    FALSE
  }
  if (!identical(metric, "euclidean")) {
    return(skip_baseline(
      "The exact benchmark baseline is only available for `metric = \"euclidean\"`; skipping recall/exact timing."
    ))
  }
  pkg <- "bigKNN"
  if (!requireNamespace(pkg, quietly = TRUE)) {
    return(skip_baseline(
      "The exact benchmark baseline requires the 'bigKNN' package; skipping recall/exact timing."
    ))
  }
  TRUE
}
# Run and time the exact Euclidean k-NN baseline from 'bigKNN'.
#
# `knn_bigmatrix` is looked up in the bigKNN namespace at call time. When
# `query` is NULL the search is a self-search and `exclude_self` is TRUE.
# Returns a list with the elapsed seconds (`elapsed`) and the raw search
# result (`result`).
benchmark_exact_result <- function(ref_big, query, k, block_size) {
  knn_search <- get("knn_bigmatrix", envir = asNamespace("bigKNN"), inherits = FALSE)
  exact_time <- system.time({
    neighbours <- knn_search(
      ref_big,
      query = query,
      k = k,
      metric = "euclidean",
      block_size = block_size,
      exclude_self = is.null(query)
    )
  })
  list(
    elapsed = benchmark_elapsed(exact_time),
    result = neighbours
  )
}
# Optionally persist a benchmark summary as CSV.
#
# Nothing is written when `output_path` is NULL; otherwise the summary is
# written without row names. The summary is returned invisibly either way.
benchmark_write_summary <- function(summary, output_path) {
  if (is.null(output_path)) {
    return(invisible(summary))
  }
  utils::write.csv(summary, output_path, row.names = FALSE)
  invisible(summary)
}
# Bytes attributed to the query side of a run: in self-search the reference
# doubles as the query, so its size is reported; otherwise the query's own
# size is used.
benchmark_effective_query_bytes <- function(self_search, ref_bytes, query_bytes) {
  if (isTRUE(self_search)) ref_bytes else query_bytes
}
# Total data volume of a run: reference bytes plus the effective query bytes
# (which equal the reference bytes again for a self-search).
benchmark_total_data_bytes <- function(self_search, ref_bytes, query_bytes) {
  query_share <- benchmark_effective_query_bytes(self_search, ref_bytes, query_bytes)
  ref_bytes + query_share
}
# Build one row of the comparison summary.
#
# Besides the values passed in (coerced to integer, double or character
# columns), the row carries four derived columns:
#   effective_query_bytes / total_data_bytes - from the byte helpers,
#   artifact_bytes     - index_bytes + metadata_bytes,
#   build_rows_per_sec - n_ref / build_elapsed,
#   query_rows_per_sec - n_query / search_elapsed.
# Returns a data frame with character columns kept as character.
benchmark_summary_row <- function(implementation,
                                  backend,
                                  reference_storage,
                                  metric,
                                  self_search,
                                  load_mode,
                                  n_ref,
                                  n_query,
                                  n_dim,
                                  k,
                                  n_trees,
                                  search_k,
                                  build_threads,
                                  ref_bytes,
                                  query_bytes,
                                  index_bytes,
                                  metadata_bytes,
                                  build_elapsed,
                                  search_elapsed,
                                  exact_elapsed,
                                  recall_at_k,
                                  index_id = NA_character_) {
  query_share <- benchmark_effective_query_bytes(self_search, ref_bytes, query_bytes)
  combined_bytes <- benchmark_total_data_bytes(self_search, ref_bytes, query_bytes)

  data.frame(
    # Run identification.
    implementation = implementation,
    backend = backend,
    reference_storage = reference_storage,
    metric = metric,
    self_search = isTRUE(self_search),
    load_mode = load_mode,
    # Problem size and Annoy parameters.
    n_ref = as.integer(n_ref),
    n_query = as.integer(n_query),
    n_dim = as.integer(n_dim),
    k = as.integer(k),
    n_trees = as.integer(n_trees),
    search_k = as.integer(search_k),
    build_threads = as.integer(build_threads),
    # Data and artifact volume.
    ref_bytes = as.numeric(ref_bytes),
    query_bytes = as.numeric(query_bytes),
    effective_query_bytes = as.numeric(query_share),
    total_data_bytes = as.numeric(combined_bytes),
    index_bytes = as.numeric(index_bytes),
    metadata_bytes = as.numeric(metadata_bytes),
    artifact_bytes = as.numeric(index_bytes + metadata_bytes),
    # Timings, quality and throughput.
    build_elapsed = as.numeric(build_elapsed),
    search_elapsed = as.numeric(search_elapsed),
    exact_elapsed = as.numeric(exact_elapsed),
    recall_at_k = as.numeric(recall_at_k),
    build_rows_per_sec = benchmark_rows_per_second(n_ref, build_elapsed),
    query_rows_per_sec = benchmark_rows_per_second(n_query, search_elapsed),
    index_id = as.character(index_id),
    stringsAsFactors = FALSE
  )
}
# Dense baseline: build, save and query an Annoy index through an R-level
# handle, timing the build and the search separately.
#
# Build phase (timed): every reference row is added under a zero-based item
# id, the index is built with `n_trees` trees and saved to `index_path`.
# Search phase (timed): query rows are processed in blocks of `block_size`
# rows and the per-block results are copied into preallocated buffers.
# The handle is unloaded afterwards on a best-effort basis.
#
# Returns a list with `build_elapsed`, `search_elapsed`, the `index` and
# `distance` result buffers, and the on-disk size of the saved index
# (`index_bytes`).
benchmark_rcppannoy_build_search <- function(ref_dense,
                                             query_dense,
                                             self_search,
                                             k,
                                             search_k,
                                             metric,
                                             n_trees,
                                             build_seed,
                                             index_path,
                                             block_size) {
  ref_rows <- nrow(ref_dense)
  query_rows <- if (isTRUE(self_search)) ref_rows else nrow(query_dense)
  annoy_handle <- annoy_new_r_handle(metric, ncol(ref_dense))
  if (!is.null(build_seed)) {
    annoy_handle$setSeed(build_seed)
  }

  build_time <- system.time({
    for (row in seq_len(ref_rows)) {
      # Item ids are zero-based.
      annoy_handle$addItem(as.integer(row - 1L), ref_dense[row, ])
    }
    annoy_handle$build(n_trees)
    annoy_handle$save(index_path)
  })

  results <- allocate_search_buffers(query_rows, k, stream_index = NULL)
  first_rows <- seq.int(1L, query_rows, by = block_size)

  search_time <- system.time({
    for (first_row in first_rows) {
      # The last block may be shorter than block_size.
      last_row <- min(first_row + block_size - 1L, query_rows)
      block_rows <- seq.int(first_row, last_row)
      if (isTRUE(self_search)) {
        found <- search_block(
          annoy_handle,
          k = k,
          search_k = search_k,
          rows = block_rows,
          self_search = TRUE
        )
      } else {
        found <- search_block(
          annoy_handle,
          k = k,
          search_k = search_k,
          rows = block_rows,
          self_search = FALSE,
          query_block_values = query_dense[block_rows, , drop = FALSE]
        )
      }
      results$index[block_rows, ] <- found$index
      results$distance[block_rows, ] <- found$distance
    }
  })

  # Best effort: a failing unload must not discard the measurements.
  try(annoy_handle$unload(), silent = TRUE)

  list(
    build_elapsed = benchmark_elapsed(build_time),
    search_elapsed = benchmark_elapsed(search_time),
    index = results$index,
    distance = results$distance,
    index_bytes = benchmark_file_bytes(index_path)
  )
}
#' Benchmark bigANNOY against direct RcppAnnoy
#'
#' @description
#' Run the same Annoy build and search task through `bigANNOY` and through a
#' direct dense `RcppAnnoy` baseline. The comparison reports both speed metrics
#' and data-volume metrics such as reference bytes, query bytes, and generated
#' index size.
#'
#' @details
#' When the exact baseline is enabled and available it is run before either
#' Annoy index is built, and its elapsed time is reported in both summary rows.
#' Index validation is performed between the `bigANNOY` build and search and is
#' not included in either timing. Unless `keep_files = TRUE`, the `bigANNOY`
#' index, its metadata file, and the `RcppAnnoy` index are removed when the
#' function exits.
#'
#' @inheritParams benchmark_annoy_bigmatrix
#'
#' @return A list with elements:
#' \describe{
#'   \item{`summary`}{A two-row data frame, one row for `bigANNOY` and one for
#'   direct `RcppAnnoy`.}
#'   \item{`params`}{The identifying and parameter columns of `summary`.}
#'   \item{`exact_available`}{Whether the exact baseline was run.}
#'   \item{`validation`}{The validation report produced for the `bigANNOY`
#'   index.}
#'   \item{`artifacts`}{The index (and, for `bigANNOY`, metadata) file paths
#'   used by each implementation.}
#' }
#' @export
benchmark_annoy_vs_rcppannoy <- function(x = NULL,
query = NULL,
n_ref = 2000L,
n_query = 200L,
n_dim = 20L,
k = 10L,
n_trees = 50L,
metric = "euclidean",
search_k = -1L,
seed = 42L,
build_seed = seed,
build_threads = -1L,
block_size = annoy_default_block_size(),
backend = getOption("bigANNOY.backend", "cpp"),
exact = TRUE,
filebacked = FALSE,
path_dir = tempdir(),
keep_files = FALSE,
output_path = NULL,
load_mode = "eager") {
# Record whether the caller omitted `query`; benchmark_prepare_data() uses
# this flag to decide whether to synthesise a query matrix when `x` is NULL.
query_missing <- missing(query)
# Argument validation. The size arguments n_ref, n_query and n_dim go through
# the same normaliser as n_trees.
n_ref <- normalize_n_trees(n_ref)
n_query <- normalize_n_trees(n_query)
n_dim <- normalize_n_trees(n_dim)
k <- normalize_k(k)
n_trees <- normalize_n_trees(n_trees)
metric <- normalize_metric(metric)
search_k <- normalize_search_k(search_k)
seed <- normalize_seed(seed)
build_seed <- normalize_seed(build_seed)
build_threads <- normalize_build_threads(build_threads)
block_size <- normalize_block_size(block_size)
filebacked <- normalize_scalar_logical(filebacked, "filebacked")
keep_files <- normalize_scalar_logical(keep_files, "keep_files")
path_dir <- benchmark_existing_dir(path_dir)
output_path <- benchmark_output_path(output_path)
load_mode <- normalize_load_mode(load_mode, default = "eager")
# Unique base name handed to benchmark_prepare_data() as `prefix`.
prefix <- basename(tempfile(pattern = "bigannoy-compare-", tmpdir = path_dir))
dataset <- benchmark_prepare_data(
x = x,
query = query,
query_missing = query_missing,
n_ref = n_ref,
n_query = n_query,
n_dim = n_dim,
seed = seed,
filebacked = filebacked,
path_dir = path_dir,
prefix = prefix
)
validate_search_k(k, dataset$n_ref, self_search = isTRUE(dataset$self_search))
# Exact baseline (optional). It runs before any Annoy index is built; its
# neighbour indices are the ground truth for both recall figures below.
exact_elapsed <- NA_real_
exact_index <- NULL
exact_used <- benchmark_exact_available(metric, exact)
if (isTRUE(exact_used)) {
exact_result <- benchmark_exact_result(
ref_big = dataset$ref_exact,
query = dataset$query_exact,
k = k,
block_size = block_size
)
exact_elapsed <- exact_result$elapsed
exact_index <- exact_result$result$index
}
# Apply the requested backend for the duration of this call only.
old_options <- options(bigANNOY.backend = backend)
on.exit(options(old_options), add = TRUE)
big_index_path <- benchmark_make_index_path(path_dir, prefix = "bigannoy-compare-")
big_metadata_path <- annoy_metadata_path(big_index_path)
rcpp_index_path <- benchmark_make_index_path(path_dir, prefix = "rcppannoy-compare-")
# Unless the caller wants to keep them, remove all three artifacts on exit.
if (!isTRUE(keep_files)) {
on.exit(unlink(c(big_index_path, big_metadata_path, rcpp_index_path), force = TRUE), add = TRUE)
}
# bigANNOY: timed build.
big_build_timing <- system.time({
big_index <- annoy_build_bigmatrix(
dataset$ref_annoy,
path = big_index_path,
n_trees = n_trees,
metric = metric,
seed = build_seed,
build_threads = build_threads,
block_size = block_size,
load_mode = load_mode
)
})
# Validation sits between the two timed regions and is not measured.
validation <- annoy_validate_index(big_index, strict = TRUE, load = TRUE)
# bigANNOY: timed search.
big_search_timing <- system.time({
big_result <- annoy_search_bigmatrix(
big_index,
query = dataset$query_annoy,
k = k,
search_k = search_k,
block_size = block_size
)
})
# RcppAnnoy baseline: operates on the dense copies of reference and query and
# measures its own build and search times.
rcpp_result <- benchmark_rcppannoy_build_search(
ref_dense = dataset$ref_dense,
query_dense = dataset$query_dense,
self_search = dataset$self_search,
k = k,
search_k = search_k,
metric = metric,
n_trees = n_trees,
build_seed = build_seed,
index_path = rcpp_index_path,
block_size = block_size
)
# Recall is NA for both implementations when no exact baseline was run.
big_recall <- if (is.null(exact_index)) NA_real_ else benchmark_recall_at_k(big_result$index, exact_index, k)
rcpp_recall <- if (is.null(exact_index)) NA_real_ else benchmark_recall_at_k(rcpp_result$index, exact_index, k)
# One summary row per implementation; data sizes and exact_elapsed are shared.
summary <- rbind(
benchmark_summary_row(
implementation = "bigANNOY",
backend = big_index$build_backend,
reference_storage = dataset$ref_storage,
metric = metric,
self_search = dataset$self_search,
load_mode = big_index$load_mode,
n_ref = dataset$n_ref,
n_query = dataset$n_query,
n_dim = dataset$n_dim,
k = k,
n_trees = n_trees,
search_k = search_k,
build_threads = build_threads,
ref_bytes = dataset$ref_bytes,
query_bytes = dataset$query_bytes,
index_bytes = benchmark_file_bytes(big_index_path),
metadata_bytes = benchmark_file_bytes(big_metadata_path),
build_elapsed = benchmark_elapsed(big_build_timing),
search_elapsed = benchmark_elapsed(big_search_timing),
exact_elapsed = exact_elapsed,
recall_at_k = big_recall,
index_id = big_index$index_id
),
benchmark_summary_row(
implementation = "RcppAnnoy",
backend = "dense",
reference_storage = "dense_matrix",
metric = metric,
self_search = dataset$self_search,
load_mode = "in_memory",
n_ref = dataset$n_ref,
n_query = dataset$n_query,
n_dim = dataset$n_dim,
k = k,
n_trees = n_trees,
search_k = search_k,
build_threads = NA_integer_,
ref_bytes = dataset$ref_bytes,
query_bytes = dataset$query_bytes,
index_bytes = rcpp_result$index_bytes,
metadata_bytes = 0,
build_elapsed = rcpp_result$build_elapsed,
search_elapsed = rcpp_result$search_elapsed,
exact_elapsed = exact_elapsed,
recall_at_k = rcpp_recall,
index_id = NA_character_
)
)
# Written only when output_path is non-NULL.
benchmark_write_summary(summary, output_path)
list(
summary = summary,
params = summary[, c("implementation", "backend", "reference_storage", "metric", "self_search", "load_mode", "n_ref", "n_query", "n_dim", "k", "n_trees", "search_k"), drop = FALSE],
exact_available = exact_used,
validation = validation,
artifacts = list(
bigannoy = list(index_path = big_index_path, metadata_path = big_metadata_path),
rcppannoy = list(index_path = rcpp_index_path)
)
)
}
#' Benchmark scaling across data volumes for bigANNOY and direct RcppAnnoy
#'
#' @description
#' Run `benchmark_annoy_vs_rcppannoy()` over a grid of synthetic data sizes to
#' study how build time, search time, and index size scale with data volume.
#'
#' @details
#' Every combination of `n_ref`, `n_query`, and `n_dim` is benchmarked on
#' freshly generated synthetic data. Individual runs never write their own
#' CSV; only the combined summary is written to `output_path`.
#'
#' @param n_ref Integer vector of synthetic reference row counts.
#' @param n_query Integer vector of synthetic query row counts.
#' @param n_dim Integer vector of synthetic column counts.
#' @inheritParams benchmark_annoy_vs_rcppannoy
#'
#' @return A list with a `summary` data frame containing one row per
#' implementation and data-volume combination, and `exact_available`, which is
#' `TRUE` only when at least one run was performed and every run actually
#' used the exact baseline.
#' @export
benchmark_annoy_volume_suite <- function(n_ref = c(2000L, 5000L, 10000L),
                                         n_query = 200L,
                                         n_dim = c(20L, 50L),
                                         k = 10L,
                                         n_trees = 50L,
                                         metric = "euclidean",
                                         search_k = -1L,
                                         seed = 42L,
                                         build_seed = seed,
                                         build_threads = -1L,
                                         block_size = annoy_default_block_size(),
                                         backend = getOption("bigANNOY.backend", "cpp"),
                                         exact = FALSE,
                                         filebacked = FALSE,
                                         path_dir = tempdir(),
                                         keep_files = FALSE,
                                         output_path = NULL,
                                         load_mode = "eager") {
  # Grid arguments are validated element by element with the same normaliser
  # used for n_trees.
  n_ref <- vapply(as.list(n_ref), normalize_n_trees, integer(1L))
  n_query <- vapply(as.list(n_query), normalize_n_trees, integer(1L))
  n_dim <- vapply(as.list(n_dim), normalize_n_trees, integer(1L))
  k <- normalize_k(k)
  n_trees <- normalize_n_trees(n_trees)
  metric <- normalize_metric(metric)
  search_k <- normalize_search_k(search_k)
  seed <- normalize_seed(seed)
  build_seed <- normalize_seed(build_seed)
  build_threads <- normalize_build_threads(build_threads)
  block_size <- normalize_block_size(block_size)
  filebacked <- normalize_scalar_logical(filebacked, "filebacked")
  keep_files <- normalize_scalar_logical(keep_files, "keep_files")
  path_dir <- benchmark_existing_dir(path_dir)
  output_path <- benchmark_output_path(output_path)
  load_mode <- normalize_load_mode(load_mode, default = "eager")

  n_runs <- length(n_ref) * length(n_query) * length(n_dim)
  rows <- vector("list", n_runs)
  # Whether each run really used the exact baseline, as reported by the run
  # itself rather than re-derived here with a different package probe.
  exact_flags <- logical(n_runs)
  row_id <- 0L

  for (ref_rows in n_ref) {
    for (query_rows in n_query) {
      for (dims in n_dim) {
        row_id <- row_id + 1L
        # `query` is deliberately not passed, so each run generates its own
        # synthetic query matrix.
        current <- benchmark_annoy_vs_rcppannoy(
          n_ref = ref_rows,
          n_query = query_rows,
          n_dim = dims,
          k = k,
          n_trees = n_trees,
          metric = metric,
          search_k = search_k,
          seed = seed,
          build_seed = build_seed,
          build_threads = build_threads,
          block_size = block_size,
          backend = backend,
          exact = exact,
          filebacked = filebacked,
          path_dir = path_dir,
          keep_files = keep_files,
          output_path = NULL,
          load_mode = load_mode
        )
        rows[[row_id]] <- current$summary
        exact_flags[row_id] <- isTRUE(current$exact_available)
      }
    }
  }

  summary <- do.call(rbind, rows)
  benchmark_write_summary(summary, output_path)
  list(
    summary = summary,
    exact_available = n_runs > 0L && all(exact_flags)
  )
}
#' Benchmark a single bigANNOY build/search configuration
#'
#' @description
#' Build or reuse a benchmark reference dataset, create an Annoy index, query
#' it, and optionally compare recall against the exact `bigKNN` Euclidean
#' baseline.
#'
#' @details
#' The index build and the search are timed separately; index validation runs
#' between them and is not part of either timing. The exact baseline, when
#' enabled and available, is run after the Annoy search.
#'
#' @param x Optional benchmark reference input. Supply `NULL` to generate a
#'   synthetic reference matrix, or provide a numeric matrix, `big.matrix`,
#'   descriptor, descriptor path, or external pointer.
#' @param query Optional benchmark query input. Supply `NULL` for self-search,
#'   or provide a numeric matrix, `big.matrix`, descriptor, descriptor path, or
#'   external pointer.
#' @param n_ref Number of synthetic reference rows to generate when `x = NULL`.
#' @param n_query Number of synthetic query rows to generate when `x = NULL` and
#'   `query` is not `NULL`.
#' @param n_dim Number of synthetic columns to generate when `x = NULL`.
#' @param k Number of neighbours to return.
#' @param n_trees Number of Annoy trees to build.
#' @param metric Annoy metric. One of `"euclidean"`, `"angular"`,
#'   `"manhattan"`, or `"dot"`.
#' @param search_k Annoy search budget.
#' @param seed Random seed used for synthetic data generation and, by default,
#'   for the Annoy build seed.
#' @param build_seed Optional Annoy build seed. Defaults to `seed`.
#' @param build_threads Native Annoy build-thread setting.
#' @param block_size Build/search block size.
#' @param backend Requested bigANNOY backend.
#' @param exact Logical flag controlling whether to benchmark the exact
#'   Euclidean baseline with `bigKNN` when available.
#' @param filebacked Logical flag; if `TRUE`, synthetic or dense reference
#'   inputs are converted into file-backed `big.matrix` objects before build.
#' @param path_dir Directory where temporary Annoy and optional file-backed
#'   benchmark files should be written.
#' @param keep_files Logical flag; if `TRUE`, leave the generated Annoy index on
#'   disk after the benchmark finishes.
#' @param output_path Optional CSV path where the benchmark summary should be
#'   written.
#' @param load_mode Whether the benchmarked index should be returned
#'   metadata-only until first search (`"lazy"`) or eagerly loaded once built
#'   (`"eager"`).
#'
#' @return A list with a one-row `summary` data frame, the benchmark
#'   parameters (`params`), the generated Annoy file paths (`index_path`,
#'   `metadata_path`), a flag telling whether the exact baseline was run
#'   (`exact_available`), and the index `validation` report.
#' @export
benchmark_annoy_bigmatrix <- function(x = NULL,
                                      query = NULL,
                                      n_ref = 2000L,
                                      n_query = 200L,
                                      n_dim = 20L,
                                      k = 10L,
                                      n_trees = 50L,
                                      metric = "euclidean",
                                      search_k = -1L,
                                      seed = 42L,
                                      build_seed = seed,
                                      build_threads = -1L,
                                      block_size = annoy_default_block_size(),
                                      backend = getOption("bigANNOY.backend", "cpp"),
                                      exact = TRUE,
                                      filebacked = FALSE,
                                      path_dir = tempdir(),
                                      keep_files = FALSE,
                                      output_path = NULL,
                                      load_mode = "eager") {
  # Whether the caller omitted `query`; decides if a synthetic query is drawn.
  query_missing <- missing(query)

  # ---- Argument validation ------------------------------------------------
  n_ref <- normalize_n_trees(n_ref)
  n_query <- normalize_n_trees(n_query)
  n_dim <- normalize_n_trees(n_dim)
  k <- normalize_k(k)
  n_trees <- normalize_n_trees(n_trees)
  metric <- normalize_metric(metric)
  search_k <- normalize_search_k(search_k)
  seed <- normalize_seed(seed)
  build_seed <- normalize_seed(build_seed)
  build_threads <- normalize_build_threads(build_threads)
  block_size <- normalize_block_size(block_size)
  filebacked <- normalize_scalar_logical(filebacked, "filebacked")
  keep_files <- normalize_scalar_logical(keep_files, "keep_files")
  path_dir <- benchmark_existing_dir(path_dir)
  output_path <- benchmark_output_path(output_path)
  load_mode <- normalize_load_mode(load_mode, default = "eager")

  # ---- Data ---------------------------------------------------------------
  data_prefix <- basename(tempfile(pattern = "bigannoy-benchmark-", tmpdir = path_dir))
  dataset <- benchmark_prepare_data(
    x = x,
    query = query,
    query_missing = query_missing,
    n_ref = n_ref,
    n_query = n_query,
    n_dim = n_dim,
    seed = seed,
    filebacked = filebacked,
    path_dir = path_dir,
    prefix = data_prefix
  )
  is_self_search <- isTRUE(dataset$self_search)
  validate_search_k(k, dataset$n_ref, self_search = is_self_search)

  # ---- Temporary state: backend option and index files ---------------------
  previous_options <- options(bigANNOY.backend = backend)
  on.exit(options(previous_options), add = TRUE)

  ann_path <- benchmark_make_index_path(path_dir, prefix = "bigannoy-benchmark-")
  ann_metadata_path <- annoy_metadata_path(ann_path)
  if (!isTRUE(keep_files)) {
    on.exit(unlink(c(ann_path, ann_metadata_path), force = TRUE), add = TRUE)
  }

  # ---- Timed build, untimed validation, timed search -----------------------
  build_time <- system.time({
    annoy_index <- annoy_build_bigmatrix(
      dataset$ref_annoy,
      path = ann_path,
      n_trees = n_trees,
      metric = metric,
      seed = build_seed,
      build_threads = build_threads,
      block_size = block_size,
      load_mode = load_mode
    )
  })

  validation_report <- annoy_validate_index(annoy_index, strict = TRUE, load = TRUE)

  search_time <- system.time({
    neighbours <- annoy_search_bigmatrix(
      annoy_index,
      query = dataset$query_annoy,
      k = k,
      search_k = search_k,
      block_size = block_size
    )
  })

  # ---- Optional exact baseline ---------------------------------------------
  exact_seconds <- NA_real_
  recall <- NA_real_
  exact_used <- benchmark_exact_available(metric, exact)
  if (isTRUE(exact_used)) {
    baseline <- benchmark_exact_result(
      ref_big = dataset$ref_exact,
      query = dataset$query_exact,
      k = k,
      block_size = block_size
    )
    exact_seconds <- baseline$elapsed
    recall <- benchmark_recall_at_k(neighbours$index, baseline$result$index, k)
  }

  # ---- Summary --------------------------------------------------------------
  summary_row <- data.frame(
    metric = metric,
    backend = annoy_index$build_backend,
    filebacked = filebacked,
    self_search = is_self_search,
    load_mode = annoy_index$load_mode,
    n_ref = dataset$n_ref,
    n_query = dataset$n_query,
    n_dim = dataset$n_dim,
    k = k,
    n_trees = n_trees,
    search_k = search_k,
    build_threads = build_threads,
    build_elapsed = benchmark_elapsed(build_time),
    search_elapsed = benchmark_elapsed(search_time),
    exact_elapsed = exact_seconds,
    recall_at_k = recall,
    index_id = annoy_index$index_id,
    stringsAsFactors = FALSE
  )
  benchmark_write_summary(summary_row, output_path)

  param_columns <- c(
    "metric", "backend", "filebacked", "self_search", "load_mode",
    "n_ref", "n_query", "n_dim", "k", "n_trees", "search_k", "build_threads"
  )
  list(
    summary = summary_row,
    params = summary_row[1L, param_columns, drop = FALSE],
    index_path = ann_path,
    metadata_path = ann_metadata_path,
    exact_available = exact_used,
    validation = validation_report
  )
}
#' Benchmark a recall suite across multiple Annoy configurations
#'
#' @description
#' Run a grid of `n_trees` and `search_k` settings on the same benchmark
#' dataset, optionally recording recall against the exact `bigKNN` Euclidean
#' baseline.
#'
#' @details
#' One index is built per `n_trees` value and then queried once per `search_k`
#' value, so rows sharing an `n_trees` value also share their `build_elapsed`.
#' The exact baseline, when enabled and available, is computed once up front
#' and reused for every configuration. Unless `keep_files = TRUE`, every index
#' and metadata file generated by the suite is removed when the function
#' exits.
#'
#' @inheritParams benchmark_annoy_bigmatrix
#' @param n_trees Integer vector of Annoy tree counts to benchmark.
#' @param search_k Integer vector of Annoy search budgets to benchmark.
#'
#' @return A list with a `summary` data frame containing one row per
#' `(n_trees, search_k)` configuration, and `exact_available`, a flag telling
#' whether the exact baseline was run.
#' @export
benchmark_annoy_recall_suite <- function(x = NULL,
                                         query = NULL,
                                         n_ref = 2000L,
                                         n_query = 200L,
                                         n_dim = 20L,
                                         k = 10L,
                                         n_trees = c(10L, 50L, 100L),
                                         search_k = c(-1L, 1000L, 5000L),
                                         metric = "euclidean",
                                         seed = 42L,
                                         build_seed = seed,
                                         build_threads = -1L,
                                         block_size = annoy_default_block_size(),
                                         backend = getOption("bigANNOY.backend", "cpp"),
                                         exact = TRUE,
                                         filebacked = FALSE,
                                         path_dir = tempdir(),
                                         keep_files = FALSE,
                                         output_path = NULL,
                                         load_mode = "eager") {
  # Whether the caller omitted `query`; decides if a synthetic query is drawn.
  query_missing <- missing(query)

  # ---- Argument validation ------------------------------------------------
  n_ref <- normalize_n_trees(n_ref)
  n_query <- normalize_n_trees(n_query)
  n_dim <- normalize_n_trees(n_dim)
  k <- normalize_k(k)
  metric <- normalize_metric(metric)
  seed <- normalize_seed(seed)
  build_seed <- normalize_seed(build_seed)
  build_threads <- normalize_build_threads(build_threads)
  block_size <- normalize_block_size(block_size)
  filebacked <- normalize_scalar_logical(filebacked, "filebacked")
  keep_files <- normalize_scalar_logical(keep_files, "keep_files")
  path_dir <- benchmark_existing_dir(path_dir)
  output_path <- benchmark_output_path(output_path)
  load_mode <- normalize_load_mode(load_mode, default = "eager")
  # The two grid arguments are validated element by element.
  n_trees <- vapply(as.list(n_trees), normalize_n_trees, integer(1L))
  search_k <- vapply(as.list(search_k), normalize_search_k, integer(1L))

  # ---- Data ---------------------------------------------------------------
  prefix <- basename(tempfile(pattern = "bigannoy-suite-", tmpdir = path_dir))
  dataset <- benchmark_prepare_data(
    x = x,
    query = query,
    query_missing = query_missing,
    n_ref = n_ref,
    n_query = n_query,
    n_dim = n_dim,
    seed = seed,
    filebacked = filebacked,
    path_dir = path_dir,
    prefix = prefix
  )
  validate_search_k(k, dataset$n_ref, self_search = isTRUE(dataset$self_search))

  # Apply the requested backend for the duration of this call only.
  old_options <- options(bigANNOY.backend = backend)
  on.exit(options(old_options), add = TRUE)

  # ---- Optional exact baseline, computed once ------------------------------
  exact_elapsed <- NA_real_
  exact_index <- NULL
  exact_used <- benchmark_exact_available(metric, exact)
  if (isTRUE(exact_used)) {
    exact_result <- benchmark_exact_result(
      ref_big = dataset$ref_exact,
      query = dataset$query_exact,
      k = k,
      block_size = block_size
    )
    exact_elapsed <- exact_result$elapsed
    exact_index <- exact_result$result$index
  }

  # ---- Cleanup registration -------------------------------------------------
  # on.exit() expressions are evaluated when the function exits, so a handler
  # registered inside the loop that refers to the loop's path variables would
  # only ever see the paths of the final iteration. Collect every generated
  # path instead and let one handler remove them all.
  cleanup_paths <- character(0)
  if (!isTRUE(keep_files)) {
    on.exit(unlink(cleanup_paths, force = TRUE), add = TRUE)
  }

  # ---- Grid: one build per tree count, one search per budget ----------------
  rows <- vector("list", length(n_trees) * length(search_k))
  row_id <- 0L
  for (tree_count in n_trees) {
    index_path <- benchmark_make_index_path(path_dir, prefix = sprintf("bigannoy-suite-%d-", tree_count))
    metadata_path <- annoy_metadata_path(index_path)
    # Record the paths before building so they are cleaned up even if the
    # build fails part-way.
    cleanup_paths <- c(cleanup_paths, index_path, metadata_path)

    build_timing <- system.time({
      index <- annoy_build_bigmatrix(
        dataset$ref_annoy,
        path = index_path,
        n_trees = tree_count,
        metric = metric,
        seed = build_seed,
        build_threads = build_threads,
        block_size = block_size,
        load_mode = load_mode
      )
    })

    # Validate (and load) the index outside the timed regions; the report
    # itself is not part of the suite's return value.
    annoy_validate_index(index, strict = TRUE, load = TRUE)

    for (search_budget in search_k) {
      row_id <- row_id + 1L
      search_timing <- system.time({
        result <- annoy_search_bigmatrix(
          index,
          query = dataset$query_annoy,
          k = k,
          search_k = search_budget,
          block_size = block_size
        )
      })
      rows[[row_id]] <- data.frame(
        metric = metric,
        backend = index$build_backend,
        filebacked = filebacked,
        self_search = isTRUE(dataset$self_search),
        load_mode = index$load_mode,
        n_ref = dataset$n_ref,
        n_query = dataset$n_query,
        n_dim = dataset$n_dim,
        k = k,
        n_trees = tree_count,
        search_k = search_budget,
        build_threads = build_threads,
        build_elapsed = benchmark_elapsed(build_timing),
        search_elapsed = benchmark_elapsed(search_timing),
        exact_elapsed = exact_elapsed,
        recall_at_k = if (is.null(exact_index)) NA_real_ else benchmark_recall_at_k(result$index, exact_index, k),
        index_id = index$index_id,
        stringsAsFactors = FALSE
      )
    }
  }

  summary <- do.call(rbind, rows)
  benchmark_write_summary(summary, output_path)
  list(
    summary = summary,
    exact_available = exact_used
  )
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.