R/benchmark_interface.R

Defines functions benchmark_annoy_recall_suite benchmark_annoy_bigmatrix benchmark_annoy_volume_suite benchmark_annoy_vs_rcppannoy benchmark_rcppannoy_build_search benchmark_summary_row benchmark_total_data_bytes benchmark_effective_query_bytes benchmark_write_summary benchmark_exact_result benchmark_exact_available benchmark_recall_at_k benchmark_elapsed benchmark_prepare_data benchmark_prepare_input benchmark_is_dense_matrix benchmark_make_reference benchmark_rows_per_second benchmark_file_bytes benchmark_big_bytes benchmark_dense_bytes benchmark_type_size benchmark_make_index_path benchmark_output_path benchmark_existing_dir benchmark_require_bigmemory

Documented in benchmark_annoy_bigmatrix benchmark_annoy_recall_suite benchmark_annoy_volume_suite benchmark_annoy_vs_rcppannoy

# Guard used by the benchmark helpers: succeeds silently when the optional
# 'bigmemory' namespace can be loaded and raises an error otherwise.
benchmark_require_bigmemory <- function() {
    has_bigmemory <- requireNamespace("bigmemory", quietly = TRUE)
    if (has_bigmemory) {
        return(invisible(NULL))
    }
    stop("The benchmark interface requires the 'bigmemory' package.", call. = FALSE)
}

# Validate the benchmark working directory.
#
# `path_dir` must be a single, non-missing, non-empty string naming an existing
# directory. Returns the tilde-expanded, normalized path (forward slashes).
benchmark_existing_dir <- function(path_dir) {
    is_scalar_string <- is.character(path_dir) &&
        length(path_dir) == 1L &&
        !is.na(path_dir) &&
        nzchar(path_dir)
    if (!is_scalar_string) {
        stop("`path_dir` must be a single non-empty character string", call. = FALSE)
    }

    expanded <- path.expand(path_dir)
    resolved <- normalizePath(expanded, winslash = "/", mustWork = FALSE)
    if (dir.exists(resolved)) {
        return(resolved)
    }
    stop(sprintf("Benchmark directory does not exist: %s", resolved), call. = FALSE)
}

# Validate the optional CSV destination for a benchmark summary.
#
# `NULL` is passed through unchanged. Otherwise `output_path` must be a single
# non-empty string whose parent directory already exists; the normalized path
# is returned.
benchmark_output_path <- function(output_path) {
    if (is.null(output_path)) {
        return(NULL)
    }

    is_scalar_string <- is.character(output_path) &&
        length(output_path) == 1L &&
        !is.na(output_path) &&
        nzchar(output_path)
    if (!is_scalar_string) {
        stop("`output_path` must be NULL or a single non-empty character string", call. = FALSE)
    }

    resolved <- normalizePath(path.expand(output_path), winslash = "/", mustWork = FALSE)
    target_dir <- dirname(resolved)
    if (dir.exists(target_dir)) {
        return(resolved)
    }
    stop(sprintf("Benchmark output directory does not exist: %s", target_dir), call. = FALSE)
}

# Produce a temporary file name with an `.ann` extension inside `path_dir`,
# using `prefix` as the leading part of the file name.
benchmark_make_index_path <- function(path_dir, prefix = "bigannoy-benchmark") {
    index_file <- tempfile(tmpdir = path_dir, pattern = prefix, fileext = ".ann")
    index_file
}

# Translate a storage type code into the number of bytes per element.
# Codes 1, 2, 4, 6 and 8 map to 1, 2, 4, 4 and 8 bytes; anything else yields
# NA_real_.
# NOTE(review): the codes presumably mirror the big.matrix type codes returned
# by `big_type()` - confirm against that helper.
benchmark_type_size <- function(type_code) {
    code_key <- as.character(type_code)
    switch(
        code_key,
        "1" = 1,
        "2" = 2,
        # "4" falls through to the next entry: both codes occupy four bytes.
        "4" = ,
        "6" = 4,
        "8" = 8,
        NA_real_
    )
}

# Size in bytes of a dense input when stored as doubles (8 bytes per element).
# A NULL input counts as zero bytes.
benchmark_dense_bytes <- function(x) {
    if (!is.null(x)) {
        n_cells <- as.numeric(length(x))
        return(n_cells * 8)
    }
    0
}

# Size in bytes of the data behind a big.matrix pointer: rows x columns x
# bytes-per-element, where the element width comes from the pointer's type
# code (NA when the code is not recognised by benchmark_type_size()).
benchmark_big_bytes <- function(xp) {
    n_rows <- as.numeric(big_nrow(xp))
    n_cols <- as.numeric(big_ncol(xp))
    bytes_per_cell <- benchmark_type_size(big_type(xp))
    n_rows * n_cols * bytes_per_cell
}

# Size of a file on disk in bytes, as a double.
# Returns 0 when `path` is NULL, empty, NA, an empty string, or names a file
# that does not exist.
benchmark_file_bytes <- function(path) {
    usable <- !is.null(path) &&
        length(path) > 0L &&
        !is.na(path) &&
        nzchar(path) &&
        file.exists(path)
    if (!usable) {
        return(0)
    }
    details <- file.info(path)
    as.numeric(details$size[[1L]])
}

# Throughput helper: rows processed per second.
# Yields NA_real_ when the elapsed time is non-finite, zero, or negative, so a
# too-fast-to-measure timing never produces Inf.
benchmark_rows_per_second <- function(n, elapsed) {
    measurable <- is.finite(elapsed) && elapsed > 0
    if (measurable) {
        return(as.numeric(n) / elapsed)
    }
    NA_real_
}

# Convert a dense numeric matrix into a big.matrix benchmark reference.
#
# With `filebacked = TRUE` a file-backed double big.matrix is created in
# `path_dir` (backing file `<prefix>.bin`, descriptor `<prefix>.desc`) and
# filled with `values`; otherwise `values` is converted with
# bigmemory's `as.big.matrix()`. The bigmemory functions are looked up at run
# time, after checking that the package is available.
benchmark_make_reference <- function(values, filebacked, path_dir, prefix) {
    benchmark_require_bigmemory()

    if (isTRUE(filebacked)) {
        make_filebacked <- utils::getFromNamespace("filebacked.big.matrix", "bigmemory")
        reference <- make_filebacked(
            nrow = nrow(values),
            ncol = ncol(values),
            type = "double",
            backingfile = sprintf("%s.bin", prefix),
            descriptorfile = sprintf("%s.desc", prefix),
            backingpath = path_dir
        )
        reference[, ] <- values
        return(reference)
    }

    as_big <- utils::getFromNamespace("as.big.matrix", "bigmemory")
    as_big(values)
}

# TRUE only for ordinary R matrices holding numeric (integer or double) data.
benchmark_is_dense_matrix <- function(x) {
    if (!is.matrix(x)) {
        return(FALSE)
    }
    is.numeric(x)
}

# Normalise one benchmark input (reference or query) into a common record.
#
# Accepted inputs: NULL (only when `allow_null = TRUE`), a numeric matrix, a
# big.matrix, an external pointer, or a big.matrix descriptor / descriptor
# path. The returned list always has the fields
#   annoy, exact - the object handed to the Annoy and exact code paths
#   dense        - the data as an in-memory matrix (NULL for a NULL input)
#   bytes        - data size in bytes
#   storage      - label describing how the input is stored
#   nrow, ncol   - dimensions (NA for a NULL input)
# A dense matrix supplied as the reference (`arg == "x"`) is converted to a
# big.matrix, file-backed when `filebacked` is TRUE. Non-finite values in a
# dense matrix and unsupported input types raise an error.
benchmark_prepare_input <- function(x, arg, path_dir, prefix, filebacked = FALSE, allow_null = FALSE) {
    # `annoy` and `exact` always refer to the same object, so one constructor
    # keeps the record layout identical across branches.
    describe <- function(handle, dense, bytes, storage, n_row, n_col) {
        list(
            annoy = handle,
            exact = handle,
            dense = dense,
            bytes = bytes,
            storage = storage,
            nrow = n_row,
            ncol = n_col
        )
    }

    if (allow_null && is.null(x)) {
        return(describe(NULL, NULL, 0, "none", NA_integer_, NA_integer_))
    }

    if (benchmark_is_dense_matrix(x)) {
        storage.mode(x) <- "double"
        if (!all(is.finite(x))) {
            stop(sprintf("`%s` contains non-finite values", arg), call. = FALSE)
        }

        handle <- x
        storage <- "dense_matrix"
        if (identical(arg, "x")) {
            # Only the reference input is promoted to a big.matrix.
            handle <- benchmark_make_reference(x, filebacked = filebacked, path_dir = path_dir, prefix = prefix)
            storage <- if (isTRUE(filebacked)) "filebacked_bigmatrix" else "bigmatrix"
        }
        return(describe(handle, x, benchmark_dense_bytes(x), storage, nrow(x), ncol(x)))
    }

    if (methods::is(x, "big.matrix")) {
        pointer <- resolve_big_pointer(x, arg)
        return(describe(
            x,
            read_big_rows(pointer, seq_len(big_nrow(pointer)), arg),
            benchmark_big_bytes(pointer),
            "bigmatrix",
            nrow(x),
            ncol(x)
        ))
    }

    if (typeof(x) == "externalptr") {
        return(describe(
            x,
            read_big_rows(x, seq_len(big_nrow(x)), arg),
            benchmark_big_bytes(x),
            "bigmatrix_pointer",
            big_nrow(x),
            big_ncol(x)
        ))
    }

    if (is_big_descriptor_input(x)) {
        attached <- attach_big_descriptor(x, arg)
        pointer <- resolve_big_pointer(attached, arg)
        return(describe(
            attached,
            read_big_rows(pointer, seq_len(big_nrow(pointer)), arg),
            benchmark_big_bytes(pointer),
            "bigmatrix_descriptor",
            nrow(attached),
            ncol(attached)
        ))
    }

    stop(sprintf("`%s` must be NULL, a numeric matrix, a big.matrix, a descriptor, a descriptor path, or an external pointer", arg), call. = FALSE)
}

# Assemble the reference and query inputs for one benchmark run.
#
# When `x` is NULL a synthetic standard-normal reference of `n_ref` x `n_dim`
# is generated after `set.seed(seed)`. In that case an omitted `query`
# (`query_missing = TRUE`) produces a synthetic `n_query` x `n_dim` query
# matrix, while an explicitly supplied `query` (including NULL) is used as is.
# When `x` is supplied, `x` and `query` are used directly. A NULL query means
# self-search: the reference rows act as the queries.
#
# Returns a list with the Annoy/exact/dense views, byte sizes and storage
# labels for reference and query, the `self_search` flag, and the effective
# `n_ref`, `n_query` and `n_dim`. Errors when a query is present and its
# column count differs from the reference.
benchmark_prepare_data <- function(x,
                                   query,
                                   query_missing,
                                   n_ref,
                                   n_query,
                                   n_dim,
                                   seed,
                                   filebacked,
                                   path_dir,
                                   prefix) {
    if (is.null(x)) {
        set.seed(seed)
        # The reference is drawn before the query so results stay reproducible
        # for a given seed.
        ref <- matrix(stats::rnorm(n_ref * n_dim), nrow = n_ref, ncol = n_dim)
        query_values <- if (isTRUE(query_missing)) {
            matrix(stats::rnorm(n_query * n_dim), nrow = n_query, ncol = n_dim)
        } else {
            query
        }
        ref_info <- benchmark_prepare_input(ref, "x", path_dir = path_dir, prefix = prefix, filebacked = filebacked)
        query_info <- benchmark_prepare_input(query_values, "query", path_dir = path_dir, prefix = prefix, allow_null = TRUE)
    } else {
        ref_info <- benchmark_prepare_input(x, "x", path_dir = path_dir, prefix = prefix, filebacked = filebacked)
        query_info <- benchmark_prepare_input(query, "query", path_dir = path_dir, prefix = prefix, allow_null = TRUE)
    }

    self_search <- is.null(query_info$annoy)
    if (!self_search) {
        # Compare the counts numerically. The two sides can come from different
        # accessors (base `ncol()` on a dense matrix, the big.matrix `ncol()`
        # method, or `big_ncol()`), which need not agree on integer vs double
        # storage; `identical()` would then reject matching widths.
        ref_cols <- as.numeric(ref_info$ncol)
        query_cols <- as.numeric(query_info$ncol)
        if (!isTRUE(ref_cols == query_cols)) {
            stop("`query` must have the same number of columns as the benchmark reference input", call. = FALSE)
        }
    }

    list(
        ref_annoy = ref_info$annoy,
        ref_exact = ref_info$exact,
        ref_dense = ref_info$dense,
        ref_bytes = ref_info$bytes,
        ref_storage = ref_info$storage,
        query_annoy = query_info$annoy,
        query_exact = query_info$exact,
        query_dense = query_info$dense,
        query_bytes = query_info$bytes,
        query_storage = query_info$storage,
        self_search = self_search,
        n_ref = ref_info$nrow,
        n_query = if (self_search) ref_info$nrow else query_info$nrow,
        n_dim = ref_info$ncol
    )
}

# Pull the wall-clock ("elapsed") entry out of a timing object such as the one
# returned by system.time(), as a bare unnamed double.
benchmark_elapsed <- function(timing) {
    wall_clock <- timing[["elapsed"]]
    as.numeric(unname(wall_clock))
}

# Mean recall@k: for each query row, the share of the `k` requested neighbours
# that the approximate result has in common with the exact result, averaged
# over all rows of `approx_index`.
benchmark_recall_at_k <- function(approx_index, exact_index, k) {
    row_recall <- function(row_id) {
        shared <- intersect(approx_index[row_id, ], exact_index[row_id, ])
        length(shared) / k
    }
    per_row <- vapply(seq_len(nrow(approx_index)), row_recall, numeric(1L))
    mean(per_row)
}

# Decide whether the exact (bigKNN, Euclidean-only) baseline can be run.
#
# Returns FALSE silently when `exact` is not TRUE. When it was requested but
# cannot run - non-Euclidean metric or 'bigKNN' not loadable - a warning
# explains why and FALSE is returned. Returns TRUE otherwise.
benchmark_exact_available <- function(metric, exact) {
    if (!isTRUE(exact)) {
        return(FALSE)
    }

    # The package name is passed through a variable rather than a literal.
    # NOTE(review): presumably this keeps static dependency checks from
    # treating the optional package as a hard requirement - confirm.
    pkg <- "bigKNN"
    skip_reason <- NULL
    if (!identical(metric, "euclidean")) {
        skip_reason <- "The exact benchmark baseline is only available for `metric = \"euclidean\"`; skipping recall/exact timing."
    } else if (!requireNamespace(pkg, quietly = TRUE)) {
        skip_reason <- "The exact benchmark baseline requires the 'bigKNN' package; skipping recall/exact timing."
    }

    if (is.null(skip_reason)) {
        return(TRUE)
    }
    warning(skip_reason, call. = FALSE)
    FALSE
}

# Run and time the exact Euclidean k-NN baseline from the 'bigKNN' namespace.
#
# `knn_bigmatrix` is looked up at run time. A NULL `query` is treated as a
# self-search, in which case each row is excluded from its own neighbours.
# Returns a list with the elapsed wall-clock seconds and the raw result.
benchmark_exact_result <- function(ref_big, query, k, block_size) {
    pkg <- "bigKNN"
    exact_knn <- get("knn_bigmatrix", envir = asNamespace(pkg), inherits = FALSE)

    # The assignment inside system.time() lands in this function's frame.
    timing <- system.time(
        neighbours <- exact_knn(
            ref_big,
            query = query,
            k = k,
            metric = "euclidean",
            block_size = block_size,
            exclude_self = is.null(query)
        )
    )

    list(elapsed = benchmark_elapsed(timing), result = neighbours)
}

# Optionally persist a benchmark summary as CSV (without row names).
# Nothing is written when `output_path` is NULL. The summary is always
# returned invisibly.
benchmark_write_summary <- function(summary, output_path) {
    if (is.null(output_path)) {
        return(invisible(summary))
    }
    utils::write.csv(summary, output_path, row.names = FALSE)
    invisible(summary)
}

# Bytes of query data actually searched: in a self-search the reference rows
# double as the queries, so the reference size is reported.
benchmark_effective_query_bytes <- function(self_search, ref_bytes, query_bytes) {
    if (isTRUE(self_search)) ref_bytes else query_bytes
}

# Total data volume touched by a run: reference bytes plus the effective
# query bytes (which equal the reference bytes again for a self-search).
benchmark_total_data_bytes <- function(self_search, ref_bytes, query_bytes) {
    query_side <- benchmark_effective_query_bytes(self_search, ref_bytes, query_bytes)
    ref_bytes + query_side
}

# Build one row of the comparison summary table.
#
# Takes the run's identifying labels, problem sizes, Annoy settings, byte
# counts and timings, coerces each to a fixed column type, and adds derived
# columns: effective query bytes, total data bytes, artifact bytes (index +
# metadata) and build/query throughput in rows per second.
# Returns a data frame (strings kept as character).
benchmark_summary_row <- function(implementation,
                                  backend,
                                  reference_storage,
                                  metric,
                                  self_search,
                                  load_mode,
                                  n_ref,
                                  n_query,
                                  n_dim,
                                  k,
                                  n_trees,
                                  search_k,
                                  build_threads,
                                  ref_bytes,
                                  query_bytes,
                                  index_bytes,
                                  metadata_bytes,
                                  build_elapsed,
                                  search_elapsed,
                                  exact_elapsed,
                                  recall_at_k,
                                  index_id = NA_character_) {
    # Derived byte counts.
    query_side_bytes <- benchmark_effective_query_bytes(self_search, ref_bytes, query_bytes)
    combined_data_bytes <- benchmark_total_data_bytes(self_search, ref_bytes, query_bytes)
    on_disk_bytes <- index_bytes + metadata_bytes

    # Derived throughput figures.
    build_rate <- benchmark_rows_per_second(n_ref, build_elapsed)
    query_rate <- benchmark_rows_per_second(n_query, search_elapsed)

    data.frame(
        implementation = implementation,
        backend = backend,
        reference_storage = reference_storage,
        metric = metric,
        self_search = isTRUE(self_search),
        load_mode = load_mode,
        n_ref = as.integer(n_ref),
        n_query = as.integer(n_query),
        n_dim = as.integer(n_dim),
        k = as.integer(k),
        n_trees = as.integer(n_trees),
        search_k = as.integer(search_k),
        build_threads = as.integer(build_threads),
        ref_bytes = as.numeric(ref_bytes),
        query_bytes = as.numeric(query_bytes),
        effective_query_bytes = as.numeric(query_side_bytes),
        total_data_bytes = as.numeric(combined_data_bytes),
        index_bytes = as.numeric(index_bytes),
        metadata_bytes = as.numeric(metadata_bytes),
        artifact_bytes = as.numeric(on_disk_bytes),
        build_elapsed = as.numeric(build_elapsed),
        search_elapsed = as.numeric(search_elapsed),
        exact_elapsed = as.numeric(exact_elapsed),
        recall_at_k = as.numeric(recall_at_k),
        build_rows_per_sec = build_rate,
        query_rows_per_sec = query_rate,
        index_id = as.character(index_id),
        stringsAsFactors = FALSE
    )
}

# Dense baseline: build, save and query an Annoy index through an R-level
# handle, timing the build and the search separately.
#
# Arguments
#   ref_dense    dense numeric reference matrix (one item per row)
#   query_dense  dense query matrix; ignored when `self_search` is TRUE
#   self_search  TRUE to query the index with its own items
#   k, search_k  neighbours per query and Annoy search budget
#   metric       metric passed to annoy_new_r_handle()
#   n_trees      number of trees to build
#   build_seed   seed applied to the handle when not NULL
#   index_path   file the built index is saved to
#   block_size   number of query rows processed per search block
#
# Returns a list with `build_elapsed`, `search_elapsed` (seconds), the
# `index` and `distance` result buffers, and `index_bytes` (size of the saved
# index file).
# NOTE(review): `handle` is presumably an RcppAnnoy-style reference object
# exposing setSeed/addItem/build/save/unload - confirm in annoy_new_r_handle().
benchmark_rcppannoy_build_search <- function(ref_dense,
                                             query_dense,
                                             self_search,
                                             k,
                                             search_k,
                                             metric,
                                             n_trees,
                                             build_seed,
                                             index_path,
                                             block_size) {
    n_ref <- nrow(ref_dense)
    n_query <- if (isTRUE(self_search)) n_ref else nrow(query_dense)
    n_dim <- ncol(ref_dense)

    handle <- annoy_new_r_handle(metric, n_dim)
    # Release the index on every exit path, not only after a successful search:
    # callers delete `index_path` afterwards, so a failed build/search must not
    # leave the handle holding on to it. Unloading stays best-effort.
    on.exit(try(handle$unload(), silent = TRUE), add = TRUE)

    if (!is.null(build_seed)) {
        handle$setSeed(build_seed)
    }

    # Build timing covers item insertion, tree construction and saving to disk.
    build_timing <- system.time({
        for (i in seq_len(n_ref)) {
            # Item ids passed to the handle are zero-based.
            handle$addItem(as.integer(i - 1L), ref_dense[i, ])
        }
        handle$build(n_trees)
        handle$save(index_path)
    })

    buffers <- allocate_search_buffers(n_query, k, stream_index = NULL)
    block_starts <- seq.int(1L, n_query, by = block_size)

    # Search timing covers the blockwise queries and copying into the buffers.
    search_timing <- system.time({
        for (block_id in seq_along(block_starts)) {
            start <- block_starts[block_id]
            stop_row <- min(start + block_size - 1L, n_query)
            rows <- seq.int(start, stop_row)
            block <- if (isTRUE(self_search)) {
                search_block(handle, k = k, search_k = search_k, rows = rows, self_search = TRUE)
            } else {
                search_block(
                    handle,
                    k = k,
                    search_k = search_k,
                    rows = rows,
                    self_search = FALSE,
                    query_block_values = query_dense[rows, , drop = FALSE]
                )
            }

            buffers$index[rows, ] <- block$index
            buffers$distance[rows, ] <- block$distance
        }
    })

    list(
        build_elapsed = benchmark_elapsed(build_timing),
        search_elapsed = benchmark_elapsed(search_timing),
        index = buffers$index,
        distance = buffers$distance,
        index_bytes = benchmark_file_bytes(index_path)
    )
}

#' Benchmark bigANNOY against direct RcppAnnoy
#'
#' @description
#' Run the same Annoy build and search task through `bigANNOY` and through a
#' direct dense `RcppAnnoy` baseline. The comparison reports both speed metrics
#' and data-volume metrics such as reference bytes, query bytes, and generated
#' index size.
#'
#' @inheritParams benchmark_annoy_bigmatrix
#'
#' @return A list with a two-row `summary` data frame, one row for `bigANNOY`
#'   and one for direct `RcppAnnoy`, plus benchmark metadata and any validation
#'   report produced for the `bigANNOY` index. The list elements are `summary`,
#'   `params` (the identifying columns of `summary`), `exact_available`,
#'   `validation`, and `artifacts` (index and metadata file paths).
#' @export
benchmark_annoy_vs_rcppannoy <- function(x = NULL,
                                         query = NULL,
                                         n_ref = 2000L,
                                         n_query = 200L,
                                         n_dim = 20L,
                                         k = 10L,
                                         n_trees = 50L,
                                         metric = "euclidean",
                                         search_k = -1L,
                                         seed = 42L,
                                         build_seed = seed,
                                         build_threads = -1L,
                                         block_size = annoy_default_block_size(),
                                         backend = getOption("bigANNOY.backend", "cpp"),
                                         exact = TRUE,
                                         filebacked = FALSE,
                                         path_dir = tempdir(),
                                         keep_files = FALSE,
                                         output_path = NULL,
                                         load_mode = "eager") {
    # Record whether `query` was omitted: with synthetic data an omitted query
    # yields generated query rows, whereas an explicit `query = NULL` requests
    # a self-search.
    query_missing <- missing(query)
    # The size arguments reuse normalize_n_trees().
    # NOTE(review): presumably it validates a positive integer scalar - confirm.
    n_ref <- normalize_n_trees(n_ref)
    n_query <- normalize_n_trees(n_query)
    n_dim <- normalize_n_trees(n_dim)
    k <- normalize_k(k)
    n_trees <- normalize_n_trees(n_trees)
    metric <- normalize_metric(metric)
    search_k <- normalize_search_k(search_k)
    seed <- normalize_seed(seed)
    build_seed <- normalize_seed(build_seed)
    build_threads <- normalize_build_threads(build_threads)
    block_size <- normalize_block_size(block_size)
    filebacked <- normalize_scalar_logical(filebacked, "filebacked")
    keep_files <- normalize_scalar_logical(keep_files, "keep_files")
    path_dir <- benchmark_existing_dir(path_dir)
    output_path <- benchmark_output_path(output_path)
    load_mode <- normalize_load_mode(load_mode, default = "eager")

    # Only the file-name part is kept; it names the optional file-backed
    # reference files created inside `path_dir`.
    prefix <- basename(tempfile(pattern = "bigannoy-compare-", tmpdir = path_dir))
    dataset <- benchmark_prepare_data(
        x = x,
        query = query,
        query_missing = query_missing,
        n_ref = n_ref,
        n_query = n_query,
        n_dim = n_dim,
        seed = seed,
        filebacked = filebacked,
        path_dir = path_dir,
        prefix = prefix
    )

    validate_search_k(k, dataset$n_ref, self_search = isTRUE(dataset$self_search))

    # Optional exact baseline. It runs once, and its neighbour indices are
    # reused below to score the recall of both implementations.
    exact_elapsed <- NA_real_
    exact_index <- NULL
    exact_used <- benchmark_exact_available(metric, exact)
    if (isTRUE(exact_used)) {
        exact_result <- benchmark_exact_result(
            ref_big = dataset$ref_exact,
            query = dataset$query_exact,
            k = k,
            block_size = block_size
        )
        exact_elapsed <- exact_result$elapsed
        exact_index <- exact_result$result$index
    }

    # Select the requested backend for the rest of this call; the caller's
    # option value is restored on exit.
    old_options <- options(bigANNOY.backend = backend)
    on.exit(options(old_options), add = TRUE)

    big_index_path <- benchmark_make_index_path(path_dir, prefix = "bigannoy-compare-")
    big_metadata_path <- annoy_metadata_path(big_index_path)
    rcpp_index_path <- benchmark_make_index_path(path_dir, prefix = "rcppannoy-compare-")

    # Cleanup is registered before anything is written, so the index and
    # metadata files are removed even if a later step fails.
    if (!isTRUE(keep_files)) {
        on.exit(unlink(c(big_index_path, big_metadata_path, rcpp_index_path), force = TRUE), add = TRUE)
    }

    big_build_timing <- system.time({
        big_index <- annoy_build_bigmatrix(
            dataset$ref_annoy,
            path = big_index_path,
            n_trees = n_trees,
            metric = metric,
            seed = build_seed,
            build_threads = build_threads,
            block_size = block_size,
            load_mode = load_mode
        )
    })

    # Validation sits between the two timed sections, so its cost is counted
    # in neither the build nor the search timing.
    validation <- annoy_validate_index(big_index, strict = TRUE, load = TRUE)

    big_search_timing <- system.time({
        big_result <- annoy_search_bigmatrix(
            big_index,
            query = dataset$query_annoy,
            k = k,
            search_k = search_k,
            block_size = block_size
        )
    })

    # The dense baseline times its own build and search internally.
    rcpp_result <- benchmark_rcppannoy_build_search(
        ref_dense = dataset$ref_dense,
        query_dense = dataset$query_dense,
        self_search = dataset$self_search,
        k = k,
        search_k = search_k,
        metric = metric,
        n_trees = n_trees,
        build_seed = build_seed,
        index_path = rcpp_index_path,
        block_size = block_size
    )

    # Recall stays NA when no exact baseline was computed.
    big_recall <- if (is.null(exact_index)) NA_real_ else benchmark_recall_at_k(big_result$index, exact_index, k)
    rcpp_recall <- if (is.null(exact_index)) NA_real_ else benchmark_recall_at_k(rcpp_result$index, exact_index, k)

    # One row per implementation; both rows share the dataset description and
    # the exact-baseline timing.
    summary <- rbind(
        benchmark_summary_row(
            implementation = "bigANNOY",
            backend = big_index$build_backend,
            reference_storage = dataset$ref_storage,
            metric = metric,
            self_search = dataset$self_search,
            load_mode = big_index$load_mode,
            n_ref = dataset$n_ref,
            n_query = dataset$n_query,
            n_dim = dataset$n_dim,
            k = k,
            n_trees = n_trees,
            search_k = search_k,
            build_threads = build_threads,
            ref_bytes = dataset$ref_bytes,
            query_bytes = dataset$query_bytes,
            index_bytes = benchmark_file_bytes(big_index_path),
            metadata_bytes = benchmark_file_bytes(big_metadata_path),
            build_elapsed = benchmark_elapsed(big_build_timing),
            search_elapsed = benchmark_elapsed(big_search_timing),
            exact_elapsed = exact_elapsed,
            recall_at_k = big_recall,
            index_id = big_index$index_id
        ),
        benchmark_summary_row(
            implementation = "RcppAnnoy",
            backend = "dense",
            reference_storage = "dense_matrix",
            metric = metric,
            self_search = dataset$self_search,
            load_mode = "in_memory",
            n_ref = dataset$n_ref,
            n_query = dataset$n_query,
            n_dim = dataset$n_dim,
            k = k,
            n_trees = n_trees,
            search_k = search_k,
            build_threads = NA_integer_,
            ref_bytes = dataset$ref_bytes,
            query_bytes = dataset$query_bytes,
            index_bytes = rcpp_result$index_bytes,
            metadata_bytes = 0,
            build_elapsed = rcpp_result$build_elapsed,
            search_elapsed = rcpp_result$search_elapsed,
            exact_elapsed = exact_elapsed,
            recall_at_k = rcpp_recall,
            index_id = NA_character_
        )
    )

    # Written only when `output_path` is not NULL.
    benchmark_write_summary(summary, output_path)

    list(
        summary = summary,
        params = summary[, c("implementation", "backend", "reference_storage", "metric", "self_search", "load_mode", "n_ref", "n_query", "n_dim", "k", "n_trees", "search_k"), drop = FALSE],
        exact_available = exact_used,
        validation = validation,
        artifacts = list(
            bigannoy = list(index_path = big_index_path, metadata_path = big_metadata_path),
            rcppannoy = list(index_path = rcpp_index_path)
        )
    )
}

#' Benchmark scaling across data volumes for bigANNOY and direct RcppAnnoy
#'
#' @description
#' Run `benchmark_annoy_vs_rcppannoy()` over a grid of synthetic data sizes to
#' study how build time, search time, and index size scale with data volume.
#'
#' @param n_ref Integer vector of synthetic reference row counts.
#' @param n_query Integer vector of synthetic query row counts.
#' @param n_dim Integer vector of synthetic column counts.
#' @inheritParams benchmark_annoy_vs_rcppannoy
#'
#' @return A list with a `summary` data frame containing one row per
#'   implementation and data-volume combination, and `exact_available`, which
#'   is `TRUE` only when every run in the grid actually used the exact
#'   baseline.
#' @export
benchmark_annoy_volume_suite <- function(n_ref = c(2000L, 5000L, 10000L),
                                         n_query = 200L,
                                         n_dim = c(20L, 50L),
                                         k = 10L,
                                         n_trees = 50L,
                                         metric = "euclidean",
                                         search_k = -1L,
                                         seed = 42L,
                                         build_seed = seed,
                                         build_threads = -1L,
                                         block_size = annoy_default_block_size(),
                                         backend = getOption("bigANNOY.backend", "cpp"),
                                         exact = FALSE,
                                         filebacked = FALSE,
                                         path_dir = tempdir(),
                                         keep_files = FALSE,
                                         output_path = NULL,
                                         load_mode = "eager") {
    # Each grid axis is validated element by element.
    n_ref <- vapply(as.list(n_ref), normalize_n_trees, integer(1L))
    n_query <- vapply(as.list(n_query), normalize_n_trees, integer(1L))
    n_dim <- vapply(as.list(n_dim), normalize_n_trees, integer(1L))
    k <- normalize_k(k)
    n_trees <- normalize_n_trees(n_trees)
    metric <- normalize_metric(metric)
    search_k <- normalize_search_k(search_k)
    seed <- normalize_seed(seed)
    build_seed <- normalize_seed(build_seed)
    build_threads <- normalize_build_threads(build_threads)
    block_size <- normalize_block_size(block_size)
    filebacked <- normalize_scalar_logical(filebacked, "filebacked")
    keep_files <- normalize_scalar_logical(keep_files, "keep_files")
    path_dir <- benchmark_existing_dir(path_dir)
    output_path <- benchmark_output_path(output_path)
    load_mode <- normalize_load_mode(load_mode, default = "eager")

    grid_size <- length(n_ref) * length(n_query) * length(n_dim)
    rows <- vector("list", grid_size)
    exact_flags <- logical(grid_size)
    row_id <- 0L

    # Iteration order (reference rows, then query rows, then dimensions)
    # determines the row order of the combined summary.
    for (ref_rows in n_ref) {
        for (query_rows in n_query) {
            for (dims in n_dim) {
                row_id <- row_id + 1L
                current <- benchmark_annoy_vs_rcppannoy(
                    n_ref = ref_rows,
                    n_query = query_rows,
                    n_dim = dims,
                    k = k,
                    n_trees = n_trees,
                    metric = metric,
                    search_k = search_k,
                    seed = seed,
                    build_seed = build_seed,
                    build_threads = build_threads,
                    block_size = block_size,
                    backend = backend,
                    exact = exact,
                    filebacked = filebacked,
                    path_dir = path_dir,
                    keep_files = keep_files,
                    # Individual runs never write; only the combined summary does.
                    output_path = NULL,
                    load_mode = load_mode
                )
                rows[[row_id]] <- current$summary
                exact_flags[row_id] <- isTRUE(current$exact_available)
            }
        }
    }

    summary <- do.call(rbind, rows)
    benchmark_write_summary(summary, output_path)

    list(
        summary = summary,
        # Report what the runs actually did instead of re-deriving availability
        # with a separate package lookup, which could disagree with the check
        # each run performs.
        exact_available = grid_size > 0L && all(exact_flags)
    )
}

#' Benchmark a single bigANNOY build/search configuration
#'
#' @description
#' Build or reuse a benchmark reference dataset, create an Annoy index, query
#' it, and optionally compare recall against the exact `bigKNN` Euclidean
#' baseline.
#'
#' @param x Optional benchmark reference input. Supply `NULL` to generate a
#'   synthetic reference matrix, or provide a numeric matrix, `big.matrix`,
#'   descriptor, descriptor path, or external pointer.
#' @param query Optional benchmark query input. Supply `NULL` for self-search,
#'   or provide a numeric matrix, `big.matrix`, descriptor, descriptor path, or
#'   external pointer.
#' @param n_ref Number of synthetic reference rows to generate when `x = NULL`.
#' @param n_query Number of synthetic query rows to generate when `x = NULL` and
#'   `query` is not `NULL`.
#' @param n_dim Number of synthetic columns to generate when `x = NULL`.
#' @param k Number of neighbours to return.
#' @param n_trees Number of Annoy trees to build.
#' @param metric Annoy metric. One of `"euclidean"`, `"angular"`,
#'   `"manhattan"`, or `"dot"`.
#' @param search_k Annoy search budget.
#' @param seed Random seed used for synthetic data generation and, by default,
#'   for the Annoy build seed.
#' @param build_seed Optional Annoy build seed. Defaults to `seed`.
#' @param build_threads Native Annoy build-thread setting.
#' @param block_size Build/search block size.
#' @param backend Requested bigANNOY backend.
#' @param exact Logical flag controlling whether to benchmark the exact
#'   Euclidean baseline with `bigKNN` when available.
#' @param filebacked Logical flag; if `TRUE`, synthetic or dense reference
#'   inputs are converted into file-backed `big.matrix` objects before build.
#' @param path_dir Directory where temporary Annoy and optional file-backed
#'   benchmark files should be written.
#' @param keep_files Logical flag; if `TRUE`, leave the generated Annoy index on
#'   disk after the benchmark finishes.
#' @param output_path Optional CSV path where the benchmark summary should be
#'   written.
#' @param load_mode Whether the benchmarked index should be returned
#'   metadata-only until first search (`"lazy"`) or eagerly loaded once built
#'   (`"eager"`).
#'
#' @return A list with a one-row `summary` data frame plus the benchmark
#'   parameters and generated Annoy file paths.
#' @export
benchmark_annoy_bigmatrix <- function(x = NULL,
                                      query = NULL,
                                      n_ref = 2000L,
                                      n_query = 200L,
                                      n_dim = 20L,
                                      k = 10L,
                                      n_trees = 50L,
                                      metric = "euclidean",
                                      search_k = -1L,
                                      seed = 42L,
                                      build_seed = seed,
                                      build_threads = -1L,
                                      block_size = annoy_default_block_size(),
                                      backend = getOption("bigANNOY.backend", "cpp"),
                                      exact = TRUE,
                                      filebacked = FALSE,
                                      path_dir = tempdir(),
                                      keep_files = FALSE,
                                      output_path = NULL,
                                      load_mode = "eager") {
    query_missing <- missing(query)
    n_ref <- normalize_n_trees(n_ref)
    n_query <- normalize_n_trees(n_query)
    n_dim <- normalize_n_trees(n_dim)
    k <- normalize_k(k)
    n_trees <- normalize_n_trees(n_trees)
    metric <- normalize_metric(metric)
    search_k <- normalize_search_k(search_k)
    seed <- normalize_seed(seed)
    build_seed <- normalize_seed(build_seed)
    build_threads <- normalize_build_threads(build_threads)
    block_size <- normalize_block_size(block_size)
    filebacked <- normalize_scalar_logical(filebacked, "filebacked")
    keep_files <- normalize_scalar_logical(keep_files, "keep_files")
    path_dir <- benchmark_existing_dir(path_dir)
    output_path <- benchmark_output_path(output_path)
    load_mode <- normalize_load_mode(load_mode, default = "eager")

    prefix <- basename(tempfile(pattern = "bigannoy-benchmark-", tmpdir = path_dir))
    dataset <- benchmark_prepare_data(
        x = x,
        query = query,
        query_missing = query_missing,
        n_ref = n_ref,
        n_query = n_query,
        n_dim = n_dim,
        seed = seed,
        filebacked = filebacked,
        path_dir = path_dir,
        prefix = prefix
    )

    validate_search_k(k, dataset$n_ref, self_search = isTRUE(dataset$self_search))
    old_options <- options(bigANNOY.backend = backend)
    on.exit(options(old_options), add = TRUE)

    index_path <- benchmark_make_index_path(path_dir, prefix = "bigannoy-benchmark-")
    metadata_path <- annoy_metadata_path(index_path)

    if (!isTRUE(keep_files)) {
        on.exit(unlink(c(index_path, metadata_path), force = TRUE), add = TRUE)
    }

    build_timing <- system.time({
        index <- annoy_build_bigmatrix(
            dataset$ref_annoy,
            path = index_path,
            n_trees = n_trees,
            metric = metric,
            seed = build_seed,
            build_threads = build_threads,
            block_size = block_size,
            load_mode = load_mode
        )
    })

    validation <- annoy_validate_index(index, strict = TRUE, load = TRUE)

    search_timing <- system.time({
        result <- annoy_search_bigmatrix(
            index,
            query = dataset$query_annoy,
            k = k,
            search_k = search_k,
            block_size = block_size
        )
    })

    exact_elapsed <- NA_real_
    recall_at_k <- NA_real_
    exact_used <- benchmark_exact_available(metric, exact)

    if (isTRUE(exact_used)) {
        exact_result <- benchmark_exact_result(
            ref_big = dataset$ref_exact,
            query = dataset$query_exact,
            k = k,
            block_size = block_size
        )
        exact_elapsed <- exact_result$elapsed
        recall_at_k <- benchmark_recall_at_k(result$index, exact_result$result$index, k)
    }

    summary <- data.frame(
        metric = metric,
        backend = index$build_backend,
        filebacked = filebacked,
        self_search = isTRUE(dataset$self_search),
        load_mode = index$load_mode,
        n_ref = dataset$n_ref,
        n_query = dataset$n_query,
        n_dim = dataset$n_dim,
        k = k,
        n_trees = n_trees,
        search_k = search_k,
        build_threads = build_threads,
        build_elapsed = benchmark_elapsed(build_timing),
        search_elapsed = benchmark_elapsed(search_timing),
        exact_elapsed = exact_elapsed,
        recall_at_k = recall_at_k,
        index_id = index$index_id,
        stringsAsFactors = FALSE
    )

    benchmark_write_summary(summary, output_path)

    list(
        summary = summary,
        params = summary[1L, c("metric", "backend", "filebacked", "self_search", "load_mode", "n_ref", "n_query", "n_dim", "k", "n_trees", "search_k", "build_threads"), drop = FALSE],
        index_path = index_path,
        metadata_path = metadata_path,
        exact_available = exact_used,
        validation = validation
    )
}

#' Benchmark a recall suite across multiple Annoy configurations
#'
#' @description
#' Run a grid of `n_trees` and `search_k` settings on the same benchmark
#' dataset, optionally recording recall against the exact `bigKNN` Euclidean
#' baseline. One index is built per `n_trees` value and reused for every
#' `search_k` value, so rows that share an `n_trees` value also share the same
#' `build_elapsed`. The exact baseline, when used, is computed once and shared
#' by every row.
#'
#' @inheritParams benchmark_annoy_bigmatrix
#' @param n_trees Integer vector of Annoy tree counts to benchmark.
#' @param search_k Integer vector of Annoy search budgets to benchmark.
#'
#' @return A list with a `summary` data frame containing one row per
#'   `(n_trees, search_k)` configuration, and `exact_available`, which reports
#'   whether the exact baseline was used for the `recall_at_k` column.
#' @export
benchmark_annoy_recall_suite <- function(x = NULL,
                                         query = NULL,
                                         n_ref = 2000L,
                                         n_query = 200L,
                                         n_dim = 20L,
                                         k = 10L,
                                         n_trees = c(10L, 50L, 100L),
                                         search_k = c(-1L, 1000L, 5000L),
                                         metric = "euclidean",
                                         seed = 42L,
                                         build_seed = seed,
                                         build_threads = -1L,
                                         block_size = annoy_default_block_size(),
                                         backend = getOption("bigANNOY.backend", "cpp"),
                                         exact = TRUE,
                                         filebacked = FALSE,
                                         path_dir = tempdir(),
                                         keep_files = FALSE,
                                         output_path = NULL,
                                         load_mode = "eager") {
    # Must be captured before `query` is touched by anything else.
    query_missing <- missing(query)

    # Scalar arguments are validated with the shared normalizers.
    n_ref <- normalize_n_trees(n_ref)
    n_query <- normalize_n_trees(n_query)
    n_dim <- normalize_n_trees(n_dim)
    k <- normalize_k(k)
    metric <- normalize_metric(metric)
    seed <- normalize_seed(seed)
    build_seed <- normalize_seed(build_seed)
    build_threads <- normalize_build_threads(build_threads)
    block_size <- normalize_block_size(block_size)
    filebacked <- normalize_scalar_logical(filebacked, "filebacked")
    keep_files <- normalize_scalar_logical(keep_files, "keep_files")
    path_dir <- benchmark_existing_dir(path_dir)
    output_path <- benchmark_output_path(output_path)
    load_mode <- normalize_load_mode(load_mode, default = "eager")

    # Grid arguments are vectors: validate element by element.
    n_trees <- vapply(as.list(n_trees), normalize_n_trees, integer(1L))
    search_k <- vapply(as.list(search_k), normalize_search_k, integer(1L))

    prefix <- basename(tempfile(pattern = "bigannoy-suite-", tmpdir = path_dir))
    dataset <- benchmark_prepare_data(
        x = x,
        query = query,
        query_missing = query_missing,
        n_ref = n_ref,
        n_query = n_query,
        n_dim = n_dim,
        seed = seed,
        filebacked = filebacked,
        path_dir = path_dir,
        prefix = prefix
    )

    validate_search_k(k, dataset$n_ref, self_search = isTRUE(dataset$self_search))

    # Select the requested backend for the duration of this call only.
    old_options <- options(bigANNOY.backend = backend)
    on.exit(options(old_options), add = TRUE)

    # The exact baseline does not depend on the grid, so compute it once.
    exact_elapsed <- NA_real_
    exact_index <- NULL
    exact_used <- benchmark_exact_available(metric, exact)
    if (isTRUE(exact_used)) {
        exact_result <- benchmark_exact_result(
            ref_big = dataset$ref_exact,
            query = dataset$query_exact,
            k = k,
            block_size = block_size
        )
        exact_elapsed <- exact_result$elapsed
        exact_index <- exact_result$result$index
    }

    # `on.exit()` stores its expression unevaluated and runs it when the
    # function exits. Registering `unlink(c(index_path, metadata_path))` inside
    # the loop would therefore only ever delete the files of the LAST tree
    # count. Instead, accumulate every path here and let a single handler read
    # the full vector at exit time.
    cleanup_paths <- character()
    if (!isTRUE(keep_files)) {
        on.exit(unlink(cleanup_paths, force = TRUE), add = TRUE)
    }

    self_search <- isTRUE(dataset$self_search)
    rows <- vector("list", length(n_trees) * length(search_k))
    row_id <- 0L

    for (tree_count in n_trees) {
        index_path <- benchmark_make_index_path(path_dir, prefix = sprintf("bigannoy-suite-%d-", tree_count))
        metadata_path <- annoy_metadata_path(index_path)
        # Record the paths before building so that a failed build is cleaned
        # up as well.
        cleanup_paths <- c(cleanup_paths, index_path, metadata_path)

        build_timing <- system.time({
            index <- annoy_build_bigmatrix(
                dataset$ref_annoy,
                path = index_path,
                n_trees = tree_count,
                metric = metric,
                seed = build_seed,
                build_threads = build_threads,
                block_size = block_size,
                load_mode = load_mode
            )
        })
        build_elapsed <- benchmark_elapsed(build_timing)

        # Called for its checks only; the returned report is not used here.
        # NOTE(review): with `strict = TRUE` this is expected to signal an
        # error for a bad index -- confirm against annoy_validate_index().
        annoy_validate_index(index, strict = TRUE, load = TRUE)

        # Reuse the index built above for every search budget.
        for (search_budget in search_k) {
            row_id <- row_id + 1L
            search_timing <- system.time({
                result <- annoy_search_bigmatrix(
                    index,
                    query = dataset$query_annoy,
                    k = k,
                    search_k = search_budget,
                    block_size = block_size
                )
            })

            recall_at_k <- if (is.null(exact_index)) {
                NA_real_
            } else {
                benchmark_recall_at_k(result$index, exact_index, k)
            }

            rows[[row_id]] <- data.frame(
                metric = metric,
                backend = index$build_backend,
                filebacked = filebacked,
                self_search = self_search,
                load_mode = index$load_mode,
                n_ref = dataset$n_ref,
                n_query = dataset$n_query,
                n_dim = dataset$n_dim,
                k = k,
                n_trees = tree_count,
                search_k = search_budget,
                build_threads = build_threads,
                build_elapsed = build_elapsed,
                search_elapsed = benchmark_elapsed(search_timing),
                exact_elapsed = exact_elapsed,
                recall_at_k = recall_at_k,
                index_id = index$index_id,
                stringsAsFactors = FALSE
            )
        }
    }

    summary <- do.call(rbind, rows)
    benchmark_write_summary(summary, output_path)

    list(
        summary = summary,
        exact_available = exact_used
    )
}

Try the bigANNOY package in your browser

Any scripts or data that you put into this service are public.

bigANNOY documentation built on April 1, 2026, 9:07 a.m.