# tests/testthat/test-annoy.R

skip_if_not_installed("bigmemory")
skip_if_not_installed("RcppAnnoy")

library(bigmemory)

# Exact k-nearest-neighbour reference (Euclidean metric) used to check results.
#
# For every row of `query`, computes the Euclidean distance to every row of
# `ref` and keeps the `k` closest rows. Ties are broken by ascending `ref` row
# number.
#
# Arguments:
#   ref          numeric matrix of reference points, one point per row.
#   query        numeric matrix of query points; defaults to `ref`.
#   k            number of neighbours to return per query row.
#   exclude_self when TRUE and `query` is identical to `ref`, row i is never
#                returned as its own neighbour.
#
# Returns a list with `index` (integer matrix, nrow(query) x k) and `distance`
# (double matrix of the same shape).
#
# Errors when `ref` and `query` differ in column count, or when `k` is larger
# than the number of candidate rows (which would otherwise yield NA rows or the
# excluded self row at distance Inf).
brute_force_knn_euclidean <- function(ref, query = ref, k, exclude_self = identical(ref, query)) {
    n_ref <- nrow(ref)
    n_query <- nrow(query)

    if (ncol(ref) != ncol(query)) {
        stop("`ref` and `query` must have the same number of columns", call. = FALSE)
    }

    # Loop-invariant: self-exclusion only applies when searching `ref` against
    # itself, so decide it once instead of on every iteration.
    drop_self <- exclude_self && identical(ref, query)

    max_k <- n_ref - as.integer(drop_self)
    if (k > max_k) {
        stop(
            "`k` (", k, ") exceeds the ", max_k, " candidate reference rows available",
            call. = FALSE
        )
    }

    index <- matrix(NA_integer_, nrow = n_query, ncol = k)
    distance <- matrix(NA_real_, nrow = n_query, ncol = k)

    # Transpose once so each query row recycles down the columns (one column
    # per reference point).
    ref_t <- t(ref)

    for (i in seq_len(n_query)) {
        dists <- colSums((ref_t - query[i, ])^2)
        if (drop_self) {
            dists[i] <- Inf
        }
        # Secondary key makes tie-breaking explicit: lower row number wins.
        order_idx <- order(dists, seq_len(n_ref))
        keep <- order_idx[seq_len(k)]
        index[i, ] <- keep
        distance[i, ] <- sqrt(dists[keep])
    }

    list(index = index, distance = distance)
}

# Create a file-backed big.matrix inside `backingpath` and fill it with
# `values`. The backing and descriptor files are named "<name>.bin" and
# "<name>.desc". Returns the populated big.matrix.
make_filebacked_matrix <- function(values, type, backingpath, name) {
    backing_name <- sprintf("%s.bin", name)
    descriptor_name <- sprintf("%s.desc", name)
    shape <- dim(values)

    fb <- filebacked.big.matrix(
        nrow = shape[1L],
        ncol = shape[2L],
        type = type,
        backingfile = backing_name,
        descriptorfile = descriptor_name,
        backingpath = backingpath
    )
    # Copy the whole dense matrix into the file-backed storage.
    fb[, ] <- values
    fb
}

# Builds a small lazy index and checks the persisted metadata fields, the
# validate/open/load round-trip, and the load state of each handle.
test_that("build metadata includes v3 fields and validate/open round-trip works", {
    # Four corners of a 10 x 10 square, one point per row.
    corners <- matrix(
        c(0, 0,
          10, 0,
          0, 10,
          10, 10),
        nrow = 4,
        byrow = TRUE
    )
    big_corners <- as.big.matrix(corners)
    ann_path <- tempfile(fileext = ".ann")

    built <- annoy_build_bigmatrix(big_corners, path = ann_path, n_trees = 20, seed = 77L, load_mode = "lazy")
    # Checked before validation, which is asked to load the index.
    expect_false(annoy_is_loaded(built))

    report <- annoy_validate_index(built, strict = TRUE, load = TRUE)
    eager_handle <- annoy_open_index(ann_path, load_mode = "eager")
    lazy_handle <- annoy_load_bigmatrix(ann_path, load_mode = "lazy")
    truth <- brute_force_knn_euclidean(corners, k = 2, exclude_self = TRUE)
    meta <- read.dcf(built$metadata_path)
    required_fields <- c("index_id", "file_size", "file_mtime", "file_md5", "load_mode")

    expect_s3_class(built, "bigannoy_index")
    expect_true(file.exists(ann_path))
    expect_true(file.exists(built$metadata_path))
    expect_true(report$valid)
    expect_true(is.character(built$index_id) && nzchar(built$index_id))
    expect_true(is.numeric(built$file_size) && built$file_size > 0)
    expect_true(is.character(built$file_md5) && nzchar(built$file_md5))
    expect_identical(built$load_mode, "lazy")

    # Load state per handle: validated-with-load, eager open, lazy load.
    expect_true(annoy_is_loaded(built))
    expect_true(annoy_is_loaded(eager_handle))
    expect_false(annoy_is_loaded(lazy_handle))

    # Self-search through both reopened handles must match the exact answer.
    eager_hits <- annoy_search_bigmatrix(eager_handle, k = 2)
    expect_equal(eager_hits$index, truth$index)
    lazy_hits <- annoy_search_bigmatrix(lazy_handle, k = 2)
    expect_equal(lazy_hits$distance, truth$distance, tolerance = 1e-12)

    expect_true(all(required_fields %in% colnames(meta)))
})

# Walks a lazy index through its load states: unloaded after build, loaded by
# the first search, unloaded by close, and loaded again by a second search
# that must reproduce the first result.
test_that("search lazily loads, close unloads, and repeated search reloads successfully", {
    square <- matrix(c(0, 0, 2, 0, 0, 2, 2, 2), ncol = 2, byrow = TRUE)
    big_square <- as.big.matrix(square)
    handle <- annoy_build_bigmatrix(big_square, tempfile(fileext = ".ann"), n_trees = 20, load_mode = "lazy")

    expect_false(annoy_is_loaded(handle))
    before_close <- annoy_search_bigmatrix(handle, k = 2)
    expect_true(annoy_is_loaded(handle))

    annoy_close_index(handle)
    expect_false(annoy_is_loaded(handle))

    after_close <- annoy_search_bigmatrix(handle, k = 2)
    expect_true(annoy_is_loaded(handle))
    expect_equal(after_close$index, before_close$index)
    expect_equal(after_close$distance, before_close$distance, tolerance = 1e-12)
})

# Runs the same query through four input forms (dense matrix, big.matrix,
# descriptor object, descriptor file path) and checks they agree with each
# other and with the exact brute-force answer.
test_that("dense, big.matrix, descriptor object, and descriptor path queries all work", {
    work_dir <- tempfile("bigannoy-descriptor-")
    dir.create(work_dir, recursive = TRUE)

    ref_points <- matrix(
        c(0, 0,
          5, 0,
          0, 5,
          5, 5,
          9, 9),
        nrow = 5,
        byrow = TRUE
    )
    query_points <- matrix(
        c(0.2, 0.1,
          4.7, 5.1),
        nrow = 2,
        byrow = TRUE
    )

    ref_big <- make_filebacked_matrix(ref_points, type = "double", backingpath = work_dir, name = "ref")
    query_big <- make_filebacked_matrix(query_points, type = "double", backingpath = work_dir, name = "query")
    query_descriptor <- describe(query_big)
    # Descriptor file written by make_filebacked_matrix for name = "query".
    query_descriptor_file <- file.path(work_dir, "query.desc")

    ann_path <- tempfile(tmpdir = work_dir, fileext = ".ann")
    handle <- annoy_build_bigmatrix(describe(ref_big), ann_path, n_trees = 25, seed = 99L)

    from_dense <- annoy_search_bigmatrix(handle, query = query_points, k = 2)
    from_big <- annoy_search_bigmatrix(handle, query = query_big, k = 2)
    from_descriptor <- annoy_search_bigmatrix(handle, query = query_descriptor, k = 2)
    from_path <- annoy_search_bigmatrix(handle, query = query_descriptor_file, k = 2)
    truth <- brute_force_knn_euclidean(ref_points, query = query_points, k = 2, exclude_self = FALSE)

    expect_equal(from_dense$index, truth$index)
    expect_equal(from_dense$distance, truth$distance, tolerance = 1e-6)
    expect_equal(from_big$index, from_dense$index)
    expect_equal(from_descriptor$distance, from_dense$distance, tolerance = 1e-6)
    expect_equal(from_path$index, from_dense$index)
})

# Streams search results into pre-allocated file-backed matrices, passing one
# destination as a descriptor object and the other as a descriptor file path,
# then compares the stored values with an ordinary in-memory search.
test_that("streaming outputs accept descriptor objects and descriptor paths", {
    work_dir <- tempfile("bigannoy-stream-desc-")
    dir.create(work_dir, recursive = TRUE)

    ref_points <- matrix(
        c(1, 1,
          8, 1,
          1, 8,
          8, 8),
        nrow = 4,
        byrow = TRUE
    )
    query_points <- matrix(
        c(2, 2,
          7, 7),
        nrow = 2,
        byrow = TRUE
    )

    ref_big <- make_filebacked_matrix(ref_points, type = "double", backingpath = work_dir, name = "ref")
    query_big <- make_filebacked_matrix(query_points, type = "double", backingpath = work_dir, name = "query")

    # Build from the reference descriptor file rather than the object.
    ref_descriptor_file <- file.path(work_dir, "ref.desc")
    ann_path <- tempfile(tmpdir = work_dir, fileext = ".ann")
    handle <- annoy_build_bigmatrix(ref_descriptor_file, ann_path, n_trees = 20, seed = 101L)
    in_memory <- annoy_search_bigmatrix(handle, query = query_points, k = 2)

    # Destinations sized as one row per query and one column per neighbour.
    n_queries <- nrow(query_points)
    neighbour_store <- filebacked.big.matrix(
        nrow = n_queries,
        ncol = 2,
        type = "integer",
        backingfile = "index.bin",
        descriptorfile = "index.desc",
        backingpath = work_dir
    )
    dist_store <- filebacked.big.matrix(
        nrow = n_queries,
        ncol = 2,
        type = "double",
        backingfile = "distance.bin",
        descriptorfile = "distance.desc",
        backingpath = work_dir
    )

    distance_descriptor_file <- file.path(work_dir, "distance.desc")
    streamed <- annoy_search_bigmatrix(
        handle,
        query = describe(query_big),
        k = 2,
        xpIndex = describe(neighbour_store),
        xpDistance = distance_descriptor_file
    )

    expect_equal(bigmemory::as.matrix(neighbour_store), in_memory$index)
    expect_equal(bigmemory::as.matrix(dist_store), in_memory$distance, tolerance = 1e-12)
    expect_type(streamed, "list")
})

# Builds and searches an index for every supported metric with the "r"
# backend and, when the compiled entry points are registered, checks that the
# "cpp" backend returns the same neighbours and distances.
test_that("all supported metrics build and search, and native and debug backends agree", {
    ref <- matrix(
        c(1, 2,
          2, 1,
          4, 3,
          3, 5,
          7, 2),
        ncol = 2,
        byrow = TRUE
    )
    query <- matrix(
        c(1.1, 2.2,
          3.9, 3.2),
        ncol = 2,
        byrow = TRUE
    )
    metrics <- c("euclidean", "angular", "manhattan", "dot")

    # Capture the caller's backend option exactly once. A save/restore pair
    # registered per iteration would capture the "cpp" value left by the
    # previous iteration; on.exit(add = TRUE) handlers run in registration
    # order, so the last one would leave the option stuck on "cpp".
    old_options <- options(bigANNOY.backend = "r")
    on.exit(options(old_options), add = TRUE)

    # Loop-invariant: both compiled entry points must be registered before the
    # "cpp" backend is exercised.
    has_native <- isTRUE(is.loaded("_bigANNOY_cpp_annoy_open_index", PACKAGE = "bigANNOY")) &&
        isTRUE(is.loaded("_bigANNOY_cpp_annoy_handle_search", PACKAGE = "bigANNOY"))

    for (metric in metrics) {
        # The previous iteration may have switched to "cpp"; start from "r".
        options(bigANNOY.backend = "r")
        r_index <- suppressWarnings(
            annoy_build_bigmatrix(as.big.matrix(ref), tempfile(fileext = ".ann"), n_trees = 30, metric = metric, seed = 55L)
        )
        r_result <- suppressWarnings(
            annoy_search_bigmatrix(r_index, query = query, k = 2, search_k = 200L)
        )

        expect_identical(r_result$metric, metric)
        expect_identical(dim(r_result$index), c(nrow(query), 2L))
        expect_identical(dim(r_result$distance), c(nrow(query), 2L))

        if (has_native) {
            options(bigANNOY.backend = "cpp")
            cpp_index <- annoy_build_bigmatrix(as.big.matrix(ref), tempfile(fileext = ".ann"), n_trees = 30, metric = metric, seed = 55L)
            cpp_result <- annoy_search_bigmatrix(cpp_index, query = query, k = 2, search_k = 200L)

            expect_equal(cpp_result$index, r_result$index)
            expect_equal(cpp_result$distance, r_result$distance, tolerance = 1e-6)
        }
    }
})

# Exercises the error paths: k too large, wrong query width, missing index
# file, wrongly typed streaming destinations, and a metadata checksum that no
# longer matches the index file.
test_that("validation catches impossible k, bad dimensions, missing files, and corrupted metadata", {
    work_dir <- tempfile("bigannoy-validate-")
    dir.create(work_dir, recursive = TRUE)

    two_points <- matrix(c(0, 0, 1, 0), ncol = 2, byrow = TRUE)
    big_points <- as.big.matrix(two_points)
    ann_path <- file.path(work_dir, "index.ann")
    handle <- annoy_build_bigmatrix(big_points, ann_path, n_trees = 10)

    expect_error(annoy_search_bigmatrix(handle, k = 2), "`k` exceeds")

    one_column_query <- matrix(1, ncol = 1)
    expect_error(
        annoy_search_bigmatrix(handle, query = one_column_query, k = 1),
        "same number of columns"
    )

    missing_path <- tempfile(fileext = ".ann")
    expect_error(
        annoy_open_index(missing_path),
        "does not exist"
    )

    # Destinations with deliberately swapped storage types.
    double_typed_index <- big.matrix(1, 1, type = "double")
    integer_typed_distance <- big.matrix(1, 1, type = "integer")
    expect_error(
        annoy_search_bigmatrix(handle, query = matrix(c(0, 0), ncol = 2), k = 1, xpIndex = double_typed_index),
        "`xpIndex` big.matrix must store integers"
    )
    expect_error(
        annoy_search_bigmatrix(
            handle,
            query = matrix(c(0, 0), ncol = 2),
            k = 1,
            xpIndex = big.matrix(1, 1, type = "integer"),
            xpDistance = integer_typed_distance
        ),
        "`xpDistance` big.matrix must store doubles"
    )

    # Overwrite the stored checksum so it no longer matches the index file.
    meta <- read.dcf(handle$metadata_path)
    meta[1L, "file_md5"] <- "corrupted"
    meta_frame <- as.data.frame(meta, stringsAsFactors = FALSE)
    write.dcf(meta_frame, file = handle$metadata_path)

    tampered <- annoy_open_index(ann_path, load_mode = "lazy")
    lenient_report <- annoy_validate_index(tampered, strict = FALSE, load = FALSE)

    expect_false(lenient_report$valid)
    expect_error(annoy_validate_index(tampered, strict = TRUE, load = FALSE), "checksum")
})

# Both entry points must refuse non-finite data: NA in the reference matrix at
# build time, Inf in the query at search time.
test_that("non-finite build and query inputs are rejected", {
    with_na <- matrix(c(0, 0, NA, 1), ncol = 2, byrow = TRUE)
    big_with_na <- as.big.matrix(with_na)
    expect_error(
        annoy_build_bigmatrix(big_with_na, tempfile(fileext = ".ann")),
        "contains non-finite values"
    )

    diagonal <- matrix(c(0, 0, 1, 1, 2, 2), ncol = 2, byrow = TRUE)
    big_diagonal <- as.big.matrix(diagonal)
    handle <- annoy_build_bigmatrix(big_diagonal, tempfile(fileext = ".ann"), n_trees = 10)
    inf_query <- matrix(c(Inf, 0), ncol = 2)
    expect_error(
        annoy_search_bigmatrix(handle, query = inf_query, k = 1),
        "contains non-finite values"
    )
})

# Builds an index from a file-backed reference, reopens it from disk, and
# checks that a separated-column query gives the same answer through the
# original and the reopened handle (descriptor vs. raw address inputs).
test_that("file-backed reopen and separated-column query matrices behave across sessions", {
    work_dir <- tempfile("bigannoy-reopen-")
    dir.create(work_dir, recursive = TRUE)

    # Random data: reference drawn first, then queries.
    ref_points <- matrix(rnorm(60), nrow = 20, ncol = 3)
    query_points <- matrix(rnorm(15), nrow = 5, ncol = 3)

    ref_big <- make_filebacked_matrix(ref_points, type = "double", backingpath = work_dir, name = "ref_large")
    separated_query <- big.matrix(nrow(query_points), ncol(query_points), type = "double", separated = TRUE)
    separated_query[, ] <- query_points

    ann_path <- file.path(work_dir, "persist.ann")
    ref_descriptor_file <- file.path(work_dir, "ref_large.desc")
    handle <- annoy_build_bigmatrix(ref_descriptor_file, path = ann_path, n_trees = 25, metric = "euclidean", seed = 123L)
    from_disk <- annoy_open_index(ann_path, prefault = TRUE, load_mode = "eager")

    via_built <- annoy_search_bigmatrix(handle, query = describe(separated_query), k = 3, prefault = TRUE)
    via_reopened <- annoy_search_bigmatrix(from_disk, query = separated_query@address, k = 3, prefault = TRUE)

    expect_true(annoy_is_loaded(from_disk))
    expect_equal(via_reopened$index, via_built$index)
    expect_equal(via_reopened$distance, via_built$distance, tolerance = 1e-6)
})

# Runs the single benchmark and the recall suite on user-supplied data and
# checks that CSV outputs are written and that the returned structures carry
# the expected names and row counts.
test_that("benchmark interface supports user data, saved outputs, and suite summaries", {
    # Random data: reference drawn first, then queries.
    ref_points <- matrix(rnorm(80), nrow = 20, ncol = 4)
    query_points <- matrix(rnorm(16), nrow = 4, ncol = 4)
    single_csv <- tempfile(fileext = ".csv")
    suite_csv <- tempfile(fileext = ".csv")

    single_run <- benchmark_annoy_bigmatrix(
        x = ref_points,
        query = query_points,
        k = 2L,
        n_trees = 10L,
        exact = FALSE,
        output_path = single_csv,
        load_mode = "eager"
    )
    suite_run <- benchmark_annoy_recall_suite(
        x = ref_points,
        query = query_points,
        k = 2L,
        n_trees = c(5L, 10L),
        search_k = c(-1L, 20L),
        exact = FALSE,
        output_path = suite_csv,
        load_mode = "eager"
    )

    single_fields <- c("summary", "params", "index_path", "metadata_path", "exact_available", "validation")
    single_summary_cols <- c("metric", "backend", "self_search", "load_mode", "build_elapsed", "search_elapsed", "recall_at_k", "index_id")
    suite_fields <- c("summary", "exact_available")
    suite_summary_cols <- c("n_trees", "search_k", "self_search", "load_mode", "build_elapsed", "search_elapsed")

    expect_true(file.exists(single_csv))
    expect_true(file.exists(suite_csv))
    expect_true(single_run$validation$valid)
    expect_true(all(single_fields %in% names(single_run)))
    expect_true(all(single_summary_cols %in% names(single_run$summary)))
    expect_equal(nrow(single_run$summary), 1L)

    expect_true(all(suite_fields %in% names(suite_run)))
    # Two n_trees values crossed with two search_k values.
    expect_equal(nrow(suite_run$summary), 4L)
    expect_true(all(suite_summary_cols %in% names(suite_run$summary)))
})

# Try the bigANNOY package in your browser
#
# Any scripts or data that you put into this service are public.
#
# bigANNOY documentation built on April 1, 2026, 9:07 a.m.