Nothing
skip_if_not_installed("bigmemory")
skip_if_not_installed("RcppAnnoy")
library(bigmemory)
# Exact Euclidean k-nearest-neighbour search, used as a reference oracle for
# the approximate results checked in the tests below.
#
# Arguments:
#   ref          numeric matrix of reference points, one point per row.
#   query        numeric matrix of query points with the same number of
#                columns as `ref`; defaults to `ref` (self-search).
#   k            number of neighbours to return for each query row.
#   exclude_self if TRUE *and* `query` is identical to `ref`, row i is never
#                reported as its own neighbour.
#
# Returns a list with `index` (integer matrix of 1-based row numbers into
# `ref`) and `distance` (numeric matrix of Euclidean distances), both
# nrow(query) x k, ordered by increasing distance with ties broken by the
# lower row number.
#
# Stops with an error when `k` is larger than the number of reference rows
# that can legitimately be returned, instead of silently padding the result
# with NA (or with the excluded self row at distance Inf).
brute_force_knn_euclidean <- function(ref, query = ref, k, exclude_self = identical(ref, query)) {
  n_ref <- nrow(ref)
  n_query <- nrow(query)

  # Loop-invariant: whether self matches are masked out, decided once.
  drop_self <- exclude_self && identical(ref, query)

  n_available <- n_ref - as.integer(drop_self)
  if (k > n_available) {
    stop(
      sprintf(
        "`k` (%s) exceeds the number of reference rows available as neighbours (%s)",
        format(k), format(n_available)
      ),
      call. = FALSE
    )
  }

  index <- matrix(NA_integer_, nrow = n_query, ncol = k)
  distance <- matrix(NA_real_, nrow = n_query, ncol = k)

  # Loop-invariant: transpose once so each query row is recycled down columns.
  ref_t <- t(ref)

  for (i in seq_len(n_query)) {
    # Squared Euclidean distance from query row i to every reference row.
    dists <- rowSums((t(ref_t - query[i, ]))^2)
    if (drop_self) {
      dists[i] <- Inf
    }
    # Secondary key makes tie-breaking deterministic (lowest row number first).
    order_idx <- order(dists, seq_len(n_ref))
    keep <- order_idx[seq_len(k)]
    index[i, ] <- keep
    distance[i, ] <- sqrt(dists[keep])
  }

  list(index = index, distance = distance)
}
# Create a file-backed big.matrix inside `backingpath` and fill it with
# `values`.
#
# The backing file is named "<name>.bin" and the descriptor file
# "<name>.desc"; `type` is passed through as the storage type. Returns the
# populated big.matrix.
make_filebacked_matrix <- function(values, type, backingpath, name) {
  backing_file <- sprintf("%s.bin", name)
  descriptor_file <- sprintf("%s.desc", name)

  fb <- filebacked.big.matrix(
    nrow = nrow(values),
    ncol = ncol(values),
    type = type,
    backingfile = backing_file,
    descriptorfile = descriptor_file,
    backingpath = backingpath
  )

  # Copy the whole dense matrix into the file-backed storage.
  fb[, ] <- values
  fb
}
test_that("build metadata includes v3 fields and validate/open round-trip works", {
  # Four corners of a square with side 10.
  corners <- rbind(
    c(0, 0),
    c(10, 0),
    c(0, 10),
    c(10, 10)
  )
  big_corners <- as.big.matrix(corners)
  ann_path <- tempfile(fileext = ".ann")

  # A lazily built handle must report "not loaded" until something loads it.
  built <- annoy_build_bigmatrix(
    big_corners,
    path = ann_path,
    n_trees = 20,
    seed = 77L,
    load_mode = "lazy"
  )
  expect_false(annoy_is_loaded(built))

  # Validate with load = TRUE, then obtain two further handles on the same
  # file: one opened eagerly, one loaded lazily.
  check <- annoy_validate_index(built, strict = TRUE, load = TRUE)
  eager_handle <- annoy_open_index(ann_path, load_mode = "eager")
  lazy_handle <- annoy_load_bigmatrix(ann_path, load_mode = "lazy")

  exact <- brute_force_knn_euclidean(corners, k = 2, exclude_self = TRUE)
  meta <- read.dcf(built$metadata_path)
  required_fields <- c("index_id", "file_size", "file_mtime", "file_md5", "load_mode")

  # Files on disk and the fields carried by the returned handle.
  expect_s3_class(built, "bigannoy_index")
  expect_true(file.exists(ann_path))
  expect_true(file.exists(built$metadata_path))
  expect_true(check$valid)
  expect_true(is.character(built$index_id) && nzchar(built$index_id))
  expect_true(is.numeric(built$file_size) && built$file_size > 0)
  expect_true(is.character(built$file_md5) && nzchar(built$file_md5))
  expect_identical(built$load_mode, "lazy")

  # Load state of each handle after the validate/open/load calls above.
  expect_true(annoy_is_loaded(built))
  expect_true(annoy_is_loaded(eager_handle))
  expect_false(annoy_is_loaded(lazy_handle))

  # Both re-acquired handles must reproduce the exact neighbours.
  expect_equal(annoy_search_bigmatrix(eager_handle, k = 2)$index, exact$index)
  expect_equal(
    annoy_search_bigmatrix(lazy_handle, k = 2)$distance,
    exact$distance,
    tolerance = 1e-12
  )

  expect_true(all(required_fields %in% colnames(meta)))
})
test_that("search lazily loads, close unloads, and repeated search reloads successfully", {
  points <- as.big.matrix(
    rbind(
      c(0, 0),
      c(2, 0),
      c(0, 2),
      c(2, 2)
    )
  )
  handle <- annoy_build_bigmatrix(
    points,
    tempfile(fileext = ".ann"),
    n_trees = 20,
    load_mode = "lazy"
  )

  # Freshly built in lazy mode: nothing loaded yet.
  expect_false(annoy_is_loaded(handle))

  # The first search is expected to load the index.
  before_close <- annoy_search_bigmatrix(handle, k = 2)
  expect_true(annoy_is_loaded(handle))

  # Closing is expected to unload it again.
  annoy_close_index(handle)
  expect_false(annoy_is_loaded(handle))

  # Searching after a close must reload and give the same answer.
  after_close <- annoy_search_bigmatrix(handle, k = 2)
  expect_true(annoy_is_loaded(handle))
  expect_equal(after_close$index, before_close$index)
  expect_equal(after_close$distance, before_close$distance, tolerance = 1e-12)
})
test_that("dense, big.matrix, descriptor object, and descriptor path queries all work", {
  work_dir <- tempfile("bigannoy-descriptor-")
  dir.create(work_dir, recursive = TRUE)

  ref_points <- rbind(
    c(0, 0),
    c(5, 0),
    c(0, 5),
    c(5, 5),
    c(9, 9)
  )
  query_points <- rbind(
    c(0.2, 0.1),
    c(4.7, 5.1)
  )

  ref_fb <- make_filebacked_matrix(
    ref_points,
    type = "double", backingpath = work_dir, name = "ref"
  )
  query_fb <- make_filebacked_matrix(
    query_points,
    type = "double", backingpath = work_dir, name = "query"
  )

  # Two further ways of referring to the same file-backed query matrix.
  query_descriptor <- describe(query_fb)
  query_descriptor_file <- file.path(work_dir, "query.desc")

  # The index itself is built from a descriptor object.
  handle <- annoy_build_bigmatrix(
    describe(ref_fb),
    tempfile(tmpdir = work_dir, fileext = ".ann"),
    n_trees = 25,
    seed = 99L
  )

  from_dense <- annoy_search_bigmatrix(handle, query = query_points, k = 2)
  from_big <- annoy_search_bigmatrix(handle, query = query_fb, k = 2)
  from_descriptor <- annoy_search_bigmatrix(handle, query = query_descriptor, k = 2)
  from_path <- annoy_search_bigmatrix(handle, query = query_descriptor_file, k = 2)

  exact <- brute_force_knn_euclidean(
    ref_points,
    query = query_points, k = 2, exclude_self = FALSE
  )

  # The dense query is checked against the exact oracle ...
  expect_equal(from_dense$index, exact$index)
  expect_equal(from_dense$distance, exact$distance, tolerance = 1e-6)

  # ... and every other input kind is checked against the dense result.
  expect_equal(from_big$index, from_dense$index)
  expect_equal(from_descriptor$distance, from_dense$distance, tolerance = 1e-6)
  expect_equal(from_path$index, from_dense$index)
})
test_that("streaming outputs accept descriptor objects and descriptor paths", {
  work_dir <- tempfile("bigannoy-stream-desc-")
  dir.create(work_dir, recursive = TRUE)

  ref_points <- rbind(
    c(1, 1),
    c(8, 1),
    c(1, 8),
    c(8, 8)
  )
  query_points <- rbind(
    c(2, 2),
    c(7, 7)
  )

  ref_fb <- make_filebacked_matrix(
    ref_points,
    type = "double", backingpath = work_dir, name = "ref"
  )
  query_fb <- make_filebacked_matrix(
    query_points,
    type = "double", backingpath = work_dir, name = "query"
  )

  # Build from the descriptor *path* of the reference matrix.
  handle <- annoy_build_bigmatrix(
    file.path(work_dir, "ref.desc"),
    tempfile(tmpdir = work_dir, fileext = ".ann"),
    n_trees = 20,
    seed = 101L
  )

  # In-memory result that the streamed output is compared with.
  in_memory <- annoy_search_bigmatrix(handle, query = query_points, k = 2)

  # Pre-allocated file-backed destinations, one row per query, two columns.
  n_queries <- nrow(query_points)
  index_sink <- filebacked.big.matrix(
    nrow = n_queries,
    ncol = 2,
    type = "integer",
    backingfile = "index.bin",
    descriptorfile = "index.desc",
    backingpath = work_dir
  )
  distance_sink <- filebacked.big.matrix(
    nrow = n_queries,
    ncol = 2,
    type = "double",
    backingfile = "distance.bin",
    descriptorfile = "distance.desc",
    backingpath = work_dir
  )

  # Query and index destination go in as descriptor objects; the distance
  # destination goes in as a descriptor path.
  streamed <- annoy_search_bigmatrix(
    handle,
    query = describe(query_fb),
    k = 2,
    xpIndex = describe(index_sink),
    xpDistance = file.path(work_dir, "distance.desc")
  )

  expect_equal(bigmemory::as.matrix(index_sink), in_memory$index)
  expect_equal(
    bigmemory::as.matrix(distance_sink),
    in_memory$distance,
    tolerance = 1e-12
  )
  expect_type(streamed, "list")
})
test_that("all supported metrics build and search, and native and debug backends agree", {
  ref <- matrix(
    c(1, 2,
      2, 1,
      4, 3,
      3, 5,
      7, 2),
    ncol = 2,
    byrow = TRUE
  )
  query <- matrix(
    c(1.1, 2.2,
      3.9, 3.2),
    ncol = 2,
    byrow = TRUE
  )
  metrics <- c("euclidean", "angular", "manhattan", "dot")

  # Save the pre-test value of the backend option exactly once, before the
  # loop starts flipping it. Saving it inside the loop would capture the
  # "cpp" value left by the previous iteration, so the exit handler would
  # restore "cpp" instead of the original setting and leak it into the tests
  # that run afterwards.
  old_options <- options(bigANNOY.backend = "r")
  on.exit(options(old_options), add = TRUE)

  for (metric in metrics) {
    # Each metric starts on the "r" backend, whatever the previous iteration
    # left behind.
    options(bigANNOY.backend = "r")
    r_index <- suppressWarnings(
      annoy_build_bigmatrix(
        as.big.matrix(ref),
        tempfile(fileext = ".ann"),
        n_trees = 30,
        metric = metric,
        seed = 55L
      )
    )
    r_result <- suppressWarnings(
      annoy_search_bigmatrix(r_index, query = query, k = 2, search_k = 200L)
    )
    expect_identical(r_result$metric, metric)
    expect_identical(dim(r_result$index), c(nrow(query), 2L))
    expect_identical(dim(r_result$distance), c(nrow(query), 2L))

    # Compare with the "cpp" backend only when both compiled entry points are
    # registered in the package.
    if (isTRUE(is.loaded("_bigANNOY_cpp_annoy_open_index", PACKAGE = "bigANNOY")) &&
        isTRUE(is.loaded("_bigANNOY_cpp_annoy_handle_search", PACKAGE = "bigANNOY"))) {
      options(bigANNOY.backend = "cpp")
      cpp_index <- annoy_build_bigmatrix(
        as.big.matrix(ref),
        tempfile(fileext = ".ann"),
        n_trees = 30,
        metric = metric,
        seed = 55L
      )
      cpp_result <- annoy_search_bigmatrix(cpp_index, query = query, k = 2, search_k = 200L)
      expect_equal(cpp_result$index, r_result$index)
      expect_equal(cpp_result$distance, r_result$distance, tolerance = 1e-6)
    }
  }
})
test_that("validation catches impossible k, bad dimensions, missing files, and corrupted metadata", {
  work_dir <- tempfile("bigannoy-validate-")
  dir.create(work_dir, recursive = TRUE)

  # Two reference points only, so k = 2 cannot be met in a self-search.
  two_points <- as.big.matrix(rbind(c(0, 0), c(1, 0)))
  ann_path <- file.path(work_dir, "index.ann")
  handle <- annoy_build_bigmatrix(two_points, ann_path, n_trees = 10)

  # k too large for the indexed data.
  expect_error(annoy_search_bigmatrix(handle, k = 2), "`k` exceeds")

  # Query with a different number of columns than the index.
  expect_error(
    annoy_search_bigmatrix(handle, query = matrix(1, ncol = 1), k = 1),
    "same number of columns"
  )

  # Opening a path that was never written.
  expect_error(
    annoy_open_index(tempfile(fileext = ".ann")),
    "does not exist"
  )

  # Output matrices with the wrong storage type.
  wrong_type_index <- big.matrix(1, 1, type = "double")
  wrong_type_distance <- big.matrix(1, 1, type = "integer")
  expect_error(
    annoy_search_bigmatrix(
      handle,
      query = matrix(c(0, 0), ncol = 2),
      k = 1,
      xpIndex = wrong_type_index
    ),
    "`xpIndex` big.matrix must store integers"
  )
  expect_error(
    annoy_search_bigmatrix(
      handle,
      query = matrix(c(0, 0), ncol = 2),
      k = 1,
      xpIndex = big.matrix(1, 1, type = "integer"),
      xpDistance = wrong_type_distance
    ),
    "`xpDistance` big.matrix must store doubles"
  )

  # Overwrite the stored checksum in the metadata file, then reopen.
  meta <- read.dcf(handle$metadata_path)
  meta[1L, "file_md5"] <- "corrupted"
  write.dcf(
    as.data.frame(meta, stringsAsFactors = FALSE),
    file = handle$metadata_path
  )
  tampered <- annoy_open_index(ann_path, load_mode = "lazy")

  # Non-strict validation reports the problem; strict validation raises.
  lenient_report <- annoy_validate_index(tampered, strict = FALSE, load = FALSE)
  expect_false(lenient_report$valid)
  expect_error(
    annoy_validate_index(tampered, strict = TRUE, load = FALSE),
    "checksum"
  )
})
test_that("non-finite build and query inputs are rejected", {
  # An NA in the reference data must stop the build.
  ref_with_na <- as.big.matrix(
    rbind(
      c(0, 0),
      c(NA, 1)
    )
  )
  expect_error(
    annoy_build_bigmatrix(ref_with_na, tempfile(fileext = ".ann")),
    "contains non-finite values"
  )

  # An Inf in the query must stop the search against a clean index.
  clean_ref <- as.big.matrix(
    rbind(
      c(0, 0),
      c(1, 1),
      c(2, 2)
    )
  )
  handle <- annoy_build_bigmatrix(clean_ref, tempfile(fileext = ".ann"), n_trees = 10)
  inf_query <- matrix(c(Inf, 0), ncol = 2)
  expect_error(
    annoy_search_bigmatrix(handle, query = inf_query, k = 1),
    "contains non-finite values"
  )
})
test_that("file-backed reopen and separated-column query matrices behave across sessions", {
  work_dir <- tempfile("bigannoy-reopen-")
  dir.create(work_dir, recursive = TRUE)

  # Random data: 20 reference rows and 5 query rows in 3 dimensions.
  # The reference matrix is drawn first, then the query matrix.
  ref_values <- matrix(rnorm(60), nrow = 20, ncol = 3)
  query_values <- matrix(rnorm(15), nrow = 5, ncol = 3)

  ref_fb <- make_filebacked_matrix(
    ref_values,
    type = "double", backingpath = work_dir, name = "ref_large"
  )

  # The query lives in a big.matrix created with separated = TRUE.
  separated_query <- big.matrix(
    nrow(query_values),
    ncol(query_values),
    type = "double",
    separated = TRUE
  )
  separated_query[, ] <- query_values

  ann_path <- file.path(work_dir, "persist.ann")
  handle <- annoy_build_bigmatrix(
    file.path(work_dir, "ref_large.desc"),
    path = ann_path,
    n_trees = 25,
    metric = "euclidean",
    seed = 123L
  )
  eager_handle <- annoy_open_index(ann_path, prefault = TRUE, load_mode = "eager")

  # Same query passed two ways: as a descriptor to the build handle, and as
  # the matrix's @address slot to the reopened handle.
  via_built <- annoy_search_bigmatrix(
    handle,
    query = describe(separated_query), k = 3, prefault = TRUE
  )
  via_reopened <- annoy_search_bigmatrix(
    eager_handle,
    query = separated_query@address, k = 3, prefault = TRUE
  )

  expect_true(annoy_is_loaded(eager_handle))
  expect_equal(via_reopened$index, via_built$index)
  expect_equal(via_reopened$distance, via_built$distance, tolerance = 1e-6)
})
test_that("benchmark interface supports user data, saved outputs, and suite summaries", {
  # Random user data: the reference matrix is drawn first, then the query.
  ref_values <- matrix(rnorm(80), nrow = 20, ncol = 4)
  query_values <- matrix(rnorm(16), nrow = 4, ncol = 4)
  single_csv <- tempfile(fileext = ".csv")
  suite_csv <- tempfile(fileext = ".csv")

  # One benchmark run with a single parameter setting.
  single_run <- benchmark_annoy_bigmatrix(
    x = ref_values,
    query = query_values,
    k = 2L,
    n_trees = 10L,
    exact = FALSE,
    output_path = single_csv,
    load_mode = "eager"
  )

  # Suite run over 2 n_trees values x 2 search_k values.
  suite_run <- benchmark_annoy_recall_suite(
    x = ref_values,
    query = query_values,
    k = 2L,
    n_trees = c(5L, 10L),
    search_k = c(-1L, 20L),
    exact = FALSE,
    output_path = suite_csv,
    load_mode = "eager"
  )

  # Names each result object and its summary table must expose.
  single_fields <- c("summary", "params", "index_path", "metadata_path", "exact_available", "validation")
  single_summary_cols <- c("metric", "backend", "self_search", "load_mode", "build_elapsed", "search_elapsed", "recall_at_k", "index_id")
  suite_fields <- c("summary", "exact_available")
  suite_summary_cols <- c("n_trees", "search_k", "self_search", "load_mode", "build_elapsed", "search_elapsed")

  # Both runs must have written their output files.
  expect_true(file.exists(single_csv))
  expect_true(file.exists(suite_csv))

  # Single run: valid index, expected fields, one summary row.
  expect_true(single_run$validation$valid)
  expect_true(all(single_fields %in% names(single_run)))
  expect_true(all(single_summary_cols %in% names(single_run$summary)))
  expect_equal(nrow(single_run$summary), 1L)

  # Suite run: expected fields and one summary row per parameter combination.
  expect_true(all(suite_fields %in% names(suite_run)))
  expect_equal(nrow(suite_run$summary), 4L)
  expect_true(all(suite_summary_cols %in% names(suite_run$summary)))
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.