Nothing
annoy_default_block_size <- function() {
block_size <- getOption("bigANNOY.block_size", 1024L)
if (!is.numeric(block_size) || length(block_size) != 1L || is.na(block_size) || block_size < 1) {
return(1024L)
}
as.integer(block_size)
}
annoy_progress_enabled <- function() {
isTRUE(getOption("bigANNOY.progress", FALSE))
}
annoy_progress_message <- function(...) {
if (annoy_progress_enabled()) {
message(...)
}
}
annoy_cpp_available <- function() {
build_loaded <- isTRUE(is.loaded("_bigANNOY_cpp_annoy_build_bigmatrix", PACKAGE = "bigANNOY"))
search_loaded <- isTRUE(is.loaded("_bigANNOY_cpp_annoy_search", PACKAGE = "bigANNOY"))
build_loaded && search_loaded
}
normalize_backend <- function() {
backend <- getOption("bigANNOY.backend", "cpp")
if (!is.character(backend) || length(backend) != 1L || is.na(backend)) {
return("cpp")
}
backend <- tolower(backend)
if (!backend %in% c("cpp", "auto", "r")) {
stop("`getOption(\"bigANNOY.backend\")` must be one of \"cpp\", \"auto\", or \"r\"", call. = FALSE)
}
backend
}
annoy_debug_backend_warning <- function(action) {
warning(
sprintf(
"Using the debug-only R backend for %s. The native C++ backend is the supported default for bigmemory workflows.",
action
),
call. = FALSE
)
}
annoy_use_cpp_backend <- function() {
backend <- normalize_backend()
if (identical(backend, "r")) {
return(FALSE)
}
if (!(exists("cpp_annoy_build_bigmatrix", mode = "function") &&
exists("cpp_annoy_search", mode = "function"))) {
if (identical(backend, "cpp")) {
stop(
"The bigANNOY C++ backend functions are not available. Recompile/reload the package, ",
"for example with `devtools::load_all(recompile = TRUE)` or reinstall it.",
call. = FALSE
)
}
return(FALSE)
}
available <- annoy_cpp_available()
if (identical(backend, "cpp") && !available) {
stop(
"The bigANNOY C++ backend is not loaded. Recompile/reload the package, ",
"for example with `devtools::load_all(recompile = TRUE)` or reinstall it.",
call. = FALSE
)
}
available
}
normalize_scalar_logical <- function(x, name) {
if (!is.logical(x) || length(x) != 1L || is.na(x)) {
stop(sprintf("`%s` must be TRUE or FALSE", name), call. = FALSE)
}
x
}
get_bigmemory_fun <- local({
cache <- new.env(parent = emptyenv())
function(name) {
if (!exists(name, envir = cache, inherits = FALSE)) {
if (!requireNamespace("bigmemory", quietly = TRUE)) {
stop("`bigmemory` must be installed for big.matrix support", call. = FALSE)
}
cache[[name]] <- utils::getFromNamespace(name, "bigmemory")
}
cache[[name]]
}
})
is_big_descriptor_input <- function(x) {
methods::is(x, "big.matrix.descriptor") ||
(is.character(x) && length(x) == 1L && !is.na(x) && nzchar(x))
}
attach_big_descriptor <- function(x, arg) {
attach_fun <- get_bigmemory_fun("attach.big.matrix")
tryCatch(
{
if (methods::is(x, "big.matrix.descriptor")) {
attach_fun(x)
} else {
path <- normalizePath(path.expand(x), winslash = "/", mustWork = FALSE)
if (!file.exists(path)) {
stop(sprintf("Descriptor file for `%s` does not exist: %s", arg, path), call. = FALSE)
}
attach_fun(path)
}
},
error = function(e) {
stop(sprintf("`%s` must be a big.matrix, external pointer, descriptor, or descriptor path", arg), call. = FALSE)
}
)
}
resolve_big_pointer <- function(x, arg, allow_null = FALSE) {
if (allow_null && is.null(x)) {
return(NULL)
}
if (methods::is(x, "big.matrix")) {
addr <- methods::slot(x, "address")
if (!identical(typeof(addr), "externalptr")) {
stop(sprintf("`%s@address` must be an external pointer", arg), call. = FALSE)
}
return(addr)
}
if (is_big_descriptor_input(x)) {
attached <- attach_big_descriptor(x, arg)
addr <- methods::slot(attached, "address")
if (!identical(typeof(addr), "externalptr")) {
stop(sprintf("Attached descriptor for `%s` did not yield an external pointer", arg), call. = FALSE)
}
return(addr)
}
if (identical(typeof(x), "externalptr")) {
return(x)
}
stop(sprintf("`%s` must be a bigmemory::big.matrix, descriptor, descriptor path, or external pointer", arg), call. = FALSE)
}
big_nrow <- function(xp) {
as.integer(get_bigmemory_fun("CGetNrow")(xp))
}
big_ncol <- function(xp) {
as.integer(get_bigmemory_fun("CGetNcol")(xp))
}
big_type <- function(xp) {
as.integer(get_bigmemory_fun("CGetType")(xp))
}
validate_numeric_bigmatrix <- function(xp, arg) {
type_code <- big_type(xp)
if (!type_code %in% c(8L, 6L)) {
stop(sprintf("`%s` big.matrix must store doubles or floats", arg), call. = FALSE)
}
invisible(type_code)
}
validate_index_destination <- function(xp, arg) {
if (!identical(big_type(xp), 4L)) {
stop(sprintf("`%s` big.matrix must store integers", arg), call. = FALSE)
}
invisible(xp)
}
validate_distance_destination <- function(xp, arg) {
if (!identical(big_type(xp), 8L)) {
stop(sprintf("`%s` big.matrix must store doubles", arg), call. = FALSE)
}
invisible(xp)
}
read_big_rows <- function(xp, rows, arg) {
rows <- as.integer(rows)
if (!length(rows)) {
return(matrix(double(), nrow = 0L, ncol = big_ncol(xp)))
}
values <- get_bigmemory_fun("GetMatrixRows")(xp, as.numeric(rows))[[1L]]
if (!is.matrix(values)) {
values <- matrix(values, nrow = length(rows))
}
storage.mode(values) <- "double"
if (any(!is.finite(values))) {
stop(sprintf("`%s` contains non-finite values", arg), call. = FALSE)
}
values
}
write_big_rows <- function(xp, rows, values) {
rows <- as.integer(rows)
if (!length(rows)) {
return(invisible(NULL))
}
type_code <- big_type(xp)
if (identical(type_code, 4L)) {
storage.mode(values) <- "integer"
} else if (identical(type_code, 8L)) {
storage.mode(values) <- "double"
} else {
stop("Destination big.matrix type is not writable for bigANNOY outputs", call. = FALSE)
}
get_bigmemory_fun("SetMatrixRows")(xp, as.numeric(rows), values)
invisible(NULL)
}
normalize_metric <- function(metric) {
if (!is.character(metric) || length(metric) != 1L || is.na(metric)) {
stop("`metric` must be a single character string", call. = FALSE)
}
metric <- tolower(metric)
if (identical(metric, "dotproduct")) {
metric <- "dot"
}
if (!metric %in% c("euclidean", "angular", "manhattan", "dot")) {
stop("`metric` must be one of \"euclidean\", \"angular\", \"manhattan\", or \"dot\"", call. = FALSE)
}
metric
}
annoy_metric_code <- function(metric) {
switch(
metric,
euclidean = 1L,
angular = 2L,
manhattan = 3L,
dot = 4L,
stop("Unsupported metric", call. = FALSE)
)
}
normalize_k <- function(k) {
if (!is.numeric(k) || length(k) != 1L || is.na(k) || k < 1) {
stop("`k` must be a single positive integer", call. = FALSE)
}
as.integer(k)
}
normalize_n_trees <- function(n_trees) {
if (!is.numeric(n_trees) || length(n_trees) != 1L || is.na(n_trees) || n_trees < 1) {
stop("`n_trees` must be a single positive integer", call. = FALSE)
}
as.integer(n_trees)
}
normalize_block_size <- function(block_size) {
if (!is.numeric(block_size) || length(block_size) != 1L || is.na(block_size) || block_size < 1) {
stop("`block_size` must be a single positive integer", call. = FALSE)
}
as.integer(block_size)
}
normalize_search_k <- function(search_k) {
if (!is.numeric(search_k) || length(search_k) != 1L || is.na(search_k)) {
stop("`search_k` must be a single integer", call. = FALSE)
}
search_k <- as.integer(search_k)
if (!(identical(search_k, -1L) || search_k >= 1L)) {
stop("`search_k` must be -1 or a positive integer", call. = FALSE)
}
search_k
}
normalize_build_threads <- function(build_threads) {
if (!is.numeric(build_threads) || length(build_threads) != 1L || is.na(build_threads)) {
stop("`build_threads` must be -1 or a positive integer", call. = FALSE)
}
build_threads <- as.integer(build_threads)
if (!(identical(build_threads, -1L) || build_threads >= 1L)) {
stop("`build_threads` must be -1 or a positive integer", call. = FALSE)
}
build_threads
}
normalize_seed <- function(seed) {
if (is.null(seed)) {
return(NULL)
}
if (!is.numeric(seed) || length(seed) != 1L || is.na(seed) || seed < 1) {
stop("`seed` must be NULL or a single positive integer", call. = FALSE)
}
as.integer(seed)
}
normalize_prefault <- function(prefault) {
if (!is.logical(prefault) || length(prefault) != 1L || is.na(prefault)) {
stop("`prefault` must be TRUE or FALSE", call. = FALSE)
}
prefault
}
normalize_load_mode <- function(load_mode, default = "lazy") {
if (is.null(load_mode)) {
load_mode <- default
}
if (!is.character(load_mode) || length(load_mode) != 1L || is.na(load_mode)) {
stop("`load_mode` must be either \"lazy\" or \"eager\"", call. = FALSE)
}
load_mode <- tolower(load_mode)
if (!load_mode %in% c("lazy", "eager")) {
stop("`load_mode` must be either \"lazy\" or \"eager\"", call. = FALSE)
}
load_mode
}
normalize_dense_query <- function(query) {
if (!is.matrix(query) || !is.numeric(query)) {
stop("`query` must be NULL, a bigmemory::big.matrix, descriptor, descriptor path, external pointer, or numeric matrix", call. = FALSE)
}
storage.mode(query) <- "double"
if (any(!is.finite(query))) {
stop("`query` contains non-finite values", call. = FALSE)
}
query
}
normalize_index_path <- function(path) {
if (!is.character(path) || length(path) != 1L || is.na(path) || !nzchar(path)) {
stop("`path` must be a single non-empty character string", call. = FALSE)
}
path <- normalizePath(path.expand(path), winslash = "/", mustWork = FALSE)
parent <- dirname(path)
if (!dir.exists(parent)) {
stop(sprintf("Index directory does not exist: %s", parent), call. = FALSE)
}
if (dir.exists(path)) {
stop("`path` must point to a file, not a directory", call. = FALSE)
}
path
}
annoy_metadata_path <- function(path) {
paste0(path, ".meta")
}
normalize_metadata_path <- function(path, metadata_path = NULL) {
if (is.null(metadata_path)) {
return(annoy_metadata_path(path))
}
if (!is.character(metadata_path) || length(metadata_path) != 1L || is.na(metadata_path) || !nzchar(metadata_path)) {
stop("`metadata_path` must be NULL or a single non-empty character string", call. = FALSE)
}
metadata_path <- normalizePath(path.expand(metadata_path), winslash = "/", mustWork = FALSE)
parent <- dirname(metadata_path)
if (!dir.exists(parent)) {
stop(sprintf("Metadata directory does not exist: %s", parent), call. = FALSE)
}
if (dir.exists(metadata_path)) {
stop("`metadata_path` must point to a file, not a directory", call. = FALSE)
}
metadata_path
}
normalize_query_input <- function(query) {
if (is.null(query)) {
return(list(kind = "self", xp = NULL, dense = NULL, nrow = NA_integer_, ncol = NA_integer_))
}
if (methods::is(query, "big.matrix") || identical(typeof(query), "externalptr") || is_big_descriptor_input(query)) {
xp <- resolve_big_pointer(query, "query")
validate_numeric_bigmatrix(xp, "query")
return(list(
kind = "big",
xp = xp,
dense = NULL,
nrow = big_nrow(xp),
ncol = big_ncol(xp)
))
}
dense <- normalize_dense_query(query)
list(
kind = "dense",
xp = NULL,
dense = dense,
nrow = nrow(dense),
ncol = ncol(dense)
)
}
annoy_r_class <- function(metric) {
switch(
metric,
euclidean = RcppAnnoy::AnnoyEuclidean,
angular = RcppAnnoy::AnnoyAngular,
manhattan = RcppAnnoy::AnnoyManhattan,
dot = RcppAnnoy::AnnoyDotProduct,
stop("Unsupported metric", call. = FALSE)
)
}
annoy_new_r_handle <- function(metric, n_dim) {
methods::new(annoy_r_class(metric), as.integer(n_dim))
}
read_annoy_metadata <- function(metadata_path) {
if (!file.exists(metadata_path)) {
stop(sprintf("Annoy metadata file does not exist: %s", metadata_path), call. = FALSE)
}
raw <- read.dcf(metadata_path, all = TRUE)
if (!nrow(raw)) {
stop("Annoy metadata file is empty", call. = FALSE)
}
as.list(raw[1L, , drop = TRUE])
}
metadata_numeric_field <- function(x, name, default = NULL) {
value <- x[[name]]
if (is.null(value) || !nzchar(value)) {
return(default)
}
as.numeric(value)
}
metadata_integer_field <- function(x, name, default = NULL) {
value <- x[[name]]
if (is.null(value) || !nzchar(value)) {
return(default)
}
as.integer(value)
}
metadata_logical_field <- function(x, name, default = FALSE) {
value <- x[[name]]
if (is.null(value) || !nzchar(value)) {
return(default)
}
identical(tolower(value), "true")
}
normalize_timestamp <- function(x) {
if (is.null(x) || !length(x) || is.na(x)) {
return(NA_character_)
}
format(as.POSIXct(x, tz = "UTC"), "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
}
annoy_file_signature <- function(path) {
info <- file.info(path)
if (!nrow(info) || isTRUE(is.na(info$size))) {
stop(sprintf("Unable to read file metadata for %s", path), call. = FALSE)
}
list(
file_size = as.numeric(info$size[[1L]]),
file_mtime = normalize_timestamp(info$mtime[[1L]]),
file_md5 = unname(tools::md5sum(path)[[1L]])
)
}
generate_annoy_index_id <- function(signature, created_at = NULL) {
stamp <- gsub("[^0-9]", "", created_at %||% normalize_timestamp(Sys.time()))
paste0("annoy-", stamp, "-", substr(signature$file_md5, 1L, 12L))
}
new_bigannoy_index <- function(path,
metric,
n_trees,
n_ref,
n_dim,
metadata_path = annoy_metadata_path(path),
index_id = NULL,
build_seed = NULL,
build_threads = -1L,
build_backend = "cpp",
file_size = NULL,
file_mtime = NULL,
file_md5 = NULL,
load_mode = "lazy",
prefault = FALSE,
package_version = as.character(utils::packageVersion("bigANNOY")),
annoy_version = as.character(RcppAnnoy::getAnnoyVersion()),
created_at = format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ", tz = "UTC"),
metadata_version = 3L) {
structure(
list(
path = path,
metadata_path = metadata_path,
index_id = index_id,
metric = metric,
n_trees = as.integer(n_trees),
n_ref = as.integer(n_ref),
n_dim = as.integer(n_dim),
build_seed = build_seed,
build_threads = as.integer(build_threads),
build_backend = build_backend,
file_size = file_size,
file_mtime = file_mtime,
file_md5 = file_md5,
load_mode = load_mode,
prefault = prefault,
package_version = package_version,
annoy_version = annoy_version,
created_at = created_at,
metadata_version = as.integer(metadata_version),
exact = FALSE,
backend = "annoy",
.cache = new.env(parent = emptyenv())
),
class = "bigannoy_index"
)
}
write_annoy_metadata <- function(index) {
signature <- annoy_file_signature(index$path)
index$file_size <- signature$file_size
index$file_mtime <- signature$file_mtime
index$file_md5 <- signature$file_md5
if (is.null(index$index_id) || !nzchar(index$index_id)) {
index$index_id <- generate_annoy_index_id(signature, created_at = index$created_at)
}
fields <- data.frame(
metadata_version = as.character(index$metadata_version),
package_version = as.character(index$package_version),
annoy_version = as.character(index$annoy_version),
index_id = as.character(index$index_id),
metric = index$metric,
n_dim = as.character(index$n_dim),
n_ref = as.character(index$n_ref),
n_trees = as.character(index$n_trees),
build_seed = if (is.null(index$build_seed)) "" else as.character(index$build_seed),
build_threads = as.character(index$build_threads),
build_backend = as.character(index$build_backend %||% "unknown"),
file_size = as.character(index$file_size),
file_mtime = as.character(index$file_mtime),
file_md5 = as.character(index$file_md5),
load_mode = as.character(index$load_mode %||% "lazy"),
created_at = as.character(index$created_at),
index_file = basename(index$path),
stringsAsFactors = FALSE
)
write.dcf(fields, file = index$metadata_path)
invisible(index)
}
validate_annoy_index <- function(index) {
if (!inherits(index, "bigannoy_index")) {
stop("`index` must be a bigannoy_index returned by `annoy_build_bigmatrix()` or `annoy_open_index()`", call. = FALSE)
}
if (!is.character(index$path) || length(index$path) != 1L || is.na(index$path) || !nzchar(index$path)) {
stop("`index$path` is invalid", call. = FALSE)
}
if (!file.exists(index$path)) {
stop(sprintf("Annoy index file does not exist: %s", index$path), call. = FALSE)
}
if (is.null(index$metadata_path)) {
index$metadata_path <- annoy_metadata_path(index$path)
}
if (!is.character(index$metric) || length(index$metric) != 1L || is.na(index$metric)) {
stop("`index$metric` is invalid", call. = FALSE)
}
if (is.null(index$build_threads)) {
index$build_threads <- -1L
}
if (is.null(index$build_backend)) {
index$build_backend <- "unknown"
}
if (is.null(index$index_id) || !nzchar(index$index_id)) {
signature <- annoy_file_signature(index$path)
index$index_id <- generate_annoy_index_id(signature, created_at = index$created_at)
}
if (is.null(index$file_size) || is.na(index$file_size) ||
is.null(index$file_mtime) || is.na(index$file_mtime) ||
is.null(index$file_md5) || !nzchar(index$file_md5)) {
signature <- annoy_file_signature(index$path)
index$file_size <- signature$file_size
index$file_mtime <- signature$file_mtime
index$file_md5 <- signature$file_md5
}
if (is.null(index$load_mode)) {
index$load_mode <- "lazy"
}
if (is.null(index$prefault)) {
index$prefault <- FALSE
}
if (!is.environment(index$.cache)) {
index$.cache <- new.env(parent = emptyenv())
}
index
}
annoy_open_index_impl <- function(path, metadata_path = NULL, prefault = FALSE, load_mode = "eager") {
path <- normalize_index_path(path)
metadata_path <- normalize_metadata_path(path, metadata_path)
prefault <- normalize_prefault(prefault)
load_mode <- normalize_load_mode(load_mode, default = "eager")
if (!file.exists(path)) {
stop(sprintf("Annoy index file does not exist: %s", path), call. = FALSE)
}
metadata <- read_annoy_metadata(metadata_path)
if (!is.null(metadata[["index_file"]]) &&
nzchar(metadata[["index_file"]]) &&
!identical(basename(path), metadata[["index_file"]])) {
stop("Annoy metadata does not match the supplied index file path", call. = FALSE)
}
signature <- annoy_file_signature(path)
metric <- normalize_metric(metadata$metric)
n_dim <- metadata_integer_field(metadata, "n_dim")
n_ref <- metadata_integer_field(metadata, "n_ref")
n_trees <- metadata_integer_field(metadata, "n_trees")
if (is.null(n_dim) || is.null(n_ref) || is.null(n_trees)) {
stop("Annoy metadata is missing one of `n_dim`, `n_ref`, or `n_trees`", call. = FALSE)
}
index <- new_bigannoy_index(
path = path,
metadata_path = metadata_path,
index_id = metadata[["index_id"]] %||% generate_annoy_index_id(signature, created_at = metadata[["created_at"]]),
metric = metric,
n_trees = n_trees,
n_ref = n_ref,
n_dim = n_dim,
build_seed = metadata_integer_field(metadata, "build_seed"),
build_threads = metadata_integer_field(metadata, "build_threads", default = -1L),
build_backend = metadata[["build_backend"]] %||% "unknown",
file_size = metadata_numeric_field(metadata, "file_size", default = signature$file_size),
file_mtime = metadata[["file_mtime"]] %||% signature$file_mtime,
file_md5 = metadata[["file_md5"]] %||% signature$file_md5,
load_mode = load_mode,
prefault = prefault,
package_version = metadata[["package_version"]] %||% NA_character_,
annoy_version = metadata[["annoy_version"]] %||% NA_character_,
created_at = metadata[["created_at"]] %||% NA_character_,
metadata_version = metadata_integer_field(metadata, "metadata_version", default = 2L)
)
index <- validate_annoy_index(index)
if (identical(load_mode, "eager")) {
index <- annoy_prepare_loaded_index(index, prefault = prefault)
}
index
}
`%||%` <- function(x, y) {
if (is.null(x) || !length(x)) y else x
}
annoy_native_handle_key <- function(prefault = FALSE) {
sprintf("native_handle_%s", if (isTRUE(prefault)) "prefault" else "lazy")
}
annoy_cached_native_handle <- function(index, prefault = FALSE) {
key <- annoy_native_handle_key(prefault)
if (!exists(key, envir = index$.cache, inherits = FALSE)) {
return(NULL)
}
handle <- index$.cache[[key]]
if (!identical(typeof(handle), "externalptr") || !isTRUE(cpp_annoy_is_loaded(handle))) {
rm(list = key, envir = index$.cache)
return(NULL)
}
handle
}
annoy_ensure_native_handle <- function(index, prefault = FALSE) {
index <- validate_annoy_index(index)
handle <- annoy_cached_native_handle(index, prefault = prefault)
if (!is.null(handle)) {
return(handle)
}
handle <- cpp_annoy_open_index(
metric_code = annoy_metric_code(index$metric),
path = index$path,
n_ref = index$n_ref,
n_dim = index$n_dim,
prefault = prefault
)
index$.cache[[annoy_native_handle_key(prefault)]] <- handle
handle
}
annoy_prepare_loaded_index <- function(index, prefault = FALSE) {
index <- validate_annoy_index(index)
if (annoy_use_cpp_backend()) {
annoy_ensure_native_handle(index, prefault = prefault)
} else {
get_annoy_handle(index, prefault = prefault)
}
index
}
annoy_close_loaded_handles <- function(index) {
native_keys <- grep("^native_handle_", ls(index$.cache, all.names = TRUE), value = TRUE)
if (length(native_keys)) {
for (key in native_keys) {
handle <- index$.cache[[key]]
if (identical(typeof(handle), "externalptr")) {
try(cpp_annoy_close_index(handle), silent = TRUE)
}
rm(list = key, envir = index$.cache)
}
}
r_keys <- grep("^handle_", ls(index$.cache, all.names = TRUE), value = TRUE)
if (length(r_keys)) {
for (key in r_keys) {
handle <- index$.cache[[key]]
unload_attempt <- try(handle$unload(), silent = TRUE)
invisible(unload_attempt)
}
rm(list = r_keys, envir = index$.cache)
}
invisible(index)
}
get_annoy_handle <- function(index, prefault = FALSE) {
index <- validate_annoy_index(index)
key <- sprintf("handle_%s", if (isTRUE(prefault)) "prefault" else "lazy")
if (exists(key, envir = index$.cache, inherits = FALSE)) {
return(index$.cache[[key]])
}
handle <- annoy_new_r_handle(index$metric, index$n_dim)
if (isTRUE(prefault)) {
load_attempt <- try(handle$load(index$path, prefault), silent = TRUE)
if (inherits(load_attempt, "try-error")) {
warning(
"The debug-only R backend could not enable `prefault`; continuing with the default Annoy load behaviour.",
call. = FALSE
)
load_attempt <- try(handle$load(index$path), silent = TRUE)
}
} else {
load_attempt <- try(handle$load(index$path), silent = TRUE)
}
if (inherits(load_attempt, "try-error")) {
stop(sprintf("Failed to load Annoy index from %s", index$path), call. = FALSE)
}
if (!identical(as.integer(handle$getNItems()), as.integer(index$n_ref))) {
stop("Loaded Annoy index item count does not match recorded metadata", call. = FALSE)
}
index$.cache[[key]] <- handle
handle
}
validate_reference_shape <- function(n_ref, n_dim, arg) {
if (n_ref < 1L || n_dim < 1L) {
stop(sprintf("`%s` must contain at least one row and one column", arg), call. = FALSE)
}
}
validate_search_k <- function(k, n_ref, self_search) {
limit <- if (self_search) n_ref - 1L else n_ref
if (k > limit) {
if (self_search) {
stop("`k` exceeds the number of available reference rows after excluding self", call. = FALSE)
}
stop("`k` exceeds the number of reference rows", call. = FALSE)
}
}
allocate_search_buffers <- function(n_query, k, stream_index) {
list(
index = if (is.null(stream_index)) matrix(NA_integer_, nrow = n_query, ncol = k) else NULL,
distance = if (is.null(stream_index)) matrix(NA_real_, nrow = n_query, ncol = k) else NULL
)
}
validate_stream_destinations <- function(xp_index, xp_distance, n_query, k) {
if (is.null(xp_index) && !is.null(xp_distance)) {
stop("`xpDistance` cannot be supplied without `xpIndex`", call. = FALSE)
}
if (is.null(xp_index)) {
return(invisible(NULL))
}
xp_index_ptr <- resolve_big_pointer(xp_index, "xpIndex")
validate_index_destination(xp_index_ptr, "xpIndex")
if (!identical(big_nrow(xp_index_ptr), as.integer(n_query)) || !identical(big_ncol(xp_index_ptr), as.integer(k))) {
stop("`xpIndex` must have shape `n_query x k`", call. = FALSE)
}
if (!is.null(xp_distance)) {
xp_distance_ptr <- resolve_big_pointer(xp_distance, "xpDistance")
validate_distance_destination(xp_distance_ptr, "xpDistance")
if (!identical(big_nrow(xp_distance_ptr), as.integer(n_query)) || !identical(big_ncol(xp_distance_ptr), as.integer(k))) {
stop("`xpDistance` must have shape `n_query x k`", call. = FALSE)
}
}
invisible(NULL)
}
query_block <- function(query_input, rows) {
if (identical(query_input$kind, "dense")) {
block <- query_input$dense[rows, , drop = FALSE]
storage.mode(block) <- "double"
return(block)
}
if (identical(query_input$kind, "big")) {
return(read_big_rows(query_input$xp, rows, "query"))
}
stop("Internal error: unexpected query kind", call. = FALSE)
}
search_block <- function(handle, k, search_k, rows, self_search, query_block_values = NULL) {
block_n <- length(rows)
index_block <- matrix(NA_integer_, nrow = block_n, ncol = k)
distance_block <- matrix(NA_real_, nrow = block_n, ncol = k)
for (i in seq_len(block_n)) {
if (self_search) {
res <- handle$getNNsByItemList(as.integer(rows[i] - 1L), as.integer(k + 1L), search_k, TRUE)
items <- as.integer(res$item) + 1L
distances <- as.numeric(res$distance)
keep <- items != rows[i]
items <- items[keep]
distances <- distances[keep]
} else {
res <- handle$getNNsByVectorList(query_block_values[i, ], k, search_k, TRUE)
items <- as.integer(res$item) + 1L
distances <- as.numeric(res$distance)
}
if (length(items) < k) {
stop("Annoy returned fewer than `k` neighbours", call. = FALSE)
}
index_block[i, ] <- items[seq_len(k)]
distance_block[i, ] <- distances[seq_len(k)]
}
list(index = index_block, distance = distance_block)
}
new_annoy_result <- function(index, distance, k, metric, n_ref, n_query) {
list(
index = index,
distance = distance,
k = as.integer(k),
metric = metric,
n_ref = as.integer(n_ref),
n_query = as.integer(n_query),
exact = FALSE,
backend = "annoy"
)
}
annoy_build_bigmatrix_cpp_impl <- function(xp_ref,
path,
metric,
n_trees,
build_seed,
build_threads,
block_size) {
cpp_annoy_build_bigmatrix(
xp_ref = xp_ref,
metric_code = annoy_metric_code(metric),
path = path,
n_trees = n_trees,
seed = if (is.null(build_seed)) NA_integer_ else build_seed,
use_seed = !is.null(build_seed),
build_threads = build_threads,
block_size = block_size,
progress = annoy_progress_enabled()
)
}
annoy_search_bigmatrix_cpp_impl <- function(index,
query_input,
k,
search_k,
xp_index_ptr,
xp_distance_ptr,
xpIndex,
xpDistance,
prefault,
block_size) {
self_search <- identical(query_input$kind, "self")
handle <- annoy_ensure_native_handle(index, prefault = prefault)
result <- cpp_annoy_handle_search(
xp_handle = handle,
self_search = self_search,
xp_query = if (!self_search && identical(query_input$kind, "big")) query_input$xp else NULL,
query_dense = if (!self_search && identical(query_input$kind, "dense")) query_input$dense else NULL,
k = k,
search_k = search_k,
xp_index = xp_index_ptr,
xp_distance = xp_distance_ptr,
block_size = block_size,
progress = annoy_progress_enabled()
)
new_annoy_result(
index = if (is.null(xpIndex)) result$index else xpIndex,
distance = if (is.null(xpIndex)) result$distance else xpDistance,
k = k,
metric = index$metric,
n_ref = index$n_ref,
n_query = result$n_query
)
}
annoy_build_bigmatrix_r_impl <- function(xp_ref,
path,
metric,
n_trees,
build_seed,
build_threads,
block_size) {
n_ref <- big_nrow(xp_ref)
n_dim <- big_ncol(xp_ref)
validate_reference_shape(n_ref, n_dim, "x")
if (!identical(build_threads, -1L)) {
warning("`build_threads` is only supported by the native C++ backend; the R backend ignores it.", call. = FALSE)
}
handle <- annoy_new_r_handle(metric, n_dim)
if (!is.null(build_seed)) {
handle$setSeed(build_seed)
}
initialized <- handle$onDiskBuild(path)
if (!isTRUE(initialized)) {
stop(sprintf("Failed to initialize Annoy on-disk build at %s", path), call. = FALSE)
}
block_starts <- seq.int(1L, n_ref, by = block_size)
for (block_id in seq_along(block_starts)) {
start <- block_starts[block_id]
stop_row <- min(start + block_size - 1L, n_ref)
rows <- seq.int(start, stop_row)
block <- read_big_rows(xp_ref, rows, "x")
for (i in seq_len(nrow(block))) {
handle$addItem(as.integer(rows[i] - 1L), block[i, ])
}
annoy_progress_message(sprintf(
"Building Annoy index: block %d/%d (%d rows)",
block_id,
length(block_starts),
length(rows)
))
}
handle$build(n_trees)
handle$unload()
list(n_ref = n_ref, n_dim = n_dim)
}
annoy_prefault_value <- function(index, prefault) {
if (is.null(prefault)) {
return(normalize_prefault(isTRUE(index$prefault)))
}
normalize_prefault(prefault)
}
#' Build an Annoy index from a `bigmemory::big.matrix`
#'
#' @description
#' Stream the rows of a reference [`bigmemory::big.matrix`] into an on-disk
#' Annoy index and write a small sidecar metadata file next to it. The returned
#' `bigannoy_index` can be reopened later with [annoy_open_index()].
#'
#' @param x A `bigmemory::big.matrix` or an external pointer referencing the
#' reference matrix.
#' @param path File path where the Annoy index should be written.
#' @param n_trees Number of Annoy trees to build.
#' @param metric Distance metric. bigANNOY v2 supports `"euclidean"`,
#' `"angular"`, `"manhattan"`, and `"dot"`.
#' @param seed Optional positive integer seed used to initialize Annoy's build
#' RNG.
#' @param build_threads Build-thread setting passed to Annoy's native backend.
#' Use `-1L` for Annoy's default.
#' @param block_size Number of rows processed per streamed block while building
#' the index.
#' @param metadata_path Optional path for the sidecar metadata file. Defaults to
#' `paste0(path, ".meta")`.
#' @param load_mode Whether to keep the returned index metadata-only until first
#' search (`"lazy"`) or eagerly load a live index handle immediately
#' (`"eager"`).
#'
#' @return A `bigannoy_index` object describing the persisted Annoy index.
#' @export
annoy_build_bigmatrix <- function(x,
path,
n_trees = 50L,
metric = "euclidean",
seed = NULL,
build_threads = -1L,
block_size = annoy_default_block_size(),
metadata_path = NULL,
load_mode = "lazy") {
metric <- normalize_metric(metric)
n_trees <- normalize_n_trees(n_trees)
seed <- normalize_seed(seed)
build_threads <- normalize_build_threads(build_threads)
block_size <- normalize_block_size(block_size)
load_mode <- normalize_load_mode(load_mode, default = "lazy")
path <- normalize_index_path(path)
metadata_path <- normalize_metadata_path(path, metadata_path)
xp_ref <- resolve_big_pointer(x, "x")
validate_numeric_bigmatrix(xp_ref, "x")
use_cpp <- annoy_use_cpp_backend()
if (!use_cpp) {
annoy_debug_backend_warning("index build")
}
result <- if (use_cpp) {
annoy_build_bigmatrix_cpp_impl(
xp_ref = xp_ref,
path = path,
metric = metric,
n_trees = n_trees,
build_seed = seed,
build_threads = build_threads,
block_size = block_size
)
} else {
annoy_build_bigmatrix_r_impl(
xp_ref = xp_ref,
path = path,
metric = metric,
n_trees = n_trees,
build_seed = seed,
build_threads = build_threads,
block_size = block_size
)
}
index <- new_bigannoy_index(
path = path,
metadata_path = metadata_path,
metric = metric,
n_trees = n_trees,
n_ref = result$n_ref,
n_dim = result$n_dim,
build_seed = seed,
build_threads = build_threads,
build_backend = if (use_cpp) "cpp" else "r",
load_mode = load_mode,
prefault = FALSE
)
index <- write_annoy_metadata(index)
if (identical(load_mode, "eager")) {
index <- annoy_prepare_loaded_index(index, prefault = FALSE)
}
index
}
#' Open an existing Annoy index and its sidecar metadata
#'
#' @param path File path to an existing Annoy index built by
#' [annoy_build_bigmatrix()].
#' @param metadata_path Optional path to the sidecar metadata file.
#' @param prefault Logical flag indicating whether searches should prefault the
#' index when loaded by the native backend.
#' @param load_mode Whether to eagerly load the native index handle on open or
#' defer until first search.
#'
#' @return A `bigannoy_index` object that can be passed to
#' [annoy_search_bigmatrix()].
#' @export
annoy_open_index <- function(path, metadata_path = NULL, prefault = FALSE, load_mode = "eager")
{
annoy_open_index_impl(path = path, metadata_path = metadata_path, prefault = prefault, load_mode = load_mode)
}
#' Load an existing Annoy index for bigmatrix workflows
#'
#' @inheritParams annoy_open_index
#'
#' @return A `bigannoy_index` object that can be passed to
#' [annoy_search_bigmatrix()].
#' @export
annoy_load_bigmatrix <- function(path, metadata_path = NULL, prefault = FALSE, load_mode = "eager")
{
annoy_open_index_impl(path = path, metadata_path = metadata_path, prefault = prefault, load_mode = load_mode)
}
#' Check whether an index currently has a loaded in-memory handle
#'
#' @param index A `bigannoy_index`.
#'
#' @return `TRUE` when a live native or debug-only handle is cached, otherwise
#' `FALSE`.
#' @export
annoy_is_loaded <- function(index) {
index <- validate_annoy_index(index)
native_keys <- grep("^native_handle_", ls(index$.cache, all.names = TRUE), value = TRUE)
if (length(native_keys)) {
native_loaded <- vapply(native_keys, function(key) {
handle <- index$.cache[[key]]
identical(typeof(handle), "externalptr") && isTRUE(cpp_annoy_is_loaded(handle))
}, logical(1L))
if (any(native_loaded)) {
return(TRUE)
}
}
any(grepl("^handle_", ls(index$.cache, all.names = TRUE)))
}
#' Close any loaded Annoy handle cached inside a `bigannoy_index`
#'
#' @param index A `bigannoy_index`.
#'
#' @return `index`, invisibly.
#' @export
annoy_close_index <- function(index) {
index <- validate_annoy_index(index)
annoy_close_loaded_handles(index)
invisible(index)
}
annoy_validation_row <- function(check, passed, message, severity = "error") {
data.frame(
check = check,
passed = isTRUE(passed),
severity = severity,
message = message,
stringsAsFactors = FALSE
)
}
#' Validate a persisted Annoy index and its sidecar metadata
#'
#' @param index A `bigannoy_index`.
#' @param strict Whether failed validation checks should raise an error.
#' @param load Whether to also verify that the index can be loaded successfully.
#' @param prefault Optional logical override used when `load = TRUE`.
#'
#' @return A list containing `valid`, `checks`, and the normalized `index`.
#' @export
annoy_validate_index <- function(index, strict = TRUE, load = TRUE, prefault = NULL) {
index <- validate_annoy_index(index)
strict <- normalize_scalar_logical(strict, "strict")
load <- normalize_scalar_logical(load, "load")
prefault <- if (is.null(prefault)) isTRUE(index$prefault) else normalize_prefault(prefault)
metadata <- read_annoy_metadata(index$metadata_path)
signature <- annoy_file_signature(index$path)
checks <- list(
annoy_validation_row(
"index_file",
identical(metadata[["index_file"]] %||% basename(index$path), basename(index$path)),
"Metadata index file matches the Annoy file basename."
),
annoy_validation_row(
"metric",
identical(normalize_metric(metadata[["metric"]] %||% index$metric), index$metric),
"Metadata metric matches the index metric."
),
annoy_validation_row(
"dimensions",
identical(metadata_integer_field(metadata, "n_dim", default = index$n_dim), index$n_dim),
"Metadata dimension matches the index dimension."
),
annoy_validation_row(
"items",
identical(metadata_integer_field(metadata, "n_ref", default = index$n_ref), index$n_ref),
"Metadata item count matches the index item count."
),
annoy_validation_row(
"file_size",
isTRUE(all.equal(metadata_numeric_field(metadata, "file_size", default = signature$file_size), signature$file_size)),
"Recorded file size matches the current Annoy file size."
),
annoy_validation_row(
"file_md5",
identical(metadata[["file_md5"]] %||% signature$file_md5, signature$file_md5),
"Recorded file checksum matches the current Annoy file checksum."
),
annoy_validation_row(
"file_mtime",
identical(metadata[["file_mtime"]] %||% signature$file_mtime, signature$file_mtime),
"Recorded file modification time matches the current Annoy file modification time.",
severity = "warning"
)
)
if (isTRUE(load)) {
load_ok <- tryCatch(
{
if (annoy_use_cpp_backend()) {
annoy_ensure_native_handle(index, prefault = prefault)
} else {
get_annoy_handle(index, prefault = prefault)
}
TRUE
},
error = function(e) FALSE
)
checks[[length(checks) + 1L]] <- annoy_validation_row(
"load",
load_ok,
"The index can be loaded successfully."
)
}
checks <- do.call(rbind, checks)
valid <- all(checks$passed[checks$severity == "error"])
result <- list(valid = valid, checks = checks, index = index)
if (isTRUE(strict) && !isTRUE(valid)) {
stop(paste(unique(checks$message[!checks$passed & checks$severity == "error"]), collapse = " "), call. = FALSE)
}
result
}
#' Search an Annoy index built from a `bigmemory::big.matrix`
#'
#' @description
#' Query a persisted Annoy index created by [annoy_build_bigmatrix()] or
#' reopened with [annoy_open_index()]. Supply `query = NULL` for self-search
#' over the indexed reference rows, or provide a dense numeric matrix,
#' `big.matrix`, or external pointer for external-query search. Results can be
#' returned in memory or streamed into destination `big.matrix` objects.
#'
#' @param index A `bigannoy_index` returned by [annoy_build_bigmatrix()],
#' [annoy_open_index()], or [annoy_load_bigmatrix()].
#' @param query Optional query source. Supply `NULL` for self-search, another
#' `big.matrix` or external pointer for streamed queries, or a dense numeric
#' matrix.
#' @param k Number of neighbours to return.
#' @param search_k Annoy's runtime search budget. Use `-1L` for the library
#' default.
#' @param xpIndex Optional writable `bigmemory::big.matrix` or external pointer
#' receiving the 1-based neighbour indices.
#' @param xpDistance Optional writable `bigmemory::big.matrix` or external
#' pointer receiving the Annoy distances. It may only be supplied when
#' `xpIndex` is also provided.
#' @param prefault Optional logical override controlling whether the native
#' backend prefaults the Annoy file while loading it for search.
#' @param block_size Number of queries processed per block.
#'
#' @return A list with components `index`, `distance`, `k`, `metric`, `n_ref`,
#' `n_query`, `exact`, and `backend`.
#' @export
annoy_search_bigmatrix <- function(index,
query = NULL,
k = 10L,
search_k = -1L,
xpIndex = NULL,
xpDistance = NULL,
prefault = NULL,
block_size = annoy_default_block_size()) {
index <- validate_annoy_index(index)
k <- normalize_k(k)
search_k <- normalize_search_k(search_k)
block_size <- normalize_block_size(block_size)
prefault <- annoy_prefault_value(index, prefault)
self_search <- is.null(query)
validate_search_k(k, index$n_ref, self_search)
query_input <- normalize_query_input(query)
n_query <- if (self_search) index$n_ref else query_input$nrow
if (!self_search && !identical(query_input$ncol, index$n_dim)) {
stop("`query` must have the same number of columns as the indexed reference matrix", call. = FALSE)
}
validate_stream_destinations(xpIndex, xpDistance, n_query, k)
xp_index_ptr <- resolve_big_pointer(xpIndex, "xpIndex", allow_null = TRUE)
xp_distance_ptr <- resolve_big_pointer(xpDistance, "xpDistance", allow_null = TRUE)
use_cpp <- annoy_use_cpp_backend()
if (!use_cpp) {
annoy_debug_backend_warning("search")
}
if (use_cpp) {
return(annoy_search_bigmatrix_cpp_impl(
index = index,
query_input = query_input,
k = k,
search_k = search_k,
xp_index_ptr = xp_index_ptr,
xp_distance_ptr = xp_distance_ptr,
xpIndex = xpIndex,
xpDistance = xpDistance,
prefault = prefault,
block_size = block_size
))
}
handle <- get_annoy_handle(index, prefault = prefault)
block_starts <- seq.int(1L, n_query, by = block_size)
output <- allocate_search_buffers(n_query, k, xpIndex)
for (block_id in seq_along(block_starts)) {
start <- block_starts[block_id]
stop_row <- min(start + block_size - 1L, n_query)
rows <- seq.int(start, stop_row)
block <- if (self_search) {
search_block(
handle = handle,
k = k,
search_k = search_k,
rows = rows,
self_search = TRUE
)
} else {
search_block(
handle = handle,
k = k,
search_k = search_k,
rows = rows,
self_search = FALSE,
query_block_values = query_block(query_input, rows)
)
}
if (is.null(xpIndex)) {
output$index[rows, ] <- block$index
output$distance[rows, ] <- block$distance
} else {
write_big_rows(xp_index_ptr, rows, block$index)
if (!is.null(xp_distance_ptr)) {
write_big_rows(xp_distance_ptr, rows, block$distance)
}
}
annoy_progress_message(sprintf(
"Searching Annoy index: block %d/%d (%d queries)",
block_id,
length(block_starts),
length(rows)
))
}
new_annoy_result(
index = if (is.null(xpIndex)) output$index else xpIndex,
distance = if (is.null(xpIndex)) output$distance else xpDistance,
k = k,
metric = index$metric,
n_ref = index$n_ref,
n_query = n_query
)
}
#' Print a `bigannoy_index`
#'
#' @param x A `bigannoy_index`.
#' @param ... Unused.
#'
#' @return `x`, invisibly.
#' @export
print.bigannoy_index <- function(x, ...) {
cat("<bigannoy_index>\n", sep = "")
cat(" path: ", x$path, "\n", sep = "")
cat(" metadata: ", x$metadata_path, "\n", sep = "")
cat(" index_id: ", x$index_id, "\n", sep = "")
cat(" metric: ", x$metric, "\n", sep = "")
cat(" trees: ", x$n_trees, "\n", sep = "")
cat(" items: ", x$n_ref, "\n", sep = "")
cat(" dimension: ", x$n_dim, "\n", sep = "")
cat(" build_seed: ", if (is.null(x$build_seed)) "NULL" else x$build_seed, "\n", sep = "")
cat(" build_threads: ", x$build_threads, "\n", sep = "")
cat(" build_backend: ", x$build_backend, "\n", sep = "")
cat(" load_mode: ", x$load_mode, "\n", sep = "")
cat(" loaded: ", annoy_is_loaded(x), "\n", sep = "")
cat(" file_size: ", x$file_size, "\n", sep = "")
cat(" file_md5: ", x$file_md5, "\n", sep = "")
cat(" prefault: ", isTRUE(x$prefault), "\n", sep = "")
invisible(x)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.