# Functions that support nearest neighbors use.
# Check whether nn index exists and is consistent with matrix and parameters.
# This function is not in use currently and may fall into disrepair.
check_cds_nn_index_is_current <- function(cds, reduction_method=c('PCA', 'LSI', 'Aligned', 'tSNE', 'UMAP'), nn_control=list(), verbose=FALSE) {
assertthat::assert_that(methods::is(cds, 'cell_data_set'),
msg=paste('cds parameter is not a cell_data_set'))
assertthat::assert_that(
tryCatch(expr = ifelse(match.arg(reduction_method) == "",TRUE, TRUE),
error = function(e) FALSE),
msg = "reduction_method must be one of 'PCA', 'LSI', 'Aligned', 'tSNE', 'UMAP'")
reduction_method <- match.arg(reduction_method)
assertthat::assert_that(
!is.null(SingleCellExperiment::reducedDims(cds)[[reduction_method]]),
msg = paste0('Data has not been processed with',
' chosen reduction_method: ',
reduction_method))
# We check the index build parameters.
nn_control <- set_nn_control(mode=1, nn_control=nn_control, nn_control_default=list(), nn_index=NULL, k=NULL, verbose=verbose)
if(verbose) {
message('check_cds_nn_index_is_current:')
report_nn_control(' nn_control: ', nn_control=nn_control)
}
nn_method <- nn_control[['method']]
if(is.null(cds@reduce_dim_aux[[reduction_method]]) ||
is.null(cds@reduce_dim_aux[[reduction_method]][['nn_index']]) ||
is.null(cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]])) {
if(verbose) {
message('check_cds_nn_index_is_current: FALSE')
}
return(FALSE)
}
if(nrow(SingleCellExperiment::reducedDims(cds)[[reduction_method]]) != cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['nrow']]) {
if(verbose) {
message('check_cds_nn_index_is_current: FALSE')
}
return(FALSE)
}
if(ncol(SingleCellExperiment::reducedDims(cds)[[reduction_method]]) != cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['ncol']]) {
if(verbose) {
message('check_cds_nn_index_is_current: FALSE')
}
return(FALSE)
}
if(cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['matrix_id']] != get_reduce_dim_matrix_identity(cds, reduction_method)[['matrix_id']]) {
if(verbose) {
message('check_cds_nn_index_is_current: FALSE')
}
return(FALSE)
}
if(nn_method == 'annoy') {
if(cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['metric']] != nn_control[['metric']]) {
if(verbose) {
message('check_cds_nn_index_is_current: FALSE')
}
return(FALSE)
}
if(cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['n_trees']] != nn_control[['n_trees']]) {
if(verbose) {
message('check_cds_nn_index_is_current: FALSE')
}
return(FALSE)
}
}
else
if(nn_method == 'hnsw') {
if(cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['metric']] != nn_control[['metric']]) {
if(verbose) {
message('check_cds_nn_index_is_current: FALSE')
}
return(FALSE)
}
if(cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['M']] != nn_control[['M']]) {
if(verbose) {
message('check_cds_nn_index_is_current: FALSE')
}
return(FALSE)
}
if(cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['ef_construction']] != nn_control[['ef_construction']]) {
if(verbose) {
message('check_cds_nn_index_is_current: FALSE')
}
return(FALSE)
}
}
else
stop('check_cds_nn_index_is_current: unsupported nearest neighbor index type \'', nn_method, '\'')
if(verbose) {
message('check_cds_nn_index_is_current: TRUE')
}
return(TRUE)
}
# Clear nearest neighbor index.
# Note: remove cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]]
# because the objects stored in it are related; that is, if the object
# cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']]
# is removed or changed, the other objects in
# cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]] need to be
# updated.
clear_cds_nn_index <- function(cds, reduction_method=c('PCA', 'LSI', 'Aligned', 'tSNE', 'UMAP'), nn_method=c('annoy', 'hnsw', 'all')) {
assertthat::assert_that(methods::is(cds, 'cell_data_set'),
msg=paste('cds parameter is not a cell_data_set'))
assertthat::assert_that(
tryCatch(expr = ifelse(match.arg(reduction_method) == "",TRUE, TRUE),
error = function(e) FALSE),
msg = "reduction_method must be one of 'PCA', 'LSI', 'Aligned', 'tSNE', 'UMAP'")
reduction_method <- match.arg(reduction_method)
assertthat::assert_that(
tryCatch(expr = ifelse(match.arg(nn_method) == "",TRUE, TRUE),
error = function(e) FALSE),
msg = "nn_method must be one of 'annoy', 'hnsw', or 'all'")
nn_method <- match.arg(nn_method)
if(nn_method == 'annoy') {
if(!is.null(cds@reduce_dim_aux[[reduction_method]][['nn_index']]) &&
!is.null(cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]])) {
cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]] <- NULL
}
}
else
if(nn_method == 'hnsw') {
if(!is.null(cds@reduce_dim_aux[[reduction_method]][['nn_index']]) &&
!is.null(cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]])) {
cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]] <- NULL
}
}
else
if(nn_method == 'all') {
if(!is.null(cds@reduce_dim_aux[[reduction_method]][['nn_index']])) {
cds@reduce_dim_aux[[reduction_method]][['nn_index']] <- S4Vectors::SimpleList()
}
}
else
stop('clear_cds_nn_index: unsupported nearest neighbor index type \'', nn_method, '\'')
return(cds)
}
# Check whether nearest neighbor search information exists
# for reduction_method. This is not useful at this time. The
# intention was to store information about the search
# parameters used in cluster_cells() and use them in for
# nearest neighbor searches in later function calls. However,
# this does not seem useful at this time.
# This function is not in use currently and may fall into disrepair.
check_cds_nn_search_exists <- function(cds, reduction_method = c("UMAP", "tSNE", "PCA", "LSI", "Aligned"), search_id, verbose=FALSE)
{
assertthat::assert_that(methods::is(cds, "cell_data_set"))
assertthat::assert_that(
tryCatch(expr = ifelse(match.arg(reduction_method) == "",TRUE, TRUE),
error = function(e) FALSE),
msg = "reduction_method must be one of 'UMAP', 'tSNE', 'PCA', 'LSI', 'Aligned'")
reduction_method <- match.arg(reduction_method)
if(is.null(SingleCellExperiment::reducedDims(cds)[[reduction_method]])) {
return(FALSE)
}
if(is.null(cds@clusters[[reduction_method]])) {
return(FALSE)
}
if(is.null(cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]])) {
return(FALSE)
}
return(TRUE)
}
select_nn_parameter_value <- function(parameter, nn_control, nn_control_default, default_value) {
if(!is.null(nn_control[[parameter]])) {
return(nn_control[[parameter]])
}
else
if(!is.null(nn_control_default[[parameter]])) {
return(nn_control_default[[parameter]])
}
return(default_value)
}
#' set annoy search_k parameter
#' precedence for setting search_k
#' o nn_control[['search_k']]
#' o mode == 2 and the following values exist/set
#' o nn_index parameter
#' o n_trees value in nn_index parameter object
#' o k parameter
#' o nn_control[['n_trees']] and k parameter or k default
#' o nn_control_default[['search_k']]
#' o nn_control_default[['n_trees']] and k parameter or k default
#' o n_trees default and k parameter or k default
#' @noRd
select_annoy_search_k <- function(mode, nn_control, nn_control_default, nn_index, k, default_n_trees, default_k) {
if(!is.null(k)) {
use_k <- k
src_k <- 'parameter'
}
else {
use_k <- default_k
src_k <- 'default'
}
# Sanity test.
if(mode == 2 &&
!is.null(nn_index) &&
is.null(nn_index[['n_trees']])) {
stop('set_nn_control: unexpected condition: found old version of reduce_dim_aux')
}
if(!is.null(nn_control[['search_k']])) {
return(nn_control[['search_k']])
}
else
if(mode == 2 &&
!is.null(nn_index) &&
!is.null(nn_index[['n_trees']]) &&
!is.null(k)) {
n_trees <- nn_index[['n_trees']]
return(2 * n_trees * k)
}
else
if(!is.null(nn_control[['n_trees']])) {
return(2 * nn_control[['n_trees']] * use_k)
}
else
if(!is.null(nn_control_default[['search_k']])) {
return(nn_control_default[['search_k']])
}
else
if(!is.null(nn_control_default[['n_trees']])) {
return(2 * nn_control_default[['n_trees']] * use_k)
}
return(2 * default_n_trees * use_k)
}
#' @title Verify and set nearest neighbor parameter list.
#'
#' @description Verifies the listed parameter values
#' that will be passed to the nearest neighbor function
#' given by nn_control\[\['method'\]\]. Unspecified
#' values are set to default values. To see the default
#' values, call the function with
#' nn_control=list(show_values=TRUE).
#'
#' @section Optional nn_control parameters:
#' \describe{
#' \item{method}{The method used to find nearest neighbor points.
#' The available methods are 'nn2', 'annoy', and 'hnsw'.
#' Detailed information about each method can be found on the
#' WWW sites:
#' https://cran.r-project.org/web/packages/RANN/,
#' https://cran.r-project.org/web/packages/RcppAnnoy/index.html,
#' and https://cran.rstudio.com/web/packages/RcppHNSW/index.html.}
#' \item{metric}{The distance metric used by the nearest neighbor
#' functions. Annoy accepts 'euclidean', 'cosine', 'manhattan',
#' and 'hamming'. HNSW accepts 'euclidean', 'l2', 'cosine', and
#' 'ip'. RANN uses 'euclidean'.}
#' \item{n_trees}{The annoy index build parameter that affects the build
#' time and index size. Larger values give more accurate results,
#' longer build times, and larger indexes.}
#' \item{search_k}{The annoy index search parameter that affects the
#' search accuracy and time. Larger values give more accurate
#' results and longer search times. Default is 2 * n_trees * k.
#' In order to set search_k, the following conditions are tested
#' and the first TRUE condition is used: nn_control\[\['search_k'\]\]
#' exists; mode=2 and nn_index and k are not NULL;
#' nn_control\[\['n_trees'\]\] exists;
#' nn_control_default\[\['search_k'\]\] exists;
#' nn_control_default\[\['n_trees'\]\] exists. If none of those is
#' TRUE, the fallback default n_trees value is used. If the
#' set_nn_control k parameter value is not NULL, it is used;
#' otherwise, the default is used.}
#' \item{M}{The HNSW index build parameter that affects the
#' search accuracy and memory requirements. Larger values give
#' more accurate search results and increase the index memory
#' use.}
#' \item{ef_construction}{The HNSW index build parameter that affects
#' the search accuracy and index build time. Larger values give more
#' accurate search results and longer build times. Default is 200.}
#' \item{ef}{The HNSW index search parameter that affects the search
#' accuracy and search time. Larger values give more accurate results
#' and longer search times. ef must be greater than or equal to k.}
#' \item{grain_size}{The HNSW parameter that gives the
#' minimum amount of work to do per thread.}
#' \item{cores}{The annoy and HNSW parameter that gives the number of
#' threads to use for the annoy index search and for the HNSW index
#' build and search.}
#' \item{show_values}{A logical value used to show the
#' nearest neighbor parameters to use, and then exit the function.
#' When show_values=TRUE is the only nn_control value, the
#' parameters are the defaults for the function. Each function
#' that calls set_nn_control may have its own nn_control_default
#' list.}
#' }
#'
#' @param mode the nearest neighbor operation for which the nn_control
#' list will be used. 1=make index, 2=search index, and 3=both make
#' and search index. Required parameter.
#' @param nn_control an optional list of parameters passed to
#' the nearest neighbor function specified by nn_control\[\['method'\]\].
#' If a value is not given in nn_control, the value in nn_control_default
#' is used. If neither is given, a fallback default is assigned.
#' @param nn_control_default an optional nn_control list to use when
#' a parameter is not given in nn_control.
#' @param nn_index an nn_index. This may be used to look up parameters
#' that were used to make an index. For example, the default search_k
#' parameter depends on the n_trees values used to make the index.
#' The default is NULL.
#' @param k integer the number of desired nearest neighbor points to
#' return from a search. k is used to
#' * set the annoy search_k parameter when search_k is not given in
#' nn_control or nn_control_default where search_k is 2 * n_trees * k.
#' * test the hnsw ef parameter, which must be at least as large
#' as k.
#'
#' k is ignored for index builds and does not give the number
#' of nearest neighbors to return for a search.
#' @param verbose a boolean indicating whether to emit verbose output.
#'
#' @return an updated nn_control list.
set_nn_control <- function(mode, nn_control=list(), nn_control_default=list(), nn_index=NULL, k=NULL, verbose=FALSE) {
default_method <- 'annoy'
default_metric <- 'euclidean'
default_k <- 25
default_n_trees <- 50
default_M <- 48
default_ef_construction <- 200
default_ef <- 150
default_grain_size <- 1
default_cores <- 1
default_annoy_random_seed <- 42
assertthat::assert_that(methods::is(nn_control, "list"))
assertthat::assert_that(methods::is(nn_control_default, "list"))
allowed_control_parameters <- c('method',
'metric',
'n_trees',
'search_k',
'M',
'ef_construction',
'ef',
'grain_size',
'cores',
'show_values',
'annoy_random_seed')
assertthat::assert_that(all(names(nn_control) %in% allowed_control_parameters),
msg = "set_nn_control: unknown variable in nn_control")
assertthat::assert_that(all(names(nn_control_default) %in% allowed_control_parameters),
msg = "set_nn_control: unknown variable in nn_control_default")
assertthat::assert_that(assertthat::is.count(mode) && mode >= 1 && mode <= 3,
msg = paste0("set_nn_control: invalid mode value. Mode must be an integer with the value 1, 2, or 3."))
nn_control_out <- list()
nn_control_out[['method']] <- select_nn_parameter_value('method', nn_control, nn_control_default, default_method)
if(nn_control_out[['method']] == 'nn2') {
# Do nothing for nn2. Call report_nn_control when verbose <- TRUE
} else
if(nn_control_out[['method']] == 'annoy') {
nn_control_out[['metric']] <- select_nn_parameter_value('metric', nn_control, nn_control_default, default_metric)
assertthat::assert_that(nn_control_out[['metric']] %in% c('euclidean', 'cosine', 'manhattan', 'hamming'),
msg=paste0("set_nn_control: nearest neighbor metric for annoy must be one of 'euclidean', 'cosine', 'manhattan', or 'hamming'"))
if(bitwAnd(mode, 1)) {
nn_control_out[['n_trees']] <- select_nn_parameter_value('n_trees', nn_control, nn_control_default, default_n_trees)
nn_control_out[['annoy_random_seed']] <- select_nn_parameter_value('annoy_random_seed', nn_control, nn_control_default, default_annoy_random_seed)
assertthat::assert_that(assertthat::is.count(nn_control_out[['n_trees']]))
}
if(bitwAnd(mode, 2)) {
nn_control_out[['search_k']] <- select_annoy_search_k(mode, nn_control, nn_control_default, nn_index, k, default_n_trees, default_k)
nn_control_out[['grain_size']] <- select_nn_parameter_value('grain_size', nn_control, nn_control_default, default_grain_size)
nn_control_out[['cores']] <- select_nn_parameter_value('cores', nn_control, nn_control_default, default_cores)
assertthat::assert_that(assertthat::is.count(nn_control_out[['search_k']]))
assertthat::assert_that(assertthat::is.count(nn_control_out[['grain_size']]))
assertthat::assert_that(assertthat::is.count(nn_control_out[['cores']]))
}
} else
if(nn_control_out[['method']] == 'hnsw') {
nn_control_out[['metric']] <- select_nn_parameter_value('metric', nn_control, nn_control_default, default_metric)
assertthat::assert_that(nn_control_out[['metric']] %in% c('euclidean', 'l2', 'cosine', 'ip'),
msg=paste0("set_nn_control: nearest neighbor metric for HNSW must be one of 'euclidean', 'l2', 'cosine', or 'ip'"))
nn_control_out[['grain_size']] <- select_nn_parameter_value('grain_size', nn_control, nn_control_default, default_grain_size)
nn_control_out[['cores']] <- select_nn_parameter_value('cores', nn_control, nn_control_default, default_cores)
assertthat::assert_that(assertthat::is.count(nn_control_out[['grain_size']]))
assertthat::assert_that(assertthat::is.count(nn_control_out[['cores']]))
if(bitwAnd(mode, 1)) {
nn_control_out[['M']] <- select_nn_parameter_value('M', nn_control, nn_control_default, default_M)
nn_control_out[['ef_construction']] <- select_nn_parameter_value('ef_construction', nn_control, nn_control_default, default_ef_construction)
assertthat::assert_that(assertthat::is.count(nn_control_out[['M']]))
assertthat::assert_that(assertthat::is.count(nn_control_out[['ef_construction']]))
assertthat::assert_that(nn_control_out[['M']] >= 2,
msg=paste0('set_nn_control: HNSW nearest neighbor M index build parameter must be >= 2'))
}
if(bitwAnd(mode, 2)) {
assertthat::assert_that(assertthat::is.count(k),
msg=paste0('set_nn_control: parameter k must be set for method=\'hnsw\' and modes 2 and 3.'))
nn_control_out[['ef']] <- select_nn_parameter_value('ef', nn_control, nn_control_default, default_ef)
assertthat::assert_that(assertthat::is.count(nn_control_out[['ef']]))
assertthat::assert_that(nn_control_out[['ef']] >= k,
msg=paste0('set_nn_control: HNSW nearest neighbor ef index search parameter must be >= k (',k,')'))
}
}
else
stop('set_nn_control: unsupported nearest neighbor method \'', nn_control_out[['method']], '\'')
if(verbose) {
cs <- get_call_stack_as_string()
message('set_nn_control: call stack: ', cs)
report_nn_control(' nn_control: ', nn_control_out)
}
if(!is.null(nn_control[['show_values']]) && nn_control[['show_values']] == TRUE)
{
report_nn_control(' nn_control: ', nn_control=nn_control_out)
stop_no_noise()
}
return(nn_control_out)
}
# Report nn_control list values.
report_nn_control <- function(label=NULL, nn_control) {
indent <- ''
if(!is.null(label)) {
indent <- ' '
}
message(ifelse(!is.null(label), label, ''))
message(indent, ' method: ', ifelse(!is.null(nn_control[['method']]), nn_control[['method']], as.character(NA)))
message(indent, ' metric: ', ifelse(!is.null(nn_control[['metric']]), nn_control[['metric']], as.character(NA)))
if(is.null(nn_control[['method']])) {
return()
}
if(nn_control[['method']] == 'nn2') {
message(indent, ' nn2 has no parameters')
}
else
if(nn_control[['method']] == 'annoy') {
message(indent, ' n_trees: ', ifelse(!is.null(nn_control[['n_trees']]), nn_control[['n_trees']], as.character(NA)))
message(indent, ' search_k: ', ifelse(!is.null(nn_control[['search_k']]), nn_control[['search_k']], as.character(NA)))
message(indent, ' cores: ', ifelse(!is.null(nn_control[['cores']]), nn_control[['cores']], as.character(NA)))
message(indent, ' grain_size: ', ifelse(!is.null(nn_control[['grain_size']]), nn_control[['grain_size']], as.character(NA)))
}
else
if(nn_control[['method']] == 'hnsw') {
message(indent, ' M: ', ifelse(!is.null(nn_control[['M']]), nn_control[['M']], as.character(NA)))
message(indent, ' ef_construction: ', ifelse(!is.null(nn_control[['ef_construction']]), nn_control[['ef_construction']], as.character(NA)))
message(indent, ' ef: ', ifelse(!is.null(nn_control[['ef']]), nn_control[['ef']], as.character(NA)))
message(indent, ' cores: ', ifelse(!is.null(nn_control[['cores']]), nn_control[['cores']], as.character(NA)))
message(indent, ' grain_size: ', ifelse(!is.null(nn_control[['grain_size']]), nn_control[['grain_size']], as.character(NA)))
}
else
stop('report_nn_control: unsupported nearest neighbor method \'', nn_control[['method']], '\'')
}
new_annoy_index <- function(metric, ndim) {
nn_class <- switch( metric,
cosine = RcppAnnoy::AnnoyAngular,
euclidean = RcppAnnoy::AnnoyEuclidean,
hamming = RcppAnnoy::AnnoyHamming,
manhattan = RcppAnnoy::AnnoyManhattan,
stop('unsupported annoy metric ', metric)
)
nn_index <- methods::new(nn_class, ndim)
return(nn_index)
}
#' @title Make a nearest neighbor index.
#'
#' @description Make a nearest neighbor index from the
#' subject_matrix using either the default nearest
#' neighbor method or the method specified in the
#' nn_control list parameter. The function returns
#' the index.
#'
#' @param subject_matrix the matrix used to build the
#' index.
#' @param nn_control a list of parameters used to make the
#' nearest neighbor index. See the set_nn_control help
#' for details.
#' @param verbose a boolean indicating whether to emit verbose output.
#'
#' @return a nearest neighbor index.
#'
#' @examples
#' \donttest{
#' cds <- load_a549()
#' cds <- preprocess_cds(cds)
#' nn_index <- make_nn_index(SingleCellExperiment::reducedDims(cds)[['PCA']])
#' }
#'
#' @importFrom utils packageVersion
#' @export
make_nn_index <- function(subject_matrix, nn_control=list(), verbose=FALSE) {
assertthat::assert_that(methods::is(subject_matrix, 'matrix') ||
is_sparse_matrix(subject_matrix),
msg=paste0('make_nn_matrix: the subject_matrix object must be of type matrix'))
# We check the index build parameters.
nn_control_default <- get_global_variable('nn_control_annoy_euclidean')
nn_control <- set_nn_control(mode=1, nn_control=nn_control, nn_control_default=nn_control_default, nn_index=NULL, k=NULL, verbose=verbose)
if(verbose) {
message('make_nn_index:')
report_nn_control(' nn_control: ', nn_control)
tick('make_nn_index: build time')
}
nn_method <- nn_control[['method']]
metric = nn_control[['metric']]
num_row <- nrow(subject_matrix)
num_col <- ncol(subject_matrix)
if(!is.null(rownames(subject_matrix)))
checksum_rownames <- digest::digest(sort(rownames(subject_matrix)))
else
checksum_rownames <- NA_character_
if(nn_method == 'nn2') {
stop('make_nn_index is not valid for method nn2')
} else
if(nn_method == 'annoy') {
monocle3_annoy_index_version <- get_global_variable('monocle3_annoy_index_version')
annoy_index <- new_annoy_index(metric, num_col)
annoy_random_seed <- nn_control[['annoy_random_seed']]
annoy_index$setSeed(annoy_random_seed)
n_trees <- nn_control[['n_trees']]
if(num_row > 0 ) {
for(i in 1:num_row)
annoy_index$addItem(i-1, subject_matrix[i,])
annoy_index$build(n_trees)
}
annoy_index_version <- packageVersion('RcppAnnoy')
nn_index <- list(method='annoy', annoy_index=annoy_index, version=monocle3_annoy_index_version, annoy_index_version=annoy_index_version, metric=metric, n_trees=n_trees, nrow=num_row, ncol=num_col, checksum_rownames=checksum_rownames, annoy_random_seed=annoy_random_seed)
}
else
if(nn_method == 'hnsw') {
monocle3_hnsw_index_version <- get_global_variable('monocle3_hnsw_index_version')
M <- nn_control[['M']]
ef_construction <- nn_control[['ef_construction']]
hnsw_index <- RcppHNSW::hnsw_build(X=subject_matrix,
distance=metric,
M=M,
ef=ef_construction,
verbose=verbose,
n_threads=nn_control[['cores']],
grain_size=nn_control[['grain_size']])
hnsw_index_version <- packageVersion('RcppHNSW')
nn_index <- list(method='hnsw', hnsw_index=hnsw_index, version=monocle3_hnsw_index_version, hnsw_index_version=hnsw_index_version, metric=metric, M=M, ef_construction=ef_construction, nrow=num_row, ncol=num_col, checksum_rownames=checksum_rownames)
}
else
stop('make_nn_index: unsupported nearest neighbor index type \'', nn_method, '\'')
if(verbose) {
tock()
}
return(nn_index)
}
#' @title Set a nearest neighor index in the cell_data_set.
#'
#' @description Store the given nearest neighbor index
#' in the cell_data_set. The reduction_method parameter
#' tells set_cds_nn_index where in the cell_data_set to
#' store the index.
#'
#' @param cds a cell_data_set in which to store the nearest
#' neighbor index.
#' @param reduction_method a string giving the reduced
#' dimension matrix used to make the nn_index nearest
#' neighbor index, and determines where the index is
#' stored in the cell_data_set.
#' @param nn_index a nearest neighbor index to store in cds.
#' @param verbose a boolean indicating whether to emit verbose output.
#'
#' @return a cell_data_set with the stored index.
#'
#' @export
set_cds_nn_index <- function(cds, reduction_method=c('UMAP', 'PCA', 'LSI', 'Aligned', 'tSNE'), nn_index, verbose=FALSE) {
assertthat::assert_that(methods::is(cds, 'cell_data_set'),
msg=paste('cds parameter is not a cell_data_set'))
assertthat::assert_that(
tryCatch(expr = ifelse(match.arg(reduction_method) == "",TRUE, TRUE),
error = function(e) FALSE),
msg = "reduction_method must be one of 'PCA', 'LSI', 'Aligned', 'tSNE', 'UMAP'")
reduction_method <- match.arg(reduction_method)
assertthat::assert_that(!is.null(SingleCellExperiment::reducedDims(cds)[[reduction_method]]),
msg = paste0("When reduction_method = '", reduction_method,
"' the cds must have been processed for it.",
" Please run the required processing function",
" before this one."))
nn_method <- nn_index[['method']]
if(nn_method == 'nn2') {
stop('set_cds_nn_index is not valid for method nn2')
} else
if(nn_method == 'annoy') {
cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]] <- S4Vectors::SimpleList()
cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']] <- nn_index
cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['matrix_id']] <- get_reduce_dim_matrix_identity(cds, reduction_method)[['matrix_id']]
}
else
if(nn_method == 'hnsw') {
cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]] <- S4Vectors::SimpleList()
cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']] <- nn_index
cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['matrix_id']] <- get_reduce_dim_matrix_identity(cds, reduction_method)[['matrix_id']]
}
else
stop('set_cds_nn_index: unsupported nearest neighbor index type \'', nn_method, '\'')
return(cds)
}
#' @title Make and store a nearest neighbor index in the cell_data_set.
#'
#' @description Make a nearest neighbor index from the specified
#' reduction_method matrix in the cell_data_set using either the
#' default nearest neighbor method or the method specified in the
#' nn_control list parameter, and store the index in the
#' cell_data_set. This function returns a cell_data_set.
#'
#' @param cds a cell_data_set with the reduced dimension matrix from
#' which to make the nearest neighbor index and with which the index
#' is stored.
#' @param reduction_method a string giving the reduced dimension matrix
#' to use for making the nn_index nearest neighbor index.
#' Note: distances in tSNE space reflect spatial differences poorly
#' so using nearest neighbors with it may be meaningless.
#' @param nn_control a list of parameters to use for making the nearest
#' neighbor index. See the set_nn_control help for details.
#' @param verbose a boolean indicating whether to emit verbose output.
#'
#' @return a cell_data_set with the stored index.
#'
#' @examples
#' \donttest{
#' cds <- load_a549()
#' cds <- preprocess_cds(cds)
#' cds <- make_cds_nn_index(cds, 'PCA')
#' }
#'
#' @export
make_cds_nn_index <- function(cds, reduction_method=c('UMAP', 'PCA', 'LSI', 'Aligned', 'tSNE'), nn_control=list(), verbose=FALSE) {
assertthat::assert_that(methods::is(cds, 'cell_data_set'),
msg=paste('cds parameter is not a cell_data_set'))
assertthat::assert_that(
tryCatch(expr = ifelse(match.arg(reduction_method) == "",TRUE, TRUE),
error = function(e) FALSE),
msg = "reduction_method must be one of 'PCA', 'LSI', 'Aligned', 'tSNE', 'UMAP'")
reduction_method <- match.arg(reduction_method)
assertthat::assert_that(!is.null(SingleCellExperiment::reducedDims(cds)[[reduction_method]]),
msg = paste0("When reduction_method = '", reduction_method,
"' the cds must have been processed for it.",
" Please run the required processing function",
" before this one."))
nn_control_default <- get_global_variable('nn_control_annoy_euclidean')
nn_control <- set_nn_control(mode=1, nn_control=nn_control, nn_control_default=nn_control_default, nn_index=NULL, k=NULL, verbose=verbose)
nn_method <- nn_control[['method']]
if(nn_method == 'nn2') {
stop('make_cds_nn_index is not valid for method nn2')
}
reduced_matrix <- SingleCellExperiment::reducedDims(cds)[[reduction_method]]
nn_index <- make_nn_index(subject_matrix=reduced_matrix, nn_control=nn_control, verbose=verbose)
cds <- set_cds_nn_index(cds=cds, reduction_method=reduction_method, nn_index=nn_index, verbose=verbose)
return(cds)
}
# Return the nn_index that was made from the reduction_method
# reduced dimension matrix.
get_cds_nn_index <- function(cds, reduction_method=c('UMAP', 'PCA', 'LSI', 'Aligned', 'tSNE'), nn_method, verbose=FALSE) {
assertthat::assert_that(methods::is(cds, 'cell_data_set'),
msg=paste('cds parameter is not a cell_data_set'))
assertthat::assert_that(
tryCatch(expr = ifelse(match.arg(reduction_method) == "",TRUE, TRUE),
error = function(e) FALSE),
msg = "reduction_method must be one of 'PCA', 'LSI', 'Aligned', 'tSNE', 'UMAP'")
reduction_method <- match.arg(reduction_method)
assertthat::assert_that(
!is.null(cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]]),
msg = paste("There is no nearest neighbor index",
"for reduction_method",
reduction_method,
"and nearest neighbor method",
nn_method))
assertthat::assert_that(nn_method %in% c('annoy', 'hnsw'),
msg = paste0('get_cds_nn_index: unsupported nearest neighbor index type \'',
nn_method, '\'.'))
if(nn_method == 'annoy') {
nn_index <- cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']]
}
else
if(nn_method == 'hnsw') {
nn_index <- cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']]
}
else
stop('get_cds_nn_index: unsupported nearest neighbor index type \'', nn_method, '\'')
return(nn_index)
}
# Check that the annoy index exists.
# Returns logical TRUE if the index exists.
test_annoy_index <- function(nn_index, verbose=FALSE) {
res <- TRUE
if(is.null(nn_index[['annoy_index']])) {
if(!verbose) {
cs <- get_call_stack_as_string()
message('test_annoy_index: the annoy nearest neighbor does not exist\ncall stack: ', cs)
}
else {
message('test_annoy_index: the annoy nearest neighbor does not exist.')
}
return(FALSE)
}
tryCatch( {
dist_res <- nn_index[['annoy_index']]$getDistance(0,1)
},
error=function(emsg) {
if(!verbose) {
cs <- get_call_stack_as_string()
message('test_annoy_index: the annoy nearest neighbor does not exist\ncall stack: ', cs)
}
else {
message('test_annoy_index: the annoy nearest neighbor does not exist.')
}
res <<- FALSE
} )
return(res)
}
# Check that the hnsw index exists.
# Returns logical TRUE if the index exists.
test_hnsw_index <- function(nn_index, verbose=FALSE) {
res <- TRUE
if(is.null(nn_index[['hnsw_index']])) {
if(!verbose) {
cs <- get_call_stack_as_string()
message('test_hnsw_index: the hnsw nearest neighbor does not exist\ncall stack: ', cs)
}
else {
message('test_hnsw_index: the hnsw nearest neighbor does not exist.')
}
return(FALSE)
}
tryCatch( {
size_res <- nn_index[['hnsw_index']]$size()
},
error=function(emsg) {
if(!verbose) {
cs <- get_call_stack_as_string()
message('test_hnsw_index: the hnsw nearest neighbor does not exist\ncall stack: ', cs)
}
else {
message('test_hnsw_index: the hnsw nearest neighbor does not exist.')
}
res <<- FALSE
} )
return(res)
}
search_nn_annoy_index <- function(query_matrix, nn_index, metric, k, search_k, beg_row_index, end_row_index) {
assertthat::assert_that(beg_row_index <= end_row_index,
msg=paste0('search_nn_annoy_index: beg_row_index must be <= end_row_index'))
if(!is.null(nn_index[['version']]))
annoy_index <- nn_index[['annoy_index']]
else
if(!is.null(nn_index[['type']]))
annoy_index <- nn_index[['ann']]
else
annoy_index <- nn_index
nrow <- end_row_index - beg_row_index + 1
idx <- matrix(nrow=nrow, ncol=k)
dists <- matrix(nrow=nrow, ncol=k)
num_bad <- 0
offset <- beg_row_index - 1
for(i in seq(1, nrow)) {
nn_list <- annoy_index$getNNsByVectorList(query_matrix[i+offset,], k, search_k, TRUE)
if(length(nn_list$item) != k)
num_bad <- num_bad + 1
idx[i,] <- nn_list$item
dists[i,] <- nn_list$distance
}
if(metric == 'cosine') {
dists <- 0.5 * dists * dists
}
return(list(idx=idx+1, dists=dists, num_bad=num_bad))
}
#' @title Search a nearest neighbor index.
#'
#' @description Search a nearest neighbor index for cells near
#' those in the query_matrix.
#'
#' @param query_matrix a reduced dimension matrix used to find the
#' nearest neighbors in the index nn_index.
#' @param nn_index a nearest_neighbor index.
#' @param k an integer for the number of nearest neighbors to return for
#' each cell. Default is 25.
#' @param nn_control a list of parameters used to search the nearest
#' neighbor index. See the set_nn_control help for details. Note: the
#' default annoy search_k parameter value is set to the default value
#' of 2 * n_trees * k. It does not know the value of n_trees that was
#' used to build the annoy index so if a non-default n_trees value
#' was used to build the index, you may need to set search_k in
#' nn_control list when you run search_nn_index.
#' @param verbose a boolean indicating whether to emit verbose output.
#'
#' @return a list list(nn.idx, nn.dists) where nn.idx is
#' a matrix of nearest neighbor indices and nn.dists is a matrix of
#' the distance between the index given by the row number and the index
#' given in nn.idx. If the same reduced dim matrix is used to make the
#' index and search the index, the index given by the row number should
#' be in the row, usually in the first column.
#'
#' @examples
#' \donttest{
#' cds <- load_a549()
#' cds <- preprocess_cds(cds)
#' nn_index <- make_nn_index(SingleCellExperiment::reducedDims(cds)[['PCA']])
#' nn_res <- search_nn_index(SingleCellExperiment::reducedDims(cds)[['PCA']], nn_index, 10)
#' }
#'
#' @export
search_nn_index <- function(query_matrix, nn_index, k=25, nn_control=list(), verbose=FALSE) {
assertthat::assert_that(methods::is(query_matrix, 'matrix') ||
is_sparse_matrix(query_matrix),
msg=paste0('make_nn_matrix: the query_matrix object must be of type matrix'))
assertthat::assert_that(assertthat::is.count(k))
nn_control_default <- get_global_variable('nn_control_annoy_euclidean')
nn_control <- set_nn_control(mode=2, nn_control=nn_control, nn_control_default=nn_control_default, nn_index=nn_index, k=k, verbose=verbose)
nn_method <- nn_control[['method']]
assertthat::assert_that(nn_method %in% c('annoy', 'hnsw'),
msg = paste0('search_nn_index: unsupported nearest neighbor index type \'',
nn_method, '\'.'))
if(verbose) {
message('search_nn_index:')
message(' k: ', k)
report_nn_control(' nn_control: ', nn_control)
tick('search_nn_index: search time:')
}
k <- min(k, nrow(query_matrix))
cores <- nn_control[['cores']]
if(nn_method == 'nn2') {
stop('search_nn_index is not valid for method nn2')
} else
if(nn_method == 'annoy') {
if(!test_annoy_index(nn_index=nn_index, verbose=verbose)) {
stop_no_noise()
}
# notes:
# o set list names to nn.idx and nn.dists for compatibility with
# RANN::nn2()
#
num_row <- nrow(query_matrix)
idx <- matrix(nrow=num_row, ncol=k)
dists <- matrix(nrow=num_row, ncol=k)
metric <- nn_control[['metric']]
search_k <- nn_control[['search_k']]
if(cores <= 1) {
nn_res <- search_nn_annoy_index(query_matrix=query_matrix, nn_index=nn_index, metric=metric, k=k, search_k=search_k, beg_row_index=1, end_row_index=num_row)
if(nn_res[['num_bad']])
stop('annoy was unable to find ', k, ' nearest neighbors for ', nn_res[["num_bad"]], ' rows. You may need to increase the n_trees and/or search_k parameter values.')
nn_res <- list(nn.idx=nn_res[['idx']], nn.dists=nn_res[['dists']])
}
else {
omp_num_threads <- get_global_variable('omp_num_threads')
blas_num_threads <- get_global_variable('blas_num_threads')
RhpcBLASctl::omp_set_num_threads(1L)
RhpcBLASctl::blas_set_num_threads(1L)
if(cores > num_row)
cores <- num_row
tasks <- tasks_per_block(num_row, cores)
beg_block <- c(0,cumsum(tasks))[1:cores]+1
end_block <- cumsum(tasks)
nn_blocks <- list()
inplan <- future::plan()
future::plan(future::multicore, workers=cores)
on.exit(future::plan(inplan), add=TRUE)
for(iblock in seq(cores)) {
nn_blocks[[iblock]] <- future::future( { search_nn_annoy_index(query_matrix=query_matrix, nn_index=nn_index, metric=metric, k=k, search_k=search_k, beg_row_index=beg_block[[iblock]], end_row_index=end_block[[iblock]]) })
}
tot_bad <- 0
for(iblock in seq(cores)) {
nn_res <- future::value(nn_blocks[[iblock]])
if(nrow(nn_res[['idx']]) != end_block[[iblock]] - beg_block[[iblock]] + 1) {
stop('bad row count in nn_res')
}
idx[beg_block[[iblock]]:end_block[[iblock]],] <- nn_res[['idx']]
dists[beg_block[[iblock]]:end_block[[iblock]],] <- nn_res[['dists']]
tot_bad <- tot_bad + nn_res[['num_bad']]
}
if(tot_bad)
stop('annoy was unable to find ', k, ' nearest neighbors for ', tot_bad, ' rows. You may need to increase the n_trees and/or search_k parameter values.')
nn_res <- list(nn.idx=idx, nn.dists=dists)
RhpcBLASctl::omp_set_num_threads(as.integer(omp_num_threads))
RhpcBLASctl::blas_set_num_threads(as.integer(blas_num_threads))
}
}
else
if(nn_method == 'hnsw') {
if(!test_hnsw_index(nn_index=nn_index, verbose=verbose)) {
stop_no_noise()
}
assertthat::assert_that(nn_control[['ef']] >= k,
msg=paste0('search_nn_index: ef must be >= k'))
tmp <- RcppHNSW::hnsw_search(X=query_matrix,
ann=nn_index[['hnsw_index']],
k=k,
ef=nn_control[['ef']],
verbose=verbose,
n_threads=cores,
grain_size=nn_control[['grain_size']])
# The RcppHNSW documentation says that the L2 metric is the square
# of the Euclidean distance but my tests indicate that the L2 metric
# returns sqrt(sum((SingleCellExperiment::reducedDims(cds)[['UMAP']][i,] - SingleCellExperiment::reducedDims(cds)[['UMAP']][j,])^2)).
# Additionally, nn2, annoy, and hnsw return the same distance values for the euclidean
# metric.
nn_res <- list(nn.idx=tmp[['idx']], nn.dists=tmp[['dist']])
}
else
stop('search_nn_index: unsupported nearest neighbor index type \'', nn_method, '\'')
if(verbose) {
tock()
}
return(nn_res)
}
#' @title Search a nearest neighbor index that is stored in
#' the cds.
#'
#' @description Search a nearest neighbor index for cells near
#' those in the query_matrix.
#'
#' @param query_matrix a reduced dimension matrix used to find the
#' nearest neighbors in the index nn_index.
#' @param cds a cell_data_set in which the nearest neighbor index
#' is stored.
#' @param reduction_method a string giving the reduced
#' dimension matrix used to make the nearest neighbor index, and
#' determines where the index is stored in the cell_data_set.
#' Note: distances in tSNE space reflect spatial differences poorly
#' so using nearest neighbors with it may be meaningless.
#' @param k an integer for the number of nearest neighbors to return for
#' each cell. Default is 25.
#' @param nn_control a list of parameters used to make and search
#' the nearest neighbors indexes. See the set_nn_control help
#' for additional details. Note that if nn_control\[\['search_k'\]\]
#' is not defined, transfer_cell_labels will try to use
#' search_k <- 2 * n_trees * k where n_trees is the value used
#' to build the index. The default metric is cosine for
#' reduction_methods PCA, LSI, and Aligned, and is euclidean for
#' reduction_methods tSNE and UMAP.
#' @param verbose a boolean indicating whether to emit verbose output.
#'
#' @return a list list(nn.idx, nn.dists) where nn.idx is
#' a matrix of nearest neighbor indices and nn.dists is a matrix of
#' the distance between the index given by the row number and the index
#' given in nn.idx. If the same reduced dim matrix is used to make the
#' index and search the index, the index given by the row number should
#' be in the row, usually in the first column.
#'
#' @examples
#' \donttest{
#' cds <- load_a549()
#' cds <- preprocess_cds(cds)
#' cds <- make_cds_nn_index(cds, 'PCA')
#' nn_res <- search_cds_nn_index(SingleCellExperiment::reducedDims(cds)[['PCA']], cds, 'PCA', 10)
#' }
#'
#' @export
search_cds_nn_index <- function(query_matrix, cds, reduction_method=c('UMAP', 'PCA', 'LSI', 'Aligned', 'tSNE'), k=25, nn_control=list(), verbose=FALSE) {
assertthat::assert_that(methods::is(query_matrix, 'matrix') ||
is_sparse_matrix(query_matrix),
msg=paste0('make_nn_matrix: the query_matrix object must be of type matrix'))
assertthat::assert_that(methods::is(cds, 'cell_data_set'),
msg=paste('cds parameter is not a cell_data_set'))
assertthat::assert_that(
tryCatch(expr = ifelse(match.arg(reduction_method) == "",TRUE, TRUE),
error = function(e) FALSE),
msg = "reduction_method must be one of 'PCA', 'LSI', 'Aligned', 'tSNE', 'UMAP'")
reduction_method <- match.arg(reduction_method)
assertthat::assert_that(!is.null(SingleCellExperiment::reducedDims(cds)[[reduction_method]]),
msg = paste0("When reduction_method = '", reduction_method,
"' the cds must have been processed for it.",
" Please run the required processing function",
" before this one."))
assertthat::assert_that(assertthat::is.count(k))
if(reduction_method == 'tSNE' || reduction_method == 'UMAP')
nn_control_default <- get_global_variable('nn_control_annoy_euclidean')
else
nn_control_default <- get_global_variable('nn_control_annoy_cosine')
# Use set_nn_control to find nn method, which we need in order to select the correct index,
# and we may need the index to set the nn_control[['search_k']].
nn_control_tmp <- set_nn_control(mode=2,
nn_control=nn_control,
nn_control_default=nn_control_default,
nn_index=NULL,
k=k,
verbose=verbose)
nn_index <- get_cds_nn_index(cds=cds, reduction_method=reduction_method, nn_control_tmp[['method']], verbose=FALSE)
nn_control <- set_nn_control(mode=2,
nn_control=nn_control,
nn_control_default=nn_control_default,
nn_index=nn_index,
k=k,
verbose=verbose)
nn_res <- search_nn_index(query_matrix=query_matrix,
nn_index=nn_index,
k=k,
nn_control=nn_control,
verbose=verbose)
return(nn_res)
}
# Notice that this function uses the available SingleCellExperiment::reducedDims(cds)[[reduction_method]] matrix
# for verifying the query matrix -- be certain that the search (and index build) were run
# on this matrix so call this function immediately after running the search.
# Notes:
# o this function was written to store nn search information in the cds when
# we intended to re-use an index for later processing stages but
# it appears that there is little or no opportunity to re-use
# indices so we are not storing search information at this time.
# This function is not in use currently and may fall into disrepair.
set_cds_nn_search <- function(cds, reduction_method=c('UMAP', 'PCA', 'LSI', 'Aligned', 'tSNE'), search_id, ef, k, nn_control=list(), verbose=TRUE) {
assertthat::assert_that(methods::is(cds, 'cell_data_set'),
msg=paste('cds parameter is not a cell_data_set'))
assertthat::assert_that(
tryCatch(expr = ifelse(match.arg(reduction_method) == "",TRUE, TRUE),
error = function(e) FALSE),
msg = "reduction_method must be one of 'PCA', 'LSI', 'Aligned', 'tSNE', 'UMAP'")
reduction_method <- match.arg(reduction_method)
assertthat::assert_that(!is.null(SingleCellExperiment::reducedDims(cds)[[reduction_method]]),
msg = paste0("When reduction_method = '", reduction_method,
"' the cds must have been processed for it.",
" Please run the required processing function",
" before this one."))
assertthat::assert_that(!is.null(search_id),
msg = paste0('You must give a search_id string.'))
assertthat::assert_that(!is.null(k),
msg = paste0('You must give a k value.'))
reduced_matrix <- SingleCellExperiment::reducedDims(cds)[[reduction_method]]
nn_control_default <- get_global_variable('nn_control_annoy_euclidean')
nn_control <- set_nn_control(mode=2, nn_control=nn_control, nn_control_default=nn_control_default, nn_index=NULL, k=k, verbose=verbose)
nn_method <- nn_control[['method']]
if(is.null(cds@reduce_dim_aux[[reduction_method]][['nn_search']])) {
cds@reduce_dim_aux[[reduction_method]][['nn_search']] <- S4Vectors::SimpleList()
}
if(nn_method == 'nn2') {
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]] <- S4Vectors::SimpleList()
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['method']] <- nn_method
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['k']] <- k
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['nrow']] <- nrow(reduced_matrix)
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['ncol']] <- ncol(reduced_matrix)
}
else
if(nn_method == 'annoy') {
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]] <- S4Vectors::SimpleList()
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['method']] <- nn_method
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['search_k']] <- nn_contol[['search_k']]
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['k']] <- k
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['nrow']] <- nrow(reduced_matrix)
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['ncol']] <- ncol(reduced_matrix)
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['grain_size']] <- nn_control[['grain_size']]
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['cores']] <- nn_control[['cores']]
}
else
if(nn_method == 'hnsw') {
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]] <- S4Vectors::SimpleList()
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['method']] <- nn_method
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['ef']] <- ef
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['k']] <- k
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['nrow']] <- nrow(reduced_matrix)
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['ncol']] <- ncol(reduced_matrix)
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['grain_size']] <- nn_control[['grain_size']]
cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]][['cores']] <- nn_control[['cores']]
}
else {
stop('set_cds_nn_search: unsupported nearest neighbor index type \'', nn_method, '\'')
}
return(cds)
}
# Get nearest neighbor search information stored in the cds.
# This function is not in use currently and may fall into disrepair.
get_cds_nn_search <- function(cds, reduction_method=c('UMAP', 'PCA', 'LSI', 'Aligned', 'tSNE'), search_id, verbose=TRUE) {
assertthat::assert_that(methods::is(cds, 'cell_data_set'),
msg=paste('cds parameter is not a cell_data_set'))
assertthat::assert_that(
tryCatch(expr = ifelse(match.arg(reduction_method) == "",TRUE, TRUE),
error = function(e) FALSE),
msg = "reduction_method must be one of 'PCA', 'LSI', 'Aligned', 'tSNE', 'UMAP'")
reduction_method <- match.arg(reduction_method)
assertthat::assert_that(!is.null(SingleCellExperiment::reducedDims(cds)[[reduction_method]]),
msg = paste0("When reduction_method = '", reduction_method,
"' the cds must have been processed for it.",
" Please run the required processing function",
" before this one."))
assertthat::assert_that(!is.null(search_id),
msg = paste0('You must give a search_id string.'))
assertthat::assert_that(!is.null(cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]]),
msg = paste0("Stopping because there is no information in the cds for search_id = '",
search_id, "'."))
nn_search <- cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]]
nn_method <- nn_search[['method']]
if(nn_method == 'nn2') {
nn_control <- list(method=nn_method,
grain_size=nn_search[['grain_size']],
cores=nn_search[['cores']])
}
else
if(nn_method == 'annoy') {
nn_control <- list(method=nn_method,
search_k=nn_search[['search_k']],
grain_size=nn_search[['grain_size']],
cores=nn_search[['cores']])
}
else
if(nn_method == 'hnsw') {
nn_control <- list(method=nn_method,
ef=nn_search[['ef']],
grain_size=nn_search[['grain_size']],
cores=nn_search[['cores']])
}
else {
stop('get_cds_nn_search: unsupported nearest neighbor search type \'', nn_method, '\'')
}
return(nn_control)
}
# Get index build and search information that's stored in the cds.
# This function is not used at this time. It may fall into disrepair.
get_cds_nn_control <- function(cds, reduction_method=c('UMAP', 'PCA', 'LSI', 'Aligned', 'tSNE'), search_id, verbose=TRUE) {
assertthat::assert_that(methods::is(cds, 'cell_data_set'),
msg=paste('cds parameter is not a cell_data_set'))
assertthat::assert_that(
tryCatch(expr = ifelse(match.arg(reduction_method) == "",TRUE, TRUE),
error = function(e) FALSE),
msg = "reduction_method must be one of 'PCA', 'LSI', 'Aligned', 'tSNE', 'UMAP'")
reduction_method <- match.arg(reduction_method)
assertthat::assert_that(!is.null(SingleCellExperiment::reducedDims(cds)[[reduction_method]]),
msg = paste0("When reduction_method = '", reduction_method,
"' the cds must have been processed for it.",
" Please run the required processing function",
" before this one."))
assertthat::assert_that(!is.null(search_id),
msg = paste0('You must give a search_id string.'))
assertthat::assert_that(!is.null(cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]]),
msg = paste0("Stopping because there is no information in the cds for search_id = '",
search_id, "'."))
nn_search <- cds@reduce_dim_aux[[reduction_method]][['nn_search']][[search_id]]
nn_method <- nn_search[['method']]
if(nn_method == 'nn2') {
nn_control <- list(method=nn_method,
grain_size=nn_search[['grain_size']],
cores=nn_search[['cores']])
}
else
if(nn_method == 'annoy') {
nn_control <- list(method=nn_method,
metric=cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['metric']],
n_trees=cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['n_trees']],
search_k=nn_search[['search_k']],
grain_size=nn_search[['grain_size']],
cores=nn_search[['cores']])
}
else
if(nn_method == 'hnsw') {
nn_control <- list(method=nn_method,
metric=cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['metric']],
M=cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['M']],
ef_construction=cds@reduce_dim_aux[[reduction_method]][['nn_index']][[nn_method]][['nn_index']][['ef_construction']],
ef=nn_search[['ef']],
grain_size=nn_search[['grain_size']],
cores=nn_search[['cores']])
}
else {
stop('get_cds_nn_control: unsupported nearest neighbor search type \'', nn_method, '\'')
}
if(verbose) {
cs <- get_call_stack_as_string()
message('get_cds_nn_control: (',cs,')')
report_nn_control(' nn_control: ', nn_control=nn_control)
}
return(nn_control)
}
#' @title Search a subject matrix for nearest neighbors to a query_matrix.
#'
#' @description Make a nearest neighbors index using the subject matrix and
#' search it for nearest neighbors to the query_matrix.
#'
#' @param subject_matrix a matrix used to build a nearest neighbor index.
#' @param query_matrix a matrix used to search the subject_matrix
#' nearest neighbor index.
#' @param k an integer for the number of nearest neighbors to return for
#' each cell. Default is 25.
#' @param nn_control a list of parameters used to make and search the nearest
#' neighbor index. See the set_nn_control help for details.
#' @param verbose a boolean indicating whether to emit verbose output.
#'
#' @return a list list(nn.idx, nn.dists) where nn.idx is
#' a matrix of nearest neighbor indices and nn.dists is a matrix of
#' the distance between the index given by the row number and the index
#' given in nn.idx. If the query_matrix is the same as the subject
#' matrix, the index given by the row number should be in the row, usually
#' in the first column.
#'
#' @export
search_nn_matrix <- function(subject_matrix, query_matrix, k=25, nn_control=list(), verbose=FALSE) {
assertthat::assert_that(methods::is(subject_matrix, 'matrix') ||
is_sparse_matrix(subject_matrix),
msg=paste0('search_nn_matrix: the subject_matrix object must be of type matrix'))
assertthat::assert_that(methods::is(query_matrix, 'matrix') ||
is_sparse_matrix(query_matrix),
msg=paste0('search_nn_matrix: the query_matrix object must be of type matrix'))
nn_control_default <- get_global_variable('nn_control_annoy_euclidean')
nn_control <- set_nn_control(mode=3, nn_control=nn_control, nn_control_default=nn_control_default, nn_index=NULL, k=k, verbose=verbose)
method <- nn_control[['method']]
k <- min(k, nrow(subject_matrix))
if(verbose) {
message('search_nn_matrix:')
message(' k: ', k)
report_nn_control(' nn_control: ', nn_control)
tick('search_nn_matrix: search_time')
}
if(method == 'nn2') {
nn_res <- RANN::nn2(subject_matrix, query_matrix, k, searchtype = "standard")
}
else {
nn_index <- make_nn_index(subject_matrix, nn_control=nn_control, verbose=verbose)
nn_res <- search_nn_index(query_matrix=query_matrix, nn_index=nn_index, k=k, nn_control=nn_control, verbose=verbose)
}
if(verbose)
tock()
return(nn_res)
}
# Check whether the neighbor in the first column == row index.
# The hnsw_search help page states
# Every item in the dataset is considered to be a neighbor of itself,
# so the first neighbor of item i should always be i itself.
check_nn_col1 <- function(mat) {
irow <- seq(nrow(mat))
if(any(mat[,1] != irow)) {
return(FALSE)
}
return(TRUE)
}
# When the search query matrix is the same as the
# index matrix, we expect that the row index
# exists in the row. Count the cases for which
# this is not true and the distances are not
# all zero.
count_nn_missing_self_index <- function(nn_res, verbose=FALSE) {
idx <- nn_res[['nn.idx']]
dst <- nn_res[['nn.dists']]
len <- length(idx[1,])
num_missing <- 0
if(nrow(idx) == 0)
return(0)
for (irow in 1:nrow(idx)) {
vidx <- idx[irow,]
vdst <- dst[irow,]
if(vidx[[1]] != irow) {
dself <- FALSE
dzero <- TRUE
if(length(vidx) == 1) {
num_missing <- num_missing + 1
next
}
for(i in seq(1, len, 1)) {
if(vidx[[i]] == irow) {
dself = TRUE
break
}
if(vdst[[i]] > .Machine$double.xmin) {
dzero <- FALSE
}
}
if(dself == FALSE && dzero == FALSE) {
num_missing <- num_missing + 1
}
}
}
if(verbose) {
message('count_nn_missing_self_index:')
message(' ', num_missing, ' out of ', nrow(nn_res[['nn.idx']]), ' rows are missing the row index')
message(' \'recall\': ', formatC((as.double(nrow(nn_res[['nn.idx']]) - num_missing) / as.double(nrow(nn_res[['nn.idx']]))) * 100.0), "%")
}
return(num_missing)
}
# For searches in which the index and search sets
# are the same, sort nearest neighbor results so
# that row index point is in the first column. If the
# row index point was missed, shift the results to the
# right one place and store the row index point in the
# first element.
#
# This is required because approximate searches may
# not store the row index point in the first column if
# there is more than one point with zero distance,
# or, the function may miss the row index point.
#
# Notes:
# o we assume that the self index distance is zero
# o do not use this if the search point set differs
# from the index build set.
#
swap_nn_row_index_point <- function(nn_res, verbose=FALSE) {
if(verbose) {
count_nn_missing_self_index(nn_res, verbose)
}
# Skip if there is no need to swap indices.
if(check_nn_col1(nn_res$nn.idx))
return(nn_res)
idx <- nn_res[['nn.idx']]
dst <- nn_res[['nn.dists']]
diagnostics <- FALSE
num_no_recall <- 0
if(nrow(idx) == 0 )
return(nn_res)
for (irow in 1:nrow(idx)) {
vidx <- idx[irow,]
vdst <- dst[irow,]
if(vidx[[1]] != irow) {
if(diagnostics) {
message('swap_nn_row_index_point: adjust nn matrix row: ', irow)
message('swap_nn_row_index_point: idx row pre fix: ', paste(vidx, collapse=' '))
}
if(length(vidx) == 1) {
num_no_recall <- num_no_recall + 1
next
}
if(vidx[[2]] == irow) {
vidx[[2]] <- vidx[[1]]
vidx[[1]] <- irow
} else {
match <- FALSE
for(i in seq(2, length(vidx), 1)) {
if(vidx[[i]] == irow) {
vidx[[i]] <- vidx[[1]]
vidx[[1]] <- irow
match <- TRUE
break
}
}
if(!match) {
if(diagnostics) {
message('swap_nn_row_index_point: dst row pre fix: ', paste(vdst, collapse=' '))
}
for(i in seq(1, length(vidx), 1)) {
if(vdst[[i]] > .Machine$double.xmin) {
num_no_recall <- num_no_recall + 1
}
}
vidx <- c(irow, vidx[1:(length(vidx)-1)])
vdst <- c(0, vdst[1:(length(vdst)-1)])
dst[irow,] <- vdst
if(diagnostics) {
message('swap_nn_row_index_point: dst row post fix: ', paste(vdst, collapse=' '))
}
}
}
idx[irow,] <- vidx
if(diagnostics) {
message('swap_nn_row_index_point: idx row post fix: ', paste(vidx, collapse=' '))
}
}
}
if(num_no_recall > 0) {
frac_recall <- (nrow(idx)-num_no_recall) / nrow(idx)
format_recall <- sprintf('%3.1f', frac_recall * 100.0)
message('The search result is expected to include the query row value (self)\n',
'because the NN index includes the query objects; however, this search result\n',
'is missing ', num_no_recall, ' self values (recall: ', format_recall, '%). Monocle3 has added the self\n',
'values to the first column of the search result in order to allow further\n',
'analysis -- but it is missing important nearest neighbors so you need to\n',
'increase the sensitivity for making and/or searching the index.')
}
nn_res_out <- list(nn.idx=idx, nn.dists=dst)
return(nn_res_out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.