#' @keywords internal
SUPPORTED_METRICS = c('cosine', 'hamming')
#' @keywords internal
ERROR_MESSAGE_METRIC = 'Only cosine and Hamming metric are supported (and custom functions in some cases, see documentation)'
#' @keywords internal
#' @importFrom Matrix Matrix
#' @importFrom Matrix sparseMatrix
#' @importFrom Matrix drop0
#' @importFrom Matrix diag
#' @importFrom Matrix t
#' @importFrom methods as
#' @importFrom parallel makeCluster
#' @importFrom parallel stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach %dopar%
#' @importFrom foreach foreach
#' @title Generate similarity matrix using different methods
#' @description A collection of functions that take a numeric / 0-1 integer matrix and calculate row-wise similarity.
#' @param X A matrix, or an object that can be coerced into a matrix. Values depend on the type of metric used (numeric, 1/0 integers, etc.)
#' @param metric One of the supported metrics. Currently \code{hamming} or \code{cosine}. For loop based functions you can also specify a function that takes two rows from the raw X and returns a distance
#' @param row_blocks Integer. How to divide matrix rows into blocks for block-wise similarity calculation (using matrix multiplication). Default is 1 which means similarity is calculated in a single step.
#' @param col_block Integer. Not implemented yet
#' @param thresh Float. Minimal similarity threshold to be returned. Values below are converted to 0 (to allow sparse representation). Default is 0.0
#' @param n_cpu Integer. Number of cores to use for the local cluster (using the \code{doParallel} and \code{parallel} backend). Default is 1 which results in a simple R loop. Negative numbers are interpreted as "all CPU expect".
#' @param cl A cluster object, pointing to the cluster to be used instead of a local cluster. This option overrides the \code{n_cpu} parameter.
#' @param include_diag Logical. Should the diagonal of 1 be included in the matrix? This can help save memory. Default is \code{TRUE}.
#' @return An object of \code{symmetricMatrix} class. Default is a sparse matrix but this can be modified using the \code{sparse} parameter
#' @describeIn sim_loopR Generate sparse similarity matrix using simple R loop in a single thread or a cluster
#' @export
sim_loopR <- function(X, metric, thresh = 0.0, n_cpu = 1, cl = NA, include_diag = TRUE) {
X <- as.matrix(X)
if (class(metric) == 'character') {
if (tolower(metric) == 'cosine') {
storage.mode(X) <- 'numeric'
X <- X / sqrt(rowSums(X^2))
row_sim <- function(x, y) {sum(x * y)}
} else if (tolower(metric) == 'hamming') {
X <- X != 0.0
row_sim <- function(x, y) {sum((x & y) + ((!x) & !(y))) / length(x)}
} else {stop(ERROR_MESSAGE_METRIC)}
} else if (class(metric) == 'function') {
row_sim <- metric
} else {stop(ERROR_MESSAGE_METRIC)}
if (n_cpu == 1 & is.na(cl)) {
S <- Matrix(data = 0, nrow = nrow(X), ncol = nrow(X), sparse = TRUE)
for (i in 1:nrow(X)) {
for (j in 1:i) {
if ((i == j) & !include_diag) {next}
s <- row_sim(X[i,], X[j,])
if (s >= thresh) {S[i,j] <- s; S[j,i] <- s}
}
}
S <- as(S, 'symmetricMatrix')
} else {
if (n_cpu <= 0) {n_cpu <- detectCores(logical = FALSE)[1] + n_cpu}
stop_cl <- FALSE
if (is.na(cl)) {
# based on https://github.com/rstudio/rstudio/issues/6692
cl <- makeCluster(n_cpu, setup_timeout = 0.5)
stop_cl <- TRUE
}
registerDoParallel(cl)
n <- nrow(X)
raw_S <- foreach(i = 1:n, .combine=rbind) %dopar% {
row_sim_matrix <- matrix(0, nrow = 0, ncol = 3)
for (j in i:n) {
if ((i == j) & !include_diag) {next}
s = row_sim(X[i,], X[j,])
if (s >= thresh) {
row_sim_matrix <- rbind(row_sim_matrix, c(i, j, s))
}
}
row_sim_matrix
}
if (stop_cl) {stopCluster(cl)}
S <- sparseMatrix(i = raw_S[,1], j = raw_S[,2], x = raw_S[,3], symmetric = TRUE)
}
return(S)
}
#' @keywords internal
cosine_self_similarity_ <- function(X, row_range) {
return(
X %*% t(X[row_range, , drop = FALSE])
)
}
#' @keywords internal
hamming_self_similarity_ <- function(X, row_range) {
Y = X[row_range, , drop = FALSE]
return(
(X %*% t(Y) + (1 - X) %*% t(1 - Y)) / ncol(X)
)
}
#' @describeIn sim_loopR Generate sparse similarity matrix using block-wise matrix multiplication
#' @export
sim_blocksR <- function(X, metric, row_blocks = 1, thresh = 0.0, n_cpu = 1, cl = NA, include_diag = TRUE) {
X <- as.matrix(X)
rows <- ceiling(nrow(X) / row_blocks)
if (class(metric) == 'character') {
if (tolower(metric) == 'cosine') {
storage.mode(X) <- 'numeric'
X <- X / sqrt(rowSums(X^2))
sim_f <- cosine_self_similarity_
} else if (tolower(metric) == 'hamming') {
storage.mode(X) <- 'logical'
sim_f <- hamming_self_similarity_
} else {stop(ERROR_MESSAGE_METRIC)}
} else {stop(ERROR_MESSAGE_METRIC)}
if (n_cpu == 1 & is.na(cl)) {
S_sparse_list <- list()
for (b in 1:row_blocks) {
row_range <- (1 + (b - 1) * rows):min((b * rows), nrow(X))
Sb <- sim_f(X, row_range)
if (thresh > 0) {Sb[Sb < thresh] <- 0}
S_sparse_list[[b]] <- Matrix(Sb, sparse = TRUE)
}
} else {
if (n_cpu <= 0) {n_cpu <- detectCores(logical = FALSE)[1] + n_cpu}
stop_cl <- FALSE
if (is.na(cl)) {
# based on https://github.com/rstudio/rstudio/issues/6692
cl <- makeCluster(n_cpu, setup_timeout = 0.5)
stop_cl <- TRUE
}
registerDoParallel(cl)
S_sparse_list <- foreach(b = 1:row_blocks, .packages = 'Matrix') %dopar% {
row_range <- (1 + (b - 1) * rows):min((b * rows), nrow(X))
Sb <- sim_f(X, row_range)
if (thresh > 0) {Sb[Sb < thresh] <- 0}
Matrix(Sb, sparse = TRUE)
}
if (stop_cl) {stopCluster(cl)}
}
S <- as(Reduce(cbind, S_sparse_list), "symmetricMatrix")
if (!include_diag) {
diag(S) <- 0
S <- drop0(S)
}
return(S)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.