#' Cosine similaritiy on sparse matrices/vectors
#'
#' @description
#' Cosine similarity between columns or rows of a single sparse matrix or a pair of sparse matrices and/or vectors
#'
#' @details
#' Cosine similarity is an exceptionally efficient calculation for sparse matrices due to extremely fast vector operations.
#'
#' "sparse.cos" applies a Euclidean norm to provide very similar results to Pearson correlation, restricted to the positive orthant.
#'
#' This function adopts the sparse matrix computational strategy applied by qlcMatrix::cosSparse, and extends it to sparse vectors.
#'
#' Note that negative values may be returned due to the use of Euclidean normalization. However, this is usually only the case in random matrices.
#'
#' @param x matrix or vector of, or coercible to, class "dgCMatrix" or "sparseVector"
#' @param y (optional) matrix or vector of, or coercible to, class "dgCMatrix" or "sparseVector"
#' @param return.sparse if result is a matrix, return as a "dgeMatrix", otherwise dense "matrix"
#' @examples
#' \dontrun{
#' library(Matrix)
#'
#' m1 <- rsparsematrix(1000, 10000, density = 0.1)
#' m2 <- rsparsematrix(1000, 100, density = 0.2)
#'
#' # Input a vector and a vector
#' r <- sparse.cos(m1[,1],m1[,2])
#'
#' # Input a vector and a matrix
#' r <- sparse.cos(m1[,1],m1[,1:100])
#'
#' # Input a matrix and a vector
#' r <- sparse.cos(m1[,1:100],m1[,1])
#'
#' # Input just a single matrix
#' res_m2 <- sparse.cos(m2)
#'
#' # Input a matrix and a matrix
#' res <- sparse.cos(m1, m2)
#' # note that negative values are returned, the above are random matrices
#' plot(density(res@x))
#'
#' # have a look at a non-random matrix.
#' # this matrix shows similarity of gene expression across cells from mouse embryos
#' data(moca7k)
#' res <- sparse.cos(moca7k[,1:1000])
#' plot(density(res@x))
#' # note how the non-random signal resulted in no negative values
#'
#' # calculate distance from similarity
#' # subtract by 1 + very small number to avoid machine tolerance causing negative values
#' dist <- 1 + 1e-10 - res
#' lines(density(dist@x), col = "red")
#'
#' # qlcMatrix::cosSparse is a great standard for comparison
#' # also consider wordspace::dist.matrix, but it only is faster in some conditions
#'
#' library(qlcMatrix)
#' max(abs(as.matrix(qlcMatrix::cosSparse(moca7k[,1:1000])) - res))
#' [1] 3.352874e-14
#'
#' library(rbenchmark)
#'
#' # compare to qlcMatrix::cosSparse
#' moca.sparse <- moca7k[,1:1000]
#' moca.dense <- as.matrix(moca.sparse)
#' #' benchmark(
#' "lsmf::sparse.cos" = sparse.cos(moca.sparse),
#' "qlcMatrix::cosSparse" = qlcMatrix::cosSparse(moca.sparse),
#' replications = 10)
#'
#' # test replications elapsed relative
#' # 1 lsmf::sparse.cos 10 2.98 1.000
#' # 2 qlcMatrix::cosSparse 10 3.08 1.034
#'
#' # compare to base::cor
#' benchmark(
#' "lsmf::sparse.cos" = sparse.cos(moca.sparse),
#' "base::cor" = cor(moca.dense),
#' replications = 1)
#'
#' # test replications elapsed relative
#' # 1 base::cor 1 6.42 22.138
#' # 2 lsmf::sparse.cos 1 0.29 1.000
#' }
sparse.cos <- function(x, y = NULL, return.sparse = FALSE) {
require(Matrix)
if (grepl("atrix", class(x)[1])) {
colnames(x) <- rownames(x) <- NULL
if(class(x)[1] != "dgCMatrix") x <- as(x, "dgCMatrix")
if(!is.null(y)){
if(grepl("atrix", class(y)[1])){
# x is a matrix, y is a matrix
colnames(y) <- rownames(y) <- NULL
if(class(y)[1] != "dgCMatrix") y <- as(y, "dgCMatrix")
res <- crossprod(tcrossprod(x, Diagonal(x = as.vector(crossprod(x ^ 2, rep(1, x@Dim[1]))) ^ -0.5)), tcrossprod(y, Diagonal(x = as.vector(crossprod(y ^ 2, rep(1, x@Dim[1]))) ^ -0.5)))
ifelse(return.sparse, return(res), return(as.matrix(res)))
} else {
# x is a matrix, y is a vector
return(as.vector(crossprod(tcrossprod(x, Diagonal(x = as.vector(crossprod(x ^ 2, rep(1, x@Dim[1]))) ^ -0.5)), tcrossprod(y, crossprod(y ^ 2, rep(1, x@Dim[1])) ^ -0.5))))
}
} else {
# x is a matrix, y is NULL
res <- crossprod(tcrossprod(x, Diagonal(x = as.vector(crossprod(x ^ 2, rep(1, x@Dim[1]))) ^ -0.5)))
ifelse(return.sparse, return(res), return(as.matrix(res)))
}
} else {
if(is.null(y)) stop("x is a vector and y is NULL")
if(grepl("atrix", class(y)[1])){
# x is a vector, y is a matrix
if(class(y)[1] != "dgCMatrix") y <- as(y, "dgCMatrix")
colnames(y) <- rownames(y) <- NULL
return(as.vector(crossprod(tcrossprod(x, crossprod(x ^ 2, rep(1, length(x)))), tcrossprod(y, Diagonal(x = as.vector(crossprod(y ^ 2, rep(1, length(x)))) ^ -0.5)))))
} else{
# x is a vector, y is a vector
return(as.vector(crossprod(tcrossprod(x, crossprod(x ^ 2, rep(1, length(x))) ^ -0.5), tcrossprod(y, crossprod(y ^ 2, rep(1, length(x))) ^ -0.5))))
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.