R/sparse.cos.R

Defines functions sparse.cos

Documented in sparse.cos

#' Cosine similaritiy on sparse matrices/vectors
#'
#' @description
#' Cosine similarity between columns or rows of a single sparse matrix or a pair of sparse matrices and/or vectors
#'
#' @details
#' Cosine similarity is an exceptionally efficient calculation for sparse matrices due to extremely fast vector operations.
#'
#' "sparse.cos" applies a Euclidean norm to provide very similar results to Pearson correlation, restricted to the positive orthant.
#' 
#' This function adopts the sparse matrix computational strategy applied by qlcMatrix::cosSparse, and extends it to sparse vectors.
#' 
#' Note that negative values may be returned due to the use of Euclidean normalization. However, this is usually only the case in random matrices.
#' 
#' @param x matrix or vector of, or coercible to, class "dgCMatrix" or "sparseVector"
#' @param y (optional) matrix or vector of, or coercible to, class "dgCMatrix" or "sparseVector"
#' @param return.sparse if result is a matrix, return as a "dgeMatrix", otherwise dense "matrix"
#' @examples
#' \dontrun{
#' library(Matrix)
#' 
#' m1 <- rsparsematrix(1000, 10000, density = 0.1)
#' m2 <- rsparsematrix(1000, 100, density = 0.2)
#'
#' # Input a vector and a vector
#' r <- sparse.cos(m1[,1],m1[,2])
#'
#' # Input a vector and a matrix
#' r <- sparse.cos(m1[,1],m1[,1:100])
#' 
#' # Input a matrix and a vector
#' r <- sparse.cos(m1[,1:100],m1[,1])
#' 
#' # Input just a single matrix
#' res_m2 <- sparse.cos(m2)
#'
#' # Input a matrix and a matrix
#' res <- sparse.cos(m1, m2)
#' # note that negative values are returned, the above are random matrices
#' plot(density(res@x))
#' 
#' # have a look at a non-random matrix.
#' # this matrix shows similarity of gene expression across cells from mouse embryos
#' data(moca7k)
#' res <- sparse.cos(moca7k[,1:1000])
#' plot(density(res@x))
#' # note how the non-random signal resulted in no negative values
#' 
#' # calculate distance from similarity
#' # subtract by 1 + very small number to avoid machine tolerance causing negative values
#' dist <- 1 + 1e-10 - res
#' lines(density(dist@x), col = "red")
#' 
#' # qlcMatrix::cosSparse is a great standard for comparison
#' # also consider wordspace::dist.matrix, but it only is faster in some conditions
#' 
#' library(qlcMatrix)
#' max(abs(as.matrix(qlcMatrix::cosSparse(moca7k[,1:1000])) - res))
#' [1] 3.352874e-14
#'
#' library(rbenchmark)
#' 
#' # compare to qlcMatrix::cosSparse
#' moca.sparse <- moca7k[,1:1000]
#' moca.dense <- as.matrix(moca.sparse)
#' #' benchmark(
#'        "lsmf::sparse.cos" = sparse.cos(moca.sparse),
#'    "qlcMatrix::cosSparse" = qlcMatrix::cosSparse(moca.sparse), 
#'    replications = 10)
#' 
#' #                   test replications elapsed relative 
#' # 1     lsmf::sparse.cos           10    2.98    1.000      
#' # 2 qlcMatrix::cosSparse           10    3.08    1.034     
#'
#' # compare to base::cor
#' benchmark(
#'    "lsmf::sparse.cos"     = sparse.cos(moca.sparse),
#'    "base::cor"            = cor(moca.dense),
#'    replications = 1)
#'
#' #                test  replications elapsed relative 
#' # 1         base::cor             1    6.42   22.138     
#' # 2  lsmf::sparse.cos             1    0.29    1.000      
#' }
sparse.cos <- function(x, y = NULL, return.sparse = FALSE) {
  require(Matrix)
  if (grepl("atrix", class(x)[1])) {
    colnames(x) <- rownames(x) <- NULL
    if(class(x)[1] != "dgCMatrix") x <- as(x, "dgCMatrix")
    if(!is.null(y)){
      if(grepl("atrix", class(y)[1])){
        # x is a matrix, y is a matrix
        colnames(y) <- rownames(y) <- NULL
        if(class(y)[1] != "dgCMatrix") y <- as(y, "dgCMatrix")
        res <- crossprod(tcrossprod(x, Diagonal(x = as.vector(crossprod(x ^ 2, rep(1, x@Dim[1]))) ^ -0.5)), tcrossprod(y, Diagonal(x = as.vector(crossprod(y ^ 2, rep(1, x@Dim[1]))) ^ -0.5)))
        ifelse(return.sparse, return(res), return(as.matrix(res)))
      } else {
        # x is a matrix, y is a vector
        return(as.vector(crossprod(tcrossprod(x, Diagonal(x = as.vector(crossprod(x ^ 2, rep(1, x@Dim[1]))) ^ -0.5)), tcrossprod(y, crossprod(y ^ 2, rep(1, x@Dim[1])) ^ -0.5))))
      }
    } else {
      # x is a matrix, y is NULL
      res <- crossprod(tcrossprod(x, Diagonal(x = as.vector(crossprod(x ^ 2, rep(1, x@Dim[1]))) ^ -0.5)))
      ifelse(return.sparse, return(res), return(as.matrix(res)))
    }
  } else {
    if(is.null(y)) stop("x is a vector and y is NULL")
    if(grepl("atrix", class(y)[1])){
      # x is a vector, y is a matrix
      if(class(y)[1] != "dgCMatrix") y <- as(y, "dgCMatrix")
      colnames(y) <- rownames(y) <- NULL
      return(as.vector(crossprod(tcrossprod(x, crossprod(x ^ 2, rep(1, length(x)))), tcrossprod(y, Diagonal(x = as.vector(crossprod(y ^ 2, rep(1, length(x)))) ^ -0.5)))))
    } else{
      # x is a vector, y is a vector
      return(as.vector(crossprod(tcrossprod(x, crossprod(x ^ 2, rep(1, length(x))) ^ -0.5), tcrossprod(y, crossprod(y ^ 2, rep(1, length(x))) ^ -0.5))))
    }
  }
}
zdebruine/LSMF documentation built on Jan. 1, 2021, 1:50 p.m.