FuseNet: Non-Parametric Network Fusion

Documented in Bootstrap EuclideanDist GaussianDist L1Norm L2Norm MatrixBigPower MatrixPower MergeLists Normalization PCA ProjectSketch RunGeomSketch Scaling

#' @useDynLib FuseNet
#' @importFrom Rcpp sourceCpp
#' 
NULL

#' Data Normalization
#'
#' Normalize count or other data.
#'
#' @param counts Data to normalize. An N x M matrix with N rows of features and M columns of data points.
#' @param normalization Normalization method used. Default is cosine.
#' \itemize{
#'   \item cosine, cosine normalization: feature counts for each data point are divided by the L2 norm of them.
#'   \item lognorm, log normalization: feature counts for each data point are divided by the total sum of them. Then the data is multiplied by the scale.factor before taking a log-transformed by log(1+x).
#'   \item none, additional normalization is not performed.
#'   }
#' @param normalize_factor Normalization factor used with lognorm method. Default is 10000.
#' @param zero_percent Zero-entry percentage threshold. If the number of zeros in the returned matrices is above this number, a sparse matrix will be returned. Default is 0.7 aka 70\%.
#' @param verbose Whether to display a process bar. Default is FALSE.
#' @return Returns the normalized data.
#' @importFrom Matrix nnzero Matrix
#' @export
#' @concept utils
#' @examples {
#' data.use <- matrix(runif(100), 10)
#' normalized.data <- Normalization(data.use)
#' }
#' 
Normalization <- function(counts, normalization = c("cosine", "lognorm", "none"), normalize_factor = 1e4, zero_percent = 0.7, verbose = FALSE){
  if(nnzero(counts)/length(counts) < (1-zero_percent)){
    counts <- Matrix(data = counts, sparse = TRUE)
    normalization <- match.arg(arg = normalization)
    return(switch(normalization,
                  lognorm = LogNormSparse(data = counts, scale_factor = normalize_factor, display_progress = verbose),
                  cosine = CosineNormSparse(data = counts, display_progress = verbose),
                  none = counts))
  } else {
    counts <- as.matrix(x = counts)
    normalization <- match.arg(arg = normalization)
    return(switch(normalization,
                  lognorm = LogNorm(data = counts, scale_factor = normalize_factor, display_progress = verbose),
                  cosine = CosineNorm(data = counts, display_progress = verbose),
                  none = counts))
  }
}

#' Data Scaling
#'
#' Scale and center data.
#'
#' @param matrix Matrix to use.
#' @param verbose Whether to display a process bar. Default is FALSE.
#' @return Returns a scaled and centered data.
#' @export
#' @concept utils
#' @examples {
#' data.use <- matrix(runif(100), 10)
#' scaled.data <- Scaling(data.use)
#' }
#' 
Scaling <- function(matrix, verbose = FALSE){
  if(is(matrix,"matrix")){
    scale.data <- FastRowScale(mat = matrix, display_progress = verbose)
  } else if (is(matrix,"dgCMatrix")){
    scale.data <- FastSparseRowScale(mat = matrix, display_progress = verbose)
  }
  dimnames(scale.data) <- dimnames(matrix)
  return(scale.data)
}

#' Principal Components Analysis
#'
#' Perform principal components analysis on scaled and centered data (z-scores).
#'
#' @param scaled_data Scaled and centered data (z-scores).
#' @param n_dims Number of dimensions to return. Default is 10.
#' @param seed Random seed number. Default is 1.
#' @return Returns a list contains outputs from \code{\link[irlba]{irlba}}.
#' @importFrom irlba irlba
#' @keywords internal
#' 
PCA <- function(scaled_data, n_dims = 10, seed = 1){
  set.seed(seed = seed)
  pca.res <- irlba(A = scaled_data, nv = n_dims)
  return(pca.res)
}

#' L1-like Norm
#'
#' Compute L1-like norm defined by the absolute values of the differences between each entry of two matrices with the same dimensions.
#'
#' @param mat1,mat2 Two matrices.
#' @return Returns a matrix with L1-like norms.
#' @keywords internal
#' 
L1Norm <- function(mat1, mat2){
  return(abs(x = (mat1 - mat2)))
}

#' 
#' L2-like Norm
#'
#' Compute L2-like norm defined by the square values of the differences between each entrie of two matrices with the same dimensions.
#'
#' @param mat1,mat2 Two matrices.
#' @return Returns a matrix with L2-like norms.
#' @keywords internal
#' 
L2Norm <- function(mat1, mat2){
  return((mat1 - mat2)^2)
}

#' Euclidean Distance
#'
#' Compute euclidean nearest neighbor distances.
#'
#' @param data An M x d matrix with M rows of data points and d columns of features.
#' @param ka Number of nearest neighbors. See details from \code{\link[RANN]{nn2}}.
#' @return Returns the distance matrix.
#' @importFrom RANN nn2
#' @export
#' @concept utils
#' @examples {
#' dist.mat <- EuclideanDist(iris[,1:2], ka = 10)
#' }
#' 
EuclideanDist <- function(data, ka){
  knn <- nn2(data = data, k = ka)
  n <- nrow(x = knn[[1]])
  dist.mat <- matrix(data = 0, nrow = n, ncol = n)
  for(i in 1:nrow(x = knn[[1]])){
    dist.mat[i, knn[[1]][i, ]] <- knn[[2]][i, ]
  }
  return(dist.mat)
}

#' Gaussian Distance
#'
#' Compute gaussian nearest neighbor distances.
#'
#' @param data An M x d matrix or data.frame with M rows of data points and d columns of features.
#' @param ka Number of nearest neighbors. See details from \code{\link[RANN]{nn2}}.
#' @return Returns the distance matrix.
#' @importFrom RANN nn2
#' @export
#' @concept utils
#' @examples {
#' dist.mat <- GaussianDist(iris[,1:2], ka = 10)
#' }
#' 
GaussianDist <- function(data, ka){
  knn <- nn2(data = data, k = ka * 3)
  n <- nrow(x = knn[[1]])
  dist.mat <- matrix(data = 0, nrow = n, ncol = n)
  for(i in 1:nrow(x = knn[[1]])) {
    dist.mat[i, knn[[1]][i, ]] <- exp(x = (-1 * (knn[[2]][i, ] / knn[[2]][i, ka])^2))
  }
  dist.mat <- dist.mat + t(x = dist.mat)
  dist.mat <- sweep(x = dist.mat, MARGIN = 1, STATS = rowSums(x = dist.mat), FUN = "/")
  dist.mat[is.na(x = dist.mat)] <- 0
  return(dist.mat)
}

#' Run Geometric Sketching
#'
#' Run Geometric sketching sampling.
#' See Hie et al. (\doi{10.1016/j.cels.2019.05.003}) for details.
#' This function relies on reticulate R package to import geosketch python library. Only one python environment is allowed by reticulate configuration.
#' Please be careful when importing other R packages that will invoke and set up reticulate python configuration, i.e library(Seurat), before calling GeomSketch function.
#' @param data An M x d matrix or data.frame with M rows of data points and d columns of features.
#' @param geom_size Size of geometric sketches to return.
#' @param is_pca Whether the data columns are principal components.
#' @param n_pca Number of PCA dimensions to use. Default is 10.
#' @param seed Random seed number. Default is 1.
#' @param which_python Path to python3 used.
#' @import reticulate
#' @importFrom irlba irlba
#' @return Returns geometric sketch ID.
#' @keywords internal
#'
RunGeomSketch <- function(data, geom_size, is_pca = FALSE, n_pca = 10, seed = 1, which_python = Sys.which(names = "python3")){
  use_python(python = which_python, required = TRUE)
  if(!py_module_available("geosketch")){
    stop("Cannot find geosketch, please install through pip (e.g. pip install geosketch).")
  }
  geosketch <- import(module = 'geosketch')
  if(is_pca){
    sketch.ind <- unlist(x = geosketch$gs(data$u, as.integer(x = geom_size))) + 1
  } else {
    set.seed(seed = seed)
    pca.res <- irlba(A = data, nv = n_pca)
    sketch.ind <- unlist(x = geosketch$gs(pca.res$u, as.integer(x = geom_size))) + 1
  }
  return(sketch.ind)
}

#' Bootstrapping of Distance Matrix
#'
#' Bootstrapping  of Distance Matrix
#'
#' @param data An M x d matrix or data.frame with M rows of data points and d columns of features.
#' @param dist_mat_null An M x M distance matrix calculated from the original data (null).
#' @param k Number of nearest neighbors. Default is 10. See details from \code{\link[RANN]{nn2}}.
#' @param kernel Kernel distance used:
#' \itemize{
#'   \item gaussian, gaussian distance kernel. See details \code{\link[FuseNet]{EuclideanDist}}.
#'   \item euclidean, euclidean distance kernel. See details \code{\link[FuseNet]{GaussianDist}}.
#'   }
#' @param normalization Normalization method used:
#' #' \itemize{
#'   \item cosine, cosine normalization. See details \code{\link[FuseNet]{Normalization}}.
#'   \item lognorm, log normalization. See details \code{\link[FuseNet]{Normalization}}.
#'   \item none, normalization is not performed.
#'   }
#' @param normalize_factor Normalize factor used in log normalization. Default is 10000. See details \code{\link[FuseNet]{Normalization}}.
#' @param pca_dims Number of dimensions used. Default is 0 and PCA is not performed.
#' @param norm_type Type of norm used:
#' \itemize{
#'   \item l1, L1-like norm. See details \code{\link[FuseNet]{L1Norm}}.
#'   \item l2, L1-like norm. See details \code{\link[FuseNet]{L2Norm}}.
#'   }
#' @param n_iters Number of bootstrapping iterations. Default is 100.
#' @param ratio Fraction of features to be downsampled in the original data matrix. Default is 0.05 aka 5\%.
#' @param t Matrix power used for the distance matrix. Default is 0 and powering is not performed. See \code{\link[FuseNet]{MatrixPower}} for details.
#' @param calc_perturb_mat Whether to calculate the perturb matrix. Default is FALSE.
#' @param n_cores Number of cores used. Default is to use all existing cores. See details \code{\link[parallel]{makeCluster}}.
#' @param zero_percent Zero-entry percentage threshold. If the number of zeros in the returned matrices is above this number, a sparse matrix will be returned. Default is 0.7 aka 70\%.
#' @param ... Additional parameters pass to \code{\link[parallel]{makeCluster}}.
#' @importFrom parallel detectCores makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom Matrix nnzero Matrix
#' @import foreach
#' @return Returns a list with entries:
#' \itemize{
#'   \item feature_weight, n x d binary matrix with n rows of bootstrap iterations and d columns of features where 0 means feature not sampled and 1 means sampled.
#'   \item sample_weight, n x M matrix with n rows of bootstrap iterations and M columns of data points where each entry represents weight.
#'   \item perturb_mat, d x M matrix with d rows of features and M columns of data points where each entry represents the relative importance of a feature to a data point.
#'   \item dist_mat, M x M distance matrix.
#'   }
#' @keywords internal
#' 
Bootstrap <- function(data, dist_mat_null, k = 10, kernel = c("gaussian", "euclidean"), normalization = c("cosine", "lognorm", "none"), normalize_factor = 1e4, pca_dims = 0, norm_type = c("l1", "l2"), 
                      n_iters = 100, ratio = 0.05, t = 0, calc_perturb_mat = FALSE, n_cores = NULL, zero_percent = 0.7, ...){
  kernel <- match.arg(arg = kernel)
  dist.fxn <- switch(kernel,
                     gaussian = GaussianDist,
                     euclidean = EuclideanDist)
  norm_type <- match.arg(arg = norm_type)
  norm.fxn <- switch(norm_type,
                     l1 = L1Norm,
                     l2 = L2Norm)
  n_cell <- nrow(x = data)
  n_feature <- ncol(x = data)
  n_sample <- n_feature * ratio
  dist_mat_null <- as.matrix(dist_mat_null)
  
  if(is.null(x = n_cores)) n_cores <- detectCores() - 1
  cl <- makeCluster(spec = n_cores, setup_strategy = "sequential") # R4.0.0 Compatibility
  registerDoParallel(cl = cl)
  if(pca_dims > 0){
    results <- foreach(i = 1:n_iters, .multicombine = TRUE, .packages = "FuseNet") %dopar% {
      set.seed(seed = i)
      sample.idx <- sample(x = 1:n_feature, size = n_sample)
      data.temp <- PCA(scaled_data = data[, sample.idx], n_dims = pca_dims)
      dist.mat.temp <- dist.fxn(data = data.temp$u, ka = k)
      dist.mat.temp <- MatrixPower(A = as.matrix(x = dist.mat.temp), n = t)
      norm.temp <- norm.fxn(mat1 = dist.mat.temp, mat2 = dist_mat_null)
      list(sample.idx, colSums(x = norm.temp), norm.temp)
    }
  } else {
    results <- foreach(i = 1:n_iters, .multicombine = TRUE, .packages = "FuseNet") %dopar% {
      set.seed(seed = i)
      sample.idx <- sample(x = 1:n_feature, size = n_sample)
      dist.mat.temp <- dist.fxn(data = data[, sample.idx], ka = k)
      dist.mat.temp <- MatrixPower(A = as.matrix(x = dist.mat.temp), n = t)
      norm.temp <- norm.fxn(mat1 = dist.mat.temp, mat2 = dist_mat_null)
      list(sample.idx, colSums(x = norm.temp), norm.temp)
    }
  }
  stopCluster(cl = cl)
  gc()

  message("Finalize")
  feature.weight.mat <- lapply(X = results, FUN = "[[", ... = 1L)
  sample.weight <- lapply(X = results, FUN = "[[", ... = 2L)
  dist.mat <- lapply(X = results, FUN = "[[", ... = 3L)
  
  feature.weight.mat <- do.call(what = "rbind", args = feature.weight.mat)
  sample.weight <- do.call(what = "rbind", args = sample.weight)
  dist.mat <- Reduce(f = "+", x = dist.mat)

  feature.weight <- matrix(data = 0, nrow = n_iters, ncol = n_feature)
  for(i in 1:n_iters) {
    feature.weight[i, feature.weight.mat[i, ]] <- 1
  }

  if(nnzero(x = feature.weight)/length(x = feature.weight) < (1-zero_percent)){
    feature.weight <- Matrix(data = feature.weight, sparse = TRUE)
  } else {
    feature.weight <- as.matrix(x = feature.weight)
  }
  if(nnzero(x = sample.weight)/length(x = sample.weight) < (1-zero_percent)){
    sample.weight <- Matrix(data = sample.weight, sparse = TRUE)
  } else {
    sample.weight <- as.matrix(x = sample.weight)
  }
  if(nnzero(x = dist.mat)/length(x = dist.mat) < (1-zero_percent)){
    dist.mat <- Matrix(data = dist.mat, sparse = TRUE)
  } else {
    dist.mat <- as.matrix(x = dist.mat)
  }
  dimnames(x = feature.weight) <- list(paste("bootstrap", 1:n_iters, sep = "_"), colnames(x = data))
  dimnames(x = sample.weight) <- list(paste("bootstrap", 1:n_iters, sep = "_"), rownames(x = data))
  dimnames(x = dist.mat) <- dimnames(x = dist_mat_null)
  if(calc_perturb_mat){
    perturb.mat <- t(x = feature.weight) %*% sample.weight
  } else {
    perturb.mat <- matrix(nrow = 0,ncol = 0)
  }
  dist.mat <- (dist.mat + t(x = dist.mat)) / n_iters  
  return(list(feature_weight = feature.weight, sample_weight = sample.weight, perturb_mat = perturb.mat, dist_mat = dist.mat))
}

#' Project Geometric Sketches
#'
#' Project weights learned from geometric sketches onto other data points based on nearest neighbors.
#'
#' @param data An M x d matrix or data.frame with M rows of data points and d columns of features.
#' @param data_query An N x d matrix or data.frame with N rows of queried data points and d columns of features.
#' @param weights_data A vector of weights for each data point with total length of M.
#' @param k Number of nearest neighbors. See details from \code{\link[RANN]{nn2}}.
#' @importFrom RANN nn2
#' @return Returns the weight matrix of the queried data.
#' @keywords internal
#' 
ProjectSketch <- function(data, data_query, weights_data, k){
  knn.obj <- nn2(data = data, query = data_query, k = k)
  gauss <- function(x, k) exp(x = (-1 * (x / x[k])^2))
  k.gauss <- ceiling(x = k / 3)
  knn.obj[[2]] <- t(x = apply(X = knn.obj[[2]], MARGIN = 1, FUN = function(x) gauss(x, k.gauss) / sum(gauss(x, k.gauss))))
  n.query <- nrow(x = data_query)
  n.data <- nrow(x = data)
  dist.mat <- matrix(data = 0, nrow = n.query, ncol = n.data)
  for(i in 1:n.query){
    dist.mat[i, knn.obj[[1]][i, ]] <- knn.obj[[2]][i, ]
  }
  weights.query <- dist.mat %*% weights_data
  return(weights.query)
}

#' Matrix Big Power
#'
#' Compute big powers (>10) of matrix.
#'
#' @param A A matrix.
#' @param n Number of powers.
#' @return Returns the powered matrix.
#' @keywords internal
#' 
MatrixBigPower <- function(A, n){
  if(n > 0){
    e <- eigen(x = A)
    M <- e$vectors
    d <- e$values
    return(M %*% diag(x = d^n) %*% solve(a = M))
  } else {
    return(A)
  }
}

#' Matrix Power
#'
#' Compute powers of matrix.
#'
#' @param A A matrix.
#' @param n Number of powers.
#' @return Returns the powered matrix.
#' @importFrom expm expm %^%
#' @keywords internal
#' 
MatrixPower <- function(A, n){
  if(n > 0){
    return(A %^% n)
  } else {
    return(A)
  }
}

#' Merge Two List
#'
#' Merge twp lists by the name of their elements.
#'
#' @param a,b Two lists.
#' @return Returns the merged list.
#' @keywords internal
#' 
MergeLists <- function(a, b) {
  a.names <- names(a)
  b.names <- names(b)
  m.names <- unique(c(a.names, b.names))
  sapply(m.names, function(i) {
    if (i %in% b.names) b[[i]]
    else a[[i]]
  }, simplify = FALSE)
}
stevexniu/FuseNet documentation built on May 16, 2022, 12:23 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
stevexniu/FuseNet
Non-Parametric Network Fusion

R/utils.R
In stevexniu/FuseNet: Non-Parametric Network Fusion

Defines functions MergeLists MatrixPower MatrixBigPower ProjectSketch Bootstrap RunGeomSketch GaussianDist EuclideanDist L2Norm L1Norm PCA Scaling Normalization

Documented in Bootstrap EuclideanDist GaussianDist L1Norm L2Norm MatrixBigPower MatrixPower MergeLists Normalization PCA ProjectSketch RunGeomSketch Scaling

R Package Documentation

Browse R Packages

We want your feedback!

stevexniu/FuseNet Non-Parametric Network Fusion

R/utils.R In stevexniu/FuseNet: Non-Parametric Network Fusion

Defines functions MergeLists MatrixPower MatrixBigPower ProjectSketch Bootstrap RunGeomSketch GaussianDist EuclideanDist L2Norm L1Norm PCA Scaling Normalization

Documented in Bootstrap EuclideanDist GaussianDist L1Norm L2Norm MatrixBigPower MatrixPower MergeLists Normalization PCA ProjectSketch RunGeomSketch Scaling

R Package Documentation

Browse R Packages

We want your feedback!

stevexniu/FuseNet
Non-Parametric Network Fusion

R/utils.R
In stevexniu/FuseNet: Non-Parametric Network Fusion