R/MainFunctions.R
In RCSL: Rank Constrained Similarity Learning for Single Cell RNA Sequencing Data

Documented in BDSM EProjSimplexdiag EstClusters EucDist GenesFilter NeigRepresent SimS

#'
#' Perform the step of gene filtering to normalizaed gene expression data
#'
#' @param data the normalized gene expression matrix 
#’        (each column represents a cell)
#' @param gfRatio the ratio of genes filtering
#'
#' @importFrom stats var
#'
#' @return the gene expression matrix after genes filtering gfData
#'
#' @export
#'
#' @examples
#' GenesFilter(yan[1:100,1:15])
#'
GenesFilter <- function(data, gfRatio = 0.025){

  N <- ncol(data)
  G <- nrow(data)
  ExpressNumMin <- gfRatio * N
  ExpressNumMax <- (1 - gfRatio) * N
  GenesExpress0 <- sapply(seq_len(G), function(n) length(which(data[n,] != 0)))
  gfData <- data[which(GenesExpress0 > ExpressNumMin),,drop = FALSE]
  GenesExpress <- GenesExpress0[which(GenesExpress0 > ExpressNumMin)]
  
  if(max(GenesExpress) == N){
   GenesPre <- gfData[which(GenesExpress > ExpressNumMax),,drop = FALSE]
   GenesVar <- as.matrix(apply(GenesPre, 1, var))
   if(any(GenesVar < 0.9 * mean(GenesVar))){
   gfData <- gfData[-which(GenesVar < 0.9 * mean(GenesVar)),,drop = FALSE]
   }
   
  }
 return(gfData)
}


#'
#' Calculate the initial similarity matrix
#'
#' @param data gene expression matrix after genes filtering 
#‘             (each column represents a cell)
#' @param pcRatio the ratio between the variance of the 
#’                 choosed PCs and the total variance
#' @param gamma the ratio of the global simialrity
#' @param NN.method the method of finding neighbors
#' @param Dis.method the distance metric in finding neighbors
#' @param LSH.TreeNum the tree number of LSH
#' @param LSH.Dim the dimension in LSH
#' @param LSH.Dis the distance metric in LSH                      
#' @param neiRatio ratio of the number of selected 
#‘                 neighbors to the total number of cells 
#'
#' @importFrom stats cor prcomp
#'                         
#' @return initial similarity matrix S
#' @return gene expression matrix after PCA processing drData
#'
#' @export
#'
#' @examples
#' gfData <- GenesFilter(yan[1:100,1:15])
#' res_Sims <- SimS(gfData)
#' res_Sims$S
#' res_Sims$res_pca
#'
SimS <- function(data, pcRatio = 0.95, gamma = 0.8, NN.method = "KNN", 
                   Dis.method = "Euclidean", LSH.TreeNum = 30, LSH.Dim = 1000, 
                   LSH.Dis = "angular", neiRatio = 0.65){

  res_PCA <- stats::prcomp(t(data), center = TRUE, scale. = TRUE)
  
  if(ncol(data) > 1300){
    EigenValues <- res_PCA$sdev
    V <- sapply(seq_len(length(EigenValues)), 
                function(n) sum(EigenValues[seq_len(n)])/sum(EigenValues))
    n_dim <- min(which(V >= pcRatio))
    res_pca <- t(res_PCA$x[, seq_len(n_dim), drop = FALSE])
    if(n_dim>4000){
    res_pca <- t(res_PCA$x[, seq_len(4000), drop = FALSE])
    }
  }else{
    res_pca <- t(res_PCA$x) 
  }
  
  if(gamma!=1){      
       message('Calculate the Spearman correlation','\n')
       SpearA <- stats::cor(data, method = "spearman")
       message('Calculate the Nerighbor Representation','\n')
       NR <- NeigRepresent(res_pca,NN.method = "KNN", Dis.method = "Euclidean",
                        LSH.TreeNum = LSH.TreeNum, LSH.Dim = LSH.Dim, LSH.Dis = LSH.Dis, 
                        neiRatio = neiRatio)
       S <-  gamma * SpearA + (1-gamma) * NR
  } else{
    message('Calculate the Spearman correlation','\n')
    S <- stats::cor(data, method = "spearman")
  }

  res = list("S" = S, "drData" = res_pca)
  return(res)
}

#'
#' Calculate the neighbor representation of cells to the low-dimensional 
#' gene expression matrix
#' 
#' @import RcppAnnoy
#'
#' @param drData gene expression matrix after dimensionality reduced by PCA
#' @param NN.method the method of finding neighbors
#' @param Dis.method the distance metric in finding neighbors
#' @param LSH.TreeNum the tree number of LSH
#' @param LSH.Dim the dimension in LSH
#' @param LSH.Dis the distance metric in LSH                      
#' @param neiRatio ratio of the number of selected 
#‘                 neighbors to the total number of cells    
#'
#' @importFrom methods new
#' @importFrom RcppAnnoy AnnoyAngular AnnoyHamming AnnoyEuclidean
#' @importFrom pracma pinv
#'
#' @return the similarity matrix measured by neighbor representation NR
#'
#' @export
#'
#' @examples 
#' gfData <- GenesFilter(yan[1:100,1:15])
#' res_SimS <- SimS(gfData)
#' NeigRepresent(res_SimS$drData)
#'
NeigRepresent <- function(drData, NN.method = "KNN", Dis.method = "Euclidean", 
                            LSH.TreeNum = 30, LSH.Dim = 500, LSH.Dis = "angular", 
                            neiRatio = 0.65){

k <- ceiling(ncol(drData) * neiRatio)

if(NN.method == "KNN"){
  if(Dis.method == "Euclidean"){
      message("Find neighbors by KNN(Euclidean)","\n")
      #distMatrix <- as.matrix(stats::dist(t(drData), diag = T, upper = T))
      distMatrix <- EucDist(drData,drData)
  } 
  if(Dis.method == "Cosine"){
      message("Find neighbors by KNN(Cosine)","\n")
      Cosine_sim <- t(drData) / sqrt(rowSums(t(drData) * t(drData)))
      Cosine_sim <- Cosine_sim %*% t(Cosine_sim)
      distMatrix <- as.matrix(1 - Cosine_sim)
  }
  neigData <- sapply(seq_len(nrow(distMatrix)), function(n){
                     curRow <- sort(distMatrix[n,,drop = TRUE])
                     rownames(as.matrix(which(rank(curRow, ties.method = "first") <= k)))})
}

if(NN.method == "LSH"){
  if(nrow(drData)>LSH.Dim){
    drData <- drData[seq_len(LSH.Dim),,drop = FALSE]
  }
  f = ncol(drData)
  if(LSH.Dis == "angular"){  
      message("Find neighbors by LSH(Cosine)","\n")
      Tree = methods::new(RcppAnnoy::AnnoyAngular,f)  
  }
  if(LSH.Dis == "hamming"){ 
      message("Find neighbors by LSH(Hamming)","\n") 
      Tree = methods::new(RcppAnnoy::AnnoyHamming,f)  
  }
  if(LSH.Dis == "Euclidean"){ 
      message("Find neighbors by LSH(Euclidean)","\n") 
      Tree = methods::new(RcppAnnoy::AnnoyEuclidean,f)  
  }
  for(i in seq(ncol(drData))){
      v = as.vector(drData[,i])
      Tree$addItem(i, v)
  }
  Tree$build(LSH.TreeNum)# 30 trees
  INDEX = mat.or.vec(ncol(drData),k+1)
  for(i in seq(ncol(drData))){
      v = drData[,i]
      INDEX[i,] = Tree$getNNsByVector(v, k+1)
  }
  neigData <- t(INDEX)[-1,]
}

neigData <- neigData[-1,] 
colnames(neigData) <- colnames(drData)

## Solve for reconstruction weights
## Create matrix Z consisting of all neighbors of each cell
wgtsNR <- matrix(0, ncol(drData), ncol(drData))
rownames(wgtsNR) <- colnames(drData)
colnames(wgtsNR) <- colnames(drData)
k <- dim(neigData)[1]

for(i in seq_len(ncol(neigData))){
  
  # find neighbors of each cell
  neigZ <- drData[, neigData[,i]]
  
  # Calculate the differences between the cell and its neighbors
  diffZ <- neigZ - matrix(drData[,i,drop = FALSE],
                        nrow = nrow(drData),ncol = nrow(neigData),byrow = FALSE)
  localCova <- t(diffZ) %*% diffZ
  
  ## Solve linear system
  if(qr(localCova)$rank < nrow(localCova)){
    wgtsNRi <- pracma::pinv(localCova) %*% matrix(1,nrow=k,ncol=1)
  }else {
    wgtsNRi <- base::solve(localCova, matrix(1,nrow=k,ncol=1))
  }
  
  wgtsNRi <- t(wgtsNRi)
  wgtsNRiRegu <- wgtsNRi / sum(wgtsNRi)
  wgtsNR[i, rownames(localCova)] <- wgtsNRiRegu
 }
 NR <- wgtsNR
 
 return(NR)
}


#'
#' Estimate the optimal number of clusters C for clustering
#' 
#' @param drData gene expression matrix after PCA processing
#' @param S the calculated similarity matrix S from "SimS"
#'
#'
#' @importFrom stats as.dist
#' @importFrom NbClust NbClust
#'
#' @return C the estimated number of clusters
#'
#' @export
#'
#' @examples 
#' gfData <- GenesFilter(yan[1:100,1:15])
#' res_SimS <- SimS(gfData)
#' EstClusters(res_SimS$drData,res_SimS$S)
#'
EstClusters <- function(drData, S){ 

  Diss <- stats::as.dist(1 - S)
  
  if(nrow(S) > 3000){
      
      #set.seed(1)
      C <- NbClust::NbClust(t(drData), diss = Diss, distance = NULL,
                            min.nc = 4, max.nc = 12,
                            method = "average", index = "kl")$Best.nc[1]
  }else{ 
      Q0 <- - 10e11
      #set.seed(1)
      res <- NbClust::NbClust(t(drData), diss = Diss, distance = NULL,
                              min.nc = 4, max.nc = 12,
                              method = "average", index = "kl")$All.index
      res <- sort(res)
      l <- length(res)
      K_range <- as.numeric(attr(res, "names")[(l - 2):l])
      for(k in K_range){
         resBDSM <- BDSM(S, k)
         a <- resBDSM$y
         one <- matrix(c(seq_len(nrow(S))), byrow = TRUE)
         a <- as.matrix(cbind(one, a))
         b <- matrix(0,nrow = max(a),ncol = max(a))
         for(i in seq_len(nrow(a))){
             b[a[i,1], a[i,2]] <- 1
         }
         b <- b[, -(which(colSums(b) == 0)),drop = FALSE]
         m <- sum(S) / 2
         c <-  as.matrix(rowSums(S))
         B <- S - (kronecker(matrix(1,1,nrow(c)),c) * kronecker(matrix(1,nrow(c),1),t(c)))/(2 * m)
         Q1 <- 1/(2 * m) * sum(diag(t(b) %*% B %*% b))
         if(Q1 > Q0){
             Q0 <- Q1
             C <- k
           }
       }
     }
  return(C)
}

#' 
#' Solve the problem: min 1/2*x'*L*x-x'*d s.t. x>=0, 1'x=1
#' 
#' @param d matrix or vector
#' @param l matrix or vector
#' 
#' @return x 
EProjSimplexdiag <- function(d,l){
  lambda <- min(l - d)
  f <- 1
  count <- 1
  
  while(abs(f) > 10^-8){
    v1 <-  1 / l * lambda + d/l
    g <-  sum(1 / l[v1 > 0])
    f <-  sum(v1[v1 > 0]) - 1
    lambda <- lambda - f / g
    if(count > 1000){
    break}
    count <- count + 1
  }
  
  v1 = 1/l*lambda + d/l
  x <- pmax(v1,0)
  
  return(x)
}

#' 
#' Solve the problem: ||A-B||^2 = ||A||^2 + ||B||^2 - 2*A'*B
#' 
#' @param A matrix or vector
#' @param B matrix or vector
#' 
#' @return d matrix or vector
EucDist <- function(A, B){
  if (dim(A)[1] == 1){
    A <- rbind(A, matrix(0,1,dim(A)[2]))
    B <- rbind(B, matrix(0,1,dim(B)[2]))
  }
  
  aa <- t(as.matrix(colSums(A * A)))
  bb <- t(as.matrix(colSums(B * B)))
  ab <- t(A) %*% B
  
  d <- kronecker(matrix(1,1,dim(bb)[2]),t(aa)) + kronecker(matrix(1,dim(aa)[2],1), bb) - 2 * ab
  d <-  Re(d)
  d <-  pmax(d,0)
  
  return(d)
}

#' 
#' Calculate the bolock-diagnal matrix B
#' min_{B>=0, B*1=1, F'*F=I}  ||B - A||_1 + r*||B||^2 + 2*lambda*trace(F'*L*F)
#' 
#' @import igraph
#'
#' @param S the calculated initial similarity matrix S
#' @param C the estimated number of clusters C
#'
#' @importFrom igraph graph_from_adjacency_matrix graph_from_adjacency_matrix components
#'
#' @return B block-diagonal matrix
#' @return y clustering results
#'
#' @export
#'
#' @examples 
#' gfData <- GenesFilter(yan[1:100,1:15])
#' res_SimS <- SimS(gfData)
#' C <- EstClusters(res_SimS$drData,res_SimS$S)
#' resBDSM <- BDSM(res_SimS$S,C)
#' resBDSM$y
#' resBDSM$B
BDSM <- function(S, C){
  NITER <- 30
  zr  <-  10e-11
  lambda  <-  0.1
  r  <-  0
  S  <-  S - diag(diag(S))
  num <- nrow(S)
  A10 <-  (S + t(S)) / 2
  L0  <-  diag(colSums(A10)) - A10
  L0 <- pmax(L0, t(L0))
  tmp0 <- eigen(L0) 
  w0 <- tmp0$values
  V0 <- tmp0$vectors
  wsort0 <- sort(w0, index.return = TRUE)
  di0 <- wsort0$x
  idx0 <- wsort0$ix
  idx1 <- idx0[seq_len(num)]
  F  <-  V0[, idx1][, seq_len(C)]
 
  if(sum(w0[seq_len((C + 1))]) < zr){
     stop('The original graph has more than %d connected component', C)
   }
  
  if(sum(w0[seq_len(C)]) < zr){
     g0 <- igraph::graph_from_adjacency_matrix(A10)
     y <- as.matrix(components(g0, mode = "strong")$membership, drop = FALSE)
     B <-  S
   }
    
  u = vector('list', num)  
  
  for(j in seq_len(num)){
     s0 <-  S[j,,drop = FALSE]
     idxa0 <- seq_len(num)
     u[[j]] <-  matrix(1, 1, length(idxa0))
  }
 
  for(iter in seq_len(NITER)){
      dist0 <- EucDist(t(F), t(F))
      B <-  matrix(0, num, num)
      for(i in seq_len(num)){
         idxa0 <-  seq_len(num)
         ai <-  S[i,,drop = FALSE][idxa0]
         di <-  dist0[i,idxa0,drop = F]
         ad <- u[[i]]*ai-lambda*di/2
         si <- EProjSimplexdiag(ad, u[[i]] + r * rep(1,length(idxa0)))
         u[[i]] <-  1 / (2 * sqrt((si - ai)^2 + .Machine$double.eps))
         B[i,idxa0] <-  si 
        }
      A <-  B
      A <-  (A + t(A)) / 2
      L <-  diag(colSums(A)) - A
      F_old <-  F
      L <-  pmax(L, t(L))
      w <-  eigen(L)$values
      V <-  eigen(L)$vectors
      wsort <-  sort(w,index.return = TRUE)
      idx2 <- wsort$ix[seq_len(C)]
      F <-  V[, idx2]
      ev <-  w[wsort$ix]
      fn1 <- sum(ev[seq_len(C)])
      fn2 <- sum(ev[seq_len(C+1)])
      if(fn1 > zr){ lambda <- 2 * lambda}
        else if(fn2 < zr) {
                   lambda <- lambda / 2 
                   F <- F_old
        }
       else {break}
   }
   message("======== Calculate maximal strongly connected components ========","\n")
   g <- igraph::graph_from_adjacency_matrix(B,mode = "undirected",weighted = TRUE,diag = FALSE)
   G <- igraph::components(g, mode = "strong"); 
   y <- as.matrix(G$membership,drop = FALSE)
   clusternum <- G$no
    
   res <- list("B" = B, "y" = y)
   return(res)
}

#' A public scRNA-seq dataset by Yan et al.
#'
#' @source \doi{10.1101/460147}
#'
#' Columns represent cells, rows represent genes expression values.
#'
"yan"

#' Cell type annotations of `yan` datasets by Yan et al.
#'
#' @source \doi{10.1101/460147}
#'
#' Each row corresponds to one cell of `yan` dataset
#'
"ann"

Any scripts or data that you put into this service are public.

RCSL documentation built on April 19, 2021, 9:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

RCSL
Rank Constrained Similarity Learning for Single Cell RNA Sequencing Data

R/MainFunctions.R
In RCSL: Rank Constrained Similarity Learning for Single Cell RNA Sequencing Data

Defines functions BDSM EucDist EProjSimplexdiag EstClusters NeigRepresent SimS GenesFilter

Documented in BDSM EProjSimplexdiag EstClusters EucDist GenesFilter NeigRepresent SimS

Try the RCSL package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

RCSL Rank Constrained Similarity Learning for Single Cell RNA Sequencing Data

R/MainFunctions.R In RCSL: Rank Constrained Similarity Learning for Single Cell RNA Sequencing Data

Defines functions BDSM EucDist EProjSimplexdiag EstClusters NeigRepresent SimS GenesFilter

Documented in BDSM EProjSimplexdiag EstClusters EucDist GenesFilter NeigRepresent SimS

Try the RCSL package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

RCSL
Rank Constrained Similarity Learning for Single Cell RNA Sequencing Data

R/MainFunctions.R
In RCSL: Rank Constrained Similarity Learning for Single Cell RNA Sequencing Data