scrattch.hicat: Hierarchical Iterative Clustering Analysis for Transcriptomic data

Documented in iter_clust jaccard jaccard_leiden jaccard_leiden.RANN jaccard_louvain jaccard_louvain.RANN onestep_clust reorder_cl

#' Compute jaccard distances between columns of a matrix
#' 
#' @param m A matrix or sparse matrix
#' 
#' @return a sparse matrix of Jaccard distances.
#' 
jaccard <-  function(m) {
  library(Matrix)
  ## common values:                                                                                                                                              
  A <-  Matrix::tcrossprod(m)
  A <- as(A, "dgTMatrix")
  ## counts for each row                                                                                                                                         
  b <- Matrix::rowSums(m)
  ## Jacard formula: #common / (#i + #j - #common)                                                                                                               
  x = A@x / (b[A@i+1] + b[A@j+1] - A@x)
  A@x = x
  return(A)
}  

knn_jaccard <- function(knn)
{
  knn.df = data.frame(i = rep(1:nrow(knn), ncol(knn)), j=as.vector(knn))
  knn.mat = sparseMatrix(i = knn.df[[1]], j=knn.df[[2]], x=1, giveCsparse=FALSE)
  jaccard(knn.mat)
}


pass_louvain <- function(mod.sc, adj.mat)
{
  library(Matrix)
  p <- mean(Matrix::colSums(adj.mat > 0) - 1) / nrow(adj.mat)
  n <- ncol(adj.mat)
  rand.mod1 <- 0.97 * sqrt((1 - p) / (p * n))
  rand.mod2 <- (1 - 2 / sqrt(n)) * (2 / (p * n)) ^ (2 / 3)
  rand.mod.max <- max(rand.mod1, rand.mod2, na.rm = TRUE)
  #cat("Modularity:",mod.sc, "threshold:",rand.mod.max, "\n")
  return(mod.sc > rand.mod.max)
}

#' Perform Jaccard/Louvain clustering based on RANN
#'
#' @param dat A matrix of features (rows) x samples (columns)
#' @param k K nearest neighbors to use
#' 
#' @return A list object with the cluster factor object and (cl) and Jaccard/Louvain results (result)
#' 
jaccard_louvain.RANN <- function(dat, 
                                 k = 10)
{
  library(igraph)
  library(matrixStats)
  library(RANN)  
  
  knn.matrix = RANN::nn2(dat, k = k)[[1]]
  jaccard.adj  <- knn_jaccard(knn.matrix)
  
  
  leiden.result = leiden(jaccard.adj)
  
  jaccard.gr <- igraph::graph.adjacency(jaccard.adj, 
                                        mode = "undirected", 
                                        weighted = TRUE)
  louvain.result <- igraph::cluster_louvain(jaccard.gr)
  mod.sc <- igraph::modularity(louvain.result)
  
  if(pass_louvain(mod.sc, jaccard.adj)) {
    
    cl <- setNames(louvain.result$membership, row.names(dat))
    
    return(list(cl = cl, result = louvain.result))
    
  } else{
    return(NULL)
  }
}

#' Perform Jaccard/Louvain clustering based on RANN
#'
#' @param dat A matrix of features (rows) x samples (columns)
#' @param k K nearest neighbors to use
#' 
#' @return A list object with the cluster factor object and (cl) and Jaccard/Louvain results (result)
#' 
jaccard_leiden.RANN <- function(dat, 
                                k = 10)
{
  library(igraph)
  library(matrixStats)
  library(RANN)  
  library(leiden)
  
  knn.matrix = RANN::nn2(dat, k = k)[[1]]
  
  jaccard.adj  <- knn_jaccard(knn.matrix)
  
  
  leiden.result <- igraph::cluster_louvain(jaccard.gr)
  mod.sc <- igraph::modularity(louvain.result)
  
  if(pass_louvain(mod.sc, jaccard.adj)) {
    
    cl <- setNames(louvain.result$membership, row.names(dat))
    
    return(list(cl = cl, result = louvain.result))
    
  } else{
    return(NULL)
  }
}


#' Perform Jaccard/Louvain clustering using Rphenograph
#' 
#' @param dat A matrix of features (rows) x samples (columns)
#' @param k K-nearest neighbors to use for clustering
#' 
#' @return A list object with the cluster factor object and (cl) and Rphenograph results (result)
#' 
jaccard_louvain <- function(dat, k = 10)
{
  suppressPackageStartupMessages(library(Rphenograph))
  
  rpheno <- Rphenograph(dat, k = k)
  
  cl <- setNames(rpheno[[2]]$membership, row.names(dat)[as.integer(rpheno[[2]]$names)])
  
  return(list(cl = cl, result = rpheno))
}

filter_RD <- function(rd.dat, rm.eigen, rm.th, verbose=FALSE)
{
  rm.cor=cor(rd.dat, rm.eigen[row.names(rd.dat),])
  rm.cor[is.na(rm.cor)]=0
  rm.score = rowMaxs(abs(rm.cor))
  select = colSums(t(abs(rm.cor)) >= rm.th) ==0
  if(sum(!select)>0 & verbose){
    print("Remove dimension:")
    print(rm.score[!select])
  }
  if(sum(select)==0){
    return(NULL)
  }
  rd.dat = rd.dat[,select,drop=F]
}



#' Perform Jaccard/Leiden clustering 
#' 
#'
#' @param dat A matrix of samples (rows) x features (columns)
#' @param k K nearest neighbors to use
#' 
#' @return A list object with the cluster factor object and (cl) and Jaccard/Leiden results (result)
#' @param num_iter If n_iter < 0 the optimiser continues iterating until it encounters an iteration that did not improve the partition.
#' 
#' 
#load("data/Leiden_test/L4_5_6_IT.rd.dat.rda")
#

jaccard_leiden <- function(dat, k = 10, weight = NULL,
                           num_iter = 2,
                           resolution_parameter = 0.0001,
                           random_seed = NULL,
                           verbose = FALSE, ...) {
  
  library(igraph)
  library(matrixStats)
  library(RANN)  
  library(Matrix)
  
  knn.matrix = RANN::nn2(dat, k = k)[[1]]
  
  jaccard.adj  <- knn_jaccard(knn.matrix)
  jaccard.gr <- igraph::graph.adjacency(jaccard.adj, 
                                        mode = "undirected", 
                                        weighted = TRUE)
  
  partition_type <- 'CPMVertexPartition'
  
  #write line to determine optimal resolutoin↔
  
  cluster_result <- leidenbase::leiden_find_partition(jaccard.gr,
                                                      partition_type = partition_type,
                                                      num_iter=10,
                                                      resolution_parameter=0.01,
                                                      verbose = FALSE )
  
  cl <- setNames(cluster_result$membership, row.names(dat))
  
  
  return(list(cl = cl, result = cluster_result))
}





#' One round of clustering in the iteractive clustering pipeline 
#'
#' @param norm.dat normalized expression data matrix in log transform, using genes as rows, and cells and columns. Users can use log2(FPKM+1) or log2(CPM+1).
#' @param select.cells The cells to be clustered. Default: columns of norm.dat
#' @param counts Raw gene counts. Default NULL, inferred from norm.dat.
#' @param method Clustering method. It can be "louvain", "hclust" and "kmeans". Default "louvain"
#' @param vg.padj.th High variance gene adjusted pvalue cut off. Default 0.5.
#' @param dim.method Dimension reduction techniques. Current options include "pca" and "WGCNA". Default "pca"
#' @param max.dim The number of top dimensions retained. Default 20. Since clustering is performed iteratively, not all relevant dimensions need to be captured in one iterations. 
#' @param rm.eigen The reduced dimensions that need to be masked and removed. Default NULL.  
#' @param rm.th The cutoff for correlation between reduced dimensions and rm.eigen. Reduced dimensions with correlatin with any rm.eigen vectors are not used for clustering. Default 0.7
#' @param de.param The differential gene expression threshold. See de_param() function for details. 
#' @param type Can either be "undirectional" or "directional". If "undirectional", the differential gene threshold de.param is applied to combined up-regulated and down-regulated genes, if "directional", then the differential gene threshold is applied to both up-regulated and down-regulated genes. 
#' @param maxGenes Only used when dim.method=="WGCNA". The maximum number of genes to calculate gene modules. 
#' @param sampleSize The number of sampled cells to compute reduced dimensions.
#' @param max.cl.size Sampled cluster size. This is to speed up limma DE gene calculation. Instead of using all cells, we randomly sampled max.cl.size number of cells for testing DE genes.    
#' @param prefix Used to keep track of intermediate results in "verbose" mode. Default NULL.
#' @param verbose Default FALSE
#'
#' @return Clustering result is returned as a list with two elements: 
#'         cl: cluster membership for each cell
#'         markers: top markers that seperate clusters     
#'         
onestep_clust <- function(norm.dat, 
                          select.cells = colnames(norm.dat), 
                          counts = NULL, 
                          method = c("louvain","leiden","ward.D", "kmeans"), 
                          vg.padj.th = 0.5, 
                          dim.method = c("pca","WGCNA"), 
                          max.dim = 20, 
                          rm.eigen = NULL, 
                          rm.th = 0.7, 
                          de.param = de_param(),
                          merge.type = c("undirectional", "directional"), 
                          maxGenes = 3000,
                          sampleSize = 4000,
                          max.cl.size = 300,
                          k.nn = 15,
                          prefix = NULL, 
                          verbose = FALSE, 
                          regress.x=NULL)

{
  library(matrixStats)
  
  method <- match.arg(method)
  dim.method <- match.arg(dim.method)
  merge.type <- match.arg(merge.type)
  
  if(!is.null(regress.x)){
    print("regression")
    tmp =  lm_normalize(as.matrix(norm.dat[,select.cells]), regress.x[select.cells], R_2.th=0.1)
    norm.dat = tmp[[1]]
  }
  if(length(select.cells)>sampleSize){
    sampled.cells = sample(select.cells, pmin(length(select.cells),sampleSize))
  }
  else{
    sampled.cells = select.cells
  }
  ###Find high variance genes
  select.genes = row.names(norm.dat)[which(Matrix::rowSums(norm.dat[,select.cells] > de.param$low.th) >= de.param$min.cells)]
  ###Find high variance genes.
  if(is.null(counts)){
    if(is.matrix(norm.dat)){
      counts = 2^(norm.dat[,sampled.cells])-1
    }
    else{
      counts = norm.dat[,sampled.cells]
      counts@x = 2^(counts@x) - 1
    }
  }
  plot_file=NULL
  if(verbose & !is.null(prefix)){
    plot_file=paste0(prefix,".vg.pdf")
  }
  vg = find_vg(as.matrix(counts[select.genes,sampled.cells]),plot_file=plot_file)
  if(dim.method=="auto"){
    if(length(select.cells)> 1000){
      dim.method="pca"
    }
    else{
      dim.method="WGCNA"
    }
  }
  if(dim.method=="WGCNA"){
    ###Ignore vg.padj.th for WGCNA, choose top "maxGgenes" for analysis
    select.genes = as.character(vg[which(vg$loess.padj < 1),"gene"])
    select.genes = head(select.genes[order(vg[select.genes, "loess.padj"],-vg[select.genes, "z"])],maxGenes)
    rd.dat = rd_WGCNA(norm.dat, select.genes=select.genes, select.cells=select.cells, sampled.cells=sampled.cells, de.param=de.param, max.mod=max.dim, max.cl.size=max.cl.size)$rd.dat
  }
  else{
    ###If most genes are differentially expressed, then use absolute dispersion value
    select.genes = as.character(vg[which(vg$loess.padj < vg.padj.th | vg$dispersion >3),"gene"])
    select.genes = head(select.genes[order(vg[select.genes, "loess.padj"],-vg[select.genes, "z"])],maxGenes)
    if(verbose){
      cat("Num high variance genes:",length(select.genes),"\n")
    }
    if(length(select.genes)< de.param$min.genes){
      return(NULL)
    }
    rd.dat = rd_PCA(norm.dat,select.genes, select.cells, sampled.cells=sampled.cells, max.pca = max.dim)$rd.dat
  }
  if(is.null(rd.dat)||ncol(rd.dat)==0){
    return(NULL)
  }
  if(!is.null(rm.eigen)){
    rd.dat <- filter_RD(rd.dat, rm.eigen, rm.th, verbose=verbose)
  }
  if(is.null(rd.dat)||ncol(rd.dat)==0){
    return(NULL)
  }
  if(verbose){
    print(method)
  }
  max.cl = ncol(rd.dat)*2 + 1
  if(method=="louvain"){
    k = pmin(k.nn, round(nrow(rd.dat)/2))
    tmp = jaccard_louvain(rd.dat, k)
    if(is.null(tmp)){
      return(NULL)
    }
    cl = tmp$cl
    if(length(unique(cl))>max.cl){
      tmp.means =do.call("cbind",tapply(names(cl),cl, function(x){
        colMeans(rd.dat[x,,drop=F])
      },simplify=F))
      tmp.hc = hclust(dist(t(tmp.means)), method="average")
      tmp.cl= cutree(tmp.hc, pmin(max.cl, length(unique(cl))))
      cl = setNames(tmp.cl[as.character(cl)], names(cl))
    }
  }
  
  
  else if(method=="leiden"){
    k = pmin(k.nn, round(nrow(rd.dat)/2))
    tmp = jaccard_leiden(rd.dat, k)
    if(is.null(tmp)){
      return(NULL)
    }
    cl = tmp$cl
    if(length(unique(cl))>max.cl){
      tmp.means =do.call("cbind",tapply(names(cl),cl, function(x){
        colMeans(rd.dat[x,,drop=F])
      },simplify=F))
      tmp.hc = hclust(dist(t(tmp.means)), method="average")
      tmp.cl= cutree(tmp.hc, pmin(max.cl, length(unique(cl))))
      cl = setNames(tmp.cl[as.character(cl)], names(cl))
    }
  }  
  
  else if(method=="ward.D"){
    hc = hclust(dist(rd.dat),method="ward.D")
    #print("Cluster cells")
    cl = cutree(hc, max.cl)
  }
  else if(method=="kmeans"){
    cl = kmeans(rd.dat, max.cl)$cluster
  }
  else{
    stop(paste("Unknown clustering method", method))
  }
  #print(table(cl))
  rd.dat.t = t(rd.dat)
  merge.result=merge_cl(norm.dat, cl=cl, rd.dat.t=rd.dat.t, merge.type=merge.type, de.param=de.param, max.cl.size=max.cl.size,verbose=verbose)
  gc()
  if(is.null(merge.result))return(NULL)
  sc = merge.result$sc
  #print(sc)
  cl = merge.result$cl
  if(length(unique(cl))>1){
    if(verbose){
      cat("Expand",prefix, "\n")
      cl.size=table(cl)
      print(cl.size)
      save(cl, file=paste0(prefix, ".cl.rda"))
    }
    de.genes = merge.result$de.genes
    markers= merge.result$markers
    cl.dat = get_cl_means(norm.dat[markers,], cl[sample_cells(cl, max.cl.size)])
    cl.hc = hclust(dist(t(cl.dat)),method="average")
    cl = setNames(factor(as.character(cl), levels= colnames(cl.dat)[cl.hc$order]), names(cl))
    if(verbose & !is.null(prefix)){
      tmp=display_cl(cl, norm.dat, prefix=prefix, markers=markers, max.cl.size=max.cl.size)
    }
    levels(cl) = 1:length(levels(cl))
    result=list(cl=cl, markers=markers)
    return(result)
  }
  return(NULL)
}


#' Iterative clustering algorithm for single cell RNAseq dataset
#'
#' @param norm.dat normalized expression data matrix in log transform, using genes as rows, and cells and columns. Users can use log2(FPKM+1) or log2(CPM+1)
#' @param select.cells The cells to be clustered
#' @param prefix The character string to indicate current iteration.
#' @param split.size The minimal cluster size for further splitting
#' @param result The current clustering result as basis for further splitting.
#' @param method Clustering method. It can be "auto", "louvain", "hclust"
#' @param ... Other parameters passed to method `onestep_clust()`
#'
#' @return Clustering result is returned as a list with two elements: 
#'         cl: cluster membership for each cell
#'         markers: top markers that seperate clusters     
#'         
iter_clust <- function(norm.dat, 
                       select.cells = colnames(norm.dat),
                       prefix = NULL, 
                       split.size = 10, 
                       result = NULL,
                       method = "auto",
                       ...)
{
  if(!is.null(prefix)) { print(prefix) }
  if(method == "auto"){
    if(length(select.cells) > 3000){
      select.method="louvain"
    }
    else{
      select.method="ward.D"
    }
  }
  else{
    select.method=method
  }
  if(length(select.cells) <= 3000){
    if(!is.matrix(norm.dat)){
      norm.dat = as.matrix(norm.dat[,select.cells])
    }
  }
  if(is.null(result)){        
    result=onestep_clust(norm.dat, select.cells=select.cells, prefix=prefix,method=select.method,...)
    gc()
  }
  if(!is.null(result)){
    select.cells= intersect(select.cells, names(result$cl))
    #save(result, file=paste0(prefix,".rda"))
    cl = result$cl[select.cells]
    gene.mod = result$gene.mod
    markers=result$markers
    cl = setNames(as.integer(cl),names(cl))
    new.cl =cl
    cl.size = table(cl)
    to.split = names(cl.size)[cl.size >=split.size]
    if(length(to.split)>0){
      n.cl = 1
      for(x in sort(unique(cl))){
        tmp.cells = names(cl)[cl==x]
        if(!x %in% to.split){
          new.cl[tmp.cells]=n.cl
        }
        else{
          tmp.prefix = paste(prefix, x, sep=".")
          tmp.result=iter_clust(norm.dat=norm.dat, select.cells=tmp.cells, prefix=tmp.prefix,split.size=split.size,method= method,...)
          gc()
          if(is.null(tmp.result)){
            new.cl[tmp.cells]=n.cl
          }
          else{
            tmp.cl = tmp.result$cl
            if(length(unique(tmp.cl)>1)){
              new.cl[names(tmp.cl)] = n.cl + as.integer(tmp.cl)
              markers=union(markers, tmp.result$markers)
            }
          }
        }
        n.cl = max(new.cl)+1
      }
      cl = new.cl
    }
    result=list(cl=cl, markers=markers)
    return(result)
  }
  return(NULL)
}


#' Reorder cluster based on hiearchical clustering of clusters based on average cluster values for the input data matrix 
#'
#' @param cl A vector of cluster membership with cells as names, and cluster id as values. 
#' @param dat The data matrix with cells as columns. 
#'
#' @return Reorder cluster membership vector. The cluster id start from 1 to the number of clusters.
#' @export 
#'
reorder_cl <- function(cl, dat)
{
  cl.means = get_cl_means(dat,cl)
  cl.hc = hclust(as.dist(1 - cor(cl.means)),method = "average")
  cl = setNames(factor(as.character(cl), levels = cl.hc$labels[cl.hc$order]),names(cl))
  cl = setNames(as.integer(cl),names(cl))
}



combine_finer_split <- function(cl, finer.cl)
{
  if(is.factor(cl)){
    cl = setNames(as.integer(cl), names(cl))
  }
  if(is.factor(finer.cl)){
    finer.cl = setNames(as.integer(finer.cl), names(finer.cl))
  }
  max.cl = max(cl)
  finer.cl = finer.cl + max.cl
  cl[names(finer.cl)] = finer.cl
  return(cl)
}

AllenInstitute/scrattch.hicat documentation built on Oct. 20, 2023, 6:55 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

AllenInstitute/scrattch.hicat
Hierarchical Iterative Clustering Analysis for Transcriptomic data

R/cluster.R
In AllenInstitute/scrattch.hicat: Hierarchical Iterative Clustering Analysis for Transcriptomic data

Defines functions combine_finer_split reorder_cl iter_clust onestep_clust jaccard_leiden filter_RD jaccard_louvain jaccard_leiden.RANN jaccard_louvain.RANN pass_louvain knn_jaccard jaccard

Documented in iter_clust jaccard jaccard_leiden jaccard_leiden.RANN jaccard_louvain jaccard_louvain.RANN onestep_clust reorder_cl

R Package Documentation

Browse R Packages

We want your feedback!

AllenInstitute/scrattch.hicat Hierarchical Iterative Clustering Analysis for Transcriptomic data

R/cluster.R In AllenInstitute/scrattch.hicat: Hierarchical Iterative Clustering Analysis for Transcriptomic data

Defines functions combine_finer_split reorder_cl iter_clust onestep_clust jaccard_leiden filter_RD jaccard_louvain jaccard_leiden.RANN jaccard_louvain.RANN pass_louvain knn_jaccard jaccard

Documented in iter_clust jaccard jaccard_leiden jaccard_leiden.RANN jaccard_louvain jaccard_louvain.RANN onestep_clust reorder_cl

R Package Documentation

Browse R Packages

We want your feedback!

AllenInstitute/scrattch.hicat
Hierarchical Iterative Clustering Analysis for Transcriptomic data

R/cluster.R
In AllenInstitute/scrattch.hicat: Hierarchical Iterative Clustering Analysis for Transcriptomic data