scrattch.bigcat: scrattch.hicat for big data

#' Martix opertaion for big data
#' 
#' @param m A matrix or sparse matrix
#' 
#' @return a sparse matrix of Jaccard distances.
#' @export
#' library(bigstatsr)
convert_big.dat <- function(mat, logNormal=TRUE, backingfile=file.path(getwd(), "fbm"),...)
  {
    library(bigstatsr)
    m = FBM(nrow=nrow(mat),ncol=ncol(mat),backingfile=backingfile, ...)
    ind_nozero <- which(mat != 0, arr.ind = TRUE)
    m[ind_nozero] <- mat[ind_nozero]
    big.dat = list(fbm=m, row_id = row.names(mat), col_id = colnames(mat))
    big.dat$logNormal = logNormal
    return(big.dat)
  }

append_big.dat <- function(big.dat, mat)
  {
    offset = big.dat$fbm$ncol
    ind_nozero <- which(mat != 0, arr.ind = TRUE)
    tmp = ind_nozero
    tmp[,2] = tmp[,2] + offset
    big.dat$fbm$add_columns(ncol(mat))
    big.dat$fbm[tmp] <- mat[ind_nozero]
    big.dat$col_id = c(big.dat$col_id, colnames(mat))
    return(big.dat)
  }


convert_mat_list_big.dat <- function(mat.list, backingfile=NULL, ...)
  {
    if(is.null(backingfile)){
      backingfile=file.path(getwd(), paste0(Sys.Date(),"_fbm"))
    }
    big.dat =  convert_big.dat(mat.list[[1]],backingfile=backingfile, ...)
    for(i in 2:length(mat.list)){
      big.dat = append_big.dat(big.dat, mat.list[[i]])
    }
    return(big.dat)
  }

reorder_big.dat <- function(big.dat, new.cols,backingfile=NULL)
  {
    if(is.character(new.cols)){
      cols = match(new.cols, big.dat$col_id)
    }
    else{
      cols = new.cols
    }
    if(is.null(backingfile)){
      backingfile=file.path(getwd(), paste0(Sys.Date(),"_fbm"))
    }
    big.dat$fbm = big_copy(big.dat$fbm, ind.col=cols, backingfile=backingfile)
    big.dat$col_id = big.dat$col_id[cols]
    return(big.dat)
  }

convert_mat_list_big.dat_old<- function(mat.list, ...)
  {
    cn = unlist(lapply(mat.list, colnames))
    ncols = length(cn)
    nrows = nrow(mat.list[[1]])
    m = FBM(nrow=nrows,ncol=ncols, ...)
    big.dat = list(fbm=m, row_id = row.names(mat.list[[1]]) , col_id = cn)
    add.start = 1
    for(i in 1:length(mat.list)){
      mat = mat.list[[i]]
      add.end = add.start + ncol(mat) - 1
      big.dat$fbm[,add.start:add.end] = mat
      add.start = add.end + 1
    }
    return(big.dat)
  }

big_dat_apply <- function(big.dat, cols, p.FUN, .combine="c",  ncores=1, block.size = 10000,...)
  {
    require(foreach)
    require(doParallel)
    require(parallel)
    if (ncores == 1) {
      registerDoSEQ()
    }
    else {
      cl <- parallel::makeCluster(ncores)
      doParallel::registerDoParallel(cl)
      on.exit(parallel::stopCluster(cl), add = TRUE)
    }
    if(is.character(cols)){
      cols = match(cols, big.dat$col_id)
    }
    cols = sort(cols)    
    bins = split(cols, ceiling(1:length(cols)/block.size))
    res = foreach(bin = bins, .combine=.combine) %dopar% p.FUN(big.dat, bin, ...)
  }


get_cols <- function(big.dat, cols, keep.col=TRUE, sparse=TRUE)
  {
    library(Matrix)
    if(is.character(cols)){
      id = match(cols, big.dat$col_id)
    }
    else{
      id = cols
    }
    ord = order(id)

    mat = big.dat$fbm[,id[ord],drop=F]
    if(keep.col){
      org.order = (1:length(id))[order(ord)]    
      mat = mat[,org.order,drop=F]
      colnames(mat) = big.dat$col_id[id]      
    }
    else{
      colnames(mat) = big.dat$col_id[id[ord]]      
    }
    if(sparse){
      mat = Matrix(mat,sparse=sparse)
    }
    rownames(mat) = big.dat$row_id
    return(mat)
  }

get_logNormal <- function(big.dat, cols, select.genes=NULL, ...)
  {
    mat = get_cols(big.dat, cols,  ...)
    if(big.dat$logNormal){
      norm.dat = mat
    } 
    else{
      norm.dat=logCPM(mat)
    }
    if(!is.null(select.genes)){
      mat = mat[select.genes,,drop=F]
    }
    return(mat)
  }


get_counts <- function(big.dat, cols, ...)
  {
    mat = get_cols(big.dat, cols,...)
    if(big.dat$logNormal){
      mat@x = 2^mat@x -1
    }
    mat
  }



rd_PCA_big <- function(big.dat, dat, select.cells, max.dim=10, th=2, verbose=TRUE, ncores=1)
{
  
  tmp = get_PCA(dat, max.dim,verbose)
  if(is.null(tmp)){
    return(NULL)
  }
  rot = tmp$rot
  rd.dat = tmp$rd.dat
  pca = tmp$pca
  if(ncol(dat)< length(select.cells)){
    if(verbose){
      print("project")
    }
    rd.dat = big_project(big.dat, select.cells, rot, ncores=ncores)
  }
  rm(rot)
  rm(dat)
  return(list(rd.dat=rd.dat, pca=pca))
}

big_project <- function(big.dat, select.cells, rot, ncores=1,...)
  {
    project <- function(big.dat, cols, rot){
      dat = get_logNormal(big.dat, cols,...)[row.names(rot),]
      dat = t(dat)  %*% rot      
      row.names(dat) = row.names(dat)
      dat
    }
    gc()
    rd.dat= big_dat_apply(big.dat, select.cells, project, .combine="rbind",  ncores=ncores, rot=rot)
    rd.dat = as.matrix(rd.dat)
    return(rd.dat)
  }


big_project2 <- function(big.dat, select.cells, rot, ncores=1)
  {
    cols=match(select.cells, big.dat$col_id)
    cols = sort(cols)
    rows = match(row.names(rot), big.dat$row_id)
    rd.dat=big_cprodMat(big.dat$fbm, rot, ind.row=rows, ind.col=cols, ncores=ncores)
    gc()
    rd.dat = as.matrix(rd.dat)
    colnames(rd.dat) = colnames(rot)
    row.names(rd.dat) = big.dat$col_id[cols]
    return(rd.dat[select.cells,])
  }


#' One round of clustering in the iteractive clustering pipeline 
#'
#' @param norm.dat normalized expression data matrix in log transform, using genes as rows, and cells and columns. Users can use log2(FPKM+1) or log2(CPM+1).
#' @param select.cells The cells to be clustered. Default: columns of norm.dat
#' @param counts Raw gene counts. Default NULL, inferred from norm.dat.
#' @param method Clustering method. It can be "louvain", "hclust" and "kmeans". Default "louvain"
#' @param vg.padj.th High variance gene adjusted pvalue cut off. Default 0.5.
#' @param dim.method Dimension reduction techniques. Current options include "pca" and "WGCNA". Default "pca"
#' @param max.dim The number of top dimensions retained. Default 20. Since clustering is performed iteratively, not all relevant dimensions need to be captured in one iterations. 
#' @param rm.eigen The reduced dimensions that need to be masked and removed. Default NULL.  
#' @param rm.th The cutoff for correlation between reduced dimensions and rm.eigen. Reduced dimensions with correlatin with any rm.eigen vectors are not used for clustering. Default 0.7
#' @param de.param The differential gene expression threshold. See de_param() function for details. 
#' @param min.genes The minimal number of high variance and differentially expressed genes genes. Default 5. 
#' @param type Can either be "undirectional" or "directional". If "undirectional", the differential gene threshold de.param is applied to combined up-regulated and down-regulated genes, if "directional", then the differential gene threshold is applied to both up-regulated and down-regulated genes. 
#' @param maxGenes Only used when dim.method=="WGCNA". The maximum number of genes to calculate gene modules. 
#' @param sampleSize The number of sampled cells to compute reduced dimensions.
#' @param max.cl.size Sampled cluster size. This is to speed up limma DE gene calculation. Instead of using all cells, we randomly sampled max.cl.size number of cells for testing DE genes.    
#' @param prefix Used to keep track of intermediate results in "verbose" mode. Default NULL.
#' @param verbose Default FALSE
#'
#' @return Clustering result is returned as a list with two elements: 
#'         cl: cluster membership for each cell
#'         markers: top markers that seperate clusters     
#'         
onestep_clust_big<- function(big.dat, 
                             select.cells= big.dat$col_id,
                             counts = NULL,
                             method = c("louvain","ward.D", "kmeans"), 
                             vg.padj.th = 0.5, 
                             dim.method = c("pca","WGCNA"), 
                             max.dim = 20, 
                             rm.eigen = NULL, 
                             rm.th = 0.7, 
                             de.param = de_param(),
                             merge.type = c("undirectional", "directional"), 
                             maxGenes = 3000,
                             sampleSize = 5000,
                             max.cl.size = 300,
                             k.nn=15,
                             prefix = NULL, 
                             verbose = FALSE, overwrite=FALSE)
                            
  {
    library(matrixStats)
    outfile=paste0(prefix, ".rda")
    if(file.exists(outfile) & !overwrite){
      load(outfile)
      return(result)
    }
    
    method=method[1]
    merge.type=merge.type[1]
    
    sampled = sample(select.cells, min(sampleSize, length(select.cells)))
    counts = get_counts(big.dat, sampled,sparse=TRUE)
    if(verbose){
      print("Find high variance genes")
    }
    vg = findVG(counts)
    select.genes = with(vg, row.names(vg)[which(loess.padj < vg.padj.th | dispersion >3)])
    if(length(select.genes) < de.param$min.genes){
      return(NULL)
    }
    select.genes = head(select.genes[order(vg[select.genes, "loess.padj"],-vg[select.genes, "z"])],maxGenes)
    if(verbose){
      print("Reduce dimensions")
    }
    counts = logCPM(counts)[select.genes,]
    rd.result = rd_PCA_big(big.dat, dat=counts, select.cells, max.dim=max.dim, verbose=verbose)
    if(is.null(rd.result)){
      return(NULL)
    }
    rd.dat = rd.result$rd.dat
    rm(counts)
    if(!is.null(rm.eigen)){
      
      rd.dat <- filter_RD(rd.dat, rm.eigen, rm.th, verbose=verbose)
    }
    if(is.null(rd.dat)||ncol(rd.dat)==0){
      return(NULL)
    }
    if(verbose){
      print("Clustering")
    }
    max.cl = ncol(rd.dat) *2 + 1
    if(method=="louvain"){
      k = pmin(k.nn, round(nrow(rd.dat)/2))
      tmp = jaccard_louvain(rd.dat, k)
      if(is.null(tmp)){
        return(NULL)
      }
      cl = tmp$cl
      if(length(unique(cl))>max.cl){
        tmp.means = get_cl_means(rd.dat, cl)
        tmp.hc = hclust(dist(t(tmp.means)), method="average")
        tmp.cl= cutree(tmp.hc, pmin(max.cl, length(unique(cl))))
        cl = setNames(tmp.cl[as.character(cl)], names(cl))
      }
    }
    else if(method=="ward.D"){
      hc = hclust(dist(rd.dat),method="ward.D")
      #print("Cluster cells")
      cl = cutree(hc, max.cl)
    }
    else if(method=="kmeans"){
      cl = kmeans(rd.dat, max.cl)$cluster
    }
    else{
      stop(paste("Unknown clustering method", method))
    }
    sampled.cells = sample_cells(cl, max.cl.size)
    norm.dat = get_logNormal(big.dat, sampled.cells)
    merge.result=merge_cl(norm.dat, cl=cl, rd.dat=rd.dat, merge.type=merge.type, de.param=de.param, max.cl.size=max.cl.size, verbose=verbose)
    rm(norm.dat)
    gc()
    if(is.null(merge.result)){
      return(NULL)
    }
    cl = merge.result$cl
    de.genes = merge.result$de.genes
    markers= merge.result$markers
    result=list(cl=cl, markers=markers)

    if(verbose){
      save(result, file=outfile)
    }
    rm(rd.dat.t)
    rm(merge.result)
    gc()
    return(result)
  }



#' Iterative clustering algorithm for single cell RNAseq dataset
#'
#' @param norm.dat normalized expression data matrix in log transform, using genes as rows, and cells and columns. Users can use log2(FPKM+1) or log2(CPM+1)
#' @param select.cells The cells to be clustered
#' @param prefix The character string to indicate current iteration.
#' @param split.size The minimal cluster size for further splitting
#' @param result The current clustering result as basis for further splitting.
#' @param method Clustering method. It can be "auto", "louvain", "hclust"
#' @param ... Other parameters passed to method `onestep_clust()`
#'
#' @return Clustering result is returned as a list with two elements: 
#'         cl: cluster membership for each cell
#'         markers: top markers that seperate clusters     
#'         
#' @examples clust.result <- iter_clust(norm.dat)
#'           clust.result <- iter_clust(norm.dat, de.param = de_param(q1.th = 0.5, de.score.th = 100))
iter_clust_big<- function(big.dat=NULL,
                          select.cells = big.dat$col_id,
                          prefix = NULL, 
                          split.size = 10, 
                          result = NULL,
                          method = "auto",
                          ...)
  {
    if(!is.null(prefix)) {
      cat(prefix, length(select.cells),"\n")
    }
    if(method == "auto"){
      if(length(select.cells) > 2000){
        select.method="louvain"
      }
      else{
        select.method="ward.D"
      }
    }
    else{
      select.method=method
    }
    if(is.null(result)){
      result=onestep_clust_big(big.dat=big.dat, select.cells=select.cells, prefix=prefix,method=select.method, ...)
      if(is.null(result)){
        return(NULL)
      }
    }

    select.cells= intersect(select.cells, names(result$cl))
    
    cl = result$cl[select.cells]
    gene.mod = result$gene.mod
    markers=result$markers
    cl = setNames(as.integer(cl),names(cl))
    new.cl =cl
    cl.size = table(cl)
    to.split = names(cl.size)[cl.size >=split.size]
    if(length(to.split)>0){
      n.cl = 1
      for(x in sort(unique(cl))){
        tmp.cells = names(cl)[cl==x]
        if(!x %in% to.split){
          new.cl[tmp.cells]=n.cl
        }
        else{
          tmp.prefix = paste(prefix, x, sep=".")
          tmp.result=iter_clust_big(big.dat=big.dat, select.cells=tmp.cells, prefix=tmp.prefix,split.size=split.size,method= method, ...)
          gc()
          if(is.null(tmp.result)){
            new.cl[tmp.cells]=n.cl
          }
          else{
            tmp.cl = tmp.result$cl
            if(length(unique(tmp.cl)>1)){
              new.cl[names(tmp.cl)] = n.cl + as.integer(tmp.cl)
              markers=union(markers, tmp.result$markers)
            }
          }
        }
        n.cl = max(new.cl)+1
      }
      cl = new.cl
    }
    result=list(cl=cl, markers=markers)
    return(result)
  }