R/SnapATACUtils.R

Defines functions addMatToSnap filterSnapBmat runLeiden runKNN queryEmbedding.default queryEmbedding landmarkEmbedding downSampleOnSnap plotConsensus snapGmat2Seurat emptySnapEmbedding emptySnapGraph orderSnap

Documented in addMatToSnap downSampleOnSnap emptySnapEmbedding emptySnapGraph filterSnapBmat landmarkEmbedding orderSnap plotConsensus queryEmbedding runKNN runLeiden snapGmat2Seurat

#' Order Snap based on the sample information
#' This is critical for adding bmat and other related functions in SnapATAC
#' @param s snap object
#' @return snap object
#' @export
orderSnap <- function(s) {
  if (all(s@sample == sort(s@sample))) {
    message("No need to order the snap since samples are ordered.")
  } else {
    message("Need to order the snap since samples are not ordered")
    s <- s[order(s@sample), ]
  }
  return(s)
}

#' Empty snap's KNN graph if it has.
#' @param s snap object
#' @return snap object
#' @export
emptySnapGraph <- function(s) {
  ngraph <- nrow(s@graph@mat)
  if (ngraph > 0) {
    message(ngraph, " nodes in current snap graph, and remove it.")
    s@graph@mat <- Matrix::Matrix(0, 0, 0, sparse = TRUE)
    s@graph@k <- numeric()
    s@graph@snn <- FALSE
    s@graph@file <- character()
    s@graph@snn.prune <- numeric()
  } else {
    message("No KNN graph is found in the snap.")
  }
  return(s)
}

#' Empty snap's embedding recorded in smat slot if it has.
#' @param s snap object
#' @return snap object
#' @export
emptySnapEmbedding <- function(s) {
  if (nrow(s@smat@dmat) > 0) {
    message("Snap has smat, now remove it.")
    s@smat@dmat <- matrix(0, 0, 0)
    s@smat@sdev <- numeric()
    s@smat@method <- character()
  } else {
    message("Snap has no smat.")
  }
  return(s)
}

#' Convert snap object to Seurat by gmat
#'
#' @param snap snap object defined in SnapATAC package
#' snap@gmat must be not empty; snap@smat@dmat should not be empty
#' @param eigDims vector, used for choosing PCA components, default 1:50
#' @param assay characters, name used in Seurat object
#' @param pcaPrefix characters, default "SnapATAC_"
#' @param nameDelim characters, default "-"
#' @param useSnapATACEmbed bool, use SnapATAC embedding as pca
#' embeding in Seurat, default TRUE.
#' @return Seurat object
#' @import Seurat
#' @export
snapGmat2Seurat <- function(snap, eigDims = 1:50,
                            assay = "GeneScore",
                            pcaPrefix = "SnapATAC_",
                            nameDelim = "-",
                            useSnapATACEmbed = TRUE) {
  # check snap@gmat
  # check snap@smat@dmat
  metaData <- snap@metaData
  rownames(metaData) <- paste(metaData$sample, metaData$barcode, sep = ".")
  gmatUse <- Matrix::t(snap@gmat)
  colnames(gmatUse) <- paste(metaData$sample, metaData$barcode, sep = ".")
  snapSeurat <- Seurat::CreateSeuratObject(counts = gmatUse, assay =
                                                               assay,
                                           names.delim = nameDelim)
  snapSeurat <- Seurat::AddMetaData(snapSeurat, metadata = metaData)
  if (useSnapATACEmbed) {
    message("Use SnapATAC Embed as pca in Seurat.")
    pcaUse <- snap@smat@dmat[, eigDims]
    rownames(pcaUse) <- paste(metaData$sample, metaData$barcode, sep = ".")
    colnames(pcaUse) <- paste0(pcaPrefix, 1:ncol(pcaUse))
    snapSeurat[["pca"]] <- methods::new(Class = "DimReduc", cell.embeddings = pcaUse,
      feature.loadings = matrix(0, 0, 0),
      feature.loadings.projected = matrix(0, 0, 0),
      assay.used = assay, stdev = rep(1, ncol(pcaUse)),
      key = pcaPrefix,
      jackstraw = new(Class = "JackStrawData"), misc = list())
  }
  return(snapSeurat)
}

#' Consensus plot.
#' @param consensusFile characters
#' @param type characters, cell type name for title
#' @param outPDF characters
#' @param width numeric, default is 7
#' @param height numeric, default is 7
#' @return None.
#' Side effect: generate the figure file.
#' @importFrom graphics axis legend mtext par
#' @importFrom methods new
#' @importFrom utils read.table
#' @importFrom stats as.formula complete.cases
#' @importFrom grDevices dev.off pdf
#' @export
plotConsensus <- function(consensusFile, type,
                          outPDF, width = 7, height = 7) {
  consensus <- read.table(consensusFile)
  colnames(consensus) <- c("file", "Resolution", "Dispersion",
                           "ProportionOfAmbiguousClustering")
  pdf(file = outPDF, width = width, height = height)
  par(mar = c(5, 4, 4, 4) + 0.3)
  plot(consensus$Resolution, consensus$Dispersion,
       type = "l", pch = 16, col = "red", lwd = 4,
       xlab = "Resolution", ylab = "Dispersion", cex.lab = 1.5,
       main =
         paste("Consensus analysis for Leiden-base clustering on",
               type, "cells")
       )
  par(new = TRUE)
  plot(consensus$Resolution, consensus$ProportionOfAmbiguousClustering,
       pch = 17, col = "blue",
       axes = FALSE, type = "l", lwd = 4, xlab = "", ylab = ""
       )
  axis(side = 4,
       at = pretty(range(consensus$ProportionOfAmbiguousClustering)))
  legend("topright", legend = c("Dispersion (left;red)", "PAC (right;blue)"),
         col = c("red", "blue"), cex = 1, lty = c(3, 3))
  mtext("Proportion Of Ambiguous Clustering (PAC)",
        side = 4, line = 3, cex = 1.5)
  dev.off()
}


#' Down sample snap object based on a given cluster meta.
#' @param snap SnapObject from SnapATAC
#' @param cluster vector of characters/integer
#' aligned with the cells in snap in order.
#' @param n integer, number of samples in each unique cluster id.
#' @return SnapObject
#' @export
downSampleOnSnap <- function(snap, cluster, n = 200) {
  snapList <- lapply(unique(cluster), function(i) {
    s <- snap[cluster %in% i, , drop = FALSE]
    if(SnapATAC::nrow(s) <= n) {
      return(s)
    } else {
      return(s[sample(SnapATAC::nrow(s), size = n, replace = F), ])
    }
  })
  return(SnapATAC::snapListRbind(snapList = snapList))
}

#' SnapATAC landmark embedding.
#' @param snap snap object of SnapATAC, default NULL
#' @param snapFile characters name of the snap File to load, default NULL
#' @param blacklistGR GenomicRanges object, default NULL
#' @param blackListFile characters file of the blacklistGR, default NULL
#' @param binSize integer dim of bmat, default 5,000
#' @param nPC integer dim of embeding for embedding, default 50
#' @param ncores integer number of cores to use for parallel, default 1
#' @param outFile characters to which we write snap with embedding, default NULL
#' @param excludeChr characters used to filter bmat, default "random|chrM"
#' @param removeBmat bool default FALSE
#' @param removeJmat bool default FALSE
#' @return SnapObject
#' @export
landmarkEmbedding <- function(snap = NULL,
                              snapFile = NULL,
                              blacklistGR = NULL,
                              blackListFile = NULL,
                              binSize = 5000,
                              nPC = 50,
                              ncores = 1,
                              outFile = NULL,
                              excludeChr = "random|chrM",
                              removeBmat = FALSE,
                              removeJmat = FALSE) {
  if (!is.null(snapFile)) {
    if (grepl("\\.RData", snapFile)) {
      snap <- loadRData(snapFile, check = FALSE)
    } else {
      snap <- readRDS(snapFile)
    }
  }
  if (is.null(snap)) {
    stop("No snap found.")
  }
  if (!is.null(blackListFile)) {
    blackList <- read.table(blackListFile,
                            header = FALSE, quote = "")
    blacklistGR <- GenomicRanges::GRanges(
      seqnames = blackList[, 1],
      ranges = IRanges::IRanges(blackList[, 2], blackList[, 3]))
  }
  if(is.null(blacklistGR)) {
    stop("No blacklist found")
  }
  if (!is.null(outFile)) {
    prepareOutfile(outFile)
  }
  ## add bmat
  if (nrow(snap@bmat) > 0) {
    message("Snap has bmat, will skip addBmat.")
  } else {
    snap <- SnapATAC::addBmatToSnap(
      obj = snap, bin.size = binSize, do.par = TRUE,
      num.cores = ncores,
      checkSnap = FALSE
    )
  }
  ## preprocess
  idy <- S4Vectors::queryHits(
    GenomicRanges::findOverlaps(snap@feature, blacklistGR))
  if (length(idy) > 0) {
    message("remove blacklist")
    snap <- snap[, -idy, mat = "bmat"]
  }

  chr.exclude <-  GenomeInfoDb::seqlevels(snap@feature)[
    grep(excludeChr, GenomeInfoDb::seqlevels(snap@feature))
  ]

  idy <- grep(paste(chr.exclude, collapse = "|"), snap@feature)
  if (length(idy) > 0) {
    message("remove ", excludeChr)
    snap <- snap[, -idy, mat = "bmat"]
  }

  ## filter based on bincoverage
  bin.cov <- log10(Matrix::colSums(snap@bmat) + 1)
  binCutoff <- quantile(bin.cov[bin.cov > 0], 0.95)
  idy <- which(bin.cov <= binCutoff & bin.cov > 0)
  snap <- snap[, idy, mat = "bmat"]
  ## binary snap bmat
  if(max(snap@bmat) == 1) {
    message("@bmat is already binarized.")
  } else {
    snap <- SnapATAC::makeBinary(snap, mat = "bmat")
  }

  snap <- SnapATAC::runDiffusionMaps(
    obj = snap,
    input.mat = "bmat",
    num.eigs = nPC,
    method = "RSpectra"
  )
  snap@metaData$landmark <- 1
  if(removeBmat & (nrow(methods::slot(snap, "bmat")) != 0L)) {
    message("Remove bmat.")
    snap <- SnapATAC::rmBmatFromSnap(snap)
  }
  if(removeJmat) {
    message("Remove jmat.")
    snap@jmat <- SnapATAC::newJaccard()
  }
  
  if (!is.null(outFile)) {
    saveRDS(object = snap, file = outFile)
  }
  return(snap)
}

#' SnapATAC query embedding.
#' @param snapLandmark snap object of SnapATAC, default NULL
#' @param snaplandmarkFile characters name of the snap File to load, default NULL
#' @param snapQuery snap object of SnapATAC, default NULL
#' @param snapQueryFile characters name of the snap File to load, default NULL
#' @param binSize integer dim of bmat, default 5,000
#' @param outFile characters to which we write snap with embedding, default NULL
#' @param chunkSize integer default 20,000
#' @param removeBmat bool default FALSE
#' @param removeJmat bool default FALSE
#' @return SnapObject
#' @export
queryEmbedding <- function(snapLandmark = NULL,
                           snapLandmarkFile = NULL,
                           snapQuery = NULL,
                           snapQueryFile = NULL,
                           binSize = 5000,
                           outFile = NULL,
                           chunkSize = 20000,
                           removeBmat = TRUE,
                           removeJmat = TRUE) {
  if (!is.null(snapLandmarkFile)) {
    snapLandmark <- readRDS(snapLandmarkFile)
  }
  if (is.null(snapLandmark)) {
    stop("No snapLandmark found.")
  }
  if (!is.null(snapQueryFile)) {
    snapQuery <- readRDS(snapQueryFile)
  }
  if (is.null(snapQuery)) {
    stop("No snapQuery found.")
  }
  if (!is.null(outFile)) {
    prepareOutfile(outFile)
  }
  n <- SnapATAC::nrow(snapQuery)
  if (n > chunkSize) {
    message(n, " has too many cells than ",
            chunkSize, " chunk.")
    message("now partition them into chunks.")
    chunks <- split(seq(n), ceiling(seq(n) / chunkSize))
  } else {
    chunks <- NULL
  }
  message("Add bmat.")
  if (is.null(chunks)) {
    snapQuery <- queryEmbedding.default(
      snapLandmark, snapQuery, binSize, removeBmat, removeJmat)
  } else {
    snapQueryList <- lapply(chunks, function(i) {
      s <- snapQuery[i, , drop = FALSE]
      s <- queryEmbedding.default(
        snapLandmark, s, binSize, removeBmat, removeJmat
      )
      return(s)
    })
    snapQuery <- SnapATAC::snapListRbind(snapList = snapQueryList,
                                         checkSnap = FALSE)
  }
  if(!is.null(outFile)) {
    saveRDS(object = snapQuery, file = outFile)
  }
  return(snapQuery)
}

queryEmbedding.default <- function(snapLandmark,
                                   snapQuery,
                                   binSize = 5000,
                                   removeBmat = TRUE,
                                   removeJmat = TRUE) {
  if (nrow(snapQuery@bmat) > 0) {
    message("snapQuery has bmat, will skip addBmat for it.")
  } else {
    snapQuery <- SnapATAC::addBmatToSnap(snapQuery, bin.size = binSize)
  }
  if(max(snapQuery@bmat) == 1) {
    message("@bmat is already binaryized.")
  } else {
    snapQuery <- SnapATAC::makeBinary(snapQuery)
  }
  idy <- unique(S4Vectors::queryHits(
    GenomicRanges::findOverlaps(snapQuery@feature, snapLandmark@feature)))
  snapQuery <- snapQuery[, idy, mat = "bmat"]
  message("Run embedding for query set.")
  snapQuery <- SnapATAC::runDiffusionMapsExtension(
    obj1 = snapLandmark,
    obj2 = snapQuery,
    input.mat = "bmat"
  )
  snapQuery@metaData$landmark <- 0
  if (removeBmat & (nrow(methods::slot(snapQuery, "bmat")) != 0L)) {
    message("Remove query bmat")
    snapQuery <- SnapATAC::rmBmatFromSnap(snapQuery)
  }
  if (removeJmat) {
    message("Remove jmat.")
    snapQuery@jmat <- SnapATAC::newJaccard()
  }
  return(snapQuery)
}

#' Run SnapATAC KNN.
#'
#' It will check snapLandmark and snapQuery firstly,
#' if snapLandmark is not NULL, will use it.
#' if snapQuery then is also NULL, will merge them.
#' if both of them are NULL, then use snapAllFile, snapAll in order.
#' 
#' @param snapAll SnapObject, default NULL
#' @param snapAllFile characters, default NULL
#' @param snapLandmark SnapObject, default NULL
#' @param snapLandmarkFile characters, default NULL
#' @param snapQuery SnapObject, default NULL
#' @param snapQueryFile SnapObject, default NULL
#' @param removeBmat bool, default TRUE
#' @param removeJmat bool, default TRUE
#' @param k integer, default 50
#' @param dims integer or vector, default is 1:30
#' @param method characters, method for KNN (either "RANN" or "Annoy")
#' default "RANN"
#' @param runUMAP bool, default TRUE
#' @param umapNcores integer, default 1
#' @param outmmtxFile characters, output file of mmtx, default NULL
#' @param outSnapFile characters, default NULL
#' @return SnapObject
#' @export
runKNN <- function(snapAll = NULL,
                   snapAllFile = NULL,
                   snapLandmark = NULL,
                   snapLandmarkFile = NULL,
                   snapQuery = NULL,
                   snapQueryFile = NULL,
                   removeBmat = TRUE,
                   removeJmat = TRUE,
                   k = 50,
                   dims = 1:30,
                   method = "RANN",
                   runUMAP = TRUE,
                   umapNcores = 1,
                   outmmtxFile = NULL,
                   outSnapFile = NULL) {
  if (!is.null(snapQueryFile)) {
    snapQuery <- readRDS(snapQueryFile)
  }
  if (!is.null(snapLandmarkFile)) {
    snapLandmark <- readRDS(snapLandmarkFile)
  }

  if (!is.null(snapLandmark)) {
    if (!is.null(snapQuery)) {
      snapAll <- SnapATAC::snapRbind(snapLandmark, snapQuery)
    } else {
      snapAll <- snapLandmark
    }
  }
  if (!is.null(snapAllFile)) {
    snapAll <- readRDS(snapAllFile)
  }

  if (is.null(snapAll)) {
    stop("Final snapAll is NULL.")
  }

  if (!is.null(outmmtxFile)) {
    prepareOutfile(outmmtxFile)
  }
  if (!is.null(outSnapFile)) {
    prepareOutfile(outSnapFile)
  }

  if (removeBmat & (nrow(methods::slot(snapAll, "bmat")) != 0L)) {
    message("Remove snapAll bmat.")
    snapAll <- SnapATAC::rmBmatFromSnap(snapAll)
  }

  if (removeJmat) {
    message("Remove snapAll jmat.")
    snapAll@jmat <- SnapATAC::newJaccard()
  }
  if (length(dims) == 1) {
    message("Dim is a scalar and convert it to vector.")
    dims <- 1:dims
  }

  message("Run KNN with k ", k, " and size of dims ", length(dims))
  message("KNN method: ", method)
  snapAll <- SnapATAC::runKNN(obj = snapAll, eigs.dims = dims, k = k, method = method)
  if (!is.null(outmmtxFile)) {
    message("save graph as mmtx file.")
    Matrix::writeMM(snapAll@graph@mat, file = outmmtxFile)
  }
  if (runUMAP) {
    message("Run Umap.")
    snapAll <- SnapATAC::runViz(
      obj = snapAll, tmp.folder = tempdir(), dims = 2,
      eigs.dims = dims, method = "uwot", seed.use = 2022,
      num.cores = umapNcores
    )
  }
  if (!is.null(outSnapFile)) {
    saveRDS(snapAll, outSnapFile)
  }
  return(snapAll)
}

#' Run Leiden Algorithm.
#' @param snap SnapObject default NULL
#' @param snapFile characters default NULL
#' @param r numeric resolution paramter for leiden, default 0.5
#' @param pt characters partition type for leiden, default "RB"
#' @param seed integer default NULL
#' @param pathToPython characters where the python is, default NULL
#' @param outLeidenFile characters default NULL
#' @param outClusterMetaCSV characters default NULL
#' @param outClusterPDF characters default NULL
#' @param pdfn function used to draw figures for outClusterPDF
#' the inputs are snap object and others
#' default NULL.
#' @param colName characters used for record cluster namae in snap meta,
#' default is "cluster". if snap@metaData has this column,
#' will use [colName].1 instead
#' @param dims integer or vector for UMAP default is 1:30
#' @param umapNcores integer default 1
#' @param ... use for pdfn function
#' @return SnapObject, slot "cluster" is the result
#' @import reticulate
#' @export
runLeiden <- function(snap = NULL,
                      snapFile = NULL,
                      r = 0.5,
                      pt = "RB",
                      seed = 10,
                      pathToPython = NULL,
                      outLeidenFile = NULL,
                      outClusterMetaCSV = NULL,
                      outClusterPDF = NULL,
                      pdfn = NULL,
                      colName = "cluster",
                      dims = 1:30,
                      umapNcores = 1,
                      ...) {
  if (!is.null(snapFile)) {
    snap <- readRDS(snapFile)
  }
  if (is.null(snap)) {
    stop("Snap is NULL.")
  }
  invisible(lapply(c(outLeidenFile, outClusterMetaCSV, outClusterPDF),
    function(i) {if (!is.null(i)) {prepareOutfile(i)}}))
  message("Run Leiden algorithm with: resolution ", r,
          " partition type: ", pt)
  if(!is.null(pathToPython)) {
    reticulate::use_python(pathToPython, required = TRUE)
    message("python path: ", pathToPython)
  }
  setSessionTimeLimit(cpu = Inf, elapsed = Inf)
  message("Loading python smmuty package.")
  pymod <- reticulate::import(module = "smmuty", convert = FALSE)
  message("Clustering.")
  c <- as.factor(reticulate::py_to_r(
    pymod$leiden(knn = reticulate::r_to_py(snap@graph@mat),
                 reso = r,
                 seed = as.integer(seed),
                 opt = pt)
  ))
  ## snap only accept factors
  snap@cluster <- c
  ## Label start from 1
  c <- as.integer(c)
  message("Summarize the clustering result:")
  print(table(c))
  m <- snap@metaData
  if (!("CellID" %in% colnames(m))) {
    m$CellID <- paste(snap@sample, snap@barcode, sep = ".")
  }
  if (colName %in% colnames(m)) {
    warning(colName, " is in the column of snap meta data.")
    colName <- paste0(colName, ".1")
    warning("Use ", colName, " instead.")
  }
  m[, colName] <- c

  ## cluster start from 1
  if (!is.null(outLeidenFile)) {
    message("Output Leiden result: ", outLeidenFile)
    write.table(x = c, file = outLeidenFile,
      row.names = FALSE, col.names = FALSE, quote = FALSE)
  } else {
    warning("No output of Leiden cluster file.")
  }
  if (!is.null(outClusterPDF)) {
    if (nrow(methods::slot(snap, "umap")) == 0L) {
      warning("No UMAP found, and now calculate it.")
      if (length(dims) == 1) {
        message("Dim is a scalar and convert it to vector.")
        dims <- 1:dims
      }
      snap <- SnapATAC::runViz(
        obj = snap, tmp.folder = tempdir(), dims = 2,
        eigs.dims = dims, method = "uwot", seed.use = 2022,
        num.cores = umapNcores
      )
    } # end of update umap
    if (is.null(pdfn)) {
      warning("No pdfn is found.")
      warning("No output of custer pdf.")
    } else {
      message("Output cluster pdf: ", outClusterPDF)
      withr::with_pdf(outClusterPDF, code = {
        pList <- pdfn(embed = snap@umap,
             meta = m,
             checkRowName = FALSE,
             names = c(colName, "TSSe", "log10UMI"),
             discretes = c(TRUE, FALSE, FALSE),
             legends = c(FALSE, TRUE, TRUE),
             addLabels = c(TRUE, FALSE, FALSE),
             ...)
        lapply(pList, function(p) {
          if(!is.null(p)) {
            print(p)
          }
        }) }, width = 10, height = 10)
    }
  } else {
    warning("No output of custer pdf.")
  }# end of outClusterPDF
  if (!is.null(outClusterMetaCSV)) {
    message("Output meta data to: ", outClusterMetaCSV)
    if (nrow(methods::slot(snap, "umap")) > 1L) {
      m$UMAP1 <- snap@umap[, 1]
      m$UMAP2 <- snap@umap[, 2]
    }
    write.table(x = m, file = outClusterMetaCSV,
      row.names = FALSE, col.names = TRUE,
      sep = ",", quote = FALSE)
  } else {
    warning("No output of cluster meta file.")
  }
  return(snap)
}


#' Filter bmat
#' removing common bins and those overlapped with blacklist
#' @param snap snap object of SnapATAC, default NULL
#' @param snapFile characters name of the snap File to load, default NULL
#' @param low double threshold, default 0 (bin freq <= low will be removed)
#' @param high double threshold, default 0.99 (bin freq > high will be removed)
#' @param blacklistGR GenomicRanges object, default NULL
#' @param blackListFile characters file of the blacklistGR, default NULL
#' @param binSize integer dim of bmat, default 5,000
#' @param ncores integer number of cores to use for parallel, default 1
#' @param excludeChr characters used to filter bmat, default "random|chrM"
#' @param addBmatIfEmpty bool, default FALSE
#' @return SnapObject
#' @export
filterSnapBmat <- function(snap = NULL,
                           snapFile = NULL,
                           low = 0,
                           high = 0.99,
                           blackListFile = NULL,
                           blacklistGR = NULL,
                           excludeChr = "random|chrM",
                           addBmatIfEmpty = FALSE,
                           binSize = 5000,
                           ncores = 1) {
  ## load snap
  if (!is.null(snapFile)) {
    if (grepl("\\.RData", snapFile)) {
      snap <- loadRData(snapFile, check = FALSE)
    } else {
      snap <- readRDS(snapFile)
    }
  }
  if (is.null(snap)) {
    stop("No snap found.")
  }
  ## check bmat
  if(nrow(snap@bmat) < 1) {
    warning("Snap has no bmat.")
    if(addBmatIfEmpty) {
      message("Add bmat to snap.")
      snap <- SnapATAC::addBmatToSnap(
        obj = snap, bin.size = binSize, do.par = TRUE,
        num.cores = ncores,
        checkSnap = FALSE)
    } else {
      stop("Add bmat is FALSE.")
    }
  } 

  ## load black list
  if (!is.null(blackListFile)) {
    blackList <- read.table(blackListFile,
                            header = FALSE, quote = "")
    blacklistGR <- GenomicRanges::GRanges(
      seqnames = blackList[, 1],
      ranges = IRanges::IRanges(blackList[, 2], blackList[, 3]))
  }
  if(is.null(blacklistGR)) {
    warning("Blacklist is empty.")
  } else {
    message("Filtering bin based on blacklist...")
    idy <- S4Vectors::queryHits(
      GenomicRanges::findOverlaps(snap@feature, blacklistGR))
    if (length(idy) > 0) {
      message("Remove blacklist-related ", length(idy), " bins.")
      snap <- snap[, -idy, mat = "bmat"]
    } else {
      message("No blacklist-related bins.")
    } 
  }
  message("Remove bins from the excludeChr: ", excludeChr)
  chr.exclude <-  GenomeInfoDb::seqlevels(snap@feature)[
    grep(excludeChr, GenomeInfoDb::seqlevels(snap@feature))
  ]
  idy <- grep(paste(chr.exclude, collapse = "|"), snap@feature)
  if (length(idy) > 0) {
    message("Remove ", length(idy),
            " bins located in ", excludeChr)
    snap <- snap[, -idy, mat = "bmat"]
  } else {
    message("No bins located in the excludeChr: ", excludeChr)
  }
  message("Filter bins based on the coverage.")
  bincov <- Matrix::colSums(snap@bmat) / nrow(snap@bmat)
  lowIndex <- which(bincov <= low)
  highIndex <- which(bincov > high)
  message(length(lowIndex), " features are blow ", low,
          " frequency.")
  message(length(highIndex), " features are above ", high,
          " frequencey.")
  removeIndex <- union(lowIndex, highIndex)
  if(length(removeIndex) == length(bincov)) {
    message("All the features will be filtered.")
    snap@bmat <- Matrix::Matrix(nrow = 0, ncol = 0, sparse = TRUE)
  } else {
    message(length(removeIndex), " / ", length(bincov),
            " features are removed.")
    snap <- snap[ , -removeIndex, mat = "bmat"]
  }
  return(snap)
}

#' Add bmat or gmat to snap
#'
#' @param snap snap object
#' @param cellGroup vector of characters
#' cluster labels for snap default is NULL
#' @param group characters which cluster needes
#' default is NULL
#' @param outDir characters default is NULL
#' @param matType characters default is "bmat"
#' @param gencode characters default is "vM16"
#' @param ndp integer default is -1
#' if its value is larger than 0, then down sampling will be used.
#' @param ncores integer default is 2
#' @param force bool default is TRUE
#' if add mat even the mat already exists
#' @param binSize integer default is 5000
#' @param coverFile bool default is TRUE
#' when set it TRUE, it will cover the output file if exists.
#' @param prefix characters default is "snap", which is used for output.
#' @return snap object with corresponding mat.
#' @export
addMatToSnap <- function(snap,
                         cellGroup = NULL,
                         group = NULL,
                         outDir = NULL,
                         matType = "bmat",
                         gencode = "vM16",
                         ndp = -1,
                         ncores = 2,
                         force = TRUE,
                         binSize = 5000,
                         coverFile = TRUE,
                         prefix = "snap") {
  # * get rowIndex belonging to a group
  rowIndex <- seq(nrow(snap))
  if ((!is.null(cellGroup)) & (length(cellGroup) != SnapATAC::nrow(snap))) {
    stop("cellGroup size does not match snap cell number.")
  }

  if (!is.null(cellGroup) & (!is.null(group))) {
    rowIndex <- cellGroup %in% group
    if (sum(rowIndex) == 0) {
      warning(group, " does not exist, and skip it.")
      return(NULL)
    }
  }
  # * down sampling if needed
  if ((ndp > 0) & (sum(rowIndex) > ndp)) {
    message("Downsample: ", ndp)
    rowIndex <- sort(sample(which(rowIndex), size = ndp, replace = FALSE))
  }
  # * get snap
  s <- snap[rowIndex, ]
  # ** double check the order of the snap
  s <- pureRUtils::orderSnap(s)
  # * get mat
  if (!(matType %in% c("bmat", "gmat"))) {
    stop("Do not support ", matType)
  }
  # * check if need to get mat
  flag <- TRUE
  n <- nrow(snap@bmat)
  if (matType == "gmat") {
    n <- nrow(snap@gmat)
  }
  if (n != 0L) {
    message("The snap has already owned a ", matType)
    if (!force) {
      message("Skip it due to the rule is not forced.")
      flag <- FALSE
    }
  }
  # * only run when flag is TRUE
  if (flag) {
    message("Get ", matType)
    if (matType == "bmat") {
      s <- SnapATAC::addBmatToSnap(obj = s, bin.size = binSize,
        do.par = TRUE, num.cores = ncores,
        checkSnap = TRUE)
    } else {
      s <- SnapATAC::addGmatToSnap(obj = s, do.par = TRUE,
        num.cores = ncores, checkSnap = TRUE)

    }
  }
  # * output
  if (is.null(outDir)) {
    return(s)
  }
  prepareOutdir(outDir)
  if (ndp > 0) {
    prefix <- paste(prefix, paste("dp", ndp, sep = "-"), sep = ".")
  }
  suffix <- paste(matType, "rds", sep = ".")
  outfile <- file.path(outDir, paste(prefix, suffix, sep = "."))
  if (matType == "gmat") {
    outmat <- file.path(outDir,
                        paste(prefix, paste("gencode", gencode, sep = "-"),
                              suffix, sep = "."))
  }
  if (file.exists(outfile) & (!coverFile)) {
    message("File exist. Skip.")
  } else {
    message("Generate ", matType, " file: ", outfile)
    saveRDS(s, outfile)
  }
  return(s)
}
beyondpie/pureRUtils documentation built on Jan. 10, 2023, 3:22 a.m.