R/SCATEpipeline.R
In SCATE: SCATE: Single-cell ATAC-seq Signal Extraction and Enhancement

Documented in SCATEpipeline

#' SCATE Pipeline
#'
#' SCATE pipeline for reading in bam files, clustering cells, and performing SCATE
#'
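#' This function takes as input a vector of bam files. It then reads in the bam files, clusters the cells, and performs SCATE for each cell cluster. When only one bam file is given, cell clustering is skipped and SCATE is run directly on that file.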
#' @param bamfile Character vector of bam files to be processed
#' @param genome Character variable of either "hg19" or "mm10".
#' @param cellclunum Numeric variable giving the number of cell clusters when clustering cells. If NULL the cluster number will be determined automatically.
#' @param CREclunum Numeric variable giving the number of CRE clusters when running SCATE. If NULL the cluster number will be determined automatically.
#' @param ncores Numeric variable of number of cores to use. If NULL, the maximum number of cores is used.
#' @param perplexity Numeric variable specifying perplexity of tSNE. Reduce perplexity when sample size is small.
#' @param datapath Character variable of the path to the customized database (eg myfolder/database.rds). The database can be made using 'makedatabase' function. If not null, 'genome' is ignored.
#' @param example An indicator of whether this is running an example or real data. When running real data, this should be set to FALSE. The default is FALSE.
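#' @return A list. When more than one bam file is given, the list has three elements. The first element ('cellcluster') is a list generated by the cellcluster function, and it contains the cell clustering results. The second element ('SCATE') is a matrix generated by the SCATE function. Each column is the SCATE result for one cell cluster. Column names indicate the cluster id. The third element ('peak') is a list of peaks. Each element is the peak list for one cluster. The name of the element indicates the name of the cluster. When a single bam file is given, the list contains only the 'SCATE' and 'peak' elements. When 'example' is TRUE, only the SCATE result is returned (invisibly).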
#' @export
#' @import GenomicAlignments GenomicRanges parallel splines2 xgboost
#' @author Zhicheng Ji, Weiqiang Zhou, Wenpin Hou, Hongkai Ji* <whou10@@jhu.edu>
#' @examples
#' f <- list.files(paste0(system.file(package="SCATEData"),"/extdata/"),full.names = TRUE,pattern='.bam$')
#' #Users need to set CREclunum to be NULL in real applications.
#' SCATEpipeline(f[1],genome="hg19",CREclunum=156,perplexity=0.1,example=TRUE) 

SCATEpipeline <- function(bamfile,genome='hg19',cellclunum=NULL,CREclunum=NULL,datapath=NULL,ncores=1,perplexity=30,example=FALSE) {
   # Fail early with a clear message. An empty input has length != 1, so it
   # would otherwise fall into the multi-cell branch below (or hit
   # 'satac[[1]]' in example mode) and fail with an unrelated error.
   if (length(bamfile) == 0) {
      stop("'bamfile' must contain at least one bam file.", call. = FALSE)
   }

   # Read each bam file as paired-end alignments and convert it to GRanges.
   # sapply(simplify=FALSE) is used instead of lapply so that, for a character
   # vector input, the resulting list is named after the input files.
   satac <- sapply(bamfile,function(f) GRanges(readGAlignmentPairs(f)),simplify=FALSE)

   if (example) {
      # Example mode: no preprocessing, clustering or peak calling. Nothing is
      # computed when R runs under the '/i386' sub-architecture.
      if (Sys.getenv('R_ARCH') == '/i386') {
         return(invisible(NULL))
      }
      # Only the first two ranges of the first file are passed to SCATE.
      # The result is returned invisibly so that calling the example does not
      # print it.
      return(invisible(
         SCATE(satac[[1]][seq_len(2)],genome=genome,clunum=CREclunum,datapath=datapath,ncores=ncores)
      ))
   }

   # Warnings raised while preprocessing the reads are deliberately silenced.
   suppressWarnings(satac <- satacprocess(satac,type='gr'))

   if (length(bamfile) == 1) {
      # A single file cannot be clustered: run SCATE on it directly.
      SCATEres <- SCATE(satac,genome=genome,clunum=CREclunum,datapath=datapath,ncores=ncores)
      return(list(SCATE=SCATEres,peak=peakcall(SCATEres)))
   }

   # Several files: cluster the cells first, then run SCATE per cell cluster.
   cellclu <- cellcluster(satac,genome=genome,perplexity=perplexity,clunum=cellclunum,datapath=datapath)
   SCATEres <- SCATE(satac,genome=genome,cluster=cellclu$cluster,clunum=CREclunum,datapath=datapath,ncores=ncores)
   list(cellcluster=cellclu,SCATE=SCATEres,peak=peakcall(SCATEres))
}