scAPAtrap: Identification and Quantification of Alternative Polyadenylation Sites from Single-cell RNA-seq Data

Documented in convertAPAtrapData countPeaks dedupByPos extractBcAndUb findPeaks findPeaksByStrand findTails findTailsByPeaks findUniqueMap generateAlignBam generateFinalBam generateRefIndex generateSAF generatescExpMa initScAPAtrap loadBpCoverages reducePeaks scAPAtrap separateBamBystrand setTools setTrapParams TRAPFILES TRAP.PARAMS

#' @import GenomicRanges
#' @importFrom dplyr group_by summarise mutate
#' @importFrom Matrix sparseMatrix
#' @importFrom reshape2 dcast
#' @importFrom IRanges IRanges
#' @importFrom utils write.table
#' @importFrom stats quantile
#' @importFrom magrittr %>%
NULL

########## Utils ####################
.getDotArg<-function (name, value, ...) {
  args <- list(...)
  if (!name %in% names(args)) {
    return(value)
  }
  else {
    return(args[[name]])
  }
}

## global variable to record output files in scAPAtrap, a two-column data.frame (file, what)
## files' what marked with * are temporary files that can be savely deleted.
myenv=new.env()
myenv$TRAPFILES <- data.frame(matrix(nrow=0, ncol=2, dimnames=(list(c(), c('file','what')))))
#utils::globalVariables(c("TRAPFILES", "TRAP.PARAMS")) #not works

#' TRAPFILES global variable
#'
#' TRAPFILES is a global variable after running scAPAtrap, containing two columns (file, what).
#' This variable is automatically updated when running certain steps in scAPAtrap.
#' This function get TRAPFILES or clear TRAPFILES, but cannot add/modify items (rows) in TRAPFILES.
#' @param clear TRUE to clear TRAPFILES variable
#' @return A data frame of files containing two columns (file, what).
#' @examples
#' TRAPFILES()
#' countPeaks(umitools.path='umitools.path', input='xx.bam', outputdir='xxdir', TenX=TRUE, notRun=TRUE)
#' TRAPFILES()
#' TRAPFILES(clear=TRUE)
#' @export
TRAPFILES<-function(clear=FALSE) {
  if (clear) {
    myenv$TRAPFILES=myenv$TRAPFILES[character(0), ]
  }
  return(myenv$TRAPFILES)
}

## add file name and description in each function, those files marked with * can be savely deleted
.addTrapFiles<-function(files, whats) {
  if (length(files)>length(whats)) whats=rep('unknown', 1:length(files))
  for (i in 1:length(files)) {
    if (!(files[i] %in% myenv$TRAPFILES$file)) myenv$TRAPFILES <- rbind(myenv$TRAPFILES, c(files[i], whats[i]))
    colnames(myenv$TRAPFILES) <- c('file','what')
  }
}

## output a string to logf or screen
## It is easily to output a string with >>> and time to the screen and/or logfile
# .msg('command ...', verbose=FALSE) ## nothing
# .msg('command ...') --> >>> 2023-09-06 16:48:15 generatescExpMa: command...
# .msg('command ...', verbose=TRUE, logf='xx.log') --> >>> 2023-09-06 16:48:15 generatescExpMa: command...
# .msg('##', pre='', showTime=FALSE) --> ##
.msg<-function(string='', pre='>>>', showTime=TRUE, newLine=TRUE, preLine=FALSE, ...) {
  verbose <- .getDotArg("verbose", TRUE, ...)
  logf <- .getDotArg("logf", NULL, ...)

  t=ifelse(showTime, paste0(as.character(Sys.time()),' '), '')
  nl=ifelse(newLine, '\n', '')
  pl=ifelse(preLine, '\n', '')
  if (pre!='') pre=paste0(pre, ' ')
  s=sprintf("%s%s%s%s%s", pl, pre, t, string, nl)
  if (verbose) cat(s)
  if (is.character(logf)) {
    cat(s, file=logf, append=TRUE)
  }
}

# show msg within functions, by adding ... + function name + string.
# .msgin(fn='function', string='command***', verbose=TRUE, logf='xx.log') --> ... 2023-09-06 17:13:31 function: command***
# .msgin(fn='', string='command***', verbose=TRUE, logf=NULL) --> ... 2023-09-06 17:21:32 command***
# .msgin(fn='FUNC', string='command***', verbose=TRUE, logf=NULL) --> ... 2023-09-06 17:21:52 FUNC: command***
.msgin<-function(fn='', string='', ...) {
  if (fn=='') s=string else s=paste0(fn, ': ', string)
  .msg(string=s, pre='...', showTime=TRUE, newLine=TRUE, ...)
}

## run command line and output message on screen or file
## verbose=TRUE to show only the command line; FALSE now show
## logf=file to sink command and command output to file but NOT show on the screen; logf=NULL to show command output on the screen
## however, system() not allows capture screen output of inner command, the commond message will be shown on screen no matter verbose=FALSE
## .runCommand('', command="tree") # ... 2023-09-06 18:01:22 tree +++ tree results on screen +++ ... 2023-09-06 18:01:22 command done.
## .runCommand('', command="tree", logf='xx.log')  # ... 2023-09-06 18:01:22 tree +++ tree results in xx.log +++ ... 2023-09-06 18:01:22 command done.
## .runCommand('', command="tree", verbose=FALSE, logf='xx.log') # nothing on screen, but tree results in xx.log
## .runCommand('', command="tree", verbose=FALSE, logf=FALSE) # nothing on screen or file
.runCommand<-function(fn='', command,  ofile=NULL, ocheck=FALSE, ...) {

  verbose <- .getDotArg("verbose", TRUE, ...)
  notRun <- .getDotArg("notRun", FALSE, ...)
  wait <- .getDotArg("wait", TRUE, ...)
  logf <- .getDotArg("logf", TRUE, ...)

  if (notRun & !verbose &is.null(logf)) verbose=TRUE

  .msgin(fn=fn, string=command, ...)

  ## not run, only output command line string
  if (notRun) return(invisible(NULL))

  ## run command
  #s=system2(command = command, wait = wait, stdout=TRUE, stderr=TRUE) # return str vectors
  s=system(command = command, wait = wait, intern=FALSE, ignore.stdout=FALSE,  ignore.stderr=FALSE)

  if (s!=0) {
    warning("It seems the command failed (exit status!=0), please see screen message or log file for details!\n")
  }
  if (ocheck) {
    if (!is.null(ofile)) {
      if (!file.exists(ofile)) stop(ofile, "not exists! Probably the command failed, please see screen message or log file for detailes!")
    }
  }

  # if (length(s)>0) {
  #   if (is.character(logf)) { #output to file, now show on screen
  #     cat(s, file = logf, append=TRUE, sep='\n')
  #   } else {# on screen
  #     cat(s, sep='\n')
  #   }
  # }
  .msgin(fn=fn, 'command done.', ...)
}

#-----------------------------------Preprocess----------------------------------------------
#' Filter a BAM file to preserve unique mappings with samtools
#'
#' findUniqueMap filters a BAM file to preserve unique mappings with samtools. This function will generate .bam (<input>.uniq.bam or UniqSorted.bam if sort=TRUE) and .bai (if index=TRUE).
#' Suffix .Uniq or .UniqSorted will be added to the input BAM file. This function is the same as call samtools in Shell.
#'
#' @param samtools.path The path of the samtools.
#' @param input BAM file name.
#' @param thread Number of CPU threads, default is 12.
#' @param sort Logical value, TRUE to sort the BAM file.
#' @param index Logical value, TRUE to build the index.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' \item{notRun }{Default is FALSE. If `TRUE`, the Shell commands inside this function will not run but output the commad line.}
#' }
#' @examples
#' samtools.path <- '/home/aa/miniconda2/envs/nar_env/bin/samtools'
#' input <- './data/demo.bam'
#' \dontrun{
#' findUniqueMap(samtools.path, input, 24)
#' }
#' ## only output command lines
#' findUniqueMap(samtools.path, input, 24, notRun=TRUE)
#' @return Output path of the new BAM file name with only unique mapped reads (<input>.uniq.bam or UniqSorted.bam if sort=TRUE).
#' @export
findUniqueMap <- function(samtools.path, input, thread=12, sort = TRUE, index = TRUE, ...) {

  fn='findUniqueMap'

  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  output <- paste0(gsub(".bam$", "", input), ".Uniq.bam")
  command <- paste0(samtools.path, " view -@ ", thread, " -h -F 256 -bS ", input, " > ", output)
  .runCommand(fn, command, ofile=output, ocheck=TRUE, ...)
  #.addTrapFiles(output, 'findUniqueMap: samtools view unique')

  res <- output
  if (sort) {
      output.sorted <- paste0(gsub(".Uniq.bam$", "", output), ".UniqSorted.bam")
      command <- paste0(samtools.path, " sort -@ ", thread, " -o ", output.sorted, " ", output)
      .runCommand(fn, command, ofile=output.sorted, ocheck=TRUE, ...)
      unlink(res) # delete uniq.bam if sorted
      res <- output.sorted
      .addTrapFiles(res, '*findUniqueMap: samtools sort')
  }
  if (index) {
      command <- paste0(samtools.path, " index -@ ", thread, " ", res)
      .runCommand(fn, command, ...)
      .addTrapFiles(paste0(res,'.bai'), '*findUniqueMap: samtools index')
  }

  .msg(sprintf("%s: finish.", fn), ...)

  return(res)
}


#' Deduplication with umitools
#'
#' @param umitools.path The path of the umi_tools.
#' @param input A BAM file which needs the index file.
#' @param TenX Logical value, TRUE for 10X data.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' \item{notRun }{Default is FALSE. If `TRUE`, the Shell commands inside this function will not run but output the commad line.}
#' }
#' @examples
#' umitools.path <- '/home/aa/miniconda2/envs/umi_tools/bin/umi_tools'
#' input <- './data/demo.Uniq.sorted.bam'
#' \dontrun{
#' dedupByPos(umitools.path, input)
#' }
#' dedupByPos(umitools.path, input, notRun=TRUE)
#' @return Output path of the deduplicated BAM file (<input>.dedup.bam).
#' @export
dedupByPos <- function(umitools.path, input, TenX = TRUE, ...) {

  fn='dedupByPos'
  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  output <- paste0(gsub(".bam$", "", input), ".dedup.bam")
  if (TenX) {
    command <- paste0(umitools.path, " dedup -I ", input, " -S ", output, " --method=unique --extract-umi-method=tag --umi-tag=UB --cell-tag=CB")
  } else {
    command <- paste0(umitools.path, " dedup -I ", input, " -S ", output, " --method=unique")
  }

  .runCommand(fn, command, ofile=output, ocheck=TRUE, ...)

  .msg(sprintf("%s: finish.", fn), ...)

  .addTrapFiles(output, '*dedupByPos: umitools dedup')
  return(output)
}


#' Separate BAM files by strand
#'
#'@param samtools.path The path of the samtools.
#'@param input The BAM file name.
#'@param thread Number of CPU threads, default is 12.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' \item{notRun }{Default is FALSE. If `TRUE`, the Shell commands inside this function will not run but output the commad line.}
#' }
#'@examples
#' samtools.path <- '/home/aa/miniconda2/envs/nar_env/bin/samtools'
#' input <- './data/demo.Uniq.sorted.dedup.bam'
#' \dontrun{
#' separateBamBystrand(samtools.path,input,24)
#' }
#' separateBamBystrand(samtools.path,input,24, notRun=TRUE)
#'@return String vector storing the path of the forward BAM file (<input>.forward.bam) and the reverse BAM file (<input>.reverse.bam)
#'@export
separateBamBystrand <- function(samtools.path, input, thread=12, ...) {

  fn='separateBamBystrand'

  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  outputF <- paste0(gsub(".bam", "", input), ".forward.bam")
  command <- paste0(samtools.path, " view -@ ", thread, " -h -F 0x10 -bS ", input, " > ", outputF)
  .runCommand(fn, command, ofile=outputF, ocheck=TRUE, ...)
  .addTrapFiles(outputF, '*separateBamBystrand: samtools forward BAM')

  outputR <- paste0(gsub(".bam", "", input), ".reverse.bam")
  command <- paste0(samtools.path, " view -@ ", thread, " -h -f 0x10 -bS ", input, " > ", outputR)
  .runCommand(fn, command, ofile=outputR, ocheck=TRUE,  ...)
  .addTrapFiles(outputR, '*separateBamBystrand: samtools reverse BAM')

  command <- paste0(samtools.path, " index -@ ", thread, " ", outputF)
  .runCommand(fn, command, ...)
  .addTrapFiles(paste0(outputF,'.bai'), '*separateBamBystrand: samtools forward index')

  command <- paste0(samtools.path, " index -@ ", thread, " ", outputR)
  .runCommand(fn, command, ...)
  .addTrapFiles(paste0(outputR,'.bai'), '*separateBamBystrand: samtools reverse index')

  .msg(sprintf("%s: finish.", fn), ...)

  return(c(outputF, outputR))
}


#' Extract barcode and UMI in reads with umi_tools
#'
#' @param umitools.path The path of the umi_tools.
#' @param pattern The pattern used to represent barcode and UMI (e.g If the barcode length is 8 and the UMI length is 4, then the pattern=CCCCCCCCNNNN)
#' @param input.seq Path of the fastq file, which must contain two paths with the first one recording barcode and UMI information.
#' @param whitelist whitelist barcode.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' \item{notRun }{Default is FALSE. If `TRUE`, the Shell commands inside this function will not run but output the commad line.}
#' }
#' @examples
#' umitools.path <-  '/home/dell/miniconda3/envs/scAPA/bin/umi_tools'
#' input.seq <- c('./fastq/SRR1610598_1.fastq','./fastq/SRR1610598_2.fastq')
#' pattern <- 'CCCCCCCCNNNN'
#' whitelist <- './fastq/whitelist.txt'
#' \dontrun{
#' extractBcAndUb(umitools.path, pattern, input.seq, whitelist)
#' }
#' extractBcAndUb(umitools.path, pattern, input.seq, whitelist, notRun=TRUE)
#'@return The path of Read2 (<input.seq[2]>.extracted.fastq)
#' @export
extractBcAndUb <- function(umitools.path, pattern, input.seq, whitelist, ...) {

  fn='extractBcAndUb'
  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  output1 <- paste0(gsub(".fastq$", "", input.seq[1]), ".extracted.fastq")
  output2 <- paste0(gsub(".fastq$", "", input.seq[2]), ".extracted.fastq")
  command <- paste0(umitools.path, " extract --bc-pattern=", pattern, " --stdin ", input.seq[1], " --stdout ", output1,
                    " --read2-in ", input.seq[2], " --read2-out=", output2, " --filter-cell-barcode ", " --whitelist=", whitelist)

  .runCommand(fn, command, ofile=output2, ocheck=TRUE, ...)
  .addTrapFiles(output1, '*extractBcAndUb: umitools bc R1')
  .addTrapFiles(output2, '*extractBcAndUb: umitools bc R2')

  .msg(sprintf("%s: finish.", fn), ...)

  return(output2)
}


#' Build the reference genome index with STAR
#'
#' @param star.path The path of the STAR.
#' @param genome.fasta The reference genome sequence file in fasta format.
#' @param indexdir Directory of the reference genome index.
#' @param thread Number of CPU threads, default is 12.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' \item{notRun }{Default is FALSE. If `TRUE`, the Shell commands inside this function will not run but output the commad line.}
#' }
#' @return Directory of reference genome index
#' @examples
#' star.path='/home/dell/miniconda3/envs/scAPA/bin/STAR'
#' genome.fasta <- '/home/aa/genome_data/mm10/Mus_musculus.GRCm38.dna.primary_assembly.fa'
#' indexdir <- './mm10_index/'
#' \dontrun{
#' generateRefIndex(star.path, genome.fasta, indexdir, 24)
#' }
#' generateRefIndex(star.path, genome.fasta, indexdir, 24, notRun=TRUE)
#' @export
generateRefIndex <- function(star.path, genome.fasta, indexdir, thread=12, ...) {


  fn='generateRefIndex'
  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  command <- paste0(star.path, " --runMode genomeGenerate --genomeDir ", indexdir, " --genomeFastaFiles ", genome.fasta,
                    " --runThreadN ", thread)
  .runCommand(fn, command, ofile=indexdir, ocheck=TRUE, ...)
  .addTrapFiles(indexdir, '*generateRefIndex: STAR index genome')

  .msg(sprintf("%s: finish.", fn), ...)
  return(indexdir)
}


#' Align the sequence to the reference genome with STAR
#'
#' @param star.path The path of the STAR.
#' @param indexdir Directory of reference genome index.
#' @param input.seq Input fastq file.
#' @param prefix The prefix of the output file name.
#' @param thread Number of CPU threads, default is 12.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' \item{notRun }{Default is FALSE. If `TRUE`, the Shell commands inside this function will not run but output the commad line.}
#' }
#' @examples
#' star.path='/home/dell/miniconda3/envs/scAPA/bin/STAR'
#' indexdir <- "./"
#' input.seq <- 'SRR1610598_2.extracted.fastq'
#' \dontrun{
#' generateAlignBam(star.path, indexdir, input.seq, prefix='./BAM/rep1',12)
#' }
#' generateAlignBam(star.path, indexdir, input.seq, prefix='./BAM/rep1', notRun=TRUE)
#' @return NULL
#' @export
generateAlignBam <- function(star.path, indexdir, input.seq, prefix, thread=12, ...) {

  fn='generateAlignBam'
  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  command <- paste0(star.path, " --runThreadN ", thread, " --genomeDir ", indexdir, " --readFilesIn ", input.seq, " --outFilterMultimapNmax 1 ",
                    " --outSAMtype BAM SortedByCoordinate ", " --outFileNamePrefix ", prefix)
  .runCommand(fn, command, ...)

  .msg(sprintf("%s: finish.", fn), ...)
  ##bug: output?
}

#-----------------------------------findTails----------------------------------------------
.readChrInfo <- function(file) {
  chr_length <- utils::read.delim(file = file)
  colnames(chr_length) <- c("flag", "chr", "coord")
  chr_length$chr <- gsub("SN:", "", chr_length$chr)
  chr_length$coord <- as.numeric(gsub("LN:", "", chr_length$coord))
  chr_length <- subset(chr_length, chr != "MT")
  chr.g <- with(chr_length, GRanges(seqnames = chr, ranges = IRanges::IRanges(start = 1, end = coord)))

  return(chr.g)
}

## search BAM to find tails, returning chr/strand/coord/count ranges
.findTails<-function(bamfile, bamParam) {
  gal1 <- GenomicAlignments::readGAlignments(bamfile, use.names = TRUE, param = bamParam)
  s_1 <- (grepl("[0-9]*M[1-9]{2,}S", gal1@cigar) & as.vector(gal1@strand) == "+")
  s_2 <- (grepl("[1-9]{2,}S[0-9]*M", gal1@cigar) & as.vector(gal1@strand) == "-")

  bam1 <- gal1[s_1]
  bam2 <- gal1[s_2]

  bam1 <- bam1[grepl("(A{3,}[^A]{0,2})*A{6,}([^A]{0,2}A{3,})*.{0,2}?$", bam1@elementMetadata@listData$seq)]
  bam2 <- bam2[grepl("^.{0,2}?(T{3,}[^T]{0,2})*T{6,}([^T]{0,2}T{3,})*", bam2@elementMetadata@listData$seq)]

  final_bam1 <- data.frame(chr = as.vector(seqnames(bam1)), strand = as.vector(strand(bam1)), coord = end(bam1))
  final_bam2 <- data.frame(chr = as.vector(seqnames(bam2)), strand = as.vector(strand(bam2)), coord = start(bam2))

  bam <- rbind(final_bam1, final_bam2)
  bam <- dplyr::group_by(bam, chr, strand, coord) %>% dplyr::summarise(count = dplyr::n())
  ## warning.. override using .group argument..
  return(bam)
}

.getBAMchrs<-function(bamfile) {
  chrs <- as.character(seqnames(seqinfo(Rsamtools::BamFile(bamfile))))
  return(chrs)
}

.indexBam<-function(bamfile, ...) {
  baifile=paste0(bamfile, '.bai')
  if(!file.exists(baifile)) {
    .msgin(fn='indexBam', bamfile, ...)
    Rsamtools::indexBam(bamfile)
  }
  return(baifile)
}

#' Find the positions with polyA tails genome/chromosome-wide
#'
#' @description Find the precise positions with polyA tails genome-wide or on one given chromosome.
#'              Given chr, this function will traverse each chromosome and find the site through the A-rich at the end.
#' @param bamfile A BAM file which needs the index file (.bai).
#' @param chr NULL or a character string. If NULL, search whole genome. Otherwise search tails on one chr.
#' @param len NULL or the chromosome length of the given chr.
#' @param tailsfile if not NULL then output to tailsfile, and return the tailsfile name.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' }
#' @return A data frame recording tail positions with chr/strand/coord/count; or tailsfile name (if tailsfile not NULL)
#' @examples
#' \dontrun{
#' findTails(bamfile, chr=1, len=1000000001)
#' findTails(bamfile, tailsfile='output/output.tails')
#' }
#' @export
findTails <- function(bamfile, chr=NULL, len=NULL, tailsfile=NULL, ...) {

  fn='findTails'
  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  what <- c("rname", "strand", "pos", "cigar", "seq")
  if (is.null(chr)) {
    param <- Rsamtools::ScanBamParam(what = what)
  } else {

    if (is.null(len)) stop("findTails: chr is provided, then len (the chromosome length) should be also provided!")
    # check the chrnames in bam to be consistent with chr
    # chrs=.getBAMchrs(bamfile)
    # if (!(chr %in% chrs)) stop(sprintf("findTails: chr=%s not in chrs of the bamfile (%s...)!", chr, .shortStr(chrs)))
    .indexBam(bamfile, ...)

    which <- GenomicRanges::GRanges(seqnames = chr, IRanges::IRanges(start=1, end=len))
    param <- Rsamtools::ScanBamParam(what = what, which = which)
  }
  #chr <- colnames(bam)[1]

  res=.findTails(bamfile, param)
  if (!is.null(tailsfile)) {
    .msgin(fn, sprintf("save %d tails-coords to %s", nrow(res), tailsfile), ...)
    write.table(res, file=tailsfile, col.names=TRUE, row.names = FALSE, sep="\t", quote=FALSE)
    .addTrapFiles(tailsfile, 'findTails: tails-coords')
    .msg(sprintf("%s: finish.", fn), ...)
    return(tailsfile)
  } else {
    .msgin(fn, sprintf("find %d tails-coords", nrow(res)), ...)
    .msg(sprintf("%s: finish.", fn), ...)
    return(res)
  }

}


#' Find tail positions around/within peak ranges
#'
#' @description Find the precise positions with polyA tails within d nt of peak ranges, given a peaksfile recording peak ranges.
#'              This function narrows the search range of tails to peak ranges to speed up tail searching, which is useful for large BAM file.
#' @param bamfile A BAM file which needs the index file (.bai).
#' @param peaksfile The path of the peaks.saf file generated by \code{\link{generateSAF}}.
#' @param d Add a 5' and 3' margin of d nt to each peak to search tails. Default is 200.
#' @param tailsfile if not NULL then output to tailsfile, and return the tailsfile name.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' }
#' @return A data frame recording tail positions with chr/strand/coord/count; or tailsfile name (if tailsfile not NULL).
#' @examples
#' \dontrun{
#' findTailsByPeaks(bamfile, peaksfile, d=500)
#' findTailsByPeaks(bamfile, peaksfile, d=500, tailsfile='output.tails')
#' }
#' @export
findTailsByPeaks <- function(bamfile, peaksfile, d=200, tailsfile=NULL, ...) {

  fn='findTailsByPeaks'
  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  peaks <- utils::read.delim(peaksfile, header = FALSE)
  names(peaks) <- c("peakID", "chr", "start", "end", "strand")
  gr <- with(peaks, GenomicRanges::GRanges(seqnames = chr, ranges = IRanges::IRanges(start = start, end = end), strand = strand))
  if (d<=0) d=0
  if (d>0) {
    gr=GenomicRanges::resize(gr, width=width(gr)+d, fix="end", use.names=TRUE, ignore.strand=FALSE)
    gr=GenomicRanges::resize(gr, width=width(gr)+d, fix="start", use.names=TRUE, ignore.strand=FALSE)
    gr=GenomicRanges::reduce(gr)
  }

  .indexBam(bamfile, ...)

  # validate chr consistency
  cb=.getBAMchrs(bamfile)
  cp=unique(as.character(peaks$chr))
  if (!all(cp %in% cb)) {
    if (!any(cp %in% cb))
      stop(sprintf("findTailsByPeaks: peak chrs (%s...) not in bam chrs (%s...)!", .shortStr(cp), .shortStr(cb)))
  }

  what <- c("rname", "strand", "pos", "cigar", "seq")
  param <- Rsamtools::ScanBamParam(what = what, which = gr)
  .msgin(fn, sprintf("findTailsByPeaks on %d reduced peak ranges (d=%dnt)",length(gr), d), ...)

  res=.findTails(bamfile, param)

  if (!is.null(tailsfile)) {
    .msgin(fn, sprintf("save %d tails-coords to %s", nrow(res), tailsfile), ...)
    write.table(res, file=tailsfile, col.names=TRUE, row.names = FALSE, sep="\t", quote=FALSE)
    .addTrapFiles(tailsfile, 'findTails: tails-coords')
    .msg(sprintf("%s: finish.", fn), ...)
    return(tailsfile)
  } else {
    .msgin(fn, sprintf("find %d tails-coords", nrow(res)), ...)
    .msg(sprintf("%s: finish.", fn), ...)
    return(res)
  }
}


#-----------------------------------findPeaks----------------------------------------------
#' Load a BAM file and calculate coverages of each position with derfinder
#'
#' @param files BAM files which need the corresponding index files (.bai).
#' @param chrs Chromosome information.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' }
#' @return A list with one element per chromosome. Each element is a DataFrame with the coverage information.
#' @examples
#' \dontrun{
#' chrs <- c(as.character(1:19),'X','Y')
#' fullcovF <- loadBpCoverages('./dedup_h2.forward.sorted.bam', chrs)
#' }
#' @export
loadBpCoverages <- function(files, chrs, ...) {

  fn='loadBpCoverages'
  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  fullCov <- derfinder::fullCoverage(files = files, chrs = chrs, ...)

  ## no SQ lines in @header
  # # validate chr consistency
  # if (is.null(chrs))
  #   chrs=.getBAMchrs(files[1])
  # else if (length(chrs)==0) {
  #   chrs=.getBAMchrs(files[1])
  # } else {
  #   cb=.getBAMchrs(files[1])
  #   if (!all(chrs %in% cb)) {
  #     if (!any(chrs %in% cb))
  #       stop(sprintf("loadBpCoverages: chrs (%s...) not in bam chrs (%s...)!", substr(toString(chrs), 1, 20), substr(toString(cb),1, 20)))
  #   }
  # }
  .msg(sprintf("%s: finish.", fn), ...)

  return(fullCov)
}


#' Perform peak calling on a BAM file
#'
#' Perform peak calling on a BAM file with single strand (+ or -) using the output of loadBpCoverages. It is recommended to use findPeaksByStrand instead.
#'
#' @param fullCov The output of loadBpCoverages.
#' @param strand strand.
#' @param L Read length, used to filter peaks wider than L.
#' @param maxwidth Maximum peak width, used to filter peaks narrower than maxwidth. Wider peaks will be splitted to smaller ones.
#' @param cutoff The base level coverage cutoff to filter valid peaks with enough coverages, default is 10.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' }
#' @return A data frame with five columns: chr/start/end/strand/value, with each row being a peak. The value column is the average coverage of peak.
#' @examples
#' \dontrun{
#' forwardPeaks <-findPeaks(fullcovF, '+', 98, 1000)
#' reversePeaks <-findPeaks(fullcovR, '-', 98, 1000)
#' }
#' @export
findPeaks <- function(fullCov, strand, L, maxwidth, cutoff = 10, ...) {

  fn='findPeaks'
  #.msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  if (cutoff<=0) cutoff=1

  ## if chrnames start with digits (e.g., 1, 2L), then add 'Chr' to prevent errors in definder::regionMatrix
  dc=grep('^[0-9]', names(fullCov))
  changed=FALSE
  if (length(dc)>0) {
    names(fullCov)=paste0('Chr', names(fullCov))
    changed=TRUE
    #.msgin(fn, "Warning: some or all chr names in fullCov start with digits, will add [Chr] to the chr names of the output coverage list!", ...)
  }

  .msgin(fn, sprintf("start regionMatrix (L=%d, cutoff=%d)...", L, cutoff), ...)
  regionMat <- derfinder::regionMatrix(fullCov = fullCov, L = L, cutoff = cutoff)
  ## regionMat: a list, with each chr a Data.Frame(Rle to store coverage at each position)

  .splitPeaks <- function(pos_count, chrom, pos, readlength) {
    chr <- rep(chrom, length(pos_count))
    threshold <- quantile(pos_count, 0.25)
    tempdata <- bumphunter::regionFinder(pos_count, chr = chr, pos = pos, cutoff = threshold, verbose = FALSE)
    tempdata <- subset(tempdata, L > readlength)
    return(tempdata)
  }


  DefPeak <- GenomicRanges::GRanges()
  for (i in names(regionMat)) {
    index <- which(GenomicRanges::width(regionMat[[i]]$regions) > maxwidth)

    if (length(index)>0)
      .msgin(fn, sprintf("chr=%s has %d peaks with width>%dnt, start splitPeaks", i, length(index), maxwidth), ...)

    tempDataFram <- data.frame()
    for (j in index) {
      rawstart <- start(regionMat[[i]]$regions[j]) - 1
      temp <- regionMat[[i]]$bpCoverage[[j]]
      temp$pos <- c(1:dim(temp)[1])
      first.split.peaks <- .splitPeaks(temp$value, i, temp$pos, L)
      pre.split.peaks <- first.split.peaks
      split.peaks <- data.frame()
      iter <- 0
      while (TRUE) {
        if (iter >= 3) {
          break()
        }
        Ind <- which(pre.split.peaks$L > maxwidth)
        cur.split.peaks <- data.frame()
        if (length(Ind) > 0) {
          split.peaks <- rbind(split.peaks, pre.split.peaks[-Ind, ])
          for (k in Ind) {
            pos.temp <- temp[pre.split.peaks$start[k]:pre.split.peaks$end[k], ]
            cur.split.peaks <- .splitPeaks(pos.temp$value, i, pos.temp$pos, L)
          }
        } else {
          split.peaks <- rbind(split.peaks, pre.split.peaks)
          break()
        }
        pre.split.peaks <- cur.split.peaks
        iter <- iter + 1

      }
      split.peaks$start <- split.peaks$start + rawstart
      split.peaks$end <- split.peaks$end + rawstart
      tempDataFram <- rbind(tempDataFram, split.peaks)
    }

    if (length(index)>0) {
      suppressWarnings( DefPeak <- c(DefPeak, regionMat[[i]]$regions[-index]) ) #all+splitedpeaks
    }  else {
      suppressWarnings( DefPeak <- c(DefPeak, regionMat[[i]]$regions) ) #all peaks
    }

    if (nrow(tempDataFram)>0) {
      suppressWarnings(DefPeak <- c(DefPeak, regioneR::toGRanges(tempDataFram)))
      .msgin(fn, sprintf("chr=%s split %d wide peaks to %d small peaks", i, length(index), nrow(tempDataFram)), ...)
    }
    .msgin(fn, sprintf("chr=%s find %d peaks", i, length(DefPeak)), ...)
  }

  #DefPeak <- DefPeak[-which(DefPeak$area == GenomicRanges::width(DefPeak))]
  DefPeak <- DefPeak[GenomicRanges::width(DefPeak) > L]

  if (length(DefPeak)==0) {
    .msgin(fn, sprintf("Warning: 0 peak with width>%dnt (Read Length), please check the readlength (L) parameter!", i, L), ...)
  } else {
    .msgin(fn, sprintf("%d peaks with width>%dnt (Read Length) found on all chrs", length(DefPeak), L), ...)
  }

  strand(DefPeak) <- strand
  if (length(DefPeak)==0) {
    DefPeak=as.data.frame(DefPeak)
  } else {
    DefPeak=as.data.frame(DefPeak, row.names = 1:length(DefPeak)) #avoid dup row.names
  }
  DefPeak <- dplyr::mutate(DefPeak, chr = seqnames)
  DefPeak=DefPeak[, c('chr','strand','start','end','value')]

  # strand(DefPeak) <- strand
  # if (strand == "+") {
  #     DefPeak <- as.data.frame(DefPeak, row.names = 1:length(DefPeak))
  #     DefPeak <- dplyr::mutate(DefPeak, chr = seqnames, coord = end)
  # } else {
  #     DefPeak <- as.data.frame(DefPeak, row.names = 1:length(DefPeak))
  #     DefPeak <- dplyr::mutate(DefPeak, chr = seqnames, coord = start)
  # }

  # restore chr names
  if (changed & nrow(DefPeak)>0) {
    #.msgin(fn, "Restore chr names by removing [Chr]", ...)
    DefPeak$chr=gsub('^Chr','',DefPeak$chr)
  }

 # .msg(sprintf("%s: finish.", fn), ...)
  return(DefPeak)
}

write.bed <- function(.x, f) {
    write.table(x = .x, file = f, sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
}


#' Perform peak calling on a BAM file with single strand
#'
#' findPeaksByStrand calls peaks on a BAM file with single strand. This function combines two functions in previous scAPAtrap, loadBpCoverages and findPeaks.
#'
#' @inheritParams findPeaks
#' @param bamFile a BAM file with index (.bai)
#' @param chrs Chromosome information. If not provided, then will try get chrs from the bamFile.
#' @param ofile output file with each line is one peak
#' @return If ofile=NULL, return a data frame with many columns but four fixed columns chr/strand/start/end/value, with each row being a peak.
#' Otherwise, output to file and return ofile name.
#' @examples
#' \dontrun{
#' forwardPeaks <-findPeaksByStrand(bamFile, chrs=NULL, strand='+', L=98, maxwidth=1000, cutoff=10)
#' reversePeaks <-findPeaksByStrand(bamFile, chrs=NULL, strand='-', L=98, maxwidth=1000, cutoff=10)
#' }
#' @export
findPeaksByStrand <- function(bamFile, chrs=NULL, strand, L, maxwidth, cutoff = 10, ofile=NULL, ...) {

  fn='findPeaksByStrand'
  .msg(sprintf("%s (%s): start.", fn, strand), preLine=FALSE, ...)

  if (cutoff<=0) cutoff=1

  if (length(chrs)==0) {
    chrs=.getBAMchrs(bamFile)
    .msgin(fn, string=sprintf("Total %d chrs (e.g., %s)", length(chrs), .shortStr(chrs)), ...)
  }

  if (length(chrs)==0) {
    .msgin(fn, string="Get chr error in bamFile!", ...)
    stop("findPeaksByStrand error, please check logf to see Logs if set logf!")
  }

  .msgin(fn, sprintf("loadBpCoverages on %d chrs: %s", length(chrs), .shortStr(chrs)), ...)
  fullCov <- derfinder::fullCoverage(files = bamFile, chrs = chrs, ...)
  ## fullCov: a list, with each chr a Data.Frame(Rle to store coverage at each position)

  DefPeak=findPeaks(fullCov=fullCov, strand=strand, L=L, maxwidth=maxwidth, cutoff = cutoff, ...)

  if (!is.null(ofile)) {
    write.table(DefPeak, file=ofile, col.names=TRUE, row.names = FALSE, sep="\t", quote=FALSE)
    .addTrapFiles(ofile, paste0('*findPeaksByStrand: peak (', strand, ')'))
    .msg(sprintf("%s: finish.", fn), ...)
    return(ofile)
  } else {
    .msg(sprintf("%s (%s): finish.", fn, strand), ...)
    return(DefPeak)
  }
}

#----------------------------------- countPeaks ----------------------------------------------
#' Generate a peak annotation file
#'
#' @description Generate a peak annotation file (PeakID/chr/start/end/Strand) by combining forward peaks and reverse peaks generated by findPeaks
#'
#' @param forwardPeaks A peak filename or a peak data.frame of forward strand, which is the output of \code{\link{findPeaksByStrand}} or \code{\link{findPeaks}}, where strand = +.
#' @param reversePeaks Save as forwardPeaks but on the reverse strand. Can provide forwardPeaks or reversePeaks, or both.
#' @param outputdir Output file directory.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' }
#' @examples
#' \dontrun{
#' forwardPeaks <-findPeaks(fullcovF,'+', 98, 1000, cutoff = 1)
#' reversePeaks <-findPeaks(fullcovR,'-',98, 1000, cutoff = 1)
#' generateSAF(forwardPeaks, reversePeaks,'./data')
#' }
#' @return Full path of the peak list (<outputdir>/peaks.saf). This file contains five columns without header: peakID/chr/start/end/strand.
#' @export
generateSAF <- function(forwardPeaks, reversePeaks, outputdir, ...) {

  fn='generateSAF'
  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  if (missing(x = forwardPeaks) && missing(x = reversePeaks)) {
    stop("Must provide at least forwardPeaks or reversePeaks (data.frame or filename)!")
  }

  both=FALSE
  if (!missing(x = forwardPeaks) && !missing(x = reversePeaks)) both=TRUE

  .loadFRpeaks<-function(frfile) {
    if (is.character(frfile)) {
      if (!file.exists(frfile))
        stop(frfile, 'not exists!')
      frfile <- utils::read.delim(frfile, header = TRUE) # chr/strand/coord/start/end... not known
    }
    return(frfile)
  }

  if (!is.null(forwardPeaks))
    forwardPeaks=.loadFRpeaks(forwardPeaks)
  if (!is.null(reversePeaks))
    reversePeaks=.loadFRpeaks(reversePeaks)
  if (both) peaks <- rbind(.loadFRpeaks(forwardPeaks), .loadFRpeaks(reversePeaks))

  peaks$PeakID <- paste0("peak", "_", 1:length(peaks$chr))
  peaks.saf <- peaks[, c("PeakID", "chr", "start", "end", "strand")]

  if (!file.exists(outputdir)) {
      dir.create(outputdir)
  }

  output <- paste0(outputdir, "/", "peaks.saf")
  write.bed(peaks.saf, output)
  .addTrapFiles(output, "generateSAF peaks.saf")

  .msg(sprintf("%s: finish.", fn), ...)
  return(output)
}

.mergePA <- function(bam, adjBp = 10) {
    peak <- GenomicRanges::GRanges(seq = S4Vectors::Rle(bam$chr), ranges = IRanges::IRanges(bam$coord, bam$coord + adjBp), strand = S4Vectors::Rle(bam$strand))

    peak <- GenomicRanges::reduce(peak)
    peak <- as.data.frame(peak)
    peak$end <- peak$end - adjBp
    peak$width <- peak$end - peak$start + 1

    peak$coord <- 0
    peak$coord[peak$strand == "+"] <- peak$end[peak$strand == "+"]
    peak$coord[peak$strand == "-"] <- peak$start[peak$strand == "-"]
    names(peak) <- c("chr", names(peak)[-1])
    return(peak)
}


.computePAcoord <- function(peak) {
    peak$coord <- 0
    peak$coord[peak$strand == "+"] <- peak$end[peak$strand == "+"]
    peak$coord[peak$strand == "-"] <- peak$start[peak$strand == "-"]

    return(peak)
}


#' Generate a BAM file for called peaks with featureCounts and samtools
#'
#' @description Generate a BAM file for called peaks with featureCounts and samtools, which can be used by umi_tools to quantify peaks.
#' @param featureCounts.path The path of the featureCounts tool.
#' @param samtools.path The path of the samtools.
#' @param input BAM file.
#' @param peakfile The path of the peaks.saf file generated by \code{\link{generateSAF}}.
#' @param thread Number of CPU threads, default is 12.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' }
#' @examples
#' \dontrun{
#' input <- './data/demo.bam'
#' peakfile <- './data/peaks.saf'
#' generateFinalBam(featureCounts.path, samtools.path, input, peakfile, 24)
#' }
#' @return Generate final.bam file (final.bam) in the folder of the <input> dir.
#' @export
generateFinalBam <- function(featureCounts.path, samtools.path, input, peakfile, thread=12, ...) {

  fn='generateFinalBam'
  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  output.dir <- dirname(input)
  log <- paste0(output.dir, "/peak_assigned")
  command <- paste0(featureCounts.path, " -a ", peakfile, " -F SAF -t exon -s 1 -M --largestOverlap -o ", log, " -R BAM -T ",
      thread, " ", input)
  ofile=paste0(input, ".featureCounts.bam")
  .runCommand(fn, command, ofile=ofile, ocheck=TRUE, ...)

  .addTrapFiles(log, '*generateFinalBam: featureCounts log')
  .addTrapFiles(ofile, '*generateFinalBam: featureCounts BAM')

  input <- paste0(input, ".featureCounts.bam")
  output <- paste0(dirname(input), "/", "final.bam")
  command <- paste0(samtools.path, " sort -@ ", thread, " ", input, " -o ", output, " && ", samtools.path, " index -@ ",
      thread, " ", output)

  .runCommand(fn, command, ofile=output, ocheck=TRUE, ...)
  .addTrapFiles(output, '*generateFinalBam: samtools sort')

  .msg(sprintf("%s: finish.", fn), ...)

  return(output)
}

#' Calculate the expression level of each peak in each cell with umi_tools
#'
#' @param umitools.path The path of the umi_tools.
#' @param input The final.bam file is generated by \code{\link{generateFinalBam}}.
#' @param outputdir Output file directory.
#' @param TenX Logical value, TRUE for 10X data or BAM resulted from STARsolo.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' \item{notRun }{Default is FALSE. If `TRUE`, the Shell commands inside this function will not run but output the commad line.}
#' }
#' @examples
#' umitools.path <- '/home/aa/miniconda2/envs/umi_tools/bin/umi_tools'
#' input <- './data/final.bam'
#' outputdir <- './data'
#' \dontrun{
#' countPeaks(umitools.path, input, outputdir, TenX=TRUE)
#' }
#' countPeaks(umitools.path, input, outputdir, TenX=TRUE, notRun=TRUE)
#' @return Full path of the <counts.tsv.gz> file in the outputdir directory
#' @export
countPeaks <- function(umitools.path, input, outputdir, TenX = TRUE, ...) {

  fn='countPeaks'
  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  output <- paste0(outputdir, "/counts.tsv.gz")
  if (TenX) {
    command <- paste0(umitools.path, " count --per-gene --gene-tag=XT --assigned-status-tag=XS --method=unique --extract-umi-method=tag --umi-tag=UB --cell-tag=CB --per-cell -I ",
        input, " -S ", output)
  } else {
    command <- paste0(umitools.path, " count --per-gene --gene-tag=XT --assigned-status-tag=XS --method=unique --per-cell -I ",
        input, " -S ", output)
  }
  .runCommand(fn, command, ofile=output, ocheck=TRUE, ...)
  .addTrapFiles(output, 'countPeaks: umitools counts')
  .msg(sprintf("%s: finish.", fn), ...)
  return(output)
}

.findIntersectBypeak <- function(qrypeak, sbjpeak, d = 100) {
  gr1 <- with(qrypeak, GRanges(seqnames = chr, ranges = IRanges::IRanges(start = coord, end = coord), strand = strand))
  gr2 <- with(sbjpeak, GRanges(seqnames = chr, ranges = IRanges::IRanges(start = coord, end = coord), strand = strand))
  ov = GenomicRanges::findOverlaps(gr1, gr2, maxgap = d-1, minoverlap = 0L, type = c("any"), select = "all", ignore.strand = FALSE)
  return(qrypeak[unique(ov@from), ])
}

#----------------------------------- output scAPAtrapData ----------------------------------------------
# validate.arg(NULL, c('a','b','c', 'no')) --> a
# validate.arg('x', c('a','b','c', 'no')) --> error
# validate.arg(NULL, c('a','b','c', 'no'), null2default=FALSE)  --> error
# validate.arg(NULL, c('a','b','c', 'no', NULL), null2default=FALSE)  --> error
# validate.arg('a', c('ca','ab','c', 'no')) --> ab
# validate.arg('A', c('ca','ab','c', 'no')) --> ab
validate.arg <- function(arg, choices, several.ok = FALSE, null2default = TRUE, lc = TRUE) {
  if (is.null(arg)) {
    if (null2default) arg=choices[1]
  }
  if (lc) arg=tolower(arg)
  arg = match.arg(arg, choices, several.ok)
  return(arg)
}

## load a count table or read countsfile, and check
## return 3-column count table (gene, cell, count)
.loadCounts<-function(countsfile) {
  if (is.character(countsfile)) {
    if (!file.exists(countsfile))
      stop(countsfile, 'not exists!')
    countsfile <- utils::read.delim(countsfile, header = TRUE) # gene (peakID), cell, count
  }

  if (ncol(countsfile)!=3) stop("countsfile should be of 3 columns: gene/cell/count")
  if (!all(c('gene','cell','count') %in% colnames(countsfile))) stop("countsfile should be of 3 columns: gene/cell/count")
  return(countsfile)
}

.loadPeaks<-function(peaksfile) {
  if (is.character(peaksfile)) {
    if (!file.exists(peaksfile))
      stop(peaksfile, 'not exists!')
    peaksfile <- utils::read.delim(peaksfile, header = FALSE)
    names(peaksfile) <- c("peakID", "chr", "start", "end", "strand")
    rownames(peaksfile)=peaksfile$peakID
  }

  if (ncol(peaksfile)!=5) stop("peaks should be of 5 columns: peakID/chr/start/end/strand")
  if (!all(c('peakID','chr','start','end','strand') %in% colnames(peaksfile))) stop("peaks should be of 5 columns: peakID/chr/start/end/strand")
  rownames(peaksfile)=peaksfile$peakID
  return(peaksfile)
}


.loadTails<-function(tailsfile) {
  if (is.character(tailsfile)) {
    if (!file.exists(tailsfile))
      stop(tailsfile, 'not exists!')
    tailsfile <- utils::read.delim(tailsfile, header = TRUE) # chr/strand/coord/count
  }

  if (ncol(tailsfile)!=4) stop("tailsfile should be of 4 columns: chr/strand/coord/count")
  if (!all(c('chr','strand','coord','count') %in% colnames(tailsfile))) stop("tailsfile should be of 4 columns: chr/strand/coord/count")
  return(tailsfile)
}


#' Reduce peaks number in a countsfile and/or peaksfile
#'
#' @description Reduce peaks in a countsfile or counts table and also remove same peaks in peaksfile (if provided), by min.cells/max.cells, and min.counts/max.counts.
#' This function is useful to retrieve highly expressed, lowly expressed peaks or moderately expressed peaks.
#'
#' @param countsfile The decompressed file path of counts.tsv.gz generated by \code{\link{countPeaks}}, or the count table with three columns.
#' @param peaksfile peaksfile or peak table with five columns. If not NULL, then filter peaksfile after filtering countsfile.
#' @param min.cells retain peaks expressed in >= min.cells, the default value is 10.
#' @param min.count retain peaks with read count >= min.count, the default value is 10.
#' @param max.cells retain peaks expressed in < max.cells, the default value is NULL (unlimited). This is used to filter peaks with less expression.
#' @param max.count retain peaks with read count < max.count, the default value is NULL (unlimited). This is used to filter peaks with less expression.
#' @param suffix applicable when countsfile and peaksfile are both provided. Then counts and peaks will be output to <countsfile>.reduced; <peaksfile>.reduced.
#' @param toSparse to output a sparseMatrix (gene-cell) or keep the triplet table as input.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' }
#' @return A data.frame (toSparse=FALSE), or a sparse Matrix (toSparse=TRUE) of counts,
#' or a filename list with (countsfile, peaksfile) (if peaksfile is not NULL).
#' @examples
#' \dontrun{
#' countsfile='../dataFly/APA.tails.no/counts.tsv.gz'
#' peaksfile='../dataFly/APA.tails.no/peaks-notails.saf'
#' ## only filter countsfile or counts-table, return a df
#' reducePeaks(countsfile, min.cells = 10, min.count = 10, toSparse=FALSE)
#'
#' ## retain large peaks, and output both counts and peaks, save to .reduced file (>=10 & >=50)
#' reducePeaks(countsfile=countsfile, peaksfile=peaksfile, min.cells = 10, min.count = 50)
#'
#' ## retain low-expressed peaks, and output both counts and peaks, save to .reduced file (<=9 & <=49)
#' reducePeaks(countsfile=countsfile, peaksfile=peaksfile, max.cells = 9, max.count = 49,
#'             min.cells=NULL, min.count=NULL, suffix='.small')
#'
#' <=9
#' reducePeaks(countsfile=countsfile, peaksfile=peaksfile, max.cells = 9,  max.count=NULL,
#'            min.cells=NULL, min.count=NULL, suffix='.smallcells')
#'
#' <=49
#' reducePeaks(countsfile=countsfile, peaksfile=peaksfile, max.cells = NULL,  max.count=49,
#'             min.cells=NULL, min.count=NULL, suffix='.smallcounts')
#'
#' smallpeaks=.loadPeaks('../dataFly/APA.tails.no/peaks-notails.saf.small')
#' largepeaks=.loadPeaks('../dataFly/APA.tails.no/peaks-notails.saf.reduced')
#' smallpeaks1=.loadPeaks('../dataFly/APA.tails.no/peaks-notails.saf.smallcells')
#' smallpeaks2=.loadPeaks('../dataFly/APA.tails.no/peaks-notails.saf.smallcounts')
#' fullpeaks=.loadPeaks(peaksfile)
#' nrow(smallpeaks); nrow(largepeaks); nrow(fullpeaks)
#' smallset2=unique(rbind(smallpeaks1, smallpeaks2))
#' nrow(smallset2) + nrow(largepeaks);  nrow(fullpeaks)
#' ## should be the same, but if not the same, may be some peakIDs in fullpeaks are not in the counts table
#'
#' }
#' @export
reducePeaks <- function(countsfile, peaksfile=NULL,
                        min.cells = 10, min.count = 10,
                        max.cells = NULL, max.count = NULL,
                        suffix='.reduced', toSparse=FALSE, ...) {

  if (!is.null(peaksfile)) {
    if (!is.character(countsfile) & !is.character(peaksfile)) stop("reducePeaks: countsfile and peaksfile should be filename when both are provided\n")
    if (!file.exists(countsfile) | !file.exists(peaksfile)) stop("reducePeaks: countsfile or peaksfile not exist!")
    if (suffix=='') stop("reducePeaks: countsfile and peaksfile both are provided, suffix (e.g., 'reduced' to the filename) should be provided\n")
    toSparse=FALSE
  }

  fn='reducePeaks'
  counts=.loadCounts(countsfile)
  N1=nrow(counts)

  counts = transform(counts,
                     gene = factor(gene),
                     cell = factor(cell))
  scounts = Matrix::sparseMatrix(as.integer(counts$gene), as.integer(counts$cell), x = counts$count)
  colnames(scounts) = levels(counts$cell)
  rownames(scounts) = levels(counts$gene)
  invisible(gc())

  N=nrow(scounts)

  nc=Matrix::rowSums(scounts) #counts
  nv=Matrix::rowSums(scounts>0) #cells

  nids=list(min.count=NA, min.cells=NA, max.count=NA, max.cells=NA)
  if (!is.null(min.count)) nids[[1]]=which(nc>=min.count)
  if (!is.null(min.cells)) nids[[2]]=which(nv>=min.cells)
  if (!is.null(max.count)) nids[[3]]=which(nc<=max.count)
  if (!is.null(max.cells)) nids[[4]]=which(nv<=max.cells)

  for (i in 1:length(nids)) {
    if (!is.na(nids[[i]][1])) {
      .msgin(fn, sprintf("%d peaks meet %s (%d).", length(nids[[i]]), names(nids)[i], get(names(nids)[i])), ...)
    }
  }

  ## get intersection of all conditions
  nids[is.na(nids)]=NULL
  allids=nids[[1]]
  if (length(nids)>=2) {
    for (i in 2:length(nids)) {
      allids=intersect(allids, nids[[i]])
    }
  }

  scounts=scounts[allids, ]
  .msgin(fn, sprintf("%d peaks (total %d) meet combined conditions", length(allids), N), ...)

  if (toSparse) return(scounts)

  peakid=rownames(scounts) ##sparse rowname = peakid

  #convert back to triplet table
  rid=rownames(scounts)
  cid=colnames(scounts)
  scounts <- as.data.frame(summary(scounts))
  colnames(scounts)=c('i','j','count')
  scounts$gene <- rid[scounts$i]
  scounts$cell <-cid[scounts$j]
  scounts=transform(scounts, i=NULL, j=NULL)
  scounts=scounts[, c('gene','cell','count')]

  if (is.null(peaksfile)) return(scounts)

  ## save both peaks and counts
  if (!is.null(peaksfile)) {
    saf=.loadPeaks(peaksfile)
    ipeak=base::intersect(saf$peakID, peakid) # common peaks
    if (length(ipeak)!=length(peakid)) {
      .msgin(fn, sprintf("Warning: peakID in counts-table are not all in peaksfile, will use common peaks (%d) to filter both counts (%d) and peaks (%d)!",
                         length(ipeak), length(peakid), nrow(saf)), ...)
    }
    if (!(all(ipeak %in% saf$peakID))) {
      .msgin(fn, sprintf("Warning: peakID in peaksfile are not all in counts-table, will use common peaks (%d) to filter both counts (%d) and peaks (%d)!",
                         length(ipeak), length(peakid), nrow(saf)), ...)
    }
    saf=saf[ipeak, ]
    scounts=scounts[scounts$gene %in% ipeak, , drop=FALSE]
    write.table(saf, file=paste0(peaksfile, suffix), col.names = FALSE, row.names = FALSE, sep="\t", quote=FALSE)
    write.table(scounts, file=paste0(countsfile, suffix), col.names = TRUE, row.names = FALSE, sep="\t", quote=FALSE)

    .addTrapFiles(paste0(peaksfile, suffix), 'reducePeaks: reduced peaksfile')
    .addTrapFiles(paste0(countsfile, suffix), 'reducePeaks: reduced countsfile')

    .msgin(fn, sprintf("save reduced peaksfile to <%s> and countsfile to <%s>",
                       basename(paste0(peaksfile, suffix)), basename(paste0(countsfile, suffix)) ), ...)
    return(list(countsfile=paste0(countsfile, suffix), peaksfile=paste0(peaksfile, suffix)))
  }
}

#' Generate single cell expression matrix
#'
#' @description Given peaks' count and annotation file, this function can filter peaks with low expression levels and generate different types of objects,
#'              including data.frame (df), list with data.frame peaks.meta and Sparse Matrix peaks.count (list), SeuratObject (Seurat),  SingleCellExperiment (sce), movAPA's PACdataset (PAC).
#'
#' @param countsfile The decompressed file path of counts.tsv.gz generated by \code{\link{countPeaks}} or a count table (gene/cell/count).
#' @param peaksfile The path of the peaks.saf file generated by \code{\link{generateSAF}}.
#' @param barcode Vector of the barcode list for filtering cells.
#' @param tails The result produced by \code{\link{findTails}} or \code{\link{findTailsByPeaks}}.
#' If provided, then filter tailed-peaks within d nt.
#' @param d Max distance from the tail and peak to determine whether a peak is real (if it is within d-nt of a tail).
#' @param min.cells peak is expressed in at least a few cells,the default value is 10.
#' @param min.count peak minimum expression, the default value is 10.
#' @param ofile if not NULL then output the an variable called scAPAtrapData to ofile, and return ofile.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' }
#' @return A list containing peaks.meta and peaks.count (ofile=NULL) or the ofile name (ofile!=NULL).
#' The ofile is an RDA file which contains a variable called scAPAtrapData after load(ofile). The scAPAtrapData is a list.
#' @examples
#' \dontrun{
#' countsfile <- countPeaks(umitools.path,'./data/final.bam','./data',TenX=TRUE)
#' peakfile <- generateSAF(forwardPeaks,reversePeaks,'./data')
#' tails <- findTails(bamfile = './data/demo.bam')
#' barcode <- utils::read.delim2('./data/barcodes.tsv',header = FALSE)
#' barcode <- gsub('-[0-9]','',barcode$V1)
#' scExpMa <- generatescExpMa(countsfile, peaksfile, barcode, tails,
#'                           d=100, min.cells = 10, min.count = 10)
#' scExpMa2 <- eneratescExpMa(countsfile, peaksfile, barcode,
#'                min.cells = 10, min.count = 100,  verbose=TRUE, logf='xx.log')
#' }
#' @section Warning: The chromosome name in tails and peaksfile(peaks.saf) must be consistent (e.g., both be 1 or be Chr1).
#' @export
generatescExpMa <- function(countsfile, peaksfile, barcode=NULL,
                            tails=NULL, d=100,
                            min.cells = 10, min.count = 10,
                            ofile=NULL, ...) {

  fn='generatescExpMa'

  .msg(sprintf("%s: start.", fn), preLine=FALSE, ...)

  counts=.loadCounts(countsfile)
  saf=.loadPeaks(peaksfile)
  invisible(gc())

  # filter non-barcode cells in counts
  if (!is.null(barcode)) {
    cs1=length(unique(counts$cell))
    counts <- subset(counts, cell %in% barcode)
    cs2=length(unique(counts$cell))
    if (cs1!=cs2) {
      .msgin(fn, sprintf("%d cells from %d total cells are in barcode (#%d).", cs2, cs1, length(barcode)), ...)
    }
  }

  # add coord
  saf <- .computePAcoord(saf)

  # use tail-positions to filter peaks within d-nt
  if (!is.null(tails)) {
    tails=.loadTails(tails)
    .msgin(fn, sprintf("Using %d tails to filter tailed-peaks within %d nt.", nrow(tails), d), ...)
    n1=nrow(saf)
    saf <- .findIntersectBypeak(saf, tails, d=d)
    n2=nrow(saf)
    .msgin(fn, sprintf("%d non-tailed peaks are removed.",  n1-n2), ...)
    invisible(gc())

    ## filter counts
    n1=nrow(counts)
    counts=counts[counts$gene %in% saf$peakID, ]
    n2=nrow(counts)
    .msgin(fn, sprintf("%d non-tailed peak-count rows are removed", n1-n2), ...)
  }

  ## slow BIG MEM!!
  ##scExpMa <- reshape2::dcast(counts, gene ~ cell, fun.aggregate=sum, value.var = "count")

  # get gene-cell matrix very fast with Sparse Matrix
  #https://www.r-bloggers.com/2016/01/casting-a-wide-and-sparse-matrix-in-r/
  .msgin(fn, 'get peak-cell count matrix.', ...)

  scExpMa=reducePeaks(counts, min.cells = min.cells, min.count = min.count, toSparse=TRUE, ...)

  #make counts and meta data in order, but in different matrices
  .msgin(fn, "link peaks.meta and peaks.count.", ...)
  n1=nrow(saf)
  n2=nrow(scExpMa)
  cmpeaks=base::intersect(saf$peakID, rownames(scExpMa))
  n3=length(cmpeaks)
  .msgin(fn, sprintf("peaks.meta=%d peaks; peaks.counts=%d peaks; common=%d peaks.", n1, n2, n3), ...)

  rownames(saf)=saf$peakID
  saf=saf[cmpeaks, , drop=FALSE]
  scExpMa=scExpMa[cmpeaks, ,drop=FALSE]
  if (!identical(saf$peakID, rownames(scExpMa))) stop('Peaks in peaks.meta are not the same as the peak-cell matrix!\n')

  .msgin(fn, sprintf('output a list with %d peaks.meta (data.frame) and peaks.count (Sparse Matrix).', nrow(saf)), ...)
  if (!is.null(ofile)) {
    scAPAtrapData=list(peaks.meta=saf, peaks.count=scExpMa)
    save(scAPAtrapData, file=ofile)
    .addTrapFiles(ofile, 'generatescExpMa: scAPAtrapData')
    .msg(sprintf("%s: finish.", fn), ...)
    return(ofile)
  } else {
    .msg(sprintf("%s: finish.", fn), ...)
    return(list(peaks.meta=saf, peaks.count=scExpMa))
  }
}

#' Convert the scExpMa list generated by scAPAtrap to other format
#'
#' @description Given peaks' count and annotation file, this function can filter peaks with low expression levels and generate different types of objects,
#'              including data.frame (df), list with data.frame peaks.meta and Sparse Matrix peaks.count (list), SeuratObject (Seurat),  SingleCellExperiment (sce), movAPA's PACdataset (PAC).
#'
#' @param traplist A list with data.frame peaks.meta and Sparse Matrix peaks.count generated by generatescExpMa.
#' @param oformat A character string of df/seurat/sce/pac to generate different types of objects:
#'                 data.frame (df),  SeuratObject (Seurat),  SingleCellExperiment (sce), movAPA's PACdataset (PAC).
#'                 Generally, the output includes peaks.count (SparseMatrix) and peaks.meta (peakID/chr/start/end/strand/coord).
#'                 oformat=df may be problematic when the peak-cell matrix is very large. If oformat=df failed, please set oformat to other strings to represent the peak-cell counts matrix as Sparse Matrix.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' }
#' @examples
#' \dontrun{
#' convertAPAtrapData(from=peakslist, to='pac')
#' }
#' @export
convertAPAtrapData<-function(traplist, oformat, ...) {

  fn='convertAPAtrapRes'

  oformat=validate.arg(oformat, c('df', 'seurat', 'pac', 'sce'))

  if (is.character(traplist)) {
    if (!file.exists(traplist)) stop('convertAPAtrapData: traplist is a filename but not exists!')
    e=new.env()
    v=load(traplist, envir = e)
    traplist=get(v, envir=e)
    rm(list=v, envir=e)
    .msgin(fn, paste0('traplist is an .rda file, loaded scAPAtrapData.'), ...)
  }

  if (!is.list(traplist)) stop("convertAPAtrapRes: traplist is not a list!")
  if (!(all(c('peaks.meta','peaks.count') %in% names(traplist)))) stop("convertAPAtrapRes: peaks.meta or peaks.count not in traplist!")

  scExpMa=traplist$peaks.count
  saf=traplist$peaks.meta

  if (oformat=='df') {
    msg="Warning: oformat=df may be problematic when the peak-cell matrix is very large.\nIf failed, please set pformat=list/seurat to use sparse matrix for storing the peak-count table!"
    .msgin(fn, paste0('oformat=df, output a data frame with peaks.meta and peaks.count together.', msg), ...)
    scExpMa=as.data.frame(as.matrix(scExpMa))
    invisible(gc())
    saf=cbind(saf, scExpMa)
    return(saf)
  }

  if (oformat=='seurat') {
    .msgin(fn, 'oformat=seurat, output a SeuratObject (assay=counts).', ...)
    so<-Seurat::CreateSeuratObject(scExpMa, assay = "counts")

    # make feature names like "peak-XXXX" , same as the assay in Seurat obj
    rownames(saf)=gsub('_', '-', rownames(saf))
    so[['counts']]=Seurat::AddMetaData(so[['counts']], saf)

    # Access cell-level meta-data
    #head(so[[]])
    # Access feature-level meta-data
    #head(so[["counts"]][[]])
    # Access counts table
    #so[["counts"]][1:10, 1:5]
    return(so)
  }

  if (oformat=='sce') {
    .msgin(fn, 'oformat=sce, output SingleCellExperiment (assay=counts).', ...)
    sce=SingleCellExperiment::SingleCellExperiment(assays=list(counts=scExpMa))
    rowData(sce)=saf
    return(sce)
    #head(rowData(sce))
    #head(counts(sce)[1:10, 1:10])
    #head(assay(sce, "counts")[1:10, 1:10])
  }

  if (oformat=='pac') {
    .msgin(fn, 'oformat=pac, output movAPA::PACdataset.', ...)
    pacds=movAPA::createPACdataset(counts=scExpMa, anno=saf)
    return(pacds)
  }

}


############# scAPAtrap wrapper #############
## global variable TRAP.PARAMS to store scAPAtrap parameters.
## default values
myenv$TRAP.PARAMS=list(
  TenX=TRUE, #whether 10X or STARsolo BAM
  barcode=NULL, #barcode vectors
  chrs=NULL, #chromosome names, NULL to retrieve from BAM
  maxwidth=1000,  #max pA peak width
  readlength=49,  #R2 read length
  cov.cutoff=10, #min coverage for peak calling
  min.cells=10, #filter peaks present in >=10 cells
  min.count=10, #filter peaks with >=10 reads
  tails.search='no', #search tails
  d=100, #min distance from tail to peak
  findUniqueMap=TRUE, #whether to run findUniqueMap
  findUniqueMap.sort=TRUE, #whether to sort BAM in findUniqueMap
  findUniqueMap.index=TRUE, #whether to index BAM in findUniqueMap
  thread=24 #thread CPU
)

#' Get parameters to run scAPAtrap
#'
#' @details
#' TRAP.PARAMS returns the global variable of list type to store parameters for running the wrapper function \code{\link{scAPAtrap}}.
#' \itemize{
#' \item{TenX}: Whether the inputBam is 10X-format (e.g., BAM file from CellRanger or STARsolo).
#' For non-10X protocols like CEL-seq, when STARsolo is used, the BAM file will also contain cell barcode tag (CB) and UMI barcode tag (UB), then can set TenX=TRUE.
#' \item{barcode}: string vector to define barcodes, default is NULL (not provided).
#' \item{chrs}: a string vector of chromosome names (e.g., chr1, chr2, chr3...), default is NULL.
#' This parameter is used in \code{\link{findPeaksByStrand}}.
#' If chrs=NULL and run one-step \code{\link{scAPAtrap}}, then will try to get all chrs from inputBAM.
#' However, when running \code{\link{scAPAtrap}}, if chrs=NULL and there are more than 100 chrs from inputBAM,
#' users should provide trap.params$chrs explicitly to run scAPAtrap just in case there are too many unuseful chrs to run.
#' \item{maxwidth}: max width (nt) to define a valid peak, default is 1000.
#' Peaks wider than `maxwidth` will be automatelly split until meet requirement.
#' This parameter is used in \code{\link{findPeaksByStrand}}.
#' \item{readlength}: R2 read length, default is 49. `readlength` determines the min peak width;
#' peaks wider than `readlength` will be retained.
#' This parameter is used in \code{\link{findPeaksByStrand}}.
#' \item{cov.cutoff}: min read coverage for peak calling, default is 10.
#' Peak regions with coverage greater than `cov.cutoff`will be considered as candidate peaks.
#' This parameter is used in \code{\link{findPeaksByStrand}}.
#' \item{min.cells}: number of cells that a peak is expressed (i.e., with read count over `min.count`), default is 10.
#' `min.cells` is used to filter peaks present in at least `min.cells` cells.
#' This parameter is used in \code{\link{reducePeaks}} and \code{\link{generatescExpMa}}.
#' \item{min.count}: number of read counts a peak has, default is 10.
#' `min.count` is used to filter peaks with at least `min.count' reads.
#' This parameter is used in \code{\link{reducePeaks}} and \code{\link{generatescExpMa}}.
#' \item{tails.search}: the strategy to search A tails, can be set `peaks`, `genome`, `no` (default). This parameter is used in the wrapper function \code{\link{scAPAtrap}}.
#' \describe{
#' \item{no }{Not search tails, which means the final peaks are not linked with any A-tails.}
#' \item{genome }{Search tails genome-wide, which may be suitable for small genome. Otherwise it will take very long time or MEM.}
#' \item{peaks }{Search tails within the peak regions by `d` nt. This way may be suitable for big genome but with relatively small number of peaks.}
#' }
#' \item{d}: min distance from a peak to any tail to call a peak as real PA, default is 100.
#' Only applicable when tails.search is peaks or genome. This parameter is used in \code{\link{findTailsByPeaks}} and \code{\link{generatescExpMa}}.
#' If provided, then the final scAPAtrapData (the PA list) will remove cells not in `barcode.`
#' \item{findUniqueMap}: default is TRUE, means to run \code{\link{findUniqueMap}} to get unique mappings, and/or sort/index the inputBam.
#' \item{findUniqueMap.sort}: default is TRUE, to sort inputBam. Only applicable when findUniqueMap=TRUE.
#' \item{findUniqueMap.index}: default is TRUE, to index inputBam. Only applicable when findUniqueMap=TRUE.
#' \item{thread}: Number of CPU threads, default is 24.
#' }
#' @return A list of parameters with default values for running \code{\link{scAPAtrap}}.
#' @examples
#' ## see TRAP.PARAMS
#' TRAP.PARAMS()
#' ## set up new trap.params
#' trap.params=TRAP.PARAMS()
#' trap.params$TenX=FALSE
#' ## set up from scratch
#' trap.params=list(thread=24,
#'                  TenX=TRUE,
#'                  chrs=c('2L','2R','3L','3R','4','X','Y'),
#'                  maxwidth=1000,
#'                  readlength=49,
#'                  cov.cutoff=10,
#'                  min.cells=10,
#'                  min.count=10,
#'                  tails.search='peaks',
#'                  d=100,
#'                  barcode=NULL,
#'                  findUniqueMap=TRUE,
#'                  findUniqueMap.sort=TRUE,
#'                  findUniqueMap.index=TRUE )
#' @export
TRAP.PARAMS<-function() {
  return(myenv$TRAP.PARAMS)
}

#' Set the parameters for running scAPAtrap
#'
#' setTrapParams sets parameters for running scAPAtrap, by setting default values using \code{\link{TRAP.PARAMS}} and checking the validity.
#'
#' @param trap.params A list similar to \code{\link{TRAP.PARAMS}}.
#' @param print Whether to print the list (default is TRUE).
#' @param check Whether to check the validity of values (default is TRUE).
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' }
#' @return A list like code{\link{TRAP.PARAMS}}.
#' @examples
#' ## get default
#' trap.params=setTrapParams()
#' ## change some parameters
#' trap.params=setTrapParams(list(chrs=c('2L','2R','3L','3R','4','X','Y'), TenX=FALSE))
#' ## will ignore non-valid parameter names
#' trap.params=setTrapParams(list(xx=FALSE))
#' ## add barcodes
#' trap.params=setTrapParams(list(barcode=c('AAA','BBB')))
#' ## not print
#' trap.params=setTrapParams(print=FALSE)
#' @export
setTrapParams <- function(trap.params=NULL, print=TRUE, check=TRUE, ...) {
  ## set default
  if (!is.null(trap.params)) {
    if (!is.list(trap.params)) stop("trap.params should be a list, see TRAP.PARAMS!")
    for (i in names(TRAP.PARAMS())) { # for pars not set in trap.params, use the default one.
      if (!(i %in% names(trap.params))) {
        trap.params[[i]]=TRAP.PARAMS()[[i]]
      }
    }
  } else {
    trap.params=TRAP.PARAMS() # default theme
  }

  ## delete non TRAP.PARAMS
  trap.params[!(names(trap.params) %in% names(TRAP.PARAMS()))]=NULL

  trap.params$tails.search=tolower(trap.params$tails.search)

  if (check) {
    if (!(trap.params$tails.search %in% c('genome','peaks','peak','no','none'))) stop("error tails.search, should be genome/peak/no")
  }
  if (trap.params$tails.search %in% c('peaks','peak')) trap.params$tails.search='peaks'
  if (trap.params$tails.search %in% c('no','none')) trap.params$tails.search='no'

  if (!print) return(trap.params)

  for (iname in names(TRAP.PARAMS())) {
    if (is.null(trap.params[[iname]])) {
      str=sprintf("%s = NULL", iname)
      .msg(string=str, showTime=FALSE, pre='', ...)
      next
    }
    i=which(names(trap.params)==iname)
    str=trap.params[[i]]
    if (is.character(trap.params[[i]])) {
      str=.shortStr(trap.params[[i]])
    }
    if (iname=='chrs') str=sprintf("%s (%d chrs)", str, length(trap.params[[i]]))
    if (iname=='barcode') str=sprintf("%s (%d barcodes)", str, length(trap.params[[i]]))
    str=sprintf("%s = %s", names(trap.params)[i], str)
    .msg(string=str, showTime=FALSE, pre='', ...)
  }

  return(trap.params)
}

#' Set tool names for running scAPAtrap
#'
#' setTools sets tool names with full path for running scAPAtrap and checks the validity.
#'
#' @param tools A list providing full path of four tools: samtools, umitools, featureCounts, star.
#' @param check Whether to check the validity of values (default is TRUE).
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name. Then message will be written to `logf`.}
#' }
#' @return A list with tool names as the names of the list.
#' @examples
#' tools=list(samtools='/home/dell/miniconda3/envs/scAPA/bin/samtools',
#'            umitools='/home/dell/miniconda3/envs/scAPA/bin/umi_tools',
#'            featureCounts="/home/dell/miniconda3/envs/scAPA/bin/featureCounts",
#'            star='/home/dell/miniconda3/envs/scAPA/bin/STAR')
#' \dontrun{
#' tools=setTools(tools)
#' }
#' tools=setTools(tools, check=FALSE)
#' @export
setTools <- function(tools, check=TRUE, ...) {
  TOOLS=c('samtools','umitools', 'featureCounts', 'star')

  tools[!(names(tools) %in% TOOLS)]=NULL

  if (check) {
    for (i in TOOLS) {
      if (!(i %in% names(tools))) stop(i, " not provided!")
    }
    for (i in tools) {
      if (!file.exists(i)) stop(i, " not exists!")
    }
  }

  for (i in 1:length(tools)) {
    str=sprintf("%s = %s", names(tools)[i], tools[[i]])
    .msg(string=str, showTime=FALSE, pre='', ...)
  }
  return(tools)
}

# .shortStr('hello')
# .shortStr(rep('chr', 200)) #--> [1] "chr,chr,chr,chr,chr,..."
.shortStr<-function(strs, len=20) {
  cs=paste0(strs, collapse=',')
  if (nchar(cs)<=len) return(cs)
  cs=paste0(substr(cs, 1, min(len, nchar(cs))), '...')
  return(cs)
}


## .printStep("step1")
## ...
## 2023-09-12 15:21:42 dedupByPos
## ...
.printStep <- function(str, ...) {
  .msg(string=paste0(rep('#', 80), collapse=''), showTime=FALSE, pre='', ...)
  .msg(string=str, showTime=TRUE, pre='', ...)
  .msg(string=paste0(rep('#', 80), collapse=''), showTime=FALSE, pre='', ...)
}

## filenames: if is NULL, then not check/stop (just print varname=NULL)
## varnames: if not NULL, then will print like "varname='filename'"
## .checkAndPrintFiles(c("/mnt/for.txt", "/mnt/rev.txt"), c("forwardPeaksFile", "reversePeaksFile")) ## will check and print
## .checkAndPrintFiles(NULL, c("forwardPeaksFile", "reversePeaksFile")) ## forwardPeaksFile=NULL\nreversePeaksFile=NULL
.checkAndPrintFiles <- function(filenames=NULL, varnames=NULL, ...) {

  nullFile=is.null(filenames)

  ## print output flies, similar to .printStep, but use '++++'
  if (!is.null(varnames)) {

    quote='\''
    if (length(filenames)!=length(varnames)) {
      filenames=rep('NULL', length(varnames))
      quote=''
    }

    .msg(string=paste0(rep('+', 80), collapse=''), showTime=FALSE, pre='', ...)

    for (i in 1:length(varnames)) {
      .msg(string=sprintf("%s=%s%s%s", varnames[i], quote, filenames[i], quote), showTime=FALSE, pre='', ...)
    }

    .msg(string=paste0(rep('+', 80), collapse=''), showTime=FALSE, pre='', ...)
  }

  ## check and stop
  if (!nullFile) {
    NO=FALSE
    for (f in filenames) {
      if (!file.exists(f)) {
        .msg(string=sprintf("%s not exists!", f), showTime=FALSE, pre='', ...)
        NO=TRUE
      }
    }
    if (NO) stop("scAPAtrap error, please check logf to see Logs (if set logf)!")
  }
  return(invisible(NULL))
}


## print TRAPFILES and note
.printTRAPFILES<-function(nc=50, note=TRUE, ...) {
  files=TRAPFILES()
  if (nrow(files)>0) nc=min(100, max(nchar(files$what)))
  if (note) .msg(string="Files marked with * can be savely deleted!", showTime=FALSE, pre='', ...)

  for (i in 1:nrow(files)) {
    str=sprintf(paste0("%", nc, "s ==> %s"), files[i, 2], files[i, 1])
    .msg(string=str, showTime=FALSE, pre='', ...)
  }
}




#' Initiate scAPAtrap and check/set parameters
#'
#' @inheritParams scAPAtrap
#' @return NULL. But the input tools, trap.params, inputBam, outputDir may be modifed during initiation.
#' @examples
#' \dontrun{
#' tools=list(samtools='/home/dell/miniconda3/envs/scAPA/bin/samtools',
#'            umitools='/home/dell/miniconda3/envs/scAPA/bin/umi_tools',
#'            featureCounts="/home/dell/miniconda3/envs/scAPA/bin/featureCounts",
#'            star='/home/dell/miniconda3/envs/scAPA/bin/STAR')
#' trap.params=TRAP.PARAMS()
#' trap.params$chr=1:3
#'
#' ## input BAM
#' dir0='/mnt/64cf3476-350c-46ad-bc48-574fa64a0334/test/xwu/dataFly/'
#' inputBam=paste0(dir0, "filterCB.bam")
#'
#' ## log file
#' logf=gsub('.bam', '.scAPAtrap.log', inputBam, fixed=TRUE)
#'
#' ## output dir (will be under inputBam's dir)
#' outputDir="APA.result"
#'
#' ## one step scAPAtrap
#' initScAPAtrap(tools=tools, trap.params=trap.params,
#'          inputBam=inputBam, outputDir=outputDir, logf=logf)
#' }
#' ## just test the function with toCheck=FALSE
#' initScAPAtrap(tools='just-for-fun.tools',
#'               trap.params=list('just-for-fun.params'),
#'               inputBam='just-for-fun.bam', outputDir='just-for-fun.dir', toCheck=FALSE)
#' trap.params
#' @export
initScAPAtrap<-function(tools, trap.params, inputBam, outputDir, ...)  {

  fn='initScAPAtrap'

  logf <- .getDotArg("logf", NULL, ...)
  if (!is.null(logf)) {
    if (file.exists(logf)) stop("logf is already exists, Please delete the LOG file or set another logf!")
    .msg(string=sprintf("LOG file = %s", logf), showTime=FALSE, pre='', ...)
  }

  ## toCheck=FALSE, just for debug test
  toCheck <- .getDotArg("toCheck", TRUE, ...)

  ## global=TRUE, to change global vars (used by step-by-step); otherwise return list of (trap.params... ) used by once-scAPAtrap
  global <- .getDotArg("global", TRUE, ...)

  .printStep('initiate scAPAtrap', ...)

  if (toCheck) .checkAndPrintFiles(inputBam, ...)

  if (!grepl('/', outputDir)) {
    outputDir = paste0(dirname(inputBam), '/', outputDir)
  }

  if (toCheck) {
    if (file.exists(outputDir)) stop('initScAPAtrap error: outputDir [', outputDir, '] exists, please change another dirname to avoid overwriting\n')
  }

  .msg(string=sprintf("inputBam = %s", inputBam), showTime=FALSE, pre='', ...)
  .msg(string=sprintf("outputDir = %s", outputDir), showTime=FALSE, pre='', ...)

  trap.params <- setTrapParams(trap.params, print=FALSE, check = toCheck, ...)
  tools <- setTools(tools, check = toCheck, ...)

  TRAPFILES(clear=toCheck)

  ## get chr names: if not provided, then get from BAM files
  if (!toCheck) {
    # just print
    x=setTrapParams(trap.params, print=TRUE, check = FALSE, ...)
    return(invisible(NULL))
  }

  if (length(trap.params$chrs)==0) {
      trap.params$chrs <- .getBAMchrs(inputBam)
    .msgin(fn='', string=sprintf("Total %d chrs (e.g., %s) from inputBam", length(trap.params$chrs), .shortStr(trap.params$chrs, 50)), ...)

    if (length(trap.params$chrs)>100) {
      .msgin(fn='', string=sprintf("Too many chrs (%d, >100) from inputBam, please use trap.params$chrs to explicitly set chrs", length(trap.params$chrs)), ...)
      stop("initScAPAtrap quit because of too many chrs from inputBam, please check logf to see Logs!")
    }
  }
  if (length(trap.params$chrs)==0) {
    .msg(string="No chrs in inputBam or in trap.params!", showTime=FALSE, pre='', ...)
    stop("initScAPAtrap error because of no chrs provided, please check logf to see Logs!")
  }

  ## just print
  trap.params <- setTrapParams(trap.params, print=TRUE, check = toCheck, ...)

  if (global) {
    trap.params <<- trap.params
    tools <<- tools
    inputBam <<- inputBam
    outputDir <<- outputDir
    return(invisible(NULL))
  } else {
    return(list(trap.params=trap.params, tools=tools, inputBam=inputBam, outputDir=outputDir))
  }

}

#' Wrapper function for one-step running of scAPAtrap
#'
#' @param tools A list contains four tools used in scAPAtrap.
#' @param trap.params Parameters of running scAPAtrap, see global variable \code{\link{TRAP.PARAMS}}.
#' @param inputBam Input bam file name. If trap.params$chr is NULL, will try to get chrs from inputBam.
#' @param outputDir Output dir for storing scAPAtrap's final output -- peaks and counts files.
#' If only dirname is given (e.g., APAres but not ./APAres), then will add path of inputBam to the dir name.
#' If outputDir already exists, an error will be raised to avoid overwritting.
#' However, many temporary files are generated during running scAPAtrap, which could be seen from global variable TRAPFILES. Those files marked with * could be savely deleted.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{If `TRUE` basic status updates will be printed along the way.}
#' \item{logf }{If not NULL, then it should be a character string denoting a file name.
#' If logf is a character string, then message will be written to `logf`.
#' But if logf already exists, this function will quit to avoid overwriting an existing log file.
#' The logf will log full information during scAPAtrap, including time, command, and output files.}
#' }
#' @return A file name storing scAPAtrapData object, which is <outputDir>/scAPAtrapData.rda
#' @examples
#' \dontrun{
#' tools=list(samtools='/home/dell/miniconda3/envs/scAPA/bin/samtools',
#'            umitools='/home/dell/miniconda3/envs/scAPA/bin/umi_tools',
#'            featureCounts="/home/dell/miniconda3/envs/scAPA/bin/featureCounts",
#'            star='/home/dell/miniconda3/envs/scAPA/bin/STAR')
#' trap.params=TRAP.PARAMS()
#' trap.params$chrs=c('2L','2R','3L','3R','4','X','Y')
#'
#' ## input BAM
#' dir0='/mnt/64cf3476-350c-46ad-bc48-574fa64a0334/test/xwu/dataFly/'
#' inputBam=paste0(dir0, "filterCB.bam")
#'
#' ## log file
#' logf=gsub('.bam', '.scAPAtrap.log', inputBam, fixed=TRUE)
#' unlink(logf)
#'
#' ## barcode (if have)
#' barcodefile <- paste0(dir0, 'barcode.17.txt')
#' barcode <- utils::read.delim2(barcodefile, header = FALSE)
#' barcode <- gsub('-[0-9]','',barcode$V1)
#' ## add barcode to trap.params
#' trap.params$barcode=barcode
#'
#' ## output dir (will be under inputBam's dir)
#' outputDir="APA.result"
#'
#' ## one step scAPAtrap
#' scAPAtrap(tools=tools, trap.params=trap.params,
#'          inputBam=inputBam, outputDir=outputDir, logf=logf)
#' }
#' @export
scAPAtrap <- function(tools, trap.params, inputBam, outputDir, ...) {

  #set.seed(123)

  ################### 1. setting #######################
  ## set logf, trap.params, tools, TRAPFILES, chrs (if is NULL)
  init=initScAPAtrap(tools=tools, trap.params=trap.params, inputBam=inputBam, outputDir=outputDir, global=FALSE, ...)
  tools=init$tools
  trap.params=init$trap.params
  inputBam=init$inputBam
  outputDir=init$outputDir

  ################## 2. findUniqueMap #######################
  # findUniqueMap: uniq >> sort >> index with samtools
  # Skip this step if the inputBam contains only unique mappings.
  if (trap.params$findUniqueMap) {
    .printStep('findUniqueMap', ...)
    inputBam.u <- findUniqueMap(tools$samtools, inputBam, thread=trap.params$thread,
                                sort = trap.params$findUniqueMap, index = trap.params$findUniqueMap.index, ...)
  } else {
    inputBam.u=inputBam
  }
  .checkAndPrintFiles(filenames = inputBam.u, varnames=c("inputBam.u"), ...)

  #################### 3. dedupByPos #####################
  ## dedupByPos: remove duplicates with umitools
  ## If the inputBAM is from non-10X protocols, set TenX=FALSE
  ## However, if STARsolo is used for CEL-seq which contains cell barcode tag (CB) and UMI barcode tag (UB), set TenX=FALSE
  .printStep('dedupByPos', ...)
  inputBam.u.dedup <- dedupByPos(tools$umitools, input=inputBam.u, TenX=trap.params$TenX, ...)
  .checkAndPrintFiles(filenames = inputBam.u.dedup, varnames=c("inputBam.u.dedup"), ...)

  #################### 4. separateBamBystrand #####################
  ## separateBamBystrand: split the BAM file by strand to forward.bam and reverse.bam
  .printStep('separateBamBystrand', ...)
  inputBam.u.dedup.seps <- separateBamBystrand(tools$samtools, input=inputBam.u.dedup, thread=trap.params$thread, ...)
  .checkAndPrintFiles(filenames = inputBam.u.dedup.seps, varnames=c("inputBam.u.dedup.seps[1]", "inputBam.u.dedup.seps[2]"), ...)

  ################### 5. findPeaksByStrand ######################
  ## findPeaksByStrand: find peaks based on BAM coverages
  ## This step merges previous loadBpCoverages and findPeaks.
  .printStep('findPeaksByStrand', ...)
  forwardPeaksFile=paste0(inputBam.u.dedup.seps[1], '.peaks')
  forwardPeaksFile <-findPeaksByStrand(bamFile=inputBam.u.dedup.seps[1], chrs=trap.params$chrs, strand='+',
                                   L=trap.params$readlength, maxwidth=trap.params$maxwidth,
                                   cutoff = trap.params$cov.cutoff,
                                   ofile=forwardPeaksFile, ...)
  .checkAndPrintFiles(filenames = forwardPeaksFile, varnames=c("forwardPeaksFile"), ...)

  reversePeaksFile=paste0(inputBam.u.dedup.seps[2], '.peaks')
  reversePeaksFile <-findPeaksByStrand(bamFile=inputBam.u.dedup.seps[2], chrs=trap.params$chrs, strand='-',
                                   L=trap.params$readlength, maxwidth=trap.params$maxwidth,
                                   cutoff = trap.params$cov.cutoff,
                                   ofile=reversePeaksFile, ...)
  .checkAndPrintFiles(filenames = reversePeaksFile, varnames=c("reversePeaksFile"), ...)

  ################## 6. generateSAF #######################
  ## generateSAF: generate a .SAF file that records the information of identified peaks
  .printStep('generateSAF', ...)
  peaksfile <- generateSAF(forwardPeaks=forwardPeaksFile, reversePeaks=reversePeaksFile, outputdir=outputDir, ...)
  .checkAndPrintFiles(filenames = peaksfile, varnames=c("peaksfile"), ...)

  ################# 7. generateFinalBam ########################
  ## generateFinalBam: get a small BAM that contains the peak regions.
  ## Here the inputBam is used, which will contain duplicated reads.
  .printStep('generateFinalBam', ...)
  final.bam <- generateFinalBam(tools$featureCounts, tools$samtools, input=inputBam, peakfile=peaksfile, thread=trap.params$thread, ...)
  .checkAndPrintFiles(filenames = final.bam, varnames=c('final.bam'), ...)

  ################# 8. countPeaks ########################
  ## countPeaks: count read number in each peak with umitools
  .printStep('countPeaks', ...)
  countsfile<- countPeaks(tools$umitools, input=final.bam, outputdir=outputDir, TenX=trap.params$TenX, ...)
  .checkAndPrintFiles(filenames = countsfile, varnames=c('countsfile'), ...)
  countsPeaksFiles=list(countsfile=countsfile, peaksfile=peaksfile)

  ################## 9. findTails #######################
  ## findTails: peaks, genome, no
  if (trap.params$tails.search=='peaks') {
    .printStep('reducePeaks (when tails.search=peaks)', ...)
    #8.1# reducePeaks: remove small peaks to speed up tail-searching
    countsPeaksFiles <- reducePeaks(countsfile=countsfile, peaksfile=peaksfile,
                                         min.cells = trap.params$min.cells, min.count = trap.params$min.count,
                                         suffix='.reduced', toSparse=FALSE, ...)
    peaksfile.reduced=countsPeaksFiles$peaksfile
    .checkAndPrintFiles(filenames = peaksfile.reduced, varnames=c('peaksfile.reduced'), ...)

    .printStep('findTailsByPeaks (when tails.search=peaks)', ...)
    #8.2# findTailsByPeaks: find tails nearing peaks. Here the inputBam is used, which will contain duplicated reads.
    tailsfile=paste0(inputBam, '.peaks.tails')
    tailsfile <- findTailsByPeaks(bamfile = inputBam, peaksfile=peaksfile.reduced, d=trap.params$d, tailsfile=tailsfile, ...)

  } else if (trap.params$tails.search=='genome') {
    .printStep('findTails (when tails.search=genome)', ...)
    #8# findTails: find tails genome-wide. Here the inputBam is used, which will contain duplicated reads.
    tailsfile=paste0(inputBam, '.genome.tails')
    tailsfile <- findTails(bamfile = inputBam, tailsfile=tailsfile)

  } else {
    ## not find tails
    tailsfile=NULL
  }
  .checkAndPrintFiles(filenames = tailsfile, varnames=c('tailsfile'), ...)
  ################### 10. generatescExpMa ######################
  ## generatescExpMa: generate final PA data including peaks and counts information
  .printStep('generatescExpMa', ...)
  outputfile=paste0(outputDir, '/scAPAtrapData.rda')
  outputfile <- generatescExpMa(countsfile=countsPeaksFiles$countsfile,
                            peaksfile=countsPeaksFiles$peaksfile,
                            barcode=trap.params$barcode,
                            tails=tailsfile, d=trap.params$d,
                            min.cells=trap.params$min.cells, min.count=trap.params$min.count,
                            ofile=outputfile, ...)
  .checkAndPrintFiles(filenames = outputfile, varnames=c('outputfile'), ...)

  ################### 11. clean TRAPFILES ######################
  ## show TRAPFILES for cleanning
  .printStep('All output files in running order', ...)
  .printTRAPFILES(nc=50, ...)

  ## return scAPAtrapData.rda
  return(outputfile)

}
BMILAB/scAPAtrap documentation built on Oct. 13, 2023, 2:36 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
BMILAB/scAPAtrap
Identification and Quantification of Alternative Polyadenylation Sites from Single-cell RNA-seq Data

R/scAPAtrap_funlib.R
In BMILAB/scAPAtrap: Identification and Quantification of Alternative Polyadenylation Sites from Single-cell RNA-seq Data

R Package Documentation

Browse R Packages

We want your feedback!

BMILAB/scAPAtrap Identification and Quantification of Alternative Polyadenylation Sites from Single-cell RNA-seq Data

R/scAPAtrap_funlib.R In BMILAB/scAPAtrap: Identification and Quantification of Alternative Polyadenylation Sites from Single-cell RNA-seq Data

R Package Documentation

Browse R Packages

We want your feedback!

BMILAB/scAPAtrap
Identification and Quantification of Alternative Polyadenylation Sites from Single-cell RNA-seq Data

R/scAPAtrap_funlib.R
In BMILAB/scAPAtrap: Identification and Quantification of Alternative Polyadenylation Sites from Single-cell RNA-seq Data