# R/count_polyA.R


###################################################################
#'
#' Generate a peak x cell UMI count matrix
#'
#' Generates a UMI count matrix where rows are peaks and columns are cells. Only cells
#' identified through a provided 'whitelist' of cell barcodes are counted. If alignment was done
#' using CellRanger, this will typically be the barcodes.tsv file contained in the 'filtered_gene_matrices_mex' folder. 
#'
#' @param peak.sites.file a file containing peak coordinates generated by FindPeaks
#' @param gtf.file reference (GTF) file
#' @param bamfile scRNA-seq BAM file
#' @param whitelist.file file of cell barcodes to count
#' @param output.dir name of directory to write output (will be created if it doesn't exist)
#' @param countUMI whether to count UMIs (default: TRUE)
#' @param ncores Number of cores for multithreading
#' @param chr.names names of chromosomes to use (optional)
#' @param filter.chr whether to filter chromosomes in the GTF file (default: FALSE)
#' @param gene.symbol.ref field in the GTF file containing the gene symbol
#' @param CBtag cell barcode tag identifier present in BAM file. Default 'CB'.
#' @param UMItag UMI barcode tag identifier present in BAM file. Default 'UB'.
#' @return NULL. Writes counts to file.
#' @examples
#' 
#' extdata_path <- system.file("extdata",package = "Sierra")
#' reference.file <- paste0(extdata_path,"/Vignette_cellranger_genes_subset.gtf")
#' 
#' bamfile <- c(paste0(extdata_path,"/Vignette_example_TIP_sham.bam"),
#'              paste0(extdata_path,"/Vignette_example_TIP_mi.bam") )
#'              
#' whitelist.bc.file <- paste0(extdata_path,"/example_TIP_sham_whitelist_barcodes.tsv")
#'   
#' peak.merge.output.file = paste0(extdata_path, "/TIP_merged_peaks.txt")
#'  
#' \dontrun{                                
#' CountPeaks(peak.sites.file = peak.merge.output.file, 
#'              gtf.file = reference.file,
#'              bamfile = bamfile[1], 
#'              whitelist.file = whitelist.bc.file[1],
#'              output.dir = count.dirs[1], 
#'              countUMI = TRUE, 
#'              ncores = 1)
#'  }
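#'
#' ## To load the written counts back in (a sketch; ReadPeakCounts is used the
#' ## same way internally by AggregatePeakCounts):
#' # counts <- ReadPeakCounts(count.dirs[1])
#' # counts[1:5, 1:5]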
#' 
#'
#' @importFrom magrittr "%>%"
#' @importFrom foreach "%dopar%"
#' @importFrom Matrix writeMM
#' @import utils
#'
#' @export
CountPeaks <- function(peak.sites.file, 
                       gtf.file, 
                       bamfile, 
                       whitelist.file, 
                       output.dir, 
                       countUMI=TRUE,
                       ncores = 1, 
                       chr.names = NULL, 
                       filter.chr = FALSE, 
                       gene.symbol.ref = 'gene_name',
                       CBtag = 'CB', 
                       UMItag = 'UB') 
{

  lock <- tempfile()
  whitelist.bc <- read.table(whitelist.file, stringsAsFactors = FALSE)
  whitelist.bc <- whitelist.bc[,1]
  n.bcs <- length(whitelist.bc)
  message("There are ", n.bcs, " whitelist barcodes.")

  n.columns <- n.bcs + 1

  # read in gene reference
  genes.ref <- make_reference(gtf_file = gtf.file, chr.names = chr.names, filter.chr = filter.chr, gene.symbol.ref = gene.symbol.ref)
  chr.names <- as.character(unique(genes.ref$chr))
  n.genes <- nrow(genes.ref)

  peak.sites <- read.table(peak.sites.file, header = T, sep = "\t", quote = '',
                            stringsAsFactors = FALSE)

  # Count the peaks
  n.total.sites <- nrow(peak.sites)
  message("There are ", n.total.sites, "  sites")
  message("Doing counting for each site...")

  # Set up multiple workers
  system.name <- Sys.info()['sysname']
  new_cl <- FALSE
  if (system.name == "Windows") {
    new_cl <- TRUE
    cluster <- parallel::makePSOCKcluster(rep("localhost", ncores))
    doParallel::registerDoParallel(cluster)
  } else {
    doParallel::registerDoParallel(cores=ncores)
  }
  #print(chr.names)
  mat.to.write <- foreach::foreach(each.chr = chr.names, .combine = 'rbind', .packages=c("magrittr")) %dopar% {
      mat.per.chr <- c()
      message("Processing chr: ", each.chr)
      

      for(strand in c(1, -1) ) {
      message(" and strand ", strand)
      isMinusStrand <- if(strand==1) FALSE else TRUE
     
      peak.sites.chr <- dplyr::filter(peak.sites, Chr == each.chr & Strand == strand) %>%
                           dplyr::select(Gene, Chr, Fit.start, Fit.end, Strand)

      peak.sites.chr$Fit.start <- as.integer(peak.sites.chr$Fit.start)
      peak.sites.chr$Fit.end <- as.integer(peak.sites.chr$Fit.end)
      peak.sites.chr <- dplyr::filter(peak.sites.chr, Fit.start < Fit.end)

      # If there are no sites in this range, then just keep going
      if(nrow(peak.sites.chr) == 0) {
	      next
      }

      which <- GenomicRanges::GRanges(seqnames = each.chr, ranges = IRanges::IRanges(1, max(peak.sites.chr$Fit.end) ))

      param <- Rsamtools::ScanBamParam(tag=c(CBtag, UMItag),
                            which = which,
                            flag=Rsamtools::scanBamFlag(isMinusStrand=isMinusStrand))

      aln <- GenomicAlignments::readGAlignments(bamfile, param=param)
      
      nobarcodes <- which(unlist(is.na(GenomicRanges::mcols(aln)[CBtag])))
      noUMI <- which(unlist(is.na(GenomicRanges::mcols(aln)[UMItag])))
      
      
      to.remove <- dplyr::union(nobarcodes, noUMI)
      if (length(to.remove) > 0) {
        aln <- aln[-to.remove]
      }

      whitelist.pos <- which(unlist(GenomicRanges::mcols(aln)[CBtag]) %in% whitelist.bc)
      aln <- aln[whitelist.pos]
      
      # Check that at least one whitelisted barcode was identified. 
      if (length(aln) == 0) {
        next
      }

      # For de-duplicating UMIs, keep only the first read
      # encountered for each cell barcode/UMI combination
      if(countUMI) {
        GenomicRanges::mcols(aln)$CB_UB <- paste0(unlist(GenomicRanges::mcols(aln)[CBtag]), 
                                                  "_", unlist(GenomicRanges::mcols(aln)[UMItag]))
         uniqUMIs <- which(!duplicated(GenomicRanges::mcols(aln)$CB_UB))
         aln <- aln[uniqUMIs]
      }

      aln <- GenomicRanges::split(aln, unlist(GenomicRanges::mcols(aln)[CBtag]))

      polyA.GR <- GenomicRanges::GRanges(seqnames = peak.sites.chr$Chr,
                          IRanges::IRanges(start = peak.sites.chr$Fit.start,
                                  end = as.integer(peak.sites.chr$Fit.end)))
      n.polyA <- length(polyA.GR)
      barcodes.gene <- names(aln)
      res <- sapply(barcodes.gene, function(x) GenomicRanges::countOverlaps(polyA.GR, aln[[x]]))

      # Reorder the columns of the res matrix to match the whitelist barcodes
      res.mat <- matrix(0L, nrow = n.polyA, ncol = n.bcs)
      res.mat[,match(barcodes.gene, whitelist.bc)] <- res

      # Return a sparse matrix
      mat.per.strand <- Matrix::Matrix(res.mat, sparse = TRUE)
      polyA.ids <- paste0(peak.sites.chr$Gene, ":", peak.sites.chr$Chr, ":", peak.sites.chr$Fit.start,
                          "-", peak.sites.chr$Fit.end, ":", peak.sites.chr$Strand )
      rownames(mat.per.strand) <- polyA.ids


      # Need to combine the two matrices from each strand
      if(is.null(mat.per.chr)) {
         mat.per.chr <- mat.per.strand
      } else {
         mat.per.chr <- rbind(mat.per.chr, mat.per.strand)
      }
    } # Loop for strand

    # Return sparse matrix for each chromosome for combining across all threads
    return(mat.per.chr)
  } # Loop for chr
  
  if (new_cl) { ## Shut down cluster if on Windows
    ## stop cluster
    parallel::stopCluster(cluster)
  }

  if (!dir.exists(output.dir)){
    dir.create(output.dir)
  }
  Matrix::writeMM(mat.to.write, file = paste0(output.dir, "/matrix.mtx"))
  write.table(whitelist.bc, file = paste0(output.dir, "/barcodes.tsv"), quote = FALSE, row.names = FALSE, col.names = FALSE)
  write.table(rownames(mat.to.write), file = paste0(output.dir, "/sitenames.tsv"), quote = FALSE, row.names = FALSE, col.names = FALSE)
  
  ## Compress the output files
  R.utils::gzip(paste0(output.dir, "/matrix.mtx"), overwrite = TRUE)
  R.utils::gzip(paste0(output.dir, "/barcodes.tsv"), overwrite = TRUE)
  R.utils::gzip(paste0(output.dir, "/sitenames.tsv"), overwrite = TRUE)

} # End function
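
## A minimal illustration of the core counting step above: reads grouped per
## cell barcode are intersected with the peak ranges via countOverlaps (toy data only).
# peaks.gr <- GenomicRanges::GRanges("chr1", IRanges::IRanges(c(100, 500), c(200, 600)))
# reads.gr <- GenomicRanges::GRanges("chr1", IRanges::IRanges(c(150, 550, 560), width = 1))
# GenomicRanges::countOverlaps(peaks.gr, reads.gr)  # -> 1 2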



###################################################################
#'
#' Format genomic positions into exon blocks
#'
#' Collapses a sorted vector of genomic positions into a string of
#' "(start,end)" blocks, starting a new block wherever consecutive
#' positions differ by more than 1 (i.e. at splice-junction gaps).
#'
#' @param x a sorted integer vector of genomic positions
#' @return a character string of "(start,end)" exon blocks
#' @examples
#' \dontrun{
#' make_exons(x)
#' }
#' 
#' 
make_exons <- function(x) {
  to.write <- paste0("(", as.character(x[1]), ",")
  niters <- length(x) -1
  for(i in 1:niters) {
    if(x[i+1]-x[i] == 1) {
      next
    } else {
      to.write <- paste0(to.write, as.character(x[i]), ")(",
                         as.character(x[i+1]), ",")
    }
  }
  to.write <- paste0(to.write, x[niters+1], ")")
  return(to.write)

}
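
## Example: runs of consecutive positions collapse into exon blocks, e.g.
# make_exons(c(100:102, 200:201))  # -> "(100,102)(200,201)"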

###################################################################
#'
#' Build gene start-end reference from a GTF file
#'
#' Takes a GTF file as input and creates a table of chromosome start-end
#' positions for each gene. Works with GTF files downloaded from the 10x Genomics website.
#'
#' @param gtf_file a GTF file
#' @param chr.names a list of valid chromosome names to use
#' @param filter.chr whether to filter chromosomes in the GTF file
#' @param gene.symbol.ref field in the GTF file containing the gene symbol
#' @return a data frame of gene coordinates (EnsemblID, chr, start, end, strand, Gene)
#'
make_reference <- function(gtf_file, 
                           chr.names = NULL, 
                           filter.chr = FALSE,
                           gene.symbol.ref = 'gene_name') {
  ## Read in the gtf file
  gtf_gr <- rtracklayer::import(gtf_file)
  gtf_TxDb <- GenomicFeatures::makeTxDbFromGFF(gtf_file, format="gtf")
  
  ## Build a table of gene start-end positions
  genes <- GenomicFeatures::genes(gtf_TxDb)
  genes.ref <- as.data.frame(genes)
  rownames(genes.ref) <- genes.ref$gene_id
  
  ## Build a unique map of Ensembl ID to gene symbol
  ensembl.symbol.map <- data.frame(EnsemblID = gtf_gr@elementMetadata@listData$gene_id,
                                  GeneName = gtf_gr@elementMetadata@listData[[gene.symbol.ref]], 
                                  stringsAsFactors = FALSE)
  
  ensembl.symbol.map %>% dplyr::distinct(EnsemblID, .keep_all = TRUE) -> ensembl.symbol.map.unique
  rownames(ensembl.symbol.map.unique) <- ensembl.symbol.map.unique$EnsemblID
  
  ## Add gene symbol to the gene table
  overlapping.gene.ids <- dplyr::intersect(rownames(genes.ref), ensembl.symbol.map.unique$EnsemblID)
  
  ensembl.symbol.map.unique <- ensembl.symbol.map.unique[overlapping.gene.ids, ]
  genes.ref$Gene <- ensembl.symbol.map.unique$GeneName
  
  ## update column names in reference file
  colnames(genes.ref) <- c("chr", "start", "end", "width", "strand", "EnsemblID", "Gene")
  genes.ref <- genes.ref[, c("EnsemblID", "chr", "start", "end", "strand", "Gene")]
  genes.ref$strand <- plyr::mapvalues(genes.ref$strand, from = c("-", "+"), to = c("-1", "1"))
  
  ## Filter the chromosome names 
  if (filter.chr) {
    if (!is.null(chr.names)) {
      # Use provided chromosome names
      chr.use <- chr.names
    } else{ 
      # Assume we're working with human or mouse
      chr.use1 <- as.character(c(1:22, c("X", "Y", "MT")))
      chr.use2 <- paste0("chr", as.character(c(1:22, c("X", "Y", "MT"))))
      chr.use <- c(chr.use1, chr.use2)
    }
    
    genes.ref.use <- subset(genes.ref, chr %in% chr.use )
    genes.ref.use$chr <- droplevels(genes.ref.use$chr)
    
    if (nrow(genes.ref.use) == 0) {
      warning("No entries were left after filtering GTF for chromosome names. Reverting to original entries.")
      genes.ref.use <- genes.ref
    }
  } else{
    # Use GTF as is
    genes.ref.use <- genes.ref
  }
  
  return(genes.ref.use)
}
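
## Example usage (a sketch; the GTF path assumes the Sierra vignette data):
# extdata_path <- system.file("extdata", package = "Sierra")
# gtf.file <- paste0(extdata_path, "/Vignette_cellranger_genes_subset.gtf")
# genes.ref <- make_reference(gtf_file = gtf.file, filter.chr = FALSE)
# head(genes.ref)  # columns: EnsemblID, chr, start, end, strand, Gene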


###################################################################
#'
#' Fit a Gaussian curve to the coverage
#' 
#' Given read coverage data, fit a Gaussian curve using either 
#' an NLS or MLE fit. 
#'
#' @param fit.data read coverage to fit
#' @param maxval maximum read coverage
#' @param fit.method either "NLS" or "MLE"
#' @param mu initialised value for the centre of the peak (default: 300)
#' @return the fitted model object (from nls or stats4::mle), or NULL if the fit failed
#'
#'
fit_gaussian <- function(fit.data, maxval, fit.method, mu = 300) {
  
  if(fit.method == "NLS") { 
    nls.res <- NULL
    tryCatch({
      nls.res <- nls( y ~ k*exp(-1/2*(x-mu)^2/sigma^2),
                      start=c(mu=mu,sigma=100,k=maxval) , data = fit.data)
    }, error = function(err) { })
    return(nls.res)
  } else if(fit.method == "MLE") { 
    
    x = fit.data$x 
    y = fit.data$y 
    
    # Negative log-likelihood for MLE: Gaussian curve with a fixed residual sd of 10
    LL <- function(mu, sigma, k) { 
      R = dnorm(y, k*exp(-1/2*(x-mu)^2/sigma^2), 10, log = TRUE)
      return(-sum(R))
    }
    
    mle.fit = NULL 
    tryCatch({
      mle.fit = stats4::mle(LL, start = list(mu = mu, sigma = 100, k = maxval))
    }, error = function(err) { })  
    
    return(mle.fit)
  } else {
    stop("Fit method need to be either NLS or MLE")
  } 
  
} 
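
## Example (illustrative): recover parameters from synthetic Gaussian coverage.
# set.seed(1)
# x <- 1:600
# y <- 500 * exp(-1/2 * (x - 300)^2 / 50^2) + abs(rnorm(600, sd = 5))
# fit <- fit_gaussian(data.frame(x = x, y = y), maxval = max(y), fit.method = "NLS")
# if (!is.null(fit)) coef(fit)  # mu ~ 300, sigma ~ 50, k ~ 500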


###################################################################
#'
#' Perform splice-aware peak calling on a BAM file produced from a scRNA-seq experiment
#'
#' Takes as input a BAM file produced from a barcoded scRNA-seq experiment, the reference (GTF) file used during alignment and
#' a BED file of junctions produced by regtools. For each gene in the reference file, the peak calling process first splits 
#' the read coverage into 'across junction' and 'no junction' subsets. Within each subset, the site of maximum coverage 
#' is identified and a peak is called by fitting a Gaussian to the read coverage over a 600bp window around this region.
#' After calling a peak, the local read coverage is removed and the next site of maximum coverage is identified. This process 
#' runs iteratively until at least one of two stopping criteria is reached. The first criterion is the remaining read coverage 
#' falling below a minimum cutoff (min.cov.cutoff) and proportion of total gene coverage (min.cov.prop). The second criterion 
#' is the height of the candidate peak, controlled by an absolute threshold (min.peak.cutoff) and a threshold relative to the 
#' maximum coverage for the gene (min.peak.prop).
#'
#' @param output.file the output file to write peak coordinates to
#' @param gtf.file reference (GTF) file
#' @param bamfile scRNA-seq BAM file
#' @param junctions.file BED file (as produced by regtools) or SJ.out.tab file (STAR aligner) containing splice junction coordinates
#' @param min.jcutoff minimum number of spliced reads across a junction for it to be considered (default: 50). 
#' @param min.jcutoff.prop minimum proportion of junction reads out of all junction reads for that gene (default: 0.05)
#' @param min.cov.cutoff minimum number of reads to consider a peak (default: 500)
#' @param min.cov.prop minimum proportion of reads to consider a peak (default: 0.05) 
#' @param min.peak.cutoff minimum peak height (default: 200)
#' @param min.peak.prop minimum ratio of current peak height relative to maximum peak height for this gene (default: 0.05)
#' @param ncores number of cores to use
#' @param chr.names names of chromosomes to use (optional)
#' @param filter.chr whether to filter chromosomes in the GTF file (default: FALSE)
#' @param gene.symbol.ref field in the GTF file containing the gene symbol
#' @param fit.method method for fitting the Gaussian peak; either "NLS" (default) or "MLE"
#' @return NULL. Writes counts to file.
#' @examples
#' 
#' extdata_path <- system.file("extdata",package = "Sierra")
#' reference.file <- paste0(extdata_path,"/Vignette_cellranger_genes_subset.gtf")
#' junctions.file <- paste0(extdata_path,"/Vignette_example_TIP_sham_junctions.bed")
#' 
#' bamfile <- c(paste0(extdata_path,"/Vignette_example_TIP_sham.bam"),
#'              paste0(extdata_path,"/Vignette_example_TIP_mi.bam") )
#'              
#' peak.output.file <- c("Vignette_example_TIP_sham_peaks.txt",  "Vignette_example_TIP_MI_peaks.txt")
#' 
#' 
#' FindPeaks(output.file=peak.output.file[1], gtf.file = reference.file, 
#'              bamfile=bamfile[1], junctions.file=junctions.file)
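#'
#' ## The resulting peak table can then be read back in (sketch):
#' # peaks <- read.table(peak.output.file[1], header = TRUE, sep = "\t",
#' #                     stringsAsFactors = FALSE)
#' # head(peaks$polyA_ID)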
#'
#' @importFrom magrittr "%>%"
#' @importFrom foreach "%dopar%"
#' @import GenomicRanges
#'
#' @export
#'
FindPeaks <- function(output.file, 
                      gtf.file, 
                      bamfile, 
                      junctions.file,
                      min.jcutoff=50, 
                      min.jcutoff.prop = 0.05, 
                      min.cov.cutoff = 500,
                      min.cov.prop = 0.05, 
                      min.peak.cutoff=200, 
                      min.peak.prop = 0.05, 
                      ncores = 1,
                      chr.names = NULL, 
                      filter.chr = FALSE, 
                      gene.symbol.ref = 'gene_name',
                      fit.method = "NLS") {
  
  if(!fit.method %in% c("NLS", "MLE") ) { stop("fit.method needs to be either NLS or MLE. ")}

  lock <- tempfile()

  ## Read in the gtf file
  genes.ref = make_reference(gtf_file = gtf.file, chr.names = chr.names, filter.chr = filter.chr, gene.symbol.ref = gene.symbol.ref)
  n.genes = nrow(genes.ref)
  message(paste(n.genes, "gene entries to process"))

  # Initiate the output file
  write("Gene\tChr\tStrand\tMaxPosition\tFit.max.pos\tFit.start\tFit.end\tmu\tsigma\tk\texon/intron\texon.pos\tLogLik", 
        file = output.file)
  
  ## Read in junctions from either bed file or SJ file from STAR aligner
  # step1 determine file type by extension
  
  idx.star <- grep(pattern="SJ\\.out\\.tab(\\.gz)?$", x=junctions.file)
  idx.bed <- grep(pattern="\\.bed(\\.gz)?$", x=junctions.file)
  
  junctions <- read.table(junctions.file, sep = "\t",header = FALSE)
  
  if (length(idx.star) > 0)
  {
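    # STAR SJ.out.tab columns (assumed here): V1 = chromosome, V2/V3 = first and
    # last base of the intron; V8 is taken as the junction read count.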
    junctions.GR <- GenomicRanges::GRanges(seqnames = junctions$V1, 
                                           IRanges::IRanges(start =junctions$V2,  end = junctions$V3), 
                                           counts = junctions$V8)
  }    
  else if(length(idx.bed) > 0)
  {
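    # regtools BED12 (assumed here): V2/V3 span the whole junction alignment and
    # V11 holds the two flanking block sizes, so the intron itself runs from
    # V2 + block1 to V3 - block2; V5 is the junction read count.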
    junctions <- cbind(junctions,
                       reshape2::colsplit(junctions$V11, ",", c("blocks1","blocks2")))
    junctions$start <- junctions$V2+junctions$blocks1
    junctions$end <- junctions$V3-junctions$blocks2
    junctions.GR <- GenomicRanges::GRanges(seqnames = junctions$V1,
                            IRanges::IRanges(start = junctions$start,
                                    end = junctions$end), counts = junctions$V5)
  }
  else # unrecognisable format
  {
    warning(paste0("Unrecognisable junction file format.\n",
            "File must have either bed or SJ.out.tab extension.\n",
            "Cannot continue to findPeaks\n"))
    return(NULL)
  }
  
  # Set up multiple workers
  system.name <- Sys.info()['sysname']
  new_cl <- FALSE
  if (system.name == "Windows") {
    new_cl <- TRUE
    cluster <- parallel::makePSOCKcluster(rep("localhost", ncores))
    doParallel::registerDoParallel(cluster)
  } else {
    doParallel::registerDoParallel(cores=ncores)
  }
  
  foreach::foreach(i = 1:n.genes, .packages = c("GenomicRanges")) %dopar% {
    gene.name <- genes.ref[i, "Gene"]
    seq.name <- genes.ref[i,"chr"]
    gene.start <- genes.ref[i,"start"]
    gene.end <- genes.ref[i, "end"]
    strand <- genes.ref[i,"strand"]

    #message(i, " :", gene.name)
    isMinusStrand <- if(strand==1) FALSE else TRUE
    which <- GenomicRanges::GRanges(seqnames = seq.name, ranges = IRanges::IRanges(gene.start, gene.end))
    param <- Rsamtools::ScanBamParam(which = which,
                          flag=Rsamtools::scanBamFlag(isMinusStrand=isMinusStrand))

    aln <- GenomicAlignments::readGAlignments(bamfile, param=param)
    aln_cov <- GenomicRanges::coverage(aln)[seq.name][[1]]

    if (length(aln_cov@values) > 1) { ## check for 0 read coverage for this gene

      data <- data.frame(pos = seq(gene.start, gene.end),
                         coverage = S4Vectors::runValue(aln_cov)[S4Vectors::findRun(gene.start:gene.end, aln_cov)])

      # Find the junction which overlaps this gene
      j.cutoff <- max(min.jcutoff,min.jcutoff.prop*max(data$coverage))
      hits <- GenomicRanges::findOverlaps(which, junctions.GR)
      this.junctions.GR <- junctions.GR[hits@to]
      this.junctions.GR <- this.junctions.GR[this.junctions.GR$counts > j.cutoff]
      #this.junctions.GR <- IRanges::subset(this.junctions.GR, counts > j.cutoff)
      n.junctions <- length(this.junctions.GR)
      data.no.juncs <- data
      
      ## Filter 
      ## This is a pretty slow way to do this filtering;
      ## it can definitely be improved computationally!
      if(n.junctions > 0) {
        for(j in 1:n.junctions) {
          j.start <- IRanges::start(this.junctions.GR[j])
          j.end <- IRanges::end(this.junctions.GR[j])
          data.no.juncs <- data.no.juncs %>%
            dplyr::filter(pos < j.start | pos > j.end)
        }
      }

      ## Find peaks 

      totalcov <- sum(data$coverage)
      cutoff <- max(min.cov.cutoff,min.cov.prop*totalcov)
      covsum <- totalcov
      maxpeakval <- max(data$coverage)
      maxpeakcutoff <- max(min.peak.cutoff,min.peak.prop*maxpeakval )

      #message("Finding exonic sites...")
      n.points <- nrow(data.no.juncs)
      if(n.points > 0) {
        while(covsum > cutoff) {
          maxpeak <- which.max(data.no.juncs$coverage)
          if(data.no.juncs[maxpeak, "coverage"] < maxpeakcutoff) { break }
          start <- maxpeak - 300
          end <- maxpeak + 299

          if(start < 1 ) { start <- 1 }
          if(end > n.points) { end <- n.points }

          maxval <- max(data.no.juncs$coverage)
          #message(start, ",", end, ",", maxval)
          #message("length: ", data.no.juncs[end,"pos"] - data.no.juncs[start, "pos"])
          #message("max peak: ", data.no.juncs[maxpeak, "pos"])

          fit.data <- data.frame(x = seq(1,end-start+1), y = data.no.juncs[start:end,"coverage"])
          
          gaussian.fit <- fit_gaussian(fit.data, maxval, fit.method)
          if(!is.null(gaussian.fit)) {
            
            est_mu <- coef(gaussian.fit)["mu"]
            est_sigma <- abs(coef(gaussian.fit)["sigma"]) # this has to be positive in the Gaussian  
            est_k <- coef(gaussian.fit)["k"]
            fit.loglik <- logLik(gaussian.fit)[1] 
            fitted.peak <- maxpeak - 300 + floor(est_mu)
            from <- fitted.peak - 3*floor(est_sigma)
            to <- fitted.peak + 3*floor(est_sigma)

            # Handle the cases where the peak is too close to either the start or the end
            if(from < 1) { from = 1 }
            if(to > n.points) { to = n.points}

            if(fitted.peak <= 0) {
              peak.pos <- "Negative"
            } else {
              peak.pos <- data.no.juncs[fitted.peak, "pos"]
            }

            isGapped <- FALSE
            if(to <= 0) {
              to.pos <- "Negative"
            } else {
              to.pos <- data.no.juncs[to, "pos"]
              # Check if this is a spliced region
              pos.gaps <- diff(data.no.juncs[from:to, "pos"])

              if(any(pos.gaps > 1)) {
                isGapped <- TRUE
              }

            }


            if(isGapped) {
              exon.pos <- make_exons(data.no.juncs[from:to, "pos"])
              line <- paste(gene.name, seq.name, strand, data.no.juncs[maxpeak, "pos"],
                            peak.pos,
                            data.no.juncs[from, "pos"],
                            to.pos,
                            est_mu, est_sigma, est_k, 
                            "across-junctions", exon.pos, fit.loglik, sep="\t")
            } else {
              exon.pos <- "NA"
              line <- paste(gene.name, seq.name, strand, data.no.juncs[maxpeak, "pos"],
                            peak.pos,
                            data.no.juncs[from, "pos"],
                            to.pos,
                            est_mu, est_sigma, est_k, 
                            "no-junctions", exon.pos, fit.loglik, sep="\t")
            }

            
            #print(line)
            locked <- flock::lock(lock)
            write(line, file = output.file, append = TRUE)
            flock::unlock(locked)
          } else {
            line <- paste(gene.name, seq.name, strand, data.no.juncs[maxpeak, "pos"],
                          "NA", "NA", "NA", "NA", "NA", "NA", "no-junctions", "NA", "NA", sep="\t")
            locked <- flock::lock(lock)
            write(line, file = output.file, append = TRUE)
            flock::unlock(locked)
          }

          data.no.juncs[start:end, "coverage"] <- 0
          covsum <- sum(data.no.juncs$coverage)
          #print(covsum)
        }
      }

      ## Now let's see if there are any peaks in the introns
      ## test each intron separate
      #message("Finding intronic peaks...")

      reduced.junctions <- GenomicRanges::reduce(this.junctions.GR)
      n.rjunctions <- length(reduced.junctions)

      if(n.junctions > 0) {

        for(j in 1:n.rjunctions) {
          #message(j)
          j.start <- IRanges::start(reduced.junctions[j])
          j.end <- IRanges::end(reduced.junctions[j])
          intron.data <- data %>%
            dplyr::filter(pos > j.start & pos < j.end)

          if(nrow(intron.data) == 0) { next }
          maxpeak <- which.max(intron.data$coverage)
          maxval <- intron.data[maxpeak, "coverage"]

          #message(maxpeak, "    ", maxval)
          if(maxval < maxpeakcutoff) { next }
          fit.data <- data.frame(x = seq(1,nrow(intron.data)),
                                 y = intron.data[,"coverage"])
          gaussian.fit <- fit_gaussian(fit.data, maxval, fit.method, mu=maxpeak) 
          
          if(!is.null(gaussian.fit)) {
            est_mu <- coef(gaussian.fit)["mu"]
            est_sigma <- abs(coef(gaussian.fit)["sigma"]) 
            est_k <- coef(gaussian.fit)["k"]
            fit.loglik <- logLik(gaussian.fit)[1]
            
            fitted.peak <- floor(est_mu)
            from <- fitted.peak - 3*floor(est_sigma)
            to <- fitted.peak + 3*floor(est_sigma)
            
            # Make sure position is within the intron 
            if(from < 1) { from = 1 }
            this.n.points <- nrow(intron.data)
            if(to > this.n.points) { to = this.n.points}

            if(fitted.peak <= 0) {
              peak.pos <- "Negative"
            } else {
              peak.pos <- intron.data[fitted.peak, "pos"]
            }

            if(to <= 0) {
              to.pos <- "Negative"
            } else {
              to.pos <- intron.data[to, "pos"]
            }

            line <- paste(gene.name, seq.name, strand, intron.data[maxpeak, "pos"],
                          peak.pos,
                          intron.data[from, "pos"],
                          to.pos,
                          est_mu, est_sigma, est_k, 
                          "no-junctions", "NA", fit.loglik, sep="\t")

            locked <- flock::lock(lock)
            write(line, file = output.file, append = TRUE)
            flock::unlock(locked)
          } else {
            line <- paste(gene.name, seq.name, strand, intron.data[maxpeak, "pos"],
                          "NA", "NA", "NA", "NA", "NA", "NA", "no-junctions", "NA", "NA", sep="\t")
            locked <- flock::lock(lock)
            write(line, file = output.file, append = TRUE)
            flock::unlock(locked)
          }
        }

      }

    }

  } # End loop for genes
  
  if (new_cl) { ## Shut down cluster if on Windows
    ## stop cluster
    parallel::stopCluster(cluster)
  }

  ## As a final step, read in the peak file, filter, and add Peak IDs
  peak.sites.file = output.file
  peak.sites <- read.table(peak.sites.file, header = T, sep = "\t", quote = '',
                            stringsAsFactors = FALSE)

  ## Filter the polyA sites
  n.total.sites <- nrow(peak.sites)
  to.filter <- which(peak.sites$Fit.max.pos == "Negative")
  to.filter <- dplyr::union(to.filter, which(peak.sites$Fit.start == "Negative"))
  to.filter <- dplyr::union(to.filter, which(peak.sites$Fit.end == "Negative"))
  to.filter <- dplyr::union(to.filter, which(is.na(peak.sites$Fit.start)))
  to.filter <- dplyr::union(to.filter, which(is.na(peak.sites$Fit.end)))
  to.filter <- dplyr::union(to.filter,  which(is.na(peak.sites$Fit.max.pos)))

  if (length(to.filter) > 0)
    peak.sites <- peak.sites[-to.filter,]
  
  ## Check for any examples of peaks with end before start
  sites.diffs <- as.numeric(peak.sites$Fit.end) - as.numeric(peak.sites$Fit.start)
  to.filter <- which(sites.diffs < 0)
  if (length(to.filter) > 0)
    peak.sites <- peak.sites[-to.filter,]
  
  n.filt.sites <- nrow(peak.sites)
  message("There are ", n.total.sites, " unfiltered sites and ", n.filt.sites, " filtered sites")

  ## Add polyA IDs to the table
  polyA.ids <- paste0(peak.sites$Gene, ":", peak.sites$Chr, ":", peak.sites$Fit.start,
                      "-", peak.sites$Fit.end, ":", peak.sites$Strand )
  peak.sites$polyA_ID = polyA.ids

  ## Remove any duplicates
  peak.sites %>% dplyr::distinct(polyA_ID, .keep_all = TRUE) -> peak.sites
  n.updated.sites = nrow(peak.sites)
  message("There are ", n.updated.sites, " sites following duplicate removal")

  ## re-write the updated table
  write.table(peak.sites, file = output.file, sep="\t", quote = FALSE, row.names = FALSE)

} # End function

###################################################################
#'
#' Aggregate multiple peak count outputs together
#'
#' Aggregate the output from multiple runs of CountPeaks together. By default, this function will
#' update the cell barcodes in a format consistent with the CellRanger aggr program. That is, for
#' n experiments to aggregate, cell barcodes will be appended with '-1', '-2',...,'-n'. For downstream 
#' analysis, if using the PeakSeuratFromTransfer function, it is important to ensure that these match what is
#' in the gene count matrix. The expected separator character can be updated with the 'exp.sep'
#' parameter, and preferred labels can be specified with the 'exp.labels' parameter. The barcode updates can 
#' be turned off by setting update.labels = FALSE if manual relabelling is preferred.   
#'
#' @param peak.sites.file a file containing peak site coordinates
#' @param count.dirs a list of output directories from CountPeaks
#' @param output.dir output directory for aggregate count matrix
#' @param update.labels whether to append an experiment label to the cell barcode (default: TRUE)
#' @param exp.sep a character separating the cell barcode from experiment label (default: '-')
#' @param exp.labels optional labels to append to cell barcodes corresponding to count.dirs
#' @return NULL. Writes counts to file.
#' @examples
#' 
#' library(Sierra)
#' extdata_path <- system.file("extdata",package = "Sierra")
#' reference.file <- paste0(extdata_path,"/Vignette_cellranger_genes_subset.gtf")
#' junctions.file <- paste0(extdata_path,"/Vignette_example_TIP_sham_junctions.bed")
#' bamfile <- c(paste0(extdata_path,"/Vignette_example_TIP_sham.bam"),
#'             paste0(extdata_path,"/Vignette_example_TIP_mi.bam") )
#' whitelist.bc.file <- c(paste0(extdata_path,"/example_TIP_sham_whitelist_barcodes.tsv"),
#'                       paste0(extdata_path,"/example_TIP_MI_whitelist_barcodes.tsv"))
#'
#' ### Peak calling
#' peak.output.file <- c("Vignette_example_TIP_sham_peaks.txt",
#'                      "Vignette_example_TIP_MI_peaks.txt")
#' FindPeaks(output.file = peak.output.file[1],   # output filename
#'           gtf.file = reference.file,           # gene model as a GTF file
#'           bamfile = bamfile[1],                # BAM alignment filename
#'           junctions.file = junctions.file,     # BED filename of splice junctions existing in BAM file
#'           ncores = 1)                          # number of cores to use
#'
#'
#' FindPeaks(output.file = peak.output.file[2],   # output filename
#'           gtf.file = reference.file,           # gene model as a GTF file
#'           bamfile = bamfile[2],                # BAM alignment filename
#'           junctions.file = junctions.file,     # BED filename of splice junctions existing in BAM file
#'           ncores = 1)
#'
#' #### Peak merging
#' peak.dataset.table = data.frame(Peak_file = peak.output.file,
#'                                Identifier = c("TIP-example-Sham", "TIP-example-MI"),
#'                                stringsAsFactors = FALSE)
#'
#' peak.merge.output.file = "TIP_merged_peaks.txt"
#' MergePeakCoordinates(peak.dataset.table, output.file = peak.merge.output.file, ncores = 1)
#' 
#' count.dirs <- c("example_TIP_sham_counts", "example_TIP_MI_counts")
#' #sham data set
#' CountPeaks(peak.sites.file = peak.merge.output.file,  gtf.file = reference.file,
#'           bamfile = bamfile[1], whitelist.file = whitelist.bc.file[1],
#'           output.dir = count.dirs[1],  countUMI = TRUE, ncores = 1)
#'  # MI data set
#'  CountPeaks(peak.sites.file = peak.merge.output.file,  gtf.file = reference.file,
#'             bamfile = bamfile[2], whitelist.file = whitelist.bc.file[2],
#'             output.dir = count.dirs[2],  countUMI = TRUE, ncores = 1)
#'             
#'  out.dir <- "example_TIP_aggregate"
#'  AggregatePeakCounts(peak.sites.file = peak.merge.output.file, count.dirs = count.dirs,
#'                    exp.labels = c("Sham", "MI"), output.dir = out.dir)      
#'  
#' @export
#'
AggregatePeakCounts <- function(peak.sites.file, 
                                count.dirs, 
                                output.dir, 
                                update.labels = TRUE,
                                exp.sep = "-", 
                                exp.labels = NULL) {

  if (!is.null(exp.labels)) {
    if (length(exp.labels) != length(count.dirs)) {
      stop("number of count directories should equal number of labels")
    }
  }

  peak.table <- read.table(peak.sites.file, sep="\t", quote = '', header = TRUE, stringsAsFactors = FALSE)
  all.peaks <- peak.table$polyA_ID
  peaks.use <- all.peaks
  
  ## Get intersection of the peak IDs from each of the input files
  for (i in 1:length(count.dirs)) {
    this.dir <- count.dirs[i]
    
    ## First check if files are compressed
    file.list <- list.files(this.dir)
    if (sum(endsWith(file.list, ".gz")) == 3) {
      gzipped = TRUE
    } else{
      gzipped = FALSE
    }
    
    if (gzipped) {
      sites.file <- paste0(this.dir, "/sitenames.tsv.gz")
      peaks.con <- gzfile(sites.file)
      this.peak.set <- readLines(peaks.con)
      close(peaks.con)
    } else {
      sites.file <- paste0(this.dir, "/sitenames.tsv")
      this.peak.set <- readLines(sites.file)
    }
    
    peaks.use <- dplyr::intersect(peaks.use, this.peak.set)
    
    ## Check if peak IDs from the sites file are different to the count file
    if (length(dplyr::setdiff(all.peaks, this.peak.set)) > 0 | length(dplyr::setdiff(this.peak.set, all.peaks)) > 0) {
      warning(paste0("Some peaks in file ", this.dir, " do not match input peak coordinate file."))
    }
  }

  print(paste0("Aggregating counts for ", length(peaks.use), " peak coordinates"))
  aggregate.counts <- c()
  for (i in 1:length(count.dirs)) {
    this.dir <- count.dirs[i]
    this.data <- ReadPeakCounts(this.dir)

    
    if (update.labels) { ## update cell barcode names
      cell.names = colnames(this.data)
      pattern <- paste0("(.*)", exp.sep, ".*")
      barcodes = sub(pattern, "\\1", cell.names)
      if (is.null(exp.labels)) {
        cell.names.update = paste0(barcodes, exp.sep, i)
      } else{
        cell.names.update = paste0(barcodes, exp.sep, exp.labels[i])
      }
      colnames(this.data) = cell.names.update
    }
    
    this.data <- this.data[peaks.use, ]
    aggregate.counts <- cbind(aggregate.counts, this.data)
  }

  if (!dir.exists(output.dir)){
    dir.create(output.dir)
  }

  Matrix::writeMM(aggregate.counts, file = paste0(output.dir, "/matrix.mtx"))
  writeLines(colnames(aggregate.counts), paste0(output.dir, "/barcodes.tsv"))
  writeLines(rownames(aggregate.counts), paste0(output.dir, "/sitenames.tsv"))
  
  ## Compress the output files
  R.utils::gzip(paste0(output.dir, "/matrix.mtx"), overwrite = TRUE)
  R.utils::gzip(paste0(output.dir, "/barcodes.tsv"), overwrite = TRUE)
  R.utils::gzip(paste0(output.dir, "/sitenames.tsv"), overwrite = TRUE)

}
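
## Illustration of the barcode relabelling performed above (assumes the default
## exp.sep = "-" and exp.labels = c("Sham", "MI")):
# cell.names <- c("AAACCTGAGATCCGAG-1", "AAACCTGAGTTACGGG-1")
# barcodes <- sub("(.*)-.*", "\\1", cell.names)  # strip any existing suffix
# paste0(barcodes, "-", "Sham")  # -> "AAACCTGAGATCCGAG-Sham" "AAACCTGAGTTACGGG-Sham"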