R/calculate_expression_profile.R
In noisyr: Noise Quantification in High Throughput Sequencing Output

Documented in calculate_expression_profile

#' Calculate the expression profile of a gene
#' @description This function calculates the expression profile of an exon in a selection
#' of BAM files. The expression profile is defined as the number of reads overlapping
#' with each position of the exon's transcript.
#' @param gene The exon for which the expression profile is calculated; this should be
#' a row from the tibble generated by \code{\link{cast_gtf_to_genes}}; for a manual input,
#' a tibble with 1 row and named columns (seqid, start, end) would be needed
#' @param bams a vector of paths to the BAM files from which the profile is extracted
#' @param unique.only whether only uniquely mapped reads should contribute to the profile;
#' default is TRUE
#' @param mapq.unique The values of the mapping quality field in the BAM file that corresponds
#' to uniquely mapped reads; by default, values of 50 and 255 are used as these correspond to
#' the most popular aligners, but an adjustment might be needed
#' @param slack slack needs to be >=readLength, adjust for efficiency; the default is 200,
#' as it is higher than most modern sequencing experiments
#' @return The function outputs a list: the first element is a matrix of expression profiles.
#' Rows correspond to positions in the exon transcript and each column corresponds to an input
#' BAM file. Each read is counted for all the positions with which it overlaps (so a read of
#' length 100 that completely overlaps with the exon would be counted for all 100 positions).
#' The second list element is a vector of raw expression of the gene in the different BAM files
#' @export
#' @examples
#' bams <- rep(system.file("extdata", "ex1.bam", package="Rsamtools", mustWork=TRUE), 2)
#' genes <- data.frame("id" = 1:2,
#'                     "gene_id" = c("gene1", "gene2"),
#'                     "seqid" = c("seq1", "seq2"),
#'                     "start" = 1,
#'                     "end" = 100)
#' profile <- calculate_expression_profile(
#'   gene = genes[1,],
#'   bams = bams,
#'   mapq.unique = 99
#' )
#'
#' ggplot2::ggplot(tibble::tibble(y = profile$profile[,1],
#'                                x = seq_along(y))) +
#' ggplot2::geom_bar(ggplot2::aes(x, y), stat = "identity") +
#' ggplot2::theme_minimal()
calculate_expression_profile = function(gene,
                                        bams,
                                        unique.only=TRUE,
                                        mapq.unique=c(50,255),
                                        slack=200){
  profile <- base::matrix(0,
                          nrow=gene$end-gene$start+1,
                          ncol=base::length(bams))
  base::colnames(profile) <- bams

  gr <- GenomicRanges::GRanges(seqnames = gene$seqid,
                               ranges = IRanges::IRanges(start=gene$start-slack,
                                                         end=gene$end))
  params <- Rsamtools::ScanBamParam(which=gr, what=Rsamtools::scanBamWhat())
  expression.vector <- base::vector(mode="double", length=base::length(bams))
  for(j in base::seq_len(base::length(bams))){
    bamIndex <- base::paste0(bams[j], ".bai")
    bamFile <- Rsamtools::BamFile(bams[j], bamIndex)
    aln <- Rsamtools::scanBam(bamFile, param = params)[[1]]
    reads <- tibble::tibble(pos=aln$pos-gene$start+1,
                            qwidth=aln$qwidth,
                            mapq=aln$mapq)
    if(unique.only){
      mapq=NULL
      reads <- dplyr::filter(reads, mapq %in% mapq.unique)
    }
    expression.vector[j] <- base::nrow(reads)
    if(base::nrow(reads)>0){
      for(r in base::seq_len(base::nrow(reads))){
        add <- base::seq(from=reads$pos[r], length.out=reads$qwidth[r])
        add <- add[add>0 & add<=base::nrow(profile)]
        profile[add, j] <- profile[add, j] + 1
        if(base::is.na(base::sum(profile[,j]))){break}
      }
    }
  }
  return(base::list("profile"=profile, "expression"=expression.vector))
}