R/read_dist.R

Defines functions readGenomeDist readLengthDist

Documented in readGenomeDist readLengthDist

# read distribution

#' Read length distribution
#'
#' @description Read length distribution.
#'
#' @param bam A \code{GAlignments} object of aligned reads. (Required).
#'
#' @return A \code{data.frame} with 3 columns (\code{readLen}, \code{count}, and \code{pct})
#' of read length, number of reads with this length, and percentage of reads with this length.
#'
#' @export
#'
#' @importFrom methods is
#' @importFrom dplyr %>% group_by summarize n mutate
#'
readLengthDist = function(bam) {
  # sanity check
  if(!is(bam, 'GAlignments')) {
    stop('bam must be a GAlignments object.')
  }

  # read length distribution
  dist = data.frame(
    readLen = qwidth(bam)
  )
  dist = dist %>% group_by(readLen) %>% summarize(count = n()) %>%
    mutate(pct = 100 * count / sum(count)) %>% as.data.frame()

  return(dist)
}

#' Genomic feature distribution of aligned reads
#'
#' @description Genomic feature distribution of aligned reads. Reads are assigned to a feature
#' based on the following priority: CDS, UTR, Intron, and Intergenic.
#'
#' @param bam A \code{GAlignments} or \code{GRanges} object of aligned reads. (Required).
#' @param txdb A \code{TxDb} object of genome annotation. See \code{GenomicFeatures} package
#' for more details. (Required).
#' @param category A vector of characters. By default, distribution of reads on CDS, UTR,
#' intron, and intergenic regions is calculated. Must be selected from
#' \code{c('CDS', 'UTR', 'Intron', 'Intergenic')}. (Default: c('CDS', 'UTR', 'Intron',
#' 'Intergenic')).
#' @param fiveEndOnly A logical variable indicating if only keeping the 5'-ends of reads.
#' This option can be used to decrease the ambiguity for reads spanning multiple features.
#' (Default: TRUE).
#' @param ignoreStrand A logical variable indicating if ignoring that reads and features must
#' be on the same strand (Default: TRUE).
#'
#' @return A list of two elements. The first is a character vector indicating the genomic
#' feature assigned for each read. The order is the same as the input bam. The second element
#' is a \code{data.frame} with 3 columns (\code{feature}, \code{count}, and \code{pct}) of
#' genomic feature, number of reads falling into this feature, and percentage of reads falling
#' into this feature.
#'
#' @export
#'
#' @importFrom methods is as
#' @importFrom IRanges resize reduce countOverlaps
#' @importFrom GenomicRanges GRanges
#' @importFrom GenomicFeatures genes exons cds fiveUTRsByTranscript threeUTRsByTranscript
#' intronsByTranscript
#' @importFrom dplyr %>% group_by summarize n mutate
#'
readGenomeDist = function(bam, txdb, category=c('CDS', 'UTR', 'Intron', 'Intergenic'),
  fiveEndOnly=TRUE, ignoreStrand=TRUE) {
  # sanity check
  if(!is(bam, 'GAlignments') & !is(bam, 'GRanges')) {
    stop('bam must be a GAlignments or GRanges object.')
  }
  if(!is(txdb, 'TxDb')) {
    stop('txdb must be a TxDb object.')
  }
  if(!all(category %in% c('CDS', 'UTR', 'Intron', 'Intergenic'))) {
    stop('category must be selected from c(\'CDS\', \'UTR\', \'Intron\', \'Intergenic\')')
  }
  if(!is.logical(fiveEndOnly)) {
    stop('fiveEndOnly must be a logical variable.')
  }
  if(!is.logical(ignoreStrand)) {
    stop('ignoreStrand must be a logical variable.')
  }

  # keep only 5'-end to minimize a read overlapping multiple features
  bamGR = as(bam, 'GRanges')
  if(fiveEndOnly) {
    bamGR = resize(as(bam, 'GRanges'), width=1)
  }

  # extract each feature
  geneGR = suppressMessages(reduce(genes(txdb))) # gene
  cdsGR = suppressMessages(reduce(cds(txdb))) # CDS
  fiveUTRGR = suppressMessages(reduce(unlist(fiveUTRsByTranscript(txdb)))) # 5'-UTR
  threeUTRGR = suppressMessages(reduce(unlist(threeUTRsByTranscript(txdb)))) # 3'-UTR
  utrGR = reduce(c(fiveUTRGR, threeUTRGR)) # UTR
  intronGR = suppressMessages(reduce(unlist(intronsByTranscript(txdb)))) # introns

  # internal category counter
  # UA (unassigned) means read has not been assigned to a feature
  categoryIndex = rep('UA', length(bamGR))

  # overlap with features
  # reads are first matched to CDS, then
  # unassigned reads are matched to UTR, then
  # unassigned reads are matched to Intron, then
  # unassigned reads are matched to Intergenic (because reads do not match genes).
  categoryIndex[categoryIndex == 'UA'][countOverlaps(bamGR[categoryIndex == 'UA'], cdsGR,
    ignore.strand=ignoreStrand) > 0] = 'CDS'
  categoryIndex[categoryIndex == 'UA'][countOverlaps(bamGR[categoryIndex == 'UA'], utrGR,
    ignore.strand=ignoreStrand) > 0] = 'UTR'
  categoryIndex[categoryIndex == 'UA'][countOverlaps(bamGR[categoryIndex == 'UA'], intronGR,
    ignore.strand=ignoreStrand) > 0] = 'Intron'
  categoryIndex[categoryIndex == 'UA'][countOverlaps(bamGR[categoryIndex == 'UA'], geneGR,
    ignore.strand=ignoreStrand) > 0] = 'GeneOther'
  categoryIndex[categoryIndex == 'UA'] = 'Intergenic'

  # to summary dataframe
  features = categoryIndex[categoryIndex %in% category]
  features = as.data.frame(table(features))
  colnames(features) = c('feature', 'count')
  features = features %>% mutate(pct = 100 * count / sum(count))

  return(list(feature=categoryIndex, summary=features))
}
nzhang89/RiboSeeker documentation built on April 15, 2022, 10:18 a.m.