R/generateBedEcdf.R

#' generateBedEcdf
#'
#' @description
#' This function computes empirical cumulative distribution functions (eCDF) for
#' per-read beta values of the sequencing reads.
#'
#' @details
#' The function matches reads (for paired-end sequencing alignment files - read
#' pairs as a single entity) to the genomic
#' regions provided in a BED file/\code{\linkS4class{GRanges}} object, computes
#' average per-read beta values according to the cytosine context parameter
#' `ecdf.context`, and returns a list of eCDFs for within- and out-of-context
#' average per-read beta values, which can be used for plotting.
#' 
#' The resulting eCDFs and their plots can be used to characterise the
#' methylation pattern of a particular genomic region, e.g. if reads that match
#' to that region are methylated in an "all-CpGs-or-none" manner or if some
#' intermediate methylation levels are more frequent.
#'
#' @param bam BAM file location string OR preprocessed output of
#' \code{\link[epialleleR]{preprocessBam}} function. Read more about BAM file
#' requirements and BAM preprocessing at \code{\link{preprocessBam}}.
#' @param bed Browser Extensible Data (BED) file location string OR object of
#' class \code{\linkS4class{GRanges}} holding genomic coordinates for
#' regions of interest. It is used to match sequencing reads to the genomic
#' regions prior to eCDF computation. The style of seqlevels of BED file/object
#' must match the style of seqlevels of the BAM file/object used.
#' @param bed.type character string for the type of assay that was used to
#' produce sequencing reads:
#' \itemize{
#'   \item "amplicon" (the default) -- used for amplicon-based next-generation
#'   sequencing when exact coordinates of sequenced fragments are known.
#'   Matching of reads to genomic ranges are then performed by the read's start
#'   or end positions, either of which should be no further than
#'   `match.tolerance` bases away from the start or end position of genomic
#'   ranges given in BED file/\code{\link[GenomicRanges]{GRanges}} object
#'   \item "capture" -- used for capture-based next-generation sequencing when
#'   reads partially overlap with the capture target regions. Read is considered
#'   to match the genomic range when their overlap is more or equal to
#'   `match.min.overlap`. If read matches two or more BED genomic regions, only
#'   the first match is taken (input \code{\link[GenomicRanges]{GRanges}} are
#'   \strong{not} sorted internally)
#' }
#' @param bed.rows integer vector specifying what `bed` regions should be
#' included in the output. If `c(1)` (the default), then function returns eCDFs
#' for the first region of `bed`, if NULL - eCDF functions for all `bed`
#' genomic regions as well as for the reads that didn't match any of the regions
#' (last element of the return value; only if there are such reads).
#' @param zero.based.bed boolean defining if BED coordinates are zero based
#' (default: FALSE).
#' @param match.tolerance integer for the largest difference between read's and
#' BED \code{\link[GenomicRanges]{GRanges}} start or end positions during
#' matching of amplicon-based NGS reads (default: 1).
#' @param match.min.overlap integer for the smallest overlap between read's and
#' BED \code{\link[GenomicRanges]{GRanges}} start or end positions during
#' matching of capture-based NGS reads (default: 1). If read matches two or more
#' BED genomic regions, only the first match is taken (input
#' \code{\link[GenomicRanges]{GRanges}} are \strong{not} sorted internally).
#' @param ecdf.context string defining cytosine methylation context used
#' for computing within-the-context and out-of-context eCDFs:
#' \itemize{
#'   \item "CG" (the default) -- within-the-context: CpG cytosines (called as
#'   zZ), out-of-context: all the other cytosines (hHxX)
#'   \item "CHG" -- within-the-context: CHG cytosines (xX), out-of-context: hHzZ
#'   \item "CHH" -- within-the-context: CHH cytosines (hH), out-of-context: xXzZ
#'   \item "CxG" -- within-the-context: CG and CHG cytosines (zZxX),
#'   out-of-context: CHH cytosines (hH)
#'   \item "CX" -- all cytosines are considered within-the-context
#' }
#' @param ... other parameters to pass to the
#' \code{\link[epialleleR]{preprocessBam}} function.
#' Options have no effect if preprocessed BAM data was supplied as an input.
#' @param verbose boolean to report progress and timings (default: TRUE).
#' @return list with a number of elements equal to the length of `bed.rows` (if
#' not NULL), or to the number of genomic regions within `bed` (if 
#' `bed.rows==NULL`) plus one item for all reads not matching `bed` genomic
#' regions (if any). Every list item is a list on it's own, consisting of two
#' eCDF functions for within- and out-of-context per-read beta values.
#' @seealso \code{\link{preprocessBam}} for preloading BAM data,
#' \code{\link{generateCytosineReport}} for methylation statistics at the level
#' of individual cytosines, \code{\link{generateBedReport}} for genomic
#' region-based statistics, \code{\link{generateVcfReport}} for evaluating
#' epiallele-SNV associations, \code{\link{extractPatterns}} for exploring
#' methylation patterns, and `epialleleR` vignettes for the description of
#' usage and sample data.
#' @examples
#'   # amplicon data
#'   amplicon.bam <- system.file("extdata", "amplicon010meth.bam",
#'                               package="epialleleR")
#'   amplicon.bed <- system.file("extdata", "amplicon.bed",
#'                               package="epialleleR")
#'   
#'   # let's compute eCDF
#'   amplicon.ecdfs <- generateBedEcdf(bam=amplicon.bam, bed=amplicon.bed,
#'                                     bed.rows=NULL)
#'   
#'   # there are 5 items in amplicon.ecdfs, let's plot them all
#'   par(mfrow=c(1,length(amplicon.ecdfs)))
#'   
#'   # cycle through items
#'   for (x in 1:length(amplicon.ecdfs)) {
#'     # four of them have names corresponding to amplicon.bed genomic regions, 
#'     # fifth - NA for all the reads that don't match to any of those regions
#'     main <- if (is.na(names(amplicon.ecdfs[x]))) "unmatched"
#'             else names(amplicon.ecdfs[x])
#'     
#'     # plotting eCDF for within-the-context per-read beta values (in red)
#'     plot(amplicon.ecdfs[[x]]$context, col="red", verticals=TRUE,
#'          do.points=FALSE, xlim=c(0,1), xlab="per-read beta value",
#'          ylab="cumulative density", main=main)
#'     
#'     # adding eCDF for out-of-context per-read beta values (in blue)
#'     plot(amplicon.ecdfs[[x]]$out.of.context, add=TRUE, col="blue",
#'          verticals=TRUE, do.points=FALSE)
#'   }
#'   
#'   # recover default plotting parameters
#'   par(mfrow=c(1,1))
#'   
#' @export
generateBedEcdf <- function (bam,
                             bed,
                             bed.type=c("amplicon", "capture"),
                             bed.rows=c(1),
                             zero.based.bed=FALSE,
                             match.tolerance=1,
                             match.min.overlap=1,
                             ecdf.context=c("CG", "CHG", "CHH", "CxG", "CX"),
                             ...,
                             verbose=TRUE)
{
  bed.type     <- match.arg(bed.type, bed.type)
  ecdf.context <- match.arg(ecdf.context, ecdf.context)
  
  if (!methods::is(bed, "GRanges"))
    bed <- .readBed(bed.file=bed, zero.based.bed=zero.based.bed,
                    verbose=verbose)

  bam <- preprocessBam(bam.file=bam, ..., verbose=verbose)
  
  ecdf.list <- .getBedEcdf(
    bam.processed=bam, bed=bed, bed.type=bed.type, bed.rows=bed.rows,
    match.tolerance=match.tolerance, match.min.overlap=match.min.overlap,
    ctx.meth=.context.to.bases[[ecdf.context]][["ctx.meth"]],
    ctx.unmeth=.context.to.bases[[ecdf.context]][["ctx.unmeth"]],
    ooctx.meth=.context.to.bases[[ecdf.context]][["ooctx.meth"]],
    ooctx.unmeth=.context.to.bases[[ecdf.context]][["ooctx.unmeth"]],
    verbose=verbose
  )
  
  return(ecdf.list)
}
BBCG/epialleleR documentation built on March 24, 2024, 11:32 p.m.