R/binCoverage.R
In biscuiteer: Convenience Functions for Biscuit

Documented in binCoverage

#' Bin CpG or CpH coverage to simplify and improve CNA "sketching"
#'
#' Example usage for E-M
#'
#' NOTE: As of early Sept 2019, QDNAseq did not have hg38 capabilities. If you
#' desire to use the hg38 genome, biscuiteer suggests you use a GRanges object
#' to define your bins.
#'
#' @param bsseq    A bsseq object - supplied to getCoverage()
#' @param bins     Bins to summarize over - from tileGenome or QDNAseq.xxYY
#' @param which    Limit to specific regions? - functions as an import()
#'                   (DEFAULT: NULL)
#' @param QDNAseq  Return a QDNAseqReadCounts? - if FALSE, returns a GRanges
#'                   (DEFAULT: TRUE)
#' @param readLen  Correction factor for coverage - read length in bp
#'                   (DEFAULT: 100)
#'
#' @return         Binned read counts
#'
#' @importFrom methods as is new
#' @importFrom Biobase featureNames
#' @importFrom utils packageVersion
#' @import BiocGenerics
#' @import GenomeInfoDb
#' @import GenomicRanges
#' @import Mus.musculus
#' @import Homo.sapiens
#' @import QDNAseq
#'
#' @examples
#'
#'   bins <- GRanges(seqnames = rep("chr11",10),
#'                   strand = rep("*",10),
#'                   ranges = IRanges(start=100000*0:9, width=100000)
#'                  )
#'
#'   reg <- GRanges(seqnames = rep("chr11",5),
#'                  strand = rep("*",5),
#'                  ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7),
#'                                   end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7))
#'                  )
#'
#'   orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz",
#'                           package="biscuiteer")
#'   orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz",
#'                           package="biscuiteer")
#'   bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf,
#'                       merged = FALSE)
#'
#'   bc <- binCoverage(bsseq = bisc, bins = bins, which = reg, QDNAseq = FALSE)
#'
#' @export
#'
binCoverage <- function(bsseq,
                        bins,
                        which = NULL,
                        QDNAseq = TRUE,
                        readLen = 100) {
  # TODO: turn this into a generic for bsseq objects, for bigWigs, for [...]
  # TODO: figure out how to estimate the most likely GCbias ~ DNAme linkage

  # Check input types
  stopifnot(is.logical(QDNAseq))
  stopifnot(is.numeric(readLen))

  # turn QDNAseq bins into an annotated GRanges object 
  if (is(bins, "AnnotatedDataFrame") & # QDNAseq bins 
      !is.null(attr(bins, "QDNAseq")$build)) {
    message("Converting QDNAseq bins to GRanges for coverage")
    gr <- makeGRangesFromDataFrame(pData(bins), keep.extra.columns=TRUE)
    genome(gr) <- attr(bins, "QDNAseq")$build
  } else if (is(bins, "GenomicRanges")) {
    message("You really should use QDNAseq bins IF you can.")
    message("Output will be a GRanges object.")
    gr <- bins
  } else { 
    stop("Don't know what to do with bins of class ", class(bins), "!")
  }
  origStyle <- seqlevelsStyle(gr)
  if (!is.null(which)) {
    stopifnot(is(which, "GenomicRanges"))
    seqlevelsStyle(gr) <- seqlevelsStyle(which)
    gr <- subsetByOverlaps(gr, which)
  }
  seqlevelsStyle(gr) <- seqlevelsStyle(bsseq)
  names(gr) <- as(granges(gr), "character")
  summed <- getCoverage(bsseq, gr, what="perRegionTotal", withDimnames=TRUE)
  summed <- round(summed/readLen) # mostly so plot titles don't look insane
  gc(,TRUE) # cautious
  gr$score <- summed
  attr(gr, "binned") <- TRUE
  if (QDNAseq & is(bins, "AnnotatedDataFrame")) {
    seqlevelsStyle(gr) <- origStyle 
    names(gr) <- as(granges(gr), "character")
    phenodata <- data.frame(name=colnames(gr$score), 
                            row.names=colnames(gr$score),
                            stringsAsFactors=FALSE)
    phenodata$total.reads <- colSums(score(gr), na.rm=TRUE)
    if (!exists("use")) use <- seq_len(length(gr))
    phenodata$used.reads <- colSums(score(subset(gr, use)), na.rm=TRUE)
    object <- new("QDNAseqReadCounts", 
                  bins=subset(bins, featureNames(bins) %in% names(gr)),
                  counts=gr$score, phenodata=phenodata)
    object$expected.variance <- QDNAseq:::expectedVariance(object)
    annotation(object) <- paste("generated by biscuiteer", 
                                packageVersion("biscuiteer"))
    return(object) 
  } else { 
    if (!exists("use")) use <- seq_len(length(gr))
    return(subset(gr, rowSums(is.na(gr$score)) < ncol(bsseq) & use))
  }
}