R/binCoverage.R

#' bin CpG/CpH coverage to simplify & improve CNA "sketching" (e.g. for E-M) 
#' 
#' FIXME: turn this into a generic for bsseq objects, for bigWigs, for [...]
#' FIXME: figure out how to estimate the most likely GCbias ~ DNAme linkage
#'
#' @param x         a bsseq object, maybe HDF5-backed, for getCoverage()
#' @param bins      bins to summarize over (from tileGenome or QDNAseq.xxYY)
#' @param which     limit to specific regions? (functions as in import())
#' @param QDNAseq   return a QDNAseqReadCounts? (if FALSE, return a GRanges)
#' @param readLen   correction factor for coverage: read length in bp (100)
#'
#' @return          binned read counts
#'
#' @import GenomicRanges
#' @import Mus.musculus
#' @import Homo.sapiens
#' @import Biobase 
#' @import QDNAseq 
#' 
#' @export
binCoverage <- function(x, bins, which=NULL, QDNAseq=TRUE, readLen=100) {

  # turn QDNAseq bins into an annotated GRanges object 
  if (is(bins, "AnnotatedDataFrame") & # QDNAseq bins 
      !is.null(attr(bins, "QDNAseq")$build)) {
    message("Converting QDNAseq bins to GRanges for coverage")
    gr <- makeGRangesFromDataFrame(pData(bins), keep=TRUE)
    genome(gr) <- attr(bins, "QDNAseq")$build
  } else if (is(bins, "GenomicRanges")) {
    message("You really should use QDNAseq bins if you can.")
    gr <- bins
  } else { 
    stop("Don't know what to do with bins of class ", class(bins), "!")
  }
  origStyle <- seqlevelsStyle(gr)
  if (!is.null(which)) {
    seqlevelsStyle(gr) <- seqlevelsStyle(which)
    gr <- subsetByOverlaps(gr, which)
  }
  seqlevelsStyle(gr) <- seqlevelsStyle(x)
  names(gr) <- as(granges(gr), "character")
  summed <- getCoverage(x, gr, what="perRegionTotal", withDimnames=TRUE)
  summed <- round(summed/readLen) # mostly so plot titles don't look insane
  gc(,TRUE) # cautious
  gr$score <- summed
  attr(gr, "binned") <- TRUE
  if (QDNAseq) {
    seqlevelsStyle(gr) <- origStyle 
    names(gr) <- as(granges(gr), "character")
    phenodata <- data.frame(name=colnames(gr$score), 
                            row.names=colnames(gr$score),
                            stringsAsFactors=FALSE)
    phenodata$total.reads <- colSums(score(gr), na.rm=TRUE)
    phenodata$used.reads <- colSums(score(subset(gr, use)), na.rm=TRUE)
    object <- new("QDNAseqReadCounts", 
                  bins=subset(bins, featureNames(bins) %in% names(gr)),
                  counts=gr$score, phenodata=phenodata)
    object$expected.variance <- QDNAseq:::expectedVariance(object)
    annotation(object) <- paste("generated by biscuiteer", 
                                packageVersion("biscuiteer"))
    return(object) 
  } else { 
    return(subset(gr, rowSums(is.na(gr$score)) < ncol(x) & use == TRUE))
  }
}
ttriche/biscuitEater documentation built on May 15, 2019, 4:18 p.m.