#' Read coverage file
#'
#' Read coverage file produced by external tools like The Genome Analysis
#' Toolkit or by \code{\link{calculateBamCoverageByInterval}}.
#'
#' @param file Target coverage file.
#' @param format File format. If missing, derived from the file
#' extension. Currently GATK3 DepthofCoverage, GATK4 CollectFragmentCounts
#' (hdf5), and CNVkit formats supported.
#' @param zero Start position is 0-based. Default is \code{FALSE}
#' for GATK, \code{TRUE} for BED file based intervals.
#' @param read.length For output formats which do not provide both counts
#' and total coverages, approximate them using the specified read length.
#' @return A \code{data.frame} with the parsed coverage information.
#' @author Markus Riester
#' @seealso \code{\link{calculateBamCoverageByInterval}}
#' @examples
#'
#' tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz",
#' package = "PureCN")
#' coverage <- readCoverageFile(tumor.coverage.file)
#'
#' @importFrom tools file_ext
#' @importFrom rhdf5 H5Fopen
#' @importFrom data.table fread
#' @export readCoverageFile
readCoverageFile <- function(file, format, zero = NULL, read.length = 100) {
if (missing(format)) format <- .getFormat(file)
if (format %in% c("cnn", "cnr")) {
targetCoverage <- .readCoverageCnn(file, zero, format)
} else if (format %in% c("hdf5")) {
targetCoverage <- .readCoverageGatk4(file, zero, format, read.length)
} else {
targetCoverage <- .readCoverageGatk3(file, zero, read.length)
}
.checkLowCoverage(targetCoverage)
.checkIntervals(targetCoverage)
}
.getFormat <- function(file) {
ext <- file_ext(file)
if (ext %in% c("cnn", "cnr")) return(ext)
if (ext %in% c("hdf5", "h5")) return("hdf5")
"GATK"
}
.readCoverageGatk3 <- function(file, zero, read.length) {
if (!is.null(zero)) flog.warn("zero ignored for GATK coverage files.")
inputCoverage <- fread(file)
if (is.null(inputCoverage$total_coverage)) inputCoverage$total_coverage <- NA
if (is.null(inputCoverage$counts)) inputCoverage$counts <- inputCoverage$total_coverage / read.length
if (is.null(inputCoverage$on_target)) inputCoverage$on_target <- TRUE
if (is.null(inputCoverage$duplication_rate)) inputCoverage$duplication_rate <- NA
inputCoverage$on_target <- as.logical(inputCoverage$on_target)
targetCoverage <- GRanges(inputCoverage$Target,
coverage = inputCoverage$total_coverage,
average.coverage = NA,
counts = inputCoverage$counts,
on.target = inputCoverage$on_target,
duplication.rate = inputCoverage$duplication_rate)
targetCoverage <- .addAverageCoverage(targetCoverage)
targetCoverage
}
.readCoverageGatk4 <- function(file, zero, format, read.length) {
if (!is.null(zero)) flog.warn("zero ignored for GATK coverage files.")
x <- H5Fopen(file, flags = "H5F_ACC_RDONLY")
intervals <- data.frame(x$intervals$transposed_index_start_end)
intervals[, 1] <- x$intervals$indexed_contig_names[intervals[, 1] + 1]
targetCoverage <- GRanges(intervals[, 1], IRanges(intervals[, 2], intervals[, 3]))
targetCoverage$counts <- x$counts$values[,1]
# TODO
targetCoverage$coverage <- targetCoverage$counts * read.length * 2
targetCoverage <- .addAverageCoverage(targetCoverage)
targetCoverage$on.target <- TRUE
targetCoverage
}
.addAverageCoverage <- function(x) {
x$average.coverage <- x$coverage / width(x)
x
}
.readCoverageCnn <- function(file, zero, format = "cnn") {
if (is.null(zero)) zero <- TRUE
inputCoverage <- fread(file, sep = "\t")
if (zero) inputCoverage$start <- inputCoverage$start + 1
targetCoverage <- GRanges(inputCoverage)
targetCoverage$coverage <- targetCoverage$depth * width(targetCoverage)
targetCoverage$average.coverage <- targetCoverage$depth
targetCoverage$on.target <- TRUE
targetCoverage$depth <- NULL
targetCoverage$Gene <- targetCoverage$gene
targetCoverage$on.target[which(targetCoverage$Gene == "Background")] <- FALSE
targetCoverage$on.target[which(targetCoverage$Gene == "Antitarget")] <- FALSE
targetCoverage$gene <- NULL
targetCoverage$duplication.rate <- NA
if (format == "cnr") {
targetCoverage$log.ratio <- targetCoverage$log2
}
targetCoverage$log2 <- NULL
targetCoverage
}
.checkIntervals <- function(coverageGr) {
if (min(width(coverageGr))<2) {
flog.warn("Coverage data contains single nucleotide intervals.")
}
if (min(start(coverageGr))<1) {
.stopUserError("Interval coordinates should start at 1, not at 0")
}
ov <- findOverlaps(coverageGr, coverageGr)
dups <- duplicated(queryHits(ov))
if (sum(dups)) {
dupLines <- subjectHits(ov)[dups]
flog.warn("Found %i overlapping intervals, starting at line %i.",
sum(dups), dupLines[1])
coverageGr <- reduce(coverageGr)
}
targets <- as.character(coverageGr)
coverageGr <- sortSeqlevels(coverageGr)
coverageGr <- sort(coverageGr)
if (!identical(targets, as.character(coverageGr))) {
flog.warn("Target intervals were not sorted.")
}
# add fields that might be missing due to old PureCN versions
if (is.null(coverageGr$counts)) coverageGr$counts <- NA
if (is.null(coverageGr$on.target)) coverageGr$on.target <- TRUE
coverageGr
}
.checkLowCoverage <- function(coverage) {
chrsWithLowCoverage <- names(which(sapply(split(coverage$average.coverage,
as.character(seqnames(coverage))), mean, na.rm=TRUE) < 1))
if (length(chrsWithLowCoverage)>2) {
flog.warn("Multiple chromosomes with very low coverage: %s",
paste(chrsWithLowCoverage, collapse=","))
}
}
.addGCData <- function(tumor, interval.file, verbose = TRUE) {
tumor$mappability <- 1
tumor$reptiming <- NA
tumor$reptiming <- as.numeric(tumor$reptiming)
tumor$gc_bias <- NA
tumor$gc_bias <- as.numeric(tumor$gc_bias)
if (is.null(tumor$Gene)) tumor$Gene <- "."
targetGC <- readIntervalFile(interval.file)
ov <- findOverlaps(tumor, targetGC)
if (!identical(as.character(tumor), as.character(targetGC))) {
# if only a few intervals are missing, maybe because of some poor
# quality regions, we just ignore those, otherwise we stop because
# user probably used the wrong file for the assay
if (length(ov) < length(tumor) / 2) {
.stopUserError("tumor.coverage.file and interval.file do not align.")
} else {
flog.warn("tumor.coverage.file and interval.file do not align.")
}
}
if (!is.null(tumor$on.target)) {
if (!identical(tumor[queryHits(ov)]$on.target, targetGC[subjectHits(ov)]$on.target)) {
flog.warn("Intervals in coverage and interval.file have conflicting on/off-target annotation.")
tumor[queryHits(ov)]$on.target <- targetGC[subjectHits(ov)]$on.target
}
}
tumor[queryHits(ov)]$mappability <- targetGC[subjectHits(ov)]$mappability
tumor[queryHits(ov)]$reptiming <- targetGC[subjectHits(ov)]$reptiming
tumor[queryHits(ov)]$gc_bias <- targetGC[subjectHits(ov)]$gc_bias
tumor[queryHits(ov)]$Gene <- as.character(targetGC[subjectHits(ov)]$Gene)
tumor <- .checkSymbolsChromosome(tumor)
return(tumor)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.