inst/extdata/Coverage.R

suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(futile.logger))
suppressPackageStartupMessages(library(BiocParallel))

### Parsing command line ------------------------------------------------------

option_list <- list(
    make_option(c("--bam"), action = "store", type = "character", 
        default = NULL, help = "Input BAM file"),
    make_option(c("--bai"), action = "store", type = "character", 
        default = NULL,
        help = "BAM index file. Only necessary for non-standard file naming."),
    make_option(c("--coverage"), action = "store", type = "character", 
        default = NULL,
        help = "Input coverage file (supported file formats are GATK and CNVkit)"),
    make_option(c("--intervals"), action = "store", type = "character", default = NULL,
        help = "Interval file as generated by IntervalFile.R"),
    make_option(c("--keepduplicates"), action = "store_true", 
        default = formals(PureCN::calculateBamCoverageByInterval)$keep.duplicates, 
        help = "Count reads marked as duplicates [default %default]"),
    make_option(c("--removemapq0"), action = "store_true", default = FALSE, 
        help = "Not count reads marked with mapping quality 0 [default %default]"),
    make_option(c("--outdir"), action = "store", type = "character", 
        default = NULL,
        help = "Output directory to which results should be written"),
    make_option(c("--parallel"), action = "store_true", default = FALSE,
        help = "Use BiocParallel to calculate coverage in parallel whem --bam is a list of BAM files."),
    make_option(c("--cores"), action = "store", type = "integer", default = 1,
        help = "Number of CPUs to use when --bam is a list of BAM files [default %default]"),
    make_option(c("--seed"), action = "store", type = "integer", 
        default = NULL,
        help = "Seed for random number generator [default %default]"),
    make_option(c("-v", "--version"), action = "store_true", default = FALSE, 
        help = "Print PureCN version"),
    make_option(c("-f", "--force"), action = "store_true", default = FALSE, 
        help = "Overwrite existing files")
)

opt <- parse_args(OptionParser(option_list=option_list))

if (opt$version) {
    message(as.character(packageVersion("PureCN")))
    q(status = 1)
}    

if (!is.null(opt$seed)) {
    set.seed(opt$seed)
}

force <- opt$force

bam.file <- opt$bam
index.file <- opt$bai

gatk.coverage <- opt$coverage
interval.file <- opt$intervals

if (is.null(opt$outdir)) stop("Need --outdir")

outdir <- normalizePath(opt$outdir, mustWork = TRUE)
if (file.access(dirname(outdir), 2) < 0) stop("Permission denied to write in --outdir.")

interval.file <- normalizePath(interval.file, mustWork = TRUE)

### Calculate coverage from BAM files -----------------------------------------

.checkFileList <- function(file) {
    files <- read.delim(file, as.is = TRUE, header = FALSE)[,1]
    numExists <- sum(file.exists(files), na.rm = TRUE)
    if (numExists < length(files)) { 
        stop("File not exists in file ", file)
    }
    files
}

checkDataTableVersion <- function() {
    if (compareVersion(package.version("data.table"), "1.12.4") < 0) {
        flog.fatal("data.table package is outdated. >= 1.12.4 required")
        q(status = 0)
    }
    return(TRUE)
}        

getCoverageBams <- function(bamFiles, indexFiles, outdir, interval.file, 
    force = FALSE, keep.duplicates = FALSE, removemapq0 = FALSE) {

    bamFiles <- bamFiles
    indexFiles <- indexFiles
    outdir <- outdir
    interval.file <- interval.file
    force <- force

    .getCoverageBam <- function(bam.file, index.file = NULL, outdir,
        interval.file, force) {
        checkDataTableVersion()
        output.file <- file.path(outdir,  gsub(".bam$", "_coverage.txt.gz", 
            basename(bam.file)))
        futile.logger::flog.info("Processing %s...", output.file)
        if (!is.null(index.file)) {
            index.file <- normalizePath(index.file, mustWork=TRUE)
            index.file <- sub(".bai$", "", index.file)
        } else if (file.exists(sub("bam$", "bai", bam.file))) {
            index.file <- sub(".bam$", "", bam.file)
        } else {
            index.file <- bam.file
        }
        if (file.exists(output.file) && !force) {
            futile.logger::flog.info("%s exists. Skipping... (--force will overwrite)", output.file)
        } else {
            PureCN::calculateBamCoverageByInterval(bam.file = bam.file,
                interval.file = interval.file, output.file = output.file,
                index.file = index.file, keep.duplicates = keep.duplicates,
                mapqFilter = if (removemapq0) 1 else NA)
        }
        output.file
    }
    BPPARAM <- NULL
    if (!is.null(opt$cores) && opt$cores > 1) {
        suppressPackageStartupMessages(library(BiocParallel))
        BPPARAM <- MulticoreParam(workers = opt$cores)
        flog.info("Using BiocParallel MulticoreParam backend with %s cores.", opt$cores)
    } else if (opt$parallel) {
        suppressPackageStartupMessages(library(BiocParallel))
        BPPARAM <- bpparam()
        flog.info("Using default BiocParallel backend. You can change the default in your ~/.Rprofile file.") 
    }
   
    if (!is.null(BPPARAM) && length(bamFiles) > 1) {
        coverageFiles <- unlist(
            bplapply(seq_along(bamFiles), 
                function(i) .getCoverageBam(bamFiles[i], indexFiles[i], outdir, interval.file, force), 
                BPPARAM=BPPARAM)
        )
    } else {
        coverageFiles <- 
            sapply(seq_along(bamFiles), 
                function(i) .getCoverageBam(bamFiles[i], indexFiles[i], outdir, interval.file, force))
    }

    coverageFiles
}

coverageFiles <- NULL
indexFiles <- NULL

flog.info("Loading PureCN %s...", Biobase::package.version("PureCN"))

suppressPackageStartupMessages(library(PureCN))
    
if (!is.null(bam.file)) {
    bam.file <- normalizePath(bam.file, mustWork=TRUE)
    if (grepl(".list$", bam.file)) {
        bamFiles <- .checkFileList(bam.file)
        if (!is.null(index.file)) { 
            if(!grepl(".list$", index.file)) {
                stop("list of BAM files requires list of BAI files.")
            }
            indexFiles <- .checkFileList(index.file)   
        }
    } else {
        bamFiles <- bam.file
    }    
    if (length(bamFiles) != length(indexFiles) && !is.null(indexFiles)) {
        stop("List of BAM files and BAI files of different length.")
    }    

    coverageFiles <- getCoverageBams(bamFiles, indexFiles, outdir, 
        interval.file, force, opt$keepduplicates, opt$removemapq0) 
}

### GC-normalize coverage -----------------------------------------------------
.gcNormalize <- function(gatk.coverage, interval.file, outdir, force) {
    checkDataTableVersion()
    output.file <- file.path(outdir,  gsub(".txt$|.txt.gz$|_interval_summary",
        "_loess.txt.gz", basename(gatk.coverage)))
    outpng.file <- sub("txt.gz$", "png", output.file)
    output.qc.file <- sub(".txt.gz$", "_qc.txt", output.file)

    if (file.exists(output.file) && !force) {
        flog.info("%s exists. Skipping... (--force will overwrite)", output.file)
    } else {
        png(outpng.file, width = 800, height = 800)
        correctCoverageBias(gatk.coverage, interval.file,
            output.file = output.file, output.qc.file = output.qc.file, 
                plot.bias = TRUE)
        invisible(dev.off())
   }
}

if (!is.null(gatk.coverage) || !is.null(coverageFiles)) {
    # started not from BAMs?
    if (is.null(coverageFiles)) {
        if (grepl(".list$", gatk.coverage)) {
            coverageFiles <- .checkFileList(gatk.coverage)
        } else {
            coverageFiles <- gatk.coverage
        }
    }
    for (gatk.coverage in coverageFiles)     
        .gcNormalize(gatk.coverage, interval.file, outdir, force)
}

Try the PureCN package in your browser

Any scripts or data that you put into this service are public.

PureCN documentation built on Nov. 8, 2020, 5:37 p.m.