inst/extdata/BenchmarkTumorOnly.R

# Simple helper script that benchmarks predictSomatic using a tumor/normal VCF

suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(futile.logger))
suppressPackageStartupMessages(library(pROC))

### Parsing command line ------------------------------------------------------

option_list <- list(
    make_option(c("--rds"), action="store", type="character", default=NULL,
        help="Input: PureCN output RDS file"),
    make_option(c("--vcf"), action = "store", type = "character", default = NULL,
                help = "Input: VCF file containing SOMATIC calls"),
    make_option(c("--callable"), action="store", type="character", default=NULL,
        help="TMB: File parsable by rtracklayer specifying all callable regions, e.g. generated by GATK CallableLoci"),
    make_option(c("--keepflagged"), action = "store_true", default=FALSE,
        help = "Keep variants flagged by predictSomatic [default %default]"),
    make_option(c("--keepdb"), action = "store_true", default=FALSE,
        help = "Keep variants in dbSNP [default %default]"),
    make_option(c("--callcutoff"), action="store", type="double", default=0.8,
        help="Posterior probability cutoff to include variants [default %default]"),
    make_option(c("--out"), action="store", type="character", default=NULL,
        help="File name prefix to which results should be written"),
    make_option(c("-v", "--version"), action="store_true", default=FALSE, 
        help="Print PureCN version"),
    make_option(c("-f", "--force"), action="store_true", default=FALSE, 
        help="Overwrite existing files")
)

opt <- parse_args(OptionParser(option_list=option_list))

if (opt$version) {
    message(as.character(packageVersion("PureCN")))
    q(status = 0)
}    

# Parse input rds
infileRds <- opt$rds
if (is.null(infileRds)) stop("Need --rds")
infileRds <- normalizePath(infileRds, mustWork=TRUE)

# Parse both BED files restricting covered region
callableFile <- opt$callable
callable <- NULL
if (!is.null(callableFile)) {
    suppressPackageStartupMessages(library(rtracklayer))
    callableFile <- normalizePath(callableFile, mustWork=TRUE)
    flog.info("Reading %s...", callableFile)
    callable <- GRanges(import(callableFile))
}

### Run PureCN ----------------------------------------------------------------

flog.info("Loading PureCN...")
suppressPackageStartupMessages(library(PureCN))

res <- readCurationFile(infileRds)
sampleid <- res$input$sampleid

.getOutPrefix <- function(opt, infile, sampleid) {
    out <- opt[["out"]]
    if (is.null(out)) {
        if (!is.null(infile) && file.exists(infile)) {
            outdir <- dirname(infile)
            prefix <- sampleid
        } else {
            stop("Need --out")
        }    
    } else {
        outdir <- dirname(out)
        prefix <- basename(out)
    }    
    outdir <- normalizePath(outdir, mustWork=TRUE)
    outPrefix <- file.path(outdir, prefix)
}
outPrefix <- .getOutPrefix(opt, infileRds, sampleid)
     
outfile <- paste0(outPrefix, "_tumor_only_benchmark.csv")


if (!opt$force && file.exists(outfile)) {
    stop(outfile, " exists. Use --force to overwrite.")
}

flog.info("Reading %s...", basename(opt$vcf))
vcf <- PureCN:::.readAndCheckVcf(opt$vcf, genome = genome(res$input$vcf)[1])

flog.info("Classifying germline vs. somatic...")
    
p <- GRanges(predictSomatic(res))

if (!is.null(callable)) {
    p <- p[overlapsAny(p, callable)]
    vcf <- vcf[overlapsAny(vcf, callable)]
}

p$SOMATIC <- NA
ov <- findOverlaps(p, vcf)
p$SOMATIC[queryHits(ov)] <- info(vcf)$SOMATIC[subjectHits(ov)]

if (!opt$keepflagged) {
    p <- p[!p$FLAGGED]
}

if (!opt$keepdb) {
    p <- p[p$prior.somatic > 0.1]
}

idxCalled <- p$POSTERIOR.SOMATIC <= 1-opt$callcutoff | p$POSTERIOR.SOMATIC >= opt$callcutoff
callRate <- sum(idxCalled)/length(idxCalled)
callsSomatic <- table(p$POSTERIOR.SOMATIC[idxCalled] >= opt$callcutoff & p$SOMATIC[idxCalled] | !p$SOMATIC[idxCalled])
callsGermline <- table(p$POSTERIOR.SOMATIC[idxCalled] <= 1 - opt$callcutoff & !p$SOMATIC[idxCalled] | p$SOMATIC[idxCalled])
callsSomatic <- callsSomatic[["TRUE"]]/sum(callsSomatic)
callsGermline <- callsGermline[["TRUE"]]/sum(callsGermline)

x <- data.frame(
    Sampleid = sampleid,
    Purity = res$results[[1]]$purity,
    Ploidy = res$results[[1]]$ploidy,
    Num.Variants = length(p),
    Median.Coverage = median(p$depth, na.rm = TRUE),
    AUC.AR = auc(p$SOMATIC, p$AR),
    AUC.POSTERIOR.SOMATIC = auc(p$SOMATIC, p$POSTERIOR.SOMATIC),
    Callrate = callRate,
    Accuracy.Somatic = callsSomatic,
    Accuracy.Germline = callsGermline
)

write.csv(x, file = outfile, row.names = FALSE, quote = FALSE)


x <- p[idxCalled & p$SOMATIC != p$ML.SOMATIC]
loh <- GRanges(callLOH(res))
mcols(x) <- cbind(mcols(x), mcols(loh[findOverlaps(x, loh, select="first")]))
x$C <- NULL
x$M <- NULL
x$M.flagged <- NULL

outfile2 <- paste0(outPrefix, "_tumor_only_benchmark_wrong_calls.csv")
write.csv(as.data.frame(x), file = outfile2, row.names = FALSE, quote = FALSE)

x <- p[idxCalled & p$SOMATIC == p$ML.SOMATIC]
loh <- GRanges(callLOH(res))
mcols(x) <- cbind(mcols(x), mcols(loh[findOverlaps(x, loh, select="first")]))
x$C <- NULL
x$M <- NULL
x$M.flagged <- NULL

outfile2 <- paste0(outPrefix, "_tumor_only_benchmark_correct_calls.csv")
write.csv(as.data.frame(x), file = outfile2, row.names = FALSE, quote = FALSE)
lima1/PureCN documentation built on April 24, 2024, 8:23 p.m.