#!/usr/bin/Rscript
# Copyright 2018 GRAIL, Inc. All rights reserved.
# Use of this source code is governed by the Apache 2.0
# license that can be found in the LICENSE file.
suppressMessages(library(conta))
suppressMessages(library(optparse))
# constants
# functions
main <- function() {
start_time <- proc.time()
option_list <- list(
make_option(c("-i", "--tsv_file"), type = "character", default = NULL,
help = "input pileup file with counts on SNP positions",
metavar = "character"),
make_option(c("-a", "--tsv_rev_file"), type = "character", default = NULL,
help = "input pileup file with reverse strand counts (optional),
when provided, first input file is assumed to contain
forward (+) strand counts)", metavar = "character"),
make_option(c("-b", "--biometrics_file"), type = "character", default = "",
help = "bio-metrics file", metavar = "character"),
make_option(c("-d", "--dbSNP_file"), type = "character", default = NULL,
help = "input vcf file from dbSNP", metavar = "character"),
make_option(c("-c", "--dbSNP_rev_file"), type = "character", default = NULL,
help = "input vcf file from rev dbSNP", metavar = "character"),
make_option(c("-s", "--sample"), type = "character", default = "test",
help = "Sample ID / file name to prefix out", metavar = "character"),
make_option(c("-l", "--lr_th"), type = "numeric", default = 0.005,
help = "Likelihood ratio threshold", metavar = "numeric"),
make_option(c("-m", "--sim_level"), type = "numeric", default = 0,
help = "Add simulated cf. 0 means none.", metavar = "numeric"),
make_option(c("-p", "--min_depth"), type = "numeric", default = 15,
help = "Minimum depth for a SNP.", metavar = "numeric"),
make_option(c("-g", "--max_depth"), type = "numeric", default = 10000,
help = "Maximum depth for a SNP.", metavar = "numeric"),
make_option(c("-z", "--min_cf"), type = "numeric", default = 0.002,
help = "Minimum cf to call.", metavar = "numeric"),
make_option(c("-q", "--min_maf"), type = "numeric", default = 0.001,
help = "Minimum maf for a SNP.", metavar = "numeric"),
make_option(c("-o", "--save_dir"), type = "character", default = "out",
help = "output folder", metavar = "character"),
make_option(c("-r", "--remove_maf_file"), type = "logical", default = TRUE,
help = "remove maf file", metavar = "logical"),
make_option(c("-n", "--baseline"), type = "character", default = NA,
help = "baseline file for blacklist or noise model",
metavar = "character"),
make_option(c("-f", "--loh_cutoff"), type = "numeric", default = 0.001,
help = "loh likelihood ratio cutoff", metavar = "numeric"),
make_option(c("-e", "--loh_delta_cutoff"), type = "numeric", default = 0.2,
help = "loh delta (deviation) cutoff", metavar = "numeric"),
make_option(c("", "--loh_min_snps"), type = "numeric", default = 20,
help = "minimum number of SNPs per chromosomal region",
metavar = "numeric"),
make_option(c("", "--loh_max_snps"), type = "numeric", default = 1000,
help = "maximum number of SNPs per chromosomal region",
metavar = "numeric"),
make_option(c("-w", "--blackswan"), type = "numeric", default = 0.2,
help = "black swan term for MLE", metavar = "numeric"),
make_option(c("-v", "--outlier_frac"), type = "numeric", default = 0,
help = "fraction of outliers to remove", metavar = "numeric"),
make_option(c("-x", "--cf_correction"), type = "numeric", default = 0,
help = "conta fraction correction", metavar = "numeric"),
make_option(c("", "--subsample"), type = "numeric", default = NA,
help = "subsample to number of SNPs", metavar = "numeric"),
make_option(c("-u", "--cores"), type = "numeric", default = 2,
help = "cpu cores", metavar = "numeric"),
make_option(c("-y", "--non_dbSNP"), type = "logical", default = FALSE,
help = "Include non-dbSNP for error rate", metavar = "logical"),
make_option(c("", "--fasta"), type = "character", default = NA,
help = "input genome fasta file for nucleotide context capture,
if provided, conta switches to analyzing errors
in tri-nucleotide contexts", metavar = "character"),
make_option(c("", "--chr_y"), type = "numeric", default = 0.0005,
help = "Chromosome Y normalized read threshold for male calls"),
make_option(c("", "--seed"), type = "integer", default = 1234,
help = "seed for simulations", metavar = "integer"),
make_option(c("", "--default_het_mean"), type = "numeric", default = 0.5,
help = "default mean for hetetozygote allele frequqencies, in
case it is expectd to deviate", metavar = "numeric"),
make_option(c("", "--default_het_sum"), type = "numeric", default = 200,
help = "default sum for hetetozygote allele frequqency alpha and
beta parameter, which is used in the alternative
formulation for beta binomial variance related term",
metavar = "numeric"),
make_option(c("", "--error_quantile_filter"), type = "numeric",
default = 0.75,
help = "Remove positions with error quantile higher than this
level from conta likelihood analysis",
metavar = "numeric"),
make_option(c("", "--min_lr_to_cf_ratio"), type = "numeric",
default = 0.2,
help = "Do not make a conta call if likelihood ratio to
contamination fraction is equal to or below this ratio",
metavar = "numeric")
)
opt_parser <- OptionParser(option_list = option_list);
opt <- parse_args(opt_parser);
filename_prefix <- opt$sample
sample_id <- opt$sample
if (is.null(opt$tsv_file) | is.null(opt$dbSNP_file)) {
print_help(opt_parser)
stop("At least a pileup file and dbSNP must be supplied.\n", call. = FALSE)
}
# Create output dir if it doesn't exist
dir.create(opt$save_dir, showWarnings = FALSE)
# Intersect counts file with dbSNP
maf_file <- paste(opt$save_dir, "/", filename_prefix, ".maf.tsv", sep = "")
if (file.exists(maf_file)) {
# File is already interesected, skip it
message(paste("Skipping intersection since ", maf_file, "already exists"))
} else {
message(paste("Intersecting", maf_file, "with", opt$dbSNP_file))
conta::intersect_snps(opt$tsv_file, maf_file, opt$dbSNP_file, opt$non_dbSNP,
opt$fasta)
}
# Intersect counts with minus strand counts if provided
if (!is.null(opt$tsv_rev_file) & !is.null(opt$dbSNP_rev_file)) {
maf_file2 <- paste(opt$save_dir, "/", filename_prefix, ".maf.rev.tsv", sep = "")
if (file.exists(maf_file2)) {
# File is already interesected, skip it
message(paste("Skipping intersection since ",
maf_file2, "already exists"))
} else {
message(paste("Intersecting", maf_file2, "with", opt$dbSNP_rev_file))
conta::intersect_snps(opt$tsv_rev_file, maf_file2, opt$dbSNP_rev_file,
opt$non_dbSNP, opt$fasta)
}
} else {
maf_file2 <- NA
}
# Run conta
message(paste("Starting conta"))
conta::conta_main(tsv_file = maf_file,
tsv_rev_file = maf_file2,
sample_id = sample_id,
save_dir = opt$save_dir,
metrics_file = opt$biometrics_file,
lr_th = opt$lr_th,
sim_level = opt$sim_level,
baseline = opt$baseline,
min_depth = opt$min_depth,
max_depth = opt$max_depth,
loh_lr_cutoff = opt$loh_cutoff,
loh_delta_cutoff = opt$loh_delta_cutoff,
loh_min_snps = opt$loh_min_snps,
loh_max_snps = opt$loh_max_snps,
min_maf = opt$min_maf,
subsample = opt$subsample,
cf_correction = opt$cf_correction,
min_cf = opt$min_cf,
blackswan = opt$blackswan,
outlier_frac = opt$outlier_frac,
cores = opt$cores,
context_mode = !is.na(opt$fasta),
chr_y_male_threshold = opt$chr_y,
seed = opt$seed,
default_het_mean = opt$default_het_mean,
default_het_sum = opt$default_het_sum,
error_quantile_filter = opt$error_quantile_filter,
min_lr_to_cf_ratio = opt$min_lr_to_cf_ratio
)
message(paste("Done."))
# Remove maf file
if (opt$remove_maf_file) {
message(paste("Removing annotated pileup files.",
"Set --remove_maf_file FALSE to keep them."))
file.remove(maf_file)
if (!is.na(maf_file2))
file.remove(maf_file2)
}
proc.time() - start_time
}
if (sys.nframe() == 0) {
main()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.