# Copyright 2018 GRAIL, Inc. All rights reserved.
# Use of this source code is governed by the Apache 2.0
# license that can be found in the LICENSE file.
#' Run conta.
#'
#' Reads a given counts file, processes it and calculates
#' contamination likelihood for a set of ranges.
#'
#' @param tsv_file input tsv file
#' @param metrics_file input tsv metrics file in long format
#' @param filename_prefix experiment name (basename for outputs)
#' @param save_dir output folder
#' @param lr_th min avg. likelihood ratio per SNP to make a call, this
#' number is highly dependent on the data type used and should be optimized
#' by the user based on a sensitivity - specificity trade-off.
#' @param sim_level if non-zero, a contaminant at this level will
#' be added to the current sample. The contaminant will be randomly
#' generated from minor allele frequencies of the SNPs in the population.
#' @param baseline baseline file for blacklist or noise model
#' @param min_depth minimum depth for a SNP to be considered
#' @param max_depth maximum depth for a SNP to be considered
#' @param loh_lr_cutoff minimum likelihood ratio to call a region as LOH
#' @param loh_delta_cutoff minimum delta (het deviation) to call a region as LOH
#' @param loh_min_snps minimum number of SNPs in a region to consider LOH
#' @param loh_max_snps maxmimum number of SNPs in a region to use for LOH, if
#' there are more SNPs, they are subsampled to this number
#' @param cf_correction cf correction calculated from empirical data
#' @param min_maf minimum minor allele frequency to include a SNP
#' @param min_cf minimum contamination fraction to call
#' @param blackswan blackswan term for maximum likelihood estimation
#' @param outlier_frac fraction of outlier SNPs (based on depth) to remove
#' @param tsv_rev_file input tsv file for reverse strand reads, if this option
#' is provided then first tsv_file is considered as positive strand reads
#' @param cores number of cores to be used for parallelization
#' @param subsample Either NA (use all SNPs) or number of SNPs to subsample to
#' @param context_mode whether to run with errors calculated in 3-base context
#' @param default_het_mean default for heterozygote mean allele frequency
#' @param default_het_sum default heterozygote allele frequency alpha + beta
#' from beta binomial alternative formulation
#' @param error_quantile_filter remove SNPs with error rates higher than this
#' quantile. Default 1.0 does not remove any SNPs. The idea is to remove
#' SNPs with high mean error rates, since they might also contain high
#' variance and thus introduce spurious contamination likelihoods.
#' @param min_lr_to_cf_ratio do not make a conta call if likelihood ratio to
#' contamination fraction is equal to or below this ratio. In general
#' conta likelihood ratio is greater than and is correlated with
#' the contamination fraction. In cases where this ratio is greatly violated
#' such as 10 times or less likelihood ratio compared to contamination
#' fraction, it is very likely that is is a spurious call generated due to
#' LOH or other artifacts.
#' @param seed random seed
#'
#' @return none
#'
#' @importFrom utils packageVersion
#' @importFrom utils write.table
#' @export
conta_main <- function(tsv_file, sample_id, save_dir, filename_prefix = sample_id,
metrics_file = "",
lr_th = 0.005, sim_level = 0, baseline_file = NA,
min_depth = 15, max_depth = 10000, loh_lr_cutoff = 0.001,
loh_delta_cutoff = 0.2,
loh_min_snps = 20, loh_max_snps = 1000,
min_maf = 0.001, subsample = NA,
cf_correction = 0, min_cf = 0.002, blackswan = 0.2,
outlier_frac = 0, tsv_rev_file = NA,
cores = 2, context_mode = FALSE,
chr_y_male_threshold = 0.0005,
default_het_mean = 0.5,
default_het_sum = 200,
error_quantile_filter = 0.75,
min_lr_to_cf_ratio = 0.2,
seed = 1359) {
options("digits" = 8)
options("mc.cores" = cores)
set.seed(seed)
# Create output folder if it doesn't exist
dir.create(save_dir, showWarnings = FALSE)
# Write out a default TSV result file in case we exit early from fail_test
empty_result <- empty_result(sample_id)
out_file <- file.path(save_dir, paste(filename_prefix, "conta.tsv", sep = "."))
write.table(empty_result, file = out_file, sep = "\t", row.names = FALSE,
quote = FALSE)
# Read baseline if available
baseline <- NA
if (!is.na(baseline_file)) {
baseline <- read_data_table(baseline_file)
}
# Prep snp counts
dat <- read_and_prep(tsv_file, tsv_rev_file, baseline, default_het_mean,
default_het_sum)
# Set dat as a subset of dat if it exceeds a pre-determined size
if (!is.na(subsample) & nrow(dat) > subsample) {
dat <- dat[sort(sample(1:nrow(dat), subsample)), ]
}
# Simulate contamination if sim_level is non-0
dat <- sim_conta(dat, sim_level)
# Original depth by chr
plot_depth_by_chr(dat, save_dir, filename_prefix, min_depth = min_depth,
ext_plot = "depth.png", min_maf = min_maf)
# Add in more useful fields and filter based on depth and outlier fractions
dat <- annotate_and_filter(dat, min_depth = min_depth, max_depth = max_depth,
out_frac = outlier_frac)
# Fail if there is no data, or one of the genotypes is never observed
fail_test(dat)
# Depth by chr after filters
plot_depth_by_chr(dat, save_dir, filename_prefix, min_depth = min_depth,
ext_plot = "filtered.depth.png", min_maf = min_maf)
# Calculate substitution rates per base and add them to SNP data table
EE <- calculate_error_model(dat, save_dir, filename_prefix,
context_mode = context_mode)
dat <- add_error_rates(dat, EE, context_mode)
# Remove low maf positions (they were kept for error model calculations)
dat <- dat %>%
dplyr::filter(maf > min_maf, maf < (1 - min_maf))
# Optionally remove high error positions
dat <- dat %>%
dplyr::filter(er <= quantile(dat$er, error_quantile_filter))
# Add experimental bayesian genotype estimation.
dat <- bayesian_genotype(dat)
# Fail if there is no data, or one of the genotypes is never observed
fail_test(dat)
# Obtain results and plot lr without LOH filter applied yet
result <- optimize_likelihood(dat, lr_th, save_dir, filename_prefix,
loh = FALSE, blackswan, min_cf, cf_correction,
outlier_frac = outlier_frac)
# Remove loss of heterozygosity regions
bin_stats <- get_per_bin_loh(dat, save_dir, filename_prefix, min_lr = loh_lr_cutoff,
blackswan = blackswan, conta_cf = result$cf,
min_loh = loh_delta_cutoff,
min_snps = loh_min_snps,
max_snps = loh_max_snps)
dat_loh <- exclude_high_loh_regions(dat, bin_stats)
# Plot minor allele ratio plot (.vr) with LOH
plot_minor_ratio(dat, dat_loh, save_dir, filename_prefix, max_snps = 100000)
# Fail if there is no data, or one of the genotypes is never observed
fail_test(dat_loh)
# Calculate likelihood, max likelihood, and whether to call it
result <- optimize_likelihood(dat_loh, lr_th, save_dir,
filename_prefix, loh = TRUE, blackswan, min_cf,
cf_correction, outlier_frac = outlier_frac,
min_lr_to_cf_ratio = min_lr_to_cf_ratio)
# Calculate chr X and Y counts from metrics file if provided
chr_y_stats <- chr_stats(metrics_file, "chrY")
chr_x_stats <- chr_stats(metrics_file, "chrX")
x_het_snps <- nrow(dat[gt == "0/1" & (chrom == "X" | chrom == "chrX"), ])
# Make a sex/gender call for the host
sex <- get_sex_call(chr_y_stats, chr_y_male_threshold, result)
# Make a pregnancy call for the contaminant
pregnancy <- get_pregnancy_call(result, sex,
chr_y_stats, chr_y_male_threshold)
# Final results table
max_result <- data.table(conta_version = packageVersion("conta"),
sample = sample_id,
sex = sex,
format(result, digits = 5, trim = TRUE),
y_count = round(chr_y_stats$count, 4),
y_norm_count = round(chr_y_stats$normalized_count, 6),
y_fraction_covered = round(chr_y_stats$fraction_covered, 4),
x_count = round(chr_x_stats$count, 4),
x_norm_count = round(chr_x_stats$normalized_count, 6),
x_fraction_covered = round(chr_x_stats$fraction_covered, 4),
x_het_snps = x_het_snps,
pregnancy = pregnancy,
excluded_regions = bin_stats[loh == TRUE, .N],
error_rate = round(mean(EE$er), 7),
round(t(data.frame(EE$er,
row.names = rownames(EE))), digits = 7))
# Remove sample ID column if not provided or empty.
if (is.na(sample_id) || is.null(sample_id) || sample_id == "") {
max_result$sample <- NULL
}
# Write conta results
write.table(max_result, file = out_file, sep = "\t", row.names = FALSE,
quote = FALSE)
# Write genotype files for all SNPs
write_gt_file(dat, max_result, blackswan, outlier_frac,
file.path(save_dir, paste(filename_prefix, "gt.tsv",
sep = ".")))
# Write genotype files for LOH-excluded SNPs
write_gt_file(dat_loh, max_result, blackswan, outlier_frac,
file.path(save_dir, paste(filename_prefix, "gt.loh.tsv",
sep = ".")))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.