# Copyright 2018 GRAIL, Inc. All rights reserved.
# Use of this source code is governed by the Apache 2.0
# license that can be found in the LICENSE file.
#' Calculate log likelihood ratio
#'
#' Calculate ll-ratio between cf+noise vs noise.
#' Binomial model for each probability requires average error mu.
#'
#' @param cp numeric contamination prior based on maf and genotype
#' @param depth numeric number of reads
#' @param cf numeric tested contamination fraction
#' @param mu numeric average error rate for this type of SNP
#' @param ad numeric number of alternative alleles called
#' @param blackswan numeric limit likelihood of individual hypotheses
#' @param limit_lh numeric limit maximum for minimum lh
#'
#' @return numeric log likelihood ratio
#'
#' @importFrom stats dbinom
#' @export
log_lr <- function(cp, depth, cf, mu, ad, blackswan = 0.01, limit_lh = 0.01) {
# Probability of het contamination
binom_cont <- dbinom(ad, size = depth, prob = (mu + cf), log = FALSE)
# Probability of noise
binom_noise <- dbinom(ad, size = depth, prob = mu, log = FALSE)
# Minimum likelihood for each hypothesis to set a limit
min_lh <- pmin(limit_lh, blackswan / depth ^ 2)
# Likelihood of contamination (lower limit at min_lh)
lh <- (1 - min_lh) * (cp * binom_cont + (1 - cp) * binom_noise) + min_lh
# Likelihood of no contamination (lower limit at min_lh)
lh0 <- (1 - min_lh) * binom_noise + min_lh
# Log likelihood ratio
return(log(lh / lh0))
}
#' Calculate likelihood ratio for each SNP
#'
#' @param cf numeric contamination fraction to be tested
#' @param dat data.table containing counts and metrics per SNP, hets filtered
#' @param blackswan blackswan term for maximum likelihood
#' @param outlier_frac fraction of outliers to remove
#' @param het_rate ratio of heterozygote to homozygote alt contaminant SNPs
#'
#' @return numeric avg. log-likelihood ratio for the given cf for each SNP
#'
#' @importFrom stats quantile weighted.mean sd
#' @export
get_lr <- function(cf, dat, blackswan = 0.05, outlier_frac = 0.005,
het_rate = 0.8) {
# Remove het genotype and calculate log likelihood ratio for each SNP
dat <- dat[gt != "0/1"]
dat$lr <- log_lr(dat$cp, dat$depth, get_exp_cf(cf, het_rate = het_rate),
dat$er, dat$vr, blackswan)
# Remove outliers
dat <- dat[lr <= quantile(lr, 1 - outlier_frac) &
lr >= quantile(lr, outlier_frac)]
return(dat)
}
#' Conta log likelihood ratio
#'
#' @param cf numeric contamination fraction to be tested
#' @param dat data.table containing counts and metrics per SNP, hets filtered
#' @param save_dir directory to save results
#' @param filename_prefix file name prefix for printing purposes
#' @param loh whether to plot in LOH mode
#' @param blackswan blackswan term for maximum likelihood
#' @param outlier_frac fraction of outliers to remove
#' @param het_rate ratio of heterozygote to homozygote alt contaminant SNPs
#'
#' @return numeric avg. log-likelihood ratio for the given cf or a more detailed
#' result if the save_dir and filename_prefix are specified.
#'
#' @importFrom stats quantile weighted.mean sd
#' @export
conta_llr <- function(cf, dat, save_dir = NA, filename_prefix = NA,
loh = FALSE, blackswan = 0.05, outlier_frac = 0.005,
het_rate = 0.8) {
# Likelihood for each SNP to be contaminated at cf level
dat <- get_lr(cf, dat, blackswan, outlier_frac, het_rate)
# Calculate average likelihoods of contamination per SNP per depth
if (is.na(save_dir) | is.na(filename_prefix)) {
return(max(weighted.mean(dat$lr, log(dat$depth), na.rm = TRUE), 0))
} else {
sum_log_lr <- sum(dat$lr, na.rm = TRUE)
avg_log_lr <- weighted.mean(dat$lr, log(dat$depth), na.rm = TRUE)
pos_lr_all <- mean(dat$lr > 0, na.rm = TRUE)
# Generate stats per chr
per_chr <- dat %>%
mutate(log_depth = log(depth)) %>%
dplyr::group_by(chrom) %>%
dplyr::summarize(depth = mean(depth, na.rm = TRUE),
avg_lr = weighted.mean(lr, log_depth, na.rm = TRUE),
pos_lr = sum(lr > 0, na.rm = TRUE),
pos_ratio = mean(lr > 0, na.rm = TRUE),
vfn = mean(vfn, na.rm = TRUE),
vr = mean(vr, na.rm = TRUE))
# Generate likelihood ratio plots
if (loh) {
plot_lr(save_dir, filename_prefix, dat, per_chr,
ext_chr_table = "per_chr.loh.tsv",
ext_loh_table = "per_bin.loh.tsv",
ext_loh_plot = "bin.lr.loh.png")
} else {
plot_lr(save_dir, filename_prefix, dat, per_chr)
}
# Pos lr ratio on X chr tells us whether the contaminant is male or female
pos_lr_x <- per_chr %>%
dplyr::filter(chrom == "X" | chrom == "chrX") %>%
dplyr::pull(pos_ratio)
pos_cv <- sd(per_chr$pos_ratio, na.rm = TRUE) /
mean(per_chr$pos_ratio, na.rm = TRUE)
return(data.table( cf = round(cf, 5),
sum_log_lr = max(c(sum_log_lr, 0), na.rm = TRUE),
avg_log_lr = max(c(avg_log_lr, 0), na.rm = TRUE),
hom_snps = nrow(dat),
depth = round(mean(dat$depth, na.rm = TRUE), 0),
pos_lr_all = pos_lr_all,
pos_lr_x = ifelse(length(pos_lr_x) > 0, pos_lr_x, NA),
pos_lr_chr_cv = pos_cv,
stringsAsFactors = FALSE ))
}
}
#' Optimize a set of likelihood metrics across SNPs using optimize function.
#'
#'
#' @param dat data.table containing counts and metrics per SNP, hets filtered
#' @param lr_th numeric likelihood threshold to make a call
#' @param save_dir character location to save the results
#' @param filename_prefix character file name prefix
#' @param loh whether to plot in LOH mode
#' @param blackswan blackswan term for maximum likelihood
#' @param min_cf minimum contamination fraction to call
#' @param cf_correction cf correction which is subtracted from calculated cf
#' @param outlier_frac fraction of outliers to remove
#' @param min_lr_to_cf_ratio do not make a conta call if likelihood ratio to
#' contamination fraction is equal to or below this ratio.
#'
#' @return data.table of likelihoods and related metrics
#'
#' @importFrom stats optimize
#' @export
optimize_likelihood <- function(dat, lr_th, save_dir = NA, filename_prefix = NA,
loh = FALSE, blackswan, min_cf, cf_correction,
outlier_frac, min_lr_to_cf_ratio = 0) {
# Do an initial grid search
cf <- get_initial_range()
grid_lr <- parallel::mclapply(cf, function(x)
conta_llr(x, dat, blackswan = blackswan, outlier_frac = outlier_frac))
# Cf that gives the max result from the initial grid search
vmax <- which.max(grid_lr)
# If max cf is the last or the first value, shift to prevent out of bounds
vmax <- ifelse(vmax == length(cf), vmax - 1, ifelse(vmax == 1, 2, vmax))
# Optimize avg. likelihood around the cf that gave best result
opt_val <- optimize(conta_llr, lower = cf[vmax - 1], upper = cf[vmax + 1],
dat = dat, blackswan = blackswan,
outlier_frac = outlier_frac,
tol = 1e-6, maximum = TRUE)
# Get optimized result
result <- conta_llr(opt_val$maximum, dat, save_dir, filename_prefix,
loh = loh, blackswan = blackswan,
outlier_frac = outlier_frac)
# Correct cf by an empiricially calculated factor (depends on characteristics
# of the samples such as depth and error rate variance). A cf_correction
# factor should be calculated for accurate cf readouts at levels of
# contamination close to the limit of detection, but it is not required.
result$cf <- max(result$cf - cf_correction, 0)
# Make a call based on likelihood threshold
conta_call <- (result$avg_log_lr >= lr_th) & (result$cf >= min_cf) &
(result$avg_log_lr / result$cf > min_lr_to_cf_ratio)
# Plot the likelihood distribution and the maximum likelihood (in red cross)
# Do not plot if this was not a post-loh run
if (conta_call & loh)
plot_max_likelihood(cf, grid_lr, opt_val, save_dir, filename_prefix)
return(cbind(conta_call, result))
}
#' Calculate likelihood ratio based on known source genotypes
#'
#' @param gt1 genotype/variant call table of the host
#' @param gt2 genotype/variant call table of the contaminant
#' @param cf numeric contamination fraction
#' @param blackswan blackswan term for maximum likelihood
#' @param outlier_frac fraction of outliers to remove
#' @param match_prob prior probability of contamination
#' @param detailed_results return likelihood ratios per SNP
#'
#' @return likelihood ratio from the source
#'
#' @importFrom stats weighted.mean
#' @export
get_source_llr <- function(gt1, gt2, cf, blackswan = 1,
outlier_frac = 0.005, match_prob = 0.9,
detailed_results = FALSE) {
# Merge host with source candidate
m1 <- merge(gt1, gt2, by = "rsid")
# Prepare data table for conta
dat <- data.table(rsid = m1$rsid,
cp = ifelse(m1$gt.x != m1$gt.y, match_prob, 1 - match_prob),
gt = m1$gt.x,
depth = m1$dp.x,
er = m1$er.x,
vr = m1$vr.x)
if (detailed_results) {
return(get_lr(cf = cf, dat = dat, blackswan = blackswan,
outlier_frac = outlier_frac))
} else {
return(conta_llr(cf = cf, dat = dat, blackswan = blackswan,
outlier_frac = outlier_frac))
}
}
#' Calculate genotype concordance between two samples
#'
#' @param gt1 genotype/variant call table of the host
#' @param gt2 genotype/variant call table of the contaminant
#' @return numeric fraction of matching genotypes between two samples
#'
#' @export
get_genotype_concordance <- function(gt1, gt2) {
# Merge host with source candidate
m1 <- merge(gt1, gt2, by = "rsid")
return(mean(m1$gt.x == m1$gt.y, na.rm = TRUE))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.