# look for hSNPs in ever-larger windows around this chunk and exit as soon as
# more than one training hSNP is found. when using MDA or PTA single-cell WGA,
# there is essentially no useful AB information in hSNPs > 100kb away, but perhaps this
# won't be the case in future amplification technologies.
#
# N.B. hg38 added some contigs that are far from any sites in the 1000 Genomes
# population phasing panel. we used to use only flank=100kb, but in hg38 this sometimes
# leads to 0 extended training hSNPs and ultimately a failure in infer.gp().
# Genome-wide AB model estimation for spatial sensitivity covers the whole genome,
# and the short arms of acrocentric chromosomes (e.g., chr22) are so large (~16 Mb)
# that 10 Mb extensions are also sometimes necessary.
get.training.sites.for.abmodel.by.range <- function(region, integrated.table.path,
single.cell.id, flanks.to.try=10^(5:8), quiet=FALSE)
{
for (flank in flanks.to.try) {
if (!quiet) cat('Importing extended hSNP training data from', integrated.table.path,
'with flank size =', flank, '\n')
# trim() always generates a warning for first and last regions on a chromosome
# because adding/subtracting flank goes outside of the chromosome boundaries.
extended.range <- GenomicRanges::trim(
GenomicRanges::GRanges(seqnames=seqnames(region)[1],
ranges=IRanges(start=start(region)-flank, end=end(region)+flank),
seqinfo=seqinfo(region)))
extended.training.hsnps <- read.training.hsnps(integrated.table.path,
sample.id=single.cell.id, region=extended.range, quiet=quiet)
if (!quiet)
cat(sprintf("hSNP training sites: %d, flank size: %d\n",
nrow(extended.training.hsnps), as.integer(flank)))
# at least 2 training hSNPs are required, or else the leave-one-out method
# will produce an NA AB value at the training hSNP when it is left out
if (nrow(extended.training.hsnps) > 1)
break
}
return(extended.training.hsnps)
}
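# A minimal sketch (not run at load) of the flank-expansion and trim() step
# used above, with a hard-coded chr22 seqlength for illustration only. Real
# calls take seqinfo from the SCAN2 object's genome string.
if (FALSE) {
    library(GenomicRanges)
    si <- GenomeInfoDb::Seqinfo(seqnames='chr22', seqlengths=50818468)
    region <- GRanges(seqnames='chr22', ranges=IRanges(start=2e7, end=2.1e7), seqinfo=si)
    for (flank in 10^(5:8)) {
        # large flanks run past the chromosome ends; trim() clips them back
        # to [1, seqlength], emitting the warning noted above.
        print(GenomicRanges::trim(GRanges(seqnames=seqnames(region)[1],
            ranges=IRanges(start=start(region)-flank, end=end(region)+flank),
            seqinfo=si)))
    }
}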
get.training.sites.for.abmodel <- function(object, region,
integrated.table.path=object@integrated.table.path,
seqinfo=genome.string.to.seqinfo.object(object@genome.string),
quiet=FALSE)
{
# if this is a chunked object, extend hSNPs up/downstream so that full info is available at edges
if (!is.null(region)) {
flanks.to.try <- 10^(5:8)
training.hsnps <- get.training.sites.for.abmodel.by.range(region=region,
integrated.table.path=integrated.table.path, single.cell.id=object@single.cell,
flanks.to.try=flanks.to.try, quiet=quiet)
# nrow(object@gatk) > 0: don't give up in heterochromatic arms of, e.g., chr13
# where there are neither hSNPs nor somatic candidates.
if (nrow(training.hsnps) < 2 & nrow(object@gatk) > 0) {
stop(sprintf("%d hSNPs found within %d bp of %s:%d-%d, need at least 2; giving up",
nrow(training.hsnps), max(flanks.to.try),
seqnames(region)[1], start(region)[1], end(region)[1]))
}
} else {
training.hsnps <- object@gatk[training.site == TRUE & muttype == 'snv']
}
training.hsnps
}
compute.ab.given.sites.and.training.data <- function(sites, training.hsnps, ab.fits, quiet=FALSE)
{
# Splitting by chromosome is not for parallelization; the AB model
# is fit separately for each chromosome and thus applies different
# parameter estimates.
chroms <- unique(sites$chr)
do.work <- function(chrom, sites, ab.fit, hsnps) {
if (!quiet)
cat(sprintf("inferring AB for %d sites on %s:%d-%d\n",
nrow(sites), chrom, as.integer(min(sites$pos)), as.integer(max(sites$pos))))
time.elapsed <- system.time(z <- infer.gp1(ssnvs=sites, fit=ab.fit,
hsnps=hsnps, flank=1e5, verbose=!quiet))
if (!quiet) print(time.elapsed)
# z is a matrix
as.data.table(z)
}
if (length(chroms) == 1) {
# slightly more efficient for real use cases with chunked computation
ab <- do.work(chrom=chroms, sites=sites,
ab.fit=ab.fits[chroms,,drop=FALSE],
hsnps=training.hsnps)
} else {
ab <- do.call(rbind, lapply(chroms, function(chrom) {
hsnps <- training.hsnps[chr == chrom]
ab.fit <- ab.fits[chrom,,drop=FALSE]
do.work(chrom=chrom, sites=sites, ab.fit=ab.fit, hsnps=hsnps)
}))
}
# Chunks are sometimes empty. Return a 0-row table with the expected columns
# so that rbind() doesn't fail when stitching together results from many chunks.
if (is.null(ab)) {
data.frame(gp.mu=numeric(0), gp.sd=numeric(0))
} else {
ab
}
}
# Very simple approximation of how correlation is affected by binomial sampling.
# Generate correlated normal pairs and map them through a logistic to get a pair
# of hSNP VAFs (p1, p2). Then sample observed VAFs as {x1,x2}/depth where
# x1 ~ Bin(depth, p1), x2 ~ Bin(depth, p2), and finally calculate the correlation
# of the observed VAF pairs.
#
# n.samples=1e4 is still a little noisy, especially for low depth.
#
# There are many reasons why this is not a particularly accurate model of our
# data. Its results are only used in diagnostics.
binomial.effect.on.correlation <- function(cor, depth, n.samples=1e4) {
# latent variable in normal space
# Note to self: mvnfast doesn't improve
L <- MASS::mvrnorm(n=n.samples, mu=c(0,0), Sigma=matrix(c(1,cor,cor,1),nrow=2))
# Transform into [0,1] space
B <- 1/(1+exp(-L))
# Draw discrete counts from N=depth using prob=B
N <- apply(B, 2, function(col) rbinom(n=length(col), size=depth, prob=col))
cor(N/depth)[1,2]
}
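# Quick illustration (not run at load): the same latent correlation appears
# much weaker after binomial sampling at low depth than at high depth. The
# values below are rough since n.samples is finite.
if (FALSE) {
    set.seed(1)
    binomial.effect.on.correlation(cor=0.9, depth=5)    # roughly 0.4-0.5
    binomial.effect.on.correlation(cor=0.9, depth=100)  # much closer to the latent 0.9
}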
# get an approximate idea of the correlation between training hSNPs
# by only looking at adjacent hSNPs. dist is the distance between
# them and (phased.af.x, phased.af.y) are the allele frequencies of hSNP i
# and its neighboring hSNP i+1.
#
# bin.breaks - hSNP pairs are grouped into bins based on the distance
# between the two hSNPs to compute the observed AF correlation.
# bin.breaks defines the distance break points for this binning.
# CAUTION: this function can fail if bin.breaks creates very
# small bins (in which there are few or no hSNP pairs).
approx.abmodel.covariance <- function(object, bin.breaks=10^(0:5)) {
# Create a much smaller data.table (~50 Mb for humans) so copying won't be problematic
tiny.gatk <- object@gatk[training.site == TRUE & muttype == 'snv', .(chr, pos, phased.dp=phased.hap1 + phased.hap2, phased.af=phased.hap1/(phased.hap1 + phased.hap2))]
setkey(tiny.gatk, chr) # make chr==this.chr filters fast
ret <- rbindlist(lapply(object@gatk[,chr[1],by=chr]$chr, function(this.chr) {
# "Join" (just put side-by-side) tiny.gatk[1:n-1,] and tiny.gatk[2:n,]
z <- tiny.gatk[chr == this.chr]
z <- cbind(z[-nrow(z), .(chr.x=chr, pos.x=pos, phased.dp.x=phased.dp, phased.af.x=phased.af)],
z[-1, .(chr.y=chr, pos.y=pos, phased.dp.y=phased.dp, phased.af.y=phased.af)])
z[, dist := pos.y - pos.x]
# bin the adjacent hSNPs by the distance between them
z[, bin := cut(dist, breaks=bin.breaks, ordered_result=T)]
z[, bin.min := bin.breaks[as.integer(bin)]]
z[, bin.max := bin.breaks[as.integer(bin)+1]]
# use=na.or.complete is identical to use=complete.obs, except if there
# are no observations (like on a female chrY), NA is returned instead
# of throwing an error.
ret <- z[!is.na(bin), .(chr=this.chr,
n.pairs=nrow(.SD), # how many hSNP pairs in total?
# how many hSNP pairs have non-NA AFs?
n.complete.pairs=sum(!is.na(phased.af.x) & !is.na(phased.af.y)),
bin.min=bin.min[1], # bin.min/max just correspond to bin.breaks
bin.max=bin.max[1],
mean.dist=mean(dist),
max.dist=max(dist),
observed.cor=cor(phased.af.x, phased.af.y, use='na.or.complete'),
# mean.dp is used to infer the latent correlation after binomial
# sampling error. a proper model would account for the distinct
# depths at each of the hSNP locations, but since this function
# only serves as a diagnostic, we use a less correct (but still
# useful) model that only considers depth of the first hSNP in
# each pair.
mean.dp=as.integer(mean(phased.dp.x))), by=bin][order(bin)]
# pmin/pmax: avoid cor=1 or -1 because the binomial sampling correction can fail
# mask correlation calculations (with NA) when only 1 or 2 observations exist.
ret[, observed.cor := pmax(-0.99, pmin(0.99, ifelse(n.complete.pairs < 3, NA, observed.cor)))]
}))
# None of the below changes from chromosome to chromosome
# get a rough inverse function of (observed correlation, dp) -> (underlying correlation, dp)
all.dps <- unique(as.integer(ret$mean.dp))
inverse.fns.by.depth <- setNames(lapply(all.dps, function(depth) {
if (depth == 0)
return(function(x) NA)
# use a linear interpolation on the approximate relationship between
# sampling correlation (on the latent variable) and observed correlation (after binomial sampling)
sampling.cor <- seq(-0.99, 0.99, length.out=50)
interp.points <- sapply(sampling.cor, binomial.effect.on.correlation, depth=depth)
# rule=2: for values outside the interpolated range, return the value at the nearest boundary
approxfun(x=interp.points, y=sampling.cor, rule=2) # returns a function
}), as.character(all.dps))
# data.table complains about recursive indexing if i try to do this in a := statement
corrected.cor <- sapply(1:nrow(ret), function(i) inverse.fns.by.depth[[as.character(ret$mean.dp[i])]](ret$observed.cor[i]))
ret[, corrected.cor := corrected.cor]
ret
}
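# Toy illustration (not run at load) of the adjacent-pair "join" used above:
# rows 1..(n-1) are placed next to rows 2..n so that each resulting row
# describes one pair of neighboring hSNPs. The values here are made up.
if (FALSE) {
    z <- data.table::data.table(chr='chr1', pos=c(100, 250, 900),
        phased.af=c(0.5, 0.6, 0.4))
    pairs <- cbind(z[-nrow(z), .(pos.x=pos, phased.af.x=phased.af)],
                   z[-1,       .(pos.y=pos, phased.af.y=phased.af)])
    pairs[, dist := pos.y - pos.x]   # 150 and 650
    print(pairs)
}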
# probability distribution of seeing y variant reads at a site with
# depth d given the estimate (gp.mu, gp.sd) of the local AB. the model is
#     Y | b ~ Bin(d, 1/(1+exp(-b))),   b ~ N(gp.mu, gp.sd).
# the marginal distribution of Y (# variant reads) requires integrating
# over b, which has no closed form solution. we approximate it using
# Gauss-Hermite quadrature; increasing the number of nodes in ghd
# increases accuracy.
# 'factor' allows changing the relationship of ab -> af
# NOTE: for many calls, it is best to compute ghd once and reuse it across
# calls, which is why ghd is defined as a global constant below.
#
# IMPORTANT!! recomputing ghd inside the function increases runtime
# by approximately 5-fold! Setting it as a global constant is critical!
# XXX: ..but it would be nice if users could change it.
ghd = fastGHQuad::gaussHermiteData(128)
dreads <- function(ys, d, gp.mu, gp.sd, factor=1) { #, ghd=gaussHermiteData(128)) {
sapply(ys, function(y)
ghQuad(function(x) {
# ghQuad requires the weight function on x to be exp(-x^2)
# (which is NOT a standard normal)
b <- sqrt(2)*gp.sd*x + gp.mu
exp(dbinom(y, size=d, prob=1/(factor*(1+exp(-b))), log=TRUE) - log(pi)/2)
}, ghd
)
)
}
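# Sanity-check sketch (not run at load): dreads() is the marginal distribution
# of the alt read count, so it should sum to ~1 over y = 0..d. With gp.mu=0
# (balanced AB), the mass is centered near d/2.
if (FALSE) {
    p <- dreads(ys=0:30, d=30, gp.mu=0, gp.sd=1)
    sum(p)        # very close to 1
    which.max(p)  # near the middle of 0..30
}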
# IMPORTANT: both of the artifact models are symmetric w.r.t. gp.mu
# (i.e., -gp.mu and +gp.mu will give the same p-values).
# However, the true mutation model assumes that gp.mu
# has already been matched to the VAF (by #alt reads) of the somatic
# candidate, so it is not symmetric.
mut.model.tables <- function(dp, gp.mu, gp.sd) {
dps=0:dp
mut=dreads(dps, d=dp, gp.mu=gp.mu, gp.sd=gp.sd)
pre1=dreads(dps, d=dp, gp.mu=gp.mu, gp.sd=gp.sd, factor=2)
pre2=dreads(dps, d=dp, gp.mu=-gp.mu, gp.sd=gp.sd, factor=2)
pre <- (pre1 + pre2)/2
mda1=dreads(dps, d=dp, gp.mu=gp.mu, gp.sd=gp.sd, factor=4)
mda2=dreads(dps, d=dp, gp.mu=-gp.mu, gp.sd=gp.sd, factor=4)
mda <- (mda1 + mda2)/2
data.frame(dps=dps, mut=mut, pre=pre, mda=mda)
}
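# Example (not run at load): per-model alt read count distributions for a
# site with depth 20 and a hypothetical local AB estimate (gp.mu=1, gp.sd=0.5).
# Each column is itself a probability distribution over 0..dp.
if (FALSE) {
    tb <- mut.model.tables(dp=20, gp.mu=1, gp.sd=0.5)
    head(tb)
    colSums(tb[, c('mut', 'pre', 'mda')])  # each ~1
}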
# The alpha cutoff:
# For each candidate SNV, we can compute the probability of a
# more extreme event (in terms of the number of alt reads)
# under each artifact model. This roughly corresponds to alpha,
# the type I error rate (false positive rate).
#
# How to determine an alpha cutoff for calling:
# To determine a good calling threshold, we want to know how
# various p-value cutoffs map to false discovery rates (FDRs).
# For example, if our set of candidate SNVs is 99% artifacts,
# then an alpha cutoff of 0.05 would produce a false:true call
# ratio of at least ~5:1 (about 5% of the artifacts, i.e. ~5% of
# all candidates, would be called; we don't know how many of the
# remaining true mutations would be called, but they make up less
# than 1% of all candidates). The same 0.05 cutoff would perform
# very differently on a set of candidate mutations that is only
# 1% artifact.
#
# Estimating the FDR:
# The FDR estimate is:
# FDR = alpha N_A / (alpha N_A + beta N_T).
# N_T and N_A are estimates for the relative rates of true
# mutations and artifacts, respectively, and were calculated
# by comparing VAF distributions between hSNPs and SNV
# candidates. 'alpha' and 'beta' are computed here:
# alpha is the probability of incorrectly calling
# an artifact as a mutation and beta is the probability of
# correctly calling a true mutation.
#
# N.B.:
# Since these distributions are not unimodal, we define "more
# significant" as any event with lower probability.
compute.pvs.and.betas <- function(altreads, dp, gp.mu, gp.sd, verbose=TRUE) {
progressr::with_progress({
p <- progressr::progressor(along=1:(length(dp)/100))
pab <- mapply(function(altreads, dp, gp.mu, gp.sd, idx) {
# Step1: compute dreads for all relevant models:
# These dreads() calls are the most expensive part of genotyping
tb <- mut.model.tables(dp, gp.mu, gp.sd)
# Step 2: compute model p-values (=alpha) and power to
# differentiate artifacts from true mutations (=beta) for the
# two artifact models.
abc.pv <- sum(tb$mut[tb$mut <= tb$mut[altreads + 1]])
lysis.pv <- sum(tb$pre[tb$pre <= tb$pre[altreads + 1]])
lysis.beta <- sum(tb$mut[tb$pre <= tb$pre[altreads + 1]])
mda.pv <- sum(tb$mda[tb$mda <= tb$mda[altreads + 1]])
mda.beta <- sum(tb$mut[tb$mda <= tb$mda[altreads + 1]])
if (idx %% 100 == 1) p()
c(abc.pv, lysis.pv, lysis.beta, mda.pv, mda.beta)
}, altreads, dp, gp.mu, gp.sd, 1:length(dp))
}, enable=verbose)
rownames(pab) <- c('abc.pv', 'lysis.pv', 'lysis.beta', 'mda.pv', 'mda.beta')
as.data.frame(t(pab))
}
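# Worked example (not run at load) of the FDR formula described above: with
# roughly N_A=990 artifacts and N_T=10 true mutations among the candidates,
# an alpha=0.05 cutoff with beta=0.8 power still yields a mostly-artifact
# call set.
if (FALSE) {
    alpha <- 0.05; beta <- 0.8; n.a <- 990; n.t <- 10
    alpha*n.a / (alpha*n.a + beta*n.t)   # = 49.5/57.5, about 0.86
}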
bin.afs <- function(afs, bins=20) {
sq <- seq(0, 1, 1/bins)
x <- findInterval(afs, sq, left.open=TRUE)
# af=0 will be assigned to bin 0 because intervals are (.,.]
x[x==0] <- 1
tx <- tabulate(x, nbins=bins)
names(tx) <- apply(cbind(sq[-length(sq)], sq[-1]), 1, mean)
tx
}
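# Example (not run at load): four VAFs binned into 20 equal-width bins. The
# names are bin midpoints; counts land in (0,0.05], (0.45,0.5] and (0.5,0.55],
# with af=0 moved into the first bin by the special case above.
if (FALSE) {
    bin.afs(c(0, 0.04, 0.5, 0.51), bins=20)
}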
# the "rough" interval is not a confidence interval. it is just a
# heuristic that demonstrates the range of reasonably consistent
# somatic mutation burdens
# WARNING! this is the *callable somatic burden*. if you want an
# estimate of the total somatic burden genome-wide, there is
# another function for that provides a better estimate!
estimate.somatic.burden <- function(fc, min.s=1, max.s=5000, n.subpops=10, display=FALSE, rough.interval=0.99) {
sim <- function(n.muts, g, s, n.samples=1000, diagnose=FALSE) {
# save the multinomial draws so the diagnostic plot below can use them
samples <- rmultinom(n=n.samples, size=n.muts, prob=g)
n.pass <- .colSums(samples <= s, m=length(g), n=n.samples)
if (diagnose) {
boxplot(t(samples))
lines(s, lwd=2, col=2)
}
# if the random mutations fit underneath the germline curve at
# every AF bin, then the .colSums() above will equal the number
# of AF bins (which is length(g)).
ret <- sum(n.pass == length(g))/length(n.pass)
#gc() # XXX: it is waaay too slow to gc() on every simulation
ret
}
# determine how often a sample of N somatic mutations from the
# proper het distribution (germlines) "fits" inside the somatic
# candidate distribution.
# always try nmut=1-100
srange <- c(1:100, seq(101, max.s, length.out=n.subpops))
srange <- srange[srange < max.s]
fraction.embedded <- sapply(srange, sim, g=fc$g, s=fc$s)
# max(c(0, ...)): in some cases there is absolutely no overlap
# between hSNP VAFs and somatic VAFs, leading to fraction.embedded=0
# for all N. In this case, return 0 and let the caller decide what
# to do.
min.burden <- max(c(0, srange[fraction.embedded >= 1 - (1 -
rough.interval)/2]))
max.burden <- max(c(0, srange[fraction.embedded >= (1 - rough.interval)/2]))
c(min=min.burden, max=max.burden)
}
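# Hedged toy sketch (not run at load): a flat hSNP VAF profile (g) and a small
# somatic candidate profile (s), both given as per-bin counts. The returned
# (min, max) values are only a rough consistency range, not a confidence interval.
if (FALSE) {
    g <- rep(50, 20)   # hSNP counts per VAF bin
    s <- rep(2, 20)    # somatic candidate counts per VAF bin
    estimate.somatic.burden(fc=list(g=g, s=s), max.s=sum(s), n.subpops=10)
}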
# {germ,som}.df need only have columns named dp and af
# estimate the population component of FDR
fcontrol <- function(germ.df, som.df, bins=20, rough.interval=0.99, eps=0.1, quiet=FALSE) {
germ.afs <- germ.df$af[!is.na(germ.df$af) & germ.df$af > 0]
som.afs <- som.df$af[!is.na(som.df$af)]
g <- bin.afs(germ.afs, bins=bins) # counts, not probabilities
s <- bin.afs(som.afs, bins=bins)
# when fcontrol is used on small candidate sets (i.e., when controlling
# for depth), there may be several 0 bins in s.
# g <- g*1*(s > 0)
# XXX: i don't like this. it doesn't quite match the principle, but
# rather addresses a limitation of the heuristic method.
if (length(s) == 0 | all(g == 0))
return(list(est.somatic.burden=c(0, 0),
binmids=as.numeric(names(g)),
g=g, s=s, pops=NULL))
# returns (lower, upper) bound estimates
approx.ns <- estimate.somatic.burden(fc=list(g=g, s=s),
min.s=1, max.s=sum(s), n.subpops=min(sum(s), 100),
rough.interval=rough.interval)
if (!quiet) {
cat(sprintf("fcontrol: dp=%d, max.s=%d (%d), n.subpops=%d, min=%d, max=%d\n",
germ.df$dp[1], nrow(som.df), sum(s), min(nrow(som.df),100),
as.integer(approx.ns[1]), as.integer(approx.ns[2])))
}
pops <- lapply(approx.ns, function(n) {
# nt <- pmax(n*(g/sum(g))*1*(s > 0), 0.1)
nt <- pmax(n*(g/sum(g)), eps)
# ensure na > 0, since FDR would be 0 for any alpha when na=0
# XXX: the default eps=0.1 is totally arbitrary and might need to be
# more carefully thought out.
na <- pmax(s - nt, eps)
cbind(nt=nt, na=na)
})
return(list(est.somatic.burden=approx.ns,
binmids=as.numeric(names(g)),
g=g, s=s, pops=pops))
}
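# Minimal sketch (not run at load) with made-up VAFs; as noted above, germ.df
# and som.df only need 'dp' and 'af' columns.
if (FALSE) {
    set.seed(0)
    germ.df <- data.frame(dp=20L, af=rbeta(1000, 10, 10))
    som.df <- data.frame(dp=20L, af=runif(50, min=0.05, max=0.6))
    fc <- fcontrol(germ.df=germ.df, som.df=som.df, bins=20)
    fc$est.somatic.burden   # rough (lower, upper) callable burden at dp=20
}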
compute.fdr.prior.data.for.candidates <- function(candidates, hsnps, bins=20, random.seed=0, quiet=FALSE, eps=0.1, legacy=FALSE)
{
if (nrow(hsnps) == 0 & nrow(candidates) > 0)
stop(paste('0 hsnps provided but', nrow(candidates), 'candidate sites provided'))
# fcontrol -> estimate.somatic.burden relies on simulations to
# estimate the artifact:mutation ratio.
if (!quiet) {
cat(sprintf("estimating bounds on number of true mutations in candidate set (seed=%d)..\n",
random.seed))
}
orng <- RNGkind()
RNGkind('Mersenne-Twister')
set.seed(random.seed)
# split candidates by depth; collapse all depths beyond the 80th
# percentile into one bin
# if there are no hets, max.dp=0 will just generate some dummy tables.
if (nrow(hsnps) > 0)
max.dp <- as.integer(quantile(hsnps$dp, prob=0.8))
else
max.dp <- 0
progressr::with_progress({
if (!quiet) p <- progressr::progressor(along=0:(max.dp+1))
# future_lapply makes it a little difficult to obtain legacy output because of
# different handling of random seeds. it's probably possible to set future
# parameters to use a single seed=0 MersenneTwister RNG, but i haven't
# explored that.
if (legacy) {
fcs <- lapply(c(0:max.dp), function(thisdp) {
germ.df <- hsnps[dp == thisdp]
som.df <- candidates[dp == thisdp]
ret <- fcontrol(germ.df=germ.df, som.df=som.df, bins=bins, eps=eps, quiet=quiet)
if (!quiet) p()
ret
})
fc.max <- fcontrol(germ.df=hsnps[dp > max.dp],
som.df=candidates[dp > max.dp],
bins=bins, eps=eps, quiet=quiet)
if (!quiet) p()
fcs <- c(fcs, list(fc.max))
} else {
# non-legacy mode just parallelizes this step. same strategy is used.
# use a special value of NA to signal the last depth bucket
hsnps.by.depth <- lapply(c(0:max.dp,NA), function(thisdp) {
if (is.na(thisdp)) {
# only the af and dp columns are used, don't copy the whole table
germ.df <- hsnps[dp > max.dp,.(af, dp)]
som.df <- candidates[dp > max.dp,.(af, dp)]
} else {
germ.df <- hsnps[dp == thisdp,.(af, dp)]
som.df <- candidates[dp == thisdp,.(af, dp)]
}
list(germ.df=germ.df, som.df=som.df)
})
fcs <- future.apply::future_lapply(hsnps.by.depth, function(thisdp) {
ret <- fcontrol(germ.df=thisdp$germ.df, som.df=thisdp$som.df, bins=bins, eps=eps, quiet=quiet)
if (!quiet) p()
ret
},
future.seed=0,
future.globals=c('bins', 'eps', 'quiet'))
}
}, enable=TRUE)
# randomness done (used only in fcontrol())
RNGkind(orng[1])
if (!quiet) {
cat(sprintf(" profiled hSNP and candidate VAFs at depths %d .. %d\n",
0, max.dp))
}
burden <- as.integer(
c(sum(sapply(fcs, function(fc) fc$est.somatic.burden[1])), # min est
sum(sapply(fcs, function(fc) fc$est.somatic.burden[2]))) # max est
)
if (!quiet) {
cat(sprintf(" estimated callable mutation burden range (%d, %d)\n",
burden[1], burden[2]))
cat(" -> using MAXIMUM burden\n")
cat(" creating true (N_T) and artifact (N_A) count tables based on mutations expected to exist in candidate set..\n")
}
# all necessary data for making NT/NA tables
b.vec <- sapply(fcs, function(fc) fc$est.somatic.burden[2])
g.tab <- sapply(fcs, function(fc) fc$g)
s.tab <- sapply(fcs, function(fc) fc$s)
nt.tab <- make.nt.tab(list(bins=bins, eps=eps, b.vec=b.vec, g.tab=g.tab, s.tab=s.tab))
na.tab <- make.na.tab(list(bins=bins, eps=eps, b.vec=b.vec, g.tab=g.tab, s.tab=s.tab))
# these tables represent "partially adjusted" scores for leave-one-out (LOO)
# calling of germline het SNPs or indels. the basic idea is to mimic a single
# germline heterozygous SNP being left out of the germline set and added to
# the somatic set. this should (a) increase the total estimated
# somatic burden by 1 (i.e., b.vec+1) and (b) increase the somatic candidate
# set by 1 as well (s.tab+1).
#
# (a) is the part that cannot be feasibly recalculated for every hSNP, since
# this is done by simulations
# (b) may be slightly confusing because s.tab+1 means adding a new somatic
# site to every VAF bin at every depth. this would indeed be incorrect if
# taken as a whole, but in the case of any particular germline het
# site, only the (VAF, DP) cell in s.tab that matches the germline het
# is used. so adding 1 to the entire table is just a convenient shortcut.
ghet.loo.nt.tab <- make.nt.tab(list(bins=bins, eps=eps, b.vec=b.vec+1, g.tab=g.tab, s.tab=s.tab+1))
ghet.loo.na.tab <- make.na.tab(list(bins=bins, eps=eps, b.vec=b.vec+1, g.tab=g.tab, s.tab=s.tab+1))
list(bins=bins, max.dp=max.dp, fcs=fcs, candidates.used=nrow(candidates),
hsnps.used=nrow(hsnps), burden=burden,
# eps, b.vec, g.tab and s.tab are sufficient to reproduce the final
# N_T/N_A tabs nt.tab and na.tab. This can be useful for creating
# partially adjusted N_T/N_A scores for hSNPs.
eps=eps, b.vec=b.vec, g.tab=g.tab, s.tab=s.tab,
nt.tab=nt.tab, na.tab=na.tab,
ghet.loo.nt.tab=ghet.loo.nt.tab, ghet.loo.na.tab=ghet.loo.na.tab,
mode=ifelse(legacy, 'legacy', 'new'))
}
make.nt.tab <- function(prior.data) {
sapply(1:length(prior.data$b.vec), function(i) {
S <- sum(prior.data$s.tab[,i])
G <- sum(prior.data$g.tab[,i])
if (S == 0 | G == 0)
return(rep(prior.data$eps, prior.data$bins))
# Redundant safeguard: the early return above already handles G == 0.
# Kept only to avoid NaN (division by zero) if that check ever changes;
# it would make the column in the table all 'eps' values.
if (G == 0)
G <- 1
pmax(prior.data$b.vec[i] * (prior.data$g.tab[,i]/G), prior.data$eps)
})
}
make.na.tab <- function(prior.data) {
nt.tab <- make.nt.tab(prior.data)
sapply(1:length(prior.data$b.vec), function(i) {
S <- sum(prior.data$s.tab[,i])
G <- sum(prior.data$g.tab[,i])
if (S == 0 | G == 0)
return(rep(prior.data$eps, prior.data$bins))
pmax(prior.data$s.tab[,i] - nt.tab[,i], prior.data$eps)
})
}
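# Toy sketch (not run at load): 5 VAF bins x 2 depth buckets with made-up
# counts. N_T spreads each depth bucket's estimated burden (b.vec) across VAF
# bins according to the hSNP profile g.tab; N_A is what remains of the
# candidate counts s.tab, floored at eps.
if (FALSE) {
    pd <- list(bins=5, eps=0.1,
               b.vec=c(3, 1),
               g.tab=matrix(c(1,2,4,2,1,  0,1,2,1,0), ncol=2),
               s.tab=matrix(c(5,1,2,1,5,  2,0,1,0,2), ncol=2))
    make.nt.tab(pd)
    make.na.tab(pd)
}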
estimate.fdr.priors.old <- function(candidates, prior.data)
{
fcs <- prior.data$fcs
popbin <- ceiling(candidates$af * prior.data$bins)
popbin[candidates$dp == 0 | popbin == 0] <- 1
progressr::with_progress({
dp <- candidates$dp
p <- progressr::progressor(along=1:(length(dp)/100))
nt.na <- future.apply::future_sapply(1:length(dp), function(i) {
if (i %% 100 == 0) p()
idx <- min(dp[i], prior.data$max.dp+1) + 1
if (is.null(fcs[[idx]]$pops))
c(0.1, 0.1)
else
fcs[[idx]]$pops$max[popbin[i],]
})
})
list(nt=nt.na[1,], na=nt.na[2,])
}
# New implementation of above using tables rather than loops
# Update: try not to pass the whole @gatk table to avoid memory duplication
estimate.fdr.priors <- function(dp, af, prior.data, use.ghet.loo=FALSE)
{
# Assign each candidate mutation to a (VAF, DP) bin
vafbin <- ceiling(af * prior.data$bins)
vafbin[dp == 0 | vafbin == 0] <- 1
dp.idx <- pmin(dp, prior.data$max.dp+1) + 1
if (!use.ghet.loo) {
nt.tab <- prior.data$nt.tab
na.tab <- prior.data$na.tab
} else {
nt.tab <- prior.data$ghet.loo.nt.tab
na.tab <- prior.data$ghet.loo.na.tab
}
list(nt=nt.tab[cbind(vafbin, dp.idx)], na=na.tab[cbind(vafbin, dp.idx)])
}
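# The cbind() subscript above is standard R matrix indexing: a two-column
# matrix of (row, col) positions extracts one table entry per candidate.
# Toy sketch (not run at load):
if (FALSE) {
    tab <- matrix(1:12, nrow=4)     # e.g., 4 VAF bins x 3 depth buckets
    vafbin <- c(1, 4, 2)
    dp.idx <- c(1, 2, 3)
    tab[cbind(vafbin, dp.idx)]      # c(1, 8, 10)
}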
# Finds the minimum FDR over all (alpha, beta) cutoff pairs with alpha >= pv;
# only the FDR itself is returned.
min.fdr <- function(pv, alphas, betas, nt, na) {
x <- data.frame(alpha=alphas, beta=betas,
fdr=ifelse(alphas*na + betas*nt > 0,
alphas*na / (alphas*na + betas*nt), 0))
x <- rbind(c(1, 0, 1), x)
x <- x[pv <= x$alpha,] # passing values
x <- x[x$fdr == min(x$fdr),,drop=F]
x$fdr[1]
}
# Slower than the current method because the model tables need to
# be fully calculated to find the min. FDR.
# Unlike the real legacy code, the alphas and betas corresponding to
# the minimum FDR are not reported.
compute.fdr.legacy <- function(altreads, dp, gp.mu, gp.sd, nt, na, verbose=TRUE) {
if (length(dp) == 0)
return(list(lysis.fdr=numeric(0), mda.fdr=numeric(0)))
progressr::with_progress({
p <- progressr::progressor(along=1:max(1,length(dp)/100))
fdrs <- future.apply::future_mapply(function(altreads, dp, gp.mu, gp.sd, nt, na, idx) {
# Step1: compute dreads for all relevant models:
# These dreads() calls are the most expensive part of genotyping
tb <- mut.model.tables(dp, gp.mu, gp.sd)
# compute BEFORE sorting, so that altreads+1 is the correct row
lysis.pv <- sum(tb$pre[tb$pre <= tb$pre[altreads + 1]])
mda.pv <- sum(tb$mda[tb$mda <= tb$mda[altreads + 1]])
tb <- tb[order(tb$pre),]
lysis.fdr <- min.fdr(pv=lysis.pv,
alphas=cumsum(tb$pre), betas=cumsum(tb$mut), nt=nt, na=na)
tb <- tb[order(tb$mda),]
mda.fdr <- min.fdr(pv=mda.pv,
alphas=cumsum(tb$mda), betas=cumsum(tb$mut), nt=nt, na=na)
if (idx %% 100 == 1) p(amount=1)
c(lysis.fdr=lysis.fdr, mda.fdr=mda.fdr)
}, altreads, dp, gp.mu, gp.sd, nt, na, 1:length(dp))
}, enable=verbose)
list(lysis.fdr=fdrs[1,], mda.fdr=fdrs[2,])
}
resample.germline <- function(sites, hsnps, M=50, seed=0) {
# Short circuit here if 0 sites or 0 hSNPs are given. Try to replicate, to some
# extent, the data structure that would be generated below.
if (nrow(sites) == 0 | nrow(hsnps) == 0) {
return(list(selection=data.frame(dist=integer(0), ds=NULL, dh=NULL, u=integer(0),
keep=integer(0)),
dist.s=NULL, dist.h=NULL)
)
}
# XXX: Random position sampling: maybe add an option to use random
# instead of somatic candidates?
# Random positioning is not as realistic as all non-ref sites because
# it doesn't account for alignability/mappability in the same way as
# non-ref sites.
#random.pos <- find.nearest.germline(som=data.frame(chr='X',
# pos=sort(as.integer(runif(n=1e5, min=min(spos$pos), max=max(spos$pos))))),
# germ=data, chrs='X')
#random.pos <- random.pos[abs(random.pos$pos-random.pos$nearest.het)>0,]
# Distribution of candidates
# Remember that the nearest hSNP must be on the same chromosome
tmpsom <- find.nearest.germline(som=sites[order(sites$pos),],
germ=hsnps,
#chrs=unique(sites$chr)) # unique() might not preserve order
chrs=sites$chr[!duplicated(sites$chr)])
spos <- log10(abs(tmpsom$pos-tmpsom$nearest.het))
# Distribution of hSNP distances
# Since the training data are already sorted, diff() gives the distance
# between this SNP and the next. Nearest SNP is the min of the distance
# to the left and right.
# Must split by chrom to prevent negative distance between the final
# site on chr N and the first site on chr N+1
hsnps$nearest.hsnp <- do.call(c, lapply(unique(hsnps$chr), function(chrom) {
hsnps <- hsnps[hsnps$chr == chrom,]
pmin(diff(c(0,hsnps$pos)), diff(c(hsnps$pos, max(hsnps$pos)+1)))
}))
hpos <- log10(hsnps$nearest.hsnp)
# Approximate the distributions
dist.s <- get.distance.distn(spos)
dist.h <- get.distance.distn(hpos)
ds <- dist.s$density[findInterval(hpos, dist.s$breaks, all.inside=T)]
dh <- dist.h$density[findInterval(hpos, dist.h$breaks, all.inside=T)]
set.seed(seed)
u <- runif(n=length(ds))
list(selection=data.frame(dist=hsnps$nearest.hsnp, ds=ds, dh=dh, u=u, keep=u < ds / (M*dh)),
dist.s=dist.s, dist.h=dist.h)
}