# R/03-posterior.R In DeepCNV: Exploiting Normal Contamination to Infer Copy Number from Deep Sequencing

#### Documented in makeCNVPosterior

```r
# Copyright (C) Kevin R. Coombes 2014-2017

###################################
# Posterior Distribution

# This may be confusing, largely because of the 'hidden' discrete
# prior structure. What I think we actually need is
#    (0) (log)likelihoods for 13 hidden states
#    (1) snp-level (log)likelihoods and posteriors for 3-way CN state
#    (2) gene-level (log)likelihoods and posteriors for 3-way CN state
# Right now,  these are represented by
#    (0) 'hiddenloglike'
#    (1) 'snploglike' and 'snppost'
#    (2) 'loglike' and 'posterior'
# S4 container holding every stage of the posterior computation, from
# the hidden-state log likelihoods to the gene-level posterior, together
# with the observations and the prior that produced them.
setClass(
  "CNVPosterior",
  slots = c(
    hiddenloglike = "array",      # indexed as grid x hidden x variants
    snploglike    = "array",      # indexed as grid x cnstate x variants
    snppost       = "array",      # indexed as grid x cnstate x variants
    loglike       = "array",      # indexed as grid x cnstate
    posterior     = "array",      # indexed as grid x cnstate
    observed      = "data.frame", # the observations handed to the constructor
    prior         = "CNVPrior"    # the prior used to build the posterior
  )
)

# Turn per-variant log likelihoods into per-variant posterior probabilities.
#
# Arguments:
#   loglike - 3D array of log likelihoods; the first margin is combined
#             with 'prior@pdf', the second with the copy number prior,
#             and normalization is done separately along the third.
#   prior   - prior object; its 'pdf' slot and cnPrior(prior) are used.
# Value: an array shaped like 'loglike' whose entries sum to one within
#        each slice of the third margin.
computePosterior <- function(loglike, prior) {
  # NOTE(review): 'prior@pdf' is added directly on the log scale, so it is
  # presumably stored as a log density -- confirm against the CNVPrior class.
  logGridPrior <- prior@pdf
  logPost <- sweep(loglike, 1, logGridPrior, "+")
  # assumes the same answer for pd$Mutation and for pd$SNP
  logPost <- sweep(logPost, 2, log(cnPrior(prior)), "+")
  # Normalize so the posterior probabilities for each variant sum to one.
  # Subtract the per-variant maximum before exponentiating (log-sum-exp),
  # as createPosterior does for the gene-level posterior; otherwise very
  # negative log likelihoods underflow to exp(.) == 0 and produce Inf/NaN.
  peak <- apply(logPost, 3, max)
  shifted <- sweep(logPost, 3, peak, "-")
  logTotal <- log(apply(exp(shifted), 3, sum))
  exp(sweep(shifted, 3, logTotal, "-"))
}

# Build a 'CNVPosterior' object from the 'hidden' log likelihoods.
#
# Arguments:
#   hll   - 3D array of hidden-state log likelihoods, indexed as
#           grid x hidden x variants (see the 'hiddenloglike' slot).
#   obs   - data.frame of observations; column 'V' holds the variant
#           type of each data row.
#   prior - prior object; its 'pdf' slot is used here and it is passed
#           on to computePosterior().
# Value: a new 'CNVPosterior' object.
#
# Uses objects defined outside this block: 'cnSet', 'vtSet', 'pVgT',
# 'pGgTS' and 'pT'.
createPosterior <- function(hll, obs, prior) {
  # so, we are going to compute the one-at-a-time posterior
  # probabilities that Vi=v from the above stuff and collapse
  # down using the maximum a posteriori (MAP) estimate.  After
  # that, we pretend that the MAP variant types are the truth.
  hllDim <- dim(hll)
  nGrid <- hllDim[1]       # number of grid points
  nVariants <- hllDim[3]   # number of data rows (variants)
  sll <- array(NA, dim = c(nGrid, 3, nVariants),
               dimnames = list(NULL, cnSet, NULL))
  variantTypes <- as.character(obs$V)
  # This matrix does not depend on the variant, so build it once.
  zz <- do.call(rbind, lapply(vtSet, function(v) pGgTS[, , v] * pT[v]))
  for (i in seq_len(nVariants)) {
    # keep the slice two-dimensional even when there is one grid point
    x <- exp(matrix(hll[, , i], nrow = nGrid))
    vartype <- variantTypes[i]
    xx <- do.call(cbind, lapply(vtSet, function(v) x * pVgT[vartype, v]))
    yy <- xx %*% zz
    sll[, , i] <- log(yy)
  }
  # Now 'sll' is the likelihood
  #    Prob(Ki=k, Ni=n | Vbar=vbar, nu=nu0, S=s)
  # where we used, in vbar, the MAP value of vi for each datarow. We
  # can reapply the beta and copy number priors to start converting
  # these likelihoods into posteriors.
  snppost <- computePosterior(sll, prior)
  # But we're still not done. We need to combine likelihoods across
  # independent variants = datarows.
  # NOTE(review): the copy number prior is hard-coded here, whereas
  # computePosterior uses cnPrior(prior) -- confirm these should agree.
  logprior <- outer(prior@pdf, log(c(0.4, 0.3, 0.3)), "+") # log( P(Theta) )
  loglike <- apply(sll, 1:2, sum)
  lognumer <- loglike + logprior
  # subtract the maximum before exponentiating to avoid underflow
  M <- max(lognumer)
  temp <- lognumer - M
  logNorm <- log(sum(exp(temp)))
  logpost <- temp - logNorm
  posterior <- exp(logpost)
  if (abs(1 - sum(posterior)) > 1e-14) warning("bad scaling")

  new("CNVPosterior",
      loglike = loglike, posterior = posterior,
      snploglike = sll, snppost = snppost,
      hiddenloglike = hll,
      observed = obs, prior = prior)
}

# Construct a 'CNVPosterior' from observed read counts and a prior.
#
# 'obs' must be a data.frame with columns labeled
#      K = number of variant reads
#      N = total number of reads
#      V = type of variant  (SNP or Mutation)
# KRC: should we force this to be an S4-object so we can perform
# validity checking?
#
# Background on the computation:
# The 'hidden' log likelihood is a 3D array of size
#   gridPoints x genotypes(basecase) x datarows(variants)
# representing the logarithm of
#     Prob(Ki=k, Ni=n | Gi=g, nu=nu0)
# We also need something to represent the logarithm of
#     Prob(Kbar = kbar, Nbar=nbar | Vbar=vbar, nu=nu0, S=s)
# Since this probability is the product over datarows i, we
# might take the sum on the log scale
#    snploglike <- apply(hiddenloglike, 1:3, sum)
# but this isn't quite correct, since we still need to
# collapse it for each vector of variantType assignments.
# The problem is that, with 10 datarows, there are 4^10
# possible vector assignments to consider, and that's way
# too many to store and carry around with us.
makeCNVPosterior <- function(obs, prior) {
  # 'hidden' log likelihood evaluated on the prior's grid
  hiddenLogLike <- cnvLikelihood(prior@grid, obs)
  createPosterior(hiddenLogLike, obs, prior)
}

#KRC: Why don't we have to export this class to get the 'summary' method for
# 'CNVPosterior' to work?
# S4 container for the result of summary() on a 'CNVPosterior'.
setClass(
  "CNVPosteriorSummary",
  slots = c(
    copyNumbers = "data.frame", # posterior for each copy number state
    loc         = "numeric",    # most likely normal fraction
    q05         = "numeric",    # lower end of the credible interval
    q95         = "numeric",    # upper end of the credible interval
    grid        = "numeric",    # grid of normal-fraction values
    cdf         = "numeric",    # cumulative posterior along the grid
    pdf         = "numeric"     # marginal posterior along the grid
  )
)

# Flatten a summary into a data frame: the copy number posteriors
# followed by the location and the two interval endpoints.
setMethod(
  "as.data.frame", c("CNVPosteriorSummary"),
  function (x, row.names = NULL, optional = FALSE, ...) {
    copyNumbers <- x@copyNumbers
    data.frame(copyNumbers, loc = x@loc, q05 = x@q05, q95 = x@q95)
  }
)

# Summarize a 'CNVPosterior': marginal copy number posteriors, the most
# likely normal fraction, and a ninety percent credible interval for it.
# Returns a 'CNVPosteriorSummary' object.
setMethod("summary", c("CNVPosterior"), function(object, ...) {
  # integrate out nu to get the copy number posteriors
  cnpost <- as.data.frame(as.list(apply(object@posterior, 2, sum)))
  # separately, have to work out the posterior on nu;
  # start by integrating out the copy number state
  tp <- apply(object@posterior, 1, sum)
  # find the mode (median of the grid points that attain the maximum)
  grid <- object@prior@grid
  loc <- median(grid[tp == max(tp)])
  cdf <- cumsum(tp) / sum(tp)
  # Edge cases: when the posterior mass piles up at one end of the grid,
  # no grid point may satisfy the cutoff. Fall back to the corresponding
  # end of the grid instead of letting max()/min() of an empty vector
  # return -Inf/Inf with a warning.
  lowerTail <- grid[cdf < 0.05]
  upperTail <- grid[cdf > 0.95]
  q05 <- round(if (length(lowerTail) > 0) max(lowerTail) else min(grid), 3)
  q95 <- round(if (length(upperTail) > 0) min(upperTail) else max(grid), 3)
  new("CNVPosteriorSummary", copyNumbers = cnpost,
      loc = loc, q05 = q05, q95 = q95,
      grid = grid, cdf = cdf, pdf = tp)
})

# Print the most likely normal fraction, its credible interval and the
# copy number posteriors; returns the object invisibly.
setMethod("show", c("CNVPosteriorSummary"), function(object) {
  lower <- object@q05
  upper <- object@q95
  cat(
    "Most likely normal fraction =", object@loc,
    "\nNinety percent credible interval = (", lower, ",", upper, ")\n"
  )
  print(object@copyNumbers)
  invisible(object)
})

# Plot the per-variant (snp-level) posterior curves against the grid of
# normal fractions, one curve per variant and copy number state.
#   x     - a 'CNVPosterior' object
#   place - legend position, passed to legend()
#   lwd   - line width for the curves
#   ...   - passed on to the initial plot() call
# Returns 'x' invisibly.
setMethod("plot", c("CNVPosterior","missing"), function(x, place="topright", lwd=1, ...) {
  post <- x@snppost
  grid <- x@prior@grid
  # empty frame sized to hold every curve
  plot(grid, post[, "Normal", 1], type = "n", xlim = c(0, 1),
       ylim = range(post),
       xlab = "Normal Fraction", ylab = "Posterior Probability",
       lwd = lwd, ...)
  # number of variants is the extent of the third dimension; using the
  # whole dim() vector here would hand a length-3 value to the palettes
  nVariants <- dim(post)[3]
  normalCols  <- colorRampPalette(c("blue", "cyan"))(nVariants)
  deletedCols <- colorRampPalette(c("red", "magenta"))(nVariants)
  gainedCols  <- colorRampPalette(c("green", "yellow"))(nVariants)
  for (i in seq_len(nVariants)) {
    lines(grid, post[, "Normal", i], col = normalCols[i], lwd = lwd)
    lines(grid, post[, "Deleted", i], col = deletedCols[i], lwd = lwd)
    lines(grid, post[, "Gained", i], col = gainedCols[i], lwd = lwd)
  }
  legend(place, c("Deleted", "Normal", "Gained"), lwd = 2,
         col = c("red", "blue", "green"))
  invisible(x)
})

# Draw the gene-level posterior for each copy number state as a curve
# over the grid of normal fractions. Returns the posterior invisibly.
setMethod("hist", c("CNVPosterior"), function(x, place="topright", lwd=1, ...) {
  genePost <- x@posterior
  nuGrid <- x@prior@grid
  deleted <- genePost[, "Deleted"]
  normal <- genePost[, "Normal"]
  gained <- genePost[, "Gained"]
  # the first curve sets up the axes so that all three states fit
  plot(nuGrid, deleted, col = "red", type = "l",
       ylim = range(c(deleted, gained, normal)),
       xlab = "Normal Fraction", ylab = "Posterior Probability",
       lwd = lwd, ...)
  lines(nuGrid, normal, col = "blue", lwd = lwd)
  lines(nuGrid, gained, col = "green", lwd = lwd)
  legend(place, c("Deleted", "Normal", "Gained"), lwd = 2,
         col = c("red", "blue", "green"))
  invisible(genePost)
})

# Show the gene-level posterior as a heat map, with the grid on the
# horizontal axis and one row per copy number state.
# Returns the posterior invisibly.
setMethod("image", c("CNVPosterior"), function(x, ...) {
  grid <- x@prior@grid
  post <- x@posterior
  # y axis is suppressed here and labelled with the state names below
  image(grid, 1:3, post, ylab = "", yaxt = "n", ...)
  stateLabels <- colnames(post)
  mtext(stateLabels, side = 2, at = 1:3)
  invisible(post)
})

############
# update the Bayesian analysis by applying a different prior
# Note that we stole the 'update' method from the idea of
# refitting frequentist models.

# Re-run the posterior computation with a different prior, reusing the
# hidden log likelihoods and observations stored in 'object'.
setMethod("update", c("CNVPosterior"), function(object, prior, ...) {
  hiddenLogLike <- object@hiddenloglike
  observations <- object@observed
  createPosterior(hiddenLogLike, observations, prior)
})
```

## Try the DeepCNV package in your browser

Any scripts or data that you put into this service are public.

DeepCNV documentation built on May 2, 2019, 5:23 p.m.