R/03-posterior.R

Defines functions makeCNVPosterior createPosterior computePosterior

Documented in makeCNVPosterior

# Copyright (C) Kevin R. Coombes 2014-2017

###################################
# Posterior Distribution

# This may be confusing, largely because of the 'hidden' discrete
# prior structure. What I think we actually need is
#    (0) (log)likelihoods for 13 hidden states
#    (1) snp-level (log)likelihoods and posteriors for 3-way CN state
#    (2) gene-level (log)likelihoods and posteriors for 3-way CN state
# Right now,  these are represented by
#    (0) 'hiddenloglike'
#    (1) 'snploglike' and 'snppost'
#    (2) 'loglike' and 'posterior'
setClass("CNVPosterior", 
         representation=list(
             hiddenloglike="array", # grid x hidden x variants
             snploglike="array",    # grid x cnstate x variants
             snppost="array",       # grid x cnstate x variants
             loglike="array",       # grid x cnstate
             posterior="array",     # grid x cnstate
             observed="data.frame",
             prior="CNVPrior"
             ))


computePosterior <- function(loglike, prior) {
  nup <- prior@pdf
  pop <- sweep(loglike, 1, nup, "+")
  # assumes the same answer for pd$Mutation and for pd$SNP
  pop <- sweep(pop, 2, log(cnPrior(prior)), "+")
  # normalize so the sum of the posterior probabilities equals one
  weights <- log( apply(exp(pop), 3, sum) )
  pop<- sweep(pop, 3, weights, '-')
  exp(pop)
}

createPosterior <- function(hll, obs, prior) {
  # so, we are going to compute the one-at-a-time posterior
  # probabilities that Vi=v from the  above stuff and collapse
  # down using the maximum a posteriori (MAP) estimate.  After
  # that, we pretend that the MAP variant types are the truth.
  sll <- array(NA, dim=c(dim(hll)[1], 3, dim(hll)[3]),
               dimnames=list(NULL,
                             cnSet,
                             NULL))
  for (i in 1:dim(hll)[3]) {
    x <- exp(hll[,,i])
    vartype <- as.character(obs$V)[i]
    temp <- lapply(vtSet, function(v) x * pVgT[vartype, v])
    xx <- do.call(cbind, temp)
    temp <- lapply(vtSet, function(v) pGgTS[,,v] * pT[v])
    zz <- do.call(rbind, temp)
    yy <- xx %*% zz
    sll[,,i] <- log(yy)
  }
  # Now 'sll' is the likelihood
  #    Prob(Ki=k, Ni=n | Vbar=vbar, nu=nu0, S=s)
  # where we used, in vbar,  the MAP value of vi for each datarow. We
  # can reapply the beta and copy number priors to start converting
  # these likelihoods into posteriors..
  snppost <- computePosterior(sll, prior)
  # But we're stil not done. need to combine likelihoods across
  # independent variants = datarows
  logprior <- outer(prior@pdf, log(c(0.4, 0.3, 0.3)), "+") # log( P(Theta) )
  loglike <- apply(sll, 1:2, sum)
  lognumer <- loglike + logprior
  M <- max(lognumer)
  temp <- lognumer - M
  foo <- log(sum(exp(temp)))
  logpost <- temp - foo
  posterior <- exp(logpost)
  if (abs(1 - sum(posterior)) > 1e-14) warning("bad scaling")

  
  new("CNVPosterior", 
      loglike=loglike, posterior=posterior,
      snploglike=sll, snppost=snppost,
      hiddenloglike=hll,
      observed=obs, prior=prior)
}

# the 'obs' input must be a data.frame with columns labeled
#      K = number of variant reads
#      N = total number of reads
#      V = type of variant  (SNP or Mutation)
# KRC: should we force this to be an S4-object so we can perform
# validity checking?
makeCNVPosterior <- function(obs, prior) {
  gridPoints <- prior@grid
  hll <- cnvLikelihood(gridPoints, obs) # 'hidden' log likelihood
  # At this point, 'hiddenloglike' is a 3D array of size
  #   gridPoints x genotypes(basecase) x datarows(variants)
  # and it represents the logarithm of
  #     Prob(Ki=k, Ni=n | Gi=g, nu=nu0)
  # We also need something to represent the logarithm of
  #     Prob(Kbar = kbar, Nbar=nbar | Vbar=vbar, nu=nu0, S=s)
  # Since this probability is the product over datarows i, we
  # might take the sum on the log scale
  #    snploglike <- apply(hiddenloglike, 1:3, sum)
  # but this isn't quite correct, since we still need to
  # collapse it for each vector of variantType assignments.
  # The problem is that, with 10 datarows, there are 4^10
  # possible vector assignments to consider, and that's way
  # too many to store and carry around with us.
  #
  createPosterior(hll, obs, prior)
}

#KRC: Why don't we have to export this class to get the 'summary' method for
# 'CNVPosterior' to work?
setClass("CNVPosteriorSummary", 
         representation=list(
           copyNumbers = "data.frame",
           loc="numeric", q05="numeric", q95="numeric",
           grid="numeric", cdf='numeric', pdf='numeric'))

setMethod("as.data.frame", c("CNVPosteriorSummary"), function (x, row.names = NULL, optional = FALSE, ...) {
    data.frame(x@copyNumbers, loc=x@loc, q05=x@q05, q95=x@q95)
})

setMethod("summary", c("CNVPosterior"), function(object, ...) {
  # integrate out nu to get the copy number posteriors
  cnpost <- as.data.frame(as.list(apply(object@posterior, 2, sum)))
  # separately, have to work out the posterior on nu
  # start by integrating out the copy number state
  tp <- apply(object@posterior, 1, sum)
  # find the mode
  grid <- object@prior@grid
  loc <- median(grid[tp == max(tp)])
  cdf <- cumsum(tp)/sum(tp)
  # TODO: Handle (literal) edge cases where MAP estimate is at
  # one end of the interval
  q05 <- round(max(grid[cdf < 0.05]), 3)
  q95 <- round(min(grid[cdf > 0.95]), 3)
  new("CNVPosteriorSummary", copyNumbers=cnpost,
      loc=loc, q05=q05, q95=q95,
      grid=grid, cdf=cdf, pdf=tp)
})

setMethod("show", c("CNVPosteriorSummary"), function(object) {
  cat("Most likely normal fraction =", object@loc,
      "\nNinety percent credible interval = (", object@q05, ',', object@q95, ")\n")
  print(object@copyNumbers)
  invisible(object)
})


setMethod("plot", c("CNVPosterior","missing"), function(x, place="topright", lwd=1, ...) {
  post <- x@snppost
  grid <- x@prior@grid
  plot(grid, post[,"Normal",1], type="n", xlim=c(0,1),
       ylim=range(post), 
       xlab="Normal Fraction", ylab="Posterior Probability",
       lwd=lwd, ...)
  n <- dim(post)[3]
  c1p <- colorRampPalette(c("blue", "cyan"))(n)
  c2p <- colorRampPalette(c("red", "magenta"))(n)
  c3p <- colorRampPalette(c("green", "yellow"))(n)
  for (i in 1:n) {
    lines(grid, post[,"Normal",i], col=c1p[i], lwd=lwd)
    lines(grid, post[,"Deleted",i], col=c2p[i], lwd=lwd)
    lines(grid, post[,"Gained",i], col=c3p[i], lwd=lwd)
  }
  legend(place, c("Deleted", "Normal", "Gained"), lwd=2,
         col=c("red", "blue", "green"))
  invisible(x)
})

setMethod("hist", c("CNVPosterior"), function(x, place="topright", lwd=1, ...) {
  post <- x@posterior
  grid <- x@prior@grid
  plot(grid, post[,"Deleted"], col="red", type="l",
       ylim=range(c(post[,"Deleted"], post[,"Gained"], post[,"Normal"])), 
       xlab="Normal Fraction", ylab="Posterior Probability", lwd=lwd, ...)
  lines(grid, post[,"Normal"], col="blue", lwd=lwd)
  lines(grid, post[,"Gained"], col="green", lwd=lwd)
  legend(place, c("Deleted", "Normal", "Gained"), lwd=2,
         col=c("red", "blue", "green"))
  invisible(post)
})

setMethod("image", c("CNVPosterior"), function(x, ...) {
  post <- x@posterior
  grid <- x@prior@grid
  image(grid, 1:3, post, ylab='', yaxt='n', ...)
  mtext(colnames(post), side=2, at=1:3)
  invisible(post)
})

############
# update the Bayesian analysis by applying a different prior
# Note that we stole the 'update' method from the idea of
# refitting frequentist models.

setMethod("update", c("CNVPosterior"), function(object, prior, ...) {
  # this simply repeats the code from 'makeCNVPosterior'
  # should really refactor it as a function call...
  createPosterior(object@hiddenloglike, 
                  object@observed,
                  prior)
})

Try the DeepCNV package in your browser

Any scripts or data that you put into this service are public.

DeepCNV documentation built on May 2, 2019, 5:23 p.m.