R/SeparateBackground.R

Defines functions SeparateSignatureFromBackgroundOptions Nloptr2Signature Nloptr2BGMutationCounts Solution2Signature Solution2SigVec ObjFn1 SeparateSignatureFromBackground test_g_ineq NegLLHOfSignature LLHOfSignatureGivenSpectrum

Documented in LLHOfSignatureGivenSpectrum NegLLHOfSignature ObjFn1 SeparateSignatureFromBackground SeparateSignatureFromBackgroundOptions

#' The log likelihood of a \strong{single} observed spectrum given the signature.
#'
#' Assumes that all mutations \code{spectrum} were generated by \code{signature}.
#' 
#' @param spectrum A single background spectrum.
#' 
#' @param signature A signature as a numeric vector.
#' 
#' @param nbinom.size The dispersion parameter for the negative
#'        binomial distribution; smaller is more dispersed.
#'        See \code{\link[stats]{NegBinomial}}.
#' 
#' @keywords internal
#' 
LLHOfSignatureGivenSpectrum <- function(spectrum, signature, nbinom.size) {
    expected.counts <- sum(spectrum) * signature
    LLH <- LLHSpectrumNegBinom(spectrum, expected.counts, nbinom.size)
    return(LLH)
  }


#' Objective function for estimating a background signature from multiple background spectra.
#' 
#' Returns the negative log likelihood
#' that the signature with an
#' estimated negative binomial
#' dispersion parameter,
#' generated the observed (multiple) spectra given the
#' likelihood of the observed total counts
#' attributed to the signature.
#' 
#' @keywords internal
#' 
#' @param sig.and.nbinom.size Concatenation of signature as a vector and
#' the negative binomial dispersion parameter.For the dispersion parameter for the negative
#'        binomial distribution, smaller is more dispersed.
#'        See \code{\link[stats]{NegBinomial}}.
#' 
#' @param spectra The observed spectra as an \code{\link[ICAMS]{ICAMS}}
#' \code{catalog}.
#' 
#' @return -1 * the log likelihood that the the input signature,
#' dispersion parameter, and mutation count generated 
#' the observed spectrum

NegLLHOfSignature <- function(sig.and.nbinom.size, spectra) {
  len <- length(sig.and.nbinom.size)
  nbinom.size <- sig.and.nbinom.size[len]
  sig         <- sig.and.nbinom.size[-len]
  loglh <- 0
  for (i in 1:ncol(spectra)) {
    loglh.i <-
      LLHOfSignatureGivenSpectrum(spectra[ , i, drop = FALSE],
                                  sig,
                                  nbinom.size)
    loglh <- loglh + loglh.i
  }
  
  return(-1 * loglh)   
}


test_g_ineq <- function(est.target.sig.and.b, # Parameters to optimize
                        obs.spectra,     
                        bg.sig.info ) {
  
  bg.sig.profile <- bg.sig.info[["background.sig"]]
  
  stopifnot("matrix" %in% class(bg.sig.profile))
  
  
  len.sig <- nrow(bg.sig.profile)

  x1 <- est.target.sig.and.b[1:len.sig]
  abs(1 - sum(x1))  
}


#' Estimate a signature from experimentally exposed spectra minus a background signature.
#' 
#' @description 
#'
#' We index mutation channels (e.g. \code{ACA > AAA, ACC > AAC, ...}) by \eqn{j}, \eqn{j \in 1...96}.
#' 
#' We index input mutational spectra by \eqn{i}. 
#'
#' Let
#'  
#' \eqn{g = g_1, g_2,\ldots , g_{96}}, with \eqn{\Sigma g_j = 1},
#'  be the previously determined, input background signature profile,
#' 
#' \eqn{s^i, i \in 1, 2,\ldots} be the input spectra,
#' from exposed samples, usually only 2 or 3,
#' 
#' \eqn{b^i, i \in 1, 2,\ldots} be the (to-be-estimated)
#' numbers of mutations due to the background signature in each
#' \eqn{s^i}, and
#' 
#' \eqn{t = t_1, t_2,\ldots , t_{96}}, with \eqn{\Sigma t_j = 1},
#' be the (to-be-estimated) target signature due to an exposure.
#' 
#' We want to maximize \eqn{\Pi^iP(s^i|b^i,t)P(b^i)} over 
#' \eqn{b^1, b^2,\dots} and \eqn{t}.  (Note that the code
#' actually minimizes the additive inverse of this.)
#' 
#' \eqn{P(b^i)} is estimated from the distribution of previously observed
#' numbers of mutations in untreated samples, with the additional 
#' constraint that \eqn{b^i \le |s^i|}), where \eqn{|s^i|} is defined as
#' the total number of mutations in spectrum \eqn{s^i}, i.e.
#' \eqn{|s^i| = \Sigma_j s^i_j}, \eqn{j \in 1...96}.
#' 
#' \eqn{P(s^i|b^i,t)} is estimated as follows:
#' 
#' The expected number of mutations in each
#' mutation category, \eqn{j}, is estimated as
#' 
#' \eqn{e^i_j = g_jb^i + t_j(|s^i| - b^i)}.
#' 
#' Then \eqn{P(s^i|e^i)} is estimated as \eqn{\Pi_jP(s^i_j|e^i_j)}.
#' 
#' \eqn{P(s^i_j|e^i_j)} is estimated from a negative binomial distribution
#' centered on each \eqn{e^i_j}; see the \code{sig.nbinom.size} elements of
#' the \code{\link{background.info}} package variables.
#' 
#' @param spectra The spectra from which to subtract the background,
#'   as a matrix or \code{\link[ICAMS]{ICAMS}} catalog.
#'   
#' @param bg.sig.info Information about the background signature. See
#'   \code{\link{background.info}}.
#'   
#' @param m.opts Options to pass to \code{\link[nloptr]{nloptr}}.
#' 
#' @param start.b.fraction The estimated fraction of the mutations in
#'   \code{spectra} due to the background signature.
#' 
#' @details
#' See \code{\link{ObjFn1}}.
#' 
#' @return A list with the elements \describe{
#'
#'   \item{\code{inferred.target.sig}}{The estimated target signature as a numerical
#'    vector.}
#'    
#'   \item{exposures.to.target.sig}{The estimated total number of mutations due
#'     to the target signature in each input spectrum.}
#'     
#'   \item{\code{exposures.to.bg.sig}}{The estimated total number of mutations due
#'    to the background in each input spectrum.}
#'    
#'   \item{\code{message}}{The \code{message} element of \code{all.opt.ret}.}
#'   
#'   \item{\code{all.opt.ret}}{The entire return value from the optimization.
#'   See \code{\link[nloptr]{nloptr}}}
#' }
#' 
#' @export

SeparateSignatureFromBackground <-
  function(spectra,
           bg.sig.info,
           m.opts = NULL,
           start.b.fraction = 0.1) {
    
    if (is.null(m.opts)) m.opts <- SeparateSignatureFromBackgroundOptions()
    
    spectra.catalog.type <- attr(spectra, "catalog.type", exact = TRUE)
    stopifnot(!is.null(spectra.catalog.type))
    sig0 <- MeanOfSpectraAsSig(spectra)$mean.sig
    
    b.x0 <- start.b.fraction * colSums(spectra)
    est.target.sig.and.b.x0 <- c(sig0, b.x0)
    
    # Each element of the signature <= 1.
    upper.bounds.of.target.sig <- rep(1, nrow(spectra))
    
    ret <- nloptr::nloptr(
      x0          = est.target.sig.and.b.x0,
      
      eval_f      = ObjFn1,
      
      eval_g_ineq = test_g_ineq,
      
      lb          = rep(0, length(est.target.sig.and.b.x0)),
      
      ub          = c(upper.bounds.of.target.sig, 
                      
                      colSums(spectra) # The contribution of the background
                                       # should not exceed the total count
                                       # of the spectrum
                      ),
      opt = m.opts,
      obs.spectra = spectra,     
      bg.sig.info = bg.sig.info)
    
    soln <- ret$solution
    
    bg.exposure <- soln[(1 + nrow(spectra)):length(soln)]
    obs.counts  <- colSums(spectra)
  
    sig.to.return <- soln[1:nrow(spectra)]
    sum.sig <- sum(sig.to.return)
    is.one <- all.equal(1, sum.sig, tolerance = 1e-5)
    if (is.character(is.one)) {
      message("Sum of elements in inferred signature is not 1; ", is.one)
    }
    # sum.sig is probably not quite equal to 1
    sig.to.return <- sig.to.return / sum.sig    
    return(list(inferred.target.sig     = sig.to.return,
                exposures.to.target.sig = obs.counts - bg.exposure,
                exposures.to.bg.sig     = bg.exposure,
                message                 = ret$message,
                all.opt.ret             = ret))
  }


#' Objective function for \code{\link{SeparateSignatureFromBackground}}.
#' 
#' @param est.target.sig.and.b A numeric vector of which elements
#' 1:96 are the signature (as a vector) and the remaining elements
#' are the coefficients for each input spectrum in \code{obs.spectra}.
#' 
#' @param obs.spectra The observed spectra, which may be the
#' sum of the activities of the background signature and the signature
#' of an experimental treatment.
#' 
#' @param bg.sig.info Information on the background signature. See for example
#' \code{\link{background.info}}, e.g. \code{background.info[["HepG2"]]}.
#' 
#' The caller will seek to minimize the value of this function.
#' 
#' @keywords internal
#' 
ObjFn1 <- function(
  est.target.sig.and.b, # Parameters to optimize
  obs.spectra,     
  bg.sig.info   
) {
  bg.sig.profile <- bg.sig.info[["background.sig"]]
  
  stopifnot("matrix" %in% class(bg.sig.profile))
  
  
  len.sig <- nrow(bg.sig.profile)
  est.target.sig <- est.target.sig.and.b[1:len.sig]
  
  b <- est.target.sig.and.b[(1 + len.sig):length(est.target.sig.and.b)]
  # b is a vector of the number of mutations contributed
  # by the background to each spectrum

  loglh <- 0
  for (i in 1:ncol(obs.spectra)) { # For each observed spectrum
    obs.spectrum <- obs.spectra[ , i, drop = FALSE]
    total.obs.count <- sum(obs.spectrum)
    expected.bg.counts <- bg.sig.profile * b[i]
    expected.counts <- 
      expected.bg.counts + (est.target.sig * (total.obs.count - b[i]))
    if (FALSE) { # Experimental code, if enabled, does not converge
      # (or converges very slowly).
      bg.greater.than.spectrum <- which(expected.bg.counts > obs.spectrum)
      if (length(bg.greater.than.spectrum) > 0) {
        message("x")
        return(Inf)
      } }
    loglh1.i <- LLHSpectrumNegBinom(
      spectrum        = obs.spectrum,
      expected.counts = expected.counts,
      nbinom.size     = bg.sig.info[["sig.nbinom.size"]])  # This is the dispersion parameter for each channel
 
    loglh2.i <- dnbinom(x    = round(b[i]), 
                        mu   = bg.sig.info[["count.nbinom.mu"]],
                        size = bg.sig.info[["count.nbinom.size"]], # This is the dispersion parameter for the number of background signature mutations
                        log  = TRUE)
    
    # We might try a different distribtuion, e.g. draw from 
    # hist(rgamma(100000, shape = 8, rate = 6) * 1534 / (8 / 6))
    # or
    # rgamma(100000, shape = 8, rate = 3) * 1534 / (8 / 3))
    # rather than 
    # hist(rnbinom(10000, mu = 1534, size = 3))
    # or
    # hist(0.1 * rnbinom(10000, mu = 15340, size = 4))
    
    loglh <- loglh + loglh1.i + loglh2.i
  }
  
  return(-1 * loglh)
}


Solution2SigVec <- function(solution, sig.number = 96) {
  sig.vec <- solution[1:sig.number]
  return(sig.vec / sum(sig.vec))
}


Solution2Signature <- function(solution, 
                                sig.number = 96,
                                ref.genome = NULL,
                                region = "genome") {
  sig <- matrix(Solution2SigVec(solution), ncol = 1)
  rownames(sig) <- ICAMS::catalog.row.order[["SBS96"]]
  colnames(sig) <- "Inferred.sig"
  sig <- ICAMS::as.catalog(
    sig, ref.genome = ref.genome,
    region = region,
    catalog.type = "counts.signature"
  )
  return(sig)
}


Nloptr2BGMutationCounts <- function(nloptr.retval, sig.number = 96) {
  solution <- nloptr.retval$solution
  return(solution[(sig.number + 1):length(solution)])
}


Nloptr2Signature <- function(nloptr.retval, sig.number = 96) {
  return(Solution2Signature(nloptr.retval$solution, sig.number))
}


#' Return a default value to pass as the \code{m.opts} argument to \code{\link{SeparateSignatureFromBackground}}.
SeparateSignatureFromBackgroundOptions <- function() {
  return(
    list(algorithm = "NLOPT_LN_COBYLA",
         maxeval = 20000, 
         print_level = 0,
         xtol_rel = 0.000001,  # 0.0001,)
         xtol_abs = 1e-07 # 1e-06
         )
  )
}
steverozen/mSigBG documentation built on Nov. 4, 2022, 10:58 a.m.