R/calculate.ci.rLNLN.R
In stochprofML: Stochastic Profiling using Maximum Likelihood Estimation

Documented in calculate.ci.rLNLN

calculate.ci.rLNLN <-
function(alpha,parameter,prev.result,dataset,n,TY,fix.mu=F,fixed.mu) {
# Calculates approximate marginal maximum likelihood confidence intervals with
# significance level alpha for all parameters in the model. The intervals are
# approximate because the function uses the formula
#    [ theta_i +/- q_{1-alpha/2} * sqrt(H_ii) ],
# where
   # * theta_i is the estimate of the i.th parameter,
# * q_{1-alpha/2} is the 1-alpha/2 quantile of the standard normal distribution,
# * H is the inverse Hessian of the negative log likelihood function evaluated
#   at the maximum likelihood estimate; H_ii is the i.th diagonal element of H.
# This approximation implicitly assumes that the log likelihood function is unimodal.
# The confidence interval is first calculated on the transfomed, unrestricted parameter
# space and then back-transformed to the original one.
#
# Parameters:
# - alpha is the significance level
# - parameter is the maximum likelihood estimate around which the confidence interval
#   is centered; if this value is missing, it is determined from prev.result. This
#   parameter has to be on the original scale, not on the logit-/log-transformed scale
#   as used during the estimation procedure. It has to be (TY*(m+2)-1)-dimensional, even for
#   fix.mu==T.
# - prev.result is a list of previous results as returned by stochprof.results. This
#   variable is only used when "parameter" is missing.
# - dataset is a matrix which contains the cumulated expression data over all cells in a tissue sample.
#   Columns represent different genes, rows represent different tissue samples.
# - n is the number of cells taken from each tissue sample. This can also be a vector stating how many
#   cells are in each sample seperatly.
# - TY is the number of types of cells that is assumed in the stochastic model.
# - fix.mu (default=F) indicates whether the log-means are kept fixed in the estimation procedure or whether
#   they are to be estimated.
# - fixed.mu (no default, needs to be specified only when fix.mu==T) is a vector containing the values to which
#   the log-means should be fixed if fix.mu==T. The order of components is as follows:
#   (mu_type_1_gene_1, mu_type_1_gene_2, ..., mu_type_2_gene_1, mu_type__gene_2, ...)


   # definition of variables (necessary for CMD R check)
   # (these variables will be initialized later, but they are not visible as global functions/data)
   d.sum.of.mixtures <- NULL
   backtransform.par <- NULL
   stochprof.results <- NULL
   transform.par <- NULL
   rm(d.sum.of.mixtures)
   rm(backtransform.par)
   rm(stochprof.results)
   rm(transform.par)

   # number of genes
   m <- ncol(dataset)

   ############################
   ## define target function ##
   ############################

   # log likelihood for one gene
   loglikeli <- function(y,p,mu,sigma) {
      # p, mu and sigma are of length TY
      max(-10^7,sum(d.sum.of.mixtures(y,n,p,mu,sigma,logdens=T)))
   }

   # this function will be minimized (the function "to.minimize" below just
   # changes the parameterisation)
   target.function <- function(p,mu,sigma) {
      # p is of length TY
      # mu is of length TY*m
      # sigma is of length TY

      # build mu.matrix such that the g.th row contains the values for gene g
      mu.matrix <- matrix(mu,byrow=F,ncol=TY)

      # consider negative log likelihood because the target function will be minimized
      this.sum <- 0
      for (g in 1:m) {
         this.sum <- this.sum - loglikeli(dataset[,g],p,mu.matrix[g,,drop=T],sigma)
      }
      return(this.sum)
   }

   to.minimize <- function(theta) {
      # theta is the transformed parameter.
      # No penalization included here.

      # When a parameter is already at the boundary, e.g. log(sigma)=-Inf,
      # then the hessian function will pass an NaN to this function. In that case,
      # an error would occur. Hence, replace the NaNs by
      na.positions <- which(is.na(theta))
      if (sum(na.positions)>0) {
         theta[na.positions] <- parameter.trans[na.positions]
      }

      back.theta <- backtransform.par(this.par=theta,m=m,fix.mu=fix.mu,fixed.mu=fixed.mu)
      if (TY>1) {
         p <- back.theta[1:(TY-1)]
         p <- c(p,1-sum(p))
      }
      else {
         p <- 1
      }
      mu <- back.theta[TY:((m+1)*TY-1)]
      sigma <- back.theta[((m+1)*TY):length(back.theta)]

      a <- target.function(p,mu,sigma)
      a <- min(10^7,a)

      return(a)
   }

   # if "parameter" is missing, it has to be determined as the optimum from the previous results
   if (missing(parameter)) {
      res <- stochprof.results(prev.result=prev.result,TY=TY,show.plots=F)
      parameter <- res[1,-ncol(res)]
   }

   # when fix.mu==T: check whether fixed.mu (to which mu should be fixed) agrees with the mu part
   # of "parameter". If not, something went wrong. If fixed.mu is missing, determine it from "parameter"
   if (fix.mu) {
      mu.in.parameter <- parameter[TY:((m+1)*TY-1)]
      if ((missing(fixed.mu)) || (is.null(fixed.mu))) {
         fixed.mu <- mu.in.parameter
      }
      else {
         differ <- fixed.mu - mu.in.parameter
         if (sum(round(differ,4))>0) {
            stop("calculate.ci: wrong mu fixed.")
         }
      }
   }

   # transform parameter to unrestricted parameter space;
   # if fix.mu==T, parameter.trans becomes lower-dimensional than parameter
   parameter.trans <- transform.par(parameter,m,fix.mu)

   # Calculate Hessian matrix of to.minimize, i.e. of *negative* log-likelihood.
   # This is the Hessian matrix in the unrestricted parameterization, not in the original one.
   hesse <- hessian(func=to.minimize,x=parameter.trans,method="Richardson")

   na.positions <- which(is.na(hesse[1,]))
   hesse[is.na(hesse)] <- 0

   # inverse Hessian
   Sigma <- ginv(hesse)

   # 1-alpha/2 quantile of standard normal distribution
   coeff <- qnorm(1-alpha/2)

   # check whether there are negative diagonal elements in the Hessian
   if (sum(diag(Sigma)<0)>0) {
      print("Hessian not positive semi-definite; return NULL.")
      return(NULL)
   }

   # confidence interval for transformed parameter
   CI.trans.lower <- parameter.trans - coeff * sqrt(diag(Sigma))
   CI.trans.upper <- parameter.trans + coeff * sqrt(diag(Sigma))
   CI.trans <- cbind(CI.trans.lower,CI.trans.upper)

   # confidence interval for original parameter
   CI <- matrix(NA,ncol=2,nrow=TY*(m+2)-1)
   CI[,1] <- backtransform.par(this.par=CI.trans[,1],m=m,fix.mu=fix.mu,fixed.mu=fixed.mu)
   CI[,2] <- backtransform.par(this.par=CI.trans[,2],m=m,fix.mu=fix.mu,fixed.mu=fixed.mu)
   colnames(CI) <- c("lower","upper")

   # bounds for p might not be in correct order (i.e. lower <= upper) due to transformation;
   # in that case, swap
   if (TY>1) {
      for (i in 1:(TY-1)) {
         if (CI[i,1]>CI[i,2]) {
            a <- CI[i,1]
            CI[i,1] <- CI[i,2]
            CI[i,2] <- a
         }
      }
   }

   return(CI)
}