R/em.R

# to compare using expression data with using just the signal,
# we need an EM algorithm to estimate a log-normal mixture
library(methods)

# helper for more numerical precision when summing up log likelihoods
logsum <- function(x) {
  m = max(x)
  return(log(sum(exp(x - m))) + m)
}
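
## usage sketch: combine log-scale values without underflow, e.g.
##   ll <- c(-1000, -1001)
##   log(sum(exp(ll)))  # -Inf, because exp() underflows
##   logsum(ll)         # approx -999.69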

# distributions are defined in an object-oriented way:
# each has a logLikelihood and a fit function

# dummy generic logLikelihood function (returns -Inf)
logLikelihood <- function(model, x) return (-Inf)
setGeneric("logLikelihood")

# dummy generic fit function (just returns the model)
fit <- function(model, x, logExpectation) {
  return(model)
}
setGeneric("fit")

drawFrom <- function(model, N) {
    print("drawFrom dummy")
    return(rep(NA, N))
}
setGeneric("drawFrom")

CDF <- function(model, x) {
  print("CDF dummy")
  return(rep(NA, length(x)))
}
setGeneric("CDF")

## default is just to return the total length of all slots of the object
nparam <- function(model) {
  slots <- slotNames(model)
  return(sum(sapply(slots, function(sname) length(slot(model, sname)))))
}
setGeneric("nparam")

# the superclass is Distribution
Distribution <- setClass("Distribution")

# allow plotting of the densities and mixture
plotDensity <- function(model, x=NULL, xlim=NULL, add=FALSE, col="black", alpha=1) {
  print("plotDensity (generic)")
}
setGeneric("plotDensity")

plotDensity.Distribution <- function(model, x=NULL, xlim=NULL, add=FALSE, col="black", alpha=1) {
  # print("plotDensity.Distribution")
  if (is.null(x)) {
    if (is.null(xlim))
      stop("please specify either data or xlim")
  } else {
    if (is.null(xlim)) {
      xlim = range(x)
    } else {
      x = x[xlim[1] <= x & x <= xlim[2]]
    }
  }
  xx = seq(xlim[1], xlim[2])
  d = exp(logLikelihood(model, xx)) * alpha
  if (!is.null(x) && !add) {
    hist(x, freq=F, add=add, breaks=xlim[2], main="", xlab="", ylab="")
  }
  if (add || !is.null(x)) {
    lines(xx, d, col=col, lwd=2)
  } else {
    plot(xx, d, type="l", col=col, lwd=2, main="", xlab="", ylab="")
  }
}
setMethod("plotDensity", signature=c(model="Distribution", x="numeric", xlim="numeric", add="logical", col="character", alpha="numeric"), plotDensity.Distribution)



# for now we just put the lognormal distribution
LogNormal <- setClass("LogNormal", representation=list(meanlog="numeric", sdlog="numeric"), contains="Distribution")

logLikelihood.LogNormal <- function(model, x) {
  return(dlnorm(x, meanlog=model@meanlog, sdlog=model@sdlog, log=T))
}
setMethod("logLikelihood", signature=c(model="LogNormal", x="numeric"), logLikelihood.LogNormal)

fit.LogNormal <- function(model, x, logExpectation) {
  # adapted code from MASS::fitdistr
  # we need to add a weighting by the Expectation
  expect = exp(logExpectation)
  lx = log(x)
  n = sum(expect)
  meanlog = sum(lx * expect) / n
  sdlog = sqrt(sum(expect * (lx - meanlog)^2) / n)
  newmodel = new("LogNormal")
  newmodel@meanlog = meanlog
  newmodel@sdlog = sdlog
  return(newmodel)
}
setMethod("fit", signature=c(model="LogNormal", x="numeric", logExpectation="numeric"), fit.LogNormal)

# we also define this for a zero inflated negative binomial
Zinba <- setClass("Zinba", representation=list(size="numeric", mu="numeric", beta="numeric"), contains="Distribution")

logLikelihood.Zinba <- function(model, x) {
  return(log(dzinba(x, size=model@size, mu=model@mu, beta=model@beta)))
}
setMethod("logLikelihood", signature=c(model="Zinba", x="numeric"), logLikelihood.Zinba)

fit.Zinba <- function(model, x, logExpectation) {
  if (length(model@size) == 0) {
    start = NULL
  } else {
    start = c(model@size, model@mu, model@beta)
    names(start) = c("size", "mu", "beta")
  }
  par = fitzinba(x, weight=exp(logExpectation), start=start)
  # newmodel = Zinba()
  newmodel = new("Zinba")
  newmodel@size = par["size"]
  newmodel@mu = par["mu"]
  newmodel@beta = par["beta"]
  return(newmodel)
}
setMethod("fit", signature=c(model="Zinba", x="numeric", logExpectation="numeric"), fit.Zinba)

drawFrom.Zinba <- function(model, N) {
    return(rzinba(N, model@size, model@mu, model@beta))
}
setMethod("drawFrom", signature=c(model="Zinba", N="numeric"), drawFrom.Zinba)

Zero <- setClass("Zero", representation=list(p="numeric"), contains="Distribution")

fit.Zero <- function(model, x, logExpectation) {
    return(model)
}
setMethod("fit", signature=c(model="Zero", x="numeric", logExpectation="numeric"), fit.Zero)

logLikelihood.Zero <- function(model, x) {
  ll = rep(-Inf, length(x))
  ll[x == 0] = 0
  return(ll)
}
setMethod("logLikelihood", signature=c(model="Zero", x="numeric"), logLikelihood.Zero)

drawFrom.Zero <- function(model, N) {
    return(rep(0, N))
}
setMethod("drawFrom", signature=c(model="Zero", N="numeric"), drawFrom.Zero)

nparam.Zero <- function(model) return(0)
setMethod("nparam", signature=c(model="Zero"), nparam.Zero)


# we also define this for a zero inflated negative binomial using a C implementation
CZinba <- setClass("CZinba", representation=list(size="numeric", mu="numeric", beta="numeric"), contains="Distribution")

logLikelihood.CZinba <- function(model, x) {
  return(log(dzinba(x, size=model@size, mu=model@mu, beta=model@beta)))
}
setMethod("logLikelihood", signature=c(model="CZinba", x="numeric"), logLikelihood.CZinba)

fit.CZinba <- function(model, x, logExpectation) {
  if (length(model@size) == 0) {
    start = NULL
  } else {
    start = c(model@size, model@mu, model@beta)
    names(start) = c("size", "mu", "beta")
  }
  par = .fitzinba(x, weight=exp(logExpectation), start=start)
  # newmodel = CZinba()
  newmodel = new("CZinba")
  newmodel@size = par["size"]
  newmodel@mu = par["mu"]
  newmodel@beta = par["beta"]
  return(newmodel)
}
setMethod("fit", signature=c(model="CZinba", x="numeric", logExpectation="numeric"), fit.CZinba)

# just the regular negative binomial (special case of the zero inflated)
Negbinom <- setClass("Negbinom", representation=list(size="numeric", mu="numeric"), contains="Distribution")

logLikelihood.Negbinom <- function(model, x) {
  return(dnbinom(x, size=model@size, mu=model@mu, log=T))
}
setMethod("logLikelihood", signature=c(model="Negbinom", x="numeric"), logLikelihood.Negbinom)

## fit.Negbinom <- function(model, x, logExpectation) {
##   if (length(model@size) == 0) {
##     start = NULL
##   } else {
##     start = c(model@size, model@mu, 0)
##     names(start) = c("size", "mu", "beta")
##   }
##   par = fitzinba(x, weight=exp(logExpectation), start, fix.beta=TRUE)
##   # newmodel = Zinba()
##   newmodel = new("Negbinom")
##   newmodel@size = par["size"]
##   newmodel@mu = par["mu"]
##   return(newmodel)
## }

fit.Negbinom <- function(model, x, logExpectation) {
  par = .fitzinba(x, weight=exp(logExpectation), fix.beta=T)
  # newmodel = Zinba()
  newmodel = new("Negbinom")
  newmodel@size = par["size"]
  newmodel@mu = par["mu"]
  return(newmodel)
}
setMethod("fit", signature=c(model="Negbinom", x="numeric", logExpectation="numeric"), fit.Negbinom)

drawFrom.Negbinom <- function(model, N) {
    # use named arguments: positionally rnbinom would interpret mu as prob
    return(rnbinom(N, size=model@size, mu=model@mu))
}
setMethod("drawFrom", signature=c(model="Negbinom", N="numeric"), drawFrom.Negbinom)

CDF.Negbinom <- function(model, x) {
  return(pnbinom(x, size=model@size, mu=model@mu))
}
setMethod("CDF", signature=c(model="Negbinom", x="numeric"), CDF.Negbinom)


## Gamma distribution
Gamma <- setClass("Gamma", representation=list(shape="numeric", rate="numeric"), contains="Distribution")

logLikelihood.Gamma <- function(model, x) {
  return(dgamma(x, shape=model@shape, rate=model@rate, log=T))
}
setMethod("logLikelihood", signature=c(model="Gamma", x="numeric"), logLikelihood.Gamma)

fit.Gamma <- function(model, x, logExpectation) {
  ## this is modified from MASS::fitdistr
  if (any(x <= 0))
    warning("gamma values must be > 0")

  n <- length(x)
  weight <- exp(logExpectation)
  m <- sum(x * weight) / n
  v <- sum(weight * (x - m)^2) / n

  ## method of moments estimates as starting values
  start <- list(shape = m^2/v, rate = m/v)
  control <- list(parscale = c(1, start$rate))

  ## define the objective function (negative weighted log likelihood)
  obj <- function(par) -sum(weight[x > 0] * dgamma(x[x > 0], shape=par[["shape"]], rate=par[["rate"]], log=TRUE)) / n

  fit <- optim(start, obj, control=control)
  if (fit$convergence != 0) {
    warning("gamma fit did not converge")
  }
  par <- fit$par

  newmodel = new("Gamma")
  newmodel@shape = par["shape"]
  newmodel@rate = par["rate"]
  return(newmodel)
}
setMethod("fit", signature=c(model="Gamma", x="numeric", logExpectation="numeric"), fit.Gamma)

drawFrom.Gamma <- function(model, N) {
  return(rgamma(N, shape=model@shape, rate=model@rate))
}
setMethod("drawFrom", signature=c(model="Gamma", N="numeric"), drawFrom.Gamma)

CDF.Gamma <- function(model, x) {
  return(pgamma(x, shape=model@shape, rate=model@rate))
}
setMethod("CDF", signature=c(model="Gamma", x="numeric"), CDF.Gamma)


## Left-censored Gamma distribution
LeftCensoredGamma <- setClass("LeftCensoredGamma",
                              representation=list(shape="numeric", rate="numeric",cutoff="numeric"),
                              contains="Distribution")

logLikelihood.LeftCensoredGamma <- function(model, x) {
  values<-ifelse(x < model@cutoff,
                 pgamma(model@cutoff,shape=model@shape, rate=model@rate, log.p=T),
                 dgamma(x, shape=model@shape, rate=model@rate, log=T))
  return(values)
}
setMethod("logLikelihood", signature=c(model="LeftCensoredGamma", x="numeric"), logLikelihood.LeftCensoredGamma)

fit.LeftCensoredGamma <- function(model, x, logExpectation) {

  n <- length(x)
  weight <- exp(logExpectation)
  m <- sum(x * weight) / n
  v <- sum(weight * (x - m)^2) / n

  ## method of moments estimates as starting values
  start <- list(shape = m^2/v, rate = m/v)
  control <- list(parscale = c(1, start$rate))

  ## define the objective function
  obj <- function(par) -sum(weight *
                              ifelse(x >= model@cutoff,
                                     dgamma(x, shape=par[["shape"]], rate=par[["rate"]], log=TRUE),
                                     pgamma(model@cutoff,shape=par[["shape"]], rate=par[["rate"]],log.p=TRUE))) / n

  # NA warnings sometimes appear during optimization
  suppressWarnings(fit <- optim(start, obj, control=control))

  if (fit$convergence != 0) {
    warning("fit did not converge")
  }
  par <- fit$par

  newmodel = new("LeftCensoredGamma")
  newmodel@shape = par["shape"]
  newmodel@rate = par["rate"]
  newmodel@cutoff=model@cutoff
  return(newmodel)
}

setMethod("fit", signature=c(model="LeftCensoredGamma", x="numeric", logExpectation="numeric"),
          fit.LeftCensoredGamma)

drawFrom.LeftCensoredGamma <- function(model, N) {
  # note: this draws from the uncensored gamma
  return(rgamma(N, shape=model@shape, rate=model@rate))
}
setMethod("drawFrom", signature=c(model="LeftCensoredGamma", N="numeric"), drawFrom.LeftCensoredGamma)

CDF.LeftCensoredGamma <- function(model, x) {
  return(pgamma(x, shape=model@shape, rate=model@rate))
}
setMethod("CDF", signature=c(model="LeftCensoredGamma", x="numeric"), CDF.LeftCensoredGamma)

## the EM behaves strangely sometimes, so we try truncated distributions
## beta is the normalization factor that we need to get a proper density
## it is 1 / cdf(cutpoint) for left=F and 1 / (1 - cdf(cutpoint)) for left=T
setClass("TruncatedDistribution", representation=list(distribution="Distribution", cutpoint="numeric", left="logical", beta="numeric"), contains="Distribution")
TruncatedDistribution <- function(distribution, cutpoint, left) {
  # beta = CDF(distribution, cutpoint)
  # if (left) {
  #   beta = 1 - beta
  # }
  # beta = 1 / beta
  ## some problems with this, so we just use 1 for the moment
  beta = 1
  td = new("TruncatedDistribution", distribution=distribution, cutpoint=cutpoint, left=left, beta=beta)
  return(td)
}
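
## usage sketch (not run): keep only the mass of a Gamma above a cutpoint:
##   g <- new("Gamma", shape=2, rate=1)
##   td <- TruncatedDistribution(g, cutpoint=1, left=TRUE)
##   logLikelihood(td, c(0.5, 2))  # first entry is -Inf (truncated region)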


truncate <- function(model, x) {
  if (model@left) {
    truncated = x <= model@cutpoint
  } else {
    truncated = x > model@cutpoint
  }
  return(truncated)
}
# setMethod("truncate", signature=c(model="TruncatedDistribution", x="numeric"))

logLikelihood.TruncatedDistribution <- function(model, x) {
  ll = logLikelihood(model@distribution, x)
  ll = ll + log(model@beta)
  truncated = truncate(model, x)
  ll[truncated] = -Inf
  return(ll)
}
setMethod("logLikelihood", signature=c(model="TruncatedDistribution", x="numeric"), logLikelihood.TruncatedDistribution)

fit.TruncatedDistribution <- function(model, x, logExpectation) {
  truncated = truncate(model, x)
  lexp = logExpectation
  lexp[truncated] = -Inf

  newdist = fit(model@distribution, x, lexp)
  newmodel = TruncatedDistribution(newdist, model@cutpoint, model@left)

  return(newmodel)
}
setMethod("fit", signature=c(model="TruncatedDistribution", x="numeric", logExpectation="numeric"), fit.TruncatedDistribution)

drawFrom.TruncatedDistribution <- function(model, N) {
    # rejection sampling: redraw any values that fall into the truncated region
    draws = drawFrom(model@distribution, N)
    while (any(bad <- truncate(model, draws)))
        draws[bad] = drawFrom(model@distribution, sum(bad))
    return(draws)
}
setMethod("drawFrom", signature=c(model="TruncatedDistribution", N="numeric"), drawFrom.TruncatedDistribution)





# copula based bivariate negative binomial
ZinbaCopula <- setClass("ZinbaCopula", representation=list(marginal.x="numeric", marginal.y="numeric", sigma="matrix"), contains="Distribution")

logLikelihood.ZinbaCopula <- function(model, x) {
  require(histoneHMM)
  lobj = list(marginal.x=model@marginal.x, marginal.y=model@marginal.y, sigma=model@sigma)
  return(log(histoneHMM::dzinba.copula(x[,1], x[,2], lobj)))
}
setMethod("logLikelihood", signature=c(model="ZinbaCopula", x="matrix"), logLikelihood.ZinbaCopula)

fit.ZinbaCopula <- function(model, x, logExpectation) {
  par = fit.zinba.copula(x[,1], x[,2], weight=exp(logExpectation))
  newmodel = new("ZinbaCopula")
  newmodel@marginal.x = par$marginal.x
  newmodel@marginal.y = par$marginal.y
  newmodel@sigma = par$sigma
  return(newmodel)
}
setMethod("fit", signature=c(model="ZinbaCopula", x="matrix", logExpectation="numeric"), fit.ZinbaCopula)



EMResult <- setClass("EMResult", representation=list(models="list", logLikelihood="matrix", dataLogLikelihood="numeric", proportions="numeric"), contains="Distribution")

plotDensity.EMResult <- function(model, x=NULL, xlim=NULL, add=FALSE, col="black", alpha=1) {
  if (is.null(xlim)) {
    if (is.null(x))
      stop("please specify either data or xlim")
    xlim = range(x)
  }

  # plot the components and compute the mixture density
  xx = seq(xlim[1], xlim[2])
  d = rep(0, length(xx))
  for (i in 1:length(model@models)) {
    plotDensity(model@models[[i]], x, xlim, add=(i != 1 || add), col=palette()[i+1], alpha=model@proportions[i])
    d = d + exp(logLikelihood(model@models[[i]], xx)) * model@proportions[i]
  }

  # the component plots above have already set up the device, so just add the mixture
  lines(xx, d, col=col, lwd=2)
  if (!is.null(names(model@models))) {
    legend("right", lty=1, lwd=2, col=c(col, palette()[1 + 1:length(model@models)]), legend=c("mixture", names(model@models)))
  }
}
setMethod("plotDensity", signature=c(model="EMResult", x="numeric", xlim="numeric", add="logical", col="character", alpha="numeric"), plotDensity.EMResult)

logLikelihood.EMResult <- function(model, x) {
  ll = sapply(model@models, logLikelihood, x)
  ll = ll + rep(log(model@proportions), each=length(x))
  ll = apply(ll, 1, logsum)
  return(ll)
}
setMethod("logLikelihood", signature=c(model="EMResult", x="numeric"), logLikelihood.EMResult)

nparam.EMResult <- function(model) {
  nparam.submodels <- sapply(model@models, nparam)
  ## proportions sum up to one so we actually have nr of components - 1 proportion parameters
  return(sum(nparam.submodels) + length(model@models) - 1)
}
setMethod("nparam", signature=c(model="EMResult"), nparam.EMResult)

EMPosterior <- function(emfit, x) {
  posterior = sapply(emfit@models, logLikelihood, x) + rep(log(emfit@proportions), each=length(x))
  posterior = posterior - apply(posterior, 1, logsum)
  posterior = exp(posterior)
  return(posterior)
}
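
## usage sketch (not run), for an EMResult 'emfit' and data 'x':
##   post <- EMPosterior(emfit, x)   # n x ncomp matrix, rows sum to 1
##   assignment <- apply(post, 1, which.max)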


## compute AIC and BIC for the mixture models
AIC <- function(emfit) {
  return(2 * nparam(emfit) - 2 * emfit@dataLogLikelihood)
}

BIC <- function(emfit) {
  return(log(nrow(emfit@logLikelihood)) * nparam(emfit) - 2 * emfit@dataLogLikelihood)
}
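
## usage sketch (not run): compare mixtures with different numbers of
## components for a data vector x; lower values are better:
##   fits <- lapply(2:4, function(k) em(x, ncomp=k, model.constructor="Negbinom"))
##   sapply(fits, BIC)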


#' Fit mixture distributions using the expectation maximization (EM) algorithm
#'
#' @param x numeric vector or matrix with observations
#' @param ncomp number of mixture components to be fitted
#' @param prop numeric vector of length ncomp with the initial mixture proportions (between 0 and 1).
#' Default is NULL, which means that proportions will be initialized uniformly.
#' @param maxit maximal number of iterations. Default is 100.
#' @param eps convergence tolerance: the EM stops once the increase in log likelihood
#' between iterations falls below eps. Default is 1e-4.
#' @param model.constructor character vector either of length 1 or ncomp. The values specify the class of
#' distributions for each component. If the length is 1, all components will be of the same class.
#' Values can be 'Zinba', 'Negbinom', 'Gamma', 'LeftCensoredGamma', 'Zero' and some others. These distributions
#' are represented as classes and the respective constructors are called.
#' @param models list of ncomp instances of distribution classes. This can be used to pre-initialize the
#' starting values of the distributions of the components.
#'
#' @return an object of class \code{EMResult}. It has the following slots:
#' 'models': a list of ncomp distribution instances,
#' 'logLikelihood': a matrix (n observations x ncomp); note that this slot holds the normalized log posterior of each observation under each component,
#' 'dataLogLikelihood': numeric value of the overall likelihood of the data,
#' 'proportions': the ncomp mixture proportions
#'
#' @export
em <- function(x, ncomp, prop=NULL, maxit=100, eps=1e-4, model.constructor="Zinba", models=NULL) {
  n = length(x)
  if (is.matrix(x) || is.data.frame(x)) {
    n = nrow(x)
  }
  if (is.null(prop)) {
    prop = rep(1 / ncomp, ncomp)
  }
  logProp = log(prop)

  if (length(model.constructor) == 1) {
    model.constructor = rep(model.constructor, ncomp)
  } else if (length(model.constructor) != ncomp) {
    stop("model.constructor has to have length 1 or ncomp")
  }

  # initialize the models
  if (is.null(models)) {
    models = list()
    # we just use quantiles to cut the data
    if (is.matrix(x) || is.data.frame(x)) {
      # for simplicity just cut along the first axis
      cutted = findInterval(x[,1], quantile(x[,1], probs=(0:ncomp)/ncomp), rightmost.closed=T)
    } else {
      cutted = findInterval(x, quantile(x, probs=(0:ncomp)/ncomp), rightmost.closed=T)
    }

    # browser()
    cat("Initializing..\n")
    for (m in 1:ncomp) {
      model = new(model.constructor[m])
      # make the weights a mixture of the quantile based assignment but also
      # distribute a bit of weight on the other classes
      alpha = 0.9
      weight = alpha * as.numeric(cutted == m)
      weight[weight == 0] = 1 - alpha
      model = fit(model, x, log(weight))
      models[[m]] = model
    }
  } else {
    cat("Using preinitialized models\n")
  }


  oldLogLik = NA
  # iterate E and M step
  for (i in 1:maxit) {
    #
    # E step
    #
    logExpectation = sapply(models, logLikelihood, x) # p_i(x)
    logExpectation = logExpectation + rep(logProp, each=n) # a_i p_i(x)
    logLikByElement = apply(logExpectation, 1, logsum) # p(x)
    logExpectation = logExpectation - logLikByElement # a_i p_i(x) / p(x)
    logLik = sum(logLikByElement)

    # check convergence using the delta in log likelihood
    cat("E-step: logLik =", logLik, "delta =", logLik - oldLogLik, "\n")
    if (!is.na(oldLogLik) && logLik - oldLogLik < eps) {
      if (logLik - oldLogLik < 0) {
        warning("Problem in the EM algorithm: likelihood is decreasing!")
      }
      break
    }
    oldLogLik = logLik

    #
    # M step
    #

    # new estimate for the proportions
    logProb = apply(logExpectation, 2, logsum) - log(n)

    # new estimates for the models
    for (m in 1:ncomp) {
      models[[m]] = fit(models[[m]], x, logExpectation[,m])
    }

    cat("M-step: proportions =", exp(logProb), "\nmodels:\n")
    print(models)

    res = new("EMResult")
    res@models = models
    res@logLikelihood = logExpectation # after the E step this holds the normalized log posterior
    res@dataLogLikelihood = logLik
    res@proportions = exp(logProb)
  }
  return(res)
}
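
## usage sketch (not run): fit a two-component negative binomial mixture
##   x <- c(rnbinom(500, size=2, mu=5), rnbinom(500, size=2, mu=50))
##   res <- em(x, ncomp=2, model.constructor="Negbinom")
##   res@proportions
##   plotDensity.EMResult(res, x=x, xlim=c(0, 100))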