R/DEClust.R

Defines functions: DEClustOne, DEClust

DEClustOne <- function(counts, treatment, K, normalizer, phi) {
  # prepare some constant values
  Ngene <- nrow(counts)
  Ngroup <- nlevels(treatment)
  Nsample <- length(treatment)
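
  # model sketch (read off loglikelihood.gene below): gene g in cluster k has
  #   counts[g, s] ~ NB(mu[g, s], size = 1 / phi[g]),
  #   log(mu[g, s]) = normalizer[g, s] + alpha[g, k] + beta[treatment[s], k],
  # so alpha is a per-gene baseline and each cluster shares one vector of group
  # effects; the EM loop below fits this K-component mixture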

  # initialize alpha (matrix[Ngene, K]): every cluster starts from the gene's
  # mean log count, with zeros floored at log(1e-10)
  alpha <- matrix(rep(apply(counts, 1, function(x) {
    mean(ifelse(x == 0, log(1e-10), log(x)))
  }), times = K), nrow = Ngene, ncol = K)

  # initialize beta (matrix[Ngroup, K])
  # fit a per-gene NB GLM and take the group-wise linear predictors (offset removed)
  para <- t(vapply(seq_len(Ngene), function(i) {
    (predict(glm(counts[i, ] ~ treatment, family = MASS::negative.binomial(1/phi[i]), offset = normalizer[i, ])) - normalizer[i, ])[!duplicated(treatment)]
  }, vector("numeric", nlevels(treatment))))
  # split each gene's profile into a mean (alpha.alt) and group effects (beta.alt)
  alpha.alt <- rowMeans(para)
  beta.alt <- sweep(para, 1, alpha.alt, "-")
  # drop the quarter of genes with the widest between-group range, then run
  # k-means on the remaining group-effect profiles to get K initial centroids
  .tmp <- apply(beta.alt, 1, function(x) max(x) - min(x))
  beta <- t(kmeans(beta.alt[.tmp < quantile(.tmp, .75), ], K, nstart = 5)$centers)

  # initialize priors (vector[K])
  pi <- rep(1/K, K)

  # initialize posteriors (matrix[Ngene, K])
  Z <- matrix(rep(pi, each = Ngene), nrow = Ngene, ncol = K)

  # function: calculate log likelihood
  loglikelihood.gene <- function(alpha_k, beta_k) {
    lambda <- exp(normalizer + rep(alpha_k, times = Nsample) + rep(beta_k[treatment], each = Ngene))
    logL <- dnbinom(counts, size = rep(1 / phi, Nsample), mu = lambda, log = TRUE)
    # replace -Inf with the most negative finite double so the optimizers below
    # always see finite objective values
    logL <- ifelse(is.infinite(logL), -.Machine$double.xmax, logL)
    dim(logL) <- c(Ngene, Nsample)
    rowSums(logL)
  }

  # function: calculate posteriors
  posteriors <- function() {
    logL <- vapply(1:K, function(k) {
      loglikelihood.gene(alpha_k = alpha[, k], beta_k = beta[, k])
    }, vector("double", Ngene))
    prior.logL <- sweep(logL, 2, log(pi), "+")
    logPPs <- prior.logL - apply(prior.logL, 1, function(x) log(sum(exp(x - max(x)))) + max(x))
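    # e.g. with K = 2, pi = c(0.6, 0.4) and a gene's log likelihoods c(-10, -12),
    # cluster 1's posterior is 0.6*exp(-10) / (0.6*exp(-10) + 0.4*exp(-12)) ~= 0.92;
    # subtracting max(x) above is the log-sum-exp trick, which keeps exp() from
    # underflowing when the log likelihoods are very negative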
    exp(logPPs)
  }

  # function: estimate beta by maximizing the weighted log likelihood
  estimateBeta <- function() {
    weightedLoglikelihood.gene <- function(beta, k) {
      rtn <- -sum(Z[, k] * loglikelihood.gene(alpha[, k], beta))
      ifelse(is.finite(rtn), rtn, .Machine$double.xmax)
    }
    # sum-to-zero constraint on each cluster's group effects
    constraint <- function(beta, k) {
      sum(beta)
    }
    vapply(1:K, function(k) {
      Rsolnp::solnp(pars = beta[, k], fun = weightedLoglikelihood.gene,
                    eqfun = constraint, eqB = 0, control = list(trace = 0), k = k)$pars
    }, vector("numeric", Ngroup))
  }
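
  # note: the sum-to-zero constraint pins down the alpha/beta decomposition;
  # adding a constant to beta[, k] while subtracting it from alpha[, k] would
  # otherwise leave every mean, and hence the likelihood, unchanged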

  # function: estimate alpha by maximizing each gene's log likelihood
  # (1-D optimization per gene and cluster, holding beta fixed)
  estimateAlpha <- function() {
    loglikelihood.onegene <- function(alpha_gk, g, k) {
      lambda <- exp(normalizer[g, ] + alpha_gk + beta[treatment, k])
      sum(dnbinom(counts[g, ], size = 1 / phi[g], mu = lambda, log = TRUE))
    }
    vapply(1:K, function(k) {
      vapply(1:Ngene, function(g) {
        optimize(loglikelihood.onegene, c(-100, 50), g = g, k = k, maximum = TRUE)$maximum
      }, vector("numeric", 1))
    }, vector("numeric", Ngene))
  }

  # main process (EM algorithm): iterate E- and M-steps, at most 100 times
  logL.model <- -Inf
  for (iter in 1:100) {
    # E-step
    Z <- posteriors()
    # M-step
    beta <- estimateBeta()
    pi <- colMeans(Z)
    alpha <- estimateAlpha()

    # calculate log likelihood of the model with estimated parameters
    logL.model.tmp <- vapply(1:K, function(k) {
      loglikelihood.gene(alpha[, k], beta[, k])
    }, vector("numeric", Ngene))
    logL.model.tmp <- sweep(logL.model.tmp, 2, log(pi), "+")
    logL.model.tmp <- sum(apply(logL.model.tmp, 1, function(x) {
      log(sum(exp(x - max(x)))) + max(x)
    }))

    # exit condition: stop once the likelihood gain is below 0.01 (after at
    # least 20 iterations); update logL.model first so the reported likelihood
    # matches the parameters from this iteration
    diff <- logL.model.tmp - logL.model
    logL.model <- logL.model.tmp
    if (diff >= 0 && abs(diff) < 0.01 && iter >= 20) break
  }

  # cluster
  cluster <- max.col(Z)

  # calculate AIC and BIC
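  # free parameters: Ngene * K for alpha, (Ngroup - 1) * K for beta (one
  # sum-to-zero constraint per cluster), and K - 1 for pi, which totals
  # (Ngene + Ngroup) * K - 1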
  np <- (Ngene + Ngroup) * K - 1
  aic <- -2 * logL.model + 2 * np
  bic <- -2 * logL.model + log(Ngene) * np

  # return value
  rtn <- list(counts = counts, normalizer = normalizer, treatment = treatment,
               posteriors = Z, alpha = alpha, beta = beta, cluster = cluster,
               phi = phi, logL = logL.model, AIC = aic, BIC = bic, iter = iter)
  class(rtn) <- "DEClustOne"
  return(rtn)
}

DEClust <- function(counts, treatment, K, normalizer = NULL, phi = NULL, disp_method = "dds") {
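  # normalizer may be NULL (upper-quartile size factors computed from the nonzero
  # counts), a matrix of per-observation scaling factors with the same shape as
  # counts, or a vector with one size factor per sample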

  if (is.null(normalizer)) {
    # default: upper-quartile normalization, ignoring zero counts
    .tmp <- counts
    .tmp[.tmp == 0] <- NA
    q3 <- apply(.tmp, 2, quantile, .75, na.rm = TRUE)
    size.factors <- q3 / mean(q3)
    normalizer <- matrix(log(size.factors), nrow = nrow(counts), ncol = ncol(counts), byrow = TRUE)
  } else if (is.matrix(normalizer)) {
    if (!all(dim(counts) == dim(normalizer))) {
      stop("the counts matrix and the normalizer matrix must have the same dimensions")
    }
    size.factors <- colSums(normalizer) / sum(normalizer)
    normalizer <- log(normalizer)
  } else if (length(normalizer) == ncol(counts)) {
    size.factors <- normalizer
    normalizer <- matrix(log(size.factors), nrow = nrow(counts), ncol = ncol(counts), byrow = TRUE)
  } else {
    stop("Invalid normalizer!")
  }

  if (is.null(phi)) {
    if (disp_method == "dds") {
      # gene-wise dispersions via DSS (all calls are namespaced, so the
      # package does not need to be attached)
      designs <- model.matrix(~ treatment)
      .tmp <- counts
      colnames(.tmp) <- 1:ncol(.tmp)
      seqData <- DSS::newSeqCountSet(.tmp, as.data.frame(designs))
      seqData <- DSS::estNormFactors(seqData)
      seqData <- DSS::estDispersion(seqData)
      phi <- DSS::dispersion(seqData)
    } else if (disp_method == "edger") {
      # gene-wise (tagwise) dispersions via edgeR
      d <- edgeR::DGEList(counts = counts, group = treatment, lib.size = colSums(counts) / size.factors)
      d <- edgeR::calcNormFactors(d)
      d <- edgeR::estimateGLMCommonDisp(d)
      d <- edgeR::estimateGLMTrendedDisp(d)
      d <- edgeR::estimateGLMTagwiseDisp(d)
      phi <- d$tagwise.dispersion
    } else {
      stop("Invalid disp_method!")
    }
  }

  # fit one mixture model per element of K (K may be a vector of cluster counts)
  rtn <- lapply(K, function(k) DEClustOne(counts, treatment, k, normalizer, phi))
  class(rtn) <- "DEClust"
  return(rtn)

}
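
A minimal usage sketch (simulated data, hypothetical names; assumes the MASS,
Rsolnp, and edgeR dependencies are installed):

set.seed(1)
counts <- matrix(rnbinom(100 * 6, mu = 100, size = 10), nrow = 100)
treatment <- factor(rep(c("A", "B"), each = 3))
fits <- DEClust(counts, treatment, K = 2:4, disp_method = "edger")
bics <- vapply(fits, function(f) f$BIC, numeric(1))
best <- fits[[which.min(bics)]]  # pick the K with the smallest BIC
table(best$cluster)              # cluster sizes under the chosen model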