DEClustOne <- function(counts, treatment, K, normalizer, phi) {
    # prepare constants
    Ngene <- nrow(counts)
    Ngroup <- nlevels(treatment)
    Nsample <- length(treatment)
    # initialize alpha (matrix[Ngene, K]) with each gene's mean log count
    alpha <- matrix(rep(apply(counts, 1, function(x) {
        mean(ifelse(x == 0, log(1e-10), log(x)))
    }), times = K), nrow = Ngene, ncol = K)
    # initialize beta (matrix[Ngroup, K]): fit a per-gene negative binomial GLM,
    # center the fitted group effects, then take k-means centers over the genes
    # whose effects are not extreme (spread below its 75% quantile)
    para <- t(vapply(seq_len(Ngene), function(i) {
        (predict(glm(counts[i, ] ~ treatment,
                     family = MASS::negative.binomial(1 / phi[i]),
                     offset = normalizer[i, ])) - normalizer[i, ])[!duplicated(treatment)]
    }, vector("numeric", nlevels(treatment))))
    alpha.alt <- rowMeans(para)
    beta.alt <- sweep(para, 1, alpha.alt, "-")
    .tmp <- apply(beta.alt, 1, function(x) max(x) - min(x))
    beta <- t(kmeans(beta.alt[.tmp < quantile(.tmp, .75), ], K, nstart = 5)$centers)
    # initialize priors (vector[K])
    pi <- rep(1 / K, K)
    # initialize posteriors (matrix[Ngene, K])
    Z <- matrix(rep(pi, each = Ngene), nrow = Ngene, ncol = K)
    # function: calculate the per-gene log likelihood under cluster k's parameters
    loglikelihood.gene <- function(alpha_k, beta_k) {
        lambda <- exp(normalizer + rep(alpha_k, times = Nsample) + rep(beta_k[treatment], each = Ngene))
        logL <- dnbinom(counts, size = rep(1 / phi, Nsample), mu = lambda, log = TRUE)
        logL <- ifelse(is.infinite(logL), -.Machine$double.xmax, logL)
        dim(logL) <- c(Ngene, Nsample)
        rowSums(logL)
    }
    # function: calculate posteriors (E-step)
    posteriors <- function() {
        logL <- vapply(1:K, function(k) {
            loglikelihood.gene(alpha_k = alpha[, k], beta_k = beta[, k])
        }, vector("double", Ngene))
        prior.logL <- sweep(logL, 2, log(pi), "+")
        # normalize each gene's joint log likelihoods with the log-sum-exp
        # trick so that exponentiation cannot underflow
        logPPs <- prior.logL - apply(prior.logL, 1, function(x) log(sum(exp(x - max(x)))) + max(x))
        exp(logPPs)
    }
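    # Note (illustrative): the log-sum-exp expression above computes
    # log(sum(exp(x))) without underflow. For hypothetical per-cluster
    # log likelihoods x <- c(-1050, -1047, -1052), the naive
    # log(sum(exp(x))) underflows to -Inf, while
    # log(sum(exp(x - max(x)))) + max(x) gives about -1046.95.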
    # function: estimate beta by maximizing the weighted log likelihood,
    # subject to the identifiability constraint that each cluster's
    # group effects sum to zero
    estimateBeta <- function() {
        weightedLoglikelihood.gene <- function(beta, k) {
            rtn <- -sum(Z[, k] * loglikelihood.gene(alpha[, k], beta))
            if (is.finite(rtn)) rtn else .Machine$double.xmax
        }
        constraint <- function(beta, k) {
            sum(beta)
        }
        vapply(1:K, function(k) {
            Rsolnp::solnp(pars = beta[, k], fun = weightedLoglikelihood.gene,
                          eqfun = constraint, eqB = 0, control = list(trace = 0), k = k)$pars
        }, vector("numeric", Ngroup))
    }
    # function: estimate alpha by maximizing the log likelihood of each gene
    estimateAlpha <- function() {
        loglikelihood.onegene <- function(alpha_gk, g, k) {
            lambda <- exp(normalizer[g, ] + alpha_gk + beta[treatment, k])
            sum(dnbinom(counts[g, ], size = 1 / phi[g], mu = lambda, log = TRUE))
        }
        vapply(1:K, function(k) {
            vapply(1:Ngene, function(g) {
                optimize(loglikelihood.onegene, c(-100, 50), g = g, k = k, maximum = TRUE)$maximum
            }, vector("numeric", 1))
        }, vector("numeric", Ngene))
    }
    # main process (EM algorithm)
    logL.model <- -Inf
    for (iter in 1:100) {
        # E-step
        Z <- posteriors()
        # M-step
        beta <- estimateBeta()
        pi <- colMeans(Z)
        alpha <- estimateAlpha()
        # calculate the log likelihood of the model with the estimated parameters
        logL.model.tmp <- vapply(1:K, function(k) {
            loglikelihood.gene(alpha[, k], beta[, k])
        }, vector("numeric", Ngene))
        logL.model.tmp <- sweep(logL.model.tmp, 2, log(pi), "+")
        logL.model.tmp <- sum(apply(logL.model.tmp, 1, function(x) {
            log(sum(exp(x - max(x)))) + max(x)
        }))
        # exit condition: stop once the improvement is small; update
        # logL.model first so the returned value matches the final parameters
        diff <- logL.model.tmp - logL.model
        logL.model <- logL.model.tmp
        if (diff >= 0 && diff < 0.01 && iter >= 20) break
    }
    # assign each gene to its most probable cluster
    cluster <- max.col(Z)
    # calculate AIC and BIC; the free parameters are Ngene * K alphas,
    # (Ngroup - 1) * K sum-to-zero betas, and K - 1 mixing proportions
    np <- (Ngene + Ngroup) * K - 1
    aic <- -2 * logL.model + 2 * np
    bic <- -2 * logL.model + log(Ngene) * np
    # return value
    rtn <- list(counts = counts, normalizer = normalizer, treatment = treatment,
                posteriors = Z, alpha = alpha, beta = beta, cluster = cluster,
                phi = phi, logL = logL.model, AIC = aic, BIC = bic, iter = iter)
    class(rtn) <- "DEClustOne"
    return(rtn)
}
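# Under the fitted model, the expected count of gene g in sample j for
# cluster k is exp(normalizer[g, j] + alpha[g, k] + beta[treatment[j], k]).
# A minimal sketch of recovering fitted means from a result (`fit` is a
# hypothetical object returned by DEClustOne):
#     k <- fit$cluster[1]    # hard cluster assignment of gene 1
#     mu <- exp(fit$normalizer[1, ] + fit$alpha[1, k] + fit$beta[fit$treatment, k])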
DEClust <- function(counts, treatment, K, normalizer = NULL, phi = NULL, disp_method = "dds") {
    if (is.null(normalizer)) {
        # default: upper-quartile size factors computed on the nonzero counts
        .tmp <- counts
        .tmp[.tmp == 0] <- NA
        q3 <- apply(.tmp, 2, quantile, .75, na.rm = TRUE)
        size.factors <- q3 / mean(q3)
        normalizer <- matrix(log(size.factors), nrow = nrow(counts), ncol = ncol(counts), byrow = TRUE)
    } else if (is.matrix(normalizer)) {
        if (!all(dim(counts) == dim(normalizer))) {
            stop("the shapes of the counts matrix and the normalizer matrix differ")
        }
        size.factors <- colSums(normalizer) / sum(normalizer)
        normalizer <- log(normalizer)
    } else if (length(normalizer) == ncol(counts)) {
        size.factors <- normalizer
        normalizer <- matrix(log(size.factors), nrow = nrow(counts), ncol = ncol(counts), byrow = TRUE)
    } else {
        stop("Invalid normalizer!")
    }
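    # Accepted forms of `normalizer` (the objects below are hypothetical):
    #     DEClust(counts, treatment, K = 2)                        # default upper-quartile factors
    #     DEClust(counts, treatment, K = 2, normalizer = sf)       # one size factor per sample
    #     DEClust(counts, treatment, K = 2, normalizer = normMat)  # gene-by-sample factor matrix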
    if (is.null(phi)) {
        if (disp_method == "dds") {
            # estimate gene-wise dispersions with DSS
            designs <- model.matrix(~ treatment)
            .tmp <- counts
            colnames(.tmp) <- seq_len(ncol(.tmp))
            seqData <- DSS::newSeqCountSet(.tmp, as.data.frame(designs))
            seqData <- DSS::estNormFactors(seqData)
            seqData <- DSS::estDispersion(seqData)
            phi <- DSS::dispersion(seqData)
        }
        if (disp_method == "edger") {
            # estimate tagwise dispersions with edgeR
            d <- edgeR::DGEList(counts = counts, group = treatment, lib.size = colSums(counts) / size.factors)
            d <- edgeR::calcNormFactors(d)
            d <- edgeR::estimateGLMCommonDisp(d)
            d <- edgeR::estimateGLMTrendedDisp(d)
            d <- edgeR::estimateGLMTagwiseDisp(d)
            phi <- d$tagwise.dispersion
        }
    }
    # fit one mixture model for each requested number of clusters
    rtn <- lapply(K, function(k) DEClustOne(counts, treatment, k, normalizer, phi))
    class(rtn) <- "DEClust"
    return(rtn)
}
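# A minimal usage sketch. The simulated counts and group labels below are
# illustrative only; the default disp_method = "dds" requires the DSS
# package, and the number of clusters is chosen by the returned BIC:
set.seed(1)
counts <- matrix(rnbinom(2000, size = 10, mu = 100), nrow = 200)
treatment <- factor(rep(c("A", "B"), each = 5))
fits <- DEClust(counts, treatment, K = 2:4)
bics <- vapply(fits, function(f) f$BIC, numeric(1))  # one BIC per candidate K
best <- fits[[which.min(bics)]]                      # model with the lowest BIC
table(best$cluster)                                  # cluster sizes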