#' Clustering Using mixtures of MVPLN via variational Gaussian approximations
#'
#' Performs clustering using mixtures of matrix variate Poisson-log normal
#' (MVPLN) via variational Gaussian approximations. Model selection can
#' be done using AIC, AIC3, BIC and ICL.
#'
#' @details Starting values (argument: initMethod) and the number of
#' initialization runs (argument: nInitIterations) play an
#' important role in the successful operation of this algorithm.
#' Occasionally, an error may result due to a singular matrix. In that
#' case, rerun the method, possibly after setting a different seed, to
#' obtain a different set of initialization values.
#'
#' @param dataset A list of length nUnits, containing Y_j matrices.
#' Each matrix Y_j has size r x p, and the dataset contains n such
#' matrices, with j = 1,...,n. If a Y_j contains all zeros, it will
#' be removed prior to cluster analysis.
#' @param membership A numeric vector of size length(dataset) containing
#' the cluster membership of each Y_j matrix. If not available, leave
#' as "none".
#' @param gmin A positive integer specifying the minimum number of
#' components to be considered in the clustering run.
#' @param gmax A positive integer, greater than or equal to gmin, specifying
#' the maximum number of components to be considered in the clustering run.
#' @param initMethod A method for initialization. Current options are
#' "kmeans", "random", "medoids", "clara", or "fanny". If nInitIterations
#' is set to >= 1, then this method will be used for initialization.
#' Default is "kmeans".
#' @param nInitIterations A positive integer or zero, specifying the number
#' of initialization runs prior to clustering. The run with the highest
#' loglikelihood is used to initialize values, if nInitIterations > 1.
#' Default value is set to 0, in which case no initialization will
#' be performed.
#' @param normalize A character string "Yes" or "No" specifying
#' if normalization should be performed. Currently, normalization
#' factors are calculated using TMM method of edgeR package.
#' Default is "Yes", for which normalization will be performed.
#'
#' @return Returns an S3 object of class mvplnVGA with results.
#' \itemize{
#' \item dataset - The input dataset on which clustering is performed.
#' \item nUnits - Number of units in the input dataset.
#' \item nVariables - Number of variables in the input dataset.
#' \item nOccassions - Number of occasions in the input dataset.
#' \item normFactors - A vector of normalization factors used for
#' input dataset.
#' \item gmin - Minimum number of components considered in the
#' clustering run.
#' \item gmax - Maximum number of components considered in the
#' clustering run.
#' \item initalizationMethod - Method used for initialization.
#' \item allResults - A list with all results.
#' \item loglikelihood - A vector with value of final log-likelihoods
#' for each cluster size.
#' \item nParameters - A vector with number of parameters for each
#' cluster size.
#' \item trueLabels - The vector of true labels, if provided by user.
#' \item ICLAll - A list with all ICL model selection results.
#' \item BICAll - A list with all BIC model selection results.
#' \item AICAll - A list with all AIC model selection results.
#' \item AIC3All - A list with all AIC3 model selection results.
#' \item totalTime - Total time used for clustering and model selection.
#' }
#'
#' @examples
#' \dontrun{
#' # Example 1
#' set.seed(1234)
#' trueG <- 2 # number of total G
#' truer <- 2 # number of total occasions
#' truep <- 3 # number of total responses
#' trueN <- 1000 # number of total units
#'
#' # Mu is a r x p matrix
#' trueM1 <- matrix(rep(6, (truer * truep)),
#' ncol = truep,
#' nrow = truer, byrow = TRUE)
#'
#' trueM2 <- matrix(rep(1, (truer * truep)),
#' ncol = truep,
#' nrow = truer,
#' byrow = TRUE)
#'
#' trueMall <- rbind(trueM1, trueM2)
#'
#' # Phi is a r x r matrix
#' # Loading needed packages for generating data
#' # if (!require(clusterGeneration)) install.packages("clusterGeneration")
#' # library("clusterGeneration")
#'
#' # Covariance matrix containing variances and covariances between r occasions
#' # truePhi1 <- clusterGeneration::genPositiveDefMat("unifcorrmat",
#' # dim = truer,
#' # rangeVar = c(1, 1.7))$Sigma
#' truePhi1 <- matrix(c(1.075551, -0.488301, -0.488301, 1.362777), nrow = 2)
#' truePhi1[1, 1] <- 1 # For identifiability issues
#'
#' # truePhi2 <- clusterGeneration::genPositiveDefMat("unifcorrmat",
#' # dim = truer,
#' # rangeVar = c(0.7, 0.7))$Sigma
#' truePhi2 <- matrix(c(0.7000000, 0.6585887, 0.6585887, 0.7000000), nrow = 2)
#' truePhi2[1, 1] <- 1 # For identifiability issues
#' truePhiall <- rbind(truePhi1, truePhi2)
#'
#' # Omega is a p x p matrix
#' # Covariance matrix containing variances and covariances between p responses
#' # trueOmega1 <- clusterGeneration::genPositiveDefMat("unifcorrmat", dim = truep,
#' # rangeVar = c(1, 1.7))$Sigma
#' trueOmega1 <- matrix(c(1.0526554, 1.0841910, -0.7976842,
#' 1.0841910, 1.1518811, -0.8068102,
#' -0.7976842, -0.8068102, 1.4090578),
#' nrow = 3)
#' # trueOmega2 <- clusterGeneration::genPositiveDefMat("unifcorrmat", dim = truep,
#' # rangeVar = c(0.7, 0.7))$Sigma
#' trueOmega2 <- matrix(c(0.7000000, 0.5513744, 0.4441598,
#' 0.5513744, 0.7000000, 0.4726577,
#' 0.4441598, 0.4726577, 0.7000000),
#' nrow = 3)
#' trueOmegaAll <- rbind(trueOmega1, trueOmega2)
#'
#' # Generated simulated data
#' sampleData <- mixMVPLN::mvplnDataGenerator(nOccasions = truer,
#' nResponses = truep,
#' nUnits = trueN,
#' mixingProportions = c(0.79, 0.21),
#' matrixMean = trueMall,
#' phi = truePhiall,
#' omega = trueOmegaAll)
#'
#' # Clustering simulated matrix variate count data
#' clusteringResults <- mixMVPLN::mvplnVGAclus(
#' dataset = sampleData$dataset,
#' membership = sampleData$truemembership,
#' gmin = 1,
#' gmax = 3,
#' initMethod = "kmeans",
#' nInitIterations = 3,
#' normalize = "Yes")
#'
#' # Example 2
#' trueG <- 1 # number of total G
#' truer <- 2 # number of total occasions
#' truep <- 3 # number of total responses
#' trueN <- 1000 # number of total units
#' truePiG <- 1L # mixing proportion for G = 1
#'
#' # Mu is a r x p matrix
#' trueM1 <- matrix(c(6, 5.5, 6, 6, 5.5, 6),
#' ncol = truep,
#' nrow = truer,
#' byrow = TRUE)
#' trueMall <- rbind(trueM1)
#' # Phi is a r x r matrix
#' # truePhi1 <- clusterGeneration::genPositiveDefMat(
#' # "unifcorrmat",
#' # dim = truer,
#' # rangeVar = c(0.7, 1.7))$Sigma
#' truePhi1 <- matrix(c(1.3092747, 0.3219674,
#' 0.3219674, 1.3233794), nrow = 2)
#' truePhi1[1, 1] <- 1 # for identifiability issues
#' truePhiall <- rbind(truePhi1)
#'
#' # Omega is a p x p matrix
#' # trueOmega1 <- genPositiveDefMat(
#' # "unifcorrmat",
#' # dim = truep,
#' # rangeVar = c(1, 1.7))$Sigma
#' trueOmega1 <- matrix(c(1.1625581, 0.9157741, 0.8203499,
#' 0.9157741, 1.2216287, 0.7108193,
#' 0.8203499, 0.7108193, 1.2118854), nrow = 3)
#' trueOmegaAll <- rbind(trueOmega1)
#'
#' # Generated simulated data
#' sampleData2 <- mixMVPLN::mvplnDataGenerator(
#' nOccasions = truer,
#' nResponses = truep,
#' nUnits = trueN,
#' mixingProportions = truePiG,
#' matrixMean = trueMall,
#' phi = truePhiall,
#' omega = trueOmegaAll)
#'
#' # Clustering simulated matrix variate count data
#' clusteringResults <- mixMVPLN::mvplnVGAclus(
#' dataset = sampleData2$dataset,
#' membership = sampleData2$truemembership,
#' gmin = 1,
#' gmax = 2,
#' initMethod = "clara",
#' nInitIterations = 2,
#' normalize = "Yes")
#' # Example 3
#' # mixture of six independent Poisson distributions with G = 2
#' set.seed(23456)
#' totalSet <- 1 # Number of datasets
#' data <- list()
#' for (i in 1:totalSet) {
#' N <- 1000 # biological samples e.g. genes
#' d <- 6 # dimensionality e.g. conditions*replicates = total samples
#' G <- 2
#'
#' piG <- c(0.45, 0.55) # mixing proportions
#'
#' tmu1 <- c(1000, 1500, 1500, 1000, 1500, 1000)
#' tmu2 <- c(1000, 1000, 1000, 1500, 1000, 1200)
#' z <- t(rmultinom(n = N, size = 1, prob = piG))
#' para <- list()
#' para[[1]] <- NULL # for pi
#' para[[2]] <- list() # for mu
#' para[[3]] <- list() # for Z
#' names(para) <- c("pi", "mu", "Z")
#'
#' para[[1]] <- piG
#' para[[2]] <- list(tmu1, tmu2)
#' para[[3]] <- z
#'
#' Y <- matrix(0, ncol = d, nrow = N)
#'
#' for (g in 1:G) {
#' obs <- which(para[[3]][, g] == 1)
#' for (j in 1:d) {
#' Y[obs, j] <- rpois(length(obs), para[[2]][[g]][j])
#' }
#' }
#'
#' Y_mat <- list()
#' for (obs in 1:N) {
#' Y_mat[[obs]] <- matrix(Y[obs, ], nrow = 2, byrow = TRUE)
#' }
#'
#' data[[i]] <- list()
#' data[[i]][[1]] <- para
#' data[[i]][[2]] <- Y
#' data[[i]][[3]] <- Y_mat
#' }
#' outputExample3 <- mixMVPLN::mvplnVGAclus(dataset = data[[1]][[3]],
#' membership = "none",
#' gmin = 1,
#' gmax = 3,
#' initMethod = "kmeans",
#' nInitIterations = 1,
#' normalize = "Yes")
#'
#' # Example 4
#' # mixture of six independent negative binomial distributions with G = 2
#' set.seed(23456)
#' N1 <- 2000 # biological samples e.g. genes
#' d1 <- 6 # dimensionality e.g. conditions*replicates = total samples
#' piG1 <- c(0.79, 0.21) # mixing proportions for G=2
#' trueMu1 <- rep(c(1000, 500), 3)
#' trueMu2 <- sample(c(1000, 500), 6, replace=TRUE)
#' trueMu <- list(trueMu1, trueMu2)
#'
#' z <- t(rmultinom(N1, size = 1, piG1))
#'
#' nG <- vector("list", length = length(piG1))
#' dataNB <- matrix(NA, ncol = d1, nrow = N1)
#'
#' for (i in 1:length(piG1)) { # This code is not generalized, for G = 2 only
#' nG[[i]] <- which(z[, i] == 1)
#' for (j in 1:d1) {
#' dataNB[nG[[i]], j] <- rnbinom(n = length(nG[[i]]),
#' mu = trueMu[[i]][j],
#' size = 100)
#' }
#' }
#'
#' dataNBmat <- list()
#' for (obs in 1:N1) {
#' dataNBmat[[obs]] <- matrix(dataNB[obs,],
#' nrow = 2,
#' byrow = TRUE)
#' }
#'
#' outputExample4 <- mixMVPLN::mvplnVGAclus(dataset= dataNBmat,
#' membership = "none",
#' gmin = 1,
#' gmax = 2,
#' initMethod = "kmeans",
#' nInitIterations = 1,
#' normalize = "Yes")
#' }
#'
#' @author {Anjali Silva, \email{anjali@alumni.uoguelph.ca}, Sanjeena Dang,
#' \email{sanjeenadang@cunet.carleton.ca}. }
#'
#' @references
#' Aitchison, J. and C. H. Ho (1989). The multivariate Poisson-log normal distribution.
#' \emph{Biometrika} 76.
#'
#' Akaike, H. (1973). Information theory and an extension of the maximum likelihood
#' principle. In \emph{Second International Symposium on Information Theory}, New York, NY,
#' USA, pp. 267–281. Springer Verlag.
#'
#' Arlot, S., Brault, V., Baudry, J., Maugis, C., and Michel, B. (2016).
#' capushe: CAlibrating Penalities Using Slope HEuristics. R package version 1.1.1.
#'
#' Biernacki, C., G. Celeux, and G. Govaert (2000). Assessing a mixture model for
#' clustering with the integrated classification likelihood. \emph{IEEE Transactions
#' on Pattern Analysis and Machine Intelligence} 22.
#'
#' Bozdogan, H. (1994). Mixture-model cluster analysis using model selection criteria
#' and a new informational measure of complexity. In \emph{Proceedings of the First US/Japan
#' Conference on the Frontiers of Statistical Modeling: An Informational Approach:
#' Volume 2 Multivariate Statistical Modeling}, pp. 69–113. Dordrecht: Springer Netherlands.
#'
#' Robinson, M.D., and Oshlack, A. (2010). A scaling normalization method for differential
#' expression analysis of RNA-seq data. \emph{Genome Biology} 11, R25.
#'
#' Schwarz, G. (1978). Estimating the dimension of a model. \emph{The Annals of Statistics}
#' 6.
#'
#' Silva, A. et al. (2019). A multivariate Poisson-log normal mixture model
#' for clustering transcriptome sequencing data. \emph{BMC Bioinformatics} 20.
#' \href{https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2916-0}{Link}
#'
#' Silva, A. et al. (2018). Finite Mixtures of Matrix Variate Poisson-Log Normal Distributions
#' for Three-Way Count Data. \href{https://arxiv.org/abs/1807.08380}{arXiv preprint arXiv:1807.08380}.
#'
#' @export
#' @importFrom edgeR calcNormFactors
#' @importFrom mclust unmap
#' @importFrom mclust map
#' @importFrom mvtnorm rmvnorm
#' @importFrom utils tail
#'
mvplnVGAclus <- function(dataset,
membership = "none",
gmin,
gmax,
initMethod = "kmeans",
nInitIterations = 0,
normalize = "Yes") {
ptm <- base::proc.time()
# Performing checks
if (typeof(unlist(dataset)) != "double" & typeof(unlist(dataset)) != "integer") {
stop("dataset should be a list of count matrices.");}
if (any((unlist(dataset) %% 1 == 0) == FALSE)) {
stop("dataset should be a list of count matrices.")
}
if (is.list(dataset) != TRUE) {
stop("dataset needs to be a list of matrices.")
}
if(is.numeric(gmin) != TRUE || is.numeric(gmax) != TRUE) {
stop("Class of gmin and gmin should be numeric.")
}
if (gmax < gmin) {
stop("gmax cannot be less than gmin.")
}
if (gmax > length(dataset)) {
stop("gmax cannot be larger than nrow(dataset).")
}
if(all(membership != "none") && is.numeric(membership) != TRUE) {
stop("membership should be a numeric vector containing the
cluster membership. Otherwise, leave as 'none'.")
}
# Checking if missing membership values
# First check for the case in which G = 1, otherwise check
# if missing cluster memberships
if(all(membership != "none") && length(unique(membership)) != 1) {
if(all(membership != "none") &&
all((diff(sort(unique(membership))) == 1) != TRUE) ) {
stop("Cluster memberships in the membership vector
are missing a cluster, e.g. 1, 3, 4, 5, 6 is missing cluster 2.")
}
}
if(all(membership != "none") && length(membership) != length(dataset)) {
stop("membership should be a numeric vector, where length(membership)
should equal the number of observations. Otherwise, leave as 'none'.")
}
if (is.character(initMethod) == TRUE) {
validInitMethods <- c("none", "kmeans", "random", "medoids", "clara", "fanny")
if (! initMethod %in% validInitMethods) {
stop("initMethod should be of class character, specifying
either: none, kmeans, random, medoids, clara, or fanny.")
}
} else if (is.character(initMethod) != TRUE) {
stop("initMethod should be of class character, specifying
either: none, kmeans, random, medoids, clara, or fanny.")
}
if (is.numeric(nInitIterations) != TRUE) {
stop("nInitIterations should be a positive integer or zero, specifying
the number of initialization runs to be considered.")
}
if (is.character(normalize) != TRUE) {
stop("normalize should be a string of class character specifying
if normalization should be performed.")
}
if (normalize != "Yes" && normalize != "No") {
stop("normalize should be a string indicating Yes or No, specifying
if normalization should be performed.")
}
n <- length(dataset)
p <- ncol(dataset[[1]])
r <- nrow(dataset[[1]])
d <- p * r
# Changing dataset into a n x rp dataset
TwoDdataset <- matrix(NA, ncol = d, nrow = n)
sampleMatrix <- matrix(c(1:d), nrow = r, byrow = TRUE)
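# For example, with r = 2 and p = 3, sampleMatrix is
#   1 2 3
#   4 5 6
# so entry (s, e) of Y_j is placed in column sampleMatrix[s, e] of
# TwoDdataset, i.e., each Y_j is vectorized row-by-row (occasion by occasion).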
for (u in 1:n) {
for (e in 1:p) {
for (s in 1:r) {
TwoDdataset[u, sampleMatrix[s, e]] <- dataset[[u]][s, e]
}
}
}
# Check if any unit (row) has all-zero counts; remove such units
if(any(rowSums(TwoDdataset) == 0)) {
zeroRowIndices <- which(rowSums(TwoDdataset) == 0)
TwoDdataset <- TwoDdataset[- zeroRowIndices, , drop = FALSE]
dataset <- dataset[- zeroRowIndices]
n <- nrow(TwoDdataset)
if (all(membership != "none")) {
membership <- membership[- zeroRowIndices]
}
}
if(all(is.na(membership) == TRUE)) {
membership <- "Not provided" }
# Calculating normalization factors
if(normalize == "Yes") {
normFactors <- as.vector(edgeR::calcNormFactors(as.matrix(TwoDdataset),
method = "TMM"))
} else if(normalize == "No") {
normFactors <- rep(1, d) # factors of 1 give a zero log-offset (no normalization)
} else {
stop("Argument normalize should be 'Yes' or 'No' ")
}
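# Note: the factors enter the model multiplicatively; downstream they are
# used as log-offsets, i.e., log(libMat) is added to the latent means
# inside the exp() terms of the variational updates and the density.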
parallelFA <- function(G, dataset,
TwoDdataset,
r, p, d, n,
normFactors,
nInitIterations,
initMethod) {
# arranging normalization factors
libMat <- matrix(normFactors, n, d, byrow = TRUE)
libMatList <- list()
for (i in 1:n) {
libMatList[[i]] <- t(matrix(libMat[i, ], nrow = p))
}
if (initMethod == "none") {
# Initialization
mu <- omega <- phi <- list() # mu is M;
delta <- kappa <- sigma <- isigma <- iphi <- iomega <- list()
# delta is variational parameter Delta
# kappa is variational parameter kappa
# sigma is Psi in the math - kronecker of omega and Phi
# isigma is inverse of Psi
# iphi is inverse of Phi
# iomega is inverse of omega
m <- S <- list()
# m is the vectorized variational mean for each observation
# S is the variational covariance, delta %x% kappa, for each observation
# Other intermediate items initialized
Sk <- array(0, c(d, d, G) )
start <- GX <- dGX <- zS <- list()
iKappa <- iDelta <- startList <- list()
# iKappa is inverse of kappa
# iDelta is inverse of Delta
kMeansResults <- stats::kmeans(log(TwoDdataset + 1),
centers = G,
nstart = 50,
iter.max = 20)$cluster
zValue <- mclust::unmap(kMeansResults) ### Starting value for Z
piG <- colSums(zValue) / n
for (g in 1:G) {
obs <- which(zValue[, g] == 1)
mu[[g]] <- colMeans(log(TwoDdataset[obs, ] + 1 / 6))
sigma[[g]] <- var(log(TwoDdataset[obs, ] + 1 / 6))
isigma[[g]] <- solve(sigma[[g]])
phi[[g]] <- diag(r) * sqrt(min(diag(var(log(TwoDdataset[obs, ] + 1 / 6)))))
omega[[g]] <- diag(p) * sqrt(min(diag(var(log(TwoDdataset[obs, ] + 1 / 6)))))
iphi[[g]] <- solve(phi[[g]])
iomega[[g]] <- solve(omega[[g]])
}
for (g in 1:G) {
start[[g]] <- log(TwoDdataset + 1 / 6) ###Starting value for M
m[[g]] <- log(TwoDdataset + 1 / 6)
S[[g]] <- list()
delta[[g]] <- list()
kappa[[g]] <- list()
startList[[g]] <- list()
for (i in 1:n) {
startList[[g]][[i]] <- log(dataset[[i]])
delta[[g]][[i]] <- diag(r) * 0.001
kappa[[g]][[i]] <- diag(p) * 0.001
S[[g]][[i]] <- delta[[g]][[i]] %x% kappa[[g]][[i]]
}
}
} else {
# Initialize based on specified initMethod method
outputInitialization <- list()
checklogL <- vector()
for (initIt in seq_along(1:nInitIterations)) {
set.seed(initIt)
outputInitialization[[initIt]] <- initializationRunVGA(
G = G,
dataset = dataset,
TwoDdataset = TwoDdataset,
r = r,
p = p,
d = d,
n = n,
normFactors = normFactors,
initMethod = initMethod)
checklogL[initIt] <- outputInitialization[[initIt]]$finalLogLik
}
# select init run with highest logL
maxRun <- which.max(checklogL)
mu <- outputInitialization[[maxRun]]$mu
omega <- outputInitialization[[maxRun]]$omega
phi <- outputInitialization[[maxRun]]$phi
delta <- outputInitialization[[maxRun]]$delta
kappa <- outputInitialization[[maxRun]]$kappa
sigma <- outputInitialization[[maxRun]]$sigma
isigma <- outputInitialization[[maxRun]]$isigma
iphi <- outputInitialization[[maxRun]]$iphi
iomega <- outputInitialization[[maxRun]]$iomega
m <- outputInitialization[[maxRun]]$m
S <- outputInitialization[[maxRun]]$S
Sk <- outputInitialization[[maxRun]]$Sk
start <- outputInitialization[[maxRun]]$start
GX <- outputInitialization[[maxRun]]$GX
dGX <- outputInitialization[[maxRun]]$dGX
zS <- outputInitialization[[maxRun]]$zS
iKappa <- outputInitialization[[maxRun]]$iKappa
iDelta <- outputInitialization[[maxRun]]$iDelta
startList <- outputInitialization[[maxRun]]$startList
zValue <- outputInitialization[[maxRun]]$zValue
piG <- outputInitialization[[maxRun]]$piG
}
# start clustering after initialization
it <- 1
aloglik <- loglik <- NULL
checks <- aloglik[c(1:5)] <- 0
itMax <- 200
while (checks == 0) {
for (g in 1:G) {
GX[[g]] <- dGX[[g]] <- zS[[g]] <- list()
iDelta[[g]] <- iKappa[[g]] <- list()
deltaO <- delta[[g]]
kappaO <- kappa[[g]]
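# Variational updates for each observation i in component g:
# delta (r x r) and kappa (p x p) are the row- and column-covariance
# variational parameters; S_i = delta_i %x% kappa_i is the covariance of
# the variational Gaussian for the vectorized latent variable, and the
# variational mean m_i is moved by a gradient step preconditioned by S_i
# (GX below corresponds to the gradient of the variational objective
# with respect to m_i).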
for (i in 1:n) {
iDelta[[g]][[i]] <- diag(c(t(diag(kappa[[g]][[i]])) %*%
t(exp(log(libMatList[[i]]) +
startList[[g]][[i]] +
0.5 * diag(delta[[g]][[i]]) %*%
t(diag(kappa[[g]][[i]])))))) +
iphi[[g]] * sum(diag(iomega[[g]] %*%
kappa[[g]][[i]]))
delta[[g]][[i]] <- p * solve(iDelta[[g]][[i]])
iKappa[[g]][[i]] <- diag(c(t(diag(delta[[g]][[i]])) %*%
(exp(log(libMatList[[i]]) +
startList[[g]][[i]] +
t(0.5 * diag(kappa[[g]][[i]]) %*%
t(diag(delta[[g]][[i]]))))))) +
iomega[[g]] * sum(diag(iphi[[g]] %*%
delta[[g]][[i]]))
kappa[[g]][[i]] <- solve(iKappa[[g]][[i]])
S[[g]][[i]] <- delta[[g]][[i]] %x% kappa[[g]][[i]]
zS[[g]][[i]] <- zValue[i, g] * S[[g]][[i]]
GX[[g]][[i]] <- TwoDdataset[i, ] -
exp(start[[g]][i, ] +
log(libMat[i,]) +
0.5 * diag(S[[g]][[i]])) -
(isigma[[g]]) %*% (start[[g]][i, ] - mu[[g]])
m[[g]][i, ] <- start[[g]][i, ] + S[[g]][[i]] %*% GX[[g]][[i]]
startList[[g]][[i]] <- t(matrix(m[[g]][i, ], nrow = p))
}
start[[g]] <- m[[g]]
mu[[g]] <- colSums(zValue[, g] * m[[g]]) / sum(zValue[, g]) # this is xi
# Updating Sample covariance
muMat <- t(matrix(mu[[g]], nrow = p))
phiM <- list()
for (i in 1:n) {
phiM[[i]] <- zValue[i,g]*(startList[[g]][[i]] - muMat) %*%
iomega[[g]] %*% t(startList[[g]][[i]] - muMat) +
zValue[i, g] * delta[[g]][[i]] * sum(diag(iomega[[g]] %*%
kappa[[g]][[i]]))
}
phi[[g]] <- Reduce("+", phiM) / sum(zValue[, g] * p)
iphi[[g]] <- solve(phi[[g]])
omegaM <- list()
for (i in 1:n) {
omegaM[[i]] <- zValue[i, g] * t(startList[[g]][[i]] - muMat) %*%
iphi[[g]] %*% (startList[[g]][[i]] - muMat) +
zValue[i, g] * kappa[[g]][[i]] * sum(diag(iphi[[g]] %*%
delta[[g]][[i]]))
}
omega[[g]] <- Reduce("+", omegaM) / sum(zValue[, g] * r)
iomega[[g]] <- solve(omega[[g]])
sigma[[g]] <- phi[[g]] %x% omega[[g]]
isigma[[g]] <- iphi[[g]] %x% iomega[[g]]
}
piG <- colSums(zValue) / n
# Internal functions
funFive <- function(x, y = isigma[[g]]) {
sum(diag(x %*% y))
}
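# FMatrix[i, g] holds piG[g] times the approximate component density of
# observation i under component g (built from the variational quantities);
# its row sums give the approximate mixture likelihood used for the
# log-likelihood, and FMatrix divided by its row sums yields the posterior
# probabilities zValue (E-step).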
FMatrix <- matrix(NA, ncol = G, nrow = n)
for (g in 1:G) {
two <- rowSums(exp(m[[g]] + log(libMat) +
0.5 * matrix(unlist(lapply(S[[g]], diag)),
ncol = d, byrow = TRUE)))
five <- 0.5 * unlist(lapply(S[[g]], funFive))
six <- 0.5 * log(unlist(lapply(S[[g]], det)))
FMatrix[, g] <- piG[g] * exp(rowSums(m[[g]] * TwoDdataset) -
two - rowSums(lfactorial(TwoDdataset)) +
rowSums(log(libMat) * TwoDdataset) - 0.5 *
mahalanobis(m[[g]], center = mu[[g]], cov = isigma[[g]],
inverted = TRUE) - five + six + 0.5 *
log(det(isigma[[g]])) - d/2)
}
# loglik[it] <- sum(log(rowSums(FMatrix)))
# if FMatrix values is 0, then log(0) = -Inf and logL error
rowSumsFMatrix <- rowSums(FMatrix)
if(any(rowSumsFMatrix == 0) == TRUE) {
zeroRows <- which(rowSumsFMatrix == 0)
rowSumsFMatrix[zeroRows] <- min(rowSumsFMatrix[-zeroRows])
}
loglik[it] <- sum(log(rowSumsFMatrix))
zValue <- FMatrix / rowSumsFMatrix
if (it <= 5) {
zValue[is.nan(zValue)] <- 0
}
if (it > 5) {
# Aitken's acceleration-based stopping criterion
# Print for check; commented out
# cat("\n At it:", it ,"loglik[it - 1]:",
# loglik[it - 1], "- loglik[it - 2]:", loglik[it - 2],
# "is", loglik[it - 1] - loglik[it - 2], "\n")
if ((loglik[it - 1] - loglik[it - 2]) == 0) checks <- 1 else {
a <- (loglik[it] - loglik[it - 1]) / (loglik[it - 1] - loglik[it - 2])
addTo <- (1 / (1 - a) * (loglik[it] - loglik[it - 1]))
aloglik[it] <- loglik[it - 1] + addTo
if (abs(aloglik[it] - aloglik[it - 1]) < 0.05) {
checks <- 1
} else {
checks <- checks
}
}
}
it <- it + 1
if (it == itMax) {
checks <- 1
}
finalPhi <- finalOmega <- list()
for (g in 1:G) {
finalPhi[[g]] <- phi[[g]] / diag(phi[[g]])[1]
finalOmega[[g]] <- omega[[g]] * diag(phi[[g]])[1]
}
}
programclust <- mclust::map(zValue)
FinalGResults <- list(mu = matrix(unlist(mu),
ncol = p,
byrow = TRUE),
sigma = sigma,
phi = finalPhi,
omega = finalOmega,
probaPost = zValue,
loglikelihood = loglik,
proportion = piG,
clusterlabels = programclust,
iterations = it)
class(FinalGResults) <- "FinalGResults"
return(FinalGResults)
}
parallelFAOutput <- list()
for(g in seq_along(1:(gmax - gmin + 1))) {
if(length(1:(gmax - gmin + 1)) == gmax) {
clustersize <- g
} else if(length(1:(gmax - gmin + 1)) < gmax) {
clustersize <- seq(gmin, gmax, 1)[g]
}
parallelFAOutput[[g]] <- parallelFA(
G = clustersize,
dataset = dataset,
TwoDdataset = TwoDdataset,
r = r,
p = p,
d = d,
n = n,
normFactors = normFactors,
nInitIterations = nInitIterations,
initMethod = initMethod)
}
# cluster data result extracting
BIC <- ICL <- AIC <- AIC3 <- k <- ll <- vector()
for(g in seq_along(1:(gmax - gmin + 1))) {
#print(g)
if(length(1:(gmax - gmin + 1)) == gmax) {
clustersize <- g
} else if(length(1:(gmax - gmin + 1)) < gmax) {
clustersize <- seq(gmin, gmax, 1)[g]
}
# save the final log-likelihood
ll[g] <- unlist(tail(parallelFAOutput[[g]]$loglikelihood, n = 1))
k[g] <- calcParameters(g = clustersize,
r = r,
p = p)
# starting model selection
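# Model selection criteria (standard definitions, assuming ll is the final
# log-likelihood and k the number of free parameters for each cluster size):
#   BIC  = -2 * ll + k * log(n)   (Schwarz, 1978)
#   AIC  = -2 * ll + 2 * k        (Akaike, 1973)
#   AIC3 = -2 * ll + 3 * k        (Bozdogan, 1994)
#   ICL  = BIC plus an entropy penalty based on the posterior
#          probabilities (Biernacki et al., 2000)
# The exact computations are carried out by the *Function helpers called below.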
if (g == max(1:(gmax - gmin + 1))) {
bic <- BICFunction(ll = ll,
k = k,
n = n,
run = parallelFAOutput,
gmin = gmin,
gmax = gmax,
parallel = FALSE)
icl <- ICLFunction(bIc = bic,
gmin = gmin,
gmax = gmax,
run = parallelFAOutput,
parallel = FALSE)
aic <- AICFunction(ll = ll,
k = k,
run = parallelFAOutput,
gmin = gmin,
gmax = gmax,
parallel = FALSE)
aic3 <- AIC3Function(ll = ll,
k = k,
run = parallelFAOutput,
gmin = gmin,
gmax = gmax,
parallel = FALSE)
}
}
final <- base::proc.time() - ptm
RESULTS <- list(dataset = dataset,
nUnits = n,
nVariables = p,
nOccassions = r,
normFactors = normFactors,
gmin = gmin,
gmax = gmax,
initalizationMethod = initMethod,
allResults = parallelFAOutput,
loglikelihood = ll,
nParameters = k,
trueLabels = membership,
ICLAll = icl,
BICAll = bic,
AICAll = aic,
AIC3All = aic3,
totalTime = final)
class(RESULTS) <- "mvplnVGA"
return(RESULTS)
}
initializationRunVGA <- function(G,
dataset,
TwoDdataset,
r, p, d, n,
normFactors,
initMethod) {
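# Helper: runs a short VGA fit starting from the assignments produced by
# initMethod and returns the fitted quantities together with finalLogLik;
# the caller (parallelFA) keeps the run with the highest finalLogLik to
# seed the main clustering loop.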
# arranging normalization factors
libMat <- matrix(normFactors, n, d, byrow = TRUE)
libMatList <- list()
for (i in 1:n) {
libMatList[[i]] <- t(matrix(libMat[i, ], nrow = p))
}
# Initialization
mu <- omega <- phi <- list() # mu is M;
delta <- kappa <- sigma <- isigma <-
iphi <- iomega <- list()
# delta is variational parameter Delta
# kappa is variational parameter kappa
# sigma is Psi in the math - kronecker of omega and Phi
# isigma is inverse of Psi
# iphi is inverse of Phi
# iomega is inverse of omega
m <- S <- list()
# m is the vectorized variational mean for each observation
# S is the variational covariance, delta %x% kappa, for each observation
# Other intermediate items initialized
Sk <- array(0, c(d, d, G) )
start <- GX <- dGX <- zS <- list()
iKappa <- iDelta <- startList <- list()
# iKappa is inverse of kappa
# iDelta is inverse of Delta
if (initMethod == "kmeans") {
# cat("\n initMethod == kmeans \n")
zValue <- mclust::unmap(stats::kmeans(x = log(TwoDdataset + 1 / 3),
centers = G)$cluster)
} else if (initMethod == "random") {
# cat("\n initMethod == random \n")
if(G == 1) { # generating z if g = 1
zValue <- as.matrix(rep.int(1, times = n),
ncol = G,
nrow = n)
} else { # generating z if g>1
zConv = 0
while(! zConv) { # ensure that dimension of z is same as G (i.e.
# if one column contains all 0s, then generate z again)
zValue <- t(stats::rmultinom(n = n,
size = 1,
prob = rep(1 / G, G)))
if(length(which(colSums(zValue) > 0)) == G) {
zConv <- 1
}
}
}
} else if (initMethod == "medoids") {
# cat("\n initMethod == medoids \n")
zValue <- mclust::unmap(cluster::pam(x = log(TwoDdataset + 1 / 3),
k = G)$cluster)
} else if (initMethod == "clara") {
# cat("\n initMethod == clara \n")
zValue <- mclust::unmap(cluster::clara(x = log(TwoDdataset + 1 / 3),
k = G)$cluster)
} else if (initMethod == "fanny") {
# cat("\n initMethod == fanny \n")
zValue <- mclust::unmap(cluster::fanny(x = log(TwoDdataset + 1 / 3),
k = G)$cluster)
}
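# Each method above produces hard cluster assignments that mclust::unmap
# converts to an n x G indicator matrix zValue; the "random" option instead
# draws zValue directly from a multinomial distribution.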
piG <- colSums(zValue) / n
for (g in 1:G) {
obs <- which(zValue[, g] == 1)
mu[[g]] <- colMeans(log(TwoDdataset[obs, ] + 1 / 6))
sigma[[g]] <- var(log(TwoDdataset[obs, ] + 1 / 6))
isigma[[g]] <- solve(sigma[[g]])
phi[[g]] <- diag(r) * sqrt(min(diag(var(log(TwoDdataset[obs, ] + 1 / 6)))))
omega[[g]] <- diag(p) * sqrt(min(diag(var(log(TwoDdataset[obs, ] + 1 / 6)))))
iphi[[g]] <- solve(phi[[g]])
iomega[[g]] <- solve(omega[[g]])
}
for (g in 1:G) {
start[[g]] <- log(TwoDdataset + 1/6) ###Starting value for M
m[[g]] <- log(TwoDdataset + 1/6)
S[[g]] <- list()
delta[[g]] <- list()
kappa[[g]] <- list()
startList[[g]] <- list()
for (i in 1:n) {
startList[[g]][[i]] <- log(dataset[[i]])
delta[[g]][[i]] <- diag(r) * 0.001
kappa[[g]][[i]] <- diag(p) * 0.001
S[[g]][[i]] <- delta[[g]][[i]] %x% kappa[[g]][[i]]
}
}
it <- 1
aloglik <- loglik <- NULL
checks <- aloglik[c(1:5)] <- 0
itMax <- 200
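# The variational updates below mirror those in the main loop of parallelFA;
# see the comments there for details.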
while (checks == 0) {
for (g in 1:G) {
GX[[g]] <- dGX[[g]] <- zS[[g]] <- list()
iDelta[[g]] <- iKappa[[g]] <- list()
deltaO <- delta[[g]]
kappaO <- kappa[[g]]
for (i in 1:n) {
iDelta[[g]][[i]] <- diag(c(t(diag(kappa[[g]][[i]])) %*%
t(exp(log(libMatList[[i]]) +
startList[[g]][[i]] +
0.5 * diag(delta[[g]][[i]]) %*%
t(diag(kappa[[g]][[i]])))))) +
iphi[[g]] * sum(diag(iomega[[g]] %*%
kappa[[g]][[i]]))
delta[[g]][[i]] <- p * solve(iDelta[[g]][[i]])
iKappa[[g]][[i]] <- diag(c(t(diag(delta[[g]][[i]])) %*%
(exp(log(libMatList[[i]]) +
startList[[g]][[i]] +
t(0.5 * diag(kappa[[g]][[i]]) %*%
t(diag(delta[[g]][[i]]))))))) +
iomega[[g]] * sum(diag(iphi[[g]] %*%
delta[[g]][[i]]))
kappa[[g]][[i]] <- solve(iKappa[[g]][[i]])
S[[g]][[i]] <- delta[[g]][[i]] %x% kappa[[g]][[i]]
zS[[g]][[i]] <- zValue[i, g] * S[[g]][[i]]
GX[[g]][[i]] <- TwoDdataset[i, ] -
exp(start[[g]][i, ] +
log(libMat[i,]) +
0.5 * diag(S[[g]][[i]])) -
(isigma[[g]]) %*% (start[[g]][i, ] - mu[[g]])
m[[g]][i, ] <- start[[g]][i, ] + S[[g]][[i]] %*% GX[[g]][[i]]
startList[[g]][[i]] <- t(matrix(m[[g]][i, ], nrow = p))
}
start[[g]] <- m[[g]]
mu[[g]] <- colSums(zValue[, g] * m[[g]]) / sum(zValue[, g]) # this is xi
# Updating Sample covariance
muMat <- t(matrix(mu[[g]], nrow = p))
phiM <- list()
for (i in 1:n) {
phiM[[i]] <- zValue[i,g]*(startList[[g]][[i]] - muMat) %*%
iomega[[g]] %*% t(startList[[g]][[i]] - muMat) +
zValue[i, g] * delta[[g]][[i]] * sum(diag(iomega[[g]] %*%
kappa[[g]][[i]]))
}
phi[[g]] <- Reduce("+", phiM) / sum(zValue[, g] * p)
iphi[[g]] <- solve(phi[[g]])
omegaM <- list()
for (i in 1:n) {
omegaM[[i]] <- zValue[i, g] * t(startList[[g]][[i]] - muMat) %*%
iphi[[g]] %*% (startList[[g]][[i]] - muMat) +
zValue[i, g] * kappa[[g]][[i]] * sum(diag(iphi[[g]] %*%
delta[[g]][[i]]))
}
omega[[g]] <- Reduce("+", omegaM) / sum(zValue[, g] * r)
iomega[[g]] <- solve(omega[[g]])
sigma[[g]] <- phi[[g]] %x% omega[[g]]
isigma[[g]] <- iphi[[g]] %x% iomega[[g]]
}
piG <- colSums(zValue) / n
# Internal functions
funFive <- function(x, y = isigma[[g]]) {
sum(diag(x %*% y))
}
FMatrix <- matrix(NA, ncol = G, nrow = n)
for (g in 1:G) {
two <- rowSums(exp(m[[g]] + log(libMat) +
0.5 * matrix(unlist(lapply(S[[g]], diag)),
ncol = d, byrow = TRUE)))
five <- 0.5 * unlist(lapply(S[[g]], funFive))
six <- 0.5 * log(unlist(lapply(S[[g]], det)))
FMatrix[, g] <- piG[g] * exp(rowSums(m[[g]] * TwoDdataset) -
two - rowSums(lfactorial(TwoDdataset)) +
rowSums(log(libMat) * TwoDdataset) - 0.5 *
mahalanobis(m[[g]], center = mu[[g]], cov = isigma[[g]],
inverted = TRUE) - five + six + 0.5 *
log(det(isigma[[g]])) - d/2)
}
# loglik[it] <- sum(log(rowSums(FMatrix)))
# if FMatrix values is 0, then log(0) = -Inf and logL error
rowSumsFMatrix <- rowSums(FMatrix)
if(any(rowSumsFMatrix == 0) == TRUE) {
zeroRows <- which(rowSumsFMatrix == 0)
rowSumsFMatrix[zeroRows] <- min(rowSumsFMatrix[-zeroRows])
}
loglik[it] <- sum(log(rowSumsFMatrix))
zValue <- FMatrix / rowSumsFMatrix
if (it <= 5) {
zValue[is.nan(zValue)] <- 0
}
# cat("\n Initialization loglik[it - 1] - loglik[it - 2]:", loglik[it - 1] - loglik[it - 2])
if (it > 5) {
# Aitken's acceleration-based stopping criterion
if ((loglik[it - 1] - loglik[it - 2]) == 0) checks <- 1 else {
a <- (loglik[it] - loglik[it - 1]) / (loglik[it - 1] - loglik[it - 2])
addTo <- (1 / (1 - a) * (loglik[it] - loglik[it - 1]))
aloglik[it] <- loglik[it - 1] + addTo
if (abs(aloglik[it] - aloglik[it - 1]) < 0.05) {
checks <- 1
} else {
checks <- checks
}
}
}
it <- it + 1
if (it == itMax) {
checks <- 1
}
finalPhi <- finalOmega <- list()
for (g in 1:G) {
finalPhi[[g]] <- phi[[g]] / diag(phi[[g]])[1]
finalOmega[[g]] <- omega[[g]] * diag(phi[[g]])[1]
}
}
programclust <- mclust::map(zValue)
initRunOutput <- list(mu = mu,
sigma = sigma,
omega = omega,
phi = phi,
delta = delta,
kappa = kappa,
isigma = isigma,
iphi = iphi,
iomega = iomega,
m = m,
S = S,
Sk = Sk,
start = start,
GX = GX,
dGX = dGX,
zS = zS,
iKappa = iKappa,
iDelta = iDelta,
startList = startList,
zValue = zValue,
loglik = loglik,
finalLogLik = loglik[it-1],
piG = piG,
clusterlabels = programclust,
iterations = it)
class(initRunOutput) <- "initializationRunVGA"
return(initRunOutput)
}
# [END]