#' Clustering Using MPLN With MCMC-EM Via Parallel Performance
#'
#' Performs clustering using mixtures of multivariate Poisson-log
#' normal (MPLN) distribution with Markov chain Monte Carlo
#' expectation-maximization algorithm (MCMC-EM) for parameter
#' estimation. Coarse grain parallelization is employed, such that
#' when a range of components/clusters (g = 1,...,G) are considered, each
#' G is run on a different processor. This can be performed because
#' each component/cluster size is independent from another. All
#' components/clusters in the range to be tested have been parallelized
#' to run on a seperate core using the *parallel* R package. The number of
#' nodes to be used for clustering can be specified or calculated using
#' *parallel::detectCores() - 1*. Model selection is performed using
#' AIC, AIC3, BIC and ICL.
#'
#' @param dataset A dataset of class matrix and type integer such that
#' rows correspond to observations and columns correspond to variables.
#' The dataset have dimensions n x d, where n is the total number of
#' observations and d is the dimensionality. If rowSums are zero,
#' these rows will be removed prior to cluster analysis.
#' @param membership A numeric vector of length nrow(dataset) containing the
#' cluster membership of each observation. If not available,
#' leave as "none".
#' @param gmin A positive integer specifying the minimum number of components
#' to be considered in the clustering run.
#' @param gmax A positive integer, >= gmin, specifying the maximum number of
#' components to be considered in the clustering run.
#' @param nChains A positive integer specifying the number of Markov chains.
#' Default is 3, the recommended minimum number.
#' @param nIterations A positive integer specifying the number of iterations
#' for each MCMC chain (including warmup). The value should be greater than
#' 40. The upper limit will depend on size of dataset.
#' @param initMethod An algorithm for initialization. Current options are
#' "kmeans", "random", "medoids", "clara", or "fanny". Default is "kmeans".
#' @param nInitIterations A positive integer or zero, specifying the number
#' of initialization runs to be performed. This many runs, each with 10
#' iterations, will be performed via MPLNClust and values from the run with
#' highest log-likelihood will be used as initialization values. Default is 0.
#' @param normalize A string with options "Yes" or "No" specifying
#' if normalization should be performed. Currently, normalization factors
#' are calculated using TMM method of edgeR package. Default is "Yes".
#' @param numNodes A positive integer indicating the number of nodes to be
#' used from the local machine to run the clustering algorithm. Else
#' leave as NA, so default will be detected as
#' parallel::makeCluster(parallel::detectCores() - 1).
#'
#' @return Returns an S3 object of class mplnMCMCParallel with results.
#' \itemize{
#' \item dataset - The input dataset on which clustering is performed.
#' \item dimensionality - Dimensionality of the input dataset.
#' \item normalizationFactors - A vector of normalization factors used
#' for input dataset.
#' \item gmin - Minimum number of components considered in the clustering
#' run.
#' \item gmax - Maximum number of components considered in the clustering
#' run.
#' \item initalizationMethod - Method used for initialization.
#' \item allResults - A list with all results.
#' \item logLikelihood - A vector with value of final log-likelihoods for
#' each cluster size.
#' \item numbParameters - A vector with number of parameters for each
#' cluster size.
#' \item trueLabels - The vector of true labels, if provided by user.
#' \item ICLresults - A list with all ICL model selection results.
#' \item BICresults - A list with all BIC model selection results.
#' \item AICresults - A list with all AIC model selection results.
#' \item AIC3results - A list with all AIC3 model selection results.
#' \item slopeHeuristics - If more than 10 models are considered, slope heuristic
#' results as obtained via capushe::capushe().
#' \item DjumpModelSelected - If more than 10 models are considered, slope heuristic
#' results as obtained via capushe::capushe().
#' \item DDSEModelSelected - If more than 10 models are considered, slope heuristic
#' results as obtained via capushe::capushe().
#' \item totalTime - Total time used for clustering and model selection.
#' }
#'
#' @examples
#' \dontrun{
#' trueMu1 <- c(6.5, 6, 6, 6, 6, 6)
#' trueMu2 <- c(2, 2.5, 2, 2, 2, 2)
#'
#' trueSigma1 <- diag(6) * 2
#' trueSigma2 <- diag(6)
#'
#' # Generating simulated data
#' sampleData <- MPLNClust::mplnDataGenerator(nObservations = 40,
#' dimensionality = 6,
#' mixingProportions = c(0.79, 0.21),
#' mu = rbind(trueMu1, trueMu2),
#' sigma = rbind(trueSigma1, trueSigma2),
#' produceImage = "No")
#'
#' # Clustering
#' mplnResults <- MPLNClust::mplnMCMCParallel(dataset = sampleData$dataset,
#' membership = sampleData$trueMembership,
#' gmin = 1,
#' gmax = 1,
#' nChains = 3,
#' nIterations = 400,
#' initMethod = "kmeans",
#' nInitIterations = 0,
#' normalize = "Yes",
#' numNodes = 2)
#' }
#' @author {Anjali Silva, \email{anjali@alumni.uoguelph.ca}, Sanjeena Dang,
#' \email{sanjeenadang@cunet.carleton.ca}. }
#'
#' @references
#' Aitchison, J. and C. H. Ho (1989). The multivariate Poisson-log normal distribution.
#' \emph{Biometrika} 76.
#'
#' Akaike, H. (1973). Information theory and an extension of the maximum likelihood
#' principle. In \emph{Second International Symposium on Information Theory}, New York, NY,
#' USA, pp. 267–281. Springer Verlag.
#'
#' Arlot, S., Brault, V., Baudry, J., Maugis, C., and Michel, B. (2016).
#' capushe: CAlibrating Penalities Using Slope HEuristics. R package version 1.1.1.
#'
#' Biernacki, C., G. Celeux, and G. Govaert (2000). Assessing a mixture model for
#' clustering with the integrated classification likelihood. \emph{IEEE Transactions
#' on Pattern Analysis and Machine Intelligence} 22.
#'
#' Bozdogan, H. (1994). Mixture-model cluster analysis using model selection criteria
#' and a new informational measure of complexity. In \emph{Proceedings of the First US/Japan
#' Conference on the Frontiers of Statistical Modeling: An Informational Approach:
#' Volume 2 Multivariate Statistical Modeling}, pp. 69–113. Dordrecht: Springer Netherlands.
#'
#' Robinson, M.D., and Oshlack, A. (2010). A scaling normalization method for differential
#' expression analysis of RNA-seq data. \emph{Genome Biology} 11, R25.
#'
#' Schwarz, G. (1978). Estimating the dimension of a model. \emph{The Annals of Statistics}
#' 6.
#'
#' Silva, A. et al. (2019). A multivariate Poisson-log normal mixture model
#' for clustering transcriptome sequencing data. \emph{BMC Bioinformatics} 20.
#' \href{https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2916-0}{Link}
#'
#' @export
#' @import coda
#' @importFrom capushe capushe
#' @import cluster
#' @import clusterGeneration
#' @importFrom edgeR calcNormFactors
#' @import MASS
#' @importFrom mclust unmap
#' @importFrom mclust map
#' @importFrom mvtnorm rmvnorm
#' @import Rcpp
#' @importFrom rstan sampling
#' @importFrom rstan stan_model
#' @import parallel
#' @import stats
#' @importFrom utils tail
#' @importFrom utils write.csv
#'
mplnMCMCParallel <- function(dataset,
membership = "none",
gmin = 1,
gmax = 2,
nChains = 3,
nIterations = 1000,
initMethod = "kmeans",
nInitIterations = 0,
normalize = "Yes",
numNodes = NA) {
# Keeping track of time
ptm <- proc.time()
# Performing checks of user input
if (typeof(dataset) != "double" & typeof(dataset) != "integer") {
stop("Dataset type needs to be integer.")
}
if (any((dataset %% 1 == 0) == FALSE)) {
stop("Dataset should be a matrix of counts.")
}
if (is.matrix(dataset) != TRUE) {
stop("Dataset needs to be a matrix.")
}
if (any(colSums(dataset) <= 0)) {
stop("Column sums cannot be less than or equal to 0. Double check dataset.")
}
dimensionality <- ncol(dataset)
nObservations <- nrow(dataset)
if(is.numeric(gmin) != TRUE || is.numeric(gmax) != TRUE) {
stop("Class of gmin and gmin should be numeric.")
}
if (gmax < gmin) {
stop("gmax cannot be less than gmin.")
}
if (gmax > nObservations) {
stop("gmax cannot be larger than nrow(dataset).")
}
if (is.numeric(nChains) != TRUE) {
stop("nChains should be a positive integer of class numeric specifying
the number of Markov chains.")
}
if(nChains < 3) {
cat("nChains is less than 3. Note: recommended minimum number of
chains is 3.")
}
if(is.numeric(nIterations) != TRUE) {
stop("nIterations should be a positive integer of class numeric,
specifying the number of iterations for each Markov chain
(including warmup).")
}
if(is.numeric(nIterations) == TRUE && nIterations < 40) {
stop("nIterations argument should be greater than 40.")
}
if(all(membership != "none") && is.numeric(membership) != TRUE) {
stop("membership should be a numeric vector containing the
cluster membership. Otherwise, leave as 'none'.")
}
if(all(membership != "none") &&
all((diff(sort(unique(membership))) == 1) != TRUE) ) {
stop("Cluster memberships in the membership vector
are missing a cluster, e.g. 1, 3, 4, 5, 6 is missing cluster 2.")
}
if(all(membership != "none") && length(membership) != nObservations) {
stop("membership should be a numeric vector, where length(membership)
should equal the number of observations. Otherwise, leave as 'none'.")
}
# Remove rows with only zeros, if present
removezeros <- removeZeroCounts(dataset = dataset, membership = membership)
dataset <- removezeros$dataset
membership <- removezeros$membership
dimensionality <- ncol(dataset)
nObservations <- nrow(dataset)
if (is.character(initMethod) == TRUE) {
initMethodsUsed <- c("kmeans", "random", "medoids", "clara", "fanny")
if(all((initMethod == initMethodsUsed) == FALSE)) {
stop("initMethod should of class character, specifying
either: kmeans, random, medoids, clara, or fanny.")
}
} else if (is.character(initMethod) != TRUE) {
stop("initMethod should of class character, specifying
either: kmeans, random, medoids, clara, or fanny.")
}
if (is.numeric(nInitIterations) != TRUE) {
stop("nInitIterations should be positive integer or zero, specifying
the number of initialization runs to be considered.")
}
if (is.character(normalize) != TRUE) {
stop("normalize should be a string of class character specifying
if normalization should be performed.")
}
if (normalize != "Yes" && normalize != "No") {
stop("normalize should be a string indicating Yes or No, specifying
if normalization should be performed.")
}
# Check numNodes and calculate the number of cores and initiate cluster
if(is.na(numNodes) == TRUE) {
cl <- parallel::makeCluster(parallel::detectCores() - 1)
} else if (is.numeric(numNodes) == TRUE) {
cl <- parallel::makeCluster(numNodes)
} else {
stop("numNodes should be a positive integer indicating the number of
nodes to be used from the local machine. Else leave as NA, so the
numNodes will be determined by
parallel::makeCluster(parallel::detectCores() - 1).")
}
# Calculating normalization factors
if(normalize == "Yes") {
normFactors <- log(as.vector(edgeR::calcNormFactors(as.matrix(dataset),
method = "TMM")))
} else if(normalize == "No") {
normFactors <- rep(0, dimensionality)
} else{
stop("normalize should be 'Yes' or 'No' ")
}
# Construct a Stan model
# Stan model was developed with help of Sanjeena Dang
# <sanjeenadang@cunet.carleton.ca>
stancode <- 'data{int<lower=1> d; // Dimension of theta
int<lower=0> N; //Sample size
int y[N,d]; //Array of Y
vector[d] mu;
matrix[d,d] Sigma;
vector[d] normfactors;
}
parameters{
matrix[N,d] theta;
}
model{ //Change values for the priors as appropriate
for (n in 1:N){
theta[n,]~multi_normal(mu,Sigma);
}
for (n in 1:N){
for (k in 1:d){
real z;
z=exp(normfactors[k]+theta[n,k]);
y[n,k]~poisson(z);
}
}
}'
mod <- rstan::stan_model(model_code = stancode, verbose = FALSE)
# Constructing parallel code
mplnParallelCode <- function(g) {
## ** Never use set.seed(), use clusterSetRNGStream() instead,
# to set the cluster seed if you want reproducible results
# clusterSetRNGStream(cl = cl, iseed = g)
mplnParallelRun <- callingClustering(data = dataset,
gmin = g,
gmax = g,
nChains = nChains,
initMethod = initMethod,
nInitIterations = nInitIterations,
nIterations = nIterations,
normFactors = normFactors,
model = mod)
return(mplnParallelRun)
}
# Doing clusterExport
parallel::clusterExport(cl = cl,
varlist = c(
"nInitIterations",
"dataset",
"initMethod",
"normFactors",
"nChains",
"nIterations",
"calcLikelihood",
"calcParameters",
"callingClustering",
"mplnCluster",
"initializationRun",
"mplnParallelCode",
"mod",
"AICFunction",
"AIC3Function",
"BICFunction",
"ICLFunction",
"removeZeroCounts",
"stanRun",
"calcZvalue"),
envir = environment())
# Doing clusterEvalQ
parallel::clusterEvalQ(cl, library(parallel))
parallel::clusterEvalQ(cl, library(rstan))
parallel::clusterEvalQ(cl, library(Rcpp))
parallel::clusterEvalQ(cl, library(mclust))
parallel::clusterEvalQ(cl, library(mvtnorm))
parallel::clusterEvalQ(cl, library(edgeR))
parallel::clusterEvalQ(cl, library(capushe))
parallel::clusterEvalQ(cl, library(clusterGeneration))
parallel::clusterEvalQ(cl, library(coda))
parallelRun <- list()
cat("\nRunning parallel code now.")
parallelRun <- parallel::clusterMap(cl = cl,
fun = mplnParallelCode,
g = gmin:gmax)
cat("\nDone parallel code.")
parallel::stopCluster(cl)
BIC <- ICL <- AIC <- AIC3 <- Djump <- DDSE <- nParameters <- logLikelihood <- vector()
for(g in seq_along(1:(gmax - gmin + 1))) {
# save the final log-likelihood
logLikelihood[g] <- unlist(tail(parallelRun[[g]]$allResults$logLikelihood,
n = 1))
if(length(1:(gmax - gmin + 1)) == gmax) {
clustersize <- g
} else if(length(1:(gmax - gmin + 1)) < gmax) {
clustersize <- seq(gmin, gmax, 1)[g]
}
nParameters[g] <- calcParameters(numberG = clustersize,
dimensionality = dimensionality)
if (g == max(1:(gmax - gmin + 1))) { # starting model selection
bic <- BICFunction(logLikelihood = logLikelihood,
nParameters = nParameters,
nObservations = nObservations,
clusterRunOutput = parallelRun,
gmin = gmin,
gmax = gmax,
parallel = TRUE)
icl <- ICLFunction(logLikelihood = logLikelihood,
nParameters = nParameters,
nObservations = nObservations,
gmin = gmin,
gmax = gmax,
clusterRunOutput = parallelRun,
parallel = TRUE)
aic <- AICFunction(logLikelihood = logLikelihood,
nParameters = nParameters,
clusterRunOutput = parallelRun,
gmin = gmin,
gmax = gmax,
parallel = TRUE)
aic3 <- AIC3Function(logLikelihood = logLikelihood,
nParameters = nParameters,
clusterRunOutput = parallelRun,
gmin = gmin,
gmax = gmax,
parallel = TRUE)
}
}
# for Djump and DDSE
if((gmax - gmin + 1) > 10 ) {
# adapted based on HTSCluster package 2.0.8 (25 Oct 2016)
Kchoice <- gmin:gmax # all the cluster/component sizes considered
message("Note: diagnostic plots for results corresponding
to model selection via slope heuristics (Djump and DDSE)
should be examined to ensure that sufficiently complex
models have been considered.")
mat <- cbind(Kchoice, nParameters / nObservations, nParameters / nObservations, - logLikelihood)
ResCapushe <- capushe::capushe(mat, nObservations)
DDSEmodel<- ResCapushe@DDSE@model
Djumpmodel<- ResCapushe@Djump@model
final <- proc.time() - ptm
RESULTS <- list(dataset = dataset,
dimensionality = dimensionality,
normalizationFactors = normFactors,
gmin = gmin,
gmax = gmax,
initalizationMethod = initMethod,
allResults = parallelRun,
logLikelihood = logLikelihood,
numbParameters = nParameters,
trueLabels = membership,
ICLresults = icl,
BICresults = bic,
AICresults = aic,
AIC3results = aic3,
slopeHeuristics = ResCapushe,
DjumpModelSelected = ResCapushe@Djump@model,
DDSEModelSelected = ResCapushe@DDSE@model,
totalTime = final)
} else {
final <- proc.time() - ptm
RESULTS <- list(dataset = dataset,
dimensionality = dimensionality,
normalizationFactors = normFactors,
gmin = gmin,
gmax = gmax,
initalizationMethod = initMethod,
allResults = parallelRun,
logLikelihood = logLikelihood,
numbParameters = nParameters,
trueLabels = membership,
ICLresults = icl,
BICresults = bic,
AICresults = aic,
AIC3results = aic3,
slopeHeuristics = "Not used",
totalTime = final)
}
class(RESULTS) <- "mplnMCMCParallel"
return(RESULTS)
}
#' Clustering Using MPLN With MCMC-EM Via Non-Parallel Performance
#'
#' Performs clustering using mixtures of multivariate Poisson-log
#' normal (MPLN) distribution with Markov chain Monte Carlo
#' expectation-maximization algorithm (MCMC-EM) for parameter
#' estimation. No internal parallelization, thus code is run in
#' serial. Model selection is performed using AIC, AIC3, BIC
#' and ICL.
#'
#' @param dataset A dataset of class matrix and type integer such that
#' rows correspond to observations and columns correspond to variables.
#' The dataset have dimensions n x d, where n is the total number of
#' observations and d is the dimensionality. If rowSums are zero,
#' these rows will be removed prior to cluster analysis.
#' @param membership A numeric vector of length nrow(dataset) containing the
#' cluster membership of each observation. If not available,
#' leave as "none".
#' @param gmin A positive integer specifying the minimum number of components
#' to be considered in the clustering run.
#' @param gmax A positive integer, >= gmin, specifying the maximum number of
#' components to be considered in the clustering run.
#' @param nChains A positive integer specifying the number of Markov chains.
#' Default is 3, the recommended minimum number.
#' @param nIterations A positive integer specifying the number of iterations
#' for each chain (including warmup). The value should be greater than 40.
#' The upper limit will depend on size of dataset.
#' @param initMethod An algorithm for initialization. Current options are
#' "kmeans", "random", "medoids", "clara", or "fanny". Default is "kmeans"
#' @param nInitIterations A positive integer or zero, specifying the number
#' of initialization runs to be performed. This many runs, each with 10
#' iterations, will be performed via MPLNClust and values from the run with
#' highest log-likelihood will be used as initialization values. Default is 0.
#' @param normalize A string with options "Yes" or "No" specifying
#' if normalization should be performed. Currently, normalization factors
#' are calculated using TMM method of edgeR package. Default is "Yes".
#'
#' @return Returns an S3 object of class mplnMCMCNonParallel with results.
#' \itemize{
#' \item dataset - The input dataset on which clustering is performed.
#' \item dimensionality - Dimensionality of the input dataset.
#' \item normalizationFactors - A vector of normalization factors used
#' for input dataset.
#' \item gmin - Minimum number of components considered in the clustering
#' run
#' \item gmax - Maximum number of components considered in the clustering
#' run
#' \item initalizationMethod - Method used for initialization.
#' \item allResults - A list with all results.
#' \item logLikelihood - A vector with value of final log-likelihoods for
#' each cluster size.
#' \item numbParameters - A vector with number of parameters for each
#' cluster size.
#' \item trueLabels - The vector of true labels, if provided by user.
#' \item ICLresults - A list with all ICL model selection results.
#' \item BICresults - A list with all BIC model selection results.
#' \item AICresults - A list with all AIC model selection results.
#' \item AIC3results - A list with all AIC3 model selection results.
#' \item slopeHeuristics - If more than 10 models are considered, slope heuristic
#' results as obtained via capushe::capushe().
#' \item DjumpModelSelected - If more than 10 models are considered, slope heuristic
#' results as obtained via capushe::capushe().
#' \item DDSEModelSelected - If more than 10 models are considered, slope heuristic
#' results as obtained via capushe::capushe().
#' \item totalTime - Total time used for clustering and model selection.
#' }
#'
#' @examples
#' \dontrun{
#' trueMu1 <- c(6.5, 6, 6, 6, 6, 6)
#' trueMu2 <- c(2, 2.5, 2, 2, 2, 2)
#'
#' trueSigma1 <- diag(6) * 2
#' trueSigma2 <- diag(6)
#'
#' # Generating simulated data
#' sampleData <- MPLNClust::mplnDataGenerator(nObservations = 40,
#' dimensionality = 6,
#' mixingProportions = c(0.79, 0.21),
#' mu = rbind(trueMu1, trueMu2),
#' sigma = rbind(trueSigma1, trueSigma2),
#' produceImage = "No")
#'
#' # Clustering
#' mplnResults <- MPLNClust::mplnMCMCNonParallel(dataset = sampleData$dataset,
#' membership = sampleData$trueMembership,
#' gmin = 1,
#' gmax = 1,
#' nChains = 3,
#' nIterations = 700,
#' initMethod = "kmeans",
#' nInitIterations = 0,
#' normalize = "Yes")
#' }
#'
#' @author {Anjali Silva, \email{anjali@alumni.uoguelph.ca}, Sanjeena Dang,
#' \email{sanjeenadang@cunet.carleton.ca}. }
#'
#' @references
#' Aitchison, J. and C. H. Ho (1989). The multivariate Poisson-log normal distribution.
#' \emph{Biometrika} 76.
#'
#' Akaike, H. (1973). Information theory and an extension of the maximum likelihood
#' principle. In \emph{Second International Symposium on Information Theory}, New York, NY,
#' USA, pp. 267–281. Springer Verlag.
#'
#' Arlot, S., Brault, V., Baudry, J., Maugis, C., and Michel, B. (2016).
#' capushe: CAlibrating Penalities Using Slope HEuristics. R package version 1.1.1.
#'
#' Biernacki, C., G. Celeux, and G. Govaert (2000). Assessing a mixture model for
#' clustering with the integrated classification likelihood. \emph{IEEE Transactions
#' on Pattern Analysis and Machine Intelligence} 22.
#'
#' Bozdogan, H. (1994). Mixture-model cluster analysis using model selection criteria
#' and a new informational measure of complexity. In \emph{Proceedings of the First US/Japan
#' Conference on the Frontiers of Statistical Modeling: An Informational Approach:
#' Volume 2 Multivariate Statistical Modeling}, pp. 69–113. Dordrecht: Springer Netherlands.
#'
#' Robinson, M.D., and Oshlack, A. (2010). A scaling normalization method for differential
#' expression analysis of RNA-seq data. \emph{Genome Biology} 11, R25.
#'
#' Schwarz, G. (1978). Estimating the dimension of a model. \emph{The Annals of Statistics}
#' 6.
#'
#' Silva, A. et al. (2019). A multivariate Poisson-log normal mixture model
#' for clustering transcriptome sequencing data. \emph{BMC Bioinformatics} 20.
#' \href{https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2916-0}{Link}
#'
#' @export
#' @import coda
#' @import cluster
#' @importFrom capushe capushe
#' @importFrom edgeR calcNormFactors
#' @import MASS
#' @importFrom mclust unmap
#' @importFrom mclust map
#' @importFrom mvtnorm rmvnorm
#' @import Rcpp
#' @importFrom rstan sampling
#' @importFrom rstan stan_model
#' @import parallel
#' @import stats
#' @importFrom utils tail
#' @importFrom utils write.csv
#'
mplnMCMCNonParallel <- function(dataset,
membership = "none",
gmin = 1,
gmax = 2,
nChains = 3,
nIterations = 1000,
initMethod = "kmeans",
nInitIterations = 0,
normalize = "Yes") {
ptm <- proc.time()
# Performing checks
if (typeof(dataset) != "double" & typeof(dataset) != "integer") {
stop("Dataset type needs to be integer.")
}
if (any((dataset %% 1 == 0) == FALSE)) {
stop("Dataset should be a matrix of counts.")
}
if (is.matrix(dataset) != TRUE) {
stop("Dataset needs to be a matrix.")
}
if (any(colSums(dataset) <= 0)) {
stop("Column sums cannot be less than or equal to 0. Double check dataset.")
}
dimensionality <- ncol(dataset)
nObservations <- nrow(dataset)
if(is.numeric(gmin) != TRUE || is.numeric(gmax) != TRUE) {
stop("Class of gmin and gmin should be numeric.")
}
if (gmax < gmin) {
stop("gmax cannot be less than gmin.")
}
if (gmax > nObservations) {
stop("gmax cannot be larger than nrow(dataset).")
}
if (is.numeric(nChains) != TRUE) {
stop("nChains should be a positive integer of class numeric specifying
the number of Markov chains.")
}
if(nChains < 3) {
cat("nChains is less than 3. Note: recommended minimum number of
chains is 3.")
}
if(is.numeric(nIterations) != TRUE) {
stop("nIterations should be a positive integer of class numeric,
specifying the number of iterations for each Markov chain
(including warmup).")
}
if(is.numeric(nIterations) == TRUE && nIterations < 40) {
stop("nIterations argument should be greater than 40.")
}
if(all(membership != "none") && is.numeric(membership) != TRUE) {
stop("membership should be a numeric vector containing the
cluster membership. Otherwise, leave as 'none'.")
}
if(all(membership != "none") &&
all((diff(sort(unique(membership))) == 1) != TRUE) ) {
stop("Cluster memberships in the membership vector
are missing a cluster, e.g. 1, 3, 4, 5, 6 is missing cluster 2.")
}
if(all(membership != "none") && length(membership) != nObservations) {
stop("membership should be a numeric vector, where length(membership)
should equal the number of observations. Otherwise, leave as 'none'.")
}
# Remove rows with zeros, if present
removezeros <- removeZeroCounts(dataset = dataset,
membership = membership)
dataset <- removezeros$dataset
membership <- removezeros$membership
dimensionality <- ncol(dataset)
nObservations <- nrow(dataset)
if (is.character(initMethod) == TRUE) {
initMethodsUsed <- c("kmeans", "random", "medoids", "clara", "fanny")
if(all((initMethod == initMethodsUsed) == FALSE)) {
stop("initMethod should of class character, specifying
either: kmeans, random, medoids, clara, or fanny.")
}
} else if (is.character(initMethod) != TRUE) {
stop("initMethod should of class character, specifying
either: kmeans, random, medoids, clara, or fanny.")
}
if (is.numeric(nInitIterations) != TRUE) {
stop("nInitIterations should be positive integer or zero, specifying
the number of initialization runs to be considered.")
}
if (is.character(normalize) != TRUE) {
stop("normalize should be a string of class character specifying
if normalization should be performed.")
}
if (normalize != "Yes" && normalize != "No") {
stop("normalize should be a string indicating Yes or No, specifying
if normalization should be performed.")
}
# Calculating normalization factors
if(normalize == "Yes") {
normFactors <- log(as.vector(edgeR::calcNormFactors(as.matrix(dataset),
method = "TMM")))
} else if(normalize == "No") {
normFactors <- rep(0, dimensionality)
} else {
stop("normFactors should be 'Yes' or 'No' ")
}
# Construct a Stan model
# Stan model was developed with help of Sanjeena Dang
# <sanjeenadang@cunet.carleton.ca>
stancode <- 'data{int<lower=1> d; // Dimension of theta
int<lower=0> N; //Sample size
int y[N,d]; //Array of Y
vector[d] mu;
matrix[d,d] Sigma;
vector[d] normfactors;
}
parameters{
matrix[N,d] theta;
}
model{ //Change values for the priors as appropriate
for (n in 1:N){
theta[n,]~multi_normal(mu,Sigma);
}
for (n in 1:N){
for (k in 1:d){
real z;
z=exp(normfactors[k]+theta[n,k]);
y[n,k]~poisson(z);
}
}
}'
mod <- rstan::stan_model(model_code = stancode,
verbose = FALSE)
nonParallelRun <- list()
# Constructing non parallel code
for (gmodel in seq_along(1:(gmax - gmin + 1))) {
if(length(1:(gmax - gmin + 1)) == gmax) {
clustersize <- gmodel
} else if(length(1:(gmax - gmin + 1)) < gmax) {
clustersize <- seq(gmin, gmax, 1)[gmodel]
}
if(nInitIterations != 0) {
# cat("\nRunning initialization for G =", clustersize)
initializeruns <- initializationRun(gmodel = clustersize,
dataset = dataset,
initMethod = initMethod,
initIterations = nInitIterations,
nChain = nChains,
numbIterations = nIterations,
initialization = NA,
normalizefactors = normFactors,
mod = mod)
# cat("\nInitialization done for G =", clustersize)
# cat("\nRunning clustering for G =", clustersize)
nonParallelRun[[gmodel]] <- mplnCluster(dataset = dataset,
z = NA,
G = clustersize,
nChains = nChains,
nIterations = nIterations,
initialization = initializeruns,
normalizefac = normFactors,
mod = mod)
# cat("\nClustering done for G =", clustersize)
} else if(nInitIterations == 0) {
# cat("\nNo initialization done for G =", clustersize)
# cat("\nRunning clustering for G =", clustersize)
nonParallelRun[[gmodel]] <- mplnCluster(dataset = dataset,
z = mclust::unmap(stats::kmeans(log(dataset + 1 / 3),
clustersize)$cluster),
G = clustersize,
nChains = nChains,
nIterations = nIterations,
initialization = NA,
normalizefac = normFactors,
mod = mod)
# cat("\nClustering done for G =", clustersize)
}
}
BIC <- ICL <- AIC <- AIC3 <- Djump <- DDSE <- nParameters <- logLikelihood <- vector()
for(g in seq_along(1:(gmax - gmin + 1))) {
# save the final log-likelihood
logLikelihood[g] <- unlist(utils::tail(nonParallelRun[[g]]$logLikelihood, n = 1))
if(length(1:(gmax - gmin + 1)) == gmax) {
clustersize <- g
} else if(length(1:(gmax - gmin + 1)) < gmax) {
clustersize <- seq(gmin, gmax, 1)[g]
}
nParameters[g] <- calcParameters(numberG = clustersize, dimensionality = dimensionality)
if (g == max(1:(gmax - gmin + 1))) { # starting model selection
bic <- BICFunction(logLikelihood = logLikelihood,
nParameters = nParameters,
nObservations = nObservations,
clusterRunOutput = nonParallelRun,
gmin = gmin,
gmax = gmax,
parallel = FALSE)
icl <- ICLFunction(logLikelihood = logLikelihood,
nParameters = nParameters,
nObservations = nObservations,
gmin = gmin,
gmax = gmax,
clusterRunOutput = nonParallelRun,
parallel = FALSE)
aic <- AICFunction(logLikelihood = logLikelihood,
nParameters = nParameters,
clusterRunOutput = nonParallelRun,
gmin = gmin,
gmax = gmax,
parallel = FALSE)
aic3 <- AIC3Function(logLikelihood = logLikelihood,
nParameters = nParameters,
clusterRunOutput = nonParallelRun,
gmin = gmin,
gmax = gmax,
parallel = FALSE)
}
}
# for Djump and DDSE
if((gmax - gmin + 1) > 10 ) {
# adapted based on HTSCluster package 2.0.8 (25 Oct 2016)
Kchoice <- gmin:gmax # all the cluster/component sizes considered
message("Note: diagnostic plots for results corresponding
to model selection via slope heuristics (Djump and DDSE)
should be examined to ensure that sufficiently complex
models have been considered.")
mat <- cbind(Kchoice, nParameters / nObservations, nParameters / nObservations, - logLikelihood)
ResCapushe <- capushe ::capushe(mat, nObservations)
DDSEmodel<- ResCapushe@DDSE@model
Djumpmodel<- ResCapushe@Djump@model
final <- proc.time() - ptm
RESULTS <- list(dataset = dataset,
dimensionality = dimensionality,
normalizationFactors = normFactors,
gmin = gmin,
gmax = gmax,
initalizationMethod = initMethod,
allResults = nonParallelRun,
logLikelihood = logLikelihood,
numbParameters = nParameters,
trueLabels = membership,
ICLresults = icl,
BICresults = bic,
AICresults = aic,
AIC3results = aic3,
slopeHeuristics = ResCapushe,
DjumpModelSelected = ResCapushe@Djump@model,
DDSEModelSelected = ResCapushe@DDSE@model,
totalTime = final)
} else {# end of Djump and DDSE
final <- proc.time() - ptm
RESULTS <- list(dataset = dataset,
dimensionality = dimensionality,
normalizationFactors = normFactors,
gmin = gmin,
gmax = gmax,
initalizationMethod = initMethod,
allResults = nonParallelRun,
logLikelihood = logLikelihood,
numbParameters = nParameters,
trueLabels = membership,
ICLresults = icl,
BICresults = bic,
AICresults = aic,
AIC3results = aic3,
slopeHeuristics = "Not used",
totalTime = final)
}
class(RESULTS) <- "mplnMCMCNonParallel"
return(RESULTS)
}
# Log likelihood calculation
calcLikelihood <- function(z,
PI,
dataset,
muG,
G,
sigmaG,
thetaStan,
normFactors) {
nObservations <- nrow(dataset)
like <- matrix(NA, nrow = nObservations, ncol = G)
for (g in seq_along(1:G)) {
for (i in seq_along(1:nObservations)) {
x <- thetaStan[[g]][i, ]
dimensionality <- ncol(dataset)
like[i, g] <- (z[i, g] * (log(PI[g]) +
t(dataset[i, ]) %*% (x + normFactors) -
sum(exp(x + normFactors)) -
sum(lfactorial(dataset[i, ])) -
dimensionality / 2 * log(2 * pi) -
0.5 * log(det(sigmaG[((g - 1) *
dimensionality + 1):(g * dimensionality), ])) -
0.5 * t(x - muG[g, ]) %*%
solve(sigmaG[((g - 1) *
dimensionality + 1):(g * dimensionality), ]) %*%
(x - muG[g, ])))
}
}
loglike <- sum(rowSums(like))
return(loglike)
# Developed by Anjali Silva
}
# Parameter calculation
calcParameters <- function(numberG,
dimensionality) {
muPara <- dimensionality * numberG
sigmaPara <- (dimensionality * ((dimensionality + 1) / 2)) * numberG
# because if you have numberG-1 parameters,
# you can do 1-these to get the last one
piPara <- numberG - 1
# total parameters
paraTotal <- muPara + sigmaPara + piPara
return(paraTotal)
# Developed by Anjali Silva
}
# Zvalue calculation
calcZvalue <- function(thetaStan,
dataset,
G,
muG,
sigmaG,
PI,
normalizefactors) {
dimensionality <- ncol(dataset)
nObservations <- nrow(dataset)
forz <- matrix(NA,ncol = G, nrow = nObservations)
for (g in seq_along(1:G)) {
for (i in seq_along(1:nObservations)) {
x <- thetaStan[[g]][i, ]
# for zig calculation (the numerator part)
forz[i, g] <- PI[g] * exp(t(dataset[i, ]) %*%
(x + normalizefactors) -
sum(exp(x + normalizefactors)) -
sum(lfactorial(dataset[i, ])) -
dimensionality / 2 * log(2 * pi) - 0.5 *
log(det(sigmaG[((g - 1) *
dimensionality + 1):(g*dimensionality), ])) -
0.5 * t(x - muG[g, ]) %*%
solve(sigmaG[((g - 1) *
dimensionality + 1):(g * dimensionality), ]) %*%
(x - muG[g, ]))
}
# check which forz == 0 and rowSums(forz)==0 and which of these
# have both equalling to 0 (because 0/0 =NaN)
if (G == 1) {
errorpossible <- Reduce(intersect,
list(which(forz == 0),
which(rowSums(forz) == 0)))
zvalue <- forz / rowSums(forz)
zvalue[errorpossible, ] <- 1
} else {
zvalue <- forz / rowSums(forz)
}
}
return(zvalue)
# Developed by Anjali Silva
}
# Calling clustering
callingClustering <- function(data,
gmin,
gmax,
nChains,
nIterations = NA,
initMethod = NA,
nInitIterations = NA,
normFactors,
model) {
ptmInner = proc.time()
for (gmodel in seq_along(1:(gmax - gmin + 1))) {
if(length(1:(gmax - gmin + 1)) == gmax) {
clustersize <- gmodel
} else if(length(1:(gmax - gmin + 1)) < gmax) {
clustersize <- seq(gmin, gmax, 1)[gmodel]
}
if(nInitIterations != 0) {
# cat("\nRunning initialization for G =", clustersize)
initializeruns <- initializationRun(gmodel = clustersize,
dataset = data,
initMethod = initMethod,
initIterations = nInitIterations,
nChain = nChains,
numbIterations = nIterations,
initialization = NA,
normalizefactors = normFactors,
mod = model)
# cat("\nInitialization done for G =", clustersize)
# cat("\nRunning clustering for G =", clustersize)
allruns <- mplnCluster(dataset = data,
z = NA,
G = clustersize,
nChains = nChains,
nIterations = nIterations,
initialization = initializeruns,
normalizefac = normFactors,
mod = model)
# cat("\nClustering done for G =", clustersize)
} else if(nInitIterations == 0) {
# cat("\nNo initialization done for G =", clustersize)
# cat("\nRunning clustering for G =", clustersize)
allruns <- mplnCluster(dataset = data,
z = mclust::unmap(stats::kmeans(log(data + 1 / 3),
clustersize)$cluster),
G = clustersize,
nChains = nChains,
nIterations = nIterations,
initialization = NA,
normalizefac = normFactors,
mod = model)
# cat("\nClustering done for G =", clustersize)
}
}
finalInner <- proc.time() - ptmInner
RESULTS <- list(gmin = gmin,
gmax = gmax,
initalizationMethod = initMethod,
allResults = allruns,
totalTime = finalInner)
class(RESULTS) <- "mplnCallingClustering"
return(RESULTS)
# Developed by Anjali Silva
}
# Initialization
initializationRun <- function(gmodel,
dataset,
initMethod,
initIterations,
nChain,
numbIterations,
initialization = NA,
normalizefactors,
mod) {
z <- initRuns <- list()
logLinit <- vector()
nObservations <- nrow(dataset)
dimensionality <- ncol(dataset)
for(iterations in seq_along(1:initIterations)) {
# setting seed, to ensure if multiple iterations are selected by
# user, then each run will give a different result.
set.seed(iterations)
if (initMethod == "kmeans" | is.na(initMethod)) {
z[[iterations]] <- mclust::unmap(stats::kmeans(log(dataset + 1 / 3),
gmodel)$cluster)
} else if (initMethod == "random") {
z[[iterations]] <- randomInitfunction(gmodel = gmodel, nObservations = nObservations)
} else if (initMethod == "medoids") {
z[[iterations]] <- mclust::unmap(cluster::pam(log(dataset + 1 / 3),
k = gmodel)$cluster)
# if z generated has less columns than gmodel, then use random initialization
if(ncol(z[[iterations]]) < gmodel) {
z[[iterations]] <- randomInitfunction(gmodel = gmodel, nObservations = nObservations)
}
} else if (initMethod == "clara") {
z[[iterations]] <- mclust::unmap(cluster::clara(log(dataset + 1 / 3),
k = gmodel)$cluster)
# if z generated has less columns than gmodel, then use random initialization
if(ncol(z[[iterations]]) < gmodel) {
z[[iterations]] <- randomInitfunction(gmodel = gmodel, nObservations = nObservations)
}
} else if (initMethod == "fanny") {
z[[iterations]] <- mclust::unmap(cluster::fanny(log(dataset + 1 / 3),
k = gmodel)$cluster)
# if z generated has less columns than gmodel, then use random initialization
if(ncol(z[[iterations]]) < gmodel) {
z[[iterations]] <- randomInitfunction(gmodel = gmodel, nObservations = nObservations)
}
}
initRuns[[iterations]] <- mplnCluster(dataset = dataset,
z = z[[iterations]],
G = gmodel,
nChains = nChain,
nIterations = numbIterations,
initialization = "init",
normalizefac = normalizefactors,
mod = mod)
logLinit[iterations] <-
unlist(utils::tail((initRuns[[iterations]]$logLikelihood), n = 1))
}
initialization <- initRuns[[which(logLinit == max(logLinit, na.rm = TRUE))[1]]]
return(initialization)
# Developed by Anjali Silva
}
# Clustering function
mplnCluster <- function(dataset,
z,
G,
nChains,
nIterations,
initialization,
normalizefac,
mod) {
dimensionality <- ncol(dataset)
nObservations <- nrow(dataset)
# for convergence calculation
normMuOuter <- normSigmaOuter <- vector()
medianMuOuter <- medianSigmaOuter <- list()
# for saving mu and sigma values
muAllOuter <- sigmaAllOuter <- list()
obs <- PI <- logL <- vector()
nIterationsOuter <- 2 # the starting value of interation for outer loop
convOuter <- 0
if (all(is.na(initialization)) == TRUE || all(initialization == "init")) {
# mean for both t and normal distribution
muAllOuter[[1]] <- muG <- matrix(log(mean(dataset)),
ncol = dimensionality,
nrow = G)
# sig for sigma of t distribtuion
sigmaAllOuter[[1]] <- sigmaG <- do.call("rbind",
rep(list(cov(log(dataset + 1)) *
dimensionality), G))
} else {
muAllOuter[[1]] <- muG <- initialization$finalmu
sigmaAllOuter[[1]] <- sigmaG <- initialization$finalsigma
z <- initialization$probaPost
}
while(! convOuter) {
for(g in seq_along(1:G)) {
obs[g] <- sum(z[, g]) # number of observations in each group
PI[g] <- obs[g] / nObservations # obtain probability of each group
}
thetaStan <- ExpTheta2 <- list()
rstanResults <- stanRun(model = mod,
gmin = 1,
gmax = G,
dataset = dataset,
muAllOuter = muAllOuter,
nIterationsOuter = nIterationsOuter,
sigmaAllOuter = sigmaAllOuter,
numbIterations = nIterations,
nChain = nChains,
normalizefacs = normalizefac)
fit <- rstanResults$fitrstan
nIterations <- rstanResults$numbIterations
for (g in seq_along(1:G)) {
tt <- as.matrix(fit[[g]])
thetaStan[[g]] <- matrix(NA, nrow = nObservations, ncol = dimensionality)
ExpTheta2[[g]] <- list()
for (i in seq_along(1:nObservations)) {
zz <- c(1:(dimensionality - 1)) * nObservations + i
thetaMatrix <- tt[, c(i, zz)]
thetaStan[[g]][i, ] <- colMeans(thetaMatrix)
ExpTheta2[[g]][[i]] <- z[i, g] * t(tt[, c(i, zz)]) %*% tt[, c(i, zz)]/
((0.5 * nIterations) * nChains)
}
muG[g, ] <- colSums(z[, g] * thetaStan[[g]]) / sum(z[, g])
sigmaG[((g - 1) * dimensionality + 1):(g * dimensionality), ] <-
Reduce("+", ExpTheta2[[g]]) / sum(z[, g]) - muG[g, ] %*% t(muG[g, ])
}
muAllOuter[[nIterationsOuter]] <- muG
sigmaAllOuter[[nIterationsOuter]] <- sigmaG
logL[nIterationsOuter] <- calcLikelihood(z = z,
PI = PI,
dataset = dataset,
muG = muAllOuter[[nIterationsOuter]],
G = G,
sigmaG = sigmaAllOuter[[nIterationsOuter]],
thetaStan = thetaStan,
normFactors = normalizefac)
# convergence of outer loop
normMuOuter[nIterationsOuter] <- norm((muAllOuter[[nIterationsOuter]] -
muAllOuter[[nIterationsOuter - 1]]),
type = "F")
normSigmaOuter[nIterationsOuter] <- norm(sigmaAllOuter[[nIterationsOuter]] -
sigmaAllOuter[[nIterationsOuter - 1]],
type = "F")
medianMuOuter[[nIterationsOuter]] <- median(normMuOuter, na.rm = TRUE)
medianSigmaOuter[[nIterationsOuter]] <- median(normSigmaOuter, na.rm = TRUE)
# par(mfrow = c(1, 2))
# plot(normMuOuter, main = paste0("Norm outer mean, G=", G),
# type = "l", ylab = "median(normMuOuter)", xlab = "iterations")
# plot(normSigmaOuter, main = paste0("Norm outer sigma, G = ", G),
# type = "l", ylab = "median(normSigmaOuter)", xlab = "iterations")
thresholdOuter <- 2
if(nIterationsOuter > (thresholdOuter + 1)) {
# cat("\nMedian difference of mean and sigma in outer loop respectively ",
# c(abs(medianMuOuter[[nIterationsOuter - thresholdOuter]]-
# medianMuOuter[[nIterationsOuter]])))
if( ( (abs(medianMuOuter[[nIterationsOuter - thresholdOuter]] -
medianMuOuter[[nIterationsOuter]]) < 5) &&
(abs(medianSigmaOuter[[nIterationsOuter - thresholdOuter]] -
medianSigmaOuter[[nIterationsOuter]]) < 5) ) || nIterationsOuter > 100) {
# cat("\nConvergence of mu and sigma at outer loop
# iteration ", nIterationsOuter)
# take out absolute value
programclust <- vector()
programclust <- mclust::map(z)
# checking for spurious clusters and getting rid of them
# keep <- as.numeric(names(which(table(programclust) > 5)))
# if ((length(keep) != length(unique(programclust))) &&
# (length(keep) != 0)) {
# z <- as.matrix(z[ , keep])
# z <- z / rowSums(z)
# programclust <- map(z)
#}
# checking for empty clusters
J <- 1:ncol(z)
K <- as.logical(match(J, sort(unique(programclust)), nomatch = 0))
if(length(J[! K]) > 0) { # J[!K] tells which are empty clusters
z <- z[, - J[! K]]
programclust <- mclust::map(z)
}
convOuter <- 1
}
}
# if running for initialization, need to stop after 10 iterations
if(nIterationsOuter == 10 && all(is.na(initialization) != TRUE)) {
if(all(initialization == "init")) {
programclust <- vector()
programclust <- mclust::map(z)
convOuter <- 1
}
}
if(convOuter != 1) { # only update until convergence, not after
z <- calcZvalue(thetaStan = thetaStan,
dataset = dataset,
G = G,
muG = muG,
sigmaG = sigmaG,
PI = PI,
normalizefactors = normalizefac)
nIterationsOuter <- nIterationsOuter + 1 # updating outer loop iteration
nIterations <- nIterations + 10
}
} # end of outer loop
results <- list(finalmu = muAllOuter[[nIterationsOuter]] +
matrix(rep(normalizefac,
nrow(muAllOuter[[nIterationsOuter]])),
byrow = TRUE, ncol =
ncol(muAllOuter[[nIterationsOuter]])),
finalsigma = sigmaAllOuter[[nIterationsOuter]],
allmu = lapply(muAllOuter, function(x)
(x + matrix(rep(normalizefac,
nrow(muAllOuter[[nIterationsOuter]])),
byrow = TRUE,
ncol = ncol(muAllOuter[[nIterationsOuter]])))),
allsigma = sigmaAllOuter,
clusterlabels = programclust,
iterations = nIterationsOuter,
proportion = PI,
logLikelihood = logL,
probaPost = z,
stanresults = fit)
class(results) <- "MPLNcluster"
return(results)
# Developed by Anjali Silva
}
# Calculate and remove rows with zeros
#' @author {Anjali Silva, \email{anjali@alumni.uoguelph.ca}}
removeZeroCounts <- function(dataset,
membership = "none") {
zeroSUMrows <- which(rowSums(dataset) == 0)
if (length(zeroSUMrows) > 0 && is.numeric(membership) == TRUE) {
dataset <- dataset[- zeroSUMrows, ]
membership <- membership[- zeroSUMrows]
} else if(length(zeroSUMrows) > 0 && all(membership == "none")) {
dataset <- dataset[- zeroSUMrows, ]
membership <- "none"
}
RESULTS <- list(dataset = dataset,
membership = membership)
class(RESULTS) <- "MPLNZerosRemoved"
return(RESULTS)
}
# Stan sampling
stanRun <- function(model,
gmin,
gmax,
dataset,
muAllOuter,
nIterationsOuter,
sigmaAllOuter,
numbIterations,
nChain = nChain,
normalizefacs) {
fitrstan <- list()
dimensionality <- ncol(dataset)
for (g in seq_along(gmin:gmax)) {
data1 <- list(d = ncol(dataset),
N = nrow(dataset),
y = dataset,
mu = muAllOuter[[nIterationsOuter - 1]][g, ],
Sigma = sigmaAllOuter[[nIterationsOuter - 1]][((g - 1) *
dimensionality + 1):(g*dimensionality), ],
normfactors = as.vector(normalizefacs))
stanproceed <- 0
try <- 1
while (! stanproceed) {
# cat("\nRstan generating sample at outer iteration",
# nIterationsOuter, "for g: ",g , "try: ", try)
# cat("\nNumber of iterations is", numbIterations, "\n")
fitrstan[[g]] <- rstan::sampling(object = model,
data = data1,
iter = numbIterations,
chains = nChain,
verbose = FALSE,
refresh = -1)
# Checking convergence
if (all(rstan::summary(fitrstan[[g]])$summary[, "Rhat"] < 1.1) == TRUE &&
all(rstan::summary(fitrstan[[g]])$summary[, "n_eff"] > 100) == TRUE) {
stanproceed <- 1
} else if(all(rstan::summary(fitrstan[[g]])$summary[, "Rhat"] < 1.1) != TRUE ||
all(rstan::summary(fitrstan[[g]])$summary[, "n_eff"] > 100) != TRUE) {
if(try == 10) { # Stop after 10 attempts
stanproceed <- 1
}
numbIterations <- numbIterations + 100
try <- try + 1
}
}
}
results <- list(fitrstan = fitrstan,
numbIterations = numbIterations)
class(results) <- "MPLNRStan"
return(results)
# Developed by Anjali Silva
}
#' Clustering Using MPLN Via Variational-EM
#'
#' Performs clustering using mixtures of multivariate Poisson-log
#' normal (MPLN) distribution with variational expectation-maximization
#' (EM) for parameter estimation. Model selection is performed using
#' AIC, AIC3, BIC and ICL. If more than 10 models are considered, Djump
#' and DDSE is also applied for model selection. No internal
#' parallelization, thus code is run in serial.
#'
#' @param dataset A dataset of class matrix and type integer such that
#' rows correspond to observations and columns correspond to variables.
#' The dataset have dimensions n x d, where n is the total number of
#' observations and d is the dimensionality. If rowSums are zero, these
#' rows will be removed prior to cluster analysis.
#' @param membership A numeric vector of length nrow(dataset) containing the
#' cluster membership of each observation. If not available,
#' leave as "none".
#' @param gmin A positive integer specifying the minimum number of components
#' to be considered in the clustering run.
#' @param gmax A positive integer, >= gmin, specifying the maximum number of
#' components to be considered in the clustering run.
#' @param initMethod An algorithm for initialization. Current options are
#' "kmeans", "random", "medoids", "clara", or "fanny". Default is "kmeans".
#' @param nInitIterations A positive integer or zero, specifying the number
#' of initialization runs to be performed. This many runs, each with 10
#' iterations, will be performed via MPLNClust and values from the run with
#' highest log-likelihood will be used as initialization values. Default is 2.
#' @param normalize A string with options "Yes" or "No" specifying
#' if normalization should be performed. Currently, normalization factors
#' are calculated using TMM method of edgeR package. Default is "Yes".
#'
#' @return Returns an S3 object of class mplnVariational with results.
#' \itemize{
#' \item dataset - The input dataset on which clustering is performed.
#' \item dimensionality - Dimensionality of the input dataset.
#' \item normalizationFactors - A vector of normalization factors used
#' for input dataset.
#' \item gmin - Minimum number of components/clusters considered in the clustering
#' run.
#' \item gmax - Maximum number of components/clusters considered in the clustering
#' run.
#' \item initalizationMethod - Method used for initialization.
#' \item allResults - A list with all results.
#' \item logLikelihood - A vector with value of final log-likelihoods for
#' each component/cluster size.
#' \item numbParameters - A vector with number of parameters for each
#' component/cluster size.
#' \item trueLabels - The vector of true labels, if provided by user.
#' \item ICLresults - A list with all ICL model selection results.
#' \item BICresults - A list with all BIC model selection results.
#' \item AICresults - A list with all AIC model selection results.
#' \item AIC3results - A list with all AIC3 model selection results.
#' \item slopeHeuristics - If more than 10 models are considered, slope heuristic
#' results as obtained via capushe::capushe().
#' \item DjumpModelSelected - If more than 10 models are considered, slope heuristic
#' results as obtained via capushe::capushe().
#' \item DDSEModelSelected - If more than 10 models are considered, slope heuristic
#' results as obtained via capushe::capushe().
#' \item totalTime - Total time used for clustering and model selection.
#' }
#'
#' @examples
#' # Example 1
#' trueMu1 <- c(6.5, 6, 6, 6, 6, 6)
#' trueMu2 <- c(2, 2.5, 2, 2, 2, 2)
#'
#' trueSigma1 <- diag(6) * 2
#' trueSigma2 <- diag(6)
#'
#' # Generating simulated data
#' sampleData <- MPLNClust::mplnDataGenerator(nObservations = 1000,
#' dimensionality = 6,
#' mixingProportions = c(0.79, 0.21),
#' mu = rbind(trueMu1, trueMu2),
#' sigma = rbind(trueSigma1, trueSigma2),
#' produceImage = "No")
#'
#' # Clustering
#' mplnResults <- MPLNClust::mplnVariational(
#' dataset = sampleData$dataset,
#' membership = sampleData$trueMembership,
#' gmin = 1,
#' gmax = 2,
#' initMethod = "kmeans",
#' nInitIterations = 2,
#' normalize = "Yes")
#' names(mplnResults)
#'
#' # Example 2
#' # Use an external dataset
#' if (requireNamespace("MBCluster.Seq", quietly = TRUE)) {
#' library(MBCluster.Seq)
#' data("Count")
#' dim(Count)
#'
#' # Clustering subset of data
#' mplnResultsMBClust <- MPLNClust::mplnVariational(
#' dataset = as.matrix(Count[c(1:100), ]),
#' membership = "none",
#' gmin = 1,
#' gmax = 5,
#' initMethod = "kmeans",
#' nInitIterations = 2,
#' normalize = "Yes")
#' names(mplnResultsMBClust)
#'
#'}
#'
#' @author {Anjali Silva, \email{anjali@alumni.uoguelph.ca}, Sanjeena Dang,
#' \email{sanjeenadang@cunet.carleton.ca}. }
#'
#' @references
#' Aitchison, J. and C. H. Ho (1989). The multivariate Poisson-log normal distribution.
#' \emph{Biometrika} 76.
#'
#' Akaike, H. (1973). Information theory and an extension of the maximum likelihood
#' principle. In \emph{Second International Symposium on Information Theory}, New York, NY,
#' USA, pp. 267–281. Springer Verlag.
#'
#' Arlot, S., Brault, V., Baudry, J., Maugis, C., and Michel, B. (2016).
#' capushe: CAlibrating Penalities Using Slope HEuristics. R package version 1.1.1.
#'
#' Biernacki, C., G. Celeux, and G. Govaert (2000). Assessing a mixture model for
#' clustering with the integrated classification likelihood. \emph{IEEE Transactions
#' on Pattern Analysis and Machine Intelligence} 22.
#'
#' Bozdogan, H. (1994). Mixture-model cluster analysis using model selection criteria
#' and a new informational measure of complexity. In \emph{Proceedings of the First US/Japan
#' Conference on the Frontiers of Statistical Modeling: An Informational Approach:
#' Volume 2 Multivariate Statistical Modeling}, pp. 69–113. Dordrecht: Springer Netherlands.
#'
#' Robinson, M.D., and Oshlack, A. (2010). A scaling normalization method for differential
#' expression analysis of RNA-seq data. \emph{Genome Biology} 11, R25.
#'
#' Schwarz, G. (1978). Estimating the dimension of a model. \emph{The Annals of Statistics}
#' 6.
#'
#' Silva, A. et al. (2019). A multivariate Poisson-log normal mixture model
#' for clustering transcriptome sequencing data. \emph{BMC Bioinformatics} 20.
#' \href{https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2916-0}{Link}
#'
#' Subedi, S., and R. Browne (2020). A parsimonious family of multivariate Poisson-lognormal
#' distributions for clustering multivariate count data. arXiv preprint arXiv:2004.06857.
#' \href{https://arxiv.org/pdf/2004.06857.pdf}{Link}
#'
#' @export
#' @importFrom capushe capushe
#' @importFrom edgeR calcNormFactors
#' @importFrom mclust unmap
#' @importFrom mclust map
#' @import stats
#' @import cluster
#'
mplnVariational <- function(dataset,
membership = "none",
gmin,
gmax,
initMethod = "kmeans",
nInitIterations = 2,
normalize = "Yes") {
initialTime <- proc.time()
# Performing checks
if (typeof(dataset) != "double" & typeof(dataset) != "integer") {
stop("Dataset type needs to be integer.")
}
if (any((dataset %% 1 == 0) == FALSE)) {
stop("Dataset should be a matrix of counts.")
}
if (is.matrix(dataset) != TRUE) {
stop("Dataset needs to be a matrix.")
}
if (any(colSums(dataset) <= 0)) {
stop("Column sums cannot be less than or equal to 0. Double check dataset.")
}
dimensionality <- ncol(dataset)
nObservations <- nrow(dataset)
if(is.numeric(gmin) != TRUE || is.numeric(gmax) != TRUE) {
stop("Class of gmin and gmin should be numeric.")
}
if (gmax < gmin) {
stop("gmax cannot be less than gmin.")
}
if (gmax > nObservations) {
stop("gmax cannot be larger than nrow(dataset).")
}
if(all(membership != "none") && is.numeric(membership) != TRUE) {
stop("membership should be a numeric vector containing the
cluster membership. Otherwise, leave as 'none'.")
}
if(all(membership != "none") &&
all((diff(sort(unique(membership))) == 1) != TRUE) ) {
stop("Cluster memberships in the membership vector
are missing a cluster, e.g. 1, 3, 4, 5, 6 is missing cluster 2.")
}
if(all(membership != "none") && length(membership) != nObservations) {
stop("membership should be a numeric vector, where length(membership)
should equal the number of observations. Otherwise, leave as 'none'.")
}
# Remove rows with only zeros, if present
removezeros <- removeZeroCounts(dataset = dataset, membership = membership)
dataset <- removezeros$dataset
membership <- removezeros$membership
dimensionality <- ncol(dataset)
nObservations <- nrow(dataset)
if (is.character(initMethod) == TRUE) {
initMethodsUsed <- c("kmeans", "random", "medoids", "clara", "fanny")
if(all((initMethod == initMethodsUsed) == FALSE)) {
stop("initMethod should of class character, specifying
either: kmeans, random, medoids, clara, or fanny.")
}
} else if (is.character(initMethod) != TRUE) {
stop("initMethod should of class character, specifying
either: kmeans, random, medoids, clara, or fanny.")
}
if (is.numeric(nInitIterations) != TRUE) {
stop("nInitIterations should be positive integer or zero, specifying
the number of initialization runs to be considered.")
}
if (is.character(normalize) != TRUE) {
stop("normalize should be a string of class character specifying
if normalization should be performed.")
}
if ((normalize != "Yes") && (normalize != "No")) {
stop("normalize should be a string indicating Yes or No, specifying
if normalization should be performed.")
}
if(normalize == "Yes") {
normFactors <- as.vector(edgeR::calcNormFactors(as.matrix(dataset),
method = "TMM"))
} else if(normalize == "No") {
normFactors <- rep(1, dimensionality)
} else{
stop("normalize should be 'Yes' or 'No'.")
}
clusterResults <- list() # to save cluster output
for (gmodel in seq_along(1:(gmax - gmin + 1))) {
if(length(1:(gmax - gmin + 1)) == gmax) {
clustersize <- gmodel
} else if(length(1:(gmax - gmin + 1)) < gmax) {
clustersize <- seq(gmin, gmax, 1)[gmodel]
}
cat("\n Running for g =", clustersize)
clusterResults[[gmodel]] <- varMPLNClustering(dataset = dataset,
initMethod = initMethod,
nInitIterations = nInitIterations,
G = clustersize,
maxIterations = 1000,
normFactors = normFactors)
# cat("\n Legnth of clusterResults is ", length(clusterResults))
}
names(clusterResults) <- paste0(rep("G=", length(seq(gmin, gmax, 1))), seq(gmin, gmax, 1))
BIC <- ICL <- AIC <- AIC3 <- Djump <- DDSE <- nParameters <- logLikelihood <- vector()
for(g in seq_along(1:(gmax - gmin + 1))) {
# save the final log-likelihood
if(length(1:(gmax - gmin + 1)) == gmax) {
clustersize <- g
} else if(length(1:(gmax - gmin + 1)) < gmax) {
clustersize <- seq(gmin, gmax, 1)[g]
}
if(all(is.na(clusterResults[[g]])) != TRUE) {
logLikelihood[g] <- unlist(utils::tail(clusterResults[[g]]$logLikelihood, n = 1))
} else if(all(is.na(clusterResults[[g]])) == TRUE){
logLikelihood[g] <- NA
}
if(all(is.na(clusterResults[[g]])) != TRUE) {
nParameters[g] <- calcParameters(numberG = clustersize, dimensionality = dimensionality)
} else if(all(is.na(clusterResults[[g]])) == TRUE){
nParameters[g] <- NA
}
if (g == max(1:(gmax - gmin + 1))) { # starting model selection
bic <- BICFunction(logLikelihood = logLikelihood,
nParameters = nParameters,
nObservations = nObservations,
clusterRunOutput = clusterResults,
gmin = gmin,
gmax = gmax,
parallel = FALSE)
# cat("\n Done BIC")
icl <- ICLFunction(logLikelihood = logLikelihood,
nParameters = nParameters,
nObservations = nObservations,
gmin = gmin,
gmax = gmax,
clusterRunOutput = clusterResults,
parallel = FALSE)
# cat("\n Done ICL")
aic <- AICFunction(logLikelihood = logLikelihood,
nParameters = nParameters,
clusterRunOutput = clusterResults,
gmin = gmin,
gmax = gmax,
parallel = FALSE)
# cat("\n Done AIC")
aic3 <- AIC3Function(logLikelihood = logLikelihood,
nParameters = nParameters,
clusterRunOutput = clusterResults,
gmin = gmin,
gmax = gmax,
parallel = FALSE)
# cat("\n Done AIC3")
}
}
# for Djump and DDSE
if((gmax - gmin + 1) > 10 ) {
# adapted based on HTSCluster package 2.0.8 (25 Oct 2016)
Kchoice <- gmin:gmax # all the cluster/component sizes considered
message("Note: diagnostic plots for results corresponding
to model selection via slope heuristics (Djump and DDSE)
should be examined to ensure that sufficiently complex
models have been considered.")
mat <- cbind(Kchoice, nParameters / nObservations, nParameters / nObservations, - logLikelihood)
ResCapushe <- capushe::capushe(mat, nObservations)
DDSEmodel<- ResCapushe@DDSE@model
Djumpmodel<- ResCapushe@Djump@model
finalTime <- proc.time() - initialTime
RESULTS <- list(dataset = dataset,
dimensionality = dimensionality,
normalizationFactors = normFactors,
gmin = gmin,
gmax = gmax,
initalizationMethod = initMethod,
allResults = clusterResults,
logLikelihood = logLikelihood,
numbParameters = nParameters,
trueLabels = membership,
ICLresults = icl,
BICresults = bic,
AICresults = aic,
AIC3results = aic3,
slopeHeuristics = ResCapushe,
DjumpModelSelected = ResCapushe@Djump@model,
DDSEModelSelected = ResCapushe@DDSE@model,
totalTime = finalTime)
} else {
finalTime <- proc.time() - initialTime
RESULTS <- list(dataset = dataset,
dimensionality = dimensionality,
normalizationFactors = normFactors,
gmin = gmin,
gmax = gmax,
initalizationMethod = initMethod,
allResults = clusterResults,
logLikelihood = logLikelihood,
numbParameters = nParameters,
trueLabels = membership,
ICLresults = icl,
BICresults = bic,
AICresults = aic,
AIC3results = aic3,
slopeHeuristics = "Not used",
totalTime = finalTime)
}
class(RESULTS) <- "mplnVariational"
return(RESULTS)
}
varMPLNClustering <- function(dataset,
initMethod,
nInitIterations,
G,
maxIterations,
normFactors) {
nObservations <- nrow(dataset)
dimensionality <- ncol(dataset)
# If no intialization is requested by user
if (nInitIterations == 0) {
# basic initialization performed for parameters
mu <- sigma <- isigma <- list()
m <- S <- list() # variational parameters
###Other intermediate items initialized
Sk <- array(0, c(dimensionality, dimensionality, G))
mPreviousValue <- GX <- dGX <- zSValue <- list()
zValue <- mclust::unmap(stats::kmeans(log(dataset + 1 / 6),
centers = G, nstart = 100)$cluster )
piG <- colSums(zValue) / nObservations
for (g in 1:G) {
obs <- which(zValue[ , g] == 1)
if(length(obs) > 1) {
mu[[g]] <- colMeans(log(dataset[obs, ] + 1 / 6)) # starting value for mu
# starting value for sample covariance matrix
sigma[[g]] <- cov(log(dataset[obs, ] + 1 / 6))
# starting value for inverse of sample covariance matrix
# If the inverse is not present for covariance matrix, handle that
isigma[[g]] <- tryCatch(solve(sigma[[g]]), error = function(err) NA)
if(all(is.na(isigma[[g]]))) {
isigma[[g]] <- diag(ncol(dataset[obs, ])) # if error with inverse
}
} else if(length(obs) == 1) {
mu[[g]] <- log(dataset[obs, ] + 1 / 6) # starting value for mu
# starting value for sample covariance matrix
sigma[[g]] <- diag(ncol(dataset))
# starting value for inverse of sample covariance matrix
# If the inverse is not present for covariance matrix, handle that
isigma[[g]] <- tryCatch(solve(sigma[[g]]), error = function(err) NA)
if(all(is.na(isigma[[g]]))) {
isigma[[g]] <- diag(ncol(dataset)) # if error with inverse
}
}
}
for (g in 1:G) {
mPreviousValue[[g]] <- m[[g]] <- log(dataset + 1 / 6)
# mPreviousValue = starting value for m
S[[g]] <- list() # starting value for S
for (i in 1:nObservations) {
S[[g]][[i]] <- diag(dimensionality) * 0.000000001
}
}
} else if (nInitIterations != 0) {
# if initialization is requested by user
initializationResults <- varMPLNInitialization(dataset = dataset,
numbG = G,
initMethod = initMethod,
nInitIterations = nInitIterations,
normFactors = normFactors)
mu <- initializationResults$mu
sigma <- initializationResults$sigma
isigma <- initializationResults$isigma
mPreviousValue <- m <- initializationResults$m # variational parameters
S <- initializationResults$S # variational parameters
zValue <- initializationResults$zValue
piG <- colSums(zValue) / nObservations
###Other intermediate items initialized
Sk <- array(0, c(dimensionality, dimensionality, G))
GX <- dGX <- zSValue <- list()
}
# Start clustering
itOuter <- 1
aloglik <- logLikelihood <- NULL
checks <- aloglik[c(1, 2, 3)] <- 0
while (checks == 0) {
for (g in 1:G) {
GX[[g]] <- list()
dGX[[g]] <- list()
zSValue[[g]] <- list()
for (i in 1:nObservations) {
dGX[[g]][[i]] <- diag(exp((log(normFactors) + mPreviousValue[[g]][i, ]) +
0.5 * diag(S[[g]][[i]])), dimensionality) + isigma[[g]]
S[[g]][[i]] <- solve(dGX[[g]][[i]]) # update S
# update S; will be used for updating sample covariance matrix
# S[[g]][[i]] <- tryCatch(solve(dGX[[g]][[i]]), error = function(err) NA)
# if(all(is.na(S[[g]][[i]]))) {
# S[[g]][[i]] <- diag(ncol(dGX[[g]][[i]])) # if error with inverse
# }
zSValue[[g]][[i]] <- zValue[i, g] * S[[g]][[i]] # will be used for updating sample covariance matrix
GX[[g]][[i]] <- dataset[i, ] - exp(mPreviousValue[[g]][i, ] +
log(normFactors) + 0.5 * diag(S[[g]][[i]])) -
(isigma[[g]]) %*% (mPreviousValue[[g]][i, ] - mu[[g]])
m[[g]][i , ] <- mPreviousValue[[g]][i , ] + S[[g]][[i]] %*% GX[[g]][[i]] # update m
}
mPreviousValue[[g]] <- m[[g]]
# Updating mu
mu[[g]] <- colSums(zValue[ , g] * m[[g]]) / sum(zValue[ , g])
# Updating sample covariance
muMatrix <- matrix(rep(mu[[g]], nObservations), nrow = nObservations, byrow = TRUE)
res <- m[[g]] - muMatrix
# Calculate weighted covariance
temp <- stats::cov.wt(res, wt = zValue[ , g], center = FALSE, method = "ML")
Sk[ , , g] <- temp$cov
sigma[[g]] <- temp$cov + Reduce("+", zSValue[[g]]) / sum(zValue[ , g])
isigma[[g]] <- solve(sigma[[g]])
}
piG <- colSums(zValue) / nObservations
# Matrix containing normaization factors
normFactorsAsMatrix <- matrix(rep(normFactors, nObservations), nrow = nObservations, byrow = TRUE)
# A useful function
zvalueFuncTerm <- function(x, y = isigma[[g]]) {
sum(diag(x %*% y))
}
# # Zvalue calculation
forz <- matrix(NA, ncol = G, nrow = nObservations)
for (g in 1:G) {
# exp(m_igj + 0.5 S_ig,jj)
two <- rowSums(exp(m[[g]] + log(normFactorsAsMatrix) +
0.5 * matrix(unlist(lapply(S[[g]], diag)), ncol = dimensionality, byrow = TRUE)))
five <- 0.5 * unlist(lapply(S[[g]], zvalueFuncTerm)) # trace(isigma x S_ig)
six <- 0.5 * log(unlist(lapply(S[[g]], det))) # 0.5 * log|S_ig|
# For zig calculation (the numerator part)
forz[ , g] <- piG[g] *
exp(rowSums(m[[g]] * dataset) - two - rowSums(lfactorial(dataset)) +
rowSums(log(normFactorsAsMatrix) * dataset) -
0.5 * mahalanobis(m[[g]], center = mu[[g]], cov = sigma[[g]]) -
five + six - 0.5 * log(det(sigma[[g]])) - dimensionality / 2)
}
# Calculate zValue value
# check which forz == 0 and rowSums(forz)==0 and which of these
# have both equalling to 0 (because 0/0 =NaN)
if (G == 1) {
errorpossible <- Reduce(intersect,
list(which(forz == 0),
which(rowSums(forz) == 0)))
forz[errorpossible] <- 1e-100
zvalue <- forz / rowSums(forz)
} else {
# check for error, if rowsums are zero
rowSumsZero <- which(rowSums(forz) == 0)
if(length(rowSumsZero) > 0) {
forz[rowSumsZero, ] <- mclust::unmap(stats::kmeans(log(dataset + 1 / 6),
centers = G,
nstart = 100)$cluster)[rowSumsZero, ]
zvalue <- forz / rowSums(forz)
} else {
zvalue <- forz / rowSums(forz)
}
}
# if z generated has less clusters than numbG, then use random initialization
# checkClusters <- 0
# if(length(unique(mclust::map(zvalue))) < G) {
# while(! checkClusters) {
# zvalue <- randomInitfunction(gmodel = G, nObservations = nObservations)
# if(length(unique(mclust::map(zvalue))) == G) {
# checkClusters <- 1
# # cat("\n checkClusters", checkClusters)
# }
# }
# }
# if z generated has less clusters than numbG, then NA
if(length(unique(mclust::map(zvalue))) < G) {
cat("\n length(unique(mclust::map(zvalue))) < G for G =", G)
checks <- 2
} else { # if z generated clusters == numbG
# Calculate log-likelihood
logLikelihood[itOuter] <- sum(log(rowSums(forz)))
# Stopping criterion
if (itOuter > 2) {
if ((logLikelihood[itOuter - 1] - logLikelihood[itOuter - 2]) == 0) {
checks <-1
} else {
# Aitken stopping criterion
termAitkens <- (logLikelihood[itOuter]- logLikelihood[itOuter - 1]) /
(logLikelihood[itOuter - 1] - logLikelihood[itOuter - 2])
term2Aitkens <- (1 / (1 - termAitkens) * (logLikelihood[itOuter] -
logLikelihood[itOuter - 1]))
aloglik[itOuter] <- logLikelihood[itOuter - 1] + term2Aitkens
if (abs(aloglik[itOuter] - logLikelihood[itOuter - 1]) < 0.001) {
# If this critera, as per Böhning et al., 1994 is achieved
# convergence is achieved
checks <- 1
} else {
checks <- checks}
}
}
# print(itOuter)
itOuter <- itOuter + 1
if (itOuter == maxIterations) {
checks <- 1
}
}
}
if(checks == 1) {
# Naming parameters
names(mu) <- names(sigma) <- paste0(rep("G=", G), 1:G)
# Saving results for output
Results <- list(piG = piG,
mu = mu,
sigma = sigma,
probaPost = zvalue,
clusterlabels = mclust::map(zvalue),
logLikelihood = logLikelihood)
class(Results) <- "varMPLNClustering"
return(Results)
} else if(checks == 2) {
# if z generated has less clusters than numbG, then NA
return(NA)
}
}
varMPLNInitialization <- function(dataset,
numbG,
initMethod,
nInitIterations,
normFactors) {
dimensionality <- ncol(dataset)
nObservations <- nrow(dataset)
zValue <- initRuns <- list()
logLinit <- vector()
for(iterations in seq_along(1:nInitIterations)) {
# setting seed, to ensure if multiple iterations are selected by
# user, then each run will give a different result.
set.seed(iterations)
if (initMethod == "kmeans" | is.na(initMethod)) {
zValue[[iterations]] <- mclust::unmap(stats::kmeans(log(dataset + 1 / 6),
centers = numbG, nstart = 100)$cluster )
# if z generated has less clusters than numbG, then use random initialization
checkClusters <- 0
if(length(unique(mclust::map(zValue[[iterations]]))) < numbG) {
while(! checkClusters) {
zValue[[iterations]] <- randomInitfunction(gmodel = numbG, nObservations = nObservations)
if(length(unique(mclust::map(zValue[[iterations]]))) == numbG) {
checkClusters <- 1
# cat("\n checkClusters", checkClusters)
}
}
}
} else if (initMethod == "random") {
zValue[[iterations]] <- randomInitfunction(gmodel = numbG, nObservations = nObservations)
} else if (initMethod == "medoids") {
zValue[[iterations]] <- mclust::unmap(cluster::pam(log(dataset + 1 / 3),
k = numbG, cluster.only = TRUE))
# if z generated has less clusters than numbG, then use random initialization
checkClusters <- 0
if(length(unique(mclust::map(zValue[[iterations]]))) < numbG) {
while(! checkClusters) {
zValue[[iterations]] <- randomInitfunction(gmodel = numbG, nObservations = nObservations)
if(length(unique(mclust::map(zValue[[iterations]]))) == numbG) {
checkClusters <- 1
# cat("\n checkClusters", checkClusters)
}
}
}
} else if (initMethod == "clara") {
zValue[[iterations]] <- mclust::unmap(cluster::clara(log(dataset + 1 / 3),
k = numbG)$cluster)
# if z generated has less clusters than numbG, then use random initialization
checkClusters <- 0
if(length(unique(mclust::map(zValue[[iterations]]))) < numbG) {
while(! checkClusters) {
zValue[[iterations]] <- randomInitfunction(gmodel = numbG, nObservations = nObservations)
if(length(unique(mclust::map(zValue[[iterations]]))) == numbG) {
checkClusters <- 1
# cat("\n checkClusters", checkClusters)
}
}
}
} else if (initMethod == "fanny") {
zValue[[iterations]] <- mclust::unmap(cluster::fanny(log(dataset + 1 / 3),
k = numbG, memb.exp = numbG, cluster.only = TRUE)$clustering)
# if z generated has less clusters than numbG, then use random initialization
checkClusters <- 0
if(length(unique(mclust::map(zValue[[iterations]]))) < numbG) {
while(! checkClusters) {
zValue[[iterations]] <- randomInitfunction(gmodel = numbG, nObservations = nObservations)
if(length(unique(mclust::map(zValue[[iterations]]))) == numbG) {
checkClusters <- 1
# cat("\n checkClusters", checkClusters)
}
}
}
}
initRuns[[iterations]] <- varMPLNInitClustering(dataset = dataset,
G = numbG,
zValue = zValue[[iterations]],
normFactors = normFactors,
maxIterations = 10)
# maxIterations set to 10 for initialization
logLinit[iterations] <-
unlist(utils::tail((initRuns[[iterations]]$logLikelihood), n = 1))
}
# select the initialization run with highest loglikelihood
initializationResults <- initRuns[[which(logLinit == max(logLinit, na.rm = TRUE))[1]]]
class(initializationResults) <- "varMPLNInitialization"
return(initializationResults)
}
varMPLNInitClustering <- function(dataset,
G,
zValue,
normFactors,
maxIterations = 10) {
# if running for initialization, need to stop after 10 iterations
dimensionality <- ncol(dataset)
nObservations <- nrow(dataset)
# basic initialization performed for parameters
mu <- sigma <- isigma <- list()
m <- S <- list() # variational parameters
###Other intermediate items initialized
Sk <- array(0, c(dimensionality, dimensionality, G))
mPreviousValue <- GX <- dGX <- zSValue <- list()
piG <- colSums(zValue) / nObservations
for (g in 1:G) {
obs <- which(zValue[ , g] == 1)
if(length(obs) > 1) {
mu[[g]] <- colMeans(log(dataset[obs, ] + 1 / 6)) # starting value for mu
# starting value for sample covariance matrix
sigma[[g]] <- cov(log(dataset[obs, ] + 1 / 6))
# starting value for inverse of sample covariance matrix
# If the inverse is not present for covariance matrix, handle that
isigma[[g]] <- tryCatch(solve(sigma[[g]]), error = function(err) NA)
if(all(is.na(isigma[[g]]))) {
isigma[[g]] <- diag(ncol(dataset[obs, ])) # if error with inverse
}
} else if(length(obs) == 1) {
mu[[g]] <- log(dataset[obs, ] + 1 / 6) # starting value for mu
# starting value for sample covariance matrix
sigma[[g]] <- diag(ncol(dataset))
# starting value for inverse of sample covariance matrix
# If the inverse is not present for covariance matrix, handle that
isigma[[g]] <- tryCatch(solve(sigma[[g]]), error = function(err) NA)
if(all(is.na(isigma[[g]]))) {
isigma[[g]] <- diag(ncol(dataset)) # if error with inverse
}
}
}
for (g in 1:G) {
# mPreviousValue = starting value for m
mPreviousValue[[g]] <- m[[g]] <- log(dataset + 1 / 6)
S[[g]] <- list() # starting value for S
for (i in 1:nObservations) {
S[[g]][[i]] <- diag(dimensionality) * 0.000000001
}
}
# Start clustering
itOuter <- 1
aloglik <- logLikelihood <- NULL
checks <- aloglik[c(1, 2, 3)] <- 0
while (checks == 0) {
for (g in 1:G) {
GX[[g]] <- list()
dGX[[g]] <- list()
zSValue[[g]] <- list()
for (i in 1:nObservations) {
dGX[[g]][[i]] <- diag(exp((log(normFactors) + mPreviousValue[[g]][i, ]) +
0.5 * diag(S[[g]][[i]])), dimensionality) + isigma[[g]]
# update S; will be used for updating sample covariance matrix
S[[g]][[i]] <- tryCatch(solve(dGX[[g]][[i]]), error = function(err) NA)
if(all(is.na(S[[g]][[i]]))) {
S[[g]][[i]] <- diag(ncol(dGX[[g]][[i]])) # if error with inverse
}
zSValue[[g]][[i]] <- zValue[i, g] * S[[g]][[i]]
GX[[g]][[i]] <- dataset[i, ] - exp(mPreviousValue[[g]][i, ] +
log(normFactors) + 0.5 * diag(S[[g]][[i]])) -
(isigma[[g]]) %*% (mPreviousValue[[g]][i, ] - mu[[g]])
m[[g]][i , ] <- mPreviousValue[[g]][i , ] + S[[g]][[i]] %*% GX[[g]][[i]] # update m
}
mPreviousValue[[g]] <- m[[g]]
# Updating mu
mu[[g]] <- colSums(zValue[ , g] * m[[g]]) / sum(zValue[ , g])
# Updating sample covariance
muMatrix <- matrix(rep(mu[[g]], nObservations), nrow = nObservations, byrow = TRUE)
res <- m[[g]] - muMatrix
# Calculate weighted covariance
temp <- stats::cov.wt(res, wt = zValue[ , g], center = FALSE, method = "ML")
Sk[ , , g] <- temp$cov
sigma[[g]] <- temp$cov + Reduce("+", zSValue[[g]]) / sum(zValue[ , g])
isigma[[g]] <- solve(sigma[[g]])
}
piG <- colSums(zValue) / nObservations
# Matrix containing normaization factors
normFactorsAsMatrix <- matrix(rep(normFactors, nObservations), nrow = nObservations, byrow = TRUE)
# A useful function
zvalueFuncTerm <- function(x, y = isigma[[g]]) {
sum(diag(x %*% y))
}
# # Zvalue calculation
forz <- matrix(NA, ncol = G, nrow = nObservations)
for (g in 1:G) {
# exp(m_igj + 0.5 S_ig,jj)
two <- rowSums(exp(m[[g]] + log(normFactorsAsMatrix) +
0.5 * matrix(unlist(lapply(S[[g]], diag)), ncol = dimensionality, byrow = TRUE)))
five <- 0.5 * unlist(lapply(S[[g]], zvalueFuncTerm)) # trace(isigma x S_ig)
six <- 0.5 * log(unlist(lapply(S[[g]], det))) # 0.5 * log|S_ig|
# For zig calculation (the numerator part)
forz[ , g] <- piG[g] *
exp(rowSums(m[[g]] * dataset) - two - rowSums(lfactorial(dataset)) +
rowSums(log(normFactorsAsMatrix) * dataset) -
0.5 * mahalanobis(m[[g]], center = mu[[g]], cov = sigma[[g]]) -
five + six - 0.5 * log(det(sigma[[g]])) - dimensionality / 2)
}
# Calculate zValue value
# check which forz == 0 and rowSums(forz)==0 and which of these
# have both equalling to 0 (because 0/0 =NaN)
if (G == 1) {
errorpossible <- Reduce(intersect,
list(which(forz == 0),
which(rowSums(forz) == 0)))
forz[errorpossible] <- 1e-100
zvalue <- forz / rowSums(forz)
} else {
# check for error, if rowsums are zero
rowSumsZero <- which(rowSums(forz) == 0)
if(length(rowSumsZero) > 0) {
forz[rowSumsZero, ] <- mclust::unmap(stats::kmeans(log(dataset + 1 / 6),
centers = G,
nstart = 100)$cluster)[rowSumsZero, ]
zvalue <- forz / rowSums(forz)
} else {
zvalue <- forz / rowSums(forz)
}
# if z generated has less clusters than numbG, then use random initialization
checkClusters <- 0
if(length(unique(mclust::map(zvalue))) < G) {
while(! checkClusters) {
zvalue <- randomInitfunction(gmodel = G, nObservations = nObservations)
if(length(unique(mclust::map(zvalue))) == G) {
checkClusters <- 1
# cat("\n checkClusters", checkClusters)
}
}
}
}
# Calculate log-likelihood
logLikelihood[itOuter] <- sum(log(rowSums(forz)))
# Stopping criterion
if (itOuter > 2) {
if ((logLikelihood[itOuter - 1] - logLikelihood[itOuter - 2]) == 0) {
checks <-1
} else {
# Aitken stopping criterion
termAitkens <- (logLikelihood[itOuter]- logLikelihood[itOuter - 1]) /
(logLikelihood[itOuter - 1] - logLikelihood[itOuter - 2])
term2Aitkens <- (1 / (1 - termAitkens) * (logLikelihood[itOuter] - logLikelihood[itOuter - 1]))
aloglik[itOuter] <- logLikelihood[itOuter - 1] + term2Aitkens
if (abs(aloglik[itOuter] - logLikelihood[itOuter - 1]) < 0.001) {
# If this critera, as per Böhning et al., 1994 is achieved
# convergence is achieved
checks <- 1
} else {
checks <- checks}
}
}
# print(itOuter)
itOuter <- itOuter + 1
if (itOuter == maxIterations) {
checks <- 1
}
}
# Naming parameters
names(mu) <- names(sigma) <- paste0(rep("G=", G), 1:G)
# Saving results for output
Results <- list(piG = piG,
mu = mu,
sigma = sigma,
isigma = isigma,
zValue = zvalue,
m = m,
S = S,
clusterlabels = mclust::map(zvalue),
logLikelihood = logLikelihood)
class(Results) <- "varMPLNInitClustering"
return(Results)
}
# Internal function for random initialization
randomInitfunction <- function(gmodel, nObservations) {
if(gmodel == 1) { # generating z if g = 1
zvalueRandomInit <- as.matrix(rep.int(1, times = nObservations),
ncol = gmodel,
nrow = nObservations)
} else { # generating z if g > 1
zValueConv <- 0
while(! zValueConv) {
# ensure that dimension of z is same as G (i.e.,
# if one column contains all 0s, then generate z again)
zvalueRandomInit <- t(stats::rmultinom(nObservations, size = 1,
prob = rep(1 / gmodel, gmodel)))
if(length(which(colSums(zvalueRandomInit) > 0)) == gmodel) {
zValueConv <- 1
}
}
}
return(zvalueRandomInit)
}
#' Model Selection Via Akaike Information Criterion
#'
#' Performs model selection using Akaike Information Criterion (AIC).
#' Formula: - 2 * logLikelihood + 2 * nParameters.
#'
#' @param logLikelihood A vector with value of final log-likelihoods for
#' each cluster size.
#' @param nParameters A vector with number of parameters for each
#' cluster size.
#' @param clusterRunOutput Output from mplnVariational, mplnMCMCParallel, or
#' mplnMCMCNonParallel, if available. Default value is NA. If provided,
#' the vector of cluster labels obtained by mclust::map() for best model
#' will be provided in the output.
#' @param gmin A positive integer specifying the minimum number of components
#' to be considered in the clustering run.
#' @param gmax A positive integer, >gmin, specifying the maximum number of
#' components to be considered in the clustering run.
#' @param parallel TRUE or FALSE indicating if MPLNClust::mplnMCMCParallel
#' has been used.
#'
#' @return Returns an S3 object of class MPLN with results.
#' \itemize{
#' \item allAICvalues - A vector of AIC values for each cluster size.
#' \item AICmodelselected - An integer specifying model selected by AIC.
#' \item AICmodelSelectedLabels - A vector of integers specifying cluster labels
#' for the model selected. Only provided if user input clusterRunOutput.
#' \item AICMessage - A character vector indicating if spurious clusters are
#' detected. Otherwise, NA.
#' }
#'
#' @examples
#' trueMu1 <- c(6.5, 6, 6, 6, 6, 6)
#' trueMu2 <- c(2, 2.5, 2, 2, 2, 2)
#'
#' trueSigma1 <- diag(6) * 2
#' trueSigma2 <- diag(6)
#'
#' # Generating simulated data
#' sampleData <- MPLNClust::mplnDataGenerator(nObservations = 100,
#' dimensionality = 6,
#' mixingProportions = c(0.79, 0.21),
#' mu = rbind(trueMu1, trueMu2),
#' sigma = rbind(trueSigma1, trueSigma2),
#' produceImage = "No")
#'
#' # Clustering
#' mplnResults <- MPLNClust::mplnVariational(dataset = sampleData$dataset,
#' membership = sampleData$trueMembership,
#' gmin = 1,
#' gmax = 2,
#' initMethod = "kmeans",
#' nInitIterations = 2,
#' normalize = "Yes")
#'
#' # Model selection
#' AICmodel <- MPLNClust::AICFunction(logLikelihood = mplnResults$logLikelihood,
#' nParameters = mplnResults$numbParameters,
#' clusterRunOutput = mplnResults$allResults,
#' gmin = mplnResults$gmin,
#' gmax = mplnResults$gmax,
#' parallel = FALSE)
#'
#' @author {Anjali Silva, \email{anjali@alumni.uoguelph.ca}}
#'
#' @references
#'
#' Akaike, H. (1973). Information theory and an extension of the maximum likelihood
#' principle. In \emph{Second International Symposium on Information Theory}, New York, NY,
#' USA, pp. 267–281. Springer Verlag.
#'
#' @export
#'
AICFunction <- function(logLikelihood,
nParameters,
clusterRunOutput = NA,
gmin,
gmax,
parallel = FALSE) {
# Performing checks
if(is.numeric(gmin) != TRUE || is.numeric(gmax) != TRUE) {
stop("Class of gmin and gmin should be numeric.")
}
if (gmax < gmin) {
stop("gmax cannot be less than gmin.")
}
if(is.numeric(nParameters) != TRUE) {
stop("nParameters should be a vector of integers indicating
number of parameters for each cluster size.")
}
if(is.numeric(logLikelihood) != TRUE) {
stop("logLikelihood should be a vector of numeric values.")
}
if(length(logLikelihood) != (gmax - gmin + 1)) {
stop("logLikelihood should be a vector of length (gmax - gmin + 1).")
}
if(is.logical(parallel) != TRUE) {
stop("Should be logical, TRUE or FALSE indicating if
MPLNClust::mplnMCMCParallel has been used.")
}
AIC <- (- 2 * logLikelihood) + (2 * nParameters)
AICmodel <- seq(gmin, gmax, 1)[grep(min(AIC, na.rm = TRUE), AIC)]
AICMessage <- NA # For spurious clusters
# Obtain cluster labels for best model if clusterRunOutput is provided
if(all(is.na(clusterRunOutput)) == FALSE) {
if(isTRUE(parallel) == "FALSE") {
# If non parallel MCMC-EM or Variational EM run
AICmodelLabels <- clusterRunOutput[[grep(min(AIC, na.rm = TRUE), AIC)]]$clusterlabels
} else {
# If parallel MCMC-EM run
AICmodelLabels <- clusterRunOutput[[grep(min(AIC, na.rm = TRUE), AIC)]]$allResults$clusterlabels
}
# Check for spurious clusters, only possible if cluster labels provided
if (max(AICmodelLabels) != AICmodel) {
AICmodel <- max(AICmodelLabels)
AICMessage <- "Spurious or empty cluster resulted."
}
} else {
AICmodelLabels <- "clusterRunOutput not provided"
}
AICresults <- list(allAICvalues = AIC,
AICmodelselected = AICmodel,
AICmodelSelectedLabels = AICmodelLabels,
AICMessage = AICMessage)
class(AICresults) <- "AIC"
return(AICresults)
}
#' Model Selection Via Akaike Information Criterion by Bozdogan (1994)
#'
#' Performs model selection using Akaike Information Criterion by
#' Bozdogan (1994), called AIC3. Formula: - 2 * logLikelihood + 3 * nParameters.
#'
#' @param logLikelihood A vector with value of final log-likelihoods for
#' each cluster size.
#' @param nParameters A vector with number of parameters for each
#' cluster size.
#' @param clusterRunOutput Output from mplnVariational, mplnMCMCParallel, or
#' mplnMCMCNonParallel, if available. Default value is NA. If provided,
#' the vector of cluster labels obtained by mclust::map() for best model
#' will be provided in the output.
#' @param gmin A positive integer specifying the minimum number of components
#' to be considered in the clustering run.
#' @param gmax A positive integer, >gmin, specifying the maximum number of
#' components to be considered in the clustering run.
#' @param parallel TRUE or FALSE indicating if MPLNClust::mplnMCMCParallel
#' has been used.
#'
#' @return Returns an S3 object of class MPLN with results.
#' \itemize{
#' \item allAIC3values - A vector of AIC3 values for each cluster size.
#' \item AIC3modelselected - An integer specifying model selected by AIC3.
#' \item AIC3modelSelectedLabels - A vector of integers specifying cluster labels
#' for the model selected. Only provided if user input clusterRunOutput.
#' \item AIC3Message - A character vector indicating if spurious clusters are
#' detected. Otherwise, NA.
#' }
#'
#' @examples
#' trueMu1 <- c(6.5, 6, 6, 6, 6, 6)
#' trueMu2 <- c(2, 2.5, 2, 2, 2, 2)
#'
#' trueSigma1 <- diag(6) * 2
#' trueSigma2 <- diag(6)
#'
#' # Generating simulated data
#' sampleData <- MPLNClust::mplnDataGenerator(nObservations = 100,
#' dimensionality = 6,
#' mixingProportions = c(0.79, 0.21),
#' mu = rbind(trueMu1, trueMu2),
#' sigma = rbind(trueSigma1, trueSigma2),
#' produceImage = "No")
#'
#' # Clustering
#' mplnResults <- MPLNClust::mplnVariational(dataset = sampleData$dataset,
#' membership = sampleData$trueMembership,
#' gmin = 1,
#' gmax = 2,
#' initMethod = "kmeans",
#' nInitIterations = 2,
#' normalize = "Yes")
#'
#' # Model selection
#' AIC3model <- MPLNClust::AIC3Function(logLikelihood = mplnResults$logLikelihood,
#' nParameters = mplnResults$numbParameters,
#' clusterRunOutput = mplnResults$allResults,
#' gmin = mplnResults$gmin,
#' gmax = mplnResults$gmax,
#' parallel = FALSE)
#'
#' @author {Anjali Silva, \email{anjali@alumni.uoguelph.ca}}
#'
#' @references
#'
#' Akaike, H. (1973). Information theory and an extension of the maximum likelihood
#' principle. In \emph{Second International Symposium on Information Theory}, New York, NY,
#' USA, pp. 267–281. Springer Verlag.
#'
#' #' Bozdogan, H. (1994). Mixture-model cluster analysis using model selection criteria
#' and a new informational measure of complexity. In \emph{Proceedings of the First US/Japan
#' Conference on the Frontiers of Statistical Modeling: An Informational Approach:
#' Volume 2 Multivariate Statistical Modeling}, pp. 69–113. Dordrecht: Springer Netherlands.
#'
#' @export
#'
AIC3Function <- function(logLikelihood,
nParameters,
clusterRunOutput = NA,
gmin,
gmax,
parallel = FALSE) {
# Performing checks
if(is.numeric(gmin) != TRUE || is.numeric(gmax) != TRUE) {
stop("Class of gmin and gmin should be numeric.")
}
if (gmax < gmin) {
stop("gmax cannot be less than gmin.")
}
if(is.numeric(nParameters) != TRUE) {
stop("nParameters should be a vector of integers indicating
number of parameters for each cluster size.")
}
if(is.numeric(logLikelihood) != TRUE) {
stop("logLikelihood should be a vector of numeric values.")
}
if(length(logLikelihood) != (gmax - gmin + 1)) {
stop("logLikelihood should be a vector of length (gmax - gmin + 1).")
}
if(is.logical(parallel) != TRUE) {
stop("Should be logical, TRUE or FALSE indicating if
MPLNClust::mplnMCMCParallel has been used.")
}
AIC3 <- (- 2 * logLikelihood) + (3 * nParameters)
AIC3model <- seq(gmin, gmax, 1)[grep(min(AIC3,na.rm = TRUE), AIC3)]
AIC3Message <- NA # For spurious clusters
# Obtain cluster labels for best model if clusterRunOutput is provided
if(all(is.na(clusterRunOutput)) == FALSE) {
if(isTRUE(parallel) == "FALSE") {
# If non parallel MCMC-EM or Variational EM run
AIC3modelLabels <- clusterRunOutput[[grep(min(AIC3,na.rm = TRUE), AIC3)]]$clusterlabels
} else {
# If parallel MCMC-EM run
AIC3modelLabels <- clusterRunOutput[[grep(min(AIC3,na.rm = TRUE), AIC3)]]$allResults$clusterlabels
}
# Check for spurious clusters, only possible if cluster labels provided
if (max(AIC3modelLabels) != AIC3model) {
AIC3model <- max(AIC3modelLabels)
AIC3Message <- "Spurious or empty cluster resulted."
}
} else {
AIC3modelLabels <- "clusterRunOutput not provided"
}
AIC3results <- list(allAIC3values = AIC3,
AIC3modelselected = AIC3model,
AIC3modelSelectedLabels = AIC3modelLabels,
AIC3Message = AIC3Message)
class(AIC3results) <- "AIC3"
return(AIC3results)
}
#' Model Selection Via Bayesian Information Criterion
#'
#' Performs model selection using Bayesian Information Criterion (BIC) by
#' Schwarz (1978). Formula: - 2 * logLikelihood + (nParameters * log(nObservations)).
#'
#' @param logLikelihood A vector with value of final log-likelihoods for
#' each cluster size.
#' @param nParameters A vector with number of parameters for each
#' cluster size.
#' @param nObservations A positive integer specifying the number of observations
#' in the dataset analyzed.
#' @param clusterRunOutput Output from mplnVariational, mplnMCMCParallel, or
#' mplnMCMCNonParallel, if available. Default value is NA. If provided,
#' the vector of cluster labels obtained by mclust::map() for best model
#' will be provided in the output.
#' @param gmin A positive integer specifying the minimum number of components
#' to be considered in the clustering run.
#' @param gmax A positive integer, >gmin, specifying the maximum number of
#' components to be considered in the clustering run.
#' @param parallel TRUE or FALSE indicating if MPLNClust::mplnMCMCParallel
#' has been used.
#'
#' @return Returns an S3 object of class MPLN with results.
#' \itemize{
#' \item allBICvalues - A vector of BIC values for each cluster size.
#' \item BICmodelselected - An integer specifying model selected by BIC
#' \item BICmodelSelectedLabels - A vector of integers specifying cluster labels
#' for the model selected. Only provided if user input clusterRunOutput.
#' \item BICMessage - A character vector indicating if spurious clusters are
#' detected. Otherwise, NA.
#' }
#'
#' @examples
#' trueMu1 <- c(6.5, 6, 6, 6, 6, 6)
#' trueMu2 <- c(2, 2.5, 2, 2, 2, 2)
#'
#' trueSigma1 <- diag(6) * 2
#' trueSigma2 <- diag(6)
#'
#' # Generating simulated data
#' sampleData <- MPLNClust::mplnDataGenerator(nObservations = 100,
#' dimensionality = 6,
#' mixingProportions = c(0.79, 0.21),
#' mu = rbind(trueMu1, trueMu2),
#' sigma = rbind(trueSigma1, trueSigma2),
#' produceImage = "No")
#'
#' # Clustering
#' mplnResults <- MPLNClust::mplnVariational(dataset = sampleData$dataset,
#' membership = sampleData$trueMembership,
#' gmin = 1,
#' gmax = 2,
#' initMethod = "kmeans",
#' nInitIterations = 2,
#' normalize = "Yes")
#'
#' # Model selection
#' BICmodel <- MPLNClust::BICFunction(logLikelihood = mplnResults$logLikelihood,
#' nParameters = mplnResults$numbParameters,
#' nObservations = nrow(mplnResults$dataset),
#' clusterRunOutput = mplnResults$allResults,
#' gmin = mplnResults$gmin,
#' gmax = mplnResults$gmax,
#' parallel = FALSE)
#'
#' @author {Anjali Silva, \email{anjali@alumni.uoguelph.ca}}
#'
#' @references
#'
#' Schwarz, G. (1978). Estimating the dimension of a model. \emph{The Annals of Statistics}
#' 6.
#'
#' @export
#'
BICFunction <- function(logLikelihood,
nParameters,
nObservations,
clusterRunOutput = NA,
gmin,
gmax,
parallel = FALSE) {
# Performing checks
if(is.numeric(gmin) != TRUE || is.numeric(gmax) != TRUE) {
stop("Class of gmin and gmin should be numeric.")
}
if (gmax < gmin) {
stop("gmax cannot be less than gmin.")
}
if(is.numeric(nParameters) != TRUE) {
stop("nParameters should be a vector of integers indicating
number of parameters for each cluster size.")
}
if(is.numeric(logLikelihood) != TRUE) {
stop("logLikelihood should be a vector of numeric values.")
}
if(length(logLikelihood) != (gmax - gmin + 1)) {
stop("logLikelihood should be a vector of length (gmax - gmin + 1).")
}
if(is.logical(parallel) != TRUE) {
stop("Should be logical, TRUE or FALSE indicating if
MPLNClust::mplnMCMCParallel has been used.")
}
BIC <- (- 2 * logLikelihood) + (nParameters * log(nObservations))
BICmodel <- seq(gmin, gmax, 1)[grep(min(BIC, na.rm = TRUE), BIC)]
BICMessage <- NA # For spurious clusters
# Obtain cluster labels for best model if clusterRunOutput is provided
if(all(is.na(clusterRunOutput)) == FALSE) {
if(isTRUE(parallel) == "FALSE") {
# If non parallel MCMC-EM or Variational EM run
BICmodelLabels <- clusterRunOutput[[grep(min(BIC, na.rm = TRUE),
BIC)]]$clusterlabels
} else {
# If parallel MCMC-EM run
BICmodelLabels <- clusterRunOutput[[grep(min(BIC, na.rm = TRUE),
BIC)]]$allResults$clusterlabels
}
# Check for spurious clusters, only possible if cluster labels provided
if (max(BICmodelLabels) != BICmodel) {
BICmodel <- max(BICmodelLabels)
BICMessage <- "Spurious or empty cluster resulted."
}
} else {
BICmodelLabels <- "clusterRunOutput not provided"
}
BICresults <- list(allBICvalues = BIC,
BICmodelselected = BICmodel,
BICmodelSelectedLabels = BICmodelLabels,
BICMessage = BICMessage)
class(BICresults) <- "BIC"
return(BICresults)
}
#' Model Selection Via Integrated Completed Likelihood
#'
#' Performs model selection using integrated completed likelihood (ICL) by
#' Biernacki et al., (2000).
#'
#' @param logLikelihood A vector with value of final log-likelihoods for
#' each cluster size.
#' @param nParameters A vector with number of parameters for each
#' cluster size.
#' @param nObservations A positive integer specifying the number of observations
#' in the dataset analyzed.
#' @param clusterRunOutput Output from MPLNClust::mplnVariational,
#' MPLNClust::mplnMCMCParallel, or MPLNClust::mplnMCMCNonParallel functions.
#' Either clusterRunOutput or probaPost must be provided.
#' @param probaPost A list that is length (gmax - gmin + 1) containing posterior
#' probability at each g, for g = gmin:gmax. This argument is useful if
#' clustering output have been generated non-serially, e.g., g = 1:5 and
#' g = 6:10 rather than g = 1:10. Either clusterRunOutput or probaPost
#' must be provided.
#' @param gmin A positive integer specifying the minimum number of components
#' to be considered in the clustering run.
#' @param gmax A positive integer, > gmin, specifying the maximum number of
#' components to be considered in the clustering run.
#' @param parallel TRUE or FALSE indicating if MPLNClust::mplnMCMCParallel
#' has been used.
#'
#' @return Returns an S3 object of class MPLN with results.
#' \itemize{
#' \item allICLvalues - A vector of ICL values for each cluster size.
#' \item ICLmodelselected - An integer specifying model selected by ICL.
#' \item ICLmodelSelectedLabels - A vector of integers specifying cluster labels
#' for the model selected. Only provided if user input clusterRunOutput.
#' \item ICLMessage - A character vector indicating if spurious clusters are
#' detected. Otherwise, NA.
#' }
#'
#' @examples
#' trueMu1 <- c(6.5, 6, 6, 6, 6, 6)
#' trueMu2 <- c(2, 2.5, 2, 2, 2, 2)
#'
#' trueSigma1 <- diag(6) * 2
#' trueSigma2 <- diag(6)
#'
#' # Generating simluated data
#' sampleData <- MPLNClust::mplnDataGenerator(nObservations = 100,
#' dimensionality = 6,
#' mixingProportions = c(0.79, 0.21),
#' mu = rbind(trueMu1, trueMu2),
#' sigma = rbind(trueSigma1, trueSigma2),
#' produceImage = "No")
#'
#' # Clustering
#' mplnResults <- MPLNClust::mplnVariational(dataset = sampleData$dataset,
#' membership = sampleData$trueMembership,
#' gmin = 1,
#' gmax = 2,
#' initMethod = "kmeans",
#' nInitIterations = 2,
#' normalize = "Yes")
#'
#' # Model selection
#' ICLmodel <- MPLNClust::ICLFunction(logLikelihood = mplnResults$logLikelihood,
#' nParameters = mplnResults$numbParameters,
#' nObservations = nrow(mplnResults$dataset),
#' clusterRunOutput = mplnResults$allResults,
#' gmin = mplnResults$gmin,
#' gmax = mplnResults$gmax,
#' parallel = FALSE)
#'
#' @author {Anjali Silva, \email{anjali@alumni.uoguelph.ca}}
#'
#' @references
#'
#' Biernacki, C., G. Celeux, and G. Govaert (2000). Assessing a mixture model for
#' clustering with the integrated classification likelihood. \emph{IEEE Transactions
#' on Pattern Analysis and Machine Intelligence} 22.
#'
#' @export
#' @importFrom mclust unmap
#'
ICLFunction <- function(logLikelihood,
nParameters,
nObservations,
clusterRunOutput = NA,
probaPost = NA,
gmax,
gmin,
parallel = FALSE) {
# Performing checks
if (gmax < gmin) {
stop("gmax cannot be less than gmin.")
}
if(is.numeric(nParameters) != TRUE) {
stop("nParameters should be a vector of integers indicating
number of parameters for each cluster size.")
}
if(is.numeric(logLikelihood) != TRUE) {
stop("logLikelihood should be a vector of numeric values.")
}
if(length(logLikelihood) != (gmax - gmin + 1)) {
stop("logLikelihood should be a vector of length (gmax - gmin + 1).")
}
if(is.logical(parallel) != TRUE) {
stop("Should be logical, TRUE or FALSE indicating if
MPLNClust::mplnMCMCParallel has been used.")
}
if(all(is.na(probaPost)) != TRUE) {
if(length(probaPost) != (gmax - gmin + 1)) {
stop("probaPost must be a list of length (gmax - gmin + 1)
containing posterior probability at each g.")
}
}
if(all(is.na(clusterRunOutput)) == TRUE && all(is.na(probaPost)) == TRUE) {
stop("Either clusterRunOutput or probaPost must be provided.")
}
BIC <- (- 2 * logLikelihood) + (nParameters * log(nObservations))
ICL <- vector()
# if clusterRunOutput is provided by user
if(all(is.na(clusterRunOutput)) != TRUE) {
for (g in 1:(gmax - gmin + 1)) {
if(isTRUE(parallel) == "FALSE") {
# If non parallel run
# check if clusterRunOutput[[g]] is NA
if(all(is.na(clusterRunOutput[[g]])) != TRUE) {
z <- clusterRunOutput[[g]]$probaPost
mapz <- mclust::unmap(clusterRunOutput[[g]]$clusterlabels)
}
} else {
# If parallel run
z <- clusterRunOutput[[g]]$allResults$probaPost
mapz <- mclust::unmap(clusterRunOutput[[g]]$allResults$clusterlabels)
}
forICL <- function(g){sum(log(z[which(mapz[, g] == 1), g]))}
if(all(is.na(clusterRunOutput[[g]])) != TRUE) {
ICL[g] <- BIC[g] + sum(sapply(1:ncol(mapz), forICL))
} else if(all(is.na(clusterRunOutput[[g]])) == TRUE) {
ICL[g] <- NA
}
}
ICLmodel <- seq(gmin, gmax, 1)[grep(min(ICL, na.rm = TRUE), ICL)]
if(isTRUE(parallel) == "FALSE") {
# If non parallel MCMC-EM or Variational EM run
ICLmodelLabels <- clusterRunOutput[[grep(min(ICL, na.rm = TRUE),
ICL)]]$clusterlabels
} else {
# If parallel MCMC-EM
ICLmodelLabels <- clusterRunOutput[[grep(min(ICL, na.rm = TRUE),
ICL)]]$allResults$clusterlabels
}
}
# if probaPost is provided by user
if(all(is.na(probaPost)) != TRUE) {
for (g in 1:(gmax - gmin + 1)) {
if(all(is.na(probaPost[[g]])) != TRUE) {
z <- probaPost[[g]]
mapz <- mclust::unmap(mclust::map(probaPost[[g]]))
forICL <- function(g){sum(log(z[which(mapz[, g] == 1), g]))}
ICL[g] <- BIC[g] + sum(sapply(1:ncol(mapz), forICL))
} else if(all(is.na(probaPost[[g]])) == TRUE) {
ICL[g] <- NA
}
}
ICLmodel <- seq(gmin, gmax, 1)[grep(min(ICL, na.rm = TRUE), ICL)]
ICLmodelLabels <- mclust::map(probaPost[[grep(min(ICL, na.rm = TRUE), ICL)]])
}
# Check for spurious clusters
ICLMessage <- NA
if (max(ICLmodelLabels) != ICLmodel) {
ICLmodel <- max(ICLmodelLabels)
ICLMessage <- "Spurious or empty cluster resulted."
}
ICLresults <- list(allICLvalues = ICL,
ICLmodelselected = ICLmodel,
ICLmodelSelectedLabels = ICLmodelLabels,
ICLMessage = ICLMessage)
class(ICLresults) <- "ICL"
return(ICLresults)
}
#' Generating Data Using Mixtures of MPLN
#'
#' This function simulates data from a mixture of MPLN model.
#'
#' @param nObservations A positive integer indicating the number of observations for
#' the dataset.
#' @param dimensionality A positive integer indicating the dimensionality for the
#' dataset.
#' @param mixingProportions A numeric vector that length equal to the number of total
#' components, indicating the proportion of each component. Vector content should
#' sum to 1.
#' @param mu A matrix of size (dimensionality x number of components), indicating
#' the mean for each component. See example.
#' @param sigma A matrix of size ((dimensionality * number of components) x
#' dimensionality), indicating the covariance matrix for each component.
#' See example.
#' @param produceImage A character string indicating whether or not to
#' produce an image. Options "Yes" or "No". Image will be produced as
#' 'Pairs plot of log-transformed data.png" in the current working
#' directory.
#' @param ImageName A character string indicating name for image, if
#' produceImage is set to "Yes". Default is "TwoComponents".
#'
#' @return Returns an S3 object of class mplnDataGenerator with results.
#' \itemize{
#' \item dataset - Simulated dataset.
#' \item trueMembership -A numeric vector indicating the membership of
#' each observation.
#' \item probaPost - A matrix indicating the posterior probability that
#' each observation belong to the component/cluster.
#' \item truenormfactors - A numeric vector indicating the true
#' normalization factors used for adjusting the library sizes.
#' \item observations - Number of observations in the simulated dataset.
#' \item dimensionality - Dimensionality of the simulated dataset.
#' \item mixingProportions - A numeric vector indicating the mixing
#' proportion of each component.
#' \item mu - True mean used for the simulated dataset.
#' \item sigma - True covariances used for the simulated dataset.
#'}
#'
#' @examples
#' trueMu1 <- c(6.5, 6, 6, 6, 6, 6)
#' trueMu2 <- c(2, 2.5, 2, 2, 2, 2)
#'
#' trueSigma1 <- diag(6) * 2
#' trueSigma2 <- diag(6)
#'
#' # Generating simulated data
#' sampleData <- MPLNClust::mplnDataGenerator(nObservations = 100,
#' dimensionality = 6,
#' mixingProportions = c(0.79, 0.21),
#' mu = rbind(trueMu1, trueMu2),
#' sigma = rbind(trueSigma1, trueSigma2),
#' produceImage = "No",
#' ImageName = "TwoComponents")
#'
#' @author Anjali Silva, \email{anjali@alumni.uoguelph.ca}
#'
#' @references
#' Aitchison, J. and C. H. Ho (1989). The multivariate Poisson-log normal distribution.
#' \emph{Biometrika} 76.
#'
#' Silva, A. et al. (2019). A multivariate Poisson-log normal mixture model
#' for clustering transcriptome sequencing data. \emph{BMC Bioinformatics} 20.
#' \href{https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2916-0}{Link}
#'
#' @export
#' @import stats
#' @importFrom mvtnorm rmvnorm
#' @importFrom edgeR calcNormFactors
#' @importFrom grDevices dev.off
#' @importFrom grDevices png
#' @importFrom graphics pairs
mplnDataGenerator <- function(nObservations,
dimensionality,
mixingProportions,
mu,
sigma,
produceImage = "No",
ImageName = "sampleName") {
# Checking user input
if(is.numeric(nObservations) != TRUE) {
stop("nObservations argument should be of class numeric.")
}
if(is.numeric(dimensionality) != TRUE) {
stop("dimensionality argument should be of class numeric.")
}
if(is.numeric(mixingProportions) != TRUE) {
stop("mixingProportions argument should be a vector of class numeric.")
}
if (sum(mixingProportions) != 1) {
stop("mixingProportions argument should be a vector that sum to 1.")
}
if(is.matrix(mu) != TRUE) {
stop("mu argument should be of class matrix.")
}
if(ncol(mu) != dimensionality) {
stop("mu should be a matrix, which has number of columns equalling dimensionality.")
}
if(nrow(mu) != length(mixingProportions)) {
stop("mu should be a matrix, which has number of rows equalling number of components.")
}
if(is.matrix(sigma) != TRUE) {
stop("sigma argument should be of class matrix.")
}
if(ncol(sigma) != dimensionality) {
stop("sigma should be a matrix, which has number of columns equalling
dimensionality.")
}
if(nrow(sigma) != dimensionality * length(mixingProportions)) {
stop("sigma should be a matrix, which has number of rows equalling
(dimensionality * number of components).")
}
if (produceImage == "Yes" && is.character(ImageName) != TRUE) {
stop("ImageName should be a character string of class character.")
}
# Begin calculations - generate z
z <- t(stats::rmultinom(nObservations, size = 1, mixingProportions))
y <- theta <- nG <- vector("list", length = length(mixingProportions))
# For visualization only
theta2 <- matrix(NA, ncol = dimensionality, nrow = nObservations)
for (i in seq_along(1:length(mixingProportions))) {
nG[[i]] <- which(z[, i] == 1)
theta[[i]] <- mvtnorm::rmvnorm(n = length(nG[[i]]),
mean = mu[i, ],
sigma = sigma[((i - 1) *
dimensionality + 1):(i * dimensionality), ])
theta2[nG[[i]], ] <- mvtnorm::rmvnorm(n = length(nG[[i]]),
mean = mu[i, ],
sigma = sigma[((i -1) *
dimensionality + 1):(i * dimensionality), ])
}
y <- matrix(NA, ncol = dimensionality, nrow = nObservations)
for (i in seq_along(1:nObservations)) {
for (j in seq_along(1:dimensionality)) {
y[i, j] <- stats::rpois(1, exp(theta2[i, j]))
}
}
norms <- log(edgeR::calcNormFactors(y))
# Generating counts with norm factors
y2 <- matrix(NA, ncol = dimensionality, nrow = nObservations)
for (i in seq_along(1:nObservations)) {
for (j in seq_along(1:dimensionality)) {
y2[i, j] <- stats::rpois(n = 1, lambda = exp(theta2[i, j] + norms[j]))
}
}
if (produceImage == "Yes") {
# Obtaining path to save images
pathNow <- getwd()
grDevices::png(paste0(pathNow, "/PairsPlot_", ImageName,".png"))
graphics::pairs(log(y2 + 1 / 100), col = mclust::map(z) + 1,
main = "Pairs plot of log-transformed data")
grDevices::dev.off()
}
results <- list(dataset = y2,
trueMembership = mclust::map(z),
probaPost = z,
trueNormFactors = norms,
observations = nObservations,
dimensionality = dimensionality,
mixingProportions = mixingProportions,
trueMu = mu,
trueSigma = sigma)
class(results) <- "mplnDataGenerator"
return(results)
}
# [END]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.