R/mvplnVGAclus.R

Defines functions initializationRunVGA mvplnVGAclus

Documented in mvplnVGAclus

#' Clustering Using mixtures of MVPLN via variational Gaussian approximations
#'
#' Performs clustering using mixtures of matrix variate Poisson-log normal
#' (MVPLN) via variational Gaussian approximations. Model selection can
#' be done using AIC, AIC3, BIC and ICL.
#'
#' @details Starting values (argument: initMethod) and the number of
#'     iterations for each MCMC chain (argument: nIterations) play an
#'     important role in the successful operation of this algorithm.
#'     Occasionally an error may result due to singular matrix. In that
#'     case rerun the method and may set a different seed to ensure
#'     a different set of initialization values.
#'
#' @param dataset A list of length nUnits, containing Y_j matrices.
#'    A matrix Y_j has size r x p, and the dataset will have 'j' such
#'    matrices with j = 1,...,n. If a Y_j has all zeros, such Y_j will
#'    be removed prior to cluster analysis.
#' @param membership A numeric vector of size length(dataset) containing
#'    the cluster membership of each Y_j matrix. If not available, leave
#'    as "none".
#' @param gmin A positive integer specifying the minimum number of
#'    components to be considered in the clustering run.
#' @param gmax A positive integer, >gmin, specifying the maximum number of
#'    components to be considered in the clustering run.
#' @param initMethod A method for initialization. Current options are
#'    "kmeans", "random", "medoids", "clara", or "fanny". If nInitIterations
#'    is set to >= 1, then this method will be used for initialization.
#'    Default is "kmeans".
#' @param nInitIterations A positive integer or zero, specifying the number
#'    of initialization runs prior to clustering. The run with the highest
#'    loglikelihood is used to initialize values, if nInitIterations > 1.
#'    Default value is set to 0, in which case no initialization will
#'    be performed.
#' @param normalize A character string "Yes" or "No" specifying
#'     if normalization should be performed. Currently, normalization
#'     factors are calculated using TMM method of edgeR package.
#'     Default is "Yes", for which normalization will be performed.
#'
#' @return Returns an S3 object of class mvplnParallel with results.
#' \itemize{
#'   \item dataset - The input dataset on which clustering is performed.
#'   \item nUnits - Number of units in the input dataset.
#'   \item nVariables - Number of variables in the input dataset.
#'   \item nOccassions - Number of occassions in the input dataset.
#'   \item normFactors - A vector of normalization factors used for
#'      input dataset.
#'   \item gmin - Minimum number of components considered in the
#'      clustering run.
#'   \item gmax - Maximum number of components considered in the
#'      clustering run.
#'   \item initalizationMethod - Method used for initialization.
#'   \item allResults - A list with all results.
#'   \item loglikelihood - A vector with value of final log-likelihoods
#'      for each cluster size.
#'   \item nParameters - A vector with number of parameters for each
#'      cluster size.
#'   \item trueLabels - The vector of true labels, if provided by user.
#'   \item ICL_all - A list with all ICL model selection results.
#'   \item BIC_all - A list with all BIC model selection results.
#'   \item AIC_all - A list with all AIC model selection results.
#'   \item AIC3_all - A list with all AIC3 model selection results.
#'   \item totalTime - Total time used for clustering and model selection.
#' }
#'
#' @examples
#' \dontrun{
#' # Example 1
#' set.seed(1234)
#' trueG <- 2 # number of total G
#' truer <- 2 # number of total occasions
#' truep <- 3 # number of total responses
#' trueN <- 1000 # number of total units
#'
#' # Mu is a r x p matrix
#' trueM1 <- matrix(rep(6, (truer * truep)),
#'                  ncol = truep,
#'                  nrow = truer, byrow = TRUE)
#'
#' trueM2 <- matrix(rep(1, (truer * truep)),
#'                  ncol = truep,
#'                  nrow = truer,
#'                  byrow = TRUE)
#'
#' trueMall <- rbind(trueM1, trueM2)
#'
#' # Phi is a r x r matrix
#' # Loading needed packages for generating data
#' # if (!require(clusterGeneration)) install.packages("clusterGeneration")
#' # library("clusterGeneration")
#'
#' # Covariance matrix containing variances and covariances between r occasions
#' # truePhi1 <- clusterGeneration::genPositiveDefMat("unifcorrmat",
#' #                                                   dim = truer,
#' #                                                   rangeVar = c(1, 1.7))$Sigma
#' truePhi1 <- matrix(c(1.075551, -0.488301, -0.488301, 1.362777), nrow = 2)
#' truePhi1[1, 1] <- 1 # For identifiability issues
#'
#' # truePhi2 <- clusterGeneration::genPositiveDefMat("unifcorrmat",
#' #                                                   dim = truer,
#' #                                                   rangeVar = c(0.7, 0.7))$Sigma
#' truePhi2 <- matrix(c(0.7000000, 0.6585887, 0.6585887, 0.7000000), nrow = 2)
#' truePhi2[1, 1] <- 1 # For identifiability issues
#' truePhiall <- rbind(truePhi1, truePhi2)
#'
#' # Omega is a p x p matrix
#' # Covariance matrix containing variances and covariances between p responses
#' # trueOmega1 <- clusterGeneration::genPositiveDefMat("unifcorrmat", dim = truep,
#' #                                    rangeVar = c(1, 1.7))$Sigma
#' trueOmega1 <- matrix(c(1.0526554, 1.0841910, -0.7976842,
#'                        1.0841910,  1.1518811, -0.8068102,
#'                        -0.7976842, -0.8068102,  1.4090578),
#'                        nrow = 3)
#' # trueOmega2 <- clusterGeneration::genPositiveDefMat("unifcorrmat", dim = truep,
#' #                                    rangeVar = c(0.7, 0.7))$Sigma
#' trueOmega2 <- matrix(c(0.7000000, 0.5513744, 0.4441598,
#'                        0.5513744, 0.7000000, 0.4726577,
#'                        0.4441598, 0.4726577, 0.7000000),
#'                        nrow = 3)
#' trueOmegaAll <- rbind(trueOmega1, trueOmega2)
#'
#' # Generated simulated data
#' sampleData <- mixMVPLN::mvplnDataGenerator(nOccasions = truer,
#'                                            nResponses = truep,
#'                                            nUnits = trueN,
#'                                            mixingProportions = c(0.79, 0.21),
#'                                            matrixMean = trueMall,
#'                                            phi = truePhiall,
#'                                            omega = trueOmegaAll)
#'
#' # Clustering simulated matrix variate count data
#' clusteringResults <- mixMVPLN::mvplnVGAclus(
#'                       dataset = sampleData$dataset,
#'                       membership = sampleData$truemembership,
#'                       gmin = 1,
#'                       gmax = 3,
#'                       initMethod = "kmeans",
#'                       nInitIterations = 3,
#'                       normalize = "Yes")
#'
#' # Example 2
#' trueG <- 1 # number of total G
#' truer <- 2 # number of total occasions
#' truep <- 3 # number of total responses
#' trueN <- 1000 # number of total units
#' truePiG <- 1L # mixing proportion for G = 1
#'
#' # Mu is a r x p matrix
#' trueM1 <- matrix(c(6, 5.5, 6, 6, 5.5, 6),
#'                  ncol = truep,
#'                  nrow = truer,
#'                  byrow = TRUE)
#' trueMall <- rbind(trueM1)

#' # Phi is a r x r matrix
#' # truePhi1 <- clusterGeneration::genPositiveDefMat(
#' #                               "unifcorrmat",
#' #                                dim = truer,
#' #                               rangeVar = c(0.7, 1.7))$Sigma
#' truePhi1 <- matrix(c(1.3092747, 0.3219674,
#'                      0.3219674, 1.3233794), nrow = 2)
#' truePhi1[1, 1] <- 1 # for identifiability issues
#' truePhiall <- rbind(truePhi1)
#'
#' # Omega is a p x p matrix
#' # trueOmega1 <- genPositiveDefMat(
#' #                    "unifcorrmat",
#' #                     dim = truep,
#' #                     rangeVar = c(1, 1.7))$Sigma
#' trueOmega1 <- matrix(c(1.1625581, 0.9157741, 0.8203499,
#'                        0.9157741, 1.2216287, 0.7108193,
#'                        0.8203499, 0.7108193, 1.2118854), nrow = 3)
#' trueOmegaAll <- rbind(trueOmega1)
#'
#' # Generated simulated data
#' sampleData2 <- mixMVPLN::mvplnDataGenerator(
#'                          nOccasions = truer,
#'                          nResponses = truep,
#'                          nUnits = trueN,
#'                          mixingProportions = truePiG,
#'                          matrixMean = trueMall,
#'                          phi = truePhiall,
#'                          omega = trueOmegaAll)
#'
#' # Clustering simulated matrix variate count data
#' clusteringResults <- mixMVPLN::mvplnVGAclus(
#'                       dataset = sampleData2$dataset,
#'                       membership = sampleData2$truemembership,
#'                       gmin = 1,
#'                       gmax = 2,
#'                       initMethod = "clara",
#'                       nInitIterations = 2,
#'                       normalize = "Yes")
#' # Example 3
#' # mixture of six independent Poisson distributions with G = 2
#' set.seed(23456)
#' totalSet <- 1 # Number of datasets
#' data <- list()
#' for (i in 1:totalSet) {
#'  N <- 1000 # biological samples e.g. genes
#'  d <- 6 # dimensionality e.g. conditions*replicates = total samples
#'  G <- 2
#'
#'  piG <- c(0.45, 0.55) # mixing proportions
#'
#'  tmu1 <- c(1000, 1500, 1500, 1000, 1500, 1000)
#'  tmu2 <- c(1000, 1000, 1000, 1500, 1000, 1200)
#'  z <- t(rmultinom(n = N, size = 1, prob = piG))
#'  para <- list()
#'  para[[1]] <- NULL # for pi
#'  para[[2]] <- list() # for mu
#'  para[[3]] <- list() # for Z
#'  names(para) <- c("pi", "mu", "Z")
#'
#'  para[[1]] <- piG
#'  para[[2]] <- list(tmu1, tmu2)
#'  para[[3]] <- z
#'
#'  Y <- matrix(0, ncol = d, nrow = N)
#'
#'  for (g in 1:G) {
#'   obs <- which(para[[3]][, g] == 1)
#'   for (j in 1:d) {
#'    Y[obs, j] <- rpois(length(obs), para[[2]][[g]][j])
#'   }
#'  }
#'
#'  Y_mat <- list()
#'  for (obs in 1:N) {
#'   Y_mat[[obs]] <- matrix(Y[obs, ], nrow = 2, byrow = TRUE)
#'  }
#'
#'  data[[i]] <- list()
#'  data[[i]][[1]] <- para
#'  data[[i]][[2]] <- Y
#'  data[[i]][[3]] <- Y_mat
#' }
#' outputExample3 <- mixMVPLN::mvplnVGAclus(dataset = data[[1]][[3]],
#'                                membership = "none",
#'                                gmin = 1,
#'                                gmax = 3,
#'                                initMethod = "kmeans",
#'                                nInitIterations = 1,
#'                                normalize = "Yes")
#'
#' # Example 4
#' # mixture of six independent negative binomial distributions with G = 2
#' set.seed(23456)
#' N1 <- 2000 # biological samples e.g. genes
#' d1 <- 6 # dimensionality e.g. conditions*replicates = total samples
#' piG1 <- c(0.79, 0.21) # mixing proportions for G=2
#' trueMu1 <- rep(c(1000, 500), 3)
#' trueMu2 <- sample(c(1000, 500), 6, replace=TRUE)
#' trueMu <- list(trueMu1, trueMu2)
#'
#' z <- t(rmultinom(N1, size = 1, piG1))
#'
#' nG <- vector("list", length = length(piG1))
#' dataNB <- matrix(NA, ncol = d1, nrow = N1)
#'
#' for (i in 1:length(piG1)) { # This code is not generalized, for G = 2 only
#'   nG[[i]] <- which(z[, i] == 1)
#'   for (j in 1:d1) {
#'     dataNB[nG[[i]], j] <- rnbinom(n = length(nG[[i]]),
#'                                   mu = trueMu[[i]][j],
#'                                   size = 100)
#'   }
#' }
#'
#' dataNBmat <- list()
#' for (obs in 1:N1) {
#'   dataNBmat[[obs]] <- matrix(dataNB[obs,],
#'                              nrow = 2,
#'                              byrow = TRUE)
#' }
#'
#' outputExample4 <- mixMVPLN::mvplnVGAclus(dataset= dataNBmat,
#'                                   membership = "none",
#'                                   gmin = 1,
#'                                   gmax = 2,
#'                                   initMethod = "kmeans",
#'                                   nInitIterations = 1,
#'                                   normalize = "Yes")
#' }
#'
#' @author {Anjali Silva, \email{anjali@alumni.uoguelph.ca}, Sanjeena Dang,
#'          \email{sanjeenadang@cunet.carleton.ca}. }
#'
#' @references
#' Aitchison, J. and C. H. Ho (1989). The multivariate Poisson-log normal distribution.
#' \emph{Biometrika} 76.
#'
#' Akaike, H. (1973). Information theory and an extension of the maximum likelihood
#' principle. In \emph{Second International Symposium on Information Theory}, New York, NY,
#' USA, pp. 267–281. Springer Verlag.
#'
#' Arlot, S., Brault, V., Baudry, J., Maugis, C., and Michel, B. (2016).
#' capushe: CAlibrating Penalities Using Slope HEuristics. R package version 1.1.1.
#'
#' Biernacki, C., G. Celeux, and G. Govaert (2000). Assessing a mixture model for
#' clustering with the integrated classification likelihood. \emph{IEEE Transactions
#' on Pattern Analysis and Machine Intelligence} 22.
#'
#' Bozdogan, H. (1994). Mixture-model cluster analysis using model selection criteria
#' and a new informational measure of complexity. In \emph{Proceedings of the First US/Japan
#' Conference on the Frontiers of Statistical Modeling: An Informational Approach:
#' Volume 2 Multivariate Statistical Modeling}, pp. 69–113. Dordrecht: Springer Netherlands.
#'
#' Robinson, M.D., and Oshlack, A. (2010). A scaling normalization method for differential
#' expression analysis of RNA-seq data. \emph{Genome Biology} 11, R25.
#'
#' Schwarz, G. (1978). Estimating the dimension of a model. \emph{The Annals of Statistics}
#' 6.
#'
#' Silva, A. et al. (2019). A multivariate Poisson-log normal mixture model
#' for clustering transcriptome sequencing data. \emph{BMC Bioinformatics} 20.
#' \href{https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2916-0}{Link}
#'
#' Silva, A. et al. (2018). Finite Mixtures of Matrix Variate Poisson-Log Normal Distributions
#' for Three-Way Count Data. \href{https://arxiv.org/abs/1807.08380}{arXiv preprint arXiv:1807.08380}.
#'
#' @export
#' @importFrom edgeR calcNormFactors
#' @importFrom mclust unmap
#' @importFrom mclust map
#' @importFrom mvtnorm rmvnorm
#' @importFrom utils tail
#'
mvplnVGAclus <- function(dataset,
                          membership = "none",
                          gmin,
                          gmax,
                          initMethod = "kmeans",
                          nInitIterations = 0,
                          normalize = "Yes") {

  ptm <- base::proc.time()

  # Performing checks
  if (typeof(unlist(dataset)) != "double" & typeof(unlist(dataset)) != "integer") {
    stop("dataset should be a list of count matrices.");}

  if (any((unlist(dataset) %% 1 == 0) == FALSE)) {
    stop("dataset should be a list of count matrices.")
  }

  if (is.list(dataset) != TRUE) {
    stop("dataset needs to be a list of matrices.")
  }

  if(is.numeric(gmin) != TRUE || is.numeric(gmax) != TRUE) {
    stop("Class of gmin and gmin should be numeric.")
  }

  if (gmax < gmin) {
    stop("gmax cannot be less than gmin.")
  }

  if (gmax > length(dataset)) {
    stop("gmax cannot be larger than nrow(dataset).")
  }


  if(all(membership != "none") && is.numeric(membership) != TRUE) {
    stop("membership should be a numeric vector containing the
      cluster membership. Otherwise, leave as 'none'.")
  }

  # Checking if missing membership values
  # First check for the case in which G = 1, otherwise check
  #   if missing cluster memberships
  if(all(membership != "none") && length(unique(membership)) != 1) {
    if(all(membership != "none") &&
       all((diff(sort(unique(membership))) == 1) != TRUE) ) {
      stop("Cluster memberships in the membership vector
        are missing a cluster, e.g. 1, 3, 4, 5, 6 is missing cluster 2.")
    }
  }

  if(all(membership != "none") && length(membership) != length(dataset)) {
    stop("membership should be a numeric vector, where length(membership)
      should equal the number of observations. Otherwise, leave as 'none'.")
  }

  if (is.character(initMethod) == TRUE) {
    if(initMethod != "none" & initMethod != "kmeans" & initMethod != "random" & initMethod != "medoids" & initMethod != "medoids" & initMethod != "clara" & initMethod != "fanny") {
      stop("initMethod should of class character, specifying
      either: none, kmeans, random, medoids, clara, or fanny.")
    }
  } else if (is.character(initMethod) != TRUE) {
    stop("initMethod should of class character, specifying
      either: none, kmeans, random, medoids, clara, or fanny.")
  }

  if (is.numeric(nInitIterations) != TRUE) {
    stop("nInitIterations should be positive integer or zero, specifying
      the number of initialization runs to be considered.")
  }

  if (is.character(normalize) != TRUE) {
    stop("normalize should be a string of class character specifying
      if normalization should be performed.")
  }

  if (normalize != "Yes" && normalize != "No") {
    stop("normalize should be a string indicating Yes or No, specifying
       if normalization should be performed.")
  }


  n <- length(dataset)
  p <- ncol(dataset[[1]])
  r <- nrow(dataset[[1]])
  d <- p * r


  # Changing dataset into a n x rp dataset
  TwoDdataset <- matrix(NA, ncol = d, nrow = n)
  sampleMatrix <- matrix(c(1:d), nrow = r, byrow = TRUE)
  for (u in 1:n) {
    for (e in 1:p) {
      for (s in 1:r) {
        TwoDdataset[u, sampleMatrix[s, e]] <- dataset[[u]][s, e]
      }
    }
  }

  # Check if entire row is zero
  if(any(rowSums(TwoDdataset) == 0)) {
    TwoDdataset <- TwoDdataset[- which(rowSums(TwoDdataset) == 0), ]
    n <- nrow(TwoDdataset)
    membership <- membership[- which(rowSums(TwoDdataset) == 0)]
  }

  if(all(is.na(membership) == TRUE)) {
    membership <- "Not provided" }

  # Calculating normalization factors
  if(normalize == "Yes") {
    normFactors <- as.vector(edgeR::calcNormFactors(as.matrix(TwoDdataset),
                                                    method = "TMM"))

  } else if(normalize == "No") {
    normFactors <- rep(0, d)
  } else {
    stop("Argument normalize should be 'Yes' or 'No' ")
  }

  parallelFA <- function(G, dataset,
                         TwoDdataset,
                         r, p, d, n,
                         normFactors,
                         nInitIterations,
                         initMethod) {

    # arranging normalization factors
    libMat <- matrix(normFactors, n, d, byrow = T)
    libMatList <- list()
    for (i in 1:n) {
      libMatList[[i]] <- t(matrix(libMat[i, ], nrow = p))
    }

    if (initMethod == "none") {

      # Initialization
      mu <- omega <- phi <- list() # mu is M;
      delta <- kappa <- sigma <- isigma <- iphi <- iomega <- list()
      # delta  is variational parameter Delta
      # kappa is variational parameter kappa
      # sigma is Psi in the math - kronecker of omega and Phi
      # isigma is inverse of Psi
      # iphi is inverse of Phi
      # iomega is inverse of omega

      m <- S <- list()
      # m is vectorized xi
      # S is Psi

      # Other intermediate items initialized
      Sk <- array(0, c(d, d, G) )
      start <- GX <- dGX <- zS <- list()

      iKappa <- iDelta <- startList <- list()
      # iKappa is inverse of kappa
      # iDelta is inverse of Delta

      kMeansResults <- kmeans(log(TwoDdataset+1),
                              centers = G,
                              nstart = 50,
                              iter.max = 20)$cluster
      zValue <- mclust::unmap(kMeansResults) ### Starting value for Z
      piG <- colSums(zValue) / n

      for (g in 1:G) {
        obs <- which(zValue[, g] == 1)
        mu[[g]] <- colMeans(log(TwoDdataset[obs, ] + 1 / 6))
        sigma[[g]] <- var(log(TwoDdataset[obs, ] + 1 / 6))
        isigma[[g]] <- solve(sigma[[g]])
        phi[[g]] <- diag(r) * sqrt(min(diag(var(log(TwoDdataset[obs, ] + 1 / 6)))))
        omega[[g]] <- diag(p) * sqrt(min(diag(var(log(TwoDdataset[obs, ] + 1 / 6)))))
        iphi[[g]] <- solve(phi[[g]])
        iomega[[g]] <- solve(omega[[g]])
      }

      for (g in 1:G) {
        start[[g]] <- log(TwoDdataset + 1 / 6) ###Starting value for M
        m[[g]] <- log(TwoDdataset + 1 / 6)
        S[[g]] <- list()
        delta[[g]] <- list()
        kappa[[g]] <- list()
        startList[[g]] <- list()
        for (i in 1:n) {
          startList[[g]][[i]] <- log(dataset[[i]])
          delta[[g]][[i]] <- diag(r) * 0.001
          kappa[[g]][[i]] <- diag(p) * 0.001
          S[[g]][[i]] <- delta[[g]][[i]] %x% kappa[[g]][[i]]
        }
      }
    } else {

    # Initialize based on specified initMethod method
    outputInitialization <- list()
    checklogL <- vector()
    for (initIt in seq_along(1:nInitIterations)) {
       set.seed(initIt)
       outputInitialization[[initIt]] <- initializationRunVGA(
                                                G = G,
                                                dataset = dataset,
                                                TwoDdataset = TwoDdataset,
                                                r = r,
                                                p = p,
                                                d = d,
                                                n = n,
                                                normFactors = normFactors,
                                                initMethod = initMethod)
       checklogL[initIt] <- outputInitialization[[initIt]]$finalLogLik
    }

    # select init run with highest logL
    maxRun <- which.max(checklogL)

    mu <- outputInitialization[[maxRun]]$mu
    omega <- outputInitialization[[maxRun]]$omega
    phi <- outputInitialization[[maxRun]]$phi
    delta <- outputInitialization[[maxRun]]$delta
    kappa <- outputInitialization[[maxRun]]$kappa
    sigma <- outputInitialization[[maxRun]]$sigma
    isigma <- outputInitialization[[maxRun]]$isigma
    iphi <- outputInitialization[[maxRun]]$iphi
    iomega <- outputInitialization[[maxRun]]$iomega
    m <- outputInitialization[[maxRun]]$m
    S <- outputInitialization[[maxRun]]$S
    Sk <- outputInitialization[[maxRun]]$Sk
    start <- outputInitialization[[maxRun]]$start
    GX <- outputInitialization[[maxRun]]$GX
    dGX <- outputInitialization[[maxRun]]$dGX
    zS <- outputInitialization[[maxRun]]$zS
    iKappa <- outputInitialization[[maxRun]]$iKappa
    iDelta <- outputInitialization[[maxRun]]$iDelta
    startList <- outputInitialization[[maxRun]]$startList
    zValue <- outputInitialization[[maxRun]]$zValue
    piG <- outputInitialization[[maxRun]]$piG
  }

    # start clustering after initialization
    it <- 1
    aloglik <- loglik <- NULL
    checks <- aloglik[c(1:5)] <- 0
    itMax <- 200

    while (checks == 0) {

      for (g in 1:G) {
        GX[[g]] <- dGX[[g]] <- zS[[g]] <- list()
        iDelta[[g]] <- iKappa[[g]] <- list()
        deltaO <- delta[[g]]
        kappaO <- kappa[[g]]

        for (i in 1:n) {
          iDelta[[g]][[i]] <- diag(c(t(diag(kappa[[g]][[i]])) %*%
                               t(exp(log(libMatList[[i]]) +
                               startList[[g]][[i]] +
                               0.5 * diag(delta[[g]][[i]]) %*%
                               t(diag(kappa[[g]][[i]])))))) +
                               iphi[[g]] * sum(diag(iomega[[g]] %*%
                               kappa[[g]][[i]]))
          delta[[g]][[i]] <- p * solve(iDelta[[g]][[i]])


          iKappa[[g]][[i]] <- diag(c(t(diag(delta[[g]][[i]])) %*%
                               (exp(log(libMatList[[i]]) +
                               startList[[g]][[i]] +
                               t(0.5 * diag(kappa[[g]][[i]]) %*%
                               t(diag(delta[[g]][[i]]))))))) +
                               iomega[[g]] * sum(diag(iphi[[g]] %*%
                               delta[[g]][[i]]))

          kappa[[g]][[i]] <- solve(iKappa[[g]][[i]])


          S[[g]][[i]] <- delta[[g]][[i]] %x% kappa[[g]][[i]]
          zS[[g]][[i]] <- zValue[i, g] * S[[g]][[i]]
          GX[[g]][[i]] <- TwoDdataset[i, ] -
                          exp(start[[g]][i, ] +
                          log(libMat[i,]) +
                          0.5 * diag(S[[g]][[i]])) -
                          (isigma[[g]]) %*% (start[[g]][i, ] - mu[[g]])
          m[[g]][i, ] <- start[[g]][i, ] + S[[g]][[i]] %*% GX[[g]][[i]]
          startList[[g]][[i]] <- t(matrix(m[[g]][i, ], nrow = p))
        }
        start[[g]] <- m[[g]]

        mu[[g]] <- colSums(zValue[, g] * m[[g]]) / sum(zValue[, g]) # this is xi

        # Updating Sample covariance
        muMat <- t(matrix(mu[[g]], nrow = p))
        phiM <- list()
        for (i in 1:n) {
          phiM[[i]] <- zValue[i,g]*(startList[[g]][[i]] - muMat) %*%
                        iomega[[g]] %*% t(startList[[g]][[i]] - muMat) +
                        zValue[i, g] * delta[[g]][[i]] * sum(diag(iomega[[g]] %*%
                        kappa[[g]][[i]]))
        }
        phi[[g]] <- Reduce("+", phiM) / sum(zValue[, g] * p)
        iphi[[g]] <- solve(phi[[g]])

        omegaM <- list()
        for (i in 1:n) {
          omegaM[[i]] <- zValue[i, g] * t(startList[[g]][[i]] - muMat) %*%
                          iphi[[g]] %*% (startList[[g]][[i]] - muMat) +
                          zValue[i, g] * kappa[[g]][[i]] * sum(diag(iphi[[g]] %*%
                          delta[[g]][[i]]))
        }
        omega[[g]] <- Reduce("+", omegaM) / sum(zValue[, g] * r)
        iomega[[g]] <- solve(omega[[g]])
        sigma[[g]] <- phi[[g]] %x% omega[[g]]
        isigma[[g]] <- iphi[[g]] %x% iomega[[g]]
      }

      piG <- colSums(zValue) / n
      # Internal functions
      funFive <- function(x, y = isigma[[g]]) {
        sum(diag(x %*% y))
      }

      FMatrix <- matrix(NA, ncol = G, nrow = n)

      for (g in 1:G) {
        two <- rowSums(exp(m[[g]] + log(libMat) +
               0.5 * matrix(unlist(lapply(S[[g]], diag)),
               ncol = d, byrow = TRUE)))
        five <- 0.5 * unlist(lapply(S[[g]], funFive))
        six <- 0.5 * log(unlist(lapply(S[[g]], det)))
        FMatrix[, g] <- piG[g] * exp(rowSums(m[[g]] * TwoDdataset) -
                  two - rowSums(lfactorial(TwoDdataset)) +
                  rowSums(log(libMat) * TwoDdataset) - 0.5 *
                  mahalanobis(m[[g]], center = mu[[g]], cov = isigma[[g]],
                  inverted = TRUE) - five + six + 0.5 *
                  log(det(isigma[[g]])) - d/2)
      }

      # loglik[it] <- sum(log(rowSums(FMatrix)))
      # if FMatrix values is 0, then log(0) = -Inf and logL error
      rowSumsFMatrix <- rowSums(FMatrix)
      if(any(rowSumsFMatrix == 0) == TRUE) {
        zeroRows <- which(rowSumsFMatrix == 0)
        rowSumsFMatrix[zeroRows] <- min(rowSumsFMatrix[-zeroRows])
      }
      loglik[it] <- sum(log(rowSumsFMatrix))


      zValue <- FMatrix / rowSumsFMatrix

      if (it <= 5) {
        zValue[zValue == "NaN"] <- 0
      }

      if (it > 5) {
        # Aitkaine's stopping criterion
        # Print for check; commented out
        # cat("\n At it:", it ,"loglik[it - 1]:",
        # loglik[it - 1], "- loglik[it - 2]:", loglik[it - 2],
        # "is", loglik[it - 1] - loglik[it - 2], "\n")
        if ((loglik[it - 1] - loglik[it - 2]) == 0) checks <- 1 else {
          a <- (loglik[it] - loglik[it - 1]) / (loglik[it - 1] - loglik[it - 2])
          addTo <- (1 / (1 - a) * (loglik[it] - loglik[it - 1]))
          aloglik[it] <- loglik[it - 1] + addTo
          if (abs(aloglik[it] - aloglik[it - 1]) < 0.05) {
            checks <- 1
          } else {
            checks <- checks
          }
        }
      }

      it <- it + 1
      if (it == itMax) {
        checks <- 1
      }
      finalPhi <- finalOmega <- list()
      for (g in 1:G) {
        finalPhi[[g]] <- phi[[g]] / diag(phi[[g]])[1]
        finalOmega[[g]] <- omega[[g]] * diag(phi[[g]])[1]
      }
    }

    programclust <- mclust::map(zValue)

    FinalGResults <- list(mu = matrix(unlist(mu),
                                      ncol = p,
                                      byrow = TRUE),
                          sigma = sigma,
                          phi = finalPhi,
                          omega = finalOmega,
                          probaPost = zValue,
                          loglikelihood = loglik,
                          proportion = piG,
                          clusterlabels = programclust,
                          iterations = it)

    class(FinalGResults) <- "FinalGResults"

    return(FinalGResults)
  }

  parallelFAOutput <- list()
  for(g in seq_along(1:(gmax - gmin + 1))) {

    if(length(1:(gmax - gmin + 1)) == gmax) {
      clustersize <- g
    } else if(length(1:(gmax - gmin + 1)) < gmax) {
      clustersize <- seq(gmin, gmax, 1)[g]
    }

    parallelFAOutput[[g]] <- parallelFA(
                 G = clustersize,
                 dataset = dataset,
                 TwoDdataset = TwoDdataset,
                 r = r,
                 p = p,
                 d = d,
                 n = n,
                 normFactors = normFactors,
                 nInitIterations = nInitIterations,
                 initMethod = initMethod)
    }


    # cluster data result extracting
    BIC <- ICL <- AIC <- AIC3 <- k <- ll <- vector()
    for(g in seq_along(1:(gmax - gmin + 1))) {
      #print(g)
      if(length(1:(gmax - gmin + 1)) == gmax) {
        clustersize <- g
      } else if(length(1:(gmax - gmin + 1)) < gmax) {
        clustersize <- seq(gmin, gmax, 1)[g]
      }

      # save the final log-likelihood
      ll[g] <- unlist(tail(parallelFAOutput[[g]]$loglik, n = 1))

      k[g] <- calcParameters(g = clustersize,
                             r = r,
                             p = p)

      # starting model selection
      if (g == max(1:(gmax - gmin + 1))) {
        bic <- BICFunction(ll = ll,
                           k = k,
                           n = n,
                           run = parallelFAOutput,
                           gmin = gmin,
                           gmax = gmax,
                           parallel = FALSE)

        icl <- ICLFunction(bIc = bic,
                           gmin = gmin,
                           gmax = gmax,
                           run = parallelFAOutput,
                           parallel = FALSE)

        aic <- AICFunction(ll = ll,
                           k = k,
                           run = parallelFAOutput,
                           gmin = gmin,
                           gmax = gmax,
                           parallel = FALSE)

        aic3 <- AIC3Function(ll = ll,
                             k = k,
                             run = parallelFAOutput,
                             gmin = gmin,
                             gmax = gmax,
                             parallel = FALSE)
      }
    }

    final <- base::proc.time() - ptm

    RESULTS <- list(dataset = dataset,
                    nUnits = n,
                    nVariables = p,
                    nOccassions = r,
                    normFactors = normFactors,
                    gmin = gmin,
                    gmax = gmax,
                    initalizationMethod = initMethod,
                    allResults = parallelFAOutput,
                    loglikelihood = ll,
                    nParameters = k,
                    trueLabels = membership,
                    ICLAll = icl,
                    BICAll = bic,
                    AICAll = aic,
                    AIC3All = aic3,
                    totalTime = final)

    class(RESULTS) <- "mvplnVGA"
    return(RESULTS)

}

initializationRunVGA <- function(G,
                              dataset,
                              TwoDdataset,
                              r, p, d, n,
                              normFactors,
                              initMethod) {

    # arranging normalization factors
    libMat <- matrix(normFactors, n, d, byrow = T)
    libMatList <- list()
    for (i in 1:n) {
      libMatList[[i]] <- t(matrix(libMat[i, ], nrow = p))
    }

    # Initialization
      mu <- omega <- phi <- list() # mu is M;
      delta <- kappa <- sigma <- isigma <-
                  iphi <- iomega <- list()
      # delta  is variational parameter Delta
      # kappa is variational parameter kappa
      # sigma is Psi in the math - kronecker of omega and Phi
      # isigma is inverse of Psi
      # iphi is inverse of Phi
      # iomega is inverse of omega

      m <- S <- list()
      # m is vectorized xi
      # S is Psi

      # Other intermediate items initialized
      Sk <- array(0, c(d, d, G) )
      start <- GX <- dGX <- zS <- list()

      iKappa <- iDelta <- startList <- list()
      # iKappa is inverse of kappa
      # iDelta is inverse of Delta


      if (initMethod == "kmeans") {
        # cat("\n initMethod == kmeans \n")
        zValue <- mclust::unmap(stats::kmeans(x = log(TwoDdataset + 1 / 3),
                                              centers = G)$cluster)

      } else if (initMethod == "random") {
        # cat("\n initMethod == random \n")
        if(G == 1) { # generating z if g = 1
          zValue <- as.matrix(rep.int(1, times = n),
                                       ncol = G,
                                       nrow = n)
        } else { # generating z if g>1
          zConv = 0
          while(! zConv) { # ensure that dimension of z is same as G (i.e.
            # if one column contains all 0s, then generate z again)
            zValue <- t(stats::rmultinom(n = n,
                                         size = 1,
                                         prob = rep(1 / G, G)))
            if(length(which(colSums(zValue) > 0)) == G) {
              zConv <- 1
            }
          }
        }
      } else if (initMethod == "medoids") {
        # cat("\n initMethod == medoids \n")
        zValue <- mclust::unmap(cluster::pam(x = log(TwoDdataset + 1 / 3),
                                                      k = G)$cluster)
      } else if (initMethod == "clara") {
        # cat("\n initMethod == clara \n")
        zValue <- mclust::unmap(cluster::clara(x = log(TwoDdataset + 1 / 3),
                                                        k = G)$cluster)
      } else if (initMethod == "fanny") {
        # cat("\n initMethod == fanny \n")
        zValue <- mclust::unmap(cluster::fanny(x = log(TwoDdataset + 1 / 3),
                                                        k = G)$cluster)
      }

      piG <- colSums(zValue) / n

      for (g in 1:G) {
        obs <- which(zValue[, g] == 1)
        mu[[g]] <- colMeans(log(TwoDdataset[obs, ] + 1 / 6))
        sigma[[g]] <- var(log(TwoDdataset[obs, ] + 1 / 6))
        isigma[[g]] <- solve(sigma[[g]])
        phi[[g]] <- diag(r) * sqrt(min(diag(var(log(TwoDdataset[obs, ] + 1 / 6)))))
        omega[[g]] <- diag(p) * sqrt(min(diag(var(log(TwoDdataset[obs, ] + 1 / 6)))))
        iphi[[g]] <- solve(phi[[g]])
        iomega[[g]] <- solve(omega[[g]])
      }

      for (g in 1:G) {
        start[[g]] <- log(TwoDdataset + 1/6) ###Starting value for M
        m[[g]] <- log(TwoDdataset + 1/6)
        S[[g]] <- list()
        delta[[g]] <- list()
        kappa[[g]] <- list()
        startList[[g]] <- list()
        for (i in 1:n) {
          startList[[g]][[i]] <- log(dataset[[i]])
          delta[[g]][[i]] <- diag(r) * 0.001
          kappa[[g]][[i]] <- diag(p) * 0.001
          S[[g]][[i]] <- delta[[g]][[i]] %x% kappa[[g]][[i]]
        }
      }


    it <- 1
    aloglik <- loglik <- NULL
    checks <- aloglik[c(1:5)] <- 0
    itMax <- 200

    while (checks == 0) {

      for (g in 1:G) {
        GX[[g]] <- dGX[[g]] <- zS[[g]] <- list()
        iDelta[[g]] <- iKappa[[g]] <- list()
        deltaO <- delta[[g]]
        kappaO <- kappa[[g]]

        for (i in 1:n) {
          iDelta[[g]][[i]] <- diag(c(t(diag(kappa[[g]][[i]])) %*%
                                       t(exp(log(libMatList[[i]]) +
                                       startList[[g]][[i]] +
                                       0.5 * diag(delta[[g]][[i]]) %*%
                                       t(diag(kappa[[g]][[i]])))))) +
            iphi[[g]] * sum(diag(iomega[[g]] %*%
                                   kappa[[g]][[i]]))
          delta[[g]][[i]] <- p * solve(iDelta[[g]][[i]])


          iKappa[[g]][[i]] <- diag(c(t(diag(delta[[g]][[i]])) %*%
                                       (exp(log(libMatList[[i]]) +
                                              startList[[g]][[i]] +
                                              t(0.5 * diag(kappa[[g]][[i]]) %*%
                                                  t(diag(delta[[g]][[i]]))))))) +
            iomega[[g]] * sum(diag(iphi[[g]] %*%
                                     delta[[g]][[i]]))

          kappa[[g]][[i]] <- solve(iKappa[[g]][[i]])


          S[[g]][[i]] <- delta[[g]][[i]] %x% kappa[[g]][[i]]
          zS[[g]][[i]] <- zValue[i, g] * S[[g]][[i]]
          GX[[g]][[i]] <- TwoDdataset[i, ] -
            exp(start[[g]][i, ] +
                  log(libMat[i,]) +
                  0.5 * diag(S[[g]][[i]])) -
            (isigma[[g]]) %*% (start[[g]][i, ] - mu[[g]])
          m[[g]][i, ] <- start[[g]][i, ] + S[[g]][[i]] %*% GX[[g]][[i]]
          startList[[g]][[i]] <- t(matrix(m[[g]][i, ], nrow = p))
        }
        start[[g]] <- m[[g]]

        mu[[g]] <- colSums(zValue[, g] * m[[g]]) / sum(zValue[, g]) # this is xi

        # Updating Sample covariance
        muMat <- t(matrix(mu[[g]], nrow = p))
        phiM <- list()
        for (i in 1:n) {
          phiM[[i]] <- zValue[i,g]*(startList[[g]][[i]] - muMat) %*%
            iomega[[g]] %*% t(startList[[g]][[i]] - muMat) +
            zValue[i, g] * delta[[g]][[i]] * sum(diag(iomega[[g]] %*%
                                                        kappa[[g]][[i]]))
        }
        phi[[g]] <- Reduce("+", phiM) / sum(zValue[, g] * p)
        iphi[[g]] <- solve(phi[[g]])

        omegaM <- list()
        for (i in 1:n) {
          omegaM[[i]] <- zValue[i, g] * t(startList[[g]][[i]] - muMat) %*%
            iphi[[g]] %*% (startList[[g]][[i]] - muMat) +
            zValue[i, g] * kappa[[g]][[i]] * sum(diag(iphi[[g]] %*%
                                                        delta[[g]][[i]]))
        }
        omega[[g]] <- Reduce("+", omegaM) / sum(zValue[, g] * r)
        iomega[[g]] <- solve(omega[[g]])
        sigma[[g]] <- phi[[g]] %x% omega[[g]]
        isigma[[g]] <- iphi[[g]] %x% iomega[[g]]
      }

      piG <- colSums(zValue) / n
      # Internal functions
      funFive <- function(x, y = isigma[[g]]) {
        sum(diag(x %*% y))
      }

      FMatrix <- matrix(NA, ncol = G, nrow = n)

      for (g in 1:G) {
        two <- rowSums(exp(m[[g]] + log(libMat) +
                             0.5 * matrix(unlist(lapply(S[[g]], diag)),
                                          ncol = d, byrow = TRUE)))
        five <- 0.5 * unlist(lapply(S[[g]], funFive))
        six <- 0.5 * log(unlist(lapply(S[[g]], det)))
        FMatrix[, g] <- piG[g] * exp(rowSums(m[[g]] * TwoDdataset) -
                                       two - rowSums(lfactorial(TwoDdataset)) +
                                       rowSums(log(libMat) * TwoDdataset) - 0.5 *
                                       mahalanobis(m[[g]], center = mu[[g]], cov = isigma[[g]],
                                                   inverted = TRUE) - five + six + 0.5 *
                                       log(det(isigma[[g]])) - d/2)
      }

      # loglik[it] <- sum(log(rowSums(FMatrix)))
      # if FMatrix values is 0, then log(0) = -Inf and logL error
      rowSumsFMatrix <- rowSums(FMatrix)
      if(any(rowSumsFMatrix == 0) == TRUE) {
        zeroRows <- which(rowSumsFMatrix == 0)
        rowSumsFMatrix[zeroRows] <- min(rowSumsFMatrix[-zeroRows])
      }
      loglik[it] <- sum(log(rowSumsFMatrix))


      zValue <- FMatrix / rowSumsFMatrix
      if (it <= 5) {
        zValue[zValue == "NaN"] <- 0
      }

      # cat("\n Initialization loglik[it - 1] - loglik[it - 2]:", loglik[it - 1] - loglik[it - 2])
      if (it > 5) {
        # Aitkaine's stopping criterion
        if ((loglik[it - 1] - loglik[it - 2]) == 0) checks <- 1 else {
          a <- (loglik[it] - loglik[it - 1]) / (loglik[it - 1] - loglik[it - 2])
          addTo <- (1 / (1 - a) * (loglik[it] - loglik[it - 1]))
          aloglik[it] <- loglik[it - 1] + addTo
          if (abs(aloglik[it] - aloglik[it - 1]) < 0.05) {
            checks <- 1
          } else {
            checks <- checks
          }
        }
      }

      it <- it + 1
      if (it == itMax) {
        checks <- 1
      }
      finalPhi <- finalOmega <- list()
      for (g in 1:G) {
        finalPhi[[g]] <- phi[[g]] / diag(phi[[g]])[1]
        finalOmega[[g]] <- omega[[g]] * diag(phi[[g]])[1]
      }
    }

    programclust <- mclust::map(zValue)

    initRunOutput <- list(mu = mu,
                          sigma = sigma,
                          omega = omega,
                          phi = phi,
                          omega = omega,
                          delta = delta,
                          kappa = kappa,
                          isigma = isigma,
                          iphi = iphi,
                          iomega = iomega,
                          m = m,
                          S = S,
                          Sk = Sk,
                          start = start,
                          GX = GX,
                          dGX = dGX,
                          zS = zS,
                          iKappa = iKappa,
                          iDelta = iDelta,
                          startList = startList,
                          zValue = zValue,
                          loglik = loglik,
                          finalLogLik = loglik[it-1],
                          piG = piG,
                          clusterlabels = programclust,
                          iterations = it)

    class(initRunOutput) <- "initializationRunVGA"

    return(initRunOutput)
  }


# [END]
anjalisilva/mixMVPLN documentation built on Jan. 15, 2024, 1:10 a.m.