# R/mixture.R


#' Generation of a mixture drawn from a univariate GMM, with the possibility of adding outliers
#'
#' @details We do not implement the possibility of adding outliers in the multivariate context
#'
#' @author Bastien CHASSAGNOL
#'
#' @param theta a list with 3 entries (in the univariate case, `mu` and `sigma` reduce to vectors of size \eqn{k}):
#' * The proportions `p`: \eqn{p} of each component (each must lie between 0 and 1, and they must sum to one overall)
#' * The mean matrix `mu`: \eqn{\mathrm{\mu}=(\mu_{i,j}) \in \mathbb{R}^{D \times k}}, with each column
#' giving the mean values of the variables within a given component
#' * The 3-dimensional covariance array `Sigma`: \eqn{\mathrm{\Sigma}=(\Sigma_{i,j,l}) \in \mathbb{R}^{D \times D \times k}}, with each matrix
#' \eqn{\Sigma_{..l}, l \in \{ 1, \ldots, k\}} storing the covariance matrix of a given component,
#' whose diagonal terms correspond to the variances of the variables, and whose off-diagonal elements give the covariances between pairs of variables
#' @param n the number of observations to be drawn
#' @param prop_outliers,interval the first argument is the proportion of outliers, drawn from a uniform distribution;
#' its support spans the 0.01 to 0.99 quantiles of the extreme components, enlarged on each side by
#' `interval` times the distance between the 0.05 and 0.95 quantiles of these components
#'
#' @return Depending on the univariate or multivariate context:
#' * a list with the number of components `k`, the true parameters `p`, `mu`, `sigma`,
#' the observed variables `x`, the hidden component labels `s` and an indicator of the outliers `s_outliers` (outlying points are flagged with 0)
#' * a list with the number of components `k`, the true parameters `p`, `mu`, `sigma`,
#' the observations `x` living in space \eqn{\mathbb{R}^D}, and the hidden component labels `s` (vector of size \eqn{n})
#'
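#' @examples
#' # A minimal sketch of usage, relying on the default two-component
#' # parametrisation shown in the signature below, with 5% of outliers added:
#' sim <- simulate_univariate_GMM(
#'   theta = list(p = c(0.4, 0.6), mu = c(175, 165), sigma = c(10, 12)),
#'   n = 200, prop_outliers = 0.05
#' )
#' table(sim$s_outliers) # outlying observations are flagged with label 0
#'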
#' @export


simulate_univariate_GMM <- function(theta = list(p = c(0.40, 0.60), mu = c(175, 165), sigma = c(10, 12)), n = 100,
                                    prop_outliers = 0, interval = 2) {
  p <- theta$p
  k <- length(p)
  # deal with underflow or removal of components
  stopifnot(check_parameters_validity_univariate(theta, k = k))
  # get values from theta parameter
  mu <- theta$mu
  sigma <- theta$sigma


  # generate hidden variables s set
  s <- sample(1:k, size = n, replace = TRUE, prob = p)

  # generate observed variable set
  x <- stats::rnorm(n, mean = mu[s], sd = sigma[s])

  # select randomly prop_outliers * n points to be drawn from a uniform distribution
  # whose support extends well beyond the extreme quantiles of the mixture components
  outliers_indexes <- sample(1:n, size = round(prop_outliers * n), replace = FALSE)
  s_outliers <- s
  s_outliers[outliers_indexes] <- 0

  # choose a support wide enough to cover both extreme components
  min_component <- which.min(sapply(1:k, function(j) stats::qnorm(0.05, mean = mu[j], sd = sigma[j])))
  max_component <- which.max(sapply(1:k, function(j) stats::qnorm(0.95, mean = mu[j], sd = sigma[j])))
  length_interval <- interval * (stats::qnorm(0.95, mean = mu[max_component], sd = sigma[max_component]) -
    stats::qnorm(0.05, mean = mu[min_component], sd = sigma[min_component]))
  x[outliers_indexes] <- runif(
    n = round(prop_outliers * n),
    min = stats::qnorm(0.01, mean = mu[min_component], sd = sigma[min_component]) - length_interval,
    max = stats::qnorm(0.99, mean = mu[max_component], sd = sigma[max_component]) + length_interval
  )


  return(list(k = k, p = p, mu = mu, sigma = sigma, x = x, s = s, s_outliers = s_outliers))
}


#' @rdname simulate_univariate_GMM
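#' @examples
#' # A hedged multivariate sketch: two bivariate components with identity
#' # covariance matrices (the formats follow the `theta` description above)
#' theta_multi <- list(
#'   p = c(0.5, 0.5),
#'   mu = matrix(c(0, 0, 4, 4), nrow = 2),
#'   sigma = array(c(diag(2), diag(2)), dim = c(2, 2, 2))
#' )
#' sim_multi <- simulate_multivariate_GMM(theta_multi, n = 100)
#' dim(sim_multi$x) # 100 observations living in R^2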
#' @export



simulate_multivariate_GMM <- function(theta, n = 500) {
  ##################################################################
  ##                        check validity                        ##
  ##################################################################

  p <- theta$p
  mu <- theta$mu
  sigma <- theta$sigma
  k <- length(p) # get values from theta parameter
  dimension_gaussian <- dim(mu)[1]


  stopifnot(
    "Parameters provided do not correspond to a fitted GMM estimation" =
      check_parameters_validity_multivariate(theta) == TRUE
  )


  #################################################################
  ##                    generate observations                    ##
  #################################################################
  # generate hidden variables s set
  s <- sample(1:k, size = n, replace = TRUE, prob = p)

  # pre-allocate the observation matrix, as mvrnorm does not support component-wise indexing
  x <- matrix(0, nrow = n, ncol = dimension_gaussian)

  # generate observed variable set, with each observation drawn from a
  # multivariate GMM indexed by vector s
  for (i in 1:n) {
    x[i, ] <- MASS::mvrnorm(
      n = 1, mu = mu[, s[i]], Sigma = sigma[, , s[i]],
      tol = 1e-12, empirical = FALSE
    )
  }

  return(list(k = k, p = p, mu = mu, sigma = sigma, x = x, s = s))
}


#'  Return initial estimates to the EM algorithm for GMM estimation
#'
#'  One of the main drawbacks of the EM algorithm is that it requires an initial guess as a starting point.
#'  Hence, a careful initialisation, depending on the properties of the mixture, is required:
#'  * `initialize_em_univariate` returns the initial estimates in the univariate context.
#'  * `initialize_em_multivariate` returns the initial estimates in a multivariate context. It is worth noting
#'  that the *quantiles* initialisation method is not available in the multivariate context, as
#'  no unique parametrisation can be derived from multivariate quantiles.
#'
#' @author Bastien CHASSAGNOL
#'
#' @param x the vector of the observations
#' @param k the number of components
#' @param nstart the number of random restarts with kmeans, random and small EM method
#' @param prior_prob minimal uncertainty added to the minor components of each observation assigned by hierarchical clustering
#' @param short_iter,short_eps hyperparameters of the small EM method
#' @param initialisation_algorithm the choice of the initialisation method, among the kmeans, quantiles, random, hc, small em and rebmix methods
#' @param ... additional hyperparameters supplied with some of the initialisation methods
#'
#' @return a list of the estimated parameters, ordered by increasing mean for identifiability issues
#'
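#' @examples
#' # A brief sketch comparing two initialisation strategies on the same
#' # simulated two-component sample (using the defaults of simulate_univariate_GMM):
#' sim <- simulate_univariate_GMM(n = 200)
#' initialize_em_univariate(x = sim$x, k = 2, initialisation_algorithm = "kmeans")
#' initialize_em_univariate(x = sim$x, k = 2, initialisation_algorithm = "quantiles")
#'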
#' @export

initialize_em_univariate <- function(x = NULL, k = 2, nstart = 10L, short_iter = 200, short_eps = 10^-2, prior_prob = 0.05,
                                     initialisation_algorithm = c("kmeans", "quantiles", "random", "hc", "small em", "rebmix"), ...) {
  if (is.null(x)) {
    stop("Please provide data for x.")
  }

  initialisation_algorithm <- match.arg(
    initialisation_algorithm,
    c("kmeans", "quantiles", "random", "hc", "small em", "rebmix")
  )
  n <- length(x)

  ### initialize parametrization
  if (initialisation_algorithm == "kmeans") {
    fit <- stats::kmeans(x = x, centers = k,
                         nstart = nstart, iter.max = short_iter, algorithm = "Hartigan-Wong")
    estimated_theta <- list(
      p = fit$size / n,
      mu = as.vector(fit$centers),
      sigma = sqrt(fit$withinss / fit$size)
    )
  } else if (initialisation_algorithm == "quantiles") {
    fit <- bgmm::init.model.params(X = x, k = k, method = "all")
    estimated_theta <- list(
      p = fit$pi,
      mu = as.vector(fit$mu),
      sigma = sqrt(as.vector(fit$cvar))
    )
  } else if (initialisation_algorithm == "random") {
    all_logs <- lapply(1:nstart, function(y) {
      fit <- EMCluster::simple.init(as.matrix(x), nclass = k)
      logLikelihood_per_random <- EMCluster::logL(as.matrix(x), fit)
      return(list(parameters = fit, logLikelihood = logLikelihood_per_random))
    })

    best_model <- which.max(all_logs %>% purrr::map_dbl("logLikelihood"))
    fit <- purrr::map(all_logs, "parameters")[[best_model]]
    estimated_theta <- list(
      p = fit$pi,
      mu = as.vector(fit$Mu),
      sigma = sqrt(as.vector(fit$LTSigma))
    )
  } else if (initialisation_algorithm == "hc") {
    clustering_hc <- mclust::hcV(data = x, minclus = 1)
    s <- as.vector(mclust::hclass(clustering_hc, G = k))
    if (any(table(s) == 1)) {
      # degenerate case, often caused by the presence of an outlier. In that situation,
      # supervised estimation becomes impossible, and we replace it with an ad-hoc
      # technique, artificially adding a small probability mass to each cluster assignment
      z <- mclust::unmap(s) # a n * k table, acting as indicator of belonging for each cluster
      z[z == 1] <- 1 - prior_prob * (k - 1)
      z[z == 0] <- prior_prob # non-assigned class get by default a minimal probability value
      # generate first iteration, starting with the M-step of the EM algorithm
      fit <- mclust::meV(
        data = x, z = z, prior = NULL,
        control = mclust::emControl(itmax = 1, equalPro = FALSE)
      )
      estimated_theta <- list(p = fit$parameters$pro, mu = fit$parameters$mean, sigma = sqrt(fit$parameters$variance$sigmasq))
    } else {
      estimated_theta <- estimate_supervised_univariate_GMM(x = x, k = k, s = s, s_outliers = s)
      # if all groups have at least two observations, we can perform supervised estimation
    }
  } else if (initialisation_algorithm == "small em") {
    all_logs <- lapply(1:nstart, function(y) {
      # take some random points using EMCluster simple method
      start <- EMCluster::simple.init(as.matrix(x), nclass = k)
      # small runs of EM from mixtools package, as implementing the true EM algorithm
      fit <- em_mixtools_univariate(
        x = x,
        start = list(p = start$pi, mu = as.vector(start$Mu), sigma = sqrt(as.vector(start$LTSigma))),
        short_eps = short_eps, short_iter = short_iter, k = k
      )

      logLikelihood_per_em <- predict_posterior_probability(x, fit)$loglik
      return(list(parameters = fit, logLikelihood = logLikelihood_per_em))
    })

    best_model <- which.max(all_logs %>% purrr::map_dbl("logLikelihood"))
    estimated_theta <- purrr::map(all_logs, "parameters")[[best_model]]
  } else if (initialisation_algorithm == "rebmix") {
    EM_control <- methods::new("EM.Control",
      strategy = "exhaustive", variant = "EM", acceleration = "fixed",
      acceleration.multiplier = 1, tolerance = short_eps, maximum.iterations = 1
    )

    invisible(utils::capture.output(fit <- rebmix::REBMIX(
      Dataset = list(data.frame(x)), Preprocessing = "kernel density estimation",
      Restraints = "loose", cmax = k + 1, cmin = k, Criterion = "BIC", pdf = "normal", model = "REBMIX", EMcontrol = EM_control
    )))

    estimated_theta <- list(
      p = unlist(fit@w),
      mu = fit@Theta[[1]][grep("^theta1\\.", names(fit@Theta[[1]]))] %>% unlist() %>% unname(),
      sigma = fit@Theta[[1]][grep("^theta2\\.", names(fit@Theta[[1]]))] %>% unlist() %>% unname()
    )
  }

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(estimated_theta)
  return(ordered_estimated_theta)
}

#' @importClassesFrom rebmix EM.Control
#' @rdname initialize_em_univariate
#' @export
initialize_em_multivariate <- function(x, k = 2, nstart = 10L, short_iter = 200, short_eps = 10^-2, prior_prob = 0.05,
                                       initialisation_algorithm = c("kmeans", "random", "hc", "small em", "rebmix"), ...) {
  initialisation_algorithm <- match.arg(
    initialisation_algorithm,
    c("kmeans", "random", "hc", "small em", "rebmix")
  )
  n <- nrow(x) # number of observations
  dim_gaussian <- ncol(x) # dimension of the observation space

  ### initialize parametrization
  if (initialisation_algorithm == "kmeans") {
    fit <- stats::kmeans(x = x, centers = k, nstart = nstart,
                         iter.max = short_iter, algorithm = "Hartigan-Wong")
    estimated_theta <- estimate_supervised_multivariate_GMM(x = x, s = fit$cluster, k = k)
  } else if (initialisation_algorithm == "random") {
    # all_logs <- lapply(1:nstart, function(y) {
    #   fit <- EMCluster::simple.init(x, nclass = k)
    #   logLikelihood_per_random <- EMCluster::logL(x, fit)
    #   return(list(parameters = fit, logLikelihood = logLikelihood_per_random))
    # })
    # best_model <- which.max(all_logs %>% purrr::map_dbl("logLikelihood"))
    # fit <- purrr::map(all_logs, "parameters")[[best_model]]

    EMC <- EMCluster::.EMC.Rnd
    EMC$n.candidate <- nstart
    fit <- EMCluster::rand.EM(x, nclass = k, min.n = 2, EMC = EMC)
    estimated_theta <- list(
      p = fit$pi, mu = t(fit$Mu),
      sigma = trig_mat_to_array(fit$LTSigma)
    )
  } else if (initialisation_algorithm == "hc") {
    clustering_hc <- mclust::hcVVV(data = x, minclus = 1, ...)
    s <- c(mclust::hclass(clustering_hc, G = k))

    if (any(table(s) == 1)) {
      # degenerate case, often caused by the presence of an outlier. In that situation,
      # supervised estimation becomes impossible, and we replace it with an ad-hoc
      # technique, artificially adding a small probability mass to each cluster assignment
      dir.create("./errors", showWarnings = FALSE, recursive = TRUE)
      saveRDS(list("distribution" = x, "k" = k), "./errors/hc_clustering_error.rds")
      z <- mclust::unmap(s) # a n * k table, acting as indicator of belonging for each cluster
      z[z == 1] <- 1 - prior_prob * (k - 1)
      z[z == 0] <- prior_prob # non-assigned class get by default a minimal probability value
      # generate first iteration, starting with the M-step of the EM algorithm
      fit <- mclust::meVVV(
        data = x, z = z, prior = NULL,
        control = mclust::emControl(itmax = 1, equalPro = FALSE)
      )
      estimated_theta <- list(p = fit$parameters$pro, mu = fit$parameters$mean, sigma = fit$parameters$variance$sigma)
    } else {
      estimated_theta <- estimate_supervised_multivariate_GMM(x = x, s = s, k = k)
      # if all groups have at least two observations, we can perform supervised estimation
    }
  } else if (initialisation_algorithm == "small em") {
    all_logs <- lapply(1:nstart, function(y) {
      # take some random points using EMCluster simple method
      start <- EMCluster::simple.init(as.matrix(x), nclass = k)
      # small runs of EM from mixtools package, as implementing the true EM algorithm
      fit <- em_mixtools_multivariate(
        x = x,
        start = list(p = start$pi, mu = t(start$Mu), sigma = trig_mat_to_array(start$LTSigma)),
        short_eps = short_eps, short_iter = short_iter, k = k
      )
      logLikelihood_per_em <- predict_posterior_probability(x, fit)$loglik
      return(list(parameters = fit, logLikelihood = logLikelihood_per_em))
    })

    best_model <- which.max(all_logs %>% purrr::map_dbl("logLikelihood"))
    estimated_theta <- purrr::map(all_logs, "parameters")[[best_model]]
  } else if (initialisation_algorithm == "rebmix") {
    EM_control <- methods::new("EM.Control",
      strategy = "exhaustive", variant = "EM", acceleration = "fixed",
      acceleration.multiplier = 1, tolerance = short_eps, maximum.iterations = 1
    )

    # one step of EM is required, just for the computation of the estimated parameters
    suppressMessages(fit <- rebmix::REBMIX(
      model = "REBMVNORM", Dataset = list(data.frame(x)),
      Preprocessing = "kernel density estimation", cmin = k - 1, cmax = k + 1,
      Restraints = "loose", Criterion = "BIC", EMcontrol = EM_control
    ))

    mu <- matrix(fit@Theta[[1]][grep("^theta1\\.", names(fit@Theta[[1]]))] %>% unlist() %>% unname(),
                 nrow = dim_gaussian, ncol = k
  ) # estimates are returned as one long vector

    sigma <- array(fit@Theta[[1]][seq(3, 3 * k, by = 3)] %>% unlist() %>% unname(),
                   dim = c(dim_gaussian, dim_gaussian, k)
    )
    estimated_theta <- list(p = unlist(fit@w), mu = mu, sigma = sigma)
  }

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(estimated_theta)
  # stop in case of bad implementation
  if (!check_parameters_validity_multivariate(ordered_estimated_theta)) {
    dir.create("./errors/initialisation_failures", showWarnings = FALSE, recursive = TRUE)
    saveRDS(
      object = list(x = x, algo = initialisation_algorithm, k=k),
      file = glue::glue("./errors/initialisation_failures/init_algo_{initialisation_algorithm}_error.rds")
    )
    # stop("Parameters learnt from the initiation algorithm are inconsistent with the number of clusters required,
    #          likely to come from rebmix overestimating the number of clusters, or from degenerate random.\n")
  }
  return(ordered_estimated_theta)
}





#' @title Custom R implementation of the EM algorithm in the univariate context
#'
#' @author Bastien CHASSAGNOL
#'
#' @param x the vector of the observations
#' @param k the number of components
#' @param itmax the maximal number of iterations to reach the threshold
#' @param epsilon the criterion threshold considered as the tolerance between two consecutive log-likelihoods
#' @param start a list of initial estimates provided by the user, with 3 entries (in the univariate case, `mu` and `sigma` reduce to vectors of size \eqn{k}):
#' * The proportions `p`: \eqn{p} of each component (each must lie between 0 and 1, and they must sum to one overall)
#' * The mean matrix `mu`: \eqn{\mathrm{\mu}=(\mu_{i,j}) \in \mathbb{R}^{D \times k}}, with each column
#' giving the mean values of the variables within a given component
#' * The 3-dimensional covariance array `Sigma`: \eqn{\mathrm{\Sigma}=(\Sigma_{i,j,l}) \in \mathbb{R}^{D \times D \times k}}, with each matrix
#' \eqn{\Sigma_{..l}, l \in \{ 1, \ldots, k\}} storing the covariance matrix of a given component,
#' whose diagonal terms correspond to the variances of the variables, and whose off-diagonal elements give the covariances between pairs of variables
#' @param initialisation_algorithm,nstart hyper-parameters, when the user rather uses
#' one of our implemented initialization algorithms
#' @param parallel only relevant for the GMKMcharlie package, which has a native parallel implementation (by default, it uses half of the available cores)
#' @param minprior minimum prior probability of the clusters; components falling below this threshold are removed during the iterations
#' @param ... additional parameters for the reviewed packages
#'
#' @return a list of the estimated parameters, ordered by increasing mean for identifiability issues
#'
#' @importFrom stats IQR dnorm median quantile runif sd var
#'
#'
#' @seealso [emnmix_multivariate()]
#'
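#' @examples
#' # A minimal sketch on simulated data; the em_* wrappers documented here
#' # share the same interface, so any of them can be swapped in:
#' sim <- simulate_univariate_GMM(n = 300)
#' emnmix_univariate(x = sim$x, k = 2, initialisation_algorithm = "kmeans")
#' em_mclust_univariate(x = sim$x, k = 2, initialisation_algorithm = "kmeans")
#'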
#' @export

emnmix_univariate <- function(x, k, epsilon = 10^-4, itmax = 500, nstart = 10L, start = NULL,
                              initialisation_algorithm = "kmeans", ...) {

  ### retrieve the initial configuration
  x <- as.vector(as.matrix(x))
  n <- length(x) # get number of observations

  # get initial estimates if not provided by the user
  if (is.null(start)) {
    start <- initialize_em_univariate(
      x = x, k = k, nstart = nstart, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }
  p <- start$p
  mu <- start$mu
  sigma <- start$sigma
  old_loglik <- -Inf

  for (iter in 1:itmax) {
    ###  E-step  ###
    # compute the posterior probabilities
    expectation_results <- predict_posterior_probability(x, list(p = p, mu = mu, sigma = sigma))
    eta <- expectation_results$eta
    new_loglik <- expectation_results$loglik

    ### M-step ###
    # update component's weights
    S0 <- apply(eta, 2, sum)
    if (any(S0 < .Machine$double.eps)) {
      stop(glue::glue("Component {which(S0 <.Machine$double.eps)} has been removed."))
    }
    p <- S0 / n

    # update parameter vector zeta per component
    parametric_parameters <- apply(eta, 2, function(w) {
      stats::cov.wt(as.matrix(x), wt = w, cor = F, method = "ML", center = T)
    })
    mu <- purrr::map_dbl(parametric_parameters, "center")
    sigma <- sqrt(purrr::map_dbl(parametric_parameters, "cov"))

    # deal with underflow or removal of components
    if (!check_parameters_validity_univariate(list(p = p, mu = mu, sigma = sigma), k = k)) {
      break ## early stop in case the algorithm is trapped in the boundary space
    } else if (abs(new_loglik - old_loglik) < epsilon) { # early stop if the criterion threshold is met
      break
    }
    old_loglik <- new_loglik # update the newly computed log-likelihood
  }

  ordered_estimated_theta <- enforce_identifiability(list(p = p, mu = mu, sigma = sigma)) # return a unique ordered parameter configuration
  return(ordered_estimated_theta)
}


#' @title Custom R implementation of the EM algorithm in the multivariate context
#'
#' @author Bastien CHASSAGNOL
#'
#' @param x the vector of the observations
#' @param k the number of components
#' @param itmax the maximal number of iterations to reach the threshold
#' @param epsilon the criterion threshold considered as the tolerance between two consecutive log-likelihoods
#' @param start a list of initial estimates provided by the user, with 3 entries:
#' * The proportions `p`: \eqn{p} of each component (each must lie between 0 and 1, and they must sum to one overall)
#' * The mean matrix `mu`: \eqn{\mathrm{\mu}=(\mu_{i,j}) \in \mathbb{R}^{D \times k}}, with each column
#' giving the mean values of the variables within a given component
#' * The 3-dimensional covariance array `Sigma`: \eqn{\mathrm{\Sigma}=(\Sigma_{i,j,l}) \in \mathbb{R}^{D \times D \times k}}, with each matrix
#' \eqn{\Sigma_{..l}, l \in \{ 1, \ldots, k\}} storing the covariance matrix of a given component,
#' whose diagonal terms correspond to the variances of the variables, and whose off-diagonal elements give the covariances between pairs of variables
#' @param initialisation_algorithm,nstart hyper-parameters, when the user rather uses
#' one of our implemented initialization algorithms
#' @param parallel only relevant for the GMKMcharlie package, which has a native parallel implementation (by default, it uses half of the available cores)
#' @param minprior minimum prior probability of the clusters; components falling below this threshold are removed during the iterations
#' @param ... additional parameters for the reviewed packages
#'
#' @return a list of the estimated parameters, ordered by partial ordering
#' on their respective mean components for identifiability issues
#'
#' @seealso [emnmix_univariate()] for the corresponding set of algorithms able to perform the EM estimation in the
#' fully unconstrained case, in the univariate context
#'
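#' @examples
#' # A hedged sketch on a simulated bivariate two-component mixture; the
#' # multivariate em_* wrappers documented here share this interface:
#' theta <- list(
#'   p = c(0.5, 0.5),
#'   mu = matrix(c(0, 0, 4, 4), nrow = 2),
#'   sigma = array(c(diag(2), diag(2)), dim = c(2, 2, 2))
#' )
#' sim <- simulate_multivariate_GMM(theta, n = 300)
#' emnmix_multivariate(x = sim$x, k = 2, initialisation_algorithm = "kmeans")
#'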
#' @export
emnmix_multivariate <- function(x, k, epsilon = 10^-4, itmax = 500, nstart = 10L, start = NULL,
                                initialisation_algorithm = "kmeans", ...) {

  ### retrieve the initial configuration
  n <- nrow(x)
  dim_gaussian <- ncol(x)

  # get initial estimates if not provided by the user
  if (is.null(start)) {
    start <- initialize_em_multivariate(
      x = x, k = k, nstart = nstart, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }
  p <- start$p
  mu <- start$mu
  sigma <- start$sigma
  old_loglik <- -Inf

  for (iter in 1:itmax) {
    ###  E-step  ###
    # compute the posterior probabilities
    expectation_results <- predict_posterior_probability(x, list(p = p, mu = mu, sigma = sigma))
    eta <- expectation_results$eta
    new_loglik <- expectation_results$loglik

    ### M-step ###

    # update component's weights (similar formula, no matter the model)
    S0 <- apply(eta, 2, sum)
    if (any(S0 < .Machine$double.eps)) {
      stop(glue::glue("Component {which(S0 <.Machine$double.eps)} has been removed."))
    }
    p <- S0 / n

    # update parameter vector theta
    mu <- matrix(0, nrow = dim_gaussian, ncol = k)
    sigma <- array(0, dim = c(dim_gaussian, dim_gaussian, k))

    for (j in 1:k) {
      mu[, j] <- apply(x, 2, stats::weighted.mean, eta[, j])
      sigma[, , j] <- stats::cov.wt(x, wt = eta[, j], cor = F, center = T, method = "ML")$cov # ML is the maximum likelihood estimator, not the unbiased one
    }

    # deal with underflow or removal of components
    if (!check_parameters_validity_multivariate(list(p = p, mu = mu, sigma = sigma), k = k)) {
      break ## early stop in case the algorithm is trapped in the boundary space
    } else if (abs(new_loglik - old_loglik) < epsilon) { # early stop if the criterion threshold is met
      break
    }
    old_loglik <- new_loglik # update the newly computed log-likelihood
  }

  ordered_estimated_theta <- enforce_identifiability(list(p = p, mu = mu, sigma = sigma)) # return a unique ordered parameter configuration
  return(ordered_estimated_theta)
}



#' @rdname emnmix_univariate
#' @importClassesFrom Rmixmod Strategy GaussianParameter
#' @export
em_Rmixmod_univariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans",
                                  epsilon = 10^-4, itmax = 500, start = NULL, ...) {

  # initialization section
  if (is.null(start)) {
    start <- initialize_em_univariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  # define strategy
  strategy_Rmixmod <- methods::new("Strategy",
    algo = "EM", nbTryInInit = 10L,
    initMethod = "parameter", nbIterationInAlgo = itmax,
    epsilonInAlgo = epsilon,
    parameter = methods::new("GaussianParameter",
      proportions = start$p,
      mean = as.matrix(start$mu),
      variance = lapply(start$sigma, function(x) as.matrix(x^2))
    )
  )

  # fit the model
  fit <- Rmixmod::mixmodCluster(
    data = as.vector(x), nbCluster = k, dataType = "quantitative",
    strategy = strategy_Rmixmod
  )@bestResult@parameters

  fit <- list(
    p = fit@proportions,
    mu = as.vector(fit@mean),
    sigma = sqrt(as.vector(unlist(fit@variance)))
  )
  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)
  return(ordered_estimated_theta)
}


#' @rdname emnmix_multivariate
#' @export
em_Rmixmod_multivariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans",
                                    epsilon = 10^-4, itmax = 500, start = NULL, ...) {

  # initialization section
  if (is.null(start)) {
    start <- initialize_em_multivariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  # define strategy
  strategy_Rmixmod <- methods::new("Strategy",
    algo = "EM", nbTryInInit = 10L,
    initMethod = "parameter", nbIterationInAlgo = itmax,
    epsilonInAlgo = epsilon,
    parameter = methods::new("GaussianParameter",
      proportions = start$p,
      mean = t(start$mu),
      variance = lapply(seq(dim(start$sigma)[3]), function(x) start$sigma[, , x])
    )
  )

  # fit the model
  unconstrained_model <- Rmixmod::mixmodGaussianModel(listModels = "Gaussian_pk_Lk_Ck") # the fully unconstrained model
  fit <- Rmixmod::mixmodCluster(
    data = x %>% as.data.frame(), nbCluster = k, dataType = "quantitative",
    strategy = strategy_Rmixmod, models = unconstrained_model
  )@bestResult@parameters

  fit <- list(
    p = fit@proportions,
    mu = t(fit@mean),
    sigma = fit@variance %>% simplify2array()
  )

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)
  return(ordered_estimated_theta)
}





#' @rdname emnmix_univariate
#' @export
em_EMCluster_univariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans",
                                    epsilon = 10^-4, itmax = 500, start = NULL, ...) {

  # initialization section
  if (is.null(start)) {
    start <- initialize_em_univariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }


  # fit the model
  fit <- EMCluster::emcluster(
    x = as.matrix(x),
    pi = start$p, Mu = as.matrix(start$mu), LTSigma = as.matrix(start$sigma^2), # LTSigma expects variances, not standard deviations
    EMC = EMCluster::.EMControl(em.iter = itmax, em.eps = epsilon)
  )

  fit <- list(
    p = fit$pi,
    mu = as.vector(fit$Mu),
    sigma = sqrt(as.vector(fit$LTSigma))
  )
  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)

  return(ordered_estimated_theta)
}

#' @rdname emnmix_multivariate
#' @export
em_EMCluster_multivariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans",
                                      epsilon = 10^-4, itmax = 500, start = NULL, ...) {

  # initialization section
  if (is.null(start)) {
    start <- initialize_em_multivariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  # fit the model
  fit <- EMCluster::emcluster(
    x = x,
    pi = start$p, Mu = t(start$mu), LTSigma = start$sigma %>% array_to_trig_mat(),
    EMC = EMCluster::.EMControl(em.iter = itmax, em.eps = epsilon)
  )

  fit <- list(
    p = fit$pi,
    mu = t(fit$Mu),
    sigma = fit$LTSigma %>% trig_mat_to_array()
  )
  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)

  return(ordered_estimated_theta)
}



#' @rdname emnmix_univariate
#' @export
em_bgmm_univariate <- function(x = x, k = 2, epsilon = 10^-4, itmax = 500,
                               initialisation_algorithm = "kmeans", start = NULL, ...) {

  # initialization section
  if (is.null(start)) {
    start <- initialize_em_univariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  # fit the model
  fit <- bgmm::unsupervised(
    X = x, k = k,
    init.params = list(
      pi = start$p,
      mu = as.matrix(start$mu),
      cvar = array(start$sigma^2, dim = c(k, 1, 1))
    ),
    stop.likelihood.change = epsilon, stop.max.nsteps = itmax, ...
  )

  fit <- list(
    p = fit$pi,
    mu = as.vector(fit$mu),
    sigma = sqrt(as.vector(fit$cvar))
  )

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)

  return(ordered_estimated_theta)
}

#' @rdname emnmix_multivariate
#' @export
em_bgmm_multivariate <- function(x = x, k = 2, epsilon = 10^-4, itmax = 500,
                                 initialisation_algorithm = "kmeans", start = NULL, ...) {

  # initialization section
  if (is.null(start)) {
    start <- initialize_em_multivariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  dim_gaussian <- ncol(x)
  cvar <- array(0, dim = c(k, dim_gaussian, dim_gaussian))
  # would be easier with reticulate function

  # convert from format D * D * k to k * D * D
  for (j in 1:k) {
    cvar[j, , ] <- start$sigma[, , j]
  }

  ### fit the model
  model_structure <- bgmm::getModelStructure(mean = "D", between = "D", within = "D", cov = "D") # default model, fully unconstrained
  fit <- bgmm::unsupervised(
    X = x, k = k,
    init.params = list(
      pi = start$p,
      mu = t(start$mu),
      cvar = cvar
    ),
    stop.likelihood.change = epsilon, stop.max.nsteps = itmax, model.structure = model_structure
  )

  sigma <- array(0, dim = c(dim_gaussian, dim_gaussian, k))
  # convert from format k * D * D to D * D * k
  for (j in 1:k) {
    sigma[, , j] <- fit$cvar[j, , ]
  }
  fit <- list(
    p = fit$pi,
    mu = t(fit$mu),
    sigma = sigma
  )

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)

  return(ordered_estimated_theta)
}


#' @rdname emnmix_univariate
#' @export
em_flexmix_univariate <- function(x = x, k = 2, epsilon = 10^-4, itmax = 500, minprior = 0.05,
                                  initialisation_algorithm = "kmeans", start = NULL, ...) {

  # initialization section
  if (is.null(start)) {
    start <- initialize_em_univariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }
  # predicted class from initial guess
  predicted_classes_original <- predict_posterior_probability(x, start)$eta

  # fit the model
  fit <- flexmix::flexmix(x ~ 1,
    k = k, cluster = predicted_classes_original,
    model = flexmix::FLXMCnorm1(),
    control = list(tolerance = epsilon, iter.max = itmax, verbose = 0, minprior = minprior)
  )

  fit <- list(
    p = fit@size / length(x),
    mu = flexmix::parameters(fit)["mean", ],
    sigma = flexmix::parameters(fit)["sd", ]
  )

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)
  return(ordered_estimated_theta)
}


#' @rdname emnmix_multivariate
#' @export
em_flexmix_multivariate <- function(x = x, k = 2, epsilon = 10^-4, itmax = 500, minprior = 0.05,
                                    initialisation_algorithm = "kmeans", start = NULL, ...) {

  # initialization section
  if (is.null(start)) {
    start <- initialize_em_multivariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  # flexmix requires posterior probabilities, instead of initial estimates
  dim_gaussian <- ncol(x)
  eta <- predict_posterior_probability(x, start)$eta

  # fit the model
  fit <- flexmix::flexmix(x ~ 1,
    k = k, cluster = eta,
    model = flexmix::FLXMCmvnorm(diagonal = F),
    control = list(tolerance = epsilon, iter.max = itmax, verbose = 0, minprior = minprior)
  )

  # adjust to the specific output of flexmix (d^2 * k)
  sigma <- array(0, dim = c(dim_gaussian, dim_gaussian, k))
  sigma_flexmix <- flexmix::parameters(fit)[grep("^cov", row.names(flexmix::parameters(fit))), ]
  for (j in 1:k) {
    sigma[, , j] <- matrix(sigma_flexmix[, j], nrow = dim_gaussian, ncol = dim_gaussian)
  }
  fit <- list(
    p = fit@size / nrow(x),
    mu = flexmix::parameters(fit)[grep("^center", row.names(flexmix::parameters(fit))), ],
    sigma = sigma
  )

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)

  return(ordered_estimated_theta)
}


#' @rdname emnmix_univariate
#' @export
em_mixtools_univariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans",
                                   epsilon = 10^-4, itmax = 500, start = NULL, ...) {
  # initialization section
  if (is.null(start)) {
    start <- initialize_em_univariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  fit <- mixtools::normalmixEM(
    x = x, lambda = start$p, mu = start$mu, sigma =
      start$sigma, k = k, epsilon = epsilon, maxit = itmax
  )
  fit <- list(p = fit$lambda, mu = fit$mu, sigma = fit$sigma)
  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)

  return(ordered_estimated_theta)
}

#' @rdname emnmix_multivariate
#' @export
em_mixtools_multivariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans",
                                     epsilon = 10^-4, itmax = 500, start = NULL, ...) {
  # initialization section
  if (is.null(start)) {
    start <- initialize_em_multivariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  invisible(utils::capture.output(fit <- mixtools::mvnormalmixEM(
    x = x, lambda = start$p, mu = start$mu %>% as.data.frame() %>% as.list(),
    sigma = lapply(seq(dim(start$sigma)[3]), function(x) start$sigma[, , x]),
    k = k, epsilon = epsilon, maxit = itmax, arbmean = T, arbvar = T
  )))

  fit <- list(p = fit$lambda, mu = fit$mu %>% simplify2array(), sigma = fit$sigma %>% simplify2array())

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)

  return(ordered_estimated_theta)
}

#' @rdname emnmix_univariate
#' @export
em_mclust_univariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans", start = NULL,
                                 epsilon = 10^-4, itmax = 500, ...) {

  # set the control parameters of the EM run
  control <- mclust::emControl(tol = epsilon, itmax = itmax)
  # initialization section
  if (is.null(start)) {
    start <- initialize_em_univariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  start_em <- list(pro = start$p, mean = start$mu, variance = list(sigmasq = start$sigma^2))
  estimated_theta <- mclust::emV(
    data = x, parameters = start_em,
    modelNames = "V", warn = FALSE, control = control
  )$parameters
  # return an ordered list by mean values
  fit <- list(
    p = estimated_theta$pro,
    mu = estimated_theta$mean,
    sigma = sqrt(estimated_theta$variance$sigmasq)
  )

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)
  return(ordered_estimated_theta)
}

#' @rdname emnmix_multivariate
#' @export
em_mclust_multivariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans", start = NULL,
                                   epsilon = 10^-4, itmax = 500, ...) {


  # initialization section
  if (is.null(start)) {
    start <- initialize_em_multivariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  dim_gaussian <- ncol(x)
  cholsigma <- array(0, dim = c(dim_gaussian, dim_gaussian, k))
  for (j in 1:k) {
    cholsigma[, , j] <- chol(start$sigma[, , j]) # the Cholesky decomposition is required as input by mclust's VVV model
  }

  start_em <- list(pro = start$p, mean = start$mu, variance = list(
    sigma = start$sigma,
    cholsigma = cholsigma
  ))

  control <- mclust::emControl(tol = epsilon, itmax = itmax) # set control parameters

  estimated_theta <- mclust::emVVV(
    data = x, parameters = start_em, warn = FALSE, control = control, prior = NULL
  )$parameters

  fit <- list(
    p = estimated_theta$pro,
    mu = estimated_theta$mean,
    sigma = estimated_theta$variance$sigma
  )

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)
  return(ordered_estimated_theta)
}



#' @rdname emnmix_univariate
#' @inheritParams GMKMcharlie::GM
#' @export
em_GMKMcharlie_univariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans", embedNoise = 1e-6,
                                      epsilon = 10^-4, itmax = 500, start = NULL, parallel = FALSE, ...) {
  # initialization section
  if (is.null(start)) {
    start <- initialize_em_univariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  available_processors <- ifelse(parallel, parallel::detectCores() %/% 2, 1)
  fit <- GMKMcharlie::GM(t(as.matrix(x)),
    embedNoise = embedNoise,
    alpha = start$p, mu = t(as.matrix(start$mu)), sigma = t(as.matrix(start$sigma^2)), G = k, # GM expects flattened covariances (here, variances), not standard deviations
    convergenceEPS = epsilon, maxIter = itmax, maxCore = available_processors, verbose = FALSE
  )

  fit <- list(
    p = fit$alpha,
    mu = fit$mu, sigma = sqrt(unlist(fit$sigma))
  )
  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)
  return(ordered_estimated_theta)
}



#' @rdname emnmix_multivariate
#' @inheritParams GMKMcharlie::GM
#' @export
em_GMKMcharlie_multivariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans", embedNoise = 1e-6,
                                        epsilon = 10^-4, itmax = 500, start = NULL, parallel = FALSE, ...) {
  # initialization section
  if (is.null(start)) {
    start <- initialize_em_multivariate(
      x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
      initialisation_algorithm = initialisation_algorithm, ...
    )
  }

  available_processors <- ifelse(parallel, parallel::detectCores() %/% 2, 1)
  fit <- GMKMcharlie::GM(t(x),
    alpha = start$p, mu = start$mu, embedNoise = embedNoise,
    sigma = lapply(seq(dim(start$sigma)[3]), function(x) start$sigma[, , x]), G = k,
    convergenceEPS = epsilon, maxIter = itmax, maxCore = available_processors, verbose = FALSE,
    checkInitialization = TRUE
  )
  fit <- list(
    p = fit$alpha,
    mu = fit$mu, sigma = fit$sigma %>% simplify2array()
  )

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)
  return(ordered_estimated_theta)
}



#' @rdname emnmix_multivariate
#' @inheritParams clustvarsel::clustvarsel
#' @import clustvarsel
#' @export
em_clustvarsel_multivariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans",
                                        epsilon = 10^-4, itmax = 500, back_steps=20, start = NULL, ...) {

  ###  Alternative process
  # first, retrieve the most relevant dimensions
  # kept_dimensions <- clustvarsel::clustvarsel(x, G = k, direction="backward", search="greedy",
  #                                             emModels2 = "VVV", hcModel = "VVV", allow.EEE = FALSE,
  #                                             verbose=FALSE, parallel=TRUE, forcetwo=FALSE, ...)$subset
  # # second, with the updated dataset, retrieve relevant dimensions
  # initial_theta <- initialize_em_multivariate(
  #   x = x[, kept_dimensions], k = k, nstart = 10L, itmax = 200, epsilon = 10^-2,
  #   initialisation_algorithm = initialisation_algorithm)
  # # third, perform the initialisation with mclust, the most widely used package
  # ordered_estimated_theta <- em_mclust_multivariate (x = x, k = 2, start = initial_theta,
  #                                                    itmax = itmax, epsilon = epsilon)
  #
  #

  # in the canonical process, the only initialisation possible is with hierarchical clustering
  # set parallel to FALSE to avoid redundancy with parallel clustering
  estimated_theta <- clustvarsel::clustvarsel(x, G = k, direction="backward", search="greedy",
                         emModels2 = "VVV", hcModel = "VVV", allow.EEE = FALSE,
                         verbose=FALSE, parallel=FALSE, forcetwo=TRUE, itermax=back_steps)$model$parameters

  fit <- list(
    p = estimated_theta$pro,
    mu = estimated_theta$mean,
    sigma = estimated_theta$variance$sigma)

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(fit)
  return(ordered_estimated_theta)
}


#' @rdname emnmix_multivariate
#' @export
em_HDclassif_multivariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans",
                                      epsilon = 10^-4, itmax = 500, start = NULL,
                                      kmeans.control=list(iter.max=200L, nstart=10L, algorithm="Hartigan-Wong"),
                                      mc.cores=getOption("mc.cores", parallel::detectCores())) {

  if (initialisation_algorithm == "kmeans") {
    fit <- HDclassif::hddc(x, K = k, model = c("AkjBkQkDk"), threshold = 0.2,
                           itermax = itmax, eps = epsilon,
                           algo = "EM", d_select = "Cattell", init = "kmeans",
                           show = FALSE,  scaling = FALSE, mc.cores=mc.cores, # alternatively
                           min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
                           nb.rep = 1, kmeans.control = kmeans.control, keepAllRes = F)
  } else if (initialisation_algorithm == "small-em") {
    fit <- HDclassif::hddc(x, K = k, model = c("AkjBkQkDk"), threshold = 0.2,
                            itermax = itmax, eps = epsilon,
                            algo = "EM", d_select = "Cattell", init = "mini-em",
                            show = FALSE,  scaling = FALSE, mc.cores=mc.cores, # alternatively
                            min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
                            nb.rep = 1, keepAllRes = F, mini.nb=c(10L, 200))
  } else if (initialisation_algorithm == "random") {
    fit <- HDclassif::hddc(x, K = k, model = c("AkjBkQkDk"), threshold = 0.2,
                           itermax = itmax, eps = epsilon,
                           algo = "EM", d_select = "Cattell", init = "param",
                           show = FALSE,  scaling = FALSE, mc.cores=mc.cores, # alternatively
                           min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
                           nb.rep = 1, keepAllRes = F)
  } else { # only the rebmix and hc initialisation methods remain
    start <- initialize_em_multivariate(x = x, k = k,
                                        nstart = 10L, short_iter = 200, short_eps = 10^-2,
                                        initialisation_algorithm = initialisation_algorithm)
    post_proba <- predict_posterior_probability(x, start)$eta
    predicted_classes <- apply(post_proba, MARGIN = 1, which.max)

    fit <- HDclassif::hddc(x, K = k, model = c("AkjBkQkDk"), threshold = 0.2,
                           itermax = itmax, eps = epsilon,
                           algo = "EM", d_select = "Cattell", init = "vector",
                           show = FALSE,  scaling = FALSE, mc.cores=mc.cores, # alternatively
                           min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
                           nb.rep = 1, keepAllRes = F, init.vector = predicted_classes)
  }

  # convert the HDDC parametrisation back to the original space
  p <- c(fit$prop)
  D <- ncol(fit$mu) # hddc returns mu as a k x D matrix, with one row per class
  mu <- t(fit$mu)
  sigma <- array(0, dim = c(D, D, k))
  for (j in 1:k) {
    d_j <- ncol(fit$Q[[j]]) # intrinsic dimension retained for component j
    # reconstruct the full covariance matrix: Q diag(a - b) Q^T + b * I
    sigma[, , j] <- fit$Q[[j]] %*% diag(c(fit$a[j, 1:d_j] - fit$b[j]), nrow = d_j) %*% t(fit$Q[[j]]) + fit$b[j] * diag(D)
  }

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(list(p = p, mu = mu, sigma = sigma))
  return(ordered_estimated_theta)
}




#' @rdname emnmix_multivariate
#' @export
em_pgmm_multivariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans",
                                 epsilon = 10^-4, itmax = 500, aic_acc=0.1, start = NULL,
                                 arguments_HDclassif=list(init= "kmeans",
                                                          kmeans.control=list(iter.max=200L, nstart=10L,
                                                                              algorithm="Hartigan-Wong"),
                                                          mc.cores=getOption("mc.cores",
                                                                             parallel::detectCores()))) {

  ###  way too long to compute all models

  # D <- ncol(x); # as stated in Bouveyron 2007, minimal dimension onto which to project
  # d_j <- min(D, min(quadraticRoots(a=1, b=-(1 + 2*D), c=D^2 - D))) %>% trunc()
  # d_j <- ifelse((D-d_j)^2==D + d_j, d_j-1, d_j)

  d_j <- do.call(HDclassif::hddc, c(list(data=x, K = k, model = c("AkjBkQkD"), threshold = 0.2,
                                         itermax = itmax, eps = epsilon, algo = "EM", d_select = "Cattell",
                                         show = FALSE,  scaling = FALSE,
                                         min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
                                         nb.rep = 1, keepAllRes = F), arguments_HDclassif)) %>%
    magrittr::extract2("d") %>% mean()

  fit <- pgmm::pgmmEM(x, rG = k, rq = d_j, class = NULL, #  1:d_j
                      icl = FALSE, zstart = 2,
                      modelSubset = "UUU", tol = aic_acc) # pgmmEM relies on Aitken acceleration, whose tolerance lives on a different scale than epsilon


  # convert back to the original space
  eta <- fit$zhat
  n <- nrow(x)
  D <- ncol(x)
  p <- numeric(k)
  mu <- matrix(0, nrow = D, ncol = k)
  sigma <- array(0, dim = c(D, D, k))
  for (j in 1:k) {
    p[j] <- sum(eta[, j]) / n
    mu[, j] <- apply(x, 2, stats::weighted.mean, eta[, j])
    sigma[, , j] <- fit$load[[j]] %*% t(fit$load[[j]]) + fit$noisev[[j]]
  }

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(list(p = p, mu = mu, sigma = sigma))
  return(ordered_estimated_theta)
}

#' @rdname emnmix_multivariate
#' @export
em_EMMIXmfa_multivariate <- function(x = x, k = 2, initialisation_algorithm = "kmeans", conv_measure = 'diff',
                                     epsilon = 10^-4, itmax = 500, start = NULL, nkmeans = 10L,
                                     arguments_HDclassif=list(init= "kmeans",
                                                              kmeans.control=list(iter.max=200L, nstart=10L,
                                                                                  algorithm="Hartigan-Wong"),
                                                              mc.cores=getOption("mc.cores", parallel::detectCores()))) {

  # getOption("mc.cores", parallel::detectCores()): set it to one to avoid redundancy with an outer cluster
  ### use HDclassif to retrieve the intrinsic dimension
  # since this dimension can be specific to each class, we take the mean (we might instead
  # weight it by the relative abundance of each class)
  d_j <- do.call(HDclassif::hddc, c(list(data=x, K = k, model = c("AkjBkQkD"), threshold = 0.2,
                         itermax = itmax, eps = epsilon, algo = "EM", d_select = "Cattell",
                         show = FALSE,  scaling = FALSE,
                         min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
                         nb.rep = 1, keepAllRes = F), arguments_HDclassif))  %>%
    magrittr::extract2("d") %>% mean()

  # the fit itself
  if (initialisation_algorithm == "kmeans") {
    suppressWarnings(suppressMessages(invisible(utils::capture.output(
      fit <- EMMIXmfa::mcfa(Y=x, g=k, q=d_j, itmax = itmax, nkmeans = 1,
                            tol = epsilon, conv_measure = conv_measure, warn_messages = FALSE)))))
  } else if (initialisation_algorithm == "random") {
    suppressWarnings(suppressMessages(invisible(utils::capture.output(
      fit <- EMMIXmfa::mcfa(Y=x, g=k, q=d_j, itmax = itmax, init_method = "rand-A", nrandom=10L,
                            tol = epsilon, conv_measure = conv_measure, warn_messages = FALSE, nkmeans = 1)))))
  } else {
    start <- initialize_em_multivariate(x = x, k = k,
                                        nstart = 10L, itmax = 200, epsilon = 10^-2,
                                        initialisation_algorithm = initialisation_algorithm)
    post_proba <- predict_posterior_probability(x, start)$eta
    predicted_classes <- apply(post_proba, MARGIN = 1, which.max)

    suppressWarnings(suppressMessages(invisible(utils::capture.output(
      fit <- EMMIXmfa::mcfa(Y=x, g=k, q=d_j, itmax = itmax, init_clust = predicted_classes %>% as.factor(),
                            tol = epsilon, conv_measure = conv_measure, warn_messages = FALSE, nkmeans = 1)))))
  }




  # convert back to the original space: mcfa parametrises the means as A %*% xi
  # and each component's covariance as A %*% omega_j %*% t(A) + D
  p <- c(fit$pivec)
  mu <- fit$A %*% fit$xi
  D <- ncol(x)
  sigma <- array(0, dim = c(D, D, k))
  for (j in 1:k) {
    sigma[, , j] <- fit$A %*% fit$omega[, , j] %*% t(fit$A) + fit$D
  }

  ### return a unique ordered parameter configuration
  ordered_estimated_theta <- enforce_identifiability(list(p = p, mu = mu, sigma = sigma))
  return(ordered_estimated_theta)
}


#' @rdname emnmix_univariate
#' @export
em_otrimle <- function(x = x, k = 2, initialisation_algorithm = "kmeans",
                       epsilon = 10^-4, itmax = 500, ...) {
  if (initialisation_algorithm == "hc") {
    start <- otrimle::InitClust(as.matrix(x), G = 2, modelName = "V")
  } else {
    stop("This initialisation method is not enabled with otrimle package.")
  }
  avalaible_processors <- parallel::detectCores() %/% 2
  fit <- otrimle::otrimle(
    data = as.matrix(x), G = k, initial = start,
    iter.max = itmax, tol = epsilon, ncores = avalaible_processors, monitor = FALSE
  )

  # return an ordered list by mean values + proportion of outliers
  ordered_estimated_theta <- list(
    p = fit$pi[-1][order(fit$mean)],
    mu = sort(fit$mean),
    sigma = sqrt(fit$cov[order(fit$mean)]),
    prop_extra = fit$exproportion
  )
  ordered_estimated_theta <- ordered_estimated_theta %>% purrr::map(unname)
  return(ordered_estimated_theta)
}


#' Estimate the parameters in the supervised case (all labels associated with the observations are available)
#'
#' @author Bastien CHASSAGNOL
#'
#' @param x the univariate or multivariate distribution of observed variables
#' @param s the index of the clusters, ranging from 1 to \eqn{k}, the number of clusters
#' @param s_outliers the vector of labels where outlying points are flagged with 0 (defaults to `s`)
#' @param k the number of classes (by default, the number of unique values within the indicator vector)
#'
#' @return a list with three entries, respectively the proportions, the means, and the variances / covariances
#'
#' @seealso  \code{\link{simulate_univariate_GMM}}
#'
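#' @examples
#' # A short sketch: recover the parameters from a fully labelled simulated sample
#' sim <- simulate_univariate_GMM(n = 500)
#' estimate_supervised_univariate_GMM(x = sim$x, s = sim$s, s_outliers = sim$s_outliers)
#'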
#' @export

estimate_supervised_univariate_GMM <- function(x, s, s_outliers = s, k = length(unique(s))) {

  ##### retrieve relevant values
  x <- x[s_outliers != 0]
  s_outliers <- s_outliers[s_outliers != 0]
  n <- length(x)
  eta <- table(1:n, s_outliers) # a 0/1 indicator matrix of cluster membership, used as weights

  ##### estimate the parameters
  p <- table(s_outliers) %>%
    c() %>%
    unname() %>%
    magrittr::divide_by(n)

  # update parameter vector zeta per component
  parametric_parameters <- apply(eta, 2, function(w) {
    stats::cov.wt(as.matrix(x), wt = w, cor = F, method = "ML", center = T)
  })
  mu <- purrr::map_dbl(parametric_parameters, "center")
  sigma <- sqrt(purrr::map_dbl(parametric_parameters, "cov"))

  return(list(p = p, mu = mu, sigma = sigma))
}


#' @rdname estimate_supervised_univariate_GMM
#' @export

estimate_supervised_multivariate_GMM <- function(x, s, k = length(unique(s))) {
  dim_gaussian <- ncol(x)
  n <- nrow(x)
  mu <- matrix(0, nrow = dim_gaussian, ncol = k)
  sigma <- array(0, dim = c(dim_gaussian, dim_gaussian, k))

  # estimate the ratios
  p <- table(s) %>%
    c() %>%
    unname() %>%
    magrittr::divide_by(n)

  for (j in 1:k) {
    mu[, j] <- colMeans(x[s == j, , drop = FALSE]) # estimate the means
    sigma[, , j] <- stats::cov.wt(x[s == j, , drop = FALSE], method = "ML")$cov # estimate the covariances
  }

  return(list(p = p, mu = mu, sigma = sigma))
}




#' @title Helper functions for the parameter estimation of GMMs
#'
#' @description These functions are small chunks of code designed to decompose the computation of the EM algorithm into simpler steps.
#' `logsumexp` returns the computation of the equation \eqn{\log\left(\sum_{i} \exp(l_i)\right)}, avoiding numerical under- and overflows
#'
#' @param l a vector of numeric terms
#' @return a numeric scalar value, result of the previously described equation
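#'
#' @examples
#' # Illustrative sketch of the trick implemented below: with very negative
#' # log-densities, the naive formula underflows, while shifting by the
#' # maximum (what `logsumexp` does) keeps the result finite.
#' l <- c(-1000, -1001)
#' log(sum(exp(l))) # -Inf, numerical underflow
#' max(l) + log1p(sum(exp(l[-which.max(l)] - max(l)))) # finite, about -999.69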

logsumexp <- function(l) {
  i <- which.max(l)
  res <- l[i] + log1p(sum(exp(l[-i] - l[i])))
  if (is.nan(res)) res <- -Inf
  return(res)
}


#' @describeIn logsumexp `predict_posterior_probability` returns the expected probability for each observation
#' to belong to any of the \eqn{k} clusters set a priori, given the estimated parameters
#'
#' @param x the vector of observed values, of size \eqn{n}
#' @param estimated_theta the estimated parameters
#' @return a list with two elements:
#' * the posterior probability matrix, `eta`: \eqn{\eta=(\eta_{i,j}) \in [0, 1]^{n \times k}}, with \eqn{\eta_{i,j}}
#' giving the posterior probability of observation \eqn{i} to belong to cluster \eqn{j}
#' * `loglik`: the observed-data log-likelihood of the sample
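#'
#' @examples
#' # A short univariate sketch, using the parameter format returned by the
#' # estimation routines of this package:
#' theta <- list(p = c(0.5, 0.5), mu = c(0, 4), sigma = c(1, 1))
#' post <- predict_posterior_probability(x = c(-0.2, 3.8), theta)
#' post$eta # one row per observation, one column per component
#' post$loglik # observed-data log-likelihood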
#' @export

predict_posterior_probability <- function(x, estimated_theta) {
  k <- length(estimated_theta$p)
  p <- estimated_theta$p
  mu <- estimated_theta$mu
  sigma <- estimated_theta$sigma
  n <- ifelse(is.array(estimated_theta$sigma), nrow(x), length(x))
  eta <- matrix(NA, nrow = n, ncol = k) # store posterior distribution for each observation (P(S=i|X))

  if (is.array(estimated_theta$sigma)) {
    # multivariate scenario
    for (j in 1:k) {
      eta[, j] <- log(p[j]) + mvtnorm::dmvnorm(x, mean = mu[, j], sigma = sigma[, , j], log = TRUE)
    }
  } else {
    # univariate scenario
    for (j in 1:k) {
      eta[, j] <- log(p[j]) + stats::dnorm(x, mu[j], sigma[j], log = TRUE) # compute posterior probability, using log tip
    }
  }
  # the log-sum-exp trick is valid for any density from the exponential family,
  # which encompasses both the univariate and multivariate normal distributions
  aux <- apply(eta, 1, logsumexp)
  eta <- exp(eta - aux)
  return(list(eta = eta, loglik = sum(aux)))
}




#' Check whether the estimation has been trapped in the boundary space
#'
#' * Function `check_parameters_validity_univariate` asserts at each step of the EM algorithm that
#' it does not fall into a degenerate case (either the package performing the EM computation has failed, returning an
#' error message, or the algorithm is trapped in the boundary space, leading to inconsistent divisions by zero).
#' * Function `check_parameters_validity_multivariate` has the same functionality, but
#' is adjusted to the multivariate parametrisation, and additionally checks whether
#' each covariance matrix is positive definite
#'
#' @param theta the set of parameters defining GMMs, with three inputs, namely p, mu and sigma
#' @param k the number of components, by default the number of proportions indicated in vector p
#'
#' @seealso [logsumexp()]
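#' @examples
#' # A valid univariate parametrisation passes, while a degenerate one
#' # (null variance on the second component) is flagged with a warning:
#' check_parameters_validity_univariate(list(p = c(0.5, 0.5), mu = c(0, 3), sigma = c(1, 1)))
#' check_parameters_validity_univariate(list(p = c(0.5, 0.5), mu = c(0, 3), sigma = c(1, 0)))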
#' @export

check_parameters_validity_univariate <- function(theta, k = length(theta$p)) {
  machine_limit <- .Machine$double.eps
  machine_max <- .Machine$double.xmax
  theta <- theta[c("p", "mu", "sigma")]
  if (any(is.na(unlist(theta))) | length(unlist(theta)) == 0) {
    warning(paste0(
      "Missing data with estimated theta, with ratios: ", paste(theta$p, collapse = " / "), ", ",
      "mu: ", paste(theta$mu, collapse = " / "), " and sigma: ", paste(theta$sigma, collapse = " / ")
    ))
    return(FALSE)
  } else if (any(theta$p < machine_limit | theta$p > 1 - machine_limit) |
    any(theta$sigma < machine_limit) | any(theta$sigma > machine_max) | any(sapply(theta, length) != k)) {
    warning(paste0(
      "Numerical overflow with estimated theta, with ratios: ", paste(theta$p, collapse = " / "), ", ",
      "mu: ", paste(theta$mu, collapse = " / "), " and sigma: ", paste(theta$sigma, collapse = " / ")
    ))
    return(FALSE)
  } else {
    return(TRUE)
  }
}


#' @rdname check_parameters_validity_univariate
check_parameters_validity_multivariate <- function(theta, k = length(theta$p)) {
  machine_limit <- .Machine$double.eps
  machine_max <- .Machine$double.xmax
  is_valid_parametrisation <- TRUE # store whether the parametrisation is correctly performed

  p <- theta$p
  mu <- theta$mu
  sigma <- theta$sigma
  k <- length(p) # get values from theta parameter
  dimension_gaussian <- dim(mu)[1]

  if (any(is.na(unlist(theta))) | length(unlist(theta)) == 0) {
    warning("NA values or no output from the estimation. \n")
    is_valid_parametrisation <- FALSE
  }

  # check the parametrisation of proportions (sum-to-one constraint)
  if (abs(1 - sum(p)) > machine_limit | any(p < machine_limit) | any(p > 1 - machine_limit)) {
    warning("One at least of your proportions does not enforce the sum-to-one constraint. \n")
    is_valid_parametrisation <- FALSE
  }

  # check parametrisation of means
  if (!identical(dim(mu), c(dimension_gaussian, k)) | any(abs(c(mu)) > machine_max)) {
    warning("The mean parameter must be of dimension ndim * k, with k the number of clusters. \n")
    is_valid_parametrisation <- FALSE
  }

  # check the parametrisation of covariance
  if (!identical(dim(sigma), c(dimension_gaussian, dimension_gaussian, k)) | any(abs(c(sigma)) > machine_max)) {
    warning("The covariance parameter must be of dimension ndim * ndim * k, with k the number of clusters. \n")
    is_valid_parametrisation <- FALSE
  } else {
    for (j in 1:k) {
      if (!is_positive_definite(sigma[, , j])) {
        warning(glue::glue("Covariance matrix for component {j} is not positive definite.\n"))
        is_valid_parametrisation <- FALSE
        break
      }
      if (!isSymmetric(sigma[, , j])) {
        warning(glue::glue("Covariance matrix for component {j} is not symmetric. \n"))
        is_valid_parametrisation <- FALSE
        break
      }
    }
  }

  return(is_valid_parametrisation)
}