#' Generation of a sample drawn from a univariate GMM, with the possibility to add outliers
#'
#' @details We do not implement the possibility of adding outliers in the multivariate context
#'
#' @author Bastien CHASSAGNOL
#'
#' @param theta a list with 3 entries:
#' * The proportions `p`: \eqn{p} of each component (each must lie between 0 and 1, and they must sum to one overall)
#' * The means `mu`: a vector of component means in the univariate context; in the multivariate context,
#' a matrix \eqn{\mathrm{\mu}=(\mu_{i,j}) \in \mathbb{R}^{d \times k}} (with \eqn{d} the dimension of the observations), with each column
#' giving the mean values of the variables within a given component
#' * The dispersion `sigma`: a vector of component standard deviations in the univariate context; in the
#' multivariate context, a 3-dimensional array \eqn{\mathrm{\Sigma}=(\Sigma_{i,j,l}) \in \mathbb{R}^{d \times d \times k}}, with each matrix
#' \eqn{\Sigma_{..l}, l \in \{ 1, \ldots, k\}} storing the covariance matrix of a given component,
#' whose diagonal terms give the variance of each variable and whose off-diagonal elements give the covariances between variables
#' @param n the number of observations to be drawn
#' @param prop_outliers,interval the first argument gives the proportion of outliers; outlying observations are redrawn
#' from a uniform distribution whose support extends beyond the extreme quantiles of the mixture by
#' `interval` times the distance between its 0.05 and 0.95 quantiles
#'
#' @return Depending on the univariate or multivariate context:
#' * a list with the number of components `k`, the true parameters `p`, `mu`, `sigma`,
#' the observed variables `x`, the hidden labels `s` and an outlier indicator `s_outliers` (a copy of `s` in which outliers are flagged by 0)
#' * a list with the number of components `k`, the true parameters `p`, `mu`, `sigma`,
#' the observation matrix `x` (one row per observation, living in \eqn{\mathbb{R}^{d}}) and the hidden label vector `s` (of size \eqn{n})
#'
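#' @examples
#' # A minimal sketch: draw 200 observations from the default two-component
#' # univariate mixture, 5% of which are replaced by uniform outliers
#' sim <- simulate_univariate_GMM(
#'   theta = list(p = c(0.4, 0.6), mu = c(175, 165), sigma = c(10, 12)),
#'   n = 200, prop_outliers = 0.05
#' )
#' table(sim$s_outliers == 0) # outlying observations are flagged by 0
#'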
#' @export
simulate_univariate_GMM <- function(theta = list(p = c(0.40, 0.60), mu = c(175, 165), sigma = c(10, 12)), n = 100,
prop_outliers = 0, interval = 2) {
p <- theta$p
k <- length(p)
# deal with underflow or removal of components
stopifnot(check_parameters_validity_univariate(theta, k = k))
# get values from theta parameter
mu <- theta$mu
sigma <- theta$sigma
# generate hidden variables s set
s <- sample(1:k, size = n, replace = TRUE, prob = p)
# generate observed variable set
x <- stats::rnorm(n, mean = mu[s], sd = sigma[s])
# select randomly round(prop_outliers * n) points to be redrawn from a uniform distribution
# whose support extends well beyond the extreme quantiles of the mixture
outliers_indexes <- sample(1:n, size = round(prop_outliers * n), replace = FALSE)
s_outliers <- s
s_outliers[outliers_indexes] <- 0
# choose a uniform distribution with a wide enough range
min_component <- which.min(sapply(1:k, function(j) stats::qnorm(0.05, mean = mu[j], sd = sigma[j])))
max_component <- which.max(sapply(1:k, function(j) stats::qnorm(0.95, mean = mu[j], sd = sigma[j])))
length_interval <- interval * (stats::qnorm(0.95, mean = mu[max_component], sd = sigma[max_component]) -
stats::qnorm(0.05, mean = mu[min_component], sd = sigma[min_component]))
x[outliers_indexes] <- runif(
n = round(prop_outliers * n),
min = stats::qnorm(0.01, mean = mu[min_component], sd = sigma[min_component]) - length_interval,
max = stats::qnorm(0.99, mean = mu[max_component], sd = sigma[max_component]) + length_interval
)
return(list(k = k, p = p, mu = mu, sigma = sigma, x = x, s = s, s_outliers = s_outliers))
}
#' @rdname simulate_univariate_GMM
#' @export
simulate_multivariate_GMM <- function(theta, n = 500) {
##################################################################
## check validity ##
##################################################################
p <- theta$p
mu <- theta$mu
sigma <- theta$sigma
k <- length(p) # get values from theta parameter
dimension_gaussian <- dim(mu)[1]
stopifnot(
"Parameters provided do not correspond to a fitted GMM estimation" =
check_parameters_validity_multivariate(theta) == TRUE
)
#################################################################
## generate observations ##
#################################################################
# generate hidden variables s set
s <- sample(1:k, size = n, replace = TRUE, prob = p)
# pre-allocate the observation matrix, as mvrnorm cannot be vectorised over component-specific parameters
x <- matrix(0, nrow = n, ncol = dimension_gaussian)
# generate observed variable set, with each observation drawn from a
# multivariate GMM indexed by vector s
for (i in 1:n) {
x[i, ] <- MASS::mvrnorm(
n = 1, mu = mu[, s[i]], Sigma = sigma[, , s[i]],
tol = 1e-12, empirical = FALSE
)
}
return(list(k = k, p = p, mu = mu, sigma = sigma, x = x, s = s))
}
#' Return initial estimates to the EM algorithm for GMM estimation
#'
#' One of the main drawbacks of the EM algorithm is that it requires an initial guess as a starting point.
#' Hence, careful initialisation, adapted to the properties of the mixture, is required:
#' * `initialize_em_univariate` returns the initial estimates in the univariate context.
#' * `initialize_em_multivariate` returns the initial estimates in the multivariate context. It is worth noting
#' that the *quantiles* initialisation method is not available in the multivariate context, as
#' no unique parametrisation can be derived from multivariate quantiles.
#'
#' @author Bastien CHASSAGNOL
#'
#' @param x the vector of the observations
#' @param k the number of components
#' @param nstart the number of random restarts for the kmeans, random and small EM methods
#' @param prior_prob minimal probability mass assigned to the components an observation was not allocated to by hierarchical clustering
#' @param short_iter,short_eps hyper-parameters of the small EM method
#' @param initialisation_algorithm the initialisation method, among kmeans, quantiles, random, hc, small em and rebmix
#' @param ... additional hyperparameters supplied with some of the initialisation methods
#'
#' @return a list of the estimated parameters, ordered by increasing mean to ensure identifiability
#'
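#' @examples
#' # An illustrative sketch: compare two initialisation strategies on a simulated sample
#' sim <- simulate_univariate_GMM(n = 200)
#' initialize_em_univariate(x = sim$x, k = 2, initialisation_algorithm = "kmeans")
#' initialize_em_univariate(x = sim$x, k = 2, initialisation_algorithm = "quantiles")
#'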
#' @export
initialize_em_univariate <- function(x = NULL, k = 2, nstart = 10L, short_iter = 200, short_eps = 10^-2, prior_prob = 0.05,
initialisation_algorithm = c("kmeans", "quantiles", "random", "hc", "small em", "rebmix"), ...) {
if (is.null(x)) {
stop("Please provide data for x.")
}
initialisation_algorithm <- match.arg(
initialisation_algorithm,
c("kmeans", "quantiles", "random", "hc", "small em", "rebmix")
)
n <- length(x)
### initialize parametrization
if (initialisation_algorithm == "kmeans") {
fit <- stats::kmeans(x = x, centers = k,
nstart = nstart, iter.max = short_iter, algorithm = "Hartigan-Wong")
estimated_theta <- list(
p = fit$size / n,
mu = as.vector(fit$centers),
sigma = sqrt(fit$withinss / fit$size)
)
} else if (initialisation_algorithm == "quantiles") {
fit <- bgmm::init.model.params(X = x, k = k, method = "all")
estimated_theta <- list(
p = fit$pi,
mu = as.vector(fit$mu),
sigma = sqrt(as.vector(fit$cvar))
)
} else if (initialisation_algorithm == "random") {
all_logs <- lapply(1:nstart, function(y) {
fit <- EMCluster::simple.init(as.matrix(x), nclass = k)
logLikelihood_per_random <- EMCluster::logL(as.matrix(x), fit)
return(list(parameters = fit, logLikelihood = logLikelihood_per_random))
})
best_model <- which.max(all_logs %>% purrr::map_dbl("logLikelihood"))
fit <- purrr::map(all_logs, "parameters")[[best_model]]
estimated_theta <- list(
p = fit$pi,
mu = as.vector(fit$Mu),
sigma = sqrt(as.vector(fit$LTSigma))
)
} else if (initialisation_algorithm == "hc") {
clustering_hc <- mclust::hcV(data = x, minclus = 1)
s <- as.vector(mclust::hclass(clustering_hc, G = k))
if (any(table(s) == 1)) {
# degenerate case, often caused by the presence of an outlier: supervised estimation
# becomes impossible, so we fall back on an ad-hoc technique, artificially adding
# a small probability mass to each cluster assignment
z <- mclust::unmap(s) # a n * k table, acting as indicator of belonging for each cluster
z[z == 1] <- 1 - prior_prob * (k - 1)
z[z == 0] <- prior_prob # non-assigned class get by default a minimal probability value
# generate first iteration, starting with the M-step of the EM algorithm
fit <- mclust::meV(
data = x, z = z, prior = NULL,
control = mclust::emControl(itmax = 1, equalPro = FALSE)
)
estimated_theta <- list(p = fit$parameters$pro, mu = fit$parameters$mean, sigma = sqrt(fit$parameters$variance$sigmasq))
} else {
# if all groups have at least two observations, we can perform supervised estimation
estimated_theta <- estimate_supervised_univariate_GMM(x = x, k = k, s = s, s_outliers = s)
}
} else if (initialisation_algorithm == "small em") {
all_logs <- lapply(1:nstart, function(y) {
# take some random points using EMCluster simple method
start <- EMCluster::simple.init(as.matrix(x), nclass = k)
# short EM runs with our mixtools-based wrapper, which implements the standard EM algorithm
fit <- em_mixtools_univariate(
x = x,
start = list(p = start$pi, mu = as.vector(start$Mu), sigma = sqrt(as.vector(start$LTSigma))),
short_eps = short_eps, short_iter = short_iter, k = k
)
logLikelihood_per_em <- predict_posterior_probability(x, fit)$loglik
return(list(parameters = fit, logLikelihood = logLikelihood_per_em))
})
best_model <- which.max(all_logs %>% purrr::map_dbl("logLikelihood"))
estimated_theta <- purrr::map(all_logs, "parameters")[[best_model]]
} else if (initialisation_algorithm == "rebmix") {
EM_control <- methods::new("EM.Control",
strategy = "exhaustive", variant = "EM", acceleration = "fixed",
acceleration.multiplier = 1, tolerance = short_eps, maximum.iterations = 1
)
invisible(utils::capture.output(fit <- rebmix::REBMIX(
Dataset = list(data.frame(x)), Preprocessing = "kernel density estimation",
Restraints = "loose", cmax = k + 1, cmin = k, Criterion = "BIC", pdf = "normal", model = "REBMIX", EMcontrol = EM_control
)))
estimated_theta <- list(
p = unlist(fit@w),
mu = fit@Theta[[1]][grep("^theta1\\.", names(fit@Theta[[1]]))] %>% unlist() %>% unname(),
sigma = fit@Theta[[1]][grep("^theta2\\.", names(fit@Theta[[1]]))] %>% unlist() %>% unname()
)
}
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(estimated_theta)
return(ordered_estimated_theta)
}
#' @importClassesFrom rebmix EM.Control
#' @rdname initialize_em_univariate
#' @export
initialize_em_multivariate <- function(x, k = 2, nstart = 10L, short_iter = 200, short_eps = 10^-2, prior_prob = 0.05,
initialisation_algorithm = c("kmeans", "random", "hc", "small em", "rebmix"), ...) {
initialisation_algorithm <- match.arg(
initialisation_algorithm,
c("kmeans", "random", "hc", "small em", "rebmix")
)
n <- nrow(x) # number of observations
dim_gaussian <- ncol(x) # dimension of the observation space
### initialize parametrization
if (initialisation_algorithm == "kmeans") {
fit <- stats::kmeans(x = x, centers = k, nstart = nstart,
iter.max = short_iter, algorithm = "Hartigan-Wong")
estimated_theta <- estimate_supervised_multivariate_GMM(x = x, s = fit$cluster, k = k)
} else if (initialisation_algorithm == "random") {
# all_logs <- lapply(1:nstart, function(y) {
# fit <- EMCluster::simple.init(x, nclass = k)
# logLikelihood_per_random <- EMCluster::logL(x, fit)
# return(list(parameters = fit, logLikelihood = logLikelihood_per_random))
# })
# best_model <- which.max(all_logs %>% purrr::map_dbl("logLikelihood"))
# fit <- purrr::map(all_logs, "parameters")[[best_model]]
EMC <- EMCluster::.EMC.Rnd; EMC$n.candidate <- nstart
fit <- EMCluster::rand.EM(x, nclass = k, min.n=2, EMC=EMC)
estimated_theta <- list(
p = fit$pi, mu = t(fit$Mu),
sigma = trig_mat_to_array(fit$LTSigma)
)
} else if (initialisation_algorithm == "hc") {
clustering_hc <- mclust::hcVVV(data = x, minclus = 1, ...)
s <- c(mclust::hclass(clustering_hc, G = k))
if (any(table(s) == 1)) {
# degenerate case, often caused by the presence of an outlier: supervised estimation
# becomes impossible, so we fall back on an ad-hoc technique, artificially adding
# a small probability mass to each cluster assignment
dir.create("./errors", showWarnings = FALSE, recursive = TRUE)
saveRDS(list("distribution" = x, "k" = k), "./errors/hc_clustering_error.rds")
z <- mclust::unmap(s) # a n * k table, acting as indicator of belonging for each cluster
z[z == 1] <- 1 - prior_prob * (k - 1)
z[z == 0] <- prior_prob # non-assigned class get by default a minimal probability value
# generate first iteration, starting with the M-step of the EM algorithm
fit <- mclust::meVVV(
data = x, z = z, prior = NULL,
control = mclust::emControl(itmax = 1, equalPro = FALSE)
)
estimated_theta <- list(p = fit$parameters$pro, mu = fit$parameters$mean, sigma = fit$parameters$variance$sigma)
} else {
# if all groups have at least two observations, we can perform supervised estimation
estimated_theta <- estimate_supervised_multivariate_GMM(x = x, s = s, k = k)
}
} else if (initialisation_algorithm == "small em") {
all_logs <- lapply(1:nstart, function(y) {
# take some random points using EMCluster simple method
start <- EMCluster::simple.init(as.matrix(x), nclass = k)
# short EM runs with our mixtools-based wrapper, which implements the standard EM algorithm
fit <- em_mixtools_multivariate(
x = x,
start = list(p = start$pi, mu = t(start$Mu), sigma = trig_mat_to_array(start$LTSigma)),
short_eps = short_eps, short_iter = short_iter, k = k
)
logLikelihood_per_em <- predict_posterior_probability(x, fit)$loglik
return(list(parameters = fit, logLikelihood = logLikelihood_per_em))
})
best_model <- which.max(all_logs %>% purrr::map_dbl("logLikelihood"))
estimated_theta <- purrr::map(all_logs, "parameters")[[best_model]]
} else if (initialisation_algorithm == "rebmix") {
EM_control <- methods::new("EM.Control",
strategy = "exhaustive", variant = "EM", acceleration = "fixed",
acceleration.multiplier = 1, tolerance = short_eps, maximum.iterations = 1
)
# one step of EM is required, just for the computation of the estimated parameters
suppressMessages( fit <- rebmix::REBMIX(model = "REBMVNORM", Dataset = list(data.frame(x)),
Preprocessing = "kernel density estimation", cmin=k - 1, cmax=k + 1,
Restraints = "loose", Criterion = "BIC", EMcontrol = EM_control))
mu <- matrix(fit@Theta[[1]][grep("^theta1\\.", names(fit@Theta[[1]]))] %>% unlist() %>% unname(),
nrow = dim_gaussian, ncol = k
) # estimates are returned as a single long vector
sigma <- array(fit@Theta[[1]][seq(3, 3 * k, by = 3)] %>% unlist() %>% unname(),
dim = c(dim_gaussian, dim_gaussian, k)
)
estimated_theta <- list(p = unlist(fit@w), mu = mu, sigma = sigma)
}
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(estimated_theta)
# stop in case of bad implementation
if (!check_parameters_validity_multivariate(ordered_estimated_theta)) {
dir.create("./errors/initialisation_failures", showWarnings = F, recursive = T)
saveRDS(
object = list(x = x, algo = initialisation_algorithm, k=k),
file = glue::glue("./errors/initialisation_failures/init_algo_{initialisation_algorithm}_error.rds")
)
# stop("Parameters learnt from the initiation algorithm are inconsistent with the number of clusters required,
# likely to come from rebmix overestimating the number of clusters, or from degenerate random.\n")
}
return(ordered_estimated_theta)
}
#' @title Custom R implementation of the EM algorithm in the univariate context
#'
#' @author Bastien CHASSAGNOL
#'
#' @param x the vector of the observations
#' @param k the number of components
#' @param itmax the maximal number of iterations allowed to reach the convergence threshold
#' @param epsilon the convergence threshold: tolerance on the difference between two consecutive log-likelihoods
#' @param start a list of initial estimates provided by the user, with 3 entries:
#' * The proportions `p`: \eqn{p} of each component (each must lie between 0 and 1, and they must sum to one overall)
#' * The mean vector `mu`, giving the mean of each component
#' * The standard deviation vector `sigma`, giving the standard deviation of each component
#' @param initialisation_algorithm,nstart hyper-parameters used when the user relies on one of our
#' implemented initialisation algorithms instead of providing `start`
#' @param parallel only relevant for the GMKMcharlie package, which has a native parallel implementation (by default, half of the available cores are used)
#' @param minprior minimum prior probability of a cluster; components falling below this threshold are removed during the iterations
#' @param ... additional parameters for the reviewed packages
#'
#' @return a list of the estimated parameters, ordered by increasing mean to ensure identifiability
#'
#' @importFrom stats IQR dnorm median quantile runif sd var
#'
#'
#' @seealso [emnmix_multivariate()]
#'
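#' @examples
#' # A short sketch: fit a two-component mixture with the custom EM implementation,
#' # starting from a k-means initialisation
#' sim <- simulate_univariate_GMM(n = 500)
#' fit <- emnmix_univariate(x = sim$x, k = 2, initialisation_algorithm = "kmeans")
#' fit$mu # estimated means, ordered increasingly for identifiability
#'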
#' @export
emnmix_univariate <- function(x, k, epsilon = 10^-4, itmax = 500, nstart = 10L, start = NULL,
initialisation_algorithm = "kmeans", ...) {
### retrieve the initial configuration
x <- as.vector(as.matrix(x))
n <- length(x) # get number of observations
# get initial estimates if not provided by the user
if (is.null(start)) {
start <- initialize_em_univariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
p <- start$p
mu <- start$mu
sigma <- start$sigma
old_loglik <- -Inf
for (iter in 1:itmax) {
### E-step ###
# compute the posterior probabilities
expectation_results <- predict_posterior_probability(x, list(p = p, mu = mu, sigma = sigma))
eta <- expectation_results$eta
new_loglik <- expectation_results$loglik
### M-step ###
# update component's weights
S0 <- apply(eta, 2, sum)
if (any(S0 < .Machine$double.eps)) {
stop(glue::glue("Component {which(S0 <.Machine$double.eps)} has been removed."))
}
p <- S0 / n
# update parameter vector zeta per component
parametric_parameters <- apply(eta, 2, function(w) {
stats::cov.wt(as.matrix(x), wt = w, cor = F, method = "ML", center = T)
})
mu <- purrr::map_dbl(parametric_parameters, "center")
sigma <- sqrt(purrr::map_dbl(parametric_parameters, "cov"))
# deal with underflow or removal of components
if (!check_parameters_validity_univariate(list(p = p, mu = mu, sigma = sigma), k = k)) {
break
} ## early stop in case the algorithm is trapped in boundary space
else if (abs(new_loglik - old_loglik) < epsilon) { # early stop if criterion threshold is met
break
}
old_loglik <- new_loglik # update the newly computed log-likelihood
}
ordered_estimated_theta <- enforce_identifiability(list(p = p, mu = mu, sigma = sigma)) # return a unique ordered parameter configuration
return(ordered_estimated_theta)
}
#' @title Custom R implementation of the EM algorithm in the multivariate context
#'
#' @author Bastien CHASSAGNOL
#'
#' @param x the matrix of the observations, with one row per observation
#' @param k the number of components
#' @param itmax the maximal number of iterations allowed to reach the convergence threshold
#' @param epsilon the convergence threshold: tolerance on the difference between two consecutive log-likelihoods
#' @param start a list of initial estimates provided by the user, with 3 entries:
#' * The proportions `p`: \eqn{p} of each component (each must lie between 0 and 1, and they must sum to one overall)
#' * The mean matrix `mu`: \eqn{\mathrm{\mu}=(\mu_{i,j}) \in \mathbb{R}^{d \times k}} (with \eqn{d} the dimension of the observations), with each column
#' giving the mean values of the variables within a given component
#' * The 3-dimensional covariance array `sigma`: \eqn{\mathrm{\Sigma}=(\Sigma_{i,j,l}) \in \mathbb{R}^{d \times d \times k}}, with each matrix
#' \eqn{\Sigma_{..l}, l \in \{ 1, \ldots, k\}} storing the covariance matrix of a given component,
#' whose diagonal terms give the variance of each variable and whose off-diagonal elements give the covariances between variables
#' @param initialisation_algorithm,nstart hyper-parameters used when the user relies on one of our
#' implemented initialisation algorithms instead of providing `start`
#' @param parallel only relevant for the GMKMcharlie package, which has a native parallel implementation (by default, half of the available cores are used)
#' @param minprior minimum prior probability of a cluster; components falling below this threshold are removed during the iterations
#' @param ... additional parameters for the reviewed packages
#'
#' @return a list of the estimated parameters, ordered by a partial ordering
#' on their respective mean components to ensure identifiability
#'
#' @seealso [emnmix_univariate()] for a benchmark of algorithms performing the EM estimation in the
#' fully unconstrained case, in the univariate setting
#'
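#' @examples
#' # An illustrative sketch on a small bivariate two-component mixture
#' theta <- list(
#'   p = c(0.5, 0.5),
#'   mu = matrix(c(0, 0, 4, 4), nrow = 2), # one column of means per component
#'   sigma = array(c(diag(2), diag(2)), dim = c(2, 2, 2)) # one covariance matrix per component
#' )
#' sim <- simulate_multivariate_GMM(theta = theta, n = 300)
#' fit <- emnmix_multivariate(x = sim$x, k = 2)
#'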
#' @export
emnmix_multivariate <- function(x, k, epsilon = 10^-4, itmax = 500, nstart = 10L, start = NULL,
initialisation_algorithm = "kmeans", ...) {
### retrieve the initial configuration
n <- nrow(x)
dim_gaussian <- ncol(x)
# get initial estimates if not provided by the user
if (is.null(start)) {
start <- initialize_em_multivariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
p <- start$p
mu <- start$mu
sigma <- start$sigma
old_loglik <- -Inf
for (iter in 1:itmax) {
### E-step ###
# compute the posterior probabilities
expectation_results <- predict_posterior_probability(x, list(p = p, mu = mu, sigma = sigma))
eta <- expectation_results$eta
new_loglik <- expectation_results$loglik
### M-step ###
# update component's weights (similar formula, no matter the model)
S0 <- apply(eta, 2, sum)
if (any(S0 < .Machine$double.eps)) {
stop(glue::glue("Component {which(S0 <.Machine$double.eps)} has been removed."))
}
p <- S0 / n
# update parameter vector theta
mu <- matrix(0, nrow = dim_gaussian, ncol = k)
sigma <- array(0, dim = c(dim_gaussian, dim_gaussian, k))
for (j in 1:k) {
mu[, j] <- apply(x, 2, stats::weighted.mean, eta[, j])
sigma[, , j] <- stats::cov.wt(x, wt = eta[, j], cor = F, center = T, method = "ML")$cov # ML is the maximum likelihood estimator, not the unbiased one
}
# deal with underflow or removal of components
if (!check_parameters_validity_multivariate(list(p = p, mu = mu, sigma = sigma), k = k)) {
break
} ## early stop in case the algorithm is trapped in boundary space
else if (abs(new_loglik - old_loglik) < epsilon) { # early stop if criterion threshold is met
break
}
old_loglik <- new_loglik # update the newly computed log-likelihood
}
ordered_estimated_theta <- enforce_identifiability(list(p = p, mu = mu, sigma = sigma)) # return a unique ordered parameter configuration
return(ordered_estimated_theta)
}
#' @rdname emnmix_univariate
#' @importClassesFrom Rmixmod Strategy GaussianParameter
#' @export
em_Rmixmod_univariate <- function(x, k = 2, initialisation_algorithm = "kmeans",
epsilon = 10^-4, itmax = 500, start = NULL, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_univariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
# define strategy
strategy_Rmixmod <- methods::new("Strategy",
algo = "EM", nbTryInInit = 10L,
initMethod = "parameter", nbIterationInAlgo = itmax,
epsilonInAlgo = epsilon,
parameter = methods::new("GaussianParameter",
proportions = start$p,
mean = as.matrix(start$mu),
variance = lapply(start$sigma, function(x) as.matrix(x^2))
)
)
# fit the model
fit <- Rmixmod::mixmodCluster(
data = as.vector(x), nbCluster = k, dataType = "quantitative",
strategy = strategy_Rmixmod
)@bestResult@parameters
fit <- list(
p = fit@proportions,
mu = as.vector(fit@mean),
sigma = sqrt(as.vector(unlist(fit@variance)))
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_multivariate
#' @export
em_Rmixmod_multivariate <- function(x, k = 2, initialisation_algorithm = "kmeans",
epsilon = 10^-4, itmax = 500, start = NULL, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_multivariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
# define strategy
strategy_Rmixmod <- methods::new("Strategy",
algo = "EM", nbTryInInit = 10L,
initMethod = "parameter", nbIterationInAlgo = itmax,
epsilonInAlgo = epsilon,
parameter = methods::new("GaussianParameter",
proportions = start$p,
mean = t(start$mu),
variance = lapply(seq(dim(start$sigma)[3]), function(x) start$sigma[, , x])
)
)
# fit the model
unconstrained_model <- Rmixmod::mixmodGaussianModel(listModels = "Gaussian_pk_Lk_Ck") # the fully unconstrained model
fit <- Rmixmod::mixmodCluster(
data = x %>% as.data.frame(), nbCluster = k, dataType = "quantitative",
strategy = strategy_Rmixmod, models = unconstrained_model
)@bestResult@parameters
fit <- list(
p = fit@proportions,
mu = t(fit@mean),
sigma = fit@variance %>% simplify2array()
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_univariate
#' @export
em_EMCluster_univariate <- function(x, k = 2, initialisation_algorithm = "kmeans",
epsilon = 10^-4, itmax = 500, start = NULL, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_univariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
# fit the model
fit <- EMCluster::emcluster(
x = as.matrix(x),
pi = start$p, Mu = as.matrix(start$mu), LTSigma = as.matrix(start$sigma^2), # LTSigma stores variances, not standard deviations (cf. the sqrt applied on output below)
EMC = EMCluster::.EMControl(em.iter = itmax, em.eps = epsilon)
)
fit <- list(
p = fit$pi,
mu = as.vector(fit$Mu),
sigma = sqrt(as.vector(fit$LTSigma))
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_multivariate
#' @export
em_EMCluster_multivariate <- function(x, k = 2, initialisation_algorithm = "kmeans",
epsilon = 10^-4, itmax = 500, start = NULL, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_multivariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
# fit the model
fit <- EMCluster::emcluster(
x = x,
pi = start$p, Mu = t(start$mu), LTSigma = start$sigma %>% array_to_trig_mat(),
EMC = EMCluster::.EMControl(em.iter = itmax, em.eps = epsilon)
)
fit <- list(
p = fit$pi,
mu = t(fit$Mu),
sigma = fit$LTSigma %>% trig_mat_to_array()
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_univariate
#' @export
em_bgmm_univariate <- function(x, k = 2, epsilon = 10^-4, itmax = 500,
initialisation_algorithm = "kmeans", start = NULL, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_univariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
# fit the model
fit <- bgmm::unsupervised(
X = x, k = k,
init.params = list(
pi = start$p,
mu = as.matrix(start$mu),
cvar = array(start$sigma^2, dim = c(k, 1, 1))
),
stop.likelihood.change = epsilon, stop.max.nsteps = itmax, ...
)
fit <- list(
p = fit$pi,
mu = as.vector(fit$mu),
sigma = sqrt(as.vector(fit$cvar))
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_multivariate
#' @export
em_bgmm_multivariate <- function(x, k = 2, epsilon = 10^-4, itmax = 500,
initialisation_algorithm = "kmeans", start = NULL, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_multivariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
dim_gaussian <- ncol(x)
cvar <- array(0, dim = c(k, dim_gaussian, dim_gaussian))
# convert from format D * D * k to k * D * D, as expected by bgmm
# (equivalently: cvar <- aperm(start$sigma, c(3, 1, 2)))
for (j in 1:k) {
cvar[j, , ] <- start$sigma[, , j]
}
### fit the model
model_structure <- bgmm::getModelStructure(mean = "D", between = "D", within = "D", cov = "D") # default model, fully unconstrained
fit <- bgmm::unsupervised(
X = x, k = k,
init.params = list(
pi = start$p,
mu = t(start$mu),
cvar = cvar
),
stop.likelihood.change = epsilon, stop.max.nsteps = itmax, model.structure = model_structure
)
sigma <- array(0, dim = c(dim_gaussian, dim_gaussian, k))
# convert from format k * D * D to D * D * k
for (j in 1:k) {
sigma[, , j] <- fit$cvar[j, , ]
}
fit <- list(
p = fit$pi,
mu = t(fit$mu),
sigma = sigma
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_univariate
#' @export
em_flexmix_univariate <- function(x, k = 2, epsilon = 10^-4, itmax = 500, minprior = 0.05,
initialisation_algorithm = "kmeans", start = NULL, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_univariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
# posterior probabilities derived from the initial guess
# (flexmix accepts a posterior matrix as its `cluster` argument)
predicted_classes_original <- predict_posterior_probability(x, start)$eta
# fit the model
fit <- flexmix::flexmix(x ~ 1,
k = k, cluster = predicted_classes_original,
model = flexmix::FLXMCnorm1(),
control = list(tolerance = epsilon, iter.max = itmax, verbose = 0, minprior = minprior)
)
fit <- list(
p = fit@size / length(x),
mu = flexmix::parameters(fit)["mean", ],
sigma = flexmix::parameters(fit)["sd", ]
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_multivariate
#' @export
em_flexmix_multivariate <- function(x, k = 2, epsilon = 10^-4, itmax = 500, minprior = 0.05,
initialisation_algorithm = "kmeans", start = NULL, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_multivariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
# flexmix requires posterior probabilities, instead of initial estimates
dim_gaussian <- ncol(x)
eta <- predict_posterior_probability(x, start)$eta
# fit the model
fit <- flexmix::flexmix(x ~ 1,
k = k, cluster = eta,
model = flexmix::FLXMCmvnorm(diagonal = F),
control = list(tolerance = epsilon, iter.max = itmax, verbose = 0, minprior = minprior)
)
# adjust to the specific output of flexmix (d^2 * k)
sigma <- array(0, dim = c(dim_gaussian, dim_gaussian, k))
sigma_flexmix <- flexmix::parameters(fit)[grep("^cov", row.names(flexmix::parameters(fit))), ]
for (j in 1:k) {
sigma[, , j] <- matrix(sigma_flexmix[, j], nrow = dim_gaussian, ncol = dim_gaussian)
}
fit <- list(
p = fit@size / nrow(x), # normalise cluster sizes into proportions
mu = flexmix::parameters(fit)[grep("^center", row.names(flexmix::parameters(fit))), ],
sigma = sigma
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_univariate
#' @export
em_mixtools_univariate <- function(x, k = 2, initialisation_algorithm = "kmeans",
epsilon = 10^-4, itmax = 500, start = NULL, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_univariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
fit <- mixtools::normalmixEM(
x = x, lambda = start$p, mu = start$mu, sigma =
start$sigma, k = k, epsilon = epsilon, maxit = itmax
)
fit <- list(p = fit$lambda, mu = fit$mu, sigma = fit$sigma)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_multivariate
#' @export
em_mixtools_multivariate <- function(x, k = 2, initialisation_algorithm = "kmeans",
epsilon = 10^-4, itmax = 500, start = NULL, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_multivariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
invisible(utils::capture.output(fit <- mixtools::mvnormalmixEM(
x = x, lambda = start$p, mu = start$mu %>% as.data.frame() %>% as.list(),
sigma = lapply(seq(dim(start$sigma)[3]), function(x) start$sigma[, , x]),
k = k, epsilon = epsilon, maxit = itmax, arbmean = T, arbvar = T
)))
fit <- list(p = fit$lambda, mu = fit$mu %>% simplify2array(), sigma = fit$sigma %>% simplify2array())
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_univariate
#' @export
em_mclust_univariate <- function(x, k = 2, initialisation_algorithm = "kmeans", start = NULL,
epsilon = 10^-4, itmax = 500, ...) {
# set the control parameters for the EM run
control <- mclust::emControl(tol = epsilon, itmax = itmax)
# initialization section
if (is.null(start)) {
start <- initialize_em_univariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
start_em <- list(pro = start$p, mean = start$mu, variance = list(sigmasq = start$sigma^2))
estimated_theta <- mclust::emV(
data = x, parameters = start_em,
modelNames = "V", warn = FALSE, control = control
)$parameters
# collect the estimated parameters
fit <- list(
p = estimated_theta$pro,
mu = estimated_theta$mean,
sigma = sqrt(estimated_theta$variance$sigmasq)
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_multivariate
#' @export
em_mclust_multivariate <- function(x, k = 2, initialisation_algorithm = "kmeans", start = NULL,
epsilon = 10^-4, itmax = 500, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_multivariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
dim_gaussian <- ncol(x)
cholsigma <- array(0, dim = c(dim_gaussian, dim_gaussian, k))
for (j in 1:k) {
cholsigma[, , j] <- chol(start$sigma[, , j]) # the Cholesky decomposition is required as input for the mclust VVV model
}
start_em <- list(pro = start$p, mean = start$mu, variance = list(
sigma = start$sigma,
cholsigma = cholsigma
))
control <- mclust::emControl(tol = epsilon, itmax = itmax) # set control parameters
estimated_theta <- mclust::emVVV(
data = x, parameters = start_em, warn = FALSE, control = control, prior = NULL
)$parameters
fit <- list(
p = estimated_theta$pro,
mu = estimated_theta$mean,
sigma = estimated_theta$variance$sigma
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_univariate
#' @inheritParams GMKMcharlie::GM
#' @export
em_GMKMcharlie_univariate <- function(x, k = 2, initialisation_algorithm = "kmeans", embedNoise = 1e-6,
epsilon = 10^-4, itmax = 500, start = NULL, parallel = FALSE, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_univariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
available_processors <- ifelse(parallel, parallel::detectCores() %/% 2, 1)
fit <- GMKMcharlie::GM(t(as.matrix(x)),
embedNoise = embedNoise,
alpha = start$p, mu = t(as.matrix(start$mu)), sigma = t(as.matrix(start$sigma)), G = k,
convergenceEPS = epsilon, maxIter = itmax, maxCore = available_processors, verbose = FALSE
)
fit <- list(
p = fit$alpha,
mu = fit$mu, sigma = sqrt(unlist(fit$sigma))
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_multivariate
#' @inheritParams GMKMcharlie::GM
#' @export
em_GMKMcharlie_multivariate <- function(x, k = 2, initialisation_algorithm = "kmeans", embedNoise = 1e-6,
epsilon = 10^-4, itmax = 500, start = NULL, parallel = FALSE, ...) {
# initialization section
if (is.null(start)) {
start <- initialize_em_multivariate(
x = x, k = k, nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm, ...
)
}
available_processors <- ifelse(parallel, parallel::detectCores() %/% 2, 1)
fit <- GMKMcharlie::GM(t(x),
alpha = start$p, mu = start$mu, embedNoise = embedNoise,
sigma = lapply(seq(dim(start$sigma)[3]), function(x) start$sigma[, , x]), G = k,
convergenceEPS = epsilon, maxIter = itmax, maxCore = available_processors, verbose = FALSE,
checkInitialization = T
)
fit <- list(
p = fit$alpha,
mu = fit$mu, sigma = fit$sigma %>% simplify2array()
)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_multivariate
#' @inheritParams clustvarsel::clustvarsel
#' @import clustvarsel
#' @export
em_clustvarsel_multivariate <- function(x, k = 2, initialisation_algorithm = "kmeans",
epsilon = 10^-4, itmax = 500, back_steps=20, start = NULL, ...) {
### Alternative process
# first, retrieve the most relevant dimensions
# kept_dimensions <- clustvarsel::clustvarsel(x, G = k, direction="backward", search="greedy",
# emModels2 = "VVV", hcModel = "VVV", allow.EEE = FALSE,
# verbose=FALSE, parallel=TRUE, forcetwo=FALSE, ...)$subset
# # second, with the updated dataset, retrieve relevant dimensions
# initial_theta <- initialize_em_multivariate(
# x = x[, kept_dimensions], k = k, nstart = 10L, itmax = 200, epsilon = 10^-2,
# initialisation_algorithm = initialisation_algorithm)
# # third, perform the initialisation with mclust, the most widely used package
# ordered_estimated_theta <- em_mclust_multivariate (x = x, k = 2, start = initial_theta,
# itmax = itmax, epsilon = epsilon)
#
#
# in the canonical process, the only initialisation possible is with hierarchical clustering
# set parallel to FALSE to avoid redundancy with parallel clustering
estimated_theta <- clustvarsel::clustvarsel(x, G = k, direction="backward", search="greedy",
emModels2 = "VVV", hcModel = "VVV", allow.EEE = FALSE,
verbose=FALSE, parallel=FALSE, forcetwo=TRUE, itermax=back_steps)$model$parameters
fit <- list(
p = estimated_theta$pro,
mu = estimated_theta$mean,
sigma = estimated_theta$variance$sigma)
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(fit)
return(ordered_estimated_theta)
}
#' @rdname emnmix_multivariate
#' @export
em_HDclassif_multivariate <- function(x, k = 2, initialisation_algorithm = "kmeans",
epsilon = 10^-4, itmax = 500, start = NULL,
kmeans.control=list(iter.max=200L, nstart=10L, algorithm="Hartigan-Wong"),
mc.cores=getOption("mc.cores", parallel::detectCores())) {
if (initialisation_algorithm=="kmeans") {
fit <- HDclassif::hddc(x, K = k, model = c("AkjBkQkDk"), threshold = 0.2,
itermax = itmax, eps = epsilon,
algo = "EM", d_select = "Cattell", init = "kmeans",
show = FALSE, scaling = FALSE, mc.cores=mc.cores, # alternatively
min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
nb.rep = 1, kmeans.control = kmeans.control, keepAllRes = F)
}
else if (initialisation_algorithm=="small-em") {
fit <- HDclassif::hddc(x, K = k, model = c("AkjBkQkDk"), threshold = 0.2,
itermax = itmax, eps = epsilon,
algo = "EM", d_select = "Cattell", init = "mini-em",
show = FALSE, scaling = FALSE, mc.cores=mc.cores, # alternatively
min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
nb.rep = 1, keepAllRes = F, mini.nb=c(10L, 200))
}
else if (initialisation_algorithm=="random") {
fit <- HDclassif::hddc(x, K = k, model = c("AkjBkQkDk"), threshold = 0.2,
itermax = itmax, eps = epsilon,
algo = "EM", d_select = "Cattell", init = "param",
show = FALSE, scaling = FALSE, mc.cores=mc.cores, # alternatively
min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
nb.rep = 1, keepAllRes = F)
}
else { # only the rebmix and hc methods reach this branch
start <- initialize_em_multivariate(x = x, k = k,
nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm)
post_proba <- predict_posterior_probability(x, start)$eta
predicted_classes <- apply(post_proba, MARGIN = 1, which.max)
fit <- HDclassif::hddc(x, K = k, model = c("AkjBkQkDk"), threshold = 0.2,
itermax = itmax, eps = epsilon,
algo = "EM", d_select = "Cattell", init = "vector",
show = FALSE, scaling = FALSE, mc.cores=mc.cores, # alternatively
min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
nb.rep = 1, keepAllRes = F, init.vector = predicted_classes)
}
# convert back to the original space
p <- c(fit$prop); D <- ncol(fit$mu); mu <- t(fit$mu) # hddc returns a k x D mean matrix; transpose to the D x k convention
sigma <- array(0, dim=c(D, D, k))
for (j in 1:k) {
d_j <- ncol(fit$Q[[j]])
sigma[,,j] <- fit$Q[[j]] %*% diag(c(fit$a[j,1:d_j]-fit$b[j]), nrow = d_j) %*% t(fit$Q[[j]]) + fit$b[j] * diag(D)
}
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(list(p = p, mu = mu, sigma = sigma))
return(ordered_estimated_theta)
}
#' @rdname emnmix_multivariate
#' @export
em_pgmm_multivariate <- function(x, k = 2, initialisation_algorithm = "kmeans",
epsilon = 10^-4, itmax = 500, aic_acc=0.1, start = NULL,
arguments_HDclassif=list(init= "kmeans",
kmeans.control=list(iter.max=200L, nstart=10L,
algorithm="Hartigan-Wong"),
mc.cores=getOption("mc.cores",
parallel::detectCores()))) {
### way too long to compute all models
# D <- ncol(x); # as stated in Bouveyron 2007, minimal dimension onto which to project
# d_j <- min(D, min(quadraticRoots(a=1, b=-(1 + 2*D), c=D^2 - D))) %>% trunc()
# d_j <- ifelse((D-d_j)^2==D + d_j, d_j-1, d_j)
d_j <- do.call(HDclassif::hddc, c(list(data=x, K = k, model = c("AkjBkQkD"), threshold = 0.2,
itermax = itmax, eps = epsilon, algo = "EM", d_select = "Cattell",
show = FALSE, scaling = FALSE,
min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
nb.rep = 1, keepAllRes = F), arguments_HDclassif)) %>%
magrittr::extract2("d") %>% mean()
fit <- pgmm::pgmmEM(x, rG = k, rq=d_j, class=NULL, # 1:d_j
icl = FALSE, zstart=2,
modelSubset="UUU", tol=aic_acc) # We use a different criterion of acceleration here
# convert back to the original space
eta <- fit$zhat; p <- c(); n <- nrow(x); D <- ncol(x)
mu <- matrix(0, nrow = D, ncol = k); sigma <- array(0, dim = c(D, D, k))
for (j in 1:k) {
p <- c(p, sum(eta[,j])/n)
mu[, j] <- apply(x, 2, stats::weighted.mean, eta[, j])
sigma[, , j] <- fit$load[[j]] %*% t(fit$load[[j]]) + fit$noisev[[j]]
}
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(list(p = p, mu = mu, sigma = sigma))
return(ordered_estimated_theta)
}
#' @rdname emnmix_multivariate
#' @export
em_EMMIXmfa_multivariate <- function(x, k = 2, initialisation_algorithm = "kmeans", conv_measure = 'diff',
epsilon = 10^-4, itmax = 500, start = NULL, nkmeans = 10L,
arguments_HDclassif=list(init= "kmeans",
kmeans.control=list(iter.max=200L, nstart=10L,
algorithm="Hartigan-Wong"),
mc.cores=getOption("mc.cores", parallel::detectCores()))) {
# getOption("mc.cores", parallel::detectCores()) set it to one to avoid redundancy with the cluster
### use HDclassif to retrieve the internal dimension
# since this dimension can be specific to any dimension, we rather take the median (we might weight)
# by the relative abundance of each class
d_j <- do.call(HDclassif::hddc, c(list(data=x, K = k, model = c("AkjBkQkD"), threshold = 0.2,
itermax = itmax, eps = epsilon, algo = "EM", d_select = "Cattell",
show = FALSE, scaling = FALSE,
min.individuals = 2, noise.ctrl = 1e-08, subset = Inf,
nb.rep = 1, keepAllRes = F), arguments_HDclassif)) %>%
magrittr::extract2("d") %>% mean()
# the fit itself
if (initialisation_algorithm== "kmeans") {
suppressWarnings(suppressMessages(invisible(utils::capture.output(
fit <- EMMIXmfa::mcfa(Y=x, g=k, q=d_j, itmax = itmax, nkmeans = 1,
tol = epsilon, conv_measure = conv_measure, warn_messages = FALSE)))))
}
else if (initialisation_algorithm== "random") {
suppressWarnings(suppressMessages(invisible(utils::capture.output(
fit <- EMMIXmfa::mcfa(Y=x, g=k, q=d_j, itmax = itmax, init_method = "rand-A", nrandom=10L,
tol = epsilon, conv_measure = conv_measure, warn_messages = FALSE, nkmeans = 1)))))
}
else {
start <- initialize_em_multivariate(x = x, k = k,
nstart = 10L, short_iter = 200, short_eps = 10^-2,
initialisation_algorithm = initialisation_algorithm)
post_proba <- predict_posterior_probability(x, start)$eta
predicted_classes <- apply(post_proba, MARGIN = 1, which.max)
suppressWarnings(suppressMessages(invisible(utils::capture.output(
fit <- EMMIXmfa::mcfa(Y=x, g=k, q=d_j, itmax = itmax, init_clust = predicted_classes %>% as.factor(),
tol = epsilon, conv_measure = conv_measure, warn_messages = FALSE, nkmeans = 1)))))
}
p <- c(fit$pivec); mu <- fit$A %*% fit$xi; D <- ncol(x)
sigma <- array(0, dim = c(D, D, k))
for (j in 1:k) {
sigma[,,j] <- fit$A %*% fit$omega[,,j] %*% t(fit$A) + fit$D
}
### return a unique ordered parameter configuration
ordered_estimated_theta <- enforce_identifiability(list(p = p, mu = mu, sigma = sigma))
return(ordered_estimated_theta)
}
#' @rdname emnmix_univariate
#' @export
em_otrimle <- function(x, k = 2, initialisation_algorithm = "kmeans",
epsilon = 10^-4, itmax = 500, ...) {
if (initialisation_algorithm == "hc") {
start <- otrimle::InitClust(as.matrix(x), G = k, modelName = "V")
} else {
stop("This initialisation method is not enabled with otrimle package.")
}
available_processors <- parallel::detectCores() %/% 2
fit <- otrimle::otrimle(
data = as.matrix(x), G = k, initial = start,
iter.max = itmax, tol = epsilon, ncores = available_processors, monitor = FALSE
)
# return a list ordered by mean values, plus the estimated proportion of outliers
ordered_estimated_theta <- list(
p = fit$pi[-1][order(fit$mean)],
mu = sort(fit$mean),
sigma = sqrt(fit$cov[order(fit$mean)]),
prop_extra = fit$exproportion
)
ordered_estimated_theta <- ordered_estimated_theta %>% purrr::map(unname)
return(ordered_estimated_theta)
}
#' Estimate the parameters in the supervised case (all labels associated with the observations are available)
#'
#' @author Bastien CHASSAGNOL
#'
#' @param x the univariate or multivariate distribution of observed variables
#' @param s the index of the clusters, ranging from 1 to \eqn{k}, the number of clusters
#' @param s_outliers an index vector identical to `s`, except that outlying points are flagged with 0
#' @param k the number of classes (by default, the number of unique values within the indicator vector)
#'
#' @return a list with three arguments, respectively the proportions, the means, and the variance / covariance
#'
#' @seealso \code{\link{simulate_univariate_GMM}}
#'
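#' @examples
#' # A minimal sketch: supervised estimation re-using the true labels of a simulation
#' sim <- simulate_univariate_GMM(n = 200, prop_outliers = 0.05)
#' estimate_supervised_univariate_GMM(x = sim$x, s = sim$s, s_outliers = sim$s_outliers)
#'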
#' @export
estimate_supervised_univariate_GMM <- function(x, s, s_outliers = s, k = length(unique(s))) {
##### retrieve relevant values
x <- x[s_outliers != 0]
s_outliers <- s_outliers[s_outliers != 0]
n <- length(x)
eta <- table(1:n, s_outliers) # only 0 or 1 inputs, matrix of weights
##### estimate the parameters
p <- table(s_outliers) %>%
c() %>%
unname() %>%
magrittr::divide_by(n)
# update parameter vector zeta per component
parametric_parameters <- apply(eta, 2, function(w) {
stats::cov.wt(as.matrix(x), wt = w, cor = F, method = "ML", center = T)
})
mu <- purrr::map_dbl(parametric_parameters, "center")
sigma <- sqrt(purrr::map_dbl(parametric_parameters, "cov"))
return(list(p = p, mu = mu, sigma = sigma))
}
#' @rdname estimate_supervised_univariate_GMM
#' @export
estimate_supervised_multivariate_GMM <- function(x, s, k = length(unique(s))) {
dim_gaussian <- ncol(x)
n <- nrow(x)
mu <- matrix(0, nrow = dim_gaussian, ncol = k)
sigma <- array(0, dim = c(dim_gaussian, dim_gaussian, k))
# estimate the ratios
p <- table(s) %>%
c() %>%
unname() %>%
magrittr::divide_by(n)
for (j in 1:k) {
mu[, j] <- apply(x[s == j, ], 2, mean) # estimate the means
sigma[, , j] <- stats::cov.wt(x[s == j, ], method = "ML")$cov # estimate covariances
}
return(list(p = p, mu = mu, sigma = sigma))
}
#' @title Helper functions for the parameter estimation of GMMs
#'
#' @description These functions are small chunks of code designed to decompose the computation of the EM algorithm into simpler steps.
#' `logsumexp` computes \eqn{\log(\sum_i \exp(l_i))}, avoiding numerical underflow and overflow
#'
#' @param l a vector of numeric terms
#' @return a numeric scalar value, result of the previously described equation
logsumexp <- function(l) {
i <- which.max(l)
res <- l[i] + log1p(sum(exp(l[-i] - l[i])))
if (is.nan(res)) res <- -Inf
return(res)
}
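# A quick illustration of the underflow this trick avoids: with l = c(-1000, -1001),
# the naive log(sum(exp(l))) underflows to -Inf, whereas
# logsumexp(c(-1000, -1001)) returns -1000 + log1p(exp(-1)), i.e. approximately -999.687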
#' @describeIn logsumexp `predict_posterior_probability` returns the expected probability for each observation
#' to belong to any of the \eqn{k} clusters set a priori, given the estimated parameters
#'
#' @param x the vector (univariate context) or matrix (multivariate context) of observed values, comprising \eqn{n} observations
#' @param estimated_theta the estimated parameters
#' @return a list with two elements:
#' * the posterior probability matrix, `eta`: \eqn{\eta=(\eta_{i,j}) \in [0, 1]^{n \times k}}, with \eqn{\eta_{i,j}}
#' giving the posterior probability of observation \eqn{i} to belong to cluster \eqn{j}
#' * `loglik` returns the expected log-likelihood of our experiment
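#' @examples
#' # A short sketch: posterior memberships evaluated at the true simulation parameters
#' sim <- simulate_univariate_GMM(n = 100)
#' post <- predict_posterior_probability(sim$x, list(p = sim$p, mu = sim$mu, sigma = sim$sigma))
#' head(rowSums(post$eta)) # each row of eta sums to one
#' post$loglik # observed log-likelihood under these parameters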
#' @export
predict_posterior_probability <- function(x, estimated_theta) {
k <- length(estimated_theta$p)
p <- estimated_theta$p
mu <- estimated_theta$mu
sigma <- estimated_theta$sigma
n <- ifelse(is.array(estimated_theta$sigma), nrow(x), length(x))
eta <- matrix(NA, nrow = n, ncol = k) # store posterior distribution for each observation (P(S=i|X))
if (is.array(estimated_theta$sigma)) {
# multivariate scenario
for (j in 1:k) {
eta[, j] <- log(p[j]) + mvtnorm::dmvnorm(x, mean = mu[, j], sigma = sigma[, , j], log = TRUE)
}
} else {
# univariate scenario
for (j in 1:k) {
eta[, j] <- log(p[j]) + stats::dnorm(x, mu[j], sigma[j], log = TRUE) # compute the unnormalised log posterior, working on the log scale to avoid underflow
}
}
# the log-sum-exp trick applies to any density from the exponential family,
# which encompasses both the univariate and multivariate normal distributions
aux <- apply(eta, 1, logsumexp)
eta <- exp(eta - aux)
return(list(eta = eta, loglik = sum(aux)))
}
#' Check whether the estimation has been trapped in the boundary space
#'
#' * Function `check_parameters_validity_univariate` asserts at each step of the EM algorithm that
#' it doesn't fall in a degenerate case (either the package performing the EM computation has failed, and returns an
#' error message, or the algorithm is trapped in the boundary space, leading to inconsistent division by zero).
#' * Function `check_parameters_validity_multivariate` has the same functionality, but
#' is adjusted to the multivariate parametrisation and additionally checks whether
#' each covariance matrix is positive definite
#'
#' @param theta the set of parameters defining GMMs, with three inputs, namely p, mu and sigma
#' @param k the number of components, by default the number of proportions indicated in vector p
#'
#' @seealso [logsumexp()]
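#' @examples
#' # A minimal sketch: a component with a null standard deviation is flagged as degenerate
#' suppressWarnings(
#'   check_parameters_validity_univariate(list(p = c(0.5, 0.5), mu = c(0, 4), sigma = c(1, 0)))
#' ) # returns FALSE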
#' @export
check_parameters_validity_univariate <- function(theta, k = length(theta$p)) {
machine_limit <- .Machine$double.eps
machine_max <- .Machine$double.xmax
theta <- theta[c("p", "mu", "sigma")]
if (any(is.na(unlist(theta))) | length(unlist(theta)) == 0) {
warning(paste0(
"Missing data with estimated theta, with ratios: ", paste(theta$p, collapse = " / "), ", ",
"mu: ", paste(theta$mu, collapse = " / "), " and sigma: ", paste(theta$sigma, collapse = " / ")
))
return(FALSE)
} else if (any(theta$p < machine_limit | theta$p > 1 - machine_limit) |
any(theta$sigma < machine_limit) | any(theta$sigma > machine_max) | any(sapply(theta, length) != k)) {
warning(paste0(
"Numerical overflow with estimated theta, with ratios: ", paste(theta$p, collapse = " / "), ", ",
"mu: ", paste(theta$mu, collapse = " / "), " and sigma: ", paste(theta$sigma, collapse = " / ")
))
return(FALSE)
} else {
return(TRUE)
}
}
#' @rdname check_parameters_validity_univariate
check_parameters_validity_multivariate <- function(theta, k = length(theta$p)) {
machine_limit <- .Machine$double.eps
machine_max <- .Machine$double.xmax
is_valid_parametrisation <- TRUE # store whether the parametrisation is correctly performed
p <- theta$p
mu <- theta$mu
sigma <- theta$sigma
k <- length(p) # get values from theta parameter
dimension_gaussian <- dim(mu)[1]
if (any(is.na(unlist(theta))) | length(unlist(theta)) == 0) {
warning("NA values or no output from the estimation. \n")
is_valid_parametrisation <- FALSE
}
# check the parametrisation of proportions (sum-to-one constraint)
if (abs(1 - sum(p)) > machine_limit | any(p < machine_limit) | any(p > 1 - machine_limit)) {
warning("One at least of your proportions does not enforce the sum-to-one constraint. \n")
is_valid_parametrisation <- FALSE
}
# check parametrisation of means
if (!identical(dim(mu), c(dimension_gaussian, k)) | any(c(mu) > machine_max)) {
warning("The mean parameter must be of dimension ndim * k, with k the number of clusters. \n")
is_valid_parametrisation <- FALSE
}
# check the parametrisation of covariance
if (!identical(dim(sigma), c(dimension_gaussian, dimension_gaussian, k)) | any(c(sigma) > machine_max)) {
warning("The covariance parameter must be an array of dimension ndim * ndim * k, with k the number of clusters. \n")
is_valid_parametrisation <- FALSE
} else {
for (j in 1:k) {
if (!is_positive_definite(sigma[, , j])) {
warning(glue::glue("Covariance matrix for component {j} is not positive definite.\n"))
is_valid_parametrisation <- FALSE
break
}
if (!isSymmetric(sigma[, , j])) {
warning(glue::glue("Covariance matrix for component {j} is not symmetric. \n"))
is_valid_parametrisation <- FALSE
break
}
}
}
return(is_valid_parametrisation)
}