R/hypvolgmm.R
In mclustAddons: Addons for the 'mclust' Package

Documented in hypvolgmm hypvolgmm_hdlevel hypvoltmvnorm hypvolunif

#' @title Gaussian mixture-based hypervolume estimation of multivariate data
#' 
#' @description 
#' Estimates the hypervolume of a multivariate dataset by fitting a 
#' Gaussian Mixture Model (GMM).
#'
#' @param data A numeric vector, matrix, or data frame. If a matrix or
#' 	data frame, rows correspond to observations and columns correspond
#' 	to variables. Categorical variables and missing values are not allowed.
#' @param method A character specifying the sampling scheme to be used for
#' 	the estimation. Available options are: 
#'   - \code{"lhs"} = Latin Hypercube Sampling (default); 
#'   - \code{"smc"} = Simple Monte Carlo sampling;
#'   - \code{"is"} = Importance Sampling.
#' @param G An integer vector specifying the numbers of mixture components to fit, 
#' 	then model selection if performed via BIC.
#' @param prior A logical specifying if a prior (on the covariance) matrix should 
#'	be used for regularization.
#' @param nsim Integer specifying the number of simulations used in MC sampling.
#' @param hdlevel A numerical value in the range (0,1] specifying the level
#'	of the highest density region (HDR) to be used for defining the GMM hull.
#'	If not provided, a default data-driven value is computed.
#' @param GMM Optional `Mclust` or `densityMclust` object to be used.
#' If provided, the `data` argument is ignored.
#' @param \dots Further arguments passed to or from other methods.
#'
#' @return
#' Returns a list containing the following elements: 
#' * `GMM` An object of class `densityMclust` containing the fitted
#'    Gaussian mixture.
#' * `method` The sampling method employed.
#' * `hdlevel` The hdlevel used for definying the GMM hull.
#' * `nsim`	The number of simulations used in MC sampling.
#' * `ESS`	The effective sample size.
#' * `h`	The density level of GMM hull.
#' * `gamma`	The estimated parameter used for adjustment
#' * `logvol` The estimated log hypervolume.
#'  
#' @author Luca Scrucca
#'  
#' @examples
#' x = matrix(rnorm(100), ncol = 2)
#' mod = hypvolgmm(x)
#' summary(mod$GMM)
#' mod$logvol
#' 
#' @export

hypvolgmm <- function(data, 
                      method = c("lhs", "smc", "is", "none"), 
                      G = 1:9, 
                      prior = TRUE,
                      nsim = 1e6, 
                      hdlevel = NULL,
                      GMM = NULL,
                      ...)
{
  data = data.matrix(data)
  n = nrow(data)
  d = ncol(data)
  method = match.arg(method, several.ok = FALSE)
  nsim = ESS = as.numeric(nsim)

  if(is.null(GMM))
  {
    GMM = densityMclust(data, G = G,
                        modelNames = if(NCOL(data) > 1) "VVV" else "V",
                        prior = if(prior) priorControl(shrinkage = 0) else NULL,
                        verbose = FALSE, plot = FALSE)
  } else
  {
    stopifnot(inherits(GMM, c("Mclust", "densityMclust")))
    GMM = as.densityMclust(GMM)
  }
  if(method == "none") return(GMM)

  logdens = log(GMM$density)
  hdlevel = if(is.null(hdlevel)) hypvolgmm_hdlevel(exp(logdens))$hdl else hdlevel
  h = mclust::hdrlevels(exp(logdens), prob = hdlevel)
  r = apply(data, 2, range)
  logVol_U = sum(log(apply(r, 2, diff)))
  switch(method,
         "lhs" = 
           {
             x = hypcube_lhs(nsim, d, lbound = r[1,], ubound = r[2,])
             logdens_x = mclust::dens(data = x, 
                                      modelName = GMM$modelName, 
                                      parameters = GMM$parameters, 
                                      logarithm = TRUE)
             gamma = (sum(logdens_x >= log(h)) + 1) / (nsim + 2)
             logvol = logVol_U + log(gamma)
           },
         "smc" = 
           {
             x = hypcube_smc(nsim, d, lbound = r[1,], ubound = r[2,])
             logdens_x = mclust::dens(data = x, 
                                      modelName = GMM$modelName, 
                                      parameters = GMM$parameters, 
                                      logarithm = TRUE)
             gamma = (sum(logdens_x >= log(h)) + 1) / (nsim + 2)
             logvol = logVol_U + log(gamma)
           },
         "is" = 
           {
             # draw from proposal (GMM)
             x = mclust::sim(n = nsim, 
                             modelName = GMM$modelName,
                             parameters = GMM$parameters)[,-1,drop=FALSE]
             # keep draws inside H (truncated GMM)
             inside = inside_range(x, r)
             x = x[inside, , drop = FALSE]
#            # on original scale:             
# 					 # evaluate GMM density at proposal
#            dens_x = mclust::dens(data = x,
#                                  modelName = GMM$modelName,
#                                  parameters = GMM$parameters,
#                                  logarithm = FALSE)
#            # IS weights = density ratio target/proposal
#            weights = 1*(dens_x >= h) / dens_x
             logdens_x = mclust::dens(data = x,
                                      modelName = GMM$modelName,
                                      parameters = GMM$parameters,
                                      logarithm = TRUE)
             # IS weights = density ratio target/proposal
             weights = 1*(logdens_x >= log(h))/exp(logdens_x)
             # log-volume estimate
             logvol = log(sum(weights)/nsim) # ≈ log(mean(weights))
             gamma = exp(logvol - logVol_U)
             ESS = (sum(weights)^2) / sum(weights^2)
           }
  )

  out = list(GMM = GMM, 
             method = method,
             hdlevel = hdlevel,
             nsim = nsim,
             ESS = ESS,
             h = h,
             gamma = gamma,
             logvol = logvol)
  class(out) = "hypvolgmm"
  return(out)
}

#' @title Default HDR level defining the GMM hull 
#' 
#' @description 
#' Compute default HDR level used to define the GMM hull. This is obtained by
#' fitting a two-segment piecewise linear regression model (i.e., single change 
#' point model) to a grid of \eqn{(\alpha, h_\alpha)} values with \eqn{\alpha} 
#' in \eqn{[0.9, 1]}.
#'
#' @param dens A vector of density estimates for the \eqn{n} observed data points.
#' @param \dots Further arguments passed to or from other methods.
#'
#' @return
#' Returns a list containing the following elements: 
#' * `alpha` A vector of alpha values specifying the HDR levels.
#' * `h_alpha` A vector of HDR density values corresponding to `alpha`.
#' * `pcwsreg` The two-segment piecewise linear regression model.
#' * `hdl`	The optimal HDR level selected as the change-point value.
#' * `h`	The HDR density values corresponding to the optimal HDR level.
#'  
#' @author Luca Scrucca
#'  
#' @examples
#' x = subset(faithful, eruptions > 3)
#' mod = densityMclust(x, plot = FALSE)
#' out = hypvolgmm_hdlevel(mod$density)
#' with(out, 
#' {
#'   plot(alpha, h_alpha, type = "b", pch = 20)
#'   lines(alpha[1:pcwsreg$c],
#'         pcwsreg$a1 * alpha[1:pcwsreg$c] + pcwsreg$b1,
#'         lty = 2, lwd = 2, col = "red2")
#'   lines(alpha[pcwsreg$c:length(alpha)],
#'         pcwsreg$a2 * alpha[pcwsreg$c:length(alpha)] + pcwsreg$b2,
#'         lty = 2, lwd = 2, col = "red2")
#'   abline(v = hdl, lty = 3, col = "red2", lwd = 2)
#'   abline(h = h, lty = 3, col = "red2", lwd = 2)
#' })
#' 
#' plot(x, xlim = c(2,6), ylim = c(60,100))
#' surfacePlot(data = mod$data, parameters = mod$parameters,
#'             what = "density", type = "contour", 
#'             levels = c(out$h, min(mod$density)),
#'             xlim = c(2,6), ylim = c(60,100), lty = c(1,2),
#'             add = TRUE, drawlabels = FALSE)
#' 
#' @export

hypvolgmm_hdlevel <- function(dens, ...) 
{
  dens = as.vector(dens)
  alpha = ppoints(dens, a = 0.5)
  alpha = alpha[alpha >= 0.9]
  h_alpha = mclust::hdrlevels(dens, prob = alpha)
  pcwsreg = mclust:::pcws2_reg(alpha, h_alpha)
  out = list(alpha = alpha, h_alpha = h_alpha, 
             pcwsreg = pcwsreg, 
             hdl = alpha[pcwsreg$c],
             h = h_alpha[pcwsreg$c])
  return(out)
}

# hypcube_lhs_R <- function(n, d, lbound = rep(0,d), ubound = rep(1,d))
# {
# # Latin Hypercube Sampling
# #
# # n = the number of sampled points
# # d = the dimension of the sample space
# # lbound = the lower bounds of the d-dimensional hypercube
# # ubound = the upper bounds of the d-dimensional hypercube
# #
# # Example:
# # x <- hypcube_lhs(100, 2)
# # plot(x, xlim = c(0,1), ylim = c(0,1))
# # rug(x[,1]); rug(x[,2],side = 2)
# # x <- hypcube_lhs(100, 2, lbound = c(-5,1), ubound = c(10,3))
# # plot(x, xlim = c(-5,10), ylim = c(1,3))
# # rug(x[,1]); rug(x[,2],side = 2)
# 
#   if(length(n) > 1 | !is.numeric(n) | floor(n) != n | n < 1)
#     stop("n must be a positive integer")
#   if(length(d) > 1 | !is.numeric(d) | floor(d) != d | d < 1)
#     stop("d must be a positive integer")
# 
#   X <- matrix(as.double(NA), nrow = n, ncol = d)
#   for(j in 1:d)
#   {
#     X[,j] <- sample(n, size = n, replace = FALSE)
#     X[,j] <- (X[,j] + runif(n) - 1)/n
#     X[,j] <- lbound[j] + (ubound[j]-lbound[j])*X[,j]
#   }
# 
#   return(X)
# }

#' @title Approximate hypervolume for multivariate data
#' 
#' @description 
#' Simple approximations for the hypervolume of a multivariate dataset
#' by assuming a uniform distribution.
#'
#' @param data A numeric vector, matrix, or data frame. If a matrix or
#' 	data frame, rows correspond to observations and columns correspond
#' 	to variables. Categorical variables and missing values are not allowed.
#' @param pc A logical value indicating whether the volume of hyperectangle
#'  enclosing the data (`FALSE`) or the volume of hyperectangle aligned with 
#'  the principal axes enclosing the data should be computed.
#' @param logarithm A logical value indicating whether or not the logarithm 
#'  of the hypervolume should be returned.
#' @param \dots Further arguments passed to or from other methods.
#'
#' @return
#' Returns the (log) hypervolume from a uniform distribution aligned with the 
#' original axes or along the principal components.
#' 
#' @author Luca Scrucca
#'  
#' @examples
#' x1 = rnorm(1000)
#' x2 = 0.8*x1 + rnorm(1000)
#' x = cbind(x1, x2)
#' hypvolunif(x)
#' hypvolunif(x, pc = TRUE)
#' 
#' @export

hypvolunif <- function(data, pc = FALSE, logarithm = TRUE, ...)
{
  data <- as.matrix(data)
  if(pc) data <- princomp(data)$scores
  logV = sum(log(apply(data, 2, function(x) diff(range(x)))))
  if(logarithm) logV else exp(logV)
}

#' @title Approximate hypervolume for multivariate data
#' 
#' @description 
#' Approximate the hypervolume of a multivariate dataset by assuming
#' a truncated multivariate normal (TMVN) distribution within given radius.
#'
#' @param data A numeric vector, matrix, or data frame. If a matrix or
#' 	data frame, rows correspond to observations and columns correspond
#' 	to variables. Categorical variables and missing values are not allowed.
#' @param radius A numerical value specifying the radius to be used for
#'  truncating the multivariate Gaussian distribution. If not provided
#'  is set to \eqn{sqrt(d+1)}, where \eqn{d} is the dimensionality of 
#'  the data, i.e. number of columns of `data`.
#' @param \dots Further arguments passed to or from other methods.
#'
#' @return
#' Returns a list with the following components:
#' * `sigma` The estimated covariance matrix.
#' * `radius` The radius used for computation.
#' * `logvol` The log of hypervolume.
#' * `logdens` The smallest log-density for observed data points within the
#'  radius. 
#'  
#' @author Luca Scrucca
#'  
#' @examples
#' x1 = rnorm(1000)
#' x2 = 0.8*x1 + rnorm(1000)
#' x = cbind(x1, x2)
#' hypvoltmvnorm(x, radius = 1)
#' 
#' @export

hypvoltmvnorm <- function(data, radius = NULL, ...)
{
  data <- as.matrix(data)
  d <- ncol(data)
  if(is.null(radius)) radius <- sqrt(d+1)
  
  if(d==1)
  {
    mu_hat = mean(data)
    sigma_hat = var(data)
    sigma_svd = sigma_hat
    log_det_sigma_hat = log(sigma_hat)
  } else
  {
    mu_hat = colMeans(data)
    sigma_hat = cov(data)
    sigma_svd = svd(sigma_hat)
    log_det_sigma_hat = sum(log(sigma_svd$d))
  }
  
  logvol = d*log(radius) + (d/2)*log(pi) + log_det_sigma_hat/2 - lgamma(d/2+1)
  
  # which samples are in the set A from TMVN
  md = mahalanobis(data, center = mu_hat, cov = sigma_hat)
  inA = (md < radius^2)

  # min log-density of data points in A
  logdens = min(mclust::dmvnorm(data[inA,], mean = mu_hat,
                                sigma =sigma_hat, log = TRUE))
  # -d/2 * log(2*pi) + log_det_sigma_hat/2 - radius^2/2
  
  out = list(sigma = sigma_hat, 
             radius = radius,
             logvol = logvol,
             logdens = logdens)
  return(out)
}

mvnormvol <- function(d, sigma = NULL, conf_level = 0.999, logarithm = TRUE) 
{
  d = as.numeric(d)[1]
  if(is.null(sigma)) sigma = diag(d)
  c = qchisq(conf_level, df = d)  # Chi-squared quantile
  r = sqrt(c)                     # radius of the hypersphere
  # compute the log-volume of the hypersphere in d dimensions
  logvol <- d/2 * log(pi) - lgamma(d/2 + 1) + d*log(r) + log(det(sigma))
  if(logarithm) return(logvol) else return(exp(logvol))
}