Nothing
#' @title Gaussian mixture-based hypervolume estimation of multivariate data
#'
#' @description
#' Estimates the hypervolume of a multivariate dataset by fitting a
#' Gaussian Mixture Model (GMM).
#'
#' @param data A numeric vector, matrix, or data frame. If a matrix or
#' data frame, rows correspond to observations and columns correspond
#' to variables. Categorical variables and missing values are not allowed.
#' @param method A character specifying the sampling scheme to be used for
#' the estimation. Available options are:
#' - \code{"lhs"} = Latin Hypercube Sampling (default);
#' - \code{"smc"} = Simple Monte Carlo sampling;
#' - \code{"is"} = Importance Sampling.
#' @param G An integer vector specifying the numbers of mixture components to fit,
#' then model selection if performed via BIC.
#' @param prior A logical specifying if a prior (on the covariance) matrix should
#' be used for regularization.
#' @param nsim Integer specifying the number of simulations used in MC sampling.
#' @param hdlevel A numerical value in the range (0,1] specifying the level
#' of the highest density region (HDR) to be used for defining the GMM hull.
#' If not provided, a default data-driven value is computed.
#' @param GMM Optional `Mclust` or `densityMclust` object to be used.
#' If provided, the `data` argument is ignored.
#' @param \dots Further arguments passed to or from other methods.
#'
#' @return
#' Returns a list containing the following elements:
#' * `GMM` An object of class `densityMclust` containing the fitted
#' Gaussian mixture.
#' * `method` The sampling method employed.
#' * `hdlevel` The hdlevel used for definying the GMM hull.
#' * `nsim` The number of simulations used in MC sampling.
#' * `ESS` The effective sample size.
#' * `h` The density level of GMM hull.
#' * `gamma` The estimated parameter used for adjustment
#' * `logvol` The estimated log hypervolume.
#'
#' @author Luca Scrucca
#'
#' @examples
#' x = matrix(rnorm(100), ncol = 2)
#' mod = hypvolgmm(x)
#' summary(mod$GMM)
#' mod$logvol
#'
#' @export
hypvolgmm <- function(data,
method = c("lhs", "smc", "is", "none"),
G = 1:9,
prior = TRUE,
nsim = 1e6,
hdlevel = NULL,
GMM = NULL,
...)
{
data = data.matrix(data)
n = nrow(data)
d = ncol(data)
method = match.arg(method, several.ok = FALSE)
nsim = ESS = as.numeric(nsim)
if(is.null(GMM))
{
GMM = densityMclust(data, G = G,
modelNames = if(NCOL(data) > 1) "VVV" else "V",
prior = if(prior) priorControl(shrinkage = 0) else NULL,
verbose = FALSE, plot = FALSE)
} else
{
stopifnot(inherits(GMM, c("Mclust", "densityMclust")))
GMM = as.densityMclust(GMM)
}
if(method == "none") return(GMM)
logdens = log(GMM$density)
hdlevel = if(is.null(hdlevel)) hypvolgmm_hdlevel(exp(logdens))$hdl else hdlevel
h = mclust::hdrlevels(exp(logdens), prob = hdlevel)
r = apply(data, 2, range)
logVol_U = sum(log(apply(r, 2, diff)))
switch(method,
"lhs" =
{
x = hypcube_lhs(nsim, d, lbound = r[1,], ubound = r[2,])
logdens_x = mclust::dens(data = x,
modelName = GMM$modelName,
parameters = GMM$parameters,
logarithm = TRUE)
gamma = (sum(logdens_x >= log(h)) + 1) / (nsim + 2)
logvol = logVol_U + log(gamma)
},
"smc" =
{
x = hypcube_smc(nsim, d, lbound = r[1,], ubound = r[2,])
logdens_x = mclust::dens(data = x,
modelName = GMM$modelName,
parameters = GMM$parameters,
logarithm = TRUE)
gamma = (sum(logdens_x >= log(h)) + 1) / (nsim + 2)
logvol = logVol_U + log(gamma)
},
"is" =
{
# draw from proposal (GMM)
x = mclust::sim(n = nsim,
modelName = GMM$modelName,
parameters = GMM$parameters)[,-1,drop=FALSE]
# keep draws inside H (truncated GMM)
inside = inside_range(x, r)
x = x[inside, , drop = FALSE]
# # on original scale:
# # evaluate GMM density at proposal
# dens_x = mclust::dens(data = x,
# modelName = GMM$modelName,
# parameters = GMM$parameters,
# logarithm = FALSE)
# # IS weights = density ratio target/proposal
# weights = 1*(dens_x >= h) / dens_x
logdens_x = mclust::dens(data = x,
modelName = GMM$modelName,
parameters = GMM$parameters,
logarithm = TRUE)
# IS weights = density ratio target/proposal
weights = 1*(logdens_x >= log(h))/exp(logdens_x)
# log-volume estimate
logvol = log(sum(weights)/nsim) # ≈ log(mean(weights))
gamma = exp(logvol - logVol_U)
ESS = (sum(weights)^2) / sum(weights^2)
}
)
out = list(GMM = GMM,
method = method,
hdlevel = hdlevel,
nsim = nsim,
ESS = ESS,
h = h,
gamma = gamma,
logvol = logvol)
class(out) = "hypvolgmm"
return(out)
}
#' @title Default HDR level defining the GMM hull
#'
#' @description
#' Compute default HDR level used to define the GMM hull. This is obtained by
#' fitting a two-segment piecewise linear regression model (i.e., single change
#' point model) to a grid of \eqn{(\alpha, h_\alpha)} values with \eqn{\alpha}
#' in \eqn{[0.9, 1]}.
#'
#' @param dens A vector of density estimates for the \eqn{n} observed data points.
#' @param \dots Further arguments passed to or from other methods.
#'
#' @return
#' Returns a list containing the following elements:
#' * `alpha` A vector of alpha values specifying the HDR levels.
#' * `h_alpha` A vector of HDR density values corresponding to `alpha`.
#' * `pcwsreg` The two-segment piecewise linear regression model.
#' * `hdl` The optimal HDR level selected as the change-point value.
#' * `h` The HDR density values corresponding to the optimal HDR level.
#'
#' @author Luca Scrucca
#'
#' @examples
#' x = subset(faithful, eruptions > 3)
#' mod = densityMclust(x, plot = FALSE)
#' out = hypvolgmm_hdlevel(mod$density)
#' with(out,
#' {
#' plot(alpha, h_alpha, type = "b", pch = 20)
#' lines(alpha[1:pcwsreg$c],
#' pcwsreg$a1 * alpha[1:pcwsreg$c] + pcwsreg$b1,
#' lty = 2, lwd = 2, col = "red2")
#' lines(alpha[pcwsreg$c:length(alpha)],
#' pcwsreg$a2 * alpha[pcwsreg$c:length(alpha)] + pcwsreg$b2,
#' lty = 2, lwd = 2, col = "red2")
#' abline(v = hdl, lty = 3, col = "red2", lwd = 2)
#' abline(h = h, lty = 3, col = "red2", lwd = 2)
#' })
#'
#' plot(x, xlim = c(2,6), ylim = c(60,100))
#' surfacePlot(data = mod$data, parameters = mod$parameters,
#' what = "density", type = "contour",
#' levels = c(out$h, min(mod$density)),
#' xlim = c(2,6), ylim = c(60,100), lty = c(1,2),
#' add = TRUE, drawlabels = FALSE)
#'
#' @export
hypvolgmm_hdlevel <- function(dens, ...)
{
dens = as.vector(dens)
alpha = ppoints(dens, a = 0.5)
alpha = alpha[alpha >= 0.9]
h_alpha = mclust::hdrlevels(dens, prob = alpha)
pcwsreg = mclust:::pcws2_reg(alpha, h_alpha)
out = list(alpha = alpha, h_alpha = h_alpha,
pcwsreg = pcwsreg,
hdl = alpha[pcwsreg$c],
h = h_alpha[pcwsreg$c])
return(out)
}
# hypcube_lhs_R <- function(n, d, lbound = rep(0,d), ubound = rep(1,d))
# {
# # Latin Hypercube Sampling
# #
# # n = the number of sampled points
# # d = the dimension of the sample space
# # lbound = the lower bounds of the d-dimensional hypercube
# # ubound = the upper bounds of the d-dimensional hypercube
# #
# # Example:
# # x <- hypcube_lhs(100, 2)
# # plot(x, xlim = c(0,1), ylim = c(0,1))
# # rug(x[,1]); rug(x[,2],side = 2)
# # x <- hypcube_lhs(100, 2, lbound = c(-5,1), ubound = c(10,3))
# # plot(x, xlim = c(-5,10), ylim = c(1,3))
# # rug(x[,1]); rug(x[,2],side = 2)
#
# if(length(n) > 1 | !is.numeric(n) | floor(n) != n | n < 1)
# stop("n must be a positive integer")
# if(length(d) > 1 | !is.numeric(d) | floor(d) != d | d < 1)
# stop("d must be a positive integer")
#
# X <- matrix(as.double(NA), nrow = n, ncol = d)
# for(j in 1:d)
# {
# X[,j] <- sample(n, size = n, replace = FALSE)
# X[,j] <- (X[,j] + runif(n) - 1)/n
# X[,j] <- lbound[j] + (ubound[j]-lbound[j])*X[,j]
# }
#
# return(X)
# }
#' @title Approximate hypervolume for multivariate data
#'
#' @description
#' Simple approximations for the hypervolume of a multivariate dataset
#' by assuming a uniform distribution.
#'
#' @param data A numeric vector, matrix, or data frame. If a matrix or
#' data frame, rows correspond to observations and columns correspond
#' to variables. Categorical variables and missing values are not allowed.
#' @param pc A logical value indicating whether the volume of hyperectangle
#' enclosing the data (`FALSE`) or the volume of hyperectangle aligned with
#' the principal axes enclosing the data should be computed.
#' @param logarithm A logical value indicating whether or not the logarithm
#' of the hypervolume should be returned.
#' @param \dots Further arguments passed to or from other methods.
#'
#' @return
#' Returns the (log) hypervolume from a uniform distribution aligned with the
#' original axes or along the principal components.
#'
#' @author Luca Scrucca
#'
#' @examples
#' x1 = rnorm(1000)
#' x2 = 0.8*x1 + rnorm(1000)
#' x = cbind(x1, x2)
#' hypvolunif(x)
#' hypvolunif(x, pc = TRUE)
#'
#' @export
hypvolunif <- function(data, pc = FALSE, logarithm = TRUE, ...)
{
data <- as.matrix(data)
if(pc) data <- princomp(data)$scores
logV = sum(log(apply(data, 2, function(x) diff(range(x)))))
if(logarithm) logV else exp(logV)
}
#' @title Approximate hypervolume for multivariate data
#'
#' @description
#' Approximate the hypervolume of a multivariate dataset by assuming
#' a truncated multivariate normal (TMVN) distribution within given radius.
#'
#' @param data A numeric vector, matrix, or data frame. If a matrix or
#' data frame, rows correspond to observations and columns correspond
#' to variables. Categorical variables and missing values are not allowed.
#' @param radius A numerical value specifying the radius to be used for
#' truncating the multivariate Gaussian distribution. If not provided
#' is set to \eqn{sqrt(d+1)}, where \eqn{d} is the dimensionality of
#' the data, i.e. number of columns of `data`.
#' @param \dots Further arguments passed to or from other methods.
#'
#' @return
#' Returns a list with the following components:
#' * `sigma` The estimated covariance matrix.
#' * `radius` The radius used for computation.
#' * `logvol` The log of hypervolume.
#' * `logdens` The smallest log-density for observed data points within the
#' radius.
#'
#' @author Luca Scrucca
#'
#' @examples
#' x1 = rnorm(1000)
#' x2 = 0.8*x1 + rnorm(1000)
#' x = cbind(x1, x2)
#' hypvoltmvnorm(x, radius = 1)
#'
#' @export
hypvoltmvnorm <- function(data, radius = NULL, ...)
{
data <- as.matrix(data)
d <- ncol(data)
if(is.null(radius)) radius <- sqrt(d+1)
if(d==1)
{
mu_hat = mean(data)
sigma_hat = var(data)
sigma_svd = sigma_hat
log_det_sigma_hat = log(sigma_hat)
} else
{
mu_hat = colMeans(data)
sigma_hat = cov(data)
sigma_svd = svd(sigma_hat)
log_det_sigma_hat = sum(log(sigma_svd$d))
}
logvol = d*log(radius) + (d/2)*log(pi) + log_det_sigma_hat/2 - lgamma(d/2+1)
# which samples are in the set A from TMVN
md = mahalanobis(data, center = mu_hat, cov = sigma_hat)
inA = (md < radius^2)
# min log-density of data points in A
logdens = min(mclust::dmvnorm(data[inA,], mean = mu_hat,
sigma =sigma_hat, log = TRUE))
# -d/2 * log(2*pi) + log_det_sigma_hat/2 - radius^2/2
out = list(sigma = sigma_hat,
radius = radius,
logvol = logvol,
logdens = logdens)
return(out)
}
mvnormvol <- function(d, sigma = NULL, conf_level = 0.999, logarithm = TRUE)
{
d = as.numeric(d)[1]
if(is.null(sigma)) sigma = diag(d)
c = qchisq(conf_level, df = d) # Chi-squared quantile
r = sqrt(c) # radius of the hypersphere
# compute the log-volume of the hypersphere in d dimensions
logvol <- d/2 * log(pi) - lgamma(d/2 + 1) + d*log(r) + log(det(sigma))
if(logarithm) return(logvol) else return(exp(logvol))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.