library(knitr) opts_chunk$set(fig.align = "center", out.width = "80%", fig.width = 7, fig.height = 6, dev.args = list(pointsize=12), par = TRUE, # needed for setting hook collapse = TRUE, # collapse input & output code in chunks warning = FALSE) knit_hooks$set(par = function(before, options, envir) { if(before && options$fig.show != "none") par(family = "sans", mar=c(4.1,4.1,1.1,1.1), mgp=c(3,1,0), tcl=-0.5) }) set.seed(1) # for exact reproducibility
mclustAddons is a contributed R package that extends the functionality available in the mclust package (Scrucca et al. 2016).
In particular, the following methods are included:
density estimation for data with bounded support (Scrucca, 2019);
modal clustering using modal EM algorithm for Gaussian mixtures (Scrucca, 2021);
entropy estimation via Gaussian mixture modeling (Robin & Scrucca, 2023).
This document gives a quick tour of mclustAddons (version r packageVersion("mclustAddons")
). It was written in R Markdown, using the knitr package for production.
References on the methodologies implemented are provided at the end of this document.
library(mclustAddons)
x <- rchisq(200, 3) xgrid <- seq(-2, max(x), length=1000) f <- dchisq(xgrid, 3) # true density dens <- densityMclustBounded(x, lbound = 0) summary(dens, parameters = TRUE) plot(dens, what = "density") lines(xgrid, f, lty = 2) plot(dens, what = "density", data = x, breaks = 15)
x <- rbeta(200, 5, 1.5) xgrid <- seq(-0.1, 1.1, length=1000) f <- dbeta(xgrid, 5, 1.5) # true density dens <- densityMclustBounded(x, lbound = 0, ubound = 1) summary(dens, parameters = TRUE) plot(dens, what = "density") plot(dens, what = "density", data = x, breaks = 11)
x1 <- rchisq(200, 3) x2 <- 0.5*x1 + sqrt(1-0.5^2)*rchisq(200, 5) x <- cbind(x1, x2) dens <- densityMclustBounded(x, lbound = c(0,0)) summary(dens, parameters = TRUE) plot(dens, what = "BIC") plot(dens, what = "density") points(x, cex = 0.3) abline(h = 0, v = 0, lty = 3) plot(dens, what = "density", type = "hdr") abline(h = 0, v = 0, lty = 3) plot(dens, what = "density", type = "persp")
The data consist in the lengths of 86 spells of psychiatric treatment undergone by control patients in a suicide study (Silverman, 1986).
data("suicide") dens <- densityMclustBounded(suicide, lbound = 0) summary(dens, parameters = TRUE) plot(dens, what = "density", lwd = 2, col = "dodgerblue2", data = suicide, breaks = 15, xlab = "Length of psychiatric treatment") rug(suicide)
This dataset provides the proportion of white student enrollment in 56 school districts in Nassau County (Long Island, New York), for the 1992-1993 school year (Simonoff 1996, Sec. 3.2).
data("racial") x <- racial$PropWhite dens <- densityMclustBounded(x, lbound = 0, ubound = 1) summary(dens, parameters = TRUE) plot(dens, what = "density", lwd = 2, col = "dodgerblue2", data = x, breaks = 15, xlab = "Proportion of white student enrolled in schools") rug(x)
data(Baudry_etal_2010_JCGS_examples) GMM <- Mclust(ex4.1) plot(GMM, what = "classification") MEM <- MclustMEM(GMM) summary(MEM) plot(MEM) plot(MEM, addPoints = FALSE)
GMM <- Mclust(ex4.4.2) plot(GMM, what = "classification") MEM <- MclustMEM(GMM) summary(MEM) plot(MEM) plot(MEM, addDensity = FALSE)
Univariate Gaussian
EntropyGauss(1) # population entropy x = rnorm(1000) # generate sample EntropyGauss(var(x)) # sample entropy assuming Gaussian distribution mod = densityMclust(x, plot = FALSE) EntropyGMM(mod) # GMM-based entropy estimate plot(mod, what = "density", data = x, breaks = 31); rug(x)
Univariate Mixed-Gaussian\ Consider the mixed-Gaussian distribution $f(x) = 0.5 \times N(-2,1) + 0.5 \times N(2,1)$, whose entropy is 2.051939 in the population.
cl = rbinom(1000, size = 1, prob = 0.5) x = ifelse(cl == 1, rnorm(1000, 2, 1), rnorm(1000, -2, 1)) # generate sample mod = densityMclust(x, plot = FALSE) EntropyGMM(mod) # GMM-based entropy estimate plot(mod, what = "density", data = x, breaks = 31); rug(x)
Multivariate Chi-squared\ Consider a 10-dimensional independent $\chi^2$ distribution, whose entropy is 24.23095 in the population.
x = matrix(rchisq(1000*10, df = 5), nrow = 1000, ncol = 10) mod1 = densityMclust(x, plot = FALSE) EntropyGMM(mod1) # GMM-based entropy estimate, not too bad but... mod2 = densityMclustBounded(x, lbound = rep(0,10)) EntropyGMM(mod2) # much more accurate
data(faithful) mod = densityMclust(faithful, plot = FALSE) EntropyGMM(mod) # GMM-based entropy estimate # or provide the data and fit GMM implicitly EntropyGMM(faithful)
data(iris) mod = densityMclust(iris[,1:4], plot = FALSE) EntropyGMM(mod) # GMM-based entropy estimate
Scrucca L., Fop M., Murphy T. B. and Raftery A. E. (2016) mclust 5: clustering, classification and density estimation using Gaussian finite mixture models. The R Journal 8/1, pp. 289-317. https://doi.org/10.32614/RJ-2016-021
Scrucca L. (2019) A transformation-based approach to Gaussian mixture density estimation for bounded data, Biometrical Journal, 61:4, 873–888. https://doi.org/10.1002/bimj.201800174
Scrucca L. (2021) A fast and efficient Modal EM algorithm for Gaussian mixtures. Statistical Analysis and Data Mining, 14:4, 305–314. https://doi.org/10.1002/sam.11527
Robin S. and Scrucca L. (2023) Mixture-based estimation of entropy. Computational Statistics & Data Analysis, 177, 107582. https://doi.org/10.1016/j.csda.2022.107582
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.