## tools for Gaussian mixture models
#' @export
fitmix.simulate <- function(n, p, mu, sigma) {
## simulate n observations from length(p) component mixture with proportions p, means mu and variances sigma^2
n <- as.integer(n)
stopifnot(n > 0)
stopifnot(length(mu) == length(p))
if (length(sigma) == 1) sigma <- rep(sigma, length(p))
stopifnot(length(sigma) == length(p))
m <- sample(1:length(p), n, replace = TRUE, prob = p) # component membership indices
return(x = mu[m] + rnorm(n)*sigma[m])
}
#' @export
fitmix.r2 <- function(p, mu, sigma) {
## true r2 explained by components in normal mixture, with equal variances for all components
stopifnot(length(p) == length(mu))
stopifnot(length(sigma) == 1)
ve <- sum(p*mu^2)/sum(p) - (sum(p*mu)/sum(p))^2 # Variance explained
return(r2 = ve/(ve + sigma^2))
}
#' @export
fitmix.plot <- function(x, p, mu, sigma) {
## make nice plot using kernel estimate for observations, plus contribution of mixture
## components to mixture density
stopifnot(length(mu) == length(p))
if (length(sigma) == 1) sigma <- rep(sigma, length(p))
stopifnot(length(sigma) == length(p))
k <- length(p)
xd <- density(x)
np <- length(xd$x)
plot(xd$x, xd$y, type = "n",
ylim = c(0, max(xd$y*1.5)),
xlab = "x", ylab = "Density", yaxt = "n")
rug(x)
polygon(c(xd$x[1], xd$x, xd$x[np], xd$x[1]),
c(0, xd$y, 0, 0), col = "grey", lty = 0)
for (i in 1:k) lines(xd$x, p[i]*dnorm(xd$x, mu[i], sigma[i]), col = rainbow(k)[i], lwd = 3)
lines(xd$x, rowSums(sapply(1:k, function(i) p[i]*dnorm(xd$x, mu[i], sigma[i]))), lwd = 3)
legend("topleft", col = c("grey", rainbow(k), "black"), lty = c(0, rep(1, k+1)), lwd = 3,
pch = c(15, rep(NA, k+1)), pt.cex = 2,
legend = c("observed", paste("mixture component", 1:k), "mixture total"), bty = "n")
return(invisible(NULL))
}
fitmix1 <- function(x, k, tol = 1e-6, maxit = 100,
p = NULL, mu = NULL, sigma = NULL,
p.binomial = FALSE, mu.additive = FALSE, sigma.common = FALSE) {
## fit mixture of k normal distributions to x using EM
## some special options (restricted M steps) for quantitative genetics applications
x <- as.vector(x)
x <- x[!is.na(x)] # hard drop NAs
stopifnot(length(x) > 1)
k <- as.integer(k); stopifnot(k > 0)
tol <- as.double(tol); stopifnot(tol > 0)
maxit <- as.integer(maxit); stopifnot(maxit > 0)
if (is.null(p)) p <- rep(1/k, k) # reasonable initial values if not specified
p <- as.double(p)
stopifnot(length(p) == k); stopifnot(all(p >= 0)); stopifnot(sum(p) > 0)
p <- p/sum(p) # normalise if necessary
if (is.null(mu)) mu <- quantile(x, seq(from = 1/(k+1), to = 1-1/(k+1), length.out = k)) # reasonable initial values if not specified
mu <- as.double(mu)
stopifnot(length(mu) == k)
if (is.null(sigma)) sigma <- rep(sd(x)/3, k) # reasonable initial values if not specified
sigma <- as.double(sigma)
stopifnot(length(sigma) == k); stopifnot(all(sigma > 0))
p.binomial <- as.logical(p.binomial)
mu.additive <- as.logical(mu.additive)
sigma.common <- as.logical(sigma.common)
if (mu.additive & !sigma.common) stop("mu.additive == TRUE requires sigma.common == TRUE")
## store progress of loglik by iteration, relative to llnull == loglik for k=1 model
loglik <- rep(NA, maxit)
llnull <- -length(x)/2 * (log(2*pi) + log(mean(x^2)-mean(x)^2) + 1)
for (ii in 1:maxit) {
lij <- sapply(1:k, function(i) return(p[i] * dnorm(x, mu[i], sigma[i])))
## likelihood for i-th observation in j-th component
loglik[ii] <- sum(log(apply(lij, 1, sum))) - llnull
if (ii > 1 && loglik[ii] - loglik[ii-1] < tol) break
pji <- apply(lij, 1, function(pp) return(pp/sum(pp))) # note transposed wrt lij
## probability of i-th observation being in j-th component, conditional on p, mu, sigma
p <- rowSums(pji)/sum(pji) # should have sum(pji)==length(x)
if (p.binomial) {
pp <- sum(p*(0:(k-1)))/(k-1)
p <- choose(k-1,0:(k-1))*pp^(0:(k-1))*(1 - pp)^(k-1:k) # k-1:k == k-(1:k)
}
if (!sigma.common) {
# to ensure correctness, ignore mu.additive argument
mu <- sapply(1:k, function(j) return(sum(pji[j, ]*x)/sum(pji[j, ]))) # == weighted.mean(x, pji[j, ])
sigma <- sqrt(sapply(1:k, function(j) sum(pji[j, ]*(x - mu[j])^2)/sum(pji[j, ]))) # estimate for each component
} else {
if (!mu.additive) {
mu <- sapply(1:k, function(j) return(sum(pji[j, ]*x)/sum(pji[j, ]))) # == weighted.mean(x, pji[j, ])
} else {
ptmp1 <- rowSums(pji)
ptmp2 <- sapply(1:k, function(j) return(sum(pji[j, ]*x)))
stmp <- solve(matrix(c(sum(ptmp1), sum(ptmp1*(0:(k-1))), sum(ptmp1*(0:(k-1))), sum(ptmp1*(0:(k-1))^2)), ncol = 2),
matrix(c(sum(ptmp2), sum(ptmp2*(0:(k-1)))), ncol = 1))
mu <- stmp[1] + (0:(k-1))*stmp[2]
}
sigma <- rep(sqrt(sum(sapply(1:k, function(j) sum(pji[j, ]*(x - mu[j])^2)))/sum(pji)), k) # common estimate
}
}
return(list(p = p, mu = mu, sigma = sigma, loglik = loglik[!is.na(loglik)]))
}
#' @export
fitmix <- function(x, k, tol = 1e-6, maxit = 100, restarts = 20,
p.binomial = FALSE, mu.additive = FALSE, sigma.common = FALSE) {
## wrapper for fitmix1, calling once with default initial values
## then (restarts) times with random initial values
x <- as.vector(x)
x <- x[!is.na(x)] # hard drop NAs
stopifnot(length(x) > 1)
k <- as.integer(k); stopifnot(k > 0)
tol <- as.double(tol); stopifnot(tol > 0)
maxit <- as.integer(maxit); stopifnot(maxit > 0)
restarts <- as.integer(restarts); stopifnot(restarts >= 0)
p.binomial <- as.logical(p.binomial)
mu.additive <- as.logical(mu.additive)
sigma.common <- as.logical(sigma.common)
if (mu.additive & !sigma.common) stop("mu.additive == TRUE requires sigma.common == TRUE")
afit <- fitmix1(x, k, tol = tol, maxit = maxit,
p.binomial = p.binomial, mu.additive = mu.additive, sigma.common = sigma.common)
p.best <- afit$p
mu.best <- afit$mu
sigma.best <- afit$sigma
loglik <- c(max(afit$loglik), rep(NA, restarts))
if (restarts >= 1) {
for (run in 1:restarts) {
ptmp <- rexp(k); ptmp <- ptmp/sum(ptmp) ## Dirichlet(1,1,...,1)
afit <- fitmix1(x, k, tol = tol, maxit = maxit,
p = ptmp, mu = sample(x, 3), sigma = rep(sd(x)/3, 3),
p.binomial = p.binomial, mu.additive = mu.additive, sigma.common = sigma.common)
if (max(afit$loglik) > max(loglik, na.rm = TRUE)) {
p.best <- afit$p
mu.best <- afit$mu
sigma.best <- afit$sigma
}
loglik[run + 1] <- max(afit$loglik)
}
}
return(list(p = p.best, mu = mu.best, sigma = sigma.best, loglik = loglik))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.