#' Estimate a mixture model using an EM algorithm
#'
#' @export
#'
#' @param formula a formula expression as for regression models, of pooled form \code{cbind(y1, y2, y3) ~ .}
#' @param data a \code{data.frame} in which to interpret the variables occurring in \code{formula}
#' @param penalization a scalar between 0 and 1 to penalize observations whose type is unobserved.
#' 1 discards those observations, 0 gives them weight equal to observations whose type is observed.
#' @param munCol column in both \code{data} and \code{dataMun}
#' containing the municipality id, syntax as in \code{dplyr}
#' @param zCol column in both \code{data} and \code{dataMun}
#' containing the (possibly missing) municipality type, syntax as in \code{dplyr}
#' @param yearCol column in \code{data} containing the year, syntax as in \code{dplyr}
#' @param stateCol column in \code{data} containing the state, syntax as in \code{dplyr}
#' @param tol threshold used to determined convergence
#' @param mixTrace report each \code{mixTrace} iteration Leave \code{NULL} for no reporting.
#' @param mixMaxIt maximum number of iterations
#' @param pool should \code{data} be pooled automatically?
#' @param k rounding parameter for numerical stability. Rounds to the order of \code{log(-k)}
#' @param nSamples number of bootstrap replicates. If \code{NULL}, standard errors are not estimated.
#' @param parallel should the bootstrap be parallelized? if \code{TRUE}, the \code{cl} argument shoud not be \code{NULL}
#' @param seed optional seed
#' @param cl cluster used for parallelization
#' @param ... additional parameters passed on to \code{\link[nnet]{multinom}}
#' @details Make sure to check the \link[=EMmethods]{methods} man page
#' to see related methods, as well as the \link{posteriorprobs} man page
#' for how to deal with predicted probabilities.
#' @export
EM <- function(formula, data, penalization = 0,
munCol = j, zCol = Z, yearCol = year, stateCol = state,
tol = 1e-3, mixTrace = 1, mixMaxit = 1000,
pool = T, k = 10, nSamples = NULL, parallel = F, seed = NULL, cl = NULL,
...) {
munCol <- rlang::enquo(munCol)
zCol <- rlang::enquo(zCol)
yearCol <- rlang::enquo(yearCol)
stateCol <- rlang::enquo(stateCol)
data <- dplyr::rename(data, j = !!munCol) %>%
dplyr::arrange(j) %>%
dplyr::mutate(obs = as.integer(!is.na(!!zCol)))
if(is.null(seed)) {
seed <- sample.int(1e6, 1)
}
set.seed(seed)
coefNames <- colnames(model.matrix(delete.response(terms(formula)), data))
coefNames <- c(paste0("beta1-", coefNames), paste0("beta2-", coefNames))
out <- EMfit(formula, data,
tol = tol, mixTrace = mixTrace, mixMaxit = mixMaxit,
seed = seed, pool = pool, penalization = penalization,
zCol = zCol, coefNames = coefNames, k = k, ...)
replace0 <- list(0)
names(replace0) <- rlang::as_name(zCol)
nObs <- getNObs(formula, tidyr::replace_na(data, replace0), j, !!yearCol)
if(!is.null(nSamples)) {
if(is.null(seed)) seed <- sample.int(1e6, size = 1)
set.seed(seed)
seeds <- sample.int(1e6, nSamples, replace = F)
seeds2 <- sample.int(1e6, nSamples, replace = F)
bootSrc <- dplyr::select(data, mun = j, state = !!stateCol) %>% dplyr::distinct()
boots <- getBoots(bootSrc, seeds)
boots <- purrr::map2(boots, seeds2, ~ list(boot = .x, seed = .y))
bootFUN <- function(boot, formula, data,
tol, mixMaxit,
pool, zCol, k, penalization,
coefNames, munCol, ...) {
dataBoot <- unlist(sapply(boot$boot, function(b) which(data$j == b)))
data <- data[dataBoot,]
mod <- c(
list(...),
list(formula = formula, data = data, tol = tol, penalization = penalization,
mixTrace = NULL, mixMaxit = mixMaxit, seed = boot$seed, pool = pool,
zCol = zCol, coefNames = coefNames, k = k)
)
mod$trace <- F
mod <- tryCatch(do.call(EMfit, mod), error = function(e) e)
if(class(mod)[1] == "EMfit") {
list(coef = bootCoefAdjust(mod$coef, c("gamma", coefNames)),
convergence = mod$convergence,
trace = mod$trace,
error = ifelse(!is.null(mod$error), mod$error, NA))
} else {
coef <- rep(NA, length(coefNames)+1)
names(coef) <- c("gamma", coefNames)
list(coef = coef,
convergence = F,
trace = NULL,
error = paste(as.character(unlist(mod)), collapse = " "))
}
}
if(!parallel) {
boots <- lapply(boots, bootFUN, formula = formula, data = data, tol = tol,
mixMaxit = mixMaxit, pool = pool, munCol = munCol, penalization = penalization,
zCol = zCol, coefNames = coefNames, k = k, ... = ...
)
} else {
boots <- parallel::parLapplyLB(
cl = cl, boots, bootFUN,
formula = formula, data = data, tol = tol, penalization = penalization,
mixMaxit = mixMaxit, pool = pool, munCol = munCol,
zCol = zCol, coefNames = coefNames, k = k, ... = ...
)
}
bootCoefs <- do.call(rbind, purrr::map(boots, ~ .$coef))
colnames(bootCoefs) <- c("gamma", coefNames)
bootsDiag <- list(
convergence = tibble::tibble(
sample = 1:nSamples,
convergence = purrr::map_lgl(boots, ~ .$convergence),
error = purrr::map_chr(boots, ~ .$error)
),
traces = purrr::map_dfr(boots, ~ .$trace, .id = "sample") %>%
dplyr::mutate(sample = as.integer(sample))
)
vc <- cov(bootCoefs, use = "pairwise.complete.obs")
out$boots <- bootCoefs
out$vcov <- vc
out$bootsDiag <- bootsDiag
}
out <- c(out, list(
formula = formula,
nSpell = nObs$nSpell,
nObs = nObs$nObs,
nMun = nObs$nMun,
munCol = munCol,
yearCol = yearCol,
stateCol = stateCol,
zCol = zCol,
call = match.call(),
nSamples = nSamples,
df = length(out$coef),
penalization = penalization,
k = k
))
out$deviance <- 2*out$logLik
out$AIC <- 2*(out$df - out$logLik)
out$weights <- dplyr::rename(out$weights, !!munCol := j, !!zCol := Z)
class(out) <- "EM"
out
}
EMfit <- function(formula, data, seed, tol, mixTrace, mixMaxit, zCol, coefNames, k, pool, penalization, ...) {
# some data prep
set.seed(seed)
dataMun <- dplyr::select(data, j, !!zCol, obs) %>% dplyr::distinct()
keyJ <- tibble::tibble(oJ = dataMun$j, nJ = match(dataMun$j, dataMun$j))
data$j <- match(data$j, dataMun$j)
dataMun$j <- 1:nrow(dataMun)
replace0 <- list(0)
replace1 <- list(1)
names(replace0) <- names(replace1) <- rlang::as_name(zCol)
dataAug <- dplyr::bind_rows(
dplyr::mutate(data, trueZ = !!zCol, !!zCol := 0),
dplyr::mutate(data, trueZ = !!zCol, !!zCol := 1)
) %>% dplyr::arrange(Z, j)
Y <- model.response(model.frame(formula, data = dataAug))
X <- model.matrix(delete.response(terms(formula)), data = dataAug)
Z <- w <- dplyr::pull(dataMun, !!zCol)
# initialization
if(!is.null(mixTrace)) message("initial calibration")
calData <- dplyr::filter(data, obs == 1)
if(pool) calData <- poolDf(formula, calData)
params <- list(
gamma = mean(w, na.rm = T),
mod = nnet::multinom(formula, data = calData, ...)
)
initBeta <- linCoefs(params$mod)
initBeta <- bootCoefAdjust(initBeta, coefNames)
initBeta[is.na(initBeta)] <- 0
initBeta <- t(matrix(initBeta, ncol = 2))
params$beta <- initBeta
w[is.na(w)] <- mean(w, na.rm = T)
# EM
diff <- tol + 1
LLOld <- NULL
i <- 0
LLs <- c()
diffs <- c()
while(diff >= tol & i < mixMaxit) {
multLik <- getMultLik(params, dataAug, Y, X, k, zCol)
w <- getWeights(multLik, k)
params <- estimate(formula, w, Z, dataAug, params$beta, pool, zCol, penalization, ...)
if(is.null(LLOld)) {
LL <- logLik(multLik, k, zCol, penalization)
LLOld <- LL - (tol + 1)
} else {
LLOld <- LL
LL <- logLik(multLik, k, zCol, penalization)
}
diff <- LL - LLOld
LLs <- c(LLs, LL)
diffs <- c(diffs, diff)
i <- i + 1
if(!is.null(mixTrace)) {
if(i %% mixTrace == 0) message(sprintf("Iteration %s: Delta = %s", i, round(diff, 4)))
if(diff < 0) warning(sprintf("/!\\ Iteration %s: Delta = %s", i, round(diff, 4)))
}
}
coefs <- c(gamma = params$gamma, linCoefs(params$mod))
convergence <- TRUE
error <- NULL
if(diff < 0) {
convergence <- FALSE
error <- "decreasing LL"
}
if(diff > tol) {
convergence <- FALSE
error <- "reached max. number of iterations"
}
out <- list(
coef = coefs,
weights = tibble::tibble(j = keyJ$oJ, Z = Z, prob = w),
model = params$mod,
logLik = LL,
convergence = convergence,
error = error,
nIter = i,
seed = seed,
trace = tibble::tibble(
iteration = 1:i,
LL = LLs,
diff = diffs
)
)
class(out) <- "EMfit"
out
}
# translate coefs from nnet::multinom to Wts (param for starting weights in nnet::nnet)
coefToWts <- function(co) {
co <- cbind(0, t(co))
co <- rbind(0, co)
as.numeric(co)
}
# gets (log) density of a multinomial draw, i.e. f(x), from coefficients of multinomial -- saves log(exp(log(...)))
mdmultinom <- function(Y, X, beta, log = F, k) {
Xbeta <- X %*% t(beta)
icomp <- Y[,-1] * Xbeta - lgamma(Y[,-1]+1)
icomp <- rowSums(icomp)
N <- rowSums(Y)
gcomp <- lgamma(N+1) - N * apply(cbind(0, Xbeta), 1, LSE, k = k)
out <- icomp + gcomp
if(!log) out <- exp(out)
out
}
# log-sum-exp trick
# with smoothing as in http://u.cs.biu.ac.il/~shey/919-2011/index.files/underflow%20and%20smoothing%20in%20EM.pdf
LSE <- function(x, k) {
m <- max(x)
x <- x - m
x <- x[x > -k]
m + log(sum(exp(x)))
}
# function for M-step
estimate <- function(formula, w, Z, dfAug, start, pool, zCol, penalization, ...) {
wGamma <- ifelse(is.na(Z), 1 - penalization, 1)
gamma <- weighted.mean(ifelse(is.na(Z), w, Z), wGamma)
zTmp <- dplyr::pull(dfAug, !!zCol)
dfAug$wgt <- w[dfAug$j]
dfAug$wgt <- ifelse(zTmp == 1, dfAug$wgt, 1-dfAug$wgt)
dfAug$wgt <- ifelse(dfAug$obs == 1, as.integer(zTmp == dfAug$trueZ), dfAug$wgt)
dfAug$wgt <- ifelse(dfAug$obs == 1, dfAug$wgt, (1-penalization) * dfAug$wgt)
if(pool) {
dfEst <- poolDf(formula, dfAug, dfAug$wgt)
mod <- nnet::multinom(formula, data = dfEst, Wts = coefToWts(start), ...)
} else {
mod <- nnet::multinom(formula, data = dfAug, weights = wgt, Wts = coefToWts(start), ...)
}
list(beta = coef(mod), gamma = gamma, mod = mod)
}
# function for E-step
getWeights <- function(multLik, k) {
n <- nrow(multLik)
w <- matrix(multLik$pi + multLik$pr, ncol = 2)
# smoothing as in http://u.cs.biu.ac.il/~shey/919-2011/index.files/underflow%20and%20smoothing%20in%20EM.pdf
apply(w, 1, function(v) {
m <- max(v)
v <- v - m
if(v[2] < -k) {
return(0)
}
if(v[1] < -k) {
return(1)
}
v <- exp(v)
return(v[2] / sum(v))
})
}
# function for some intermediate object
getMultLik <- function(params, dfAug, Y, X, k, zCol) {
pr <- dplyr::mutate(dfAug, pr = mdmultinom(Y, X, coef(params$mod), log = T, k = k)) %>%
dplyr::group_by(j, !!zCol, trueZ, obs) %>%
dplyr::summarize(pr = sum(pr)) %>%
dplyr::ungroup() %>%
dplyr::mutate(
pi = params$gamma,
pi = ifelse(!!zCol == 1, pi, 1-pi)
) %>%
dplyr::select(j, Z = !!zCol, trueZ, obs, pr, pi) %>%
dplyr::arrange(Z, j)
}
# compute incomplete likelihood
logLik <- function(multLik, k, zCol, penalization) {
llk <- dplyr::filter(multLik, obs == 1) %>%
dplyr::filter(trueZ == Z)
llu <- dplyr::filter(multLik, obs == 0)
llu <- cbind(
llu$pi[llu$Z == 0] + llu$pr[llu$Z == 0],
llu$pi[llu$Z == 1] + llu$pr[llu$Z == 1]
)
ll <- sum(llk$pi + llk$pr) +
(1-penalization) * sum(apply(llu,1,LSE, k = k))
ll
}
#### methods ####
#' @exportClass EM
setClass("EM")
#' @export
#' @rdname EMmethods
extract.EM <- function(
model,
include.aic = TRUE, include.loglik = TRUE, include.nobs = TRUE, include.nind = T, include.nmun = T,
level = .95, use.ci = T
) {
cl <- list(
coef.names = names(coef(model)),
coef = coef(model),
se = sqrt(diag(vcov(model))),
pvalues = 2 * pnorm(abs(coef(model)), sd = sqrt(diag(vcov(model))), lower.tail = F),
gof.names = character(0),
gof = numeric(0),
gof.decimal = logical(0)
)
if(use.ci) {
cl$ci.low <- confint(model, level = level)[,1]
cl$ci.up <- confint(model, level = level)[,2]
}
if(include.loglik) {
cl$gof.names <- c(cl$gof.names, "Log Likelihood")
cl$gof <- c(cl$gof, as.numeric(model$logLik))
cl$gof.decimal <- c(cl$gof.decimal, T)
}
if(include.aic) {
cl$gof.names <- c(cl$gof.names, "AIC")
cl$gof <- c(cl$gof, as.numeric(model$AIC))
cl$gof.decimal <- c(cl$gof.decimal, T)
}
if(include.nobs) {
cl$gof.names <- c(cl$gof.names, "Num. obs.")
cl$gof <- c(cl$gof, as.numeric(model$nSpell))
cl$gof.decimal <- c(cl$gof.decimal, F)
}
if(include.nind) {
cl$gof.names <- c(cl$gof.names, "Num. ind.")
cl$gof <- c(cl$gof, as.numeric(model$nObs))
cl$gof.decimal <- c(cl$gof.decimal, F)
}
if(include.nmun) {
cl$gof.names <- c(cl$gof.names, "Num. mun.")
cl$gof <- c(cl$gof, as.numeric(model$nMun))
cl$gof.decimal <- c(cl$gof.decimal, F)
}
do.call(texreg::createTexreg, cl)
}
setMethod("extract", signature = "EM", definition = extract.EM)
#' Methods for \code{\link{EM}} objects
#' @param ... use to select variables a la \code{\link[dplyr]{select}}
#' @param level levels for confidence intervals
#' @param digits number of digits to be displayed
#' @param newdata a \code{data.frame} in which to look for variables with which to predict
#' @param type the type of prediction required. \code{"posterior"} returns the posterior probability of having Z = 1.
#' @param k rounding parameter for numerical stability. Rounds to the order of \code{log(-k)}
#' @param include.aic Should AIC be reported in \code{\link[texreg]{texreg}} output?
#' @param include.loglik Should log-likelihood be reported in \code{\link[texreg]{texreg}} output?
#' @param include.nobs Should number of observations be reported in \code{\link[texreg]{texreg}} output?
#' @param include.nind Should number of individuals be reported in \code{\link[texreg]{texreg}} output?
#' @param include.nmun Should number of municipalities be reported in \code{\link[texreg]{texreg}} output?
#' @param use.ci Should confidence intervals be reported in \code{\link[texreg]{texreg}} output?
#' @name EMmethods
NULL
#' @export
#' @rdname EMmethods
coef.EM <- function(object, ...) {
if(...length() == 0) {
object$coef
} else {
df <- tibble::as_tibble(t(as.matrix(object$coef)))
df <- dplyr::select(df, ...)
cnames <- colnames(df)
df <- as.numeric(t(as.matrix(df)))
names(df) <- cnames
df
}
}
#' @export
#' @rdname EMmethods
vcov.EM <- function(object) {
object$vcov
}
#' @export
#' @rdname EMmethods
summary.EM <- function(
object,
...,
digits = 1) {
out <- list(
coefficients = coef(object, ... = ...),
call = object$call,
nObs = object$nObs,
nMun = object$nMun,
nSpell = object$nSpell,
digits = digits,
k = length(coef(object)),
convergence = object$convergence,
error = object$error,
AIC = object$AIC,
deviance = object$deviance,
df = object$df,
boot = !is.null(object$nSamples),
nSamples = object$nSamples,
nIter = object$nIter
)
out$sd <- se(object, ... = ...)
if(!is.null(out$sd)) {
out$z <- out$coefficients / out$sd
out$pval <- 2 * pnorm(abs(out$coefficients), sd = out$sd, lower.tail = F)
} else {
out$sd <- out$pval <- out$z <- rep(NA, length(out$coefficients))
}
if(!out$convergence) {
if(rev(object$trace$diff)[1] < 0) out$diff <- rev(object$trace$diff)[1]
}
out$kselect <- length(out$coefficients)
if(out$boot) {
out$nConverge <- sum(object$bootsDiag$convergence$convergence)
if(out$nConverge != out$nSamples) {
diag <- table(object$bootsDiag$convergence$error[!object$bootsDiag$convergence$convergence])
out$diag <- matrix(diag, ncol = 1, dimnames = list(names(diag), "N"))
meanDiff <- object$bootsDiag$traces$diff
meanDiff <- meanDiff[meanDiff < 0]
if(length(meanDiff) > 0) out$meanDiff <- meanDiff
}
}
class(out) <- "summary.EM"
out
}
#' @export
print.summary.EM <- function(x) {
cat("Call:\n")
print(x$call)
cat("\nModel Info: \n")
sprintf(" %-13s %i spells, %i individuals, %i municipalities\n",
"observations:", x$nSpell, x$nObs, x$nMun) %>% cat()
sprintf(" iterations: %i\n", x$nIter) %>% cat()
if(x$boot) {
sprintf(" %-13s bootsrap (%i samples)\n",
" SE:", x$nSamples) %>% cat()
} else {
cat(" SE: none\n")
}
sprintf(" %-13s %i\n\n",
"predictors:", x$k) %>% cat()
if(x$k == x$kselect) {
cat("Estimates: \n")
} else {
sprintf("Estimates: (showing %i/%i predictors)\n", x$kselect, x$k) %>% cat()
}
comat <- cbind(x$coefficients, x$sd, x$z, x$pval)
colnames(comat) <- c("Estimate", "Std. Error", "z value", "Pr(>|z|)")
printCoefmat(comat, has.Pvalue = T, P.values = T, digits = x$digits)
cat("\nFit: \n")
sprintf(" Residual deviance: %.*f on %i degrees of freedom\n", x$digits, x$deviance, x$df) %>% cat()
sprintf(" AIC: %.*f\n", x$digits, x$AIC) %>% cat()
if(x$boot) {
cat("\n Diagnostics: \n")
if(!x$convergence) {
sprintf(" The model did not converge.\n Error: %s", x$error) %>% cat()
if(!is.null(x$diff)) sprintf(", Delta last iteration: %.*g", x$digits, x$diff) %>% cat()
cat("\n")
}
sprintf(" %i/%i bootsrap samples converged.\n", x$nConverge, x$nSamples) %>% cat()
if(!is.null(x$diag)) {
cat(" Summary of errors in bootstrap samples: \n")
print(x$diag)
}
if(!is.null(x$meanDiff)) {
cat(" Decreasing LL errors in bootsrap samples:\n Delta last iteration\n")
print(summary(x$meanDiff))
}
}
}
#' @export
#' @rdname EMmethods
predict.EM <- function(object, newdata, type = c("posterior"), k = object$k) {
type <- type[1]
if(type == "posterior") {
co <- coef(object)
gamma <- co[1]
betas <- t(matrix(co[-1], ncol = 2))
zCol <- object$zCol
munCol <- object$munCol
newdata <- dplyr::arrange(newdata, !!munCol) %>%
dplyr::rename(j = !!munCol)
muns <- dplyr::pull(newdata, j)
data0 <- dplyr::mutate(newdata, !!zCol := 0)
data1 <- dplyr::mutate(newdata, !!zCol := 1)
y <- model.response(model.frame(formula(object), data0))
x0 <- model.matrix(delete.response(terms(formula(object))), data0)
x1 <- model.matrix(delete.response(terms(formula(object))), data1)
ll0 <- log(1-gamma) + rowsum(mdmultinom(y, x0, betas, T, k), muns)
ll1 <- log(gamma) + rowsum(mdmultinom(y, x1, betas, T, k), muns)
w <- cbind(ll0, ll1)
w <- apply(w, 1, function(v) {
m <- max(v)
v <- v - m
if(v[2] < -k) {
return(0)
}
if(v[1] < -k) {
return(1)
}
v <- exp(v)
return(v[2] / sum(v))
})
names(w) <- unique(muns)
w
}
}
#' @rdname posteriorprobs
#' @export
posterior_probs.EM <- function(object) object$weights
#' @rdname posteriorprobs
#' @export
confusion_table.EM <- function(object, cutoff = .5) {
pr <- posterior_probs(object) %>% na.omit() %>%
dplyr::mutate(prob = as.integer(prob > cutoff))
table(predicted = pr$prob, true = dplyr::pull(pr, !!object$zCol))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.