Nothing
#' Predict from merMod objects with a prediction interval
#' @description This function provides a way to capture model uncertainty in
#' predictions from multi-level models fit with \code{lme4}. By drawing a sampling
#' distribution for the random and the fixed effects and then estimating the fitted
#' value across that distribution, it is possible to generate a prediction interval
#' for fitted values that includes all variation in the model except for variation
#' in the covariance parameters, theta. This is a much faster alternative than
#' bootstrapping for models fit to medium to large datasets.
#' @param merMod a merMod object from lme4
#' @param newdata a data.frame of new data to predict
#' @param which a character specifying what to return, by default it returns the
#' full interval, but you can also select to return only the fixed variation or
#' the random component variation. If full is selected the resulting data.frame
#' will be \code{nrow(newdata) * number of model levels} long
#' @param level the width of the prediction interval
#' @param n.sims number of simulation samples to construct
#' @param stat take the median or mean of simulated intervals
#' @param type type of prediction to develop
#' @param include.resid.var logical, include or exclude the residual variance for
#' linear models
#' @param returnSims logical, should all n.sims simulations be returned?
#' @param seed numeric, optional argument to set seed for simulations
#' @param fix.intercept.variance logical; should the variance of the intercept
#' term be adjusted downwards to roughly correct for its covariance with the
#' random effects, as if all the random effects are intercept effects?
#' @param ignore.fixed.terms a numeric or string vector of indexes or names of
#' fixed effects which should be considered as fully known (zero variance). This
#' can result in under-conservative intervals, but for models with random effects
#' nested inside fixed effects, holding the fixed effects constant intervals may
#' give intervals with closer to nominal coverage than the over-conservative
#' intervals without this option, which ignore negative correlation between the
#' outer (fixed) and inner (random) coefficients.
#' @param .parallel, logical should parallel computation be used, default is FALSE
#' @param .paropts, -NOT USED: Caused issue #54- a list of additional options passed into the foreach function
#' when parallel computation is enabled. This is important if (for example) your
#' code relies on external data or packages: use the .export and .packages arguments
#' to supply them so that all cluster nodes have the correct environment set up
#' for computing.
#' @return a data.frame with three columns:
#' \describe{
#' \item{\code{fit}}{The center of the distribution of predicted values as defined by
#' the \code{stat} parameter.}
#' \item{\code{lwr}}{The lower prediction interval bound corresponding to the quantile cut
#' defined in \code{level}.}
#' \item{\code{upr}}{The upper prediction interval bound corresponding to the quantile cut
#' defined in \code{level}.}
#' }
#' If returnSims = TRUE, then the individual simulations are attached to this
#' data.frame in the attribute \code{sim.results} and are stored as a matrix.
#' @details To generate a prediction interval, the function first computes a simulated
#' distribution of all of the parameters in the model. For the random, or grouping,
#' effects, this is done by sampling from a multivariate normal distribution which
#' is defined by the BLUP estimate provided by \code{ranef} and the associated
#' variance-covariance matrix for each observed level of each grouping terms. For
#' each grouping term, an array is build that has as many rows as there are levels
#' of the grouping factor, as many columns as there are predictors at that level
#' (e.g. an intercept and slope), and is stacked as high as there are number of
#' simulations. These arrays are then multiplied by the new data provided to the
#' function to produce a matrix of yhat values. The result is a matrix of the simulated
#' values of the linear predictor for each observation for each simulation. Each
#' grouping term has such a matrix for each observation. These values can be added
#' to get the estimate of the fitted value for the random effect terms, and this
#' can then be added to a matrix of simulated values for the fixed effect level to
#' come up with \code{n.sims} number of possible yhat values for each observation.
#'
#' The distribution of simulated values is cut according to the interval requested
#' by the function. The median or mean value as well as the upper and lower bounds
#' are then returned. These can be presented either on the linear predictor scale
#' or on the response scale using the link function in the \code{merMod}.
#' @note \code{merTools} includes the functions \code{subBoot} and \code{thetaExtract}
#' to allow the user to estimate the variability in \code{theta} from a larger
#' model by bootstrapping the model fit on a subset, to allow faster estimation.
#' @export
#' @import lme4
#' @importFrom abind abind
#' @importFrom mvtnorm rmvnorm
#' @importFrom foreach %dopar%
#' @importFrom foreach foreach
#' @examples
#' \donttest{
#' m1 <- lmer(Reaction ~ Days + (1 | Subject), sleepstudy)
#' regFit <- predict(m1, newdata = sleepstudy[11, ]) # a single value is returned
#' intFit <- predictInterval(m1, newdata = sleepstudy[11, ]) # bounded values
#' # Can do glmer
#' d1 <- cbpp
#' d1$y <- d1$incidence / d1$size
#' gm2 <- glmer(y ~ period + (1 | herd), family = binomial, data = d1,
#' nAGQ = 9, weights = d1$size)
#' regFit <- predict(gm2, newdata = d1[1:10, ])
#' # get probabilities
#' regFit <- predict(gm2, newdata = d1[1:10, ], type = "response")
#' intFit <- predictInterval(gm2, newdata = d1[1:10, ], type = "probability")
#' intFit <- predictInterval(gm2, newdata = d1[1:10, ], type = "linear.prediction")
#' }
predictInterval <- function(merMod, newdata, which=c("full", "fixed", "random", "all"),
level = 0.8,
n.sims = 1000, stat=c("median","mean"),
type=c("linear.prediction", "probability"),
include.resid.var=TRUE, returnSims = FALSE,
seed=NULL, .parallel = FALSE, .paropts = NULL,
fix.intercept.variance = FALSE, #This does NOT work with random slope models
ignore.fixed.terms = NULL)
{
if(missing(newdata)){
newdata <- merMod@frame
}
if(any(c("data.frame") != class(newdata))){
if(any(c("tbl_df", "tbl") %in% class(newdata))){
newdata <- as.data.frame(newdata)
warning("newdata is tbl_df or tbl object from dplyr package and has been
coerced to a data.frame")
} else{
newdata <- as.data.frame(newdata)
}
}
predict.type <- match.arg(type,
c("linear.prediction", "probability"),
several.ok = FALSE)
stat.type <- match.arg(stat,
c("median","mean"),
several.ok = FALSE)
which.eff <- match.arg(which,
c("full", "fixed", "random", "all"),
several.ok = FALSE)
if (!is.null(seed))
set.seed(seed)
else if (!exists(".Random.seed", envir = .GlobalEnv))
runif(1)
##First: check if it is a GLMM or NLMM and draw from sigma distribution or incorporate scale parameter if GLMM
merMod.devcomp <- getME(merMod, "devcomp")
if (merMod.devcomp$dims[["GLMM"]] == 0 &
merMod.devcomp$dims[["NLMM"]] == 0) {
sigmahat <- sqrt(1/rgamma(n.sims, 0.5 * residDF.merMod(merMod),
0.5 * merMod.devcomp$cmp[["pwrss"]]))
if (predict.type=="probability") {
predict.type="linear.prediction"
warning(" Asking for predictions on the probability scale makes no sense, resetting predict.type to linear.prediction",
call.=FALSE)
}
}
else if (merMod.devcomp$dims[["GLMM"]] == TRUE &
merMod@resp$family$family == "binomial" &
merMod@resp$family$link %in% c("logit", "probit")) {
sigmahat <- rep(1,n.sims)
}
else {
warning(" Prediction for NLMMs or GLMMs that are not mixed binomial regressions is not tested. Sigma set at 1.")
sigmahat <- rep(1,n.sims)
}
#TODO: Making a sparse model matrix fails with nested specification, e.g. (1|order/vore)
# sleep_total ~ bodywt + (1 | vore/order)
newdata.modelMatrix <- buildModelMatrix(model= merMod, newdata = newdata)
# When there is no fixed effect intercept but there is a group level intercept
# We need to do something!
rr <- ranef(merMod, condVar = TRUE)
re.xb <- vector(getME(merMod, "n_rfacs"), mode = "list")
names(re.xb) <- names(ngrps(merMod))
for (j in names(re.xb)) {
reMeans <- as.matrix(rr[[j]])
reMatrix <- attr(rr[[j]], which = "postVar")
# OK, let's knock out all the random effects we don't need
if (j %in% names(newdata)){ # get around if names do not line up because of nesting
obslvl <- unique(as.character(newdata[, j]))
alllvl <- rownames(reMeans)
keep <- intersect(obslvl, alllvl)
} else {
obslvl <- colnames(newdata.modelMatrix)
alllvl <- rownames(reMeans)
keep <- intersect(obslvl, alllvl)
}
# Add switch if no random groups are observed to avoid indexing errors,
# we burn 1 sample of 1 group of all coefficients that will eventually
# be multiplied by zero later on
if (length(keep) > 0 & !identical(keep, alllvl)) {
reMeans <- reMeans[keep, , drop=FALSE]
dimnames(reMatrix)[[3]] <- alllvl
reMatrix <- reMatrix[, , keep, drop = FALSE]
} else if (length(keep) > 0 & identical(keep, alllvl)){
dimnames(reMatrix)[[3]] <- alllvl
# dimnames(reMeans)[[2]] <- j # we need to get the variable name into this ojbect
reMatrix <- reMatrix[, , keep, drop = FALSE]
} else{
reMeans <- reMeans[1, , drop=FALSE]
reMatrix <- reMatrix[, , 1, drop = FALSE]
}
tmpList <- vector(length = nrow(reMeans), mode = "list")
for (k in 1:nrow(reMeans)){
meanTmp <- reMeans[k, ]
names(meanTmp) <- NULL
matrixTmp <- as.matrix(reMatrix[, , k])
tmpList[[k]] <- as.matrix(mvtnorm::rmvnorm(n= n.sims,
mean=meanTmp,
sigma=matrixTmp, method = "chol"))
}
REcoefs <- sapply(tmpList, identity, simplify="array")
# rm(tmpList)
dimnames(REcoefs) <- list(1:n.sims,
attr(reMeans, "dimnames")[[2]],
attr(reMeans, "dimnames")[[1]]
)
if (j %in% names(newdata)) { # get around if names do not line up because of nesting
newdata.modelMatrix <- as.matrix(newdata.modelMatrix) ## give up, sparse to dense now
tmp <- cbind(as.data.frame(newdata.modelMatrix), var = newdata[, j])
tmp <- tmp[, !duplicated(colnames(tmp))]
keep <- names(tmp)[names(tmp) %in% colnames(REcoefs)]
if (length(keep) == 0) {
keep <- grep(dimnames(REcoefs)[[2]], names(tmp), value = TRUE)
}
if (length(keep) == 0) {
tmp <- cbind(model.frame(subbars(formula(merMod)), data = newdata),
var = newdata[, j])
keep <- grep(dimnames(REcoefs)[[2]], names(tmp), value = TRUE)
}
if ( length(keep) == 0) {
# Add in an intercept for RE purposes
tmp <- cbind(as.data.frame(newdata.modelMatrix), var = newdata[, j])
tmp <- tmp[, !duplicated(colnames(tmp))]
tmp <- cbind(data.frame(1), tmp)
names(tmp)[1] <- "(Intercept)"
keep <- "(Intercept)"
}
tmp <- tmp[, c(keep, "var"), drop = FALSE]
tmp[, "var"] <- as.character(tmp[, "var"])
colnames(tmp)[which(names(tmp) == "var")] <- names(newdata[, j, drop = FALSE])
if (all(grepl(":", keep))) {
# Strip out the interaction after
keep <- unique(gsub("(.*):.*", "\\1", keep))
}
} else {
# If newdata.modelMatrix is still sparse at this point, we need to convert it safely
if(is(newdata.modelMatrix, "dgCMatrix")){
newdata.modelMatrix <- as.matrix(newdata.modelMatrix) ## give up, sparse to dense now
tmp <- as.data.frame(newdata.modelMatrix)
} else {
tmp <- as.data.frame(newdata.modelMatrix)
}
tmp <- tmp[, !duplicated(colnames(tmp))] # deduplicate columns because
# column names can be duplicated to account for multiple effects
# but we've already reconciled all the effects
tmp$var <- names(tmp[keep])[max.col(tmp[keep])] #changed alllvl to keep in
#this line re: issue #53 where newdata doesn't have all levels of rfx in
#nested specification (with ":") so this just takes the subset of alllvl
#that are specified in model
keep <- names(tmp)[names(tmp) %in% dimnames(REcoefs)[[2]]]
tmp <- tmp[, c(keep, "var"), drop = FALSE]
tmp[, "var"] <- as.character(tmp[, "var"])
colnames(tmp)[which(names(tmp) == "var")] <- j
}
#######################
################
tmp.pred <- function(data, coefs, group){
new.levels <- unique(as.character(data[, group])[!as.character(data[, group]) %in% dimnames(coefs)[[3]]])
msg <- paste(" The following levels of ", group, " from newdata \n -- ", paste0(new.levels, collapse=", "),
" -- are not in the model data. \n Currently, predictions for these values are based only on the \n fixed coefficients and the observation-level error.", sep="")
if(length(new.levels > 0)){
warning(msg, call.=FALSE)
}
yhatTmp <- array(data = NA, dim = c(nrow(data), dim(coefs)[1]))
colIdx <- ncol(data) - 1
colLL <- length(1:colIdx)
if(colLL > dim(coefs)[2]) {
# copy over
coefs_new <- array(NA, dim = c(dim(coefs)[1], colLL,
dim(coefs)[3]))
dimnames(coefs_new)[c(1, 3)] <- dimnames(coefs)[c(1, 3)]
dimnames(coefs_new)[[2]] <- rep(dimnames(coefs)[[2]], dim(coefs_new)[2])
for (k in 1:colLL) {
coefs_new[, k, 1:dim(coefs)[3]] <- coefs[, 1, 1:dim(coefs)[3]]
}
coefs <- coefs_new
}
for(i in 1:nrow(data)){
lvl <- as.character(data[, group][i])
if(!lvl %in% new.levels){
yhatTmp[i, ] <- as.numeric(data[i, 1:colIdx]) %*% t(coefs[, 1:colIdx, lvl])
} else{
# 0 out the RE for these new levels
yhatTmp[i, ] <- rep(0, colIdx) %*% t(coefs[, 1:colIdx, 1])
}
}
rownames(yhatTmp) <- rownames(data)
rm(data)
return(yhatTmp)
}
#########################
####
if(nrow(tmp) > 1000 | .parallel) {
if (requireNamespace("foreach", quietly=TRUE)) {
if(.parallel){
setup_parallel()
}
tmp2 <- split(tmp, (1:nrow(tmp) %/% 500)) #TODO: Find optimum splitting factor
tmp2 <- tmp2[lapply(tmp2,length) > 0]
fe_call <- as.call(c(list(quote(foreach::foreach), i = seq_along(tmp2),
.combine = 'rbind')))
fe <- eval(fe_call)
re.xb[[j]] <- foreach::`%dopar%`(fe, tmp.pred(data = tmp2[[i]],
coefs = REcoefs[, keep, , drop = FALSE],
group = j))
rm(tmp2)
} else {
warning("foreach package is unavailable, parallel computing not available")
re.xb[[j]] <- tmp.pred(data = tmp, coefs = REcoefs[, keep, , drop = FALSE],
group = j)
}
} else{
re.xb[[j]] <- tmp.pred(data = tmp, coefs = REcoefs[, keep, , drop = FALSE],
group = j)
}
rm(tmp)
}
rm(REcoefs)
# TODO: Add a check for new.levels that is outside of the above loop
# for now, ignore this check
if (include.resid.var==FALSE) {
# if (length(new.levels)==0)
sigmahat <- rep(1, n.sims)
# else {
# include.resid.var=TRUE
# warning(" \n Since new levels were detected resetting include.resid.var to TRUE.")
# }
}
# fixed.xb is nrow(newdata) x n.sims
##Calculate yhat as sum of the components (fixed plus all groupling factors)
fe.tmp <- fixef(merMod)
vcov.tmp <- as.matrix(vcov(merMod))
# Detect if an intercept is present
# TODO - is this reliable
if (is.na(names(attr(VarCorr(merMod)[[j]],"stddev")["(Intercept)"]))) {
fix.intercept.variance <- FALSE
message("No intercept detected, setting fix.intercept.variance to FALSE")
}
# If intercept is not in fixed terms
if (!"(Intercept)" %in% names(fixef(merMod)) && fix.intercept.variance) {
# TODO - decide if this is an error or if we should allow it to continue with warning
warning("No fixed-effect intercept detected. Variance adjustment may be unreliable.")
}
if (fix.intercept.variance) {
#Assuming all random effects include intercepts.
intercept.variance <- vcov.tmp[1,1]
groupsizes <- ngrps(merMod)
for(j in names(groupsizes)){ #for every group of random e
groupExtraPrecision <- 0
groupVar <- (attr(VarCorr(merMod)[[j]],"stddev")["(Intercept)"])^2
reMatrix <- attr(rr[[j]], which = "postVar")
for (eff in 1:dim(reMatrix)[3]) {
term <- 1/(reMatrix[1,1,eff] + groupVar)
if (term > 0) {
groupExtraPrecision <- groupExtraPrecision + term
} else {
warning("fix.intercept.variance got negative precision; better turn it off.")
}
}
intercept.variance <- intercept.variance - 1/groupExtraPrecision
}
if (intercept.variance < 0) {
warning("fix.intercept.variance got negative variance; better turn it off.")
}
ratio <- intercept.variance/vcov.tmp[1,1]
prec.tmp <- solve(vcov.tmp)
prec.tmp[1,1] <- prec.tmp[1,1] / ratio
vcov.tmp[1,] <- vcov.tmp[1,] * ratio
vcov.tmp <- solve(prec.tmp, tol=1e-50)
}
if (!is.null(ignore.fixed.terms)) {
prec.tmp <- solve(vcov.tmp)
for (term in ignore.fixed.terms) {
prec.tmp[term,term] <- prec.tmp[term,term] * 1e15
}
vcov.tmp <- solve(prec.tmp, tol=1e-50)
}
if(n.sims > 2000 | .parallel){
if(.parallel){
setup_parallel()
}
i <- 1:n.sims
fe_call <- as.call(c(list(quote(foreach::foreach), i = i,
.combine = 'rbind')))
fe <- eval(fe_call)
betaSim <- foreach::`%dopar%`(fe, mvtnorm::rmvnorm(n = 1, mean = fe.tmp,
sigma = vcov.tmp,
method = "chol"))
} else {
betaSim <- abind::abind(lapply(1:n.sims,
function(x) mvtnorm::rmvnorm(n = 1, mean = fe.tmp,
sigma = vcov.tmp,
method = "chol")), along=1)
}
# Pad betaSim
colnames(betaSim) <- names(fe.tmp)
rownames(betaSim) <- 1:n.sims
newdata.modelMatrix <- buildModelMatrix(merMod, newdata = newdata, which = "fixed")
if (ncol(newdata.modelMatrix) > ncol(betaSim)) {
pad <- matrix(rep(0), nrow = nrow(betaSim),
ncol = ncol(newdata.modelMatrix) - ncol(betaSim))
if(ncol(pad) > 0){
message("Fixed effect matrix has been padded with 0 coefficients
for random slopes not included in the fixed effects and interaction terms.")
}
colnames(pad) <- setdiff(colnames(newdata.modelMatrix), colnames(betaSim))
betaSim <- cbind(betaSim, pad)
keep <- intersect(colnames(newdata.modelMatrix), colnames(betaSim))
newdata.modelMatrix <- newdata.modelMatrix[, keep]
betaSim <- betaSim[, keep]
}
re.xb$fixed <- newdata.modelMatrix %*% t(betaSim)
######
if(which.eff == "full"){
yhat <- Reduce('+', re.xb)
} else if(which.eff == "fixed"){
yhat <- Reduce('+', re.xb["fixed"])
} else if(which.eff == "random"){
re.xb["fixed"] <- NULL
yhat <- Reduce('+', re.xb)
} else if(which.eff == "all"){
yhat <- Reduce('+', re.xb)
N <- nrow(newdata)
if (include.resid.var==TRUE){
for(i in 1:length(re.xb)){
re.xb[[i]] <- abind::abind(lapply(1:n.sims, function(x) rnorm(N, re.xb[[i]][, x], sigmahat[x])), along=2)
}
}
pi.comps <- re.xb
}
rm(re.xb)
N <- nrow(newdata)
outs <- data.frame("fit" = rep(NA, N),
"upr" = rep(NA, N),
"lwr" = rep(NA, N))
upCI <- 1 - ((1-level)/2)
loCI <- ((1-level)/2)
if (include.resid.var==TRUE){
yhat <- abind::abind(lapply(1:n.sims,
function(x) rnorm(N, yhat[,x], sigmahat[x])),
along = 2)
}
# Output prediction intervals
if (stat.type == "median") {
outs[, 1:3] <- t(apply(yhat, 1, quantile, prob = c(0.5, upCI, loCI),
na.rm=TRUE))
}
if (stat.type == "mean") {
outs$fit <- apply(yhat, 1, mean, na.rm=TRUE)
outs[, 2:3] <- t(apply(yhat, 1, quantile, prob = c(upCI, loCI),
na.rm=TRUE))
}
if (predict.type == "probability") {
if(nrow(outs) == 1) {
outs <- t(apply(outs, 2, merMod@resp$family$linkinv))
} else {
outs <- apply(outs, 2, merMod@resp$family$linkinv)
}
}
##############################
# Construct observation predictors for each component of the model
##########################
if(which.eff == "all"){
if(returnSims == TRUE){
allSims <- pi.comps
}
for(i in 1:length(pi.comps)){
if( stat.type == "median"){
pi.comps[[i]] <- t(apply(pi.comps[[i]], 1, quantile,
prob = c(0.5, upCI, loCI), na.rm=TRUE))
pi.comps[[i]] <- as.data.frame(pi.comps[[i]])
names(pi.comps[[i]]) <- c("fit", "upr", "lwr")
}
if(stat.type == "mean"){
tmp <- pi.comps[[i]]
pi.comps[[i]] <- data.frame("fit" = rep(NA, N), "upr" =NA,
"lwr" = NA)
pi.comps[[i]]$fit <- apply(tmp, 1, mean, na.rm=TRUE)
pi.comps[[i]][, 2:3] <- t(apply(tmp, 1, quantile, prob = c(upCI, loCI), na.rm=TRUE))
}
if (predict.type == "probability") {
pi.comps[[i]] <- apply(pi.comps[[i]], 2, merMod@resp$family$linkinv)
pi.comps[[i]] <- as.data.frame(pi.comps[[i]])
names(pi.comps[[i]]) <- c("fit", "upr", "lwr")
}
}
componentOut <- dplyr::bind_rows(pi.comps, .id="effect")
outs <- cbind(data.frame("effect" = "combined"), outs)
outs <- suppressWarnings(bind_rows(outs, componentOut))
outs$obs <- rep(1:N, nrow(outs) %/% N)
rm(pi.comps)
}
#Close it out
if(returnSims == FALSE){
return(as.data.frame(outs))
} else if(returnSims == TRUE){
outs <- as.data.frame(outs)
if(which.eff == "all"){
attr(outs, "sim.results") <- allSims
} else{
attr(outs, "sim.results") <- yhat
}
return(outs)
}
}
## TODO: Finish exporting so that all returns the individual predictions for
# each random effect separately
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.