#' Calculate the predicted value for each observation across the distribution
#' of the random effect terms.
#'
#' \code{REmargins} calculates the average predicted value for each row of a
#' new data frame across the distribution of \code{\link{expectedRank}} for a
#' merMod object. This allows the user to make meaningful comparisons about the
#' influence of random effect terms on the scale of the response variable,
#' for user-defined inputs, while accounting for the variability in grouping terms.
#'
#' The function simulates the predicted value of the response for each
#' observation in \code{newdata} across the distribution of the random
#' effect terms.
#'
#' The function predicts the response at every level in the random effect term
#' specified by the user. Then, the expected rank of each group level is binned
#' into the number of bins specified by the user. Next, a weighted mean of the
#' fitted values for all observations in each bin of the expected ranks is
#' calculated using the inverse of the variance as the weight -- so that less
#' precise estimates are downweighted in the calculation of the mean for the bin.
#' Finally, a standard error for the bin mean is calculated.
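#'
#' As a simplified sketch of the bin summary described above (not necessarily
#' the exact internal code), assume \code{fit} holds the simulated fits in one
#' bin and \code{se} their standard errors. The inverse-variance weighted mean
#' and its approximate standard error (Gatz and Smith, 1995; Cochran, 1977)
#' could be computed as:
#' \preformatted{
#' w     <- 1 / se^2                     # inverse-variance weights
#' w_bar <- mean(w)
#' x_w   <- sum(w * fit) / sum(w)        # weighted bin mean
#' n     <- length(fit)
#' se_w  <- sqrt(n / ((n - 1) * sum(w)^2) *
#'               (sum((w * fit - w_bar * x_w)^2) -
#'                2 * x_w * sum((w - w_bar) * (w * fit - w_bar * x_w)) +
#'                x_w^2 * sum((w - w_bar)^2)))
#' }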
#'
#' @param merMod An object of class merMod
#'
#' @param newdata a data frame of observations to calculate group-level differences
#' for
#'
#' @param groupFctr The name of the grouping factor over which the random
#' coefficient of interest varies. This is the variable to the right of the
#' pipe, \code{|}, in the [g]lmer formula. This parameter is optional; if it is
#' not specified, the calculation will be performed for the first effect listed
#' by \code{ranef}. If the length is > 1, then the combined effect of all
#' listed groups will be calculated and marginalized over co-occurrences of those
#' groups if desired.
#'
#' @param term The name of the random coefficient of interest. This is the
#' variable to the left of the pipe, \code{|}, in the [g]lmer formula. Partial
#' matching is attempted on the intercept term so the following character
#' strings will all return rankings based on the intercept (\emph{provided that
#' they do not match the name of another random coefficient for that factor}):
#' \code{c("(Intercept)", "Int", "intercep", ...)}.
#'
#' @param breaks an integer representing the number of bins to divide the group
#' effects into; the default is 4.
#' @param .parallel logical, should parallel computation be used? The default is FALSE.
#'
#' @param ... additional arguments to pass to \code{\link{predictInterval}}
#'
#' @return A data.frame with a row for each unique combination of case (row in
#' \code{newdata}), grouping factor, term, and break, containing:
#' \describe{
#' \item{...}{The columns of the original data taken from \code{newdata}}
#' \item{case}{The row number of the observation from newdata. Each row in newdata will be
#' repeated for all unique levels of the grouping_var, term, and breaks.}
#' \item{grouping_var}{The grouping variable the random effect is being marginalized over.}
#' \item{term}{The term for the grouping variable the random effect is being marginalized over.}
#' \item{breaks}{The ntile of the effect size for \code{grouping_var} and \code{term}}
#' \item{original_group_level}{The original grouping value for this \code{case}}
#' \item{fit_combined}{The predicted value from \code{predictInterval} for this case simulated
#' at the Nth ntile of the expected rank distribution of \code{grouping_var} and \code{term}}
#' \item{upr_combined}{The upper bound of the predicted value.}
#' \item{lwr_combined}{The lower bound of the predicted value.}
#' \item{fit_XX}{For each grouping factor in the model, the predicted value is decomposed
#' into its fit components via predictInterval, and these are all returned here}
#' \item{upr_XX}{The upper bound for the effect of each grouping term}
#' \item{lwr_XX}{The lower bound for the effect of each grouping term}
#' \item{fit_fixed}{The predicted fit with all the grouping terms set to 0 (average)}
#' \item{upr_fixed}{The upper bound fit with all the grouping terms set to 0 (average)}
#' \item{lwr_fixed}{The lower bound fit with all the grouping terms set to 0 (average)}
#' }
#'
#'
#' @references
#' Gatz, DF and Smith, L. The Standard Error of a Weighted Mean Concentration.
#' I. Bootstrapping vs other methods. \emph{Atmospheric Environment}.
#' 1995;29(11):1185-1193. Available at
#' \url{https://www.sciencedirect.com/science/article/pii/135223109400210C}
#'
#' Cochran, WG. 1977. Sampling Techniques (3rd Edition). Wiley, New York.
#'
#' @seealso \code{\link{expectedRank}}, \code{\link{predictInterval}}
#' @importFrom stats reshape
#' @examples
#' \donttest{
#' fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
#' mfx <- REmargins(merMod = fm1, newdata = sleepstudy[1:10,])
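#'
#' # One possible (hypothetical) way to summarise the result by case, e.g. the
#' # range of simulated fits across the expected-rank bins; fit_Subject is the
#' # group-level fit column produced for this model (see the fit_XX columns above)
#' tapply(mfx$fit_Subject, mfx$case, FUN = function(x) diff(range(x)))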
#'
#' # You can also pass additional arguments to predictInterval through REmargins
#' g1 <- lmer(y ~ lectage + studage + (1|d) + (1|s), data=InstEval)
#' margin_df <- REmargins(g1, newdata = InstEval[20:25, ], groupFctr = c("s"),
#' breaks = 4)
#' margin_df <- REmargins(g1, newdata = InstEval[20:25, ], groupFctr = c("d"),
#' breaks = 3)
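#'
#' # Additional arguments are passed through to predictInterval; for example,
#' # n.sims (a predictInterval argument) can be lowered to speed up the call
#' margin_df <- REmargins(g1, newdata = InstEval[20:25, ], groupFctr = c("d"),
#' breaks = 3, n.sims = 100)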
#' }
#' @export
REmargins <- function(merMod, newdata = NULL, groupFctr = NULL, term = NULL, breaks = 4,
                      .parallel = FALSE, ...){
  # Validate inputs
  if (is.null(groupFctr)) {
    # If the user doesn't tell us which term to use, we take the first term
    groupFctr <- names(ranef(merMod))[1]
  }
  if (is.null(newdata)) {
    # If the user doesn't give us data, we take the whole dataset
    # TODO - how does performance scale to a large number of observations?
    newdata <- merMod@frame
  }
  # If the user doesn't tell us what term to use, we take all the terms
  if (is.null(term)) {
    term <- names(ranef(merMod)[[groupFctr]])
    # Sub out intercept
    term[term == "(Intercept)"] <- "Intercept"
  }
  # This is a rough way to break the ER distribution into quantiles
  brks <- ceiling(seq(1, 100, by = 100/breaks))
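  # e.g. the default breaks = 4 gives candidate percentiles 1, 26, 51, 76
  # (a 99th percentile value is appended below)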
  # Fallback so we always take a 99th percentile effect (for the maximum)
  if (!99 %in% brks) {
    brks <- c(brks, 99)
  }
  # Inputs are validated - now we get the effect distribution
  # Generate the expected rank distribution
  ER_DF <- expectedRank(merMod, groupFctr = groupFctr, term = term)
  # With many effects there is a lot of duplication - drop duplicated pctER
  ER_DF <- ER_DF[!duplicated(ER_DF[, c("groupFctr", "term", "pctER")]), ]
  # Now we create a data frame to capture the factor levels of each groupFctr that
  # correspond to the right break in the expectedRank distribution of the random
  # effect grouping factor and term
  par_df <- expand.grid("grouping_var" = groupFctr, "term" = term, "breaks" = 1:breaks)
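  # par_df has one row per combination of grouping factor, term, and break;
  # the groupLevel column filled in below records which observed group level
  # sits closest to each break of the expected rank distribution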
  # Keep only factor levels that have effects at the margins
  # Need to match closest value here
  # Find N closest values
  # Drop duplicates
  # For each combination build an index of candidate rows/effect levels
  # Then choose the level that has the most precise estimate within a
  # tolerance of the effect size
  for (trm in term) {
    # Subset the expected rank distribution for this term so the index found
    # below lines up with the correct group levels
    er_sub <- ER_DF[ER_DF$term == trm, ]
    for (i in seq_along(brks)) {
      # Compute each level's distance from the break
      rank_dist <- abs(er_sub$pctER - brks[i])
      # Get the index for the rank that minimizes the distance, taking the
      # first match when there are ties
      tmp <- which.min(rank_dist)
      # Store the result in the par_df object
      par_df$groupLevel[par_df$breaks == i & par_df$term == trm] <- er_sub$groupLevel[tmp]
    }
  }
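  # Note: brks can contain one more value (the appended 99) than par_df has
  # break rows, in which case the extra iteration above is a harmless no-op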
  # Get ready to expand the data
  sim_data <- as.data.frame(lapply(newdata, rep, each = nrow(par_df)))
  # sim_data now repeats each row of newdata by the number of rows in par_df
  # case labels the rows with an integer for later mapping
  sim_data$case <- rep(1:nrow(newdata), each = nrow(par_df))
  sim_data <- cbind(sim_data, par_df)
  sim_data$original_group_level <- sim_data[, groupFctr]
  sim_data[, groupFctr] <- sim_data$groupLevel
  sim_data$groupLevel <- NULL
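  # The original grouping value is preserved in original_group_level, while the
  # grouping factor column now holds the level selected for each break so that
  # predictInterval will simulate the response at that level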
  # Maybe strongly recommend parallel here?
  if (.parallel && requireNamespace("foreach", quietly = TRUE)) {
    # TODO use future here
    setup_parallel()
  } else if (.parallel && !requireNamespace("foreach", quietly = TRUE)) {
    warning("foreach package is unavailable, parallel computing not available")
  }
  out <- predictInterval(merMod, newdata = sim_data,
                         which = "all", ...)
  # Reshape the long predictInterval output so each effect gets its own
  # fit/upr/lwr columns, keyed by the observation index
  out_w <- stats::reshape(out, direction = "wide",
                          idvar = "obs", timevar = "effect",
                          v.names = c("fit", "upr", "lwr"), sep = "_")
  out_w$obs <- NULL
  sim_data <- cbind(sim_data, out_w)
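  # After the merge sim_data has fit/upr/lwr columns for the combined
  # prediction, for each grouping factor, and for the fixed effects alone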
  # case is the row number of the observation in newdata; within each case the
  # rows vary over the selected random effect levels being marginalized over,
  # so downstream summaries would collapse by case
  return(sim_data)
}