#' @name parade
#' @title Generate dataset for a diagnostic parade
#'
#' @description This function generates a parade (= \code{\link[nullabor]{lineup()}} in the \pkg{nullabor} package)
#' that hides the observations, fitted values, and residuals of a statistical model you
#' want to diagnose among the observations, fitted values, and residuals of a number similar models
#' that were fitted on simulated outcome data. The sets of simulated outcome data are
#' generated from the original model so that this model's assumptions are literally true
#' for the simulated data. The 'tibble' (dataframe) created by this can be used to draw
#' panels of diagnostic plots (see examples).
#'
#' @section Transformed predictors:
#' If you want to include transformed predictors in the model call (e.g., \code{log(x)}),
#' transform the predictor before using it in the model call (see examples).
#'
#' This function relies on \code{\link[broom]{augment}} in the \pkg{broom} package. Since \code{augment()}
#' cannot handle model calls with \code{poly()} or \code{ns()}, \code{parade()} can't handle these, either.
#' (For `lmer` models, the \code{\link[broom.mixed]{augment}} function in the \pkg{broom.mixed} package is used.)
#'
#' @param model The name of the statistical model you want to diagnose.
#' Currently only \code{lm()}, \code{gam()} (from the \pkg{mgcv} package) and \code{lmer()} (from the \pkg{lme4} package) models are supported.
#' For the \code{lmer()} models, only residual diagnostics are supported; support for BLUP ('random effects') diagnostics
#' is still lacking.
#' @param full_data By default, the output will only include variables that are part of the model.
#' If you want to include all the variables that are present in the dataframe
#' on which the model was fitted, supply this dataframe's name to full_data.
#' @param size The number of simulated and actual datasets that the parade will contain.
#' This defaults to 20, meaning that the actual dataset will be hidden among 19
#' simulated datasets.
#' @keywords model diagnostics, assumptions, lineup protocol
#' @export
#' @return A tibble containing predictors, outcomes, fitted values and residuals for both the real dataset and simulated datasets.
#'
#' @examples
#' # A simple regression model
#' m <- lm(mpg ~ disp, data = mtcars)
#'
#' # Generate parade and check linearity
#' my_parade <- parade(m)
#' my_parade
#' lin_plot(my_parade)
#' reveal(my_parade)
#'
#' # Regenerate parade and check constant variance
#' my_parade <- parade(m)
#' var_plot(my_parade)
#' reveal(my_parade)
#'
#' # Regenerate parade and check normality
#' my_parade <- parade(m)
#' norm_qq(my_parade)
#' norm_hist(my_parade)
#' norm_hist(my_parade, bins = 10)
#' reveal(my_parade)
#'
#' # If you want to include all predictors in the dataset in the parade:
#' my_parade <- parade(m, full_data = mtcars)
#' my_parade
#'
#' # If you want to generate a parade with 50 instead of 20 plots:
#' my_parade <- parade(m, size = 50)
#' norm_qq(my_parade)
#'
#' # The function also works for generalised additive models fitted with mgcv:
#' library(mgcv)
#' m.gam <- gam(mpg ~ s(disp) + wt + s(qsec), data = mtcars)
#' my_parade <- parade(m.gam)
#' lin_plot(my_parade)
#' my_parade <- parade(m.gam)
#' norm_qq(my_parade)
#'
#' m.gam <- gam(mpg ~ te(disp, qsec) + wt, data = mtcars)
#' my_parade <- parade(m.gam)
#' lin_plot(my_parade)
#'
#' # And has some limited support for lmer() models (from the lme4 package)
#' library(lme4)
#' m.lmer <- lmer(Reaction ~ Days + (Days|Subject), data = sleepstudy)
#' my_parade <- parade(m.lmer)
#' norm_hist(my_parade, bins = 15)
#' # Support for diagnosing the BLUPs would be nice.
#'
#' # Transformed predictors:
#' # This won't work:
#' # m <- lm(mpg ~ log2(disp), data = mtcars)
#' # my_parade <- parade(m)
#'
#' # This will:
#' mtcars$log2.disp <- log2(mtcars$disp)
#' m <- lm(mpg ~ log2.disp, data = mtcars)
#' my_parade <- parade(m)
parade <- function(model, full_data = NULL, size = 20) {
# Throw error for unsupported models. Better support for lmer() would be nice.
if (!(class(model)[1] %in% c("lm", "gam", "lmerMod"))) {
stop(paste0("This function currently only works with lm(), gam() and lmer() models. ",
"The class of the model you tried to fit was ",
class(model), "."))
}
# Throw error if size isn't at least 2.
if (!(size >= 2)) {
stop(paste0("The size parameter needs to be set to at least 2. ",
"It is currently set at ", size, "."))
}
# We'll need the tidyverse (magrittr, broom, dplyr etc.)
if (!requireNamespace("tidyverse", quietly = TRUE)) {
stop("The \"tidyverse\" package is needed for this function to work. Please install it.",
call. = FALSE)
}
require("tidyverse")
# Get data frame with all variables in model
df <- model.frame(model)
# Get the name of the outcome variable
outcome_var <- colnames(df)[1]
# Simulate (size - 1) vectors of outcome data from the model
simulated_data <- data.frame(simulate(model, nsim = size - 1))
# Check if full_data was specified, and if so, if it was correctly specified.
if (is.null(full_data)) {
# OK: use model frame to generate data frame for parade
# Get all the variable names.
other_vars <- colnames(df)[-1]
var_names <- c(other_vars, outcome_var)
df <- cbind(simulated_data, df)
} else if (!is.data.frame(full_data)) {
# full_data needs to be a data frame (tibbles also work)
stop(paste0("You supplied an object to 'full_data' that isn't a data frame. ",
"Its class is ", class(full_data), "."))
} else if (!(all(colnames(df) %in% colnames(full_data)))) {
# Not all variables in the model occur in full_data
stop(paste0("The data frame supplied to 'full_data' needs to contains ",
"all variables used in the model supplied to 'model'."), "\n",
paste0("The following variables are missing in 'full_data': "), "\n",
paste(shQuote(colnames(df)[!(colnames(df) %in% colnames(full_data))]), collapse = ", "), "\n",
paste0("Did you specify the correct data frame?"))
} else if (nrow(df) != nrow(full_data)) {
# Different datasets?
stop(paste0("The dataset on which the model was fitted and the one passed to 'full_data' ",
"contain a different number of observations. ",
"If you're sure you supplied the correct data frame and model, ",
"check if full_data contains missing data in the outcome or predictors ",
"and remove these if necessary."))
} else if (!is.null(full_data)) {
# OK: use full_data to generate date frame
other_vars <- colnames(full_data)[!(colnames(full_data) %in% outcome_var)]
# Get all the variable names
var_names <- c(other_vars, outcome_var)
df <- cbind(simulated_data, df[, outcome_var], full_data[, other_vars])
}
# Rename the columns with the simulated and true outcome variable
colnames(df)[1:size] <- c(paste("..sample", 1:size))
# Convert df to the long format. For now, ..sample 'size' contains
# the true data.
df <- df |>
pivot_longer(starts_with("..sample "),
names_to = "..sample",
values_to = "outcome")
# Rename the column with the true and simulated outcomes appropriately
colnames(df)[ncol(df)] <- outcome_var
# Refit the model to each vector of simulated and true outcomes.
# For more complex models, it may save some time to not refit the
# last model, but I'll leave that to someone with 1337 R skillz.
if (class(model)[1] %in% c("lm", "gam")) {
df <- df |>
group_by(..sample) |>
# Refit the original model to each sample (using update()),
# output the fitted and residual values (using augment()),
# and retain dfs variables (using data = .):
do(broom::augment(update(model, data = .), data = .)) |>
# Add transformed residuals
mutate(.abs_resid = abs(.resid)) |>
mutate(.sqrt_abs_resid = sqrt(abs(.resid))) |>
# augment() outputs some additional info;
# only retain variables, fitted values and residuals here
select("..sample", make.names(var_names), ".fitted", ".resid", ".abs_resid", ".sqrt_abs_resid") |>
# small problem here: if I(...^2), this gets converted to I.disp.2.
ungroup()
}
else if (class(model)[1] == "lmerMod") {
# Especially for lmer fits, the model won't always converge.
# The following function adds whether there was a convergence problem
# to the output.
augment_lmer <- function(model, data) {
model_refit <- update(model, data = data)
results <- broom.mixed::augment(model_refit, data = data)
results$convergence_error <- ifelse(is.null(model_refit@optinfo$conv$lme4$code), 0, 1)
return(results)
}
# Only simulated datasets that didn't yield convergence errors are retained in the output.
df <- df |>
group_by(..sample) |>
# Refit the original model to each sample,
# output the fitted and residual values,
# and retain dfs variables:
do(augment_lmer(model, data = .)) |>
# Remove simulated datasets for which there was a convergence error
filter(convergence_error == 0) |>
# Add transformed residuals
mutate(.abs_resid = abs(.resid)) |>
mutate(.sqrt_abs_resid = sqrt(abs(.resid))) |>
# augment() outputs some additional info;
# only retain variables, fitted values and residuals here
select("..sample", all_of(var_names), ".fitted", ".resid", ".abs_resid", ".sqrt_abs_resid") |>
ungroup()
}
# Generate a random number between 1 and size. This will be
# the position of the true data in the parade.
pos <- sample(1:size, 1)
# This ugly bit moves the true data to position 'pos':
df_sample <- data.frame(
.sample2 = paste("..sample", 1:size),
.sample = c((1:size)[-pos], pos)
)
df_sample$.sample2 <- as.character(df_sample$.sample2)
df <- df |>
left_join(df_sample, by = c("..sample" = ".sample2")) |>
select(-..sample)
# Add the true data's position as an attribute to the parade.
attr(df, "position") <- pos
# Add the names of the variables by category
attr(df, "outcome_var") <- outcome_var
attr(df, "other_vars") <- other_vars
attr(df, "predictor_vars") <- colnames(model.frame(model)[-1])
# Specify that this parade contains the raw data, not summaries
attr(df, "data_type") <- "raw"
# Add information about the model class
attr(df, "model_class") <- class(model)[1]
# Warning about convergence errors
unique_samples <- length(unique(df$.sample))
if (unique_samples < size) {
warning(paste0("There were convergence errors for ",
size - unique_samples,
" dataset(s). These datasets are not retained in the output."))
}
# Output the parade
df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.