#' @name parade_summary
#' @title Summarise residual information in a diagnostic parade per cell or per unique predictor value
#'
#' @description This function takes the output of the \code{\link{parade}} function
#' and summarises the residuals (and the fitted values) for each unique
#' combination of variables. This may be useful when checking the
#' constant-variance assumption but the nature of the data is such that
#' plotting the raw residuals will make them stand out from the distractor
#' plots even if non-constant variance isn't much of a problem.
#' Feed the output of this function to \code{\link{var_plot}} to obtain a quick
#' diagnostic plot for the constant-variance assumption.
#'
#' I think \code{parade_summary} is mostly useful when dealing with fairly
#' discrete outcome data, so the function will throw a warning if the
#' outcome data seem fairly continuous. (Arbitrarily when there are
#' more than 20 unique outcome values.)
#' It will also throw a warning when the non-outcome data used to define
#' the cells aren't categorical or when the number of observations per
#' cell seems low (arbitrarily fewer than 5 observations).
#'
#' The assignment of cells in the design to cell numbers in the parade
#' summary is random. That is, one particular predictor combination
#' may be associated with Cell 3 when running `parade_summary()` one time,
#' but with Cell 1 when running it a second time. Within a given parade
#' summary, however, the same cell number always refers to the same
#' combination of predictor combinations.
#'
#' @param parade The name of an object generated using the \code{parade()} function.
#' @param predictors_only If you supplied a dataset to the \code{full_data} argument
#' in the \code{parade()} call, \code{parade_summary()} will by default compute mean fitted values
#' and a host of residual summaries for each unique combination of all the non-outcome
#' variables in this dataset. To override this behaviour, set \code{predictors_only}
#' to \code{FALSE}; this causes \code{parade_summary()} to only compute mean fitted values
#' and residual summaries for each unique combination of the predictors that
#' are actually in the model.
#' @keywords model diagnostics, assumptions, lineup protocol
#' @export
#' @examples
#' # Fit model
#' m <- lm(mpg ~ gear, data = mtcars)
#'
#' # Generate parade
#' my_parade <- parade(m)
#'
#' # Summarise residuals by cell -
#' # you'll get some warnings. The second one
#' # because 'gear' is represented as a numeric variable.
#' my_sum_parade <- parade_summary(my_parade)
#'
#' # Draw plot
#' var_plot(my_sum_parade)
#'
#' reveal(my_sum_parade)
#' # or
#' reveal(my_parade)
parade_summary <- function(parade, predictors_only = FALSE) {
# This function computes the mean residuals and fitted
# values per cell for all samples in a parade. The cells
# are defined as the unique combinations of values of
# the non-outcome variables in the dataset (default) or
# as the unique combinations of the values of the
# model's predictor variables (if 'predictors_only = TRUE').
# I think this function is mainly useful if:
# - there is a limited number of cells (i.e., no continuous
# non-outcome variables);
# - the outcome variable is pretty categorical (e.g.,
# if it was measured on a Likert-scale) so that it will stand
# out in standard diagnostic plots like a sore thumb.
# The function throws warnings if some grouping variables aren't
# categorical or if the outcome variable isn't all too categorical.
# Check if parade is a parade
if (is.null(attr(parade, "position"))) {
stop("The object you supplied doesn't seem to be a parade generated by the parade() function.")
}
# We'll need the tidyverse (magrittr, broom, dplyr etc.)
if (!requireNamespace("tidyverse", quietly = TRUE)) {
stop("The \"tidyverse\" package is needed for this function to work. Please install it.",
call. = FALSE)
}
require("tidyverse")
# Throw a warning if the outcome variable isn't very categorical.
# (Arbitrarily defined as 20 or more unique observations.)
n_outcome_levels <- parade |>
filter(.sample == attr(parade, "position")) |>
select(attr(parade, "outcome_var")) |>
distinct() |>
nrow()
if (n_outcome_levels > 19) {
warning(paste0("The outcome variable (", attr(parade, "outcome_var"), ") contains ",
n_outcome_levels, " unique values. ",
"Perhaps you can draw standard diagnostic plots ",
"instead of averaging the residuals?"))
}
if (predictors_only == FALSE) {
# Summarise by all variables,
# including those not used for fitting model
grouping_vars <- attr(parade, "other_vars")
} else if (predictors_only == TRUE) {
# Summarise by predictor variables only.
grouping_vars <- attr(parade, "predictor_vars")
}
# Throw a warning if not all grouping variables are characters or factors.
if (!(all(sapply(parade[, grouping_vars], class) %in% c("character", "factor")))) {
warning(paste0("Not all grouping variables are characters or factors. ",
"Are you sure you want to use this function?"))
}
df_summary <- parade |>
group_by(.sample, !!! rlang::syms(grouping_vars)) |>
summarise(.fitted = unique(.fitted),
.mean_resid = mean(.resid, na.rm = TRUE),
.mean_abs_resid = mean(.abs_resid, na.rm = TRUE),
.mean_sqrt_abs_resid = mean(.sqrt_abs_resid, na.rm = TRUE),
.var = var(.resid, na.rm = TRUE),
.sd = sd(.resid, na.rm = TRUE),
.n = n()) |>
ungroup() |>
# Randomly reorder the summary data: this ensures that the mapping of the data to the
# cells is arbitrary.
sample_frac(1, replace = FALSE) |>
group_by(.sample) |>
mutate(.cell = factor(row_number())) |>
ungroup()
# Throw a warning if any of the cells hardly contain any data.
if (min(df_summary$.n) < 5) {
warning(paste0("Some cells contain only ",
min(df_summary$.n), " observation(s)."))
}
# Add the position of the true data
attr(df_summary, "position") <- attr(parade, "position")
# Add the names of the variables by category
attr(df_summary, "outcome_var") <- attr(parade, "outcome_var")
attr(df_summary, "other_vars") <- attr(parade, "other_vars")
attr(df_summary, "predictor_vars") <- attr(parade, "predictor_vars")
# Specify that this parade contains the summary data, not the raw data
attr(df_summary, "data_type") <- "summary"
df_summary
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.