#' Adjust data for the effect of other variable(s)
#'
#' This function can be used to adjust the data for the effect of other
#' variables present in the dataset. It is based on an underlying fitting of
#' regressions models, allowing for quite some flexibility, such as including
#' factors as random effects in mixed models (multilevel partialization),
#' continuous variables as smooth terms in general additive models (non-linear
#' partialization) and/or fitting these models under a Bayesian framework. The
#' values returned by this function are the residuals of the regression models.
#' Note that a regular correlation between two "adjusted" variables is
#' equivalent to the partial correlation between them.
#'
#' @param data A data frame.
#' @param effect Character vector of column names to be adjusted for (regressed
#' out). If `NULL` (the default), all variables will be selected.
#' @param multilevel If `TRUE`, the factors are included as random factors.
#' Else, if `FALSE` (default), they are included as fixed effects in the
#' simple regression model.
#' @param additive If `TRUE`, continuous variables as included as smooth terms
#' in additive models. The goal is to regress-out potential non-linear
#' effects.
#' @param bayesian If `TRUE`, the models are fitted under the Bayesian framework
#' using `rstanarm`.
#' @param keep_intercept If `FALSE` (default), the intercept of the model is
#' re-added. This avoids the centering around 0 that happens by default
#' when regressing out another variable (see the examples below for a
#' visual representation of this).
#' @inheritParams extract_column_names
#' @inheritParams standardize
#'
#' @return A data frame comparable to `data`, with adjusted variables.
#'
#' @examplesIf all(insight::check_if_installed(c("bayestestR", "rstanarm", "gamm4"), quietly = TRUE))
#' adjusted_all <- adjust(attitude)
#' head(adjusted_all)
#' adjusted_one <- adjust(attitude, effect = "complaints", select = "rating")
#' head(adjusted_one)
#' \donttest{
#' adjust(attitude, effect = "complaints", select = "rating", bayesian = TRUE)
#' adjust(attitude, effect = "complaints", select = "rating", additive = TRUE)
#' attitude$complaints_LMH <- cut(attitude$complaints, 3)
#' adjust(attitude, effect = "complaints_LMH", select = "rating", multilevel = TRUE)
#' }
#'
#' # Generate data
#' data <- bayestestR::simulate_correlation(n = 100, r = 0.7)
#' data$V2 <- (5 * data$V2) + 20 # Add intercept
#'
#' # Adjust
#' adjusted <- adjust(data, effect = "V1", select = "V2")
#' adjusted_icpt <- adjust(data, effect = "V1", select = "V2", keep_intercept = TRUE)
#'
#' # Visualize
#' plot(
#' data$V1, data$V2,
#' pch = 19, col = "blue",
#' ylim = c(min(adjusted$V2), max(data$V2)),
#' main = "Original (blue), adjusted (green), and adjusted - intercept kept (red) data"
#' )
#' abline(lm(V2 ~ V1, data = data), col = "blue")
#' points(adjusted$V1, adjusted$V2, pch = 19, col = "green")
#' abline(lm(V2 ~ V1, data = adjusted), col = "green")
#' points(adjusted_icpt$V1, adjusted_icpt$V2, pch = 19, col = "red")
#' abline(lm(V2 ~ V1, data = adjusted_icpt), col = "red")
#'
#' @export
adjust <- function(data,
effect = NULL,
select = is.numeric,
exclude = NULL,
multilevel = FALSE,
additive = FALSE,
bayesian = FALSE,
keep_intercept = FALSE,
ignore_case = FALSE,
regex = FALSE,
verbose = FALSE) {
# make sure column names are syntactically valid
.check_dataframe_names(data, action = "error")
# check for formula notation, convert to character vector
if (inherits(effect, "formula")) {
effect <- all.vars(effect)
}
# Find predictors
if (is.null(effect)) {
effect <- names(data)
}
if (is.null(select)) {
select <- is.numeric
}
select <- .select_nse(select,
data,
exclude,
ignore_case,
regex = regex,
verbose = verbose
)
# Factors
formula_random <- NULL
facs <- names(data[effect][!vapply(data[effect], is.numeric, logical(1L))])
if (length(facs) >= 1 && multilevel) {
if (additive) {
formula_random <- stats::as.formula(paste("~", paste(paste0("(1|", facs, ")"), collapse = " + ")))
} else {
formula_random <- paste("+", paste(paste0("(1|", facs, ")"), collapse = " + "))
}
effect <- effect[!effect %in% facs]
}
# Fit models
out <- data.frame(.ID = seq_len(nrow(data)))
for (var in select) {
predictors <- effect[effect != var]
if (additive) {
predictors_num <- names(data[predictors][vapply(data[predictors], is.numeric, logical(1L))])
predictors[predictors == predictors_num] <- paste0("s(", predictors_num, ")")
}
formula_predictors <- paste(c("1", predictors), collapse = " + ")
model_formula <- paste(var, "~", formula_predictors)
x <- .model_adjust_for(
data = data[unique(c(var, effect, facs))],
model_formula = model_formula,
multilevel = multilevel,
additive = additive,
bayesian = bayesian,
formula_random = formula_random,
keep_intercept = keep_intercept
)
out[var] <- x
}
out[names(data)[!names(data) %in% names(out)]] <- data[names(data)[!names(data) %in% names(out)]]
out[names(data)]
}
#' @rdname adjust
#' @export
data_adjust <- adjust
#' @keywords internal
.model_adjust_for <- function(data,
model_formula,
multilevel = FALSE,
additive = FALSE,
bayesian = FALSE,
formula_random = NULL,
keep_intercept = FALSE) {
# Additive -----------------------
if (additive) {
# Bayesian
if (bayesian) {
insight::check_if_installed("rstanarm")
model <- rstanarm::stan_gamm4(stats::as.formula(model_formula), random = formula_random, data = data, refresh = 0)
# Frequentist
} else {
insight::check_if_installed("gamm4")
model <- gamm4::gamm4(stats::as.formula(model_formula), random = formula_random, data = data)
}
# Linear -------------------------
} else if (bayesian) {
# Bayesian
insight::check_if_installed("rstanarm")
if (multilevel) {
model <- rstanarm::stan_lmer(paste(model_formula, formula_random), data = data, refresh = 0)
} else {
model <- rstanarm::stan_glm(model_formula, data = data, refresh = 0)
}
} else if (multilevel) {
# Frequentist
insight::check_if_installed("lme4")
model <- lme4::lmer(paste(model_formula, formula_random), data = data)
} else {
model <- stats::lm(model_formula, data = data)
}
adjusted <- insight::get_residuals(model)
# Re-add intercept if need be
if (keep_intercept) {
intercept <- insight::get_intercept(model)
if (length(intercept) > 1) intercept <- stats::median(intercept) # For bayesian model
if (is.na(intercept)) intercept <- 0
adjusted <- adjusted + intercept
}
# Deal with missing data
out <- rep(NA, nrow(data))
out[stats::complete.cases(data)] <- as.vector(adjusted)
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.