Nothing
#' @title Dual multicollinearity filtering algorithm
#'
#' @description
#' Automatizes multicollinearity filtering via pairwise correlation and/or variance inflation factors in dataframes with numeric and categorical predictors.
#'
#' The argument \code{max_cor} determines the maximum pairwise correlation allowed in the resulting selection of predictors, while \code{max_vif} does the same for variance inflation factors.
#'
#' The argument \code{preference_order} accepts a character vector of predictor names ranked from first to last index, or a dataframe resulting from [preference_order()]. When two predictors in this vector or dataframe are highly collinear, the one with a lower ranking is removed. This option helps protect predictors of interest. If not provided, predictors are ranked from lower to higher multicollinearity.
#' Please check the sections **Variance Inflation Factors**, **VIF-based Filtering**, and **Pairwise Correlation Filtering** at the end of this help file for further details.
#'
#'
#' @inheritSection collinear Pairwise Correlation Filtering
#' @inheritSection collinear Variance Inflation Factors
#' @inheritSection collinear VIF-based Filtering
#'
#' @inheritParams collinear
#' @param response (optional; character or NULL) Name of one response variable in \code{df}. Used to exclude columns when \code{predictors} is NULL, and to filter \code{preference_order} when it is a dataframe and contains several responses. Default: NULL.
#' @param max_cor (optional; numeric or NULL) Maximum correlation allowed between pairs of \code{predictors}. Valid values are between 0.01 and 0.99, and recommended values are between 0.5 (strict) and 0.9 (permissive). Default: 0.7
#'
#' @param max_vif (optional, numeric or NULL) Maximum Variance Inflation Factor allowed for \code{predictors} during multicollinearity filtering. Recommended values are between 2.5 (strict) and 10 (permissive). Default: 5
#' @return character vector: names of selected predictors
#' @examples
#' data(vi_smol)
#'
#' ## OPTIONAL: parallelization setup
#' ## irrelevant when all predictors are numeric
#' ## only worth it for large data with many categoricals
#' # future::plan(
#' # future::multisession,
#' # workers = future::availableCores() - 1
#' # )
#'
#' ## OPTIONAL: progress bar
#' # progressr::handlers(global = TRUE)
#'
#' x <- collinear_select(
#' df = vi_smol,
#' predictors = c(
#' "koppen_zone", #character
#' "soil_type", #factor
#' "topo_elevation", #numeric
#' "soil_temperature_mean" #numeric
#' ),
#' max_cor = 0.7,
#' max_vif = 5
#' )
#'
#' x
#'
#' ## OPTIONAL: disable parallelization
#' #future::plan(future::sequential)
#' @autoglobal
#' @family multicollinearity_filtering
#' @author Blas M. Benito, PhD
#' @export
collinear_select <- function(
df = NULL,
response = NULL,
predictors = NULL,
preference_order = NULL,
max_cor = 0.61,
max_vif = 5,
quiet = FALSE,
...
) {
dots <- list(...)
function_name <- validate_arg_function_name(
default_name = "collinear::collinear_select()",
function_name = dots$function_name
)
df <- validate_arg_df_not_null(
df = df,
function_name = function_name
)
quiet <- validate_arg_quiet(
function_name = function_name,
quiet = quiet
)
response <- validate_arg_responses(
df = df,
responses = response,
max_responses = 1,
quiet = quiet,
function_name = function_name
)
predictors <- validate_arg_predictors(
df = df,
responses = response,
predictors = predictors,
quiet = quiet,
function_name = function_name
)
df.ncol <- ncol(df)
df <- validate_arg_df(
df = df,
responses = response,
predictors = predictors,
quiet = quiet,
function_name = function_name
)
#revalidate predictors if any columns were removed
if (ncol(df) < df.ncol) {
attributes(response)$validated <- NULL
attributes(predictors)$validated <- NULL
response <- validate_arg_responses(
df = df,
responses = response,
max_responses = 1,
quiet = quiet,
function_name = function_name
)
predictors <- validate_arg_predictors(
df = df,
responses = response,
predictors = predictors,
quiet = quiet,
function_name = function_name
)
}
#univariate case
if (length(predictors) == 1) {
if (quiet == FALSE) {
message(
"\n",
function_name,
": only one valid predictor in 'predictors', skipping multicollinearity filtering."
)
}
return(predictors)
}
if (is.null(max_cor) && is.null(max_vif)) {
stop(
"\n",
function_name,
": arguments 'max_cor' and 'max_vif' cannot be NULL at once.",
call. = FALSE
)
}
m <- cor_matrix(
df = df,
predictors = predictors,
function_name = function_name,
quiet = quiet,
m = dots$m
)
max_cor <- validate_arg_max_cor(
max_cor = max_cor,
function_name = function_name,
quiet = quiet
)
max_vif <- validate_arg_max_vif(
max_vif = max_vif,
function_name = function_name,
quiet = quiet
)
preference.order <- validate_arg_preference_order(
df = df,
response = response,
predictors = predictors,
preference_order = preference_order,
quiet = quiet,
function_name = function_name,
m = m
)
preference.order <- preference.order$predictor
#organize the correlation matrix according to preference_order
m.class <- class(m)
m <- m[
preference.order,
preference.order
]
class(m) <- m.class
#check if filtering is needed
skip_cor <- FALSE
skip_vif <- FALSE
#check cor threshold
if (!is.null(max_cor)) {
if (max(abs(m[upper.tri(m)])) <= max_cor) {
skip_cor <- TRUE
if (quiet == FALSE && is.null(max_vif)) {
message(
"\n",
function_name,
": maximum pairwise correlation is <= ",
max_cor,
", multicollinearity filtering is not required."
)
return(predictors)
}
}
}
if (!is.null(max_vif)) {
current_max_vif <- max(
vif(
m = m,
quiet = quiet,
function_name = function_name
)
)
#check vif threshold
if (current_max_vif <= max_vif) {
skip_vif <- TRUE
if (quiet == FALSE && is.null(max_cor)) {
message(
"\n",
function_name,
": maximum VIF is <= ",
max_vif,
", multicollinearity filtering is not required."
)
return(predictors)
}
}
}
#both are below thresholds
if (skip_cor && skip_vif) {
if (quiet == FALSE) {
message(
"\n",
function_name,
": multicollinearity is below 'max_cor' and 'max_vif', filtering is not required."
)
}
return(predictors)
}
#filtering loop
selected <- preference.order[1]
candidates <- preference.order[-1]
#iterate over candidates
for (candidate in candidates) {
#correlation criterion (if applicable)
cor_ok <- TRUE
if (!is.null(max_cor) && !skip_cor) {
cor_ok <- max(abs(m[selected, candidate])) <= max_cor
}
#VIF criterion, only checked if cor passed
vif_ok <- TRUE
if (!is.null(max_vif) && !skip_vif && cor_ok) {
selected_columns <- c(
selected,
candidate
)
current_vif <- vif(
m = m[
selected_columns,
selected_columns
],
quiet = quiet,
function_name = function_name
) |>
max()
vif_ok <- current_vif <= max_vif
}
#add candidate if all criteria is TRUE
if (cor_ok && vif_ok) {
selected <- c(selected, candidate)
}
}
attr(
x = selected,
which = "validated"
) <- TRUE
selected
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.