Nothing
# AUTO SELECTION ----
#' Select Function to Compute Preference Order
#'
#' @description
#' Internal function to select a proper f_...() function to compute preference order depending on the types of the response variable and the predictors. The selection criteria is available as a data frame generated by [f_auto_rules()].
#'
#'
#' @inheritParams collinear
#' @return function name
#' @family preference_order_tools
#' @export
#' @autoglobal
#' @examples
#' f <- f_auto(
#' df = vi[1:1000, ],
#' response = "vi_numeric",
#' predictors = vi_predictors_numeric
#' )
f_auto <- function(
df = NULL,
response = NULL,
predictors = NULL,
quiet = FALSE
){
if(is.null(df) || is.null(response) || is.null(predictors)) {
return(NULL)
}
#data frame with heuristic
rules <- f_auto_rules()
response_type <- identify_response_type(
df = df,
response = response,
quiet = quiet
)
if(response_type == "unknown"){
stop(
"collinear::f_auto(): response type is 'unknown', please select an f_...() function suitable for your response data.",
call. = FALSE
)
}
predictors_type <- identify_predictors_type(
df = df,
predictors = predictors
)
if(predictors_type == "unknown"){
stop(
"collinear::f_auto(): predictors type is 'unknown', please select an f_...() function suitable for your predictor data.",
call. = FALSE
)
}
#select function
f_name <- rules[
rules$response_type == response_type &
rules$predictors_type == predictors_type,
"name"
]
if(quiet == FALSE){
message(
"\ncollinear::f_auto(): selected function: '",
f_name,
"()'."
)
}
f_name
}
#' Data Frame of Preference Functions
#'
#' @return data frame
#' @export
#' @autoglobal
#' @family preference_order_tools
#' @examples
#' f_functions()
f_functions <- function(){
f_list <- list(
c("f_r2_pearson", "numeric", "numeric", "cor(x, y, method = 'pearson')^2", "r-squared"),
c("f_r2_spearman", "numeric", "numeric", "cor(x, y, method = 'spearman')^2", "pseudo r-squared"),
c("f_r2_glm_gaussian", "numeric", "numeric, categorical", "glm(y ~ x, family = gaussian(link = 'identity'))", "r-squared"),
c("f_r2_glm_gaussian_poly2", "numeric", "numeric, categorical", "glm(y ~ poly(x, degree = 2, raw = TRUE), family = gaussian(link = 'identity'))", "r-squared"),
c("f_r2_gam_gaussian", "numeric", "numeric, categorical", "mgcv::gam(y ~ s(x), family = gaussian(link = 'identity'))", "r-squared"),
c("f_r2_rpart", "numeric", "numeric, categorical", "rpart::rpart(y ~ x)", "r-squared"),
c("f_r2_rf", "numeric", "numeric, categorical", "ranger::ranger(y ~ x)", "r-squared"),
c("f_r2_glm_poisson", "integer counts", "numeric, categorical", "glm(y ~ x, family = poisson(link = 'log'))", "r-squared"),
c("f_r2_glm_poisson_poly2", "integer counts", "numeric, categorical", "glm(y ~ poly(x, degree = 2, raw = TRUE), family = poisson(link = 'log'))", "r-squared"),
c("f_r2_gam_poisson", "integer counts", "numeric, categorical", "mgcv::gam(y ~ s(x), family = poisson(link = 'log'))", "r-squared"),
c("f_auc_glm_binomial", "binomial", "numeric, categorical", "glm(y ~ x, family = quasibinomial(link = 'logit'), weights = case_weights(y))", "AUC"),
c("f_auc_glm_binomial_poly2", "binomial", "numeric, categorical", "glm(y ~ poly(x, degree = 2, raw = TRUE), family = quasibinomial(link = 'logit'), weights = collinear::case_weights(y))", "AUC"),
c("f_auc_glm_binomial_poly2", "binomial", "numeric, categorical", "glm(y ~ poly(x, degree = 2, raw = TRUE), family = quasibinomial(link = 'logit'), weights = collinear::case_weights(y))", "AUC"),
c("f_auc_gam_binomial", "binomial", "numeric, categorical", "mgcv::gam(y ~ s(x), family = quasibinomial(link = 'logit'), weights = collinear::case_weights(y))", "AUC"),
c("f_auc_rpart", "binomial", "numeric, categorical", "rpart::rpart(y ~ x, weights = collinear::case_weights(y))", "AUC"),
c("f_auc_rf", "binomial", "numeric, categorical", "ranger::ranger(y ~ x, case.weights = collinear::case_weights(y))", "AUC"),
c("f_v", "categorical", "categorical", "collinear::cramer_v(x, y)", "Cramer's V"),
c("f_v_rf_categorical", "categorical", "numeric, categorical", "ranger::ranger(y ~ x, case.weights = collinear::case_weights(y))", "Cramer's V")
)
f_df <- f_list |>
as.data.frame() |>
t() |>
as.data.frame()
rownames(f_df) <- NULL
colnames(f_df) <- c(
"name",
"response_type",
"predictors_types",
"expression",
"preference_metric"
)
f_df
}
#' Rules to Select Default f Argument to Compute Preference Order
#'
#' @description
#' Data frame with rules used by [f_auto()] to select the function `f` to compute preference order in [preference_order()].
#'
#'
#' @return data frame
#' @family preference_order_tools
#' @export
#' @autoglobal
#' @examples
#' f_auto_rules()
f_auto_rules <- function(){
data.frame(
name = c(
"f_r2_rf",
"f_r2_rf",
"f_r2_rf",
"f_r2_rf",
"f_r2_rf",
"f_r2_rf",
"f_r2_pearson",
"f_r2_rf",
"f_r2_rf",
"f_auc_rf",
"f_auc_rf",
"f_auc_rf",
"f_r2_rf",
"f_r2_rf",
"f_r2_rf",
"f_r2_rf",
"f_r2_rf",
"f_r2_rf",
"f_r2_rf",
"f_r2_rf",
"f_r2_rf",
"f_v_rf_categorical",
"f_v",
"f_v_rf_categorical"
),
response_type = c(
"continuous-binary",
"continuous-binary",
"continuous-binary",
"continuous-low",
"continuous-low",
"continuous-low",
"continuous-high",
"continuous-high",
"continuous-high",
"integer-binomial",
"integer-binomial",
"integer-binomial",
"integer-binary",
"integer-binary",
"integer-binary",
"integer-low",
"integer-low",
"integer-low",
"integer-high",
"integer-high",
"integer-high",
"categorical",
"categorical",
"categorical"
),
predictors_type = c(
"numeric",
"categorical",
"mixed",
"numeric",
"categorical",
"mixed",
"numeric",
"categorical",
"mixed",
"numeric",
"categorical",
"mixed",
"numeric",
"categorical",
"mixed",
"numeric",
"categorical",
"mixed",
"numeric",
"categorical",
"mixed",
"numeric",
"categorical",
"mixed"
)
)
}
# NUMERIC RESPONSE ----
#' Association Between a Continuous Response and a Continuous Predictor
#'
#' @description
#' These functions take a data frame with two numeric continuous columns "x" (predictor) and "y" (response), fit a univariate model, and return the R-squared of the observations versus the model predictions:
#' \itemize{
#'
#' \item `f_r2_pearson()`: Pearson's R-squared.
#'
#' \item `f_r2_spearman()`: Spearman's R-squared.
#'
#' \item `f_r2_glm_gaussian()`: Pearson's R-squared of a GLM model fitted with [stats::glm()], with formula `y ~ s(x)` and family `stats::gaussian(link = "identity")`.
#'
#' \item `f_r2_glm_gaussian_poly2()`: Pearson's R-squared of a GLM model fitted with [stats::glm()], with formula `y ~ stats::poly(x, degree = 2, raw = TRUE)` and family `stats::gaussian(link = "identity")`.
#'
#' \item `f_r2_gam_gaussian()`: Pearson's R-squared of a GAM model fitted with [mgcv::gam()], with formula `y ~ s(x)` and family `stats::gaussian(link = "identity")`.
#' \item `f_r2_rpart()`: Pearson's R-squared of a Recursive Partition Tree fitted with [rpart::rpart()] with formula `y ~ x`.
#'
#' \item `f_r2_rf()`: Pearson's R-squared of a 100 trees Random Forest model fitted with [ranger::ranger()] and formula `y ~ x`.
#'
#' }
#'
#' @param df (required, data frame) with columns:
#' \itemize{
#' \item "x": (numeric) continuous predictor.
#' \item "y" (numeric) continuous response.
#' }
#'
#' @return numeric: R-squared
#' @examples
#'
#load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #numeric response and predictor
#' #to data frame without NAs
#' df <- data.frame(
#' y = vi[["vi_numeric"]],
#' x = vi[["swi_max"]]
#' ) |>
#' na.omit()
#'
#' # Continuous response
#'
#' #Pearson R-squared
#' f_r2_pearson(df = df)
#'
#' #Spearman R-squared
#' f_r2_spearman(df = df)
#'
#' #R-squared of a gaussian gam
#' f_r2_glm_gaussian(df = df)
#'
#' #gaussian glm with second-degree polynomials
#' f_r2_glm_gaussian_poly2(df = df)
#'
#' #R-squared of a gaussian gam
#' f_r2_gam_gaussian(df = df)
#'
#' #recursive partition tree
#' f_r2_rpart(df = df)
#'
#' #random forest model
#' f_r2_rf(df = df)
#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @examples
#'
#' #load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #continuous response and predictor
#' #to data frame without NAs
#' df <- data.frame(
#' y = vi[["vi_numeric"]],
#' x = vi[["swi_max"]]
#' ) |>
#' na.omit()
#'
#' # Continuous response
#'
#' #Pearson R-squared
#' f_r2_pearson(df = df)
#'
#' #Spearman R-squared
#' f_r2_spearman(df = df)
#'
#' #R-squared of a gaussian gam
#' f_r2_glm_gaussian(df = df)
#'
#' #gaussian glm with second-degree polynomials
#' f_r2_glm_gaussian_poly2(df = df)
#'
#' #R-squared of a gaussian gam
#' f_r2_gam_gaussian(df = df)
#'
#' #recursive partition tree
#' f_r2_rpart(df = df)
#'
#' #random forest model
#' f_r2_rf(df = df)
#'
#' @name f_r2
NULL
#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_pearson <- function(df){
stats::cor(
x = df[["x"]],
y = df[["y"]],
method = "pearson"
)^2
}
#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_spearman <- function(df){
stats::cor(
x = df[["x"]],
y = df[["y"]],
method = "spearman"
)^2
}
#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_glm_gaussian <- function(df){
p <- stats::glm(
formula = y ~ x,
data = df,
family = stats::gaussian(
link = "identity"
)
) |>
stats::predict(
type = "response"
) |>
suppressWarnings() |>
suppressMessages()
performance_score_r2(
o = df[["y"]],
p = p
)
}
#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_glm_gaussian_poly2 <- function(df){
p <- stats::glm(
formula = y ~ stats::poly(
x,
degree = 2,
raw = TRUE
),
data = df,
family = stats::gaussian(
link = "identity"
)
) |>
stats::predict(
type = "response"
) |>
suppressWarnings() |>
suppressMessages()
performance_score_r2(
o = df[["y"]],
p = p
)
}
#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_gam_gaussian <- function(df){
p <- mgcv::gam(
formula = y ~ s(x),
data = df,
family = stats::gaussian(link = "identity"),
select = TRUE
) |>
stats::predict(
type = "response"
)
performance_score_r2(
o = df[["y"]],
p = p
)
}
#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_rpart <- function(df){
p <- rpart::rpart(
formula = y ~ x,
data = df,
control = rpart::rpart.control(
minbucket = floor(nrow(df) * 0.05)
)
) |>
stats::predict(
type = "vector"
)
performance_score_r2(
o = df[["y"]],
p = p
)
}
#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_rf <- function(df){
m <- ranger::ranger(
formula = y ~ x,
data = df,
num.threads = 1,
num.trees = 100,
min.node.size = floor(nrow(df) * 0.05),
seed = 1
)
p <- stats::predict(
object = m,
data = df
)$predictions
performance_score_r2(
o = df[["y"]],
p = p
)
}
# COUNTS RESPONSE ----
#' Association Between a Count Response and a Continuous Predictor
#'
#' @description
#' These functions take a data frame with a integer counts response "y", and a continuous predictor "x", fit a univariate model, and return the R-squared of observations versus predictions:
#' \itemize{
#'
#' \item `f_r2_glm_poisson()` Pearson's R-squared between a count response and the predictions of a GLM model with formula `y ~ x` and family `stats::poisson(link = "log")`.
#'
#' \item `f_r2_glm_poisson_poly2()` Pearson's R-squared between a count response and the predictions of a GLM model with formula `y ~ stats::poly(x, degree = 2, raw = TRUE)` and family `stats::poisson(link = "log")`.
#'
#' \item `f_r2_gam_poisson()` Pearson's R-squared between a count response and the predictions of a [mgcv::gam()] model with formula `y ~ s(x)` and family `stats::poisson(link = "log")`.
#'
#' \item `f_r2_rpart()`: Pearson's R-squared of a Recursive Partition Tree fitted with [rpart::rpart()] with formula `y ~ x`.
#'
#' \item `f_r2_rf()`: Pearson's R-squared of a 100 trees Random Forest model fitted with [ranger::ranger()] and formula `y ~ x`.
#' }
#'
#' @param df (required, data frame) with columns:
#' \itemize{
#' \item "x": (numeric) continuous predictor.
#' \item "y" (integer) counts response.
#' }
#' @rdname f_r2_counts
#' @family preference_order_functions
#' @examples
#'
#' #load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #integer counts response and continuous predictor
#' #to data frame without NAs
#' df <- data.frame(
#' y = vi[["vi_counts"]],
#' x = vi[["swi_max"]]
#' ) |>
#' na.omit()
#'
#' #GLM model with Poisson family
#' f_r2_glm_poisson(df = df)
#'
#' #GLM model with second degree polynomials and Poisson family
#' f_r2_glm_poisson_poly2(df = df)
#'
#' #GAM model with Poisson family
#' f_r2_gam_poisson(df = df)
#' @name f_r2_counts
NULL
#' @autoglobal
#' @rdname f_r2_counts
#' @family preference_order_functions
#' @export
f_r2_glm_poisson <- function(df){
p <- stats::glm(
formula = y ~ x,
data = df,
family = stats::poisson(
link = "log"
)
) |>
stats::predict(
type = "response"
) |>
suppressWarnings() |>
suppressMessages()
performance_score_r2(
o = df[["y"]],
p = p
)
}
#' @autoglobal
#' @rdname f_r2_counts
#' @family preference_order_functions
#' @export
f_r2_glm_poisson_poly2 <- function(df){
p <- stats::glm(
formula = y ~ stats::poly(
x,
degree = 2,
raw = TRUE
),
data = df,
family = stats::poisson(
link = "log"
)
) |>
stats::predict(
type = "response"
) |>
suppressWarnings() |>
suppressMessages()
performance_score_r2(
o = df[["y"]],
p = p
)
}
#' @autoglobal
#' @rdname f_r2_counts
#' @family preference_order_functions
#' @export
f_r2_gam_poisson <- function(df){
p <- mgcv::gam(
formula = y ~ s(x),
data = df,
family = stats::poisson(link = "log"),
select = TRUE
) |>
stats::predict(
type = "response"
)
performance_score_r2(
o = df[["y"]],
p = p
)
}
# BINOMIAL RESPONSE ----
#' Association Between a Binomial Response and a Continuous Predictor
#'
#' @description
#' These functions take a data frame with a binomial response "y" with unique values 1 and 0, and a continuous predictor "x", fit a univariate model, to return the Area Under the ROC Curve (AUC) of observations versus predictions:
#' \itemize{
#'
#' \item `f_auc_glm_binomial()`: AUC of a binomial response against the predictions of a GLM model with formula `y ~ x`, family `stats::quasibinomial(link = "logit")`, and weighted cases (see [case_weights()]) to control for unbalanced data.
#'
#' \item `f_auc_glm_binomial_poly2()`: AUC of a binomial response against the predictions of a GLM model with formula `y ~ stats::poly(x, degree = 2, raw = TRUE)`, family `stats::quasibinomial(link = "logit")`, and weighted cases (see [case_weights()]) to control for unbalanced data.
#'
#' \item `f_auc_gam_binomial()`: AUC of a GAM model with formula `y ~ s(x)`, family `stats::quasibinomial(link = "logit")`, and weighted cases.
#'
#' \item `f_auc_rpart()`: AUC of a Recursive Partition Tree with weighted cases.
#'
#' \item `f_auc_rf()`: AUC of a Random Forest model with weighted cases.
#' }
#'
#' @param df (required, data frame) with columns:
#' \itemize{
#' \item "x": (numeric) continuous predictor.
#' \item "y" (integer) binomial response with unique values 0 and 1.
#' }
#' @family preference_order_functions
#' @examples
#' #load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #integer counts response and continuous predictor
#' #to data frame without NAs
#' df <- data.frame(
#' y = vi[["vi_binomial"]],
#' x = vi[["swi_max"]]
#' ) |>
#' na.omit()
#'
#' #AUC of GLM with binomial response and weighted cases
#' f_auc_glm_binomial(df = df)
#'
#' #AUC of GLM as above plus second degree polynomials
#' f_auc_glm_binomial_poly2(df = df)
#'
#' #AUC of binomial GAM with weighted cases
#' f_auc_gam_binomial(df = df)
#'
#' #AUC of recursive partition tree with weighted cases
#' f_auc_rpart(df = df)
#'
#' #AUC of random forest with weighted cases
#' f_auc_rf(df = df)
#' @name f_auc
NULL
#' @autoglobal
#' @rdname f_auc
#' @family preference_order_functions
#' @export
f_auc_glm_binomial <- function(df){
p <- stats::glm(
formula = y ~ x,
data = df,
family = stats::quasibinomial(
link = "logit"
),
weights = case_weights(
x = df[["y"]]
)
) |>
stats::predict(
type = "response"
) |>
suppressWarnings() |>
suppressMessages()
performance_score_auc(
o = df[["y"]],
p = p
)
}
#' @autoglobal
#' @rdname f_auc
#' @family preference_order_functions
#' @export
f_auc_glm_binomial_poly2 <- function(df){
p <- stats::glm(
formula = y ~ stats::poly(
x,
degree = 2,
raw = TRUE
),
data = df,
family = stats::quasibinomial(
link = "logit"
),
weights = case_weights(
x = df[["y"]]
)
) |>
stats::predict(
type = "response"
) |>
suppressWarnings() |>
suppressMessages()
performance_score_auc(
o = df[["y"]],
p = p
)
}
#' @autoglobal
#' @rdname f_auc
#' @family preference_order_functions
#' @export
f_auc_gam_binomial <- function(df){
p <- mgcv::gam(
formula = y ~ s(x),
data = df,
family = stats::quasibinomial(link = "logit"),
weights = case_weights(
x = df[["y"]]
),
select = TRUE
) |>
stats::predict(
type = "response"
) |>
suppressWarnings() |>
suppressMessages()
performance_score_auc(
o = df[["y"]],
p = p
)
}
#' @autoglobal
#' @rdname f_auc
#' @family preference_order_functions
#' @export
f_auc_rpart <- function(df){
p <- rpart::rpart(
formula = y ~ x,
data = df,
weights = case_weights(
x = df[["y"]])
,
control = rpart::rpart.control(
minbucket = floor(nrow(df) * 0.05)
)
) |>
stats::predict(
type = "vector"
)
performance_score_auc(
o = df[["y"]],
p = p
)
}
#' @autoglobal
#' @rdname f_auc
#' @family preference_order_functions
#' @export
f_auc_rf <- function(df){
m <- ranger::ranger(
formula = y ~ x,
data = df,
case.weights = case_weights(
x = df[["y"]]
),
num.threads = 1,
num.trees = 100,
min.node.size = floor(nrow(df) * 0.05),
seed = 1
)
p <- stats::predict(
object = m,
data = df
)$predictions
performance_score_auc(
o = df[["y"]],
p = p
)
}
# CATEGORICAL RESPONSE ----
#' Association Between a Categorical Response and a Categorical Predictor
#'
#' @description
#' Computes Cramer's V, a measure of association between categorical or factor variables. Please see [cor_cramer_v()] for further details.
#'
#' @param df (required, data frame) with columns:
#' \itemize{
#' \item "x": (character or factor) categorical predictor.
#' \item "y": (character or factor) categorical response.
#' }
#' @return numeric: Cramer's V
#' @examples
#' #load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #categorical response and predictor
#' #to data frame without NAs
#' df <- data.frame(
#' y = vi[["vi_factor"]],
#' x = vi[["soil_type"]]
#' ) |>
#' na.omit()
#'
#' #Cramer's V
#' f_v(df = df)
#' @autoglobal
#' @family preference_order_functions
#' @export
f_v <- function(df){
cor_cramer_v(
x = df[["x"]],
y = df[["y"]],
check_input = FALSE
)
}
#' Association Between a Categorical Response and a Categorical or Numeric Predictor
#'
#' @description
#' Computes the Cramer's V between a categorical response (of class "character" or "factor") and the prediction of a Random Forest model with a categorical or numeric predictor and weighted cases.
#'
#' @param df (required, data frame) with columns:
#' \itemize{
#' \item "x": (character, factor, or numeric) categorical or numeric predictor.
#' \item "y" (character or factor) categorical response.
#' }
#' @return numeric: Cramer's V
#' @examples
#' #load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #categorical response and predictor
#' #to data frame without NAs
#' df <- data.frame(
#' y = vi[["vi_factor"]],
#' x = vi[["soil_type"]]
#' ) |>
#' na.omit()
#'
#' #Cramer's V of a Random Forest model
#' f_v_rf_categorical(df = df)
#'
#' #categorical response and numeric predictor
#' df <- data.frame(
#' y = vi[["vi_factor"]],
#' x = vi[["swi_mean"]]
#' ) |>
#' na.omit()
#'
#' f_v_rf_categorical(df = df)
#' @autoglobal
#' @family preference_order_functions
#' @export
f_v_rf_categorical <- function(df){
df[["y"]] <- as.factor(df[["y"]])
m <- ranger::ranger(
formula = y ~ x,
data = df,
case.weights = case_weights(
x = df[["y"]]
),
num.threads = 1,
num.trees = 100,
min.node.size = floor(nrow(df) * 0.05),
seed = 1
)
p <- stats::predict(
object = m,
data = df
)$predictions
performance_score_v(
o = df[["y"]],
p = p
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.