R/preference_order_methods.R

Defines functions f_v_rf_categorical f_v f_auc_rf f_auc_rpart f_auc_gam_binomial f_auc_glm_binomial_poly2 f_auc_glm_binomial f_r2_gam_poisson f_r2_glm_poisson_poly2 f_r2_glm_poisson f_r2_rf f_r2_rpart f_r2_gam_gaussian f_r2_glm_gaussian_poly2 f_r2_glm_gaussian f_r2_spearman f_r2_pearson f_auto_rules f_functions f_auto

Documented in f_auc_gam_binomial f_auc_glm_binomial f_auc_glm_binomial_poly2 f_auc_rf f_auc_rpart f_auto f_auto_rules f_functions f_r2_gam_gaussian f_r2_gam_poisson f_r2_glm_gaussian f_r2_glm_gaussian_poly2 f_r2_glm_poisson f_r2_glm_poisson_poly2 f_r2_pearson f_r2_rf f_r2_rpart f_r2_spearman f_v f_v_rf_categorical

# AUTO SELECTION ----


#' Select Function to Compute Preference Order
#'
#' @description
#' Internal function to select a proper f_...() function to compute preference order depending on the types of the response variable and the predictors. The selection criteria is available as a data frame generated by [f_auto_rules()].
#'
#'
#' @inheritParams collinear
#' @return function name
#' @family preference_order_tools
#' @export
#' @autoglobal
#' @examples
#' f <- f_auto(
#'   df = vi[1:1000, ],
#'   response = "vi_numeric",
#'   predictors = vi_predictors_numeric
#'   )
f_auto <- function(
    df = NULL,
    response = NULL,
    predictors = NULL,
    quiet = FALSE
){

  if(is.null(df) || is.null(response) || is.null(predictors)) {
    return(NULL)
  }

  #data frame with heuristic
  rules <- f_auto_rules()

  response_type <- identify_response_type(
    df = df,
    response = response,
    quiet = quiet
  )

  if(response_type == "unknown"){
    stop(
      "collinear::f_auto(): response type is 'unknown', please select an f_...() function suitable for your response data.",
      call. = FALSE
    )
  }

  predictors_type <- identify_predictors_type(
    df = df,
    predictors = predictors
  )

  if(predictors_type == "unknown"){
    stop(
      "collinear::f_auto(): predictors type is 'unknown', please select an f_...() function suitable for your predictor data.",
      call. = FALSE
    )
  }

  #select function
  f_name <- rules[
    rules$response_type == response_type &
      rules$predictors_type == predictors_type,
    "name"
  ]

  if(quiet == FALSE){

    message(
      "\ncollinear::f_auto(): selected function: '",
      f_name,
      "()'."
    )

  }


  f_name

}

#' Data Frame of Preference Functions
#'
#' @return data frame
#' @export
#' @autoglobal
#' @family preference_order_tools
#' @examples
#' f_functions()
f_functions <- function(){

  f_list <- list(
    c("f_r2_pearson", "numeric", "numeric", "cor(x, y, method = 'pearson')^2", "r-squared"),
    c("f_r2_spearman", "numeric", "numeric", "cor(x, y, method = 'spearman')^2", "pseudo r-squared"),
    c("f_r2_glm_gaussian", "numeric", "numeric, categorical", "glm(y ~ x, family = gaussian(link = 'identity'))", "r-squared"),
    c("f_r2_glm_gaussian_poly2", "numeric", "numeric, categorical", "glm(y ~ poly(x, degree = 2, raw = TRUE), family = gaussian(link = 'identity'))", "r-squared"),
    c("f_r2_gam_gaussian", "numeric", "numeric, categorical", "mgcv::gam(y ~ s(x), family = gaussian(link = 'identity'))", "r-squared"),
    c("f_r2_rpart", "numeric", "numeric, categorical", "rpart::rpart(y ~ x)", "r-squared"),
    c("f_r2_rf", "numeric", "numeric, categorical", "ranger::ranger(y ~ x)", "r-squared"),
    c("f_r2_glm_poisson", "integer counts", "numeric, categorical", "glm(y ~ x, family = poisson(link = 'log'))", "r-squared"),
    c("f_r2_glm_poisson_poly2", "integer counts", "numeric, categorical", "glm(y ~ poly(x, degree = 2, raw = TRUE), family = poisson(link = 'log'))", "r-squared"),
    c("f_r2_gam_poisson", "integer counts", "numeric, categorical", "mgcv::gam(y ~ s(x), family = poisson(link = 'log'))", "r-squared"),
    c("f_auc_glm_binomial", "binomial", "numeric, categorical", "glm(y ~ x, family = quasibinomial(link = 'logit'), weights = case_weights(y))", "AUC"),
    c("f_auc_glm_binomial_poly2", "binomial", "numeric, categorical", "glm(y ~ poly(x, degree = 2, raw = TRUE), family = quasibinomial(link = 'logit'), weights = collinear::case_weights(y))", "AUC"),
    c("f_auc_glm_binomial_poly2", "binomial", "numeric, categorical", "glm(y ~ poly(x, degree = 2, raw = TRUE), family = quasibinomial(link = 'logit'), weights = collinear::case_weights(y))", "AUC"),
    c("f_auc_gam_binomial", "binomial", "numeric, categorical", "mgcv::gam(y ~ s(x), family = quasibinomial(link = 'logit'), weights = collinear::case_weights(y))", "AUC"),
    c("f_auc_rpart", "binomial", "numeric, categorical", "rpart::rpart(y ~ x, weights = collinear::case_weights(y))", "AUC"),
    c("f_auc_rf", "binomial", "numeric, categorical", "ranger::ranger(y ~ x, case.weights = collinear::case_weights(y))", "AUC"),
    c("f_v", "categorical", "categorical", "collinear::cramer_v(x, y)", "Cramer's V"),
    c("f_v_rf_categorical", "categorical", "numeric, categorical", "ranger::ranger(y ~ x, case.weights = collinear::case_weights(y))", "Cramer's V")
  )

  f_df <- f_list |>
    as.data.frame() |>
    t() |>
    as.data.frame()

  rownames(f_df) <- NULL

  colnames(f_df) <- c(
    "name",
    "response_type",
    "predictors_types",
    "expression",
    "preference_metric"
  )

  f_df

}


#' Rules to Select Default f Argument to Compute Preference Order
#'
#' @description
#' Data frame with rules used by [f_auto()] to select the function `f` to compute preference order in [preference_order()].
#'
#'
#' @return data frame
#' @family preference_order_tools
#' @export
#' @autoglobal
#' @examples
#' f_auto_rules()
f_auto_rules <- function(){

  data.frame(
    name = c(
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_pearson",
      "f_r2_rf",
      "f_r2_rf",
      "f_auc_rf",
      "f_auc_rf",
      "f_auc_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_r2_rf",
      "f_v_rf_categorical",
      "f_v",
      "f_v_rf_categorical"
    ),
    response_type = c(
      "continuous-binary",
      "continuous-binary",
      "continuous-binary",
      "continuous-low",
      "continuous-low",
      "continuous-low",
      "continuous-high",
      "continuous-high",
      "continuous-high",
      "integer-binomial",
      "integer-binomial",
      "integer-binomial",
      "integer-binary",
      "integer-binary",
      "integer-binary",
      "integer-low",
      "integer-low",
      "integer-low",
      "integer-high",
      "integer-high",
      "integer-high",
      "categorical",
      "categorical",
      "categorical"
    ),
    predictors_type = c(
      "numeric",
      "categorical",
      "mixed",
      "numeric",
      "categorical",
      "mixed",
      "numeric",
      "categorical",
      "mixed",
      "numeric",
      "categorical",
      "mixed",
      "numeric",
      "categorical",
      "mixed",
      "numeric",
      "categorical",
      "mixed",
      "numeric",
      "categorical",
      "mixed",
      "numeric",
      "categorical",
      "mixed"
    )
  )

}



# NUMERIC RESPONSE ----

#' Association Between a Continuous Response and a Continuous Predictor
#'
#' @description
#' These functions take a data frame with two numeric continuous columns "x" (predictor) and "y" (response), fit a univariate model, and return the R-squared of the observations versus the model predictions:
#' \itemize{
#'
#'   \item `f_r2_pearson()`: Pearson's R-squared.
#'
#'   \item `f_r2_spearman()`: Spearman's R-squared.
#'
#'   \item `f_r2_glm_gaussian()`: Pearson's R-squared of a GLM model fitted with [stats::glm()], with formula `y ~ s(x)` and family `stats::gaussian(link = "identity")`.
#'
#'   \item `f_r2_glm_gaussian_poly2()`: Pearson's R-squared of a GLM model fitted with [stats::glm()], with formula `y ~ stats::poly(x, degree = 2, raw = TRUE)` and family `stats::gaussian(link = "identity")`.
#'
#'   \item `f_r2_gam_gaussian()`: Pearson's R-squared of a GAM model fitted with [mgcv::gam()], with formula `y ~ s(x)` and family `stats::gaussian(link = "identity")`.

#'   \item `f_r2_rpart()`: Pearson's R-squared of a Recursive Partition Tree fitted with [rpart::rpart()] with formula `y ~ x`.
#'
#'   \item `f_r2_rf()`: Pearson's R-squared of a 100 trees Random Forest model fitted with [ranger::ranger()] and formula `y ~ x`.
#'
#' }
#'
#' @param df (required, data frame) with columns:
#' \itemize{
#'   \item "x": (numeric) continuous predictor.
#'   \item "y" (numeric) continuous response.
#' }
#'
#' @return numeric: R-squared
#' @examples
#'
#load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #numeric response and predictor
#' #to data frame without NAs
#' df <- data.frame(
#'   y = vi[["vi_numeric"]],
#'   x = vi[["swi_max"]]
#' ) |>
#'   na.omit()
#'
#' # Continuous response
#'
#' #Pearson R-squared
#' f_r2_pearson(df = df)
#'
#' #Spearman R-squared
#' f_r2_spearman(df = df)
#'
#' #R-squared of a gaussian gam
#' f_r2_glm_gaussian(df = df)
#'
#' #gaussian glm with second-degree polynomials
#' f_r2_glm_gaussian_poly2(df = df)
#'
#' #R-squared of a gaussian gam
#' f_r2_gam_gaussian(df = df)
#'
#' #recursive partition tree
#' f_r2_rpart(df = df)
#'
#' #random forest model
#' f_r2_rf(df = df)
#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @examples
#'
#' #load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #continuous response and predictor
#' #to data frame without NAs
#' df <- data.frame(
#'   y = vi[["vi_numeric"]],
#'   x = vi[["swi_max"]]
#' ) |>
#'   na.omit()
#'
#' # Continuous response
#'
#' #Pearson R-squared
#' f_r2_pearson(df = df)
#'
#' #Spearman R-squared
#' f_r2_spearman(df = df)
#'
#' #R-squared of a gaussian gam
#' f_r2_glm_gaussian(df = df)
#'
#' #gaussian glm with second-degree polynomials
#' f_r2_glm_gaussian_poly2(df = df)
#'
#' #R-squared of a gaussian gam
#' f_r2_gam_gaussian(df = df)
#'
#' #recursive partition tree
#' f_r2_rpart(df = df)
#'
#' #random forest model
#' f_r2_rf(df = df)
#'
#' @name f_r2
NULL

#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_pearson <- function(df){

  stats::cor(
    x = df[["x"]],
    y = df[["y"]],
    method = "pearson"
  )^2

}


#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_spearman <- function(df){

  stats::cor(
    x = df[["x"]],
    y = df[["y"]],
    method = "spearman"
  )^2

}

#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_glm_gaussian <- function(df){

  p <- stats::glm(
    formula = y ~ x,
    data = df,
    family = stats::gaussian(
      link = "identity"
    )
  ) |>
    stats::predict(
      type = "response"
    ) |>
    suppressWarnings() |>
    suppressMessages()

  performance_score_r2(
    o = df[["y"]],
    p = p
  )

}


#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_glm_gaussian_poly2 <- function(df){

  p <- stats::glm(
    formula = y ~ stats::poly(
      x,
      degree = 2,
      raw = TRUE
    ),
    data = df,
    family = stats::gaussian(
      link = "identity"
    )
  ) |>
    stats::predict(
      type = "response"
    ) |>
    suppressWarnings() |>
    suppressMessages()

  performance_score_r2(
    o = df[["y"]],
    p = p
  )

}


#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_gam_gaussian <- function(df){

  p <- mgcv::gam(
    formula = y ~ s(x),
    data = df,
    family = stats::gaussian(link = "identity"),
    select = TRUE
  ) |>
    stats::predict(
      type = "response"
    )

  performance_score_r2(
    o = df[["y"]],
    p = p
  )

}



#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_rpart <- function(df){

  p <- rpart::rpart(
    formula = y ~ x,
    data = df,
    control = rpart::rpart.control(
      minbucket = floor(nrow(df) * 0.05)
    )
  ) |>
    stats::predict(
      type = "vector"
    )

  performance_score_r2(
    o = df[["y"]],
    p = p
  )

}

#' @autoglobal
#' @rdname f_r2
#' @family preference_order_functions
#' @export
f_r2_rf <- function(df){

  m <- ranger::ranger(
    formula = y ~ x,
    data = df,
    num.threads = 1,
    num.trees = 100,
    min.node.size = floor(nrow(df) * 0.05),
    seed = 1
  )

  p <- stats::predict(
    object = m,
    data = df
    )$predictions

  performance_score_r2(
    o = df[["y"]],
    p = p
  )

}

# COUNTS RESPONSE ----

#' Association Between a Count Response and a Continuous Predictor
#'
#' @description
#' These functions take a data frame with a integer counts response "y", and a continuous predictor "x", fit a univariate model, and return the R-squared of observations versus predictions:
#' \itemize{
#'
#'   \item `f_r2_glm_poisson()` Pearson's R-squared between a count response and the predictions of a GLM model with formula `y ~ x` and family `stats::poisson(link = "log")`.
#'
#'   \item `f_r2_glm_poisson_poly2()` Pearson's R-squared between a count response and the predictions of a GLM model with formula `y ~ stats::poly(x, degree = 2, raw = TRUE)` and family `stats::poisson(link = "log")`.
#'
#'   \item `f_r2_gam_poisson()` Pearson's R-squared between a count response and the predictions of a [mgcv::gam()] model with formula `y ~ s(x)` and family `stats::poisson(link = "log")`.
#'
#'   \item `f_r2_rpart()`: Pearson's R-squared of a Recursive Partition Tree fitted with [rpart::rpart()] with formula `y ~ x`.
#'
#'   \item `f_r2_rf()`: Pearson's R-squared of a 100 trees Random Forest model fitted with [ranger::ranger()] and formula `y ~ x`.
#' }
#'
#' @param df (required, data frame) with columns:
#' \itemize{
#'   \item "x": (numeric) continuous predictor.
#'   \item "y" (integer) counts response.
#' }
#' @rdname f_r2_counts
#' @family preference_order_functions
#' @examples
#'
#' #load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #integer counts response and continuous predictor
#' #to data frame without NAs
#' df <- data.frame(
#'   y = vi[["vi_counts"]],
#'   x = vi[["swi_max"]]
#' ) |>
#'   na.omit()
#'
#' #GLM model with Poisson family
#' f_r2_glm_poisson(df = df)
#'
#' #GLM model with second degree polynomials and Poisson family
#' f_r2_glm_poisson_poly2(df = df)
#'
#' #GAM model with Poisson family
#' f_r2_gam_poisson(df = df)
#' @name f_r2_counts
NULL


#' @autoglobal
#' @rdname f_r2_counts
#' @family preference_order_functions
#' @export
f_r2_glm_poisson <- function(df){

  p <- stats::glm(
    formula = y ~ x,
    data = df,
    family = stats::poisson(
      link = "log"
    )
  ) |>
    stats::predict(
      type = "response"
    ) |>
    suppressWarnings() |>
    suppressMessages()

  performance_score_r2(
    o = df[["y"]],
    p = p
  )

}

#' @autoglobal
#' @rdname f_r2_counts
#' @family preference_order_functions
#' @export
f_r2_glm_poisson_poly2 <- function(df){

  p <- stats::glm(
    formula = y ~ stats::poly(
      x,
      degree = 2,
      raw = TRUE
    ),
    data = df,
    family = stats::poisson(
      link = "log"
    )
  ) |>
    stats::predict(
      type = "response"
    ) |>
    suppressWarnings() |>
    suppressMessages()

  performance_score_r2(
    o = df[["y"]],
    p = p
  )

}


#' @autoglobal
#' @rdname f_r2_counts
#' @family preference_order_functions
#' @export
f_r2_gam_poisson <- function(df){

  p <- mgcv::gam(
    formula = y ~ s(x),
    data = df,
    family = stats::poisson(link = "log"),
    select = TRUE
  ) |>
    stats::predict(
      type = "response"
    )

  performance_score_r2(
    o = df[["y"]],
    p = p
  )

}

# BINOMIAL RESPONSE ----

#' Association Between a Binomial Response and a Continuous Predictor
#'
#' @description
#' These functions take a data frame with a binomial response "y" with unique values 1 and 0, and a continuous predictor "x", fit a univariate model, to return the Area Under the ROC Curve (AUC) of observations versus predictions:
#' \itemize{
#'
#'   \item `f_auc_glm_binomial()`: AUC of a binomial response against the predictions of a GLM model with formula `y ~ x`, family `stats::quasibinomial(link = "logit")`, and weighted cases (see [case_weights()]) to control for unbalanced data.
#'
#'   \item `f_auc_glm_binomial_poly2()`: AUC of a binomial response against the predictions of a GLM model with formula `y ~ stats::poly(x, degree = 2, raw = TRUE)`, family `stats::quasibinomial(link = "logit")`, and weighted cases (see [case_weights()]) to control for unbalanced data.
#'
#'   \item `f_auc_gam_binomial()`: AUC  of a GAM model with formula  `y ~ s(x)`, family `stats::quasibinomial(link = "logit")`, and weighted cases.
#'
#'   \item `f_auc_rpart()`: AUC of a Recursive Partition Tree with weighted cases.
#'
#'   \item `f_auc_rf()`: AUC of a Random Forest model with weighted cases.
#' }
#'
#' @param df (required, data frame) with columns:
#' \itemize{
#'   \item "x": (numeric) continuous predictor.
#'   \item "y" (integer) binomial response with unique values 0 and 1.
#' }
#' @family preference_order_functions
#' @examples
#' #load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #integer counts response and continuous predictor
#' #to data frame without NAs
#' df <- data.frame(
#'   y = vi[["vi_binomial"]],
#'   x = vi[["swi_max"]]
#' ) |>
#'   na.omit()
#'
#' #AUC of GLM with binomial response and weighted cases
#' f_auc_glm_binomial(df = df)
#'
#' #AUC of GLM as above plus second degree polynomials
#' f_auc_glm_binomial_poly2(df = df)
#'
#' #AUC of binomial GAM with weighted cases
#' f_auc_gam_binomial(df = df)
#'
#' #AUC of recursive partition tree with weighted cases
#' f_auc_rpart(df = df)
#'
#' #AUC of random forest with weighted cases
#' f_auc_rf(df = df)
#' @name f_auc
NULL

#' @autoglobal
#' @rdname f_auc
#' @family preference_order_functions
#' @export
f_auc_glm_binomial <- function(df){

  p <- stats::glm(
    formula = y ~ x,
    data = df,
    family = stats::quasibinomial(
      link = "logit"
      ),
    weights = case_weights(
      x = df[["y"]]
      )
  ) |>
    stats::predict(
      type = "response"
    ) |>
    suppressWarnings() |>
    suppressMessages()

  performance_score_auc(
    o = df[["y"]],
    p = p
  )

}


#' @autoglobal
#' @rdname f_auc
#' @family preference_order_functions
#' @export
f_auc_glm_binomial_poly2 <- function(df){

  p <- stats::glm(
    formula = y ~ stats::poly(
      x,
      degree = 2,
      raw = TRUE
    ),
    data = df,
    family = stats::quasibinomial(
      link = "logit"
    ),
    weights = case_weights(
      x = df[["y"]]
      )
  ) |>
    stats::predict(
      type = "response"
    ) |>
    suppressWarnings() |>
    suppressMessages()

  performance_score_auc(
    o = df[["y"]],
    p = p
  )

}

#' @autoglobal
#' @rdname f_auc
#' @family preference_order_functions
#' @export
f_auc_gam_binomial <- function(df){

  p <- mgcv::gam(
    formula = y ~ s(x),
    data = df,
    family = stats::quasibinomial(link = "logit"),
    weights = case_weights(
      x = df[["y"]]
      ),
    select = TRUE
  ) |>
    stats::predict(
      type = "response"
    ) |>
    suppressWarnings() |>
    suppressMessages()

  performance_score_auc(
    o = df[["y"]],
    p = p
  )

}

#' @autoglobal
#' @rdname f_auc
#' @family preference_order_functions
#' @export
f_auc_rpart <- function(df){

  p <- rpart::rpart(
    formula = y ~ x,
    data = df,
    weights = case_weights(
      x = df[["y"]])
    ,
    control = rpart::rpart.control(
      minbucket = floor(nrow(df) * 0.05)
    )
  ) |>
    stats::predict(
      type = "vector"
    )

  performance_score_auc(
    o = df[["y"]],
    p = p
  )

}

#' @autoglobal
#' @rdname f_auc
#' @family preference_order_functions
#' @export
f_auc_rf <- function(df){

  m <- ranger::ranger(
    formula = y ~ x,
    data = df,
    case.weights = case_weights(
      x = df[["y"]]
      ),
    num.threads = 1,
    num.trees = 100,
    min.node.size = floor(nrow(df) * 0.05),
    seed = 1
  )

  p <- stats::predict(
    object = m,
    data = df
  )$predictions

  performance_score_auc(
    o = df[["y"]],
    p = p
  )

}

# CATEGORICAL RESPONSE ----

#' Association Between a Categorical Response and a Categorical Predictor
#'
#' @description
#' Computes Cramer's V, a measure of association between categorical or factor variables. Please see [cor_cramer_v()] for further details.
#'
#' @param df (required, data frame) with columns:
#' \itemize{
#'   \item "x": (character or factor) categorical predictor.
#'   \item "y": (character or factor) categorical response.
#' }
#' @return numeric: Cramer's V
#' @examples
#' #load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #categorical response and predictor
#' #to data frame without NAs
#' df <- data.frame(
#'   y = vi[["vi_factor"]],
#'   x = vi[["soil_type"]]
#' ) |>
#'   na.omit()
#'
#' #Cramer's V
#' f_v(df = df)
#' @autoglobal
#' @family preference_order_functions
#' @export
f_v <- function(df){

  cor_cramer_v(
    x = df[["x"]],
    y = df[["y"]],
    check_input = FALSE
  )

}

#' Association Between a Categorical Response and a Categorical or Numeric Predictor
#'
#' @description
#' Computes the Cramer's V between a categorical response (of class "character" or "factor") and the prediction of a Random Forest model with a categorical or numeric predictor and weighted cases.
#'
#' @param df (required, data frame) with columns:
#' \itemize{
#'   \item "x": (character, factor, or numeric) categorical or numeric predictor.
#'   \item "y" (character or factor) categorical response.
#' }
#' @return numeric: Cramer's V
#' @examples

#' #load example data
#' data(vi)
#'
#' #reduce size to speed-up example
#' vi <- vi[1:1000, ]
#'
#' #categorical response and predictor
#' #to data frame without NAs
#' df <- data.frame(
#'   y = vi[["vi_factor"]],
#'   x = vi[["soil_type"]]
#' ) |>
#'   na.omit()
#'
#' #Cramer's V of a Random Forest model
#' f_v_rf_categorical(df = df)
#'
#' #categorical response and numeric predictor
#' df <- data.frame(
#'   y = vi[["vi_factor"]],
#'   x = vi[["swi_mean"]]
#' ) |>
#'   na.omit()
#'
#' f_v_rf_categorical(df = df)
#' @autoglobal
#' @family preference_order_functions
#' @export
f_v_rf_categorical <- function(df){

  df[["y"]] <- as.factor(df[["y"]])

  m <- ranger::ranger(
    formula = y ~ x,
    data = df,
    case.weights = case_weights(
      x = df[["y"]]
    ),
    num.threads = 1,
    num.trees = 100,
    min.node.size = floor(nrow(df) * 0.05),
    seed = 1
  )

  p <- stats::predict(
    object = m,
    data = df
  )$predictions

  performance_score_v(
    o = df[["y"]],
    p = p
  )

}

Try the collinear package in your browser

Any scripts or data that you put into this service are public.

collinear documentation built on April 12, 2025, 1:36 a.m.