Nothing
#' Bagging functions
#'
#' General suite of bagging functions for several models.
#'
#' @param x A data frame, matrix, or recipe (depending on the method being used).
#' @param y A numeric or factor vector of outcomes. Categorical outcomes (i.e
#' classes) should be represented as factors, not integers.
#' @param formula An object of class "formula" (or one that can be coerced to
#' that class): a symbolic description of the model to be fitted. Note that
#' this package does not support multivariate outcomes and that, if some
#' predictors are factors, dummy variables will _not_ be created unless by the
#' underlying model function.
#' @param data A data frame containing the variables used in the formula or
#' recipe.
#' @param weights A numeric vector of non-negative case weights. These values are
#' not used during bootstrap resampling.
#' @param base_model A single character value for the model being bagged. Possible
#' values are "CART", "MARS", "nnet", and "C5.0" (classification only).
#' @param times A single integer greater than 1 for the maximum number of bootstrap
#' samples/ensemble members (some model fits might fail).
#' @param control A list of options generated by `control_bag()`.
#' @param cost A non-negative scale (for two class problems) or a square cost matrix.
#' @param ... Optional arguments to pass to the base model function.
#' @details `bagger()` fits separate models to bootstrap samples. The
#' prediction function for each model object is encoded in an R expression and
#' the original model object is discarded. When making predictions, each
#' prediction formula is evaluated on the new data and aggregated using the
#' mean.
#'
#' Variable importance scores are calculated using implementations in each
#' package. When requested, the results are in a tibble with column names
#' `term` (the predictor), `value` (the importance score), and `used` (the
#' percentage of times that the variable was in the prediction equation).
#'
#' The models can be fit in parallel using the \pkg{future} package. The
#' enable parallelism, use the `future::plan()` function to declare _how_ the
#' computations should be distributed. Note that this will almost certainly
#' multiply the memory requirements required to fit the models.
#'
#' For neural networks, variable importance is calculated using the method
#' of Garson described in Gevrey _et al_ (2003)
#'
#' @references Gevrey, M., Dimopoulos, I., and Lek, S. (2003). Review and
#' comparison of methods to study the contribution of variables in artificial
#' neural network models. Ecological Modelling, 160(3), 249-264.
#' @examples
#' if (rlang::is_installed(c("recipes", "modeldata"))) {
#' library(recipes)
#' library(dplyr)
#'
#' data(biomass, package = "modeldata")
#'
#' biomass_tr <-
#' biomass %>%
#' dplyr::filter(dataset == "Training") %>%
#' dplyr::select(-dataset, -sample)
#'
#' biomass_te <-
#' biomass %>%
#' dplyr::filter(dataset == "Testing") %>%
#' dplyr::select(-dataset, -sample)
#'
#' # ------------------------------------------------------------------------------
#'
#' ctrl <- control_bag(var_imp = TRUE)
#'
#' # ------------------------------------------------------------------------------
#'
#' # `times` is low to make the examples run faster
#'
#'
#' set.seed(7687)
#' cart_bag <- bagger(x = biomass_tr[, -6], y = biomass_tr$HHV,
#' base_model = "CART", times = 5, control = ctrl)
#' cart_bag
#'
#' # ------------------------------------------------------------------------------
#' # Other interfaces
#'
#' # Recipes can be used
#' biomass_rec <-
#' recipe(HHV ~ ., data = biomass_tr) %>%
#' step_pca(all_predictors())
#'
#' set.seed(7687)
#' cart_pca_bag <- bagger(biomass_rec, data = biomass_tr, base_model = "CART",
#' times = 5, control = ctrl)
#'
#' cart_pca_bag
#' }
#' @export
#' @include validate.R
bagger <- function(x, ...) {
UseMethod("bagger")
}
#' @export
#' @rdname bagger
bagger.default <- function(x, ...) {
cli::cli_abort("{.fn bagger} is not defined for {obj_type_friendly(x)}.", call. = FALSE)
}
# XY method - data frame
#' @export
#' @rdname bagger
bagger.data.frame <-
function(x, y, weights = NULL,
base_model = "CART",
times = 11L,
control = control_bag(),
cost = NULL,
...) {
times <- integer_B(times)
seed <- sample.int(10^5, 1)
validate_args(base_model, times, control, cost)
processed <- hardhat::mold(x, y)
res <-
bagger_bridge(processed,
weights,
base_model,
seed,
times,
control,
cost,
...)
res$model_df <- select_rs(res$model_df)
res
}
# XY method - matrix
#' @export
#' @rdname bagger
bagger.matrix <-
function(x, y, weights = NULL,
base_model = "CART",
times = 11L,
control = control_bag(),
cost = NULL,
...) {
times <- integer_B(times)
seed <- sample.int(10^5, 1)
validate_args(base_model, times, control, cost)
processed <- hardhat::mold(x, y)
res <-
bagger_bridge(processed,
weights,
base_model,
seed,
times,
control,
cost,
...)
res$model_df <- select_rs(res$model_df)
res
}
# Formula method
#' @export
#' @rdname bagger
bagger.formula <-
function(formula, data, weights = NULL,
base_model = "CART",
times = 11L,
control = control_bag(),
cost = NULL,
...) {
times <- integer_B(times)
seed <- sample.int(10^5, 1)
validate_args(base_model, times, control, cost)
bp <- hardhat::default_formula_blueprint(indicators = "none")
processed <- hardhat::mold(formula, data, blueprint = bp)
res <-
bagger_bridge(processed,
weights,
base_model,
seed,
times,
control,
cost,
...)
res$model_df <- select_rs(res$model_df)
res
}
# Recipe method
#' @export
#' @rdname bagger
bagger.recipe <-
function(x, data,
base_model = "CART",
times = 11L,
control = control_bag(),
cost = NULL,
...) {
times <- integer_B(times)
seed <- sample.int(10^5, 1)
validate_args(base_model, times, control, cost)
processed <- hardhat::mold(x, data)
res <-
bagger_bridge(processed,
weights = NULL,
base_model,
seed,
times,
control,
cost,
...)
res$model_df <- select_rs(res$model_df)
res
}
# ------------------------------------------------------------------------------
#' Controlling the bagging process
#'
#' `control_bag()` can set options for ancillary aspects of the bagging process.
#'
#' @param var_imp A single logical: should variable importance scores be calculated?
#' @param allow_parallel A single logical: should the model fits be done in
#' parallel (even if a parallel `plan()` has been created)?
#' @param reduce Should models be modified to reduce their size on disk?
#' @param sampling Either "none" or "down". For classification only. The
#' training data, after bootstrapping, will be sampled down within each class
#' (with replacement) to the size of the smallest class.
#' @param extract A function (or NULL) that can extract model-related aspects
#' of each ensemble member. See Details and example below.
#' @return A list.
#' @details
#' Any arbitrary item can be saved from the model object (including the model
#' object itself) using the `extract` argument, which should be a function with
#' arguments `x` (for the model object), and `...`. The results of this
#' function are saved into a list column called `extras` (see the example below).
#' @examples
#' # Extracting model components
#'
#' num_term_nodes <- function(x, ...) {
#' tibble::tibble(num_nodes = sum(x$frame$var == "<leaf>"))
#' }
#'
#' set.seed(7687)
#' with_extras <- bagger(mpg ~ ., data = mtcars,
#' base_model = "CART", times = 5,
#' control = control_bag(extract = num_term_nodes))
#'
#' dplyr::bind_rows(with_extras$model_df$extras)
#' @export
control_bag <-
function(var_imp = TRUE,
allow_parallel = TRUE,
sampling = "none",
reduce = TRUE,
extract = NULL) {
res <-
list(
var_imp = var_imp,
allow_parallel = allow_parallel,
sampling = sampling,
reduce = reduce,
extract = extract
)
validate_control(res)
}
#' @export
print.bagger <- function(x, ...) {
if (is.null(x$cost)) {
mod_name <- x$base_model[1]
} else {
mod_name <- paste("Cost-Sensitive", x$base_model[1])
}
cat("Bagged ", mod_name, " (", x$base_model[2], " with ",
nrow(x$model_df), " members)\n", sep = "")
if (!is.null(x$imp)) {
cat("\nVariable importance scores include:\n\n")
print(x$imp)
}
cat("\n")
invisible(x)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.