R/bagger.R

Defines functions print.bagger control_bag bagger.recipe bagger.formula bagger.matrix bagger.data.frame bagger.default bagger

Documented in bagger bagger.data.frame bagger.default bagger.formula bagger.matrix bagger.recipe control_bag

#' Bagging functions
#'
#' General suite of bagging functions for several models.
#'
#' @param x A data frame, matrix, or recipe (depending on the method being used).
#' @param y A numeric or factor vector of outcomes. Categorical outcomes (i.e
#'  classes) should be represented as factors, not integers.
#' @param formula  An object of class "formula" (or one that can be coerced to
#'  that class): a symbolic description of the model to be fitted. Note that
#'  this package does not support multivariate outcomes and that, if some
#'  predictors are factors, dummy variables will _not_ be created unless by the
#'  underlying model function.
#' @param data A data frame containing the variables used in the formula or
#'  recipe.
#' @param weights A numeric vector of non-negative case weights. These values are
#' not used during bootstrap resampling.
#' @param base_model A single character value for the model being bagged. Possible
#'  values are "CART", "MARS", "nnet", and "C5.0" (classification only).
#' @param times A single integer greater than 1 for the maximum number of bootstrap
#'  samples/ensemble members (some model fits might fail).
#' @param control A list of options generated by `control_bag()`.
#' @param cost A non-negative scale (for two class problems) or a square cost matrix.
#' @param ... Optional arguments to pass to the base model function.
#' @details `bagger()` fits separate models to bootstrap samples. The
#'  prediction function for each model object is encoded in an R expression and
#'  the original model object is discarded. When making predictions, each
#'  prediction formula is evaluated on the new data and aggregated using the
#'  mean.
#'
#' Variable importance scores are calculated using implementations in each
#'  package. When requested, the results are in a tibble with column names
#'  `term` (the predictor), `value` (the importance score), and `used` (the
#'  percentage of times that the variable was in the prediction equation).
#'
#' The models can be fit in parallel using the \pkg{future} package. The
#'  enable parallelism, use the `future::plan()` function to declare _how_ the
#'  computations should be distributed. Note that this will almost certainly
#'  multiply the memory requirements required to fit the models.
#'
#' For neural networks, variable importance is calculated using the method
#' of Garson described in Gevrey _et al_ (2003)
#'
#' @references Gevrey, M., Dimopoulos, I., and Lek, S. (2003). Review and
#' comparison of methods to study the contribution of variables in artificial
#' neural network models. Ecological Modelling, 160(3), 249-264.
#' @examples
#' if (rlang::is_installed(c("recipes", "modeldata"))) {
#'   library(recipes)
#'   library(dplyr)
#'
#'   data(biomass, package = "modeldata")
#'
#'   biomass_tr <-
#'     biomass %>%
#'     dplyr::filter(dataset == "Training") %>%
#'     dplyr::select(-dataset, -sample)
#'
#'   biomass_te <-
#'     biomass %>%
#'     dplyr::filter(dataset == "Testing") %>%
#'     dplyr::select(-dataset, -sample)
#'
#'   # ------------------------------------------------------------------------------
#'
#'   ctrl <- control_bag(var_imp = TRUE)
#'
#'   # ------------------------------------------------------------------------------
#'
#'   # `times` is low to make the examples run faster
#'
#'
#'   set.seed(7687)
#'   cart_bag <- bagger(x = biomass_tr[, -6], y = biomass_tr$HHV,
#'                      base_model = "CART", times = 5, control = ctrl)
#'   cart_bag
#'
#'   # ------------------------------------------------------------------------------
#'   # Other interfaces
#'
#'   # Recipes can be used
#'   biomass_rec <-
#'     recipe(HHV ~ ., data = biomass_tr) %>%
#'     step_pca(all_predictors())
#'
#'   set.seed(7687)
#'   cart_pca_bag <- bagger(biomass_rec, data = biomass_tr, base_model = "CART",
#'                          times = 5, control = ctrl)
#'
#'   cart_pca_bag
#' }
#' @export
#' @include validate.R
bagger <- function(x, ...) {
  UseMethod("bagger")
}

#' @export
#' @rdname bagger
bagger.default <- function(x, ...) {
  cli::cli_abort("{.fn bagger} is not defined for {obj_type_friendly(x)}.", call. = FALSE)
}

# XY method - data frame

#' @export
#' @rdname bagger
bagger.data.frame <-
  function(x, y, weights = NULL,
           base_model = "CART",
           times = 11L,
           control = control_bag(),
           cost = NULL,
           ...) {
    times <- integer_B(times)
    seed <- sample.int(10^5, 1)
    validate_args(base_model, times, control, cost)

    processed <- hardhat::mold(x, y)
    res <-
      bagger_bridge(processed,
                    weights,
                    base_model,
                    seed,
                    times,
                    control,
                    cost,
                    ...)

    res$model_df <- select_rs(res$model_df)
    res
  }

# XY method - matrix

#' @export
#' @rdname bagger
bagger.matrix <-
  function(x, y, weights = NULL,
           base_model = "CART",
           times = 11L,
           control = control_bag(),
           cost = NULL,
           ...) {
    times <- integer_B(times)
    seed <- sample.int(10^5, 1)

    validate_args(base_model, times, control, cost)

    processed <- hardhat::mold(x, y)
    res <-
      bagger_bridge(processed,
                    weights,
                    base_model,
                    seed,
                    times,
                    control,
                    cost,
                    ...)

    res$model_df <- select_rs(res$model_df)
    res
  }

# Formula method

#' @export
#' @rdname bagger
bagger.formula <-
  function(formula, data, weights = NULL,
           base_model = "CART",
           times = 11L,
           control = control_bag(),
           cost = NULL,
           ...) {
    times <- integer_B(times)
    seed <- sample.int(10^5, 1)

    validate_args(base_model, times, control, cost)

    bp <- hardhat::default_formula_blueprint(indicators = "none")
    processed <- hardhat::mold(formula, data, blueprint = bp)
    res <-
      bagger_bridge(processed,
                    weights,
                    base_model,
                    seed,
                    times,
                    control,
                    cost,
                    ...)

    res$model_df <- select_rs(res$model_df)
    res
  }

# Recipe method

#' @export
#' @rdname bagger
bagger.recipe <-
  function(x, data,
           base_model = "CART",
           times = 11L,
           control = control_bag(),
           cost = NULL,
           ...) {
    times <- integer_B(times)
    seed <- sample.int(10^5, 1)

    validate_args(base_model, times, control, cost)

    processed <- hardhat::mold(x, data)
    res <-
      bagger_bridge(processed,
                    weights = NULL,
                    base_model,
                    seed,
                    times,
                    control,
                    cost,
                    ...)

    res$model_df <- select_rs(res$model_df)
    res
  }

# ------------------------------------------------------------------------------


#' Controlling the bagging process
#'
#' `control_bag()` can set options for ancillary aspects of the bagging process.
#'
#' @param var_imp A single logical: should variable importance scores be calculated?
#' @param allow_parallel A single logical: should the model fits be done in
#'  parallel (even if a parallel `plan()` has been created)?
#' @param reduce Should models be modified to reduce their size on disk?
#' @param sampling Either "none" or "down". For classification only. The
#' training data, after bootstrapping, will be sampled down within each class
#' (with replacement) to the size of the smallest class.
#' @param extract A function (or NULL) that can extract model-related aspects
#'  of each ensemble member. See Details and example below.
#' @return A list.
#' @details
#' Any arbitrary item can be saved from the model object (including the model
#'  object itself) using the `extract` argument, which should be a function with
#'  arguments `x` (for the model object), and `...`. The results of this
#'  function are saved into a list column called `extras` (see the example below).
#' @examples
#' # Extracting model components
#'
#' num_term_nodes <- function(x, ...) {
#'   tibble::tibble(num_nodes = sum(x$frame$var == "<leaf>"))
#' }
#'
#' set.seed(7687)
#' with_extras <- bagger(mpg ~ ., data = mtcars,
#'                       base_model = "CART", times = 5,
#'                       control = control_bag(extract = num_term_nodes))
#'
#' dplyr::bind_rows(with_extras$model_df$extras)
#' @export
control_bag <-
  function(var_imp = TRUE,
           allow_parallel = TRUE,
           sampling = "none",
           reduce = TRUE,
           extract = NULL) {
    res <-
      list(
        var_imp = var_imp,
        allow_parallel = allow_parallel,
        sampling = sampling,
        reduce = reduce,
        extract = extract
      )

    validate_control(res)
  }

#' @export
print.bagger <- function(x, ...) {

  if (is.null(x$cost)) {
    mod_name <- x$base_model[1]
  } else {
    mod_name <- paste("Cost-Sensitive", x$base_model[1])
  }

  cat("Bagged ", mod_name, " (", x$base_model[2], " with ",
      nrow(x$model_df), " members)\n", sep = "")

  if (!is.null(x$imp)) {
    cat("\nVariable importance scores include:\n\n")
    print(x$imp)
  }
  cat("\n")
  invisible(x)
}

Try the baguette package in your browser

Any scripts or data that you put into this service are public.

baguette documentation built on April 4, 2025, 12:22 a.m.