stacks: Tidy Model Stacking

Documented in blend_predictions

#' Determine stacking coefficients from a data stack
#'
#' @description 
#' Evaluates a data stack by fitting a regularized model on the 
#' assessment predictions from each candidate member to predict 
#' the true outcome.
#' 
#' This process determines the "stacking coefficients" of the model 
#' stack. The stacking coefficients are used to weight the
#' predictions from each candidate (represented by a unique column
#' in the data stack), and are given by the betas of a LASSO model
#' fitting the true outcome with the predictions given in the
#' remaining columns of the data stack.
#' 
#' Candidates with non-zero stacking coefficients are model stack 
#' members, and need to be trained on the full training set (rather
#' than just the assessment set) with [fit_members()]. This function
#' is typically used after a number of calls to [add_candidates()].
#' 
#' @details 
#' Note that a regularized linear model is one of many possible
#' learning algorithms that could be used to fit a stacked ensemble
#' model. For implementations of additional ensemble learning algorithms, see
#' [h2o::h2o.stackedEnsemble()] and `SuperLearner::SuperLearner()`.
#' 
#' @param data_stack A `data_stack` object
#' @param penalty A numeric vector of proposed values for total amount of
#'   regularization used in member weighting. Higher penalties will generally 
#'   result in fewer members being included in the resulting model stack, and 
#'   vice versa. The package will tune over a grid formed from the cross 
#'   product of the `penalty` and `mixture` arguments.
#' @param mixture A number between zero and one (inclusive) giving the 
#'   proportion of L1 regularization (i.e. lasso) in the model. `mixture = 1`
#'   indicates a pure lasso model, `mixture = 0` indicates ridge regression, and
#'   values in `(0, 1)` indicate an elastic net. The package will tune over 
#'   a grid formed from the cross product of the `penalty` and `mixture` 
#'   arguments.
#' @param non_negative A logical giving whether to restrict stacking 
#'   coefficients to non-negative values. If `TRUE` (default), 0 is passed as 
#'   the `lower.limits` argument to [glmnet::glmnet()] in fitting the
#'   model on the data stack. Otherwise, `-Inf`.
#' @param metric A call to [yardstick::metric_set()]. The metric(s) to use in 
#'   tuning the lasso penalty on the stacking coefficients. Default values are
#'   determined by [tune::tune_grid()] from the outcome class.
#' @param control An object inheriting from `control_grid` to be passed to
#'   the model determining stacking coefficients. See [tune::control_grid()]
#'   documentation for details on possible values. Note that any `extract`
#'   entry will be overwritten internally.
#' @param times Number of bootstrap samples tuned over by the model that
#'   determines stacking coefficients. See [rsample::bootstraps()] to
#'   learn more.
#' @inheritParams add_candidates
#' 
#' @return A `model_stack` object—while `model_stack`s largely contain the
#' same elements as `data_stack`s, the primary data objects shift from the
#' assessment set predictions to the member models.
#' 
#' @template note_example_data
#' 
#' @examplesIf (stacks:::should_run_examples(suggests = "kernlab"))
#' # see the "Example Data" section above for
#' # clarification on the objects used in these examples!
#' 
#' # put together a data stack
#' reg_st <- 
#'   stacks() %>%
#'   add_candidates(reg_res_lr) %>%
#'   add_candidates(reg_res_svm) %>%
#'   add_candidates(reg_res_sp)
#'   
#' reg_st
#'
#' # evaluate the data stack
#' reg_st %>%
#'   blend_predictions()
#' 
#' # include fewer models by proposing higher penalties
#' reg_st %>% 
#'   blend_predictions(penalty = c(.5, 1))
#' 
#' # allow for negative stacking coefficients 
#' # with the non_negative argument
#' reg_st %>% 
#'   blend_predictions(non_negative = FALSE)
#'   
#' # use a custom metric in tuning the lasso penalty
#' library(yardstick)
#' reg_st %>% 
#'   blend_predictions(metric = metric_set(rmse))
#'   
#' # pass control options for stack blending
#' reg_st %>% 
#'   blend_predictions(
#'     control = tune::control_grid(allow_par = TRUE)
#'   )
#'  
#' # to speed up the stacking process for preliminary
#' # results, bump down the `times` argument:
#' reg_st %>% 
#'   blend_predictions(times = 5)
#'   
#' # the process looks the same with 
#' # multinomial classification models
#' class_st <-
#'   stacks() %>%
#'   add_candidates(class_res_nn) %>%
#'   add_candidates(class_res_rf) %>%
#'   blend_predictions()
#'   
#' class_st
#' 
#' # ...or binomial classification models
#' log_st <-
#'   stacks() %>%
#'   add_candidates(log_res_nn) %>%
#'   add_candidates(log_res_rf) %>%
#'   blend_predictions()
#'   
#' log_st
#' 
#' @family core verbs
#' @export
blend_predictions <- function(data_stack, 
                              penalty = 10 ^ (-6:-1),
                              mixture = 1,
                              non_negative = TRUE, 
                              metric = NULL,
                              control = tune::control_grid(), 
                              times = 25,
                              ...) {
  check_inherits(data_stack, "data_stack")
  check_blend_data_stack(data_stack)
  check_regularization(penalty, "penalty")
  check_regularization(mixture, "mixture")
  check_inherits(non_negative, "logical")
  if (!is.null(metric)) {
    check_inherits(metric, "metric_set")
  }
  control <- parsnip::condense_control(control, tune::control_grid())
  check_inherits(times, "numeric")
  check_empty_ellipses(...)
  
  outcome <- attr(data_stack, "outcome")

  preds_formula <-
    rlang::new_formula(as.name(outcome), as.name("."), env = rlang::base_env())
  
  lvls <- levels(data_stack[[outcome]])
  
  dat <- process_data_stack(data_stack)
  
  ll <- if (non_negative) {0} else {-Inf}
  
  tune_quo <- rlang::new_quosure(tune::tune(), env = rlang::empty_env())
  
  if (attr(data_stack, "mode") == "regression") {
    model_spec <- 
      parsnip::linear_reg(penalty = !!tune_quo, mixture = !!tune_quo) %>%
      parsnip::set_engine("glmnet", lower.limits = !!ll, lambda.min.ratio = 0)
    
    preds_wf <-
      workflows::workflow() %>%
      workflows::add_model(model_spec) %>%
      workflows::add_formula(preds_formula)
  } else {
    # The class probabilities add up to one so we remove the probability columns
    # associated with the first level of the outcome. 
    col_filter <- paste0(".pred_", lvls[1])
    cols_drop <- grepl(col_filter, colnames(dat), fixed = TRUE)
    dat <- dat[, !cols_drop]
    if (length(lvls) == 2) {
      model_spec <-
        parsnip::logistic_reg(penalty = !!tune_quo, mixture = !!tune_quo) %>% 
        parsnip::set_engine("glmnet", lower.limits = !!ll, lambda.min.ratio = 0) %>% 
        parsnip::set_mode("classification")
    } else {
      model_spec <-
        parsnip::multinom_reg(penalty = !!tune_quo, mixture = !!tune_quo) %>% 
        parsnip::set_engine("glmnet", lower.limits = !!ll, lambda.min.ratio = 0) %>% 
        parsnip::set_mode("classification")
    }
    
    preds_wf <- 
      workflows::workflow() %>%
      workflows::add_recipe(
        recipes::recipe(
          preds_formula, 
          data = dat
          )
      ) %>%
      workflows::add_model(model_spec)
  }
  
  get_models <- function(x) {
    x %>% 
      workflows::extract_fit_parsnip() %>% 
      purrr::pluck("fit")
  }
  
  control$extract <- get_models

  candidates <- 
    preds_wf %>%
    tune::tune_grid(
      resamples = rsample::bootstraps(dat, times = times),
      grid = tidyr::expand_grid(penalty = penalty, mixture = mixture),
      metrics = metric,
      control = control
    )
  
  metric <- tune::.get_tune_metric_names(candidates)[1]
  best_param <- tune::select_best(candidates, metric = metric)
  coefs <-
    model_spec %>%
    tune::finalize_model(best_param) %>%
    parsnip::fit(formula = preds_formula, data = dat)

  model_stack <- 
    structure(
      list(model_defs = attr(data_stack, "model_defs"),
           coefs = coefs,
           penalty = list(
             penalty = best_param$penalty, 
             mixture = best_param$mixture,
             metric = metric
            ),
           metrics = glmnet_metrics(candidates),
           equations = get_expressions(coefs),
           cols_map = attr(data_stack, "cols_map"),
           model_metrics = attr(data_stack, "model_metrics"),
           train = attr(data_stack, "train"),
           mode = attr(data_stack, "mode"),
           outcome = attr(data_stack, "outcome"),
           data_stack = dat,
           splits = attr(data_stack, "splits")),
      class = c("linear_stack", "model_stack", "list")
    )
  
  if (model_stack_constr(model_stack)) {model_stack}
}

check_regularization <- function(x, arg, call = caller_env()) {
  if (!is.numeric(x)) {
    cli_abort(
      "The argument to '{arg}' must be a numeric, but the supplied {arg}'s 
       class is {.var {class(x)}}.",
      call = call
    )
  }
  
  if (length(x) == 0) {
    cli_abort("Please supply one or more {arg} values.",
              call = call)
  }
  
  if (arg == "penalty") {
    if (any(x < 0)) {
      cli_abort("Please supply only nonnegative values to the {arg} argument.",
                call = call)
    }
  }
  
  if (arg == "mixture") {
    if (any(x < 0) || any(x > 1)) {
      cli_abort("Please supply only values in [0, 1] to the {arg} argument.",
                call = call)
    }
  }
}

# ------------------------------------------------------------------------------

first_extract <- function(.x) {
  .x$.extracts[[1]]
}

glmnet_metrics <- function(x) {
  res <- tune::collect_metrics(x)
  pens <- sort(unique(res$penalty))
  res_ <- vctrs::vec_unique(res[, c("penalty", "mixture", ".config")])
  num_mem <- x[, c("id", ".extracts")]
  num_mem <- tidyr::unnest(num_mem, .extracts)
  num_mem <- dplyr::group_nest(num_mem, id, penalty, mixture)
  num_mem$data <- purrr::map(num_mem$data, first_extract)
  num_mem$members <- purrr::map(num_mem$data, num_members, pens)
  num_mem <- num_mem[, c("mixture", "members")]
  num_mem <- num_mem %>%
    tidyr::unnest(cols = members) %>% 
    dplyr::group_by(penalty, mixture) %>% 
    dplyr::summarize(
      .metric = "num_members",
      .estimator = "Poisson",
      mean = mean(members, na.rm = TRUE), 
      n = sum(!is.na(members)),
      std_err = sqrt(mean/n),
      .groups = "drop"
    ) %>% 
    dplyr::full_join(res_, by = c("penalty", "mixture"))
  
  out <- vctrs::vec_rbind(res, num_mem)
  out <- vctrs::vec_slice(out, vctrs::vec_order(out[".config"]))
  
  out
}

num_members <- function(x, penalties) {
  glmn_coef <-  coef(x, s = penalties)
  if (is.list(glmn_coef)) {
    glmn_coef <- do.call("rbind", glmn_coef)
  }
  glmn_coef <- glmn_coef[rownames(glmn_coef) != "(Intercept)",,drop = FALSE]
  mems <- apply(glmn_coef, 2, function(x) sum(x != 0))
  tibble::tibble(penalty = penalties, members = unname(mems))  
}

# set attributes from new_attr that are not
# already set in x
safe_attr <- function(x, new_attr) {
  res <- x
  
  x_attr <- attributes(x)
  
  dup_attrs <- names(new_attr) %in% names(x_attr)
  
  attributes(res) <- c(x_attr, new_attr[!dup_attrs])
  
  attr(res, "rset_info") <- NULL
  
  res
}

check_blend_data_stack <- function(data_stack, call = caller_env()) {
  # many possible checks we could do here are redundant with those we
  # carry out in fit_members() -- just check for bare stacks, 1-candidate
  # stacks, and non-stack objects
  if (!inherits(data_stack, "data_stack")) {
    check_inherits(data_stack, "data_stack", call = call)
  } else if (ncol(data_stack) == 0) {
      cli_abort(
        "The data stack supplied as the argument to `data_stack` has no 
         candidate members. Please first add candidates with 
         the {.help [`add_candidates()`](stacks::add_candidates)} function.",
        call = call
      )
  } else if ((ncol(data_stack) == 2 && attr(data_stack, "mode") == "regression") || 
             ncol(data_stack) == length(levels(data_stack[[1]])) + 1) {
    cli_abort(
      "The supplied data stack only contains one candidate member. Please 
       add more candidate members using 
      {.help [`add_candidates()`](stacks::add_candidates)} before blending.",
      call = call
    )
  }
  
  invisible(NULL)
}

process_data_stack <- function(data_stack, call = caller_env()) {
  dat <- tibble::as_tibble(data_stack) %>% na.omit()

  # retain only the tbl_df attributes (#214)
  attributes(dat) <- attributes(dat)[
    names(attributes(tibble::new_tibble(list())))
  ]
  
  if (nrow(dat) == 0) {
    cli_abort(
      "All rows in the data stack have at least one missing value. 
       Please ensure that all candidates supply predictions.",
      call = call
    )
  }
  
  if (nrow(dat) < nrow(data_stack)) {
    cli_inform(
      "{nrow(data_stack) - nrow(dat)} of the {nrow(data_stack)} rows in the  
       data stack {cli::qty({nrow(data_stack) - nrow(dat)})} {?has/have} missing 
       values, and will be omitted in the blending process."
    )
  }

  dat
}