R/tidy_xgboost_lgb.R

#' tidy xgboost
#'
#' Accepts a formula to run an xgboost model. Automatically determines whether the formula is
#' for classification or regression. Returns the xgboost model.
#'
#' In binary classification the target variable must be a factor with the first level set to the event of interest.
#' A higher probability will predict the first level.
#'
#' reference for parameters: \href{https://xgboost.readthedocs.io/en/stable/parameter.html}{xgboost docs}
#'
#' @param .data dataframe
#' @param formula formula
#' @param ... additional parameters to be passed to \code{\link[parsnip]{set_engine}}
#' @param engine character, either "xgboost" (the default) or "lightgbm"
#' @param tree_depth Tree Depth (xgboost: max_depth) (type: integer, default: 6L); typical values: 3-10
#' @param trees # Trees (xgboost: nrounds) (type: integer, default: 15L)
#' @param learn_rate Learning Rate (xgboost: eta) (type: double, default: 0.3); typical values: 0.01-0.3
#' @param mtry # Randomly Selected Predictors (xgboost: colsample_bynode) (type: numeric, range 0 - 1) (or type: integer if \code{counts = TRUE})
#' @param min_n Minimal Node Size (xgboost: min_child_weight) (type: integer, default: 1L); typical range: 2-10. Keep the value small for highly imbalanced class data, where leaf nodes can have small groups; otherwise increase it to prevent overfitting to outliers.
#' @param loss_reduction Minimum Loss Reduction (xgboost: gamma) (type: double, default: 0.0); range: 0 to Inf; typical values: 0 - 20, assuming low to mid tree depth
#' @param sample_size Proportion Observations Sampled (xgboost: subsample) (type: double, default: 1.0); typical values: 0.5 - 1
#' @param stop_iter # Iterations Before Stopping (xgboost: early_stop) (type: integer, default: 10L); only enabled if a validation set is provided
#' @param counts if \code{TRUE}, specify \code{mtry} as an integer number of columns. Default \code{FALSE} specifies \code{mtry} as a fraction of columns from 0 to 1
#' @param tree_method xgboost tree_method, default \code{"auto"}. reference: \href{https://xgboost.readthedocs.io/en/stable/treemethod.html}{tree method docs}
#' @param monotone_constraints an integer vector with the length of the predictor columns, with entries of \code{-1, 1, 0} corresponding to a decreasing, increasing, or no constraint for the predictor at that index. reference: \href{https://xgboost.readthedocs.io/en/stable/tutorials/monotonic.html}{monotonicity docs}
#' @param num_parallel_tree should be set to the size of the forest being trained. default 1L
#' @param lambda [default=1] L2 regularization term on weights. Increasing this value makes the model more conservative.
#' @param alpha [default=0] L1 regularization term on weights. Increasing this value makes the model more conservative.
#' @param scale_pos_weight [default=1] Controls the balance of positive and negative weights, useful for unbalanced classes. If set to \code{TRUE}, calculates sum(negative instances) / sum(positive instances). If the first level is the majority class, use values < 1; otherwise values > 1 are normally used to balance the class distribution.
#' @param verbosity Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3 (debug). This function defaults to 0L.
#' @param validate default TRUE; reports accuracy metrics on a validation set.
#'
#' @return xgb.Booster model
#' @export
#'
#' @examples
#'
#' options(rlang_trace_top_env = rlang::current_env())
#'
#'
#' # regression on a numeric variable
#'
#' iris %>%
#'   framecleaner::create_dummies(Species) -> iris_dummy
#'
#' iris_dummy %>%
#'   tidy_formula(target = Petal.Length) -> petal_form
#'
#' iris_dummy %>%
#'   tidy_xgboost(
#'     petal_form,
#'     trees = 500,
#'     mtry = .5
#'   ) -> xg1
#'
#' xg1 %>%
#'   visualize_model(top_n = 2)
#'
#' xg1 %>%
#'   tidy_predict(newdata = iris_dummy, form = petal_form) -> iris_preds
#'
#' iris_preds %>%
#'   eval_preds()
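#'
#' # a sketch of a monotone fit: one constraint per predictor column of
#' # iris_dummy (in column order), 1 = increasing, -1 = decreasing, 0 = none
#'
#' iris_dummy %>%
#'   tidy_xgboost(
#'     petal_form,
#'     trees = 100,
#'     monotone_constraints = c(1L, 0L, 1L, 0L, 0L, 0L)
#'   ) -> xg_mono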
#'
#'
#' # binary classification
#' # returns probability and labels
#'
#' iris %>%
#'   tidy_formula(Species) -> species_form
#'
#' iris %>%
#'   dplyr::filter(Species != "versicolor") %>%
#'   dplyr::mutate(Species = forcats::fct_drop(Species)) -> iris_binary
#'
#' iris_binary %>%
#'   tidy_xgboost(formula = species_form, trees = 50L, mtry = 0.2) -> xgb_bin
#'
#' xgb_bin %>%
#'   tidy_predict(newdata = iris_binary, form = species_form) -> iris_binary1
#'
#' iris_binary1 %>%
#'   eval_preds()
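#'
#' # a sketch for imbalanced targets: scale_pos_weight = TRUE derives
#' # sum(negative instances) / sum(positive instances) from the data
#'
#' iris_binary %>%
#'   tidy_xgboost(formula = species_form, trees = 50L,
#'                scale_pos_weight = TRUE) -> xgb_bin_weighted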
#'
#'
#' # multiclass classification that returns labels
#'
#' iris %>%
#'   tidy_xgboost(species_form,
#'                objective = "multi:softmax",
#'                trees = 100,
#'                tree_depth = 3L,
#'                loss_reduction = 0.5) -> xgb2
#'
#' xgb2 %>%
#'   tidy_predict(newdata = iris, form = species_form) -> iris_preds
#'
#' # additional yardstick metrics can be supplied to the dots in eval_preds
#'
#' iris_preds %>%
#'   eval_preds(yardstick::j_index)
#'
#'
#' # multiclass classification that returns probabilities
#'
#' iris %>%
#'   tidy_xgboost(species_form,
#'                objective = "multi:softprob",
#'                trees = 50L,
#'                sample_size = .2,
#'                mtry = .5,
#'                tree_depth = 2L,
#'                loss_reduction = 3) -> xgb2_prob
#'
#' # predict on the data that already has the class labels, so the resulting
#' # data frame has both class and probability predictions
#'
#' xgb2_prob %>%
#'   tidy_predict(newdata = iris_preds, form = species_form) -> iris_preds1
#'
#' # evaluating these predictions also requires the labels in the dataframe,
#' # and the model name must be supplied. Then roc metrics can be calculated
#' # iris_preds1 %>%
#' #   eval_preds(yardstick::average_precision, softprob_model = "xgb2_prob")
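#'
#' # a sketch of the lightgbm engine (assumes the lightgbm parsnip bindings,
#' # e.g. from the bonsai package, are available); other engine arguments
#' # pass through the dots to parsnip::set_engine
#' # iris_dummy %>%
#' #   tidy_xgboost(petal_form, engine = "lightgbm", trees = 100L) -> lgb1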
#'
#'
tidy_xgboost <- function(.data, formula, ...,
                         engine = c("xgboost", "lightgbm"),
                         mtry = 1.0,
                         trees = 15L,
                         min_n = 1L,
                         tree_depth = 6L,
                         learn_rate = 0.3,
                         loss_reduction = 0.0,
                         sample_size = 1.0,
                         stop_iter = 10L,
                         counts = FALSE,
                         tree_method = c("auto", "exact", "approx", "hist", "gpu_hist"),
                         monotone_constraints = 0L,
                         num_parallel_tree = 1L,
                         lambda = 1,
                         alpha = 0,
                         scale_pos_weight = 1,
                         verbosity = 0L,
                         validate = TRUE) {

  tree_method <- match.arg(tree_method)
  engine <- match.arg(engine)

  formula %>%
    rlang::f_lhs() -> target

  n <- NULL

  # determine the mode from the target column's type
  .data %>%
    dplyr::pull(!!target) %>%
    is.numeric() -> numer_tg

  .data %>%
    dplyr::pull(!!target) %>%
    is.character() -> chr_tg

  # scale_pos_weight = TRUE: compute sum(negatives) / sum(positives), where
  # the first factor level is the positive (event) class
  if (isTRUE(scale_pos_weight)) {
    .data %>%
      dplyr::count(!!target) %>%
      dplyr::pull(n) -> classcounts

    scale_pos_weight <- classcounts[2] / classcounts[1]
  }

  xgboost_recipe <-
    recipes::recipe(data = .data, formula = formula)
  # recipes::step_zv(recipes::all_predictors()) %>%
  # recipes::step_dummy(where(is.character) | where(is.factor), -!!target)

  xgboost_spec0 <- parsnip::boost_tree(
    mtry = mtry,
    trees = trees,
    min_n = min_n,
    tree_depth = tree_depth,
    learn_rate = learn_rate,
    loss_reduction = loss_reduction,
    sample_size = sample_size,
    stop_iter = stop_iter
  )

  if (engine == "xgboost") {
    xgboost_spec0 %>%
      parsnip::set_engine("xgboost", ...,
                          counts = counts,
                          tree_method = tree_method,
                          monotone_constraints = monotone_constraints,
                          num_parallel_tree = num_parallel_tree,
                          lambda = lambda,
                          alpha = alpha,
                          scale_pos_weight = scale_pos_weight,
                          verbosity = verbosity) -> xgboost_spec0
  } else if (engine == "lightgbm") {
    xgboost_spec0 %>%
      parsnip::set_engine("lightgbm", ...,
                          counts = counts) -> xgboost_spec0
  }

  if (numer_tg) {
    mode_set <- "regression"

    xgboost_spec0 %>%
      parsnip::set_mode(mode_set) -> xgboost_spec

  } else {
    mode_set <- "classification"

    if (chr_tg) {
      stop("classification requires the target to be a factor with the first level as the event class")
    }

    .data %>%
      dplyr::mutate(!!target := as.factor(!!target) %>% forcats::fct_drop()) -> .data

    xgboost_spec0 %>%
      parsnip::set_mode(mode_set) -> xgboost_spec
  }

  xgboost_workflow <-
    workflows::workflow() %>%
    workflows::add_recipe(xgboost_recipe) %>%
    workflows::add_model(xgboost_spec)

  xgboost_workflow %>%
    parsnip::fit(.data) -> model_fit

  model_fit %>%
    workflows::pull_workflow_fit() %>%
    purrr::pluck("fit") -> xgbooster

  # recover the fitted objective; its location depends on the engine and,
  # for xgboost, on the parsnip version
  if (engine == "lightgbm") {
    xgbooster$eval_train()[[1]]$name -> xgb_obj
  } else if (utils::packageVersion("parsnip") > "1.0.0") {
    xgbooster$call$objective -> xgb_obj
  } else {
    xgbooster$call$params$objective -> xgb_obj
  }

  if (validate && xgb_obj != "multi:softprob") {

    # hold out a validation set: fit on the analysis portion,
    # evaluate on the assessment portion
    rsample::initial_split(.data) -> split1
    rsample::assessment(split1) -> assessment_set
    rsample::analysis(split1) -> analysis_set

    xgboost_workflow %>%
      parsnip::fit(analysis_set) -> val_fit

    val_fit %>%
      workflows::pull_workflow_fit() %>%
      purrr::pluck("fit") -> val_booster

    suppressMessages({
      val_booster %>%
        tidy_predict(newdata = assessment_set, form = formula) -> val_frame
    })

    model <- NULL

    val_frame %>%
      eval_preds() -> val_acc

    if (mode_set == "classification") {
      # baseline accuracy: the proportion of the majority class
      val_frame %>%
        dplyr::count(!!target, sort = TRUE) %>%
        dplyr::pull(n) -> target_counts

      event_prop <- target_counts[1] / sum(target_counts)

      tibble::tibble(.metric = "baseline_accuracy",
                     .estimator = "binary",
                     .estimate = event_prop,
                     target = NA) -> baseline_acc

      baseline_acc %>%
        dplyr::bind_rows(val_acc) %>%
        dplyr::select(-model) -> val_acc
    }

    message("accuracy tested on a validation set")

    print(val_acc)
  }

  # importance plot (xgboost engine only)
  if (engine == "xgboost") {
    visualize_model(xgbooster) -> imp_plot
    print(imp_plot)
  }

  xgbooster
}
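
# a usage sketch (not run): the return value is the raw engine fit, an
# xgb.Booster for the default engine, so it works directly with xgboost
# helpers.
#
# xg1 <- tidy_xgboost(iris_dummy, petal_form, trees = 100)
# xgboost::xgb.importance(model = xg1)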

#' Plot varimp xgboost
#'
#' recommended parameters to control:
#'
#' \itemize{
#' \item{\code{top_n}}{ number of features to include in the graph}
#' }
#'
#' @param xgb xgb.Booster model
#' @param top_n top n important variables
#' @param aggregate a character vector. Predictors containing the string will be aggregated, and renamed to that string.
#' @param as_table logical, default FALSE. If TRUE returns importances in a data frame
#' @param ... additional arguments for \code{\link[xgboost]{xgb.ggplot.importance}}
#' @keywords internal
#'
#' @return ggplot
#'
plot_varimp_xgboost <- function(xgb, top_n = 10L, aggregate = NULL, as_table = FALSE, ...) {

  agg <- Feature <- NULL

  # temporarily replace feature names with their indices so the importance
  # table can be mapped back to the original names reliably
  xgb$feature_names -> f1

  length(f1) -> lf

  as.character(1:lf) -> nms

  xgb$feature_names <- nms

  xgboost::xgb.importance(model = xgb) -> xgb_imp

  as.integer(xgb_imp$Feature) -> rg_ind

  f1[rg_ind] -> unscrambled_names

  xgb_imp$Feature <- unscrambled_names

  # restore the original feature names on the model
  xgb$feature_names <- f1

  # optionally aggregate importance across predictors that share a string
  if (!is.null(aggregate)) {

    xgb_imp %>%
      dplyr::mutate(agg = stringr::str_extract(Feature, stringr::str_c(
        aggregate, collapse = "|"))) %>%
      dplyr::mutate(Feature = dplyr::coalesce(agg, Feature)) %>%
      dplyr::select(-agg) %>%
      dplyr::group_by(Feature) %>%
      dplyr::summarise(dplyr::across(where(is.numeric), sum)) %>%
      data.table::as.data.table() -> xgb_imp
  }

  xgb_imp %>%
    xgboost::xgb.ggplot.importance(..., top_n = top_n) +
    ggplot2::theme_minimal() +
    ggplot2::theme(panel.border = ggplot2::element_blank(),
                   panel.grid.major = ggplot2::element_blank(),
                   panel.grid.minor = ggplot2::element_blank(),
                   axis.line = ggplot2::element_line(colour = "black")) +
    ggeasy::easy_remove_legend() +
    ggplot2::ylab("Importance from xgboost") -> xgb_plot

  if (as_table) {
    imp_out <- xgb_imp
  } else {
    imp_out <- xgb_plot
  }

  imp_out
}
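
# a usage sketch (not run) of the aggregate argument: importance rows whose
# Feature contains "Species" are summed into a single "Species" row before
# plotting. This helper is internal and normally reached through
# visualize_model().
#
# xg1 %>%
#   plot_varimp_xgboost(top_n = 5L, aggregate = "Species")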


#' create monotone constraints
#'
#' helper function to create the integer vector to pass to the \code{monotone_constraints} argument in xgboost
#'
#' @param .data dataframe, training data for tidy_xgboost
#' @param formula formula used for tidy_xgboost
#' @param decreasing character vector or tidyselect regular expression to designate decreasing cols
#' @param increasing character vector or tidyselect regular expression to designate increasing cols
#'
#' @return a named integer vector with entries of 0, 1, -1
#' @export
#'
#' @examples
#'
#' iris %>%
#'   framecleaner::create_dummies(Species) -> iris_dummy
#'
#' iris_dummy %>%
#'   tidy_formula(target = Petal.Length) -> petal_form
#'
#' iris_dummy %>%
#'   create_monotone_constraints(petal_form,
#'                               decreasing = tidyselect::matches("Petal|Species"),
#'                               increasing = "Sepal.Width")
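#'
#' # a sketch: the resulting named vector can be passed straight to
#' # tidy_xgboost's monotone_constraints argument
#'
#' iris_dummy %>%
#'   create_monotone_constraints(petal_form,
#'                               decreasing = "Petal.Width") -> mono_vec
#'
#' iris_dummy %>%
#'   tidy_xgboost(petal_form, trees = 50L, monotone_constraints = mono_vec)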
#'
create_monotone_constraints <- function(.data, formula, decreasing = NULL, increasing = NULL) {

  # predictor columns implied by the formula
  formula %>%
    f_formula_to_charvec(.data = .data) -> cols

  .data %>%
    framecleaner::select_otherwise(decreasing) -> dec

  .data %>%
    framecleaner::select_otherwise(increasing) -> inc

  # build a named vector: -1 decreasing, 1 increasing, 0 unconstrained
  mc <- list()

  for (i in cols) {

    mc[[i]] <- dplyr::case_when(
      i %in% dec ~ -1L,
      i %in% inc ~ 1L,
      TRUE ~ 0L
    )
  }

  unlist(mc)
}
