R/cv_xgboost.R

Defines functions cv_xgboost

Documented in cv_xgboost

#' xgboost - parameter tuning and model selection with k-fold cross-validation
#' and grid search
#'
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param params Parameter grid generated by [cv_param_grid()].
#' @param n_folds Number of folds. Default is 5.
#' @param n_threads The number of parallel threads. For optimal speed,
#'   match this to the number of physical CPU cores, not threads.
#'   See respective model documentation for more details. Default is 1.
#' @param seed Random seed for reproducibility.
#' @param verbose Show progress?
#'
#' @return A data frame containing the complete tuning grid and the AUC values,
#'   with the best parameter combination and the highest AUC value.
#'
#' @importFrom pROC auc
#'
#' @export
#'
#' @examplesIf is_installed_xgboost()
#' sim_data <- msaenet::msaenet.sim.binomial(
#'   n = 100,
#'   p = 10,
#'   rho = 0.6,
#'   coef = rnorm(5, mean = 0, sd = 10),
#'   snr = 1,
#'   p.train = 0.8,
#'   seed = 42
#' )
#'
#' params <- cv_xgboost(
#'   sim_data$x.tr,
#'   sim_data$y.tr,
#'   params = cv_param_grid(
#'     n_iterations = c(100, 200),
#'     max_depth = c(3, 5),
#'     learning_rate = c(0.1, 0.5)
#'   ),
#'   n_folds = 5,
#'   n_threads = 1,
#'   seed = 42,
#'   verbose = FALSE
#' )
#'
#' params$df
cv_xgboost <- function(
    x, y,
    params = cv_param_grid(),
    n_folds = 5,
    n_threads = 1,
    seed = 42,
    verbose = TRUE) {
  params <- map_params_xgboost(params)

  set.seed(seed)
  nrow_x <- nrow(x)
  index <- sample(rep_len(1L:n_folds, nrow_x))
  df_grid <- expand.grid(
    "nrounds" = params$nrounds,
    "max_depth" = params$max_depth,
    "eta" = params$eta,
    "metric" = NA
  )
  nrow_grid <- nrow(df_grid)

  x <- as.matrix(x)

  pb <- progress_bar$new(
    format = "  searching grid [:bar] :percent in :elapsed",
    total = nrow_grid * n_folds, clear = FALSE, width = 60
  )

  for (j in 1L:nrow_grid) {
    ypred <- matrix(NA, ncol = 2L, nrow = nrow_x)
    for (i in 1L:n_folds) {
      if (verbose) pb$tick()

      xtrain <- x[index != i, , drop = FALSE]
      ytrain <- y[index != i]
      xtest <- x[index == i, , drop = FALSE]
      ytest <- y[index == i]

      xtrain <- xgboost_dmatrix(xtrain, label = ytrain)
      xtest <- xgboost_dmatrix(xtest)

      fit <- xgboost_train(
        params = list(
          objective = "binary:logistic",
          eval_metric = "auc",
          max_depth = df_grid[j, "max_depth"],
          eta = df_grid[j, "eta"]
        ),
        data = xtrain,
        nrounds = df_grid[j, "nrounds"],
        nthread = n_threads
      )
      ypredvec <- predict(fit, xtest)
      ypred[index == i, 1L] <- ytest
      ypred[index == i, 2L] <- ypredvec
    }
    colnames(ypred) <- c("y.real", "y.pred")
    df_grid[j, "metric"] <- as.numeric(pROC::auc(ypred[, "y.real"], ypred[, "y.pred"], quiet = TRUE))
  }

  best_row <- which.max(df_grid$metric)
  best_metric <- df_grid$metric[best_row]
  best_nrounds <- df_grid$nrounds[best_row]
  best_eta <- df_grid$eta[best_row]
  best_max_depth <- df_grid$max_depth[best_row]

  structure(
    list(
      "df" = df_grid,
      "metric" = best_metric,
      "nrounds" = best_nrounds,
      "eta" = best_eta,
      "max_depth" = best_max_depth
    ),
    class = c("cv_params", "cv_xgboost")
  )
}
nanxstats/stackgbm documentation built on May 3, 2024, 5:47 p.m.