R/cv_catboost.R

Defines functions cv_catboost

Documented in cv_catboost

#' catboost - parameter tuning and model selection with k-fold cross-validation
#' and grid search
#'
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param params Parameter grid generated by [cv_param_grid()].
#' @param n_folds Number of folds. Default is 5.
#' @param n_threads The number of parallel threads. For optimal speed,
#'   match this to the number of physical CPU cores, not threads.
#'   See respective model documentation for more details. Default is 1.
#' @param seed Random seed for reproducibility.
#' @param verbose Show progress?
#'
#' @return
#' A data frame containing the complete tuning grid and the AUC values,
#' with the best parameter combination and the highest AUC value.
#'
#' @export
#'
#' @examplesIf is_installed_catboost()
#' sim_data <- msaenet::msaenet.sim.binomial(
#'   n = 100,
#'   p = 10,
#'   rho = 0.6,
#'   coef = rnorm(5, mean = 0, sd = 10),
#'   snr = 1,
#'   p.train = 0.8,
#'   seed = 42
#' )
#'
#' params <- cv_catboost(
#'   sim_data$x.tr,
#'   sim_data$y.tr,
#'   params = cv_param_grid(
#'     n_iterations = c(100, 200),
#'     max_depth = c(3, 5),
#'     learning_rate = c(0.1, 0.5)
#'   ),
#'   n_folds = 5,
#'   n_threads = 1,
#'   seed = 42,
#'   verbose = FALSE
#' )
#'
#' params$df
cv_catboost <- function(
    x, y,
    params = cv_param_grid(),
    n_folds = 5,
    n_threads = 1,
    seed = 42,
    verbose = TRUE) {
  params <- map_params_catboost(params)

  set.seed(seed)
  nrow_x <- nrow(x)
  index <- sample(rep_len(1L:n_folds, nrow_x))
  df_grid <- expand.grid(
    "iterations" = params$iterations,
    "depth" = params$depth,
    "metric" = NA
  )
  nrow_grid <- nrow(df_grid)

  # x <- as.matrix(x) # uncomment to use non-categorical features

  pb <- progress_bar$new(
    format = "  searching grid [:bar] :percent in :elapsed",
    total = nrow_grid * n_folds, clear = FALSE, width = 60
  )

  for (j in 1L:nrow_grid) {
    ypred <- matrix(NA, ncol = 2L, nrow = nrow_x)
    for (i in 1L:n_folds) {
      if (verbose) pb$tick()

      xtrain <- x[index != i, , drop = FALSE]
      ytrain <- y[index != i]
      xtest <- x[index == i, , drop = FALSE]
      ytest <- y[index == i]

      train_pool <- catboost_load_pool(data = xtrain, label = ytrain)
      test_pool <- catboost_load_pool(data = xtest, label = NULL)
      fit <- catboost_train(
        train_pool,
        test_pool = NULL,
        params = list(
          loss_function = "Logloss",
          iterations = df_grid[j, "iterations"],
          depth = df_grid[j, "depth"],
          logging_level = "Silent",
          thread_count = n_threads
        )
      )
      ypredvec <- catboost_predict(fit, pool = test_pool, prediction_type = "Probability")
      ypred[index == i, 1L] <- ytest
      ypred[index == i, 2L] <- ypredvec
    }
    colnames(ypred) <- c("y.real", "y.pred")
    df_grid[j, "metric"] <- as.numeric(pROC::auc(ypred[, "y.real"], ypred[, "y.pred"], quiet = TRUE))
  }

  best_row <- which.max(df_grid$metric)
  best_metric <- df_grid$metric[best_row]
  best_iterations <- df_grid$iterations[best_row]
  best_depth <- df_grid$depth[best_row]

  structure(
    list(
      "df" = df_grid,
      "metric" = best_metric,
      "iterations" = best_iterations,
      "depth" = best_depth
    ),
    class = c("cv_params", "cv_catboost")
  )
}
nanxstats/stackgbm documentation built on May 3, 2024, 5:47 p.m.