R/cv_lightgbm.R

Defines functions cv_lightgbm

Documented in cv_lightgbm

#' lightgbm - parameter tuning and model selection with k-fold cross-validation
#' and grid search
#'
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param params Parameter grid generated by [cv_param_grid()].
#' @param n_folds Number of folds. Default is 5.
#' @param n_threads The number of parallel threads. For optimal speed,
#'   match this to the number of physical CPU cores, not threads.
#'   See respective model documentation for more details. Default is 1.
#' @param seed Random seed for reproducibility.
#' @param verbose Show progress?
#'
#' @return
#' A data frame containing the complete tuning grid and the AUC values,
#' with the best parameter combination and the highest AUC value.
#'
#' @export
#'
#' @examplesIf is_installed_lightgbm()
#' sim_data <- msaenet::msaenet.sim.binomial(
#'   n = 100,
#'   p = 10,
#'   rho = 0.6,
#'   coef = rnorm(5, mean = 0, sd = 10),
#'   snr = 1,
#'   p.train = 0.8,
#'   seed = 42
#' )
#'
#' params <- suppressWarnings(
#'   cv_lightgbm(
#'     sim_data$x.tr,
#'     sim_data$y.tr,
#'     params = cv_param_grid(
#'       n_iterations = c(100, 200),
#'       max_depth = c(3, 5),
#'       learning_rate = c(0.1, 0.5)
#'     ),
#'     n_folds = 5,
#'     n_threads = 1,
#'     seed = 42,
#'     verbose = FALSE
#'   )
#' )
#'
#' params$df
cv_lightgbm <- function(
    x, y,
    params = cv_param_grid(),
    n_folds = 5,
    n_threads = 1,
    seed = 42,
    verbose = TRUE) {
  params <- map_params_lightgbm(params)

  set.seed(seed)
  nrow_x <- nrow(x)
  index <- sample(rep_len(1L:n_folds, nrow_x))
  df_grid <- expand.grid(
    "num_iterations" = params$num_iterations,
    "max_depth" = params$max_depth,
    "learning_rate" = params$learning_rate,
    "metric" = NA
  )
  nrow_grid <- nrow(df_grid)

  x <- as.matrix(x)

  pb <- progress_bar$new(
    format = "  searching grid [:bar] :percent in :elapsed",
    total = nrow_grid * n_folds, clear = FALSE, width = 60
  )

  for (j in 1L:nrow_grid) {
    ypred <- matrix(NA, ncol = 2L, nrow = nrow_x)
    for (i in 1L:n_folds) {
      if (verbose) pb$tick()

      xtrain <- x[index != i, , drop = FALSE]
      ytrain <- y[index != i]
      xtest <- x[index == i, , drop = FALSE]
      ytest <- y[index == i]

      fit <- lightgbm_train(
        data = xtrain,
        label = ytrain,
        params = list(
          objective = "binary",
          learning_rate = df_grid[j, "learning_rate"],
          num_iterations = df_grid[j, "num_iterations"],
          max_depth = df_grid[j, "max_depth"],
          num_leaves = 2^(df_grid[j, "max_depth"]) - 1,
          num_threads = n_threads
        ),
        verbose = -1
      )
      ypredvec <- predict(fit, xtest)
      ypred[index == i, 1L] <- ytest
      ypred[index == i, 2L] <- ypredvec
    }
    colnames(ypred) <- c("y.real", "y.pred")
    df_grid[j, "metric"] <- as.numeric(pROC::auc(ypred[, "y.real"], ypred[, "y.pred"], quiet = TRUE))
  }

  best_row <- which.max(df_grid$metric)
  best_metric <- df_grid$metric[best_row]
  best_num_iterations <- df_grid$num_iterations[best_row]
  best_max_depth <- df_grid$max_depth[best_row]
  best_learning_rate <- df_grid$learning_rate[best_row]

  structure(
    list(
      "df" = df_grid,
      "metric" = best_metric,
      "num_iterations" = best_num_iterations,
      "max_depth" = best_max_depth,
      "learning_rate" = best_learning_rate
    ),
    class = c("cv_params", "cv_lightgbm")
  )
}
nanxstats/stackgbm documentation built on May 3, 2024, 5:47 p.m.