inst/doc/example-grid-search.R

## ----include = FALSE----------------------------------------------------------
# Global knitr chunk options for this vignette: collapse source and output
# into a single block, and prefix output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----setup, eval=FALSE--------------------------------------------------------
# library(starburst)

## ----data, eval=FALSE---------------------------------------------------------
# set.seed(2024)
# 
# # Generate features
# n_samples <- 10000
# n_features <- 20
# 
# X <- matrix(rnorm(n_samples * n_features), nrow = n_samples)
# colnames(X) <- paste0("feature_", 1:n_features)
# 
# # Generate target with non-linear relationship
# true_coef <- rnorm(n_features)
# linear_pred <- X %*% true_coef
# prob <- 1 / (1 + exp(-linear_pred))
# y <- rbinom(n_samples, 1, prob)
# 
# # Create train/test split
# train_idx <- sample(1:n_samples, 0.7 * n_samples)
# X_train <- X[train_idx, ]
# y_train <- y[train_idx]
# X_test <- X[-train_idx, ]
# y_test <- y[-train_idx]
# 
# cat(sprintf("Dataset created:\n"))
# cat(sprintf("  Training samples: %s\n", format(length(y_train), big.mark = ",")))
# cat(sprintf("  Test samples: %s\n", format(length(y_test), big.mark = ",")))
# cat(sprintf("  Features: %d\n", n_features))
# cat(sprintf("  Class balance: %.1f%% / %.1f%%\n",
#             mean(y_train) * 100, (1 - mean(y_train)) * 100))

## ----grid, eval=FALSE---------------------------------------------------------
# # Define parameter space
# param_grid <- expand.grid(
#   learning_rate = c(0.01, 0.05, 0.1),
#   max_depth = c(3, 5, 7),
#   subsample = c(0.6, 0.8, 1.0),
#   min_child_weight = c(1, 3, 5),
#   stringsAsFactors = FALSE
# )
# 
# cat(sprintf("Grid search space:\n"))
# cat(sprintf("  Total parameter combinations: %d\n", nrow(param_grid)))
# cat(sprintf("  With 5-fold CV: %d model fits\n\n", nrow(param_grid) * 5))

## ----train-fn, eval=FALSE-----------------------------------------------------
# # Simple gradient boosting implementation (for demonstration)
# # In practice, use xgboost, lightgbm, or other optimized libraries
# train_gbm <- function(X, y, params, n_trees = 50) {
#   # Simplified GBM simulation
#   # This is a mock implementation - in real use, call xgboost, etc.
# 
#   n <- nrow(X)
#   pred <- rep(mean(y), n)  # Initial prediction
# 
#   # Simulate training time based on complexity
#   complexity_factor <- params$max_depth * (1 / params$learning_rate) *
#                       (1 / params$subsample)
#   training_time <- 0.001 * complexity_factor * n_trees
# 
#   Sys.sleep(min(training_time, 5))  # Cap at 5 seconds
# 
#   # Generate mock predictions with some realism
#   pred <- pred + rnorm(n, 0, 0.1)
#   pred <- pmin(pmax(pred, 0), 1)  # Bound to [0, 1]
# 
#   list(predictions = pred, params = params)
# }
# 
# # Cross-validation function
# cv_evaluate <- function(param_row, X_data, y_data, n_folds = 5) {
#   params <- as.list(param_row)
# 
#   # Create folds
#   n <- nrow(X_data)
#   # Assign each row to one of n_folds folds (fold sizes differ by at most 1)
#   fold_indices <- sample(rep(1:n_folds, length.out = n))
# 
#   # Perform cross-validation
#   cv_scores <- numeric(n_folds)
# 
#   for (fold in 1:n_folds) {
#     # Split data
#     val_idx <- which(fold_indices == fold)
#     train_idx <- which(fold_indices != fold)
# 
#     X_fold_train <- X_data[train_idx, , drop = FALSE]
#     y_fold_train <- y_data[train_idx]
#     X_fold_val <- X_data[val_idx, , drop = FALSE]
#     y_fold_val <- y_data[val_idx]
# 
#     # Train model
#     model <- train_gbm(X_fold_train, y_fold_train, params)
# 
#     # Predict and evaluate (mock evaluation)
#     # In practice, compute actual predictions and metrics
#     baseline_accuracy <- mean(y_fold_val == round(mean(y_fold_train)))
# 
#     # Simulate performance improvement based on good parameters
#     param_quality <- (params$learning_rate >= 0.05) * 0.02 +
#                     (params$max_depth >= 5) * 0.02 +
#                     (params$subsample >= 0.8) * 0.01 +
#                     rnorm(1, 0, 0.02)
# 
#     accuracy <- min(baseline_accuracy + param_quality, 1.0)
#     cv_scores[fold] <- accuracy
#   }
# 
#   # Return results
#   list(
#     params = params,
#     mean_cv_score = mean(cv_scores),
#     std_cv_score = sd(cv_scores),
#     cv_scores = cv_scores
#   )
# }

## ----local, eval=FALSE--------------------------------------------------------
# # Test with 10 parameter combinations
# set.seed(999)
# sample_params <- param_grid[sample(seq_len(nrow(param_grid)), 10), ]
# 
# cat(sprintf("Running local benchmark (%d parameter combinations)...\n",
#             nrow(sample_params)))
# local_start <- Sys.time()
# 
# local_results <- lapply(seq_len(nrow(sample_params)), function(i) {
#   cv_evaluate(sample_params[i, ], X_train, y_train, n_folds = 5)
# })
# 
# local_time <- as.numeric(difftime(Sys.time(), local_start, units = "mins"))
# 
# cat(sprintf("✓ Completed in %.2f minutes\n", local_time))
# cat(sprintf("  Average time per combination: %.1f seconds\n",
#             local_time * 60 / nrow(sample_params)))
# cat(sprintf("  Estimated time for full grid (%d combinations): %.1f minutes\n",
#             nrow(param_grid), local_time * nrow(param_grid) / nrow(sample_params)))

## ----cloud, eval=FALSE--------------------------------------------------------
# n_workers <- 27  # Process ~3 parameter combinations per worker
# 
# cat(sprintf("Running grid search (%d combinations) on %d workers...\n",
#             nrow(param_grid), n_workers))
# 
# cloud_start <- Sys.time()
# 
# results <- starburst_map(
#   seq_len(nrow(param_grid)),
#   function(i) cv_evaluate(param_grid[i, ], X_train, y_train, n_folds = 5),
#   workers = n_workers,
#   cpu = 2,
#   memory = "4GB"
# )
# 
# cloud_time <- as.numeric(difftime(Sys.time(), cloud_start, units = "mins"))
# 
# cat(sprintf("\n✓ Completed in %.2f minutes\n", cloud_time))

## ----analysis, eval=FALSE-----------------------------------------------------
# # Extract results
# cv_scores <- sapply(results, function(x) x$mean_cv_score)
# cv_stds <- sapply(results, function(x) x$std_cv_score)
# 
# # Combine with parameters
# results_df <- cbind(param_grid,
#                    mean_score = cv_scores,
#                    std_score = cv_stds)
# 
# # Sort by performance
# results_df <- results_df[order(-results_df$mean_score), ]
# 
# cat("\n=== Grid Search Results ===\n\n")
# cat(sprintf("Total combinations evaluated: %d\n", nrow(results_df)))
# cat(sprintf("Best CV score: %.4f (± %.4f)\n",
#             results_df$mean_score[1], results_df$std_score[1]))
# 
# cat("\n=== Best Hyperparameters ===\n")
# cat(sprintf("  Learning rate: %.3f\n", results_df$learning_rate[1]))
# cat(sprintf("  Max depth: %d\n", results_df$max_depth[1]))
# cat(sprintf("  Subsample: %.2f\n", results_df$subsample[1]))
# cat(sprintf("  Min child weight: %d\n", results_df$min_child_weight[1]))
# 
# cat("\n=== Top 5 Parameter Combinations ===\n")
# for (i in 1:5) {
#   cat(sprintf("\n%d. Score: %.4f (± %.4f)\n", i,
#               results_df$mean_score[i], results_df$std_score[i]))
#   cat(sprintf("   lr=%.3f, depth=%d, subsample=%.2f, min_child=%d\n",
#               results_df$learning_rate[i],
#               results_df$max_depth[i],
#               results_df$subsample[i],
#               results_df$min_child_weight[i]))
# }
# 
# # Parameter importance analysis
# cat("\n=== Parameter Impact Analysis ===\n")
# for (param in c("learning_rate", "max_depth", "subsample", "min_child_weight")) {
#   # reformulate() builds mean_score ~ <param> from the string name, which is
#   # more reliable than get(param) inside an aggregate() formula
#   param_means <- aggregate(reformulate(param, response = "mean_score"),
#                           data = results_df, FUN = mean)
#   names(param_means)[1] <- param
# 
#   cat(sprintf("\n%s:\n", param))
#   for (i in 1:nrow(param_means)) {
#     cat(sprintf("  %s: %.4f\n",
#                 param_means[i, 1],
#                 param_means[i, 2]))
#   }
# }
# 
# # Visualize results (if in interactive session)
# if (interactive()) {
#   # Score distribution
#   hist(results_df$mean_score,
#        breaks = 20,
#        main = "Distribution of Cross-Validation Scores",
#        xlab = "Mean CV Score",
#        col = "lightblue",
#        border = "white")
#   abline(v = results_df$mean_score[1], col = "red", lwd = 2, lty = 2)
# 
#   # Learning rate effect
#   boxplot(mean_score ~ learning_rate, data = results_df,
#           main = "Learning Rate Impact",
#           xlab = "Learning Rate",
#           ylab = "CV Score",
#           col = "lightgreen")
# }

## ----random-search, eval=FALSE------------------------------------------------
# # Generate random parameter combinations
# n_random <- 100
# 
# random_params <- data.frame(
#   learning_rate = runif(n_random, 0.001, 0.3),
#   max_depth = sample(2:10, n_random, replace = TRUE),
#   subsample = runif(n_random, 0.5, 1.0),
#   min_child_weight = sample(1:10, n_random, replace = TRUE),
#   stringsAsFactors = FALSE
# )
# 
# cat(sprintf("Running random search (%d combinations)...\n", n_random))
# 
# random_results <- starburst_map(
#   seq_len(nrow(random_params)),
#   function(i) cv_evaluate(random_params[i, ], X_train, y_train, n_folds = 5),
#   workers = 33,
#   cpu = 2,
#   memory = "4GB"
# )
# 
# # Find best parameters
# random_scores <- sapply(random_results, function(x) x$mean_cv_score)
# best_idx <- which.max(random_scores)
# 
# cat("\nBest random search result:\n")
# cat(sprintf("  Score: %.4f\n", random_scores[best_idx]))
# cat(sprintf("  Learning rate: %.4f\n", random_params$learning_rate[best_idx]))
# cat(sprintf("  Max depth: %d\n", random_params$max_depth[best_idx]))

## ----bayesian, eval=FALSE-----------------------------------------------------
# # Bayesian optimization would involve:
# # 1. Evaluate a small initial set (e.g., 10 combinations)
# # 2. Fit a Gaussian process to predict performance
# # 3. Use acquisition function to select next promising points
# # 4. Evaluate new points in parallel
# # 5. Repeat until convergence
# 
# # This requires specialized packages like mlrMBO or rBayesianOptimization
# # but can be parallelized with starburst for the evaluation step

## ----eval=FALSE---------------------------------------------------------------
# system.file("examples/grid-search.R", package = "starburst")

## ----eval=FALSE---------------------------------------------------------------
# source(system.file("examples/grid-search.R", package = "starburst"))

Try the starburst package in your browser

Any scripts or data that you put into this service are public.

starburst documentation built on March 19, 2026, 5:08 p.m.