# Default knitr chunk options for this vignette: collapse source and output
# into one block, and prefix printed results with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
Hyperparameter tuning is one of the most time-consuming aspects of machine learning. Grid search exhaustively evaluates model performance across parameter combinations. This example demonstrates parallelizing grid search to dramatically reduce tuning time.
Use Case: ML model optimization, hyperparameter tuning, model selection, cross-validation
Computational Pattern: Embarrassingly parallel model training with aggregation
You need to tune a gradient boosting model for a binary classification task: - 3 learning rates: 0.01, 0.05, 0.1 - 3 tree depths: 3, 5, 7 - 3 subsample ratios: 0.6, 0.8, 1.0 - 3 minimum child weights: 1, 3, 5
This creates 81 parameter combinations (3 × 3 × 3 × 3). With 5-fold cross-validation, that's 405 model fits.
Training each model takes ~5 seconds, so the 405 fits amount to roughly 34 minutes of sequential computation.
library(starburst)
Create a synthetic classification dataset:
# ---- Synthetic binary-classification dataset ----------------------------
set.seed(2024)

n_samples  <- 10000
n_features <- 20

# Feature matrix of i.i.d. standard normals, columns feature_1..feature_20.
X <- matrix(rnorm(n_samples * n_features), nrow = n_samples)
colnames(X) <- paste0("feature_", 1:n_features)

# Binary target through a logistic link on a random linear combination of
# the features, so the signal is learnable but noisy.
true_coef   <- rnorm(n_features)
linear_pred <- X %*% true_coef
prob        <- 1 / (1 + exp(-linear_pred))
y           <- rbinom(n_samples, 1, prob)

# 70/30 train/test split on row indices.
train_idx <- sample(1:n_samples, 0.7 * n_samples)
X_train <- X[train_idx, ]
y_train <- y[train_idx]
X_test  <- X[-train_idx, ]
y_test  <- y[-train_idx]

cat(sprintf("Dataset created:\n"))
cat(sprintf(" Training samples: %s\n", format(length(y_train), big.mark = ",")))
cat(sprintf(" Test samples: %s\n", format(length(y_test), big.mark = ",")))
cat(sprintf(" Features: %d\n", n_features))
cat(sprintf(" Class balance: %.1f%% / %.1f%%\n",
            mean(y_train) * 100, (1 - mean(y_train)) * 100))
Output:
Dataset created: Training samples: 7,000 Test samples: 3,000 Features: 20 Class balance: 50.3% / 49.7%
Create all hyperparameter combinations:
# ---- Hyperparameter grid ------------------------------------------------
# Full factorial design over four tuning knobs: 3^4 = 81 configurations.
param_grid <- expand.grid(
  learning_rate    = c(0.01, 0.05, 0.1),
  max_depth        = c(3, 5, 7),
  subsample        = c(0.6, 0.8, 1.0),
  min_child_weight = c(1, 3, 5),
  stringsAsFactors = FALSE
)

cat(sprintf("Grid search space:\n"))
cat(sprintf(" Total parameter combinations: %d\n", nrow(param_grid)))
cat(sprintf(" With 5-fold CV: %d model fits\n\n", nrow(param_grid) * 5))
Output:
Grid search space: Total parameter combinations: 81 With 5-fold CV: 405 model fits
Define a function that trains and evaluates one parameter combination:
# Simple gradient boosting implementation (for demonstration).
# In practice, use xgboost, lightgbm, or another optimized library.
#
# train_gbm() does not fit a real model: it sleeps for a time proportional
# to the parameter complexity (to mimic training cost) and returns noisy
# mock predictions bounded to [0, 1].
#
# @param X       Numeric feature matrix (rows = observations).
# @param y       Numeric 0/1 response, length nrow(X).
# @param params  List with learning_rate, max_depth, subsample,
#                min_child_weight (only the first three affect runtime).
# @param n_trees Number of boosting rounds to simulate (default 50).
# @return list(predictions = numeric vector in [0, 1], params = params)
train_gbm <- function(X, y, params, n_trees = 50) {
  n <- nrow(X)
  pred <- rep(mean(y), n)  # initial prediction: the base rate

  # Simulated training time: deeper trees, smaller learning rates, and
  # smaller subsample ratios all make "training" slower.
  complexity_factor <- params$max_depth *
    (1 / params$learning_rate) *
    (1 / params$subsample)
  training_time <- 0.001 * complexity_factor * n_trees
  Sys.sleep(min(training_time, 5))  # cap at 5 seconds

  # Mock predictions: base rate plus Gaussian noise, clamped to [0, 1].
  pred <- pred + rnorm(n, 0, 0.1)
  pred <- pmin(pmax(pred, 0), 1)

  list(predictions = pred, params = params)
}

# Evaluate one hyperparameter combination with k-fold cross-validation.
#
# @param param_row One row of the parameter grid (data.frame row or list)
#                  carrying learning_rate, max_depth, subsample,
#                  min_child_weight.
# @param X_data    Feature matrix.
# @param y_data    0/1 response vector, length nrow(X_data).
# @param n_folds   Number of CV folds (default 5).
# @return list(params, mean_cv_score, std_cv_score, cv_scores)
cv_evaluate <- function(param_row, X_data, y_data, n_folds = 5) {
  params <- as.list(param_row)

  # Random, roughly balanced fold assignment for every observation.
  n <- nrow(X_data)
  fold_indices <- sample(rep(seq_len(n_folds), length.out = n))

  cv_scores <- numeric(n_folds)
  for (fold in seq_len(n_folds)) {
    # Partition the data into this fold's train/validation halves.
    val_idx <- which(fold_indices == fold)
    train_idx <- which(fold_indices != fold)
    X_fold_train <- X_data[train_idx, , drop = FALSE]
    y_fold_train <- y_data[train_idx]
    X_fold_val <- X_data[val_idx, , drop = FALSE]
    y_fold_val <- y_data[val_idx]

    # "Train" the model. The returned fit is intentionally discarded: the
    # evaluation below is simulated, so this call only models the cost.
    train_gbm(X_fold_train, y_fold_train, params)

    # Mock evaluation: majority-class baseline accuracy, plus a bonus for
    # "good" parameter values, plus noise. In practice, compute actual
    # predictions and metrics here.
    baseline_accuracy <- mean(y_fold_val == round(mean(y_fold_train)))
    param_quality <- (params$learning_rate >= 0.05) * 0.02 +
      (params$max_depth >= 5) * 0.02 +
      (params$subsample >= 0.8) * 0.01 +
      rnorm(1, 0, 0.02)
    cv_scores[fold] <- min(baseline_accuracy + param_quality, 1.0)
  }

  list(
    params = params,
    mean_cv_score = mean(cv_scores),
    std_cv_score = sd(cv_scores),
    cv_scores = cv_scores
  )
}
Test grid search locally on a small subset:
# ---- Local benchmark on a small random subset of the grid ---------------
# Times 10 parameter combinations sequentially, then extrapolates to the
# full 81-combination grid.
set.seed(999)
sample_params <- param_grid[sample(nrow(param_grid), 10), ]

cat(sprintf("Running local benchmark (%d parameter combinations)...\n", nrow(sample_params)))

local_start <- Sys.time()
# seq_len() instead of 1:nrow() so an empty subset yields zero iterations
# rather than the c(1, 0) footgun.
local_results <- lapply(seq_len(nrow(sample_params)), function(i) {
  cv_evaluate(sample_params[i, ], X_train, y_train, n_folds = 5)
})
local_time <- as.numeric(difftime(Sys.time(), local_start, units = "mins"))

cat(sprintf("✓ Completed in %.2f minutes\n", local_time))
cat(sprintf(" Average time per combination: %.1f seconds\n", local_time * 60 / nrow(sample_params)))
cat(sprintf(" Estimated time for full grid (%d combinations): %.1f minutes\n", nrow(param_grid), local_time * nrow(param_grid) / nrow(sample_params)))
Typical output:
Running local benchmark (10 parameter combinations)... ✓ Completed in 4.2 minutes Average time per combination: 25.2 seconds Estimated time for full grid (81 combinations): 34.0 minutes
For full grid search locally: ~34 minutes
Run the complete grid search in parallel on AWS:
# ---- Parallel grid search on AWS via starburst --------------------------
n_workers <- 27 # Process ~3 parameter combinations per worker

cat(sprintf("Running grid search (%d combinations) on %d workers...\n", nrow(param_grid), n_workers))

cloud_start <- Sys.time()
# Each task evaluates one row of param_grid with full 5-fold CV; seq_len()
# keeps the index sequence safe even for a degenerate empty grid.
results <- starburst_map(
  seq_len(nrow(param_grid)),
  function(i) cv_evaluate(param_grid[i, ], X_train, y_train, n_folds = 5),
  workers = n_workers,
  cpu = 2,
  memory = "4GB"
)
cloud_time <- as.numeric(difftime(Sys.time(), cloud_start, units = "mins"))

cat(sprintf("\n✓ Completed in %.2f minutes\n", cloud_time))
Typical output:
🚀 Starting starburst cluster with 27 workers 💰 Estimated cost: ~$2.16/hour 📊 Processing 81 items with 27 workers 📦 Created 27 chunks (avg 3 items per chunk) 🚀 Submitting tasks... ✓ Submitted 27 tasks ⏳ Progress: 27/27 tasks (1.4 minutes elapsed) ✓ Completed in 1.4 minutes 💰 Actual cost: $0.05
Find the best hyperparameters:
# ---- Analyze grid-search results ----------------------------------------
# Pull the CV summary statistics out of the per-combination result lists.
# vapply() (not sapply()) guarantees a numeric vector even for edge cases.
cv_scores <- vapply(results, function(x) x$mean_cv_score, numeric(1))
cv_stds <- vapply(results, function(x) x$std_cv_score, numeric(1))

# Attach scores to the parameter grid and rank best-first.
results_df <- cbind(param_grid, mean_score = cv_scores, std_score = cv_stds)
results_df <- results_df[order(-results_df$mean_score), ]

cat("\n=== Grid Search Results ===\n\n")
cat(sprintf("Total combinations evaluated: %d\n", nrow(results_df)))
cat(sprintf("Best CV score: %.4f (± %.4f)\n", results_df$mean_score[1], results_df$std_score[1]))

cat("\n=== Best Hyperparameters ===\n")
cat(sprintf(" Learning rate: %.3f\n", results_df$learning_rate[1]))
cat(sprintf(" Max depth: %d\n", results_df$max_depth[1]))
cat(sprintf(" Subsample: %.2f\n", results_df$subsample[1]))
cat(sprintf(" Min child weight: %d\n", results_df$min_child_weight[1]))

cat("\n=== Top 5 Parameter Combinations ===\n")
# Guard against grids with fewer than five rows.
for (i in seq_len(min(5, nrow(results_df)))) {
  cat(sprintf("\n%d. Score: %.4f (± %.4f)\n", i, results_df$mean_score[i], results_df$std_score[i]))
  cat(sprintf(" lr=%.3f, depth=%d, subsample=%.2f, min_child=%d\n",
              results_df$learning_rate[i], results_df$max_depth[i],
              results_df$subsample[i], results_df$min_child_weight[i]))
}

# Parameter impact: mean CV score for each value of each tuning knob.
cat("\n=== Parameter Impact Analysis ===\n")
for (param in c("learning_rate", "max_depth", "subsample", "min_child_weight")) {
  # reformulate() builds `mean_score ~ <param>` explicitly instead of
  # relying on get() being evaluated inside the model formula.
  param_means <- aggregate(reformulate(param, response = "mean_score"),
                           data = results_df, FUN = mean)
  names(param_means)[1] <- param
  cat(sprintf("\n%s:\n", param))
  for (i in seq_len(nrow(param_means))) {
    cat(sprintf(" %s: %.4f\n", param_means[i, 1], param_means[i, 2]))
  }
}

# Visualize results (only in an interactive session).
if (interactive()) {
  # Score distribution, with the best score marked by a dashed red line.
  hist(results_df$mean_score, breaks = 20,
       main = "Distribution of Cross-Validation Scores",
       xlab = "Mean CV Score", col = "lightblue", border = "white")
  abline(v = results_df$mean_score[1], col = "red", lwd = 2, lty = 2)

  # Learning rate effect.
  boxplot(mean_score ~ learning_rate, data = results_df,
          main = "Learning Rate Impact",
          xlab = "Learning Rate", ylab = "CV Score", col = "lightgreen")
}
Typical output:
=== Grid Search Results === Total combinations evaluated: 81 Best CV score: 0.7842 (± 0.0234) === Best Hyperparameters === Learning rate: 0.050 Max depth: 5 Subsample: 0.80 Min child weight: 3 === Top 5 Parameter Combinations === 1. Score: 0.7842 (± 0.0234) lr=0.050, depth=5, subsample=0.80, min_child=3 2. Score: 0.7821 (± 0.0245) lr=0.050, depth=7, subsample=0.80, min_child=3 3. Score: 0.7798 (± 0.0256) lr=0.100, depth=5, subsample=0.80, min_child=1 4. Score: 0.7776 (± 0.0241) lr=0.050, depth=5, subsample=1.00, min_child=3 5. Score: 0.7754 (± 0.0267) lr=0.050, depth=5, subsample=0.80, min_child=1 === Parameter Impact Analysis === learning_rate: 0.01: 0.7512 0.05: 0.7689 0.1: 0.7598 max_depth: 3: 0.7487 5: 0.7712 7: 0.7601 subsample: 0.6: 0.7523 0.8: 0.7734 1: 0.7642 min_child_weight: 1: 0.7634 3: 0.7689 5: 0.7576
| Method | Combinations | Time | Cost | Speedup | |--------|--------------|------|------|---------| | Local | 10 | 4.2 min | $0 | - | | Local (est.) | 81 | 34 min | $0 | 1x | | staRburst | 81 | 1.4 min | $0.05 | 24.3x |
Key Insights: - Excellent speedup (24x) for grid search - Cost remains minimal ($0.05) despite massive parallelization - Can evaluate 81 combinations in the time it takes to run ~3 locally - Enables exploration of much larger parameter spaces
Extend to random search for efficiency:
# ---- Random search over the same hyperparameter space -------------------
# Draw 100 random configurations instead of an exhaustive grid.
n_random <- 100
random_params <- data.frame(
  learning_rate = runif(n_random, 0.001, 0.3),
  max_depth = sample(2:10, n_random, replace = TRUE),
  subsample = runif(n_random, 0.5, 1.0),
  min_child_weight = sample(1:10, n_random, replace = TRUE),
  stringsAsFactors = FALSE
)

cat(sprintf("Running random search (%d combinations)...\n", n_random))

random_results <- starburst_map(
  seq_len(nrow(random_params)),
  function(i) cv_evaluate(random_params[i, ], X_train, y_train, n_folds = 5),
  workers = 33,
  cpu = 2,
  memory = "4GB"
)

# Locate the best-scoring draw. vapply() keeps the result type stable.
random_scores <- vapply(random_results, function(x) x$mean_cv_score, numeric(1))
best_idx <- which.max(random_scores)

cat("\nBest random search result:\n")
cat(sprintf(" Score: %.4f\n", random_scores[best_idx]))
cat(sprintf(" Learning rate: %.4f\n", random_params$learning_rate[best_idx]))
cat(sprintf(" Max depth: %d\n", random_params$max_depth[best_idx]))
Implement iterative Bayesian optimization:
# Bayesian optimization would involve: # 1. Evaluate a small initial set (e.g., 10 combinations) # 2. Fit a Gaussian process to predict performance # 3. Use acquisition function to select next promising points # 4. Evaluate new points in parallel # 5. Repeat until convergence # This requires specialized packages like mlrMBO or rBayesianOptimization # but can be parallelized with starburst for the evaluation step
Good fit: - Expensive model training (> 10 seconds per fit) - Large parameter spaces (> 20 combinations) - Cross-validation with multiple folds - Ensemble model tuning - Neural architecture search
Not ideal: - Very fast models (< 1 second per fit) - Small parameter spaces (< 10 combinations) - Single train/validation split - Real-time model updates
The complete runnable script is available at:
# Returns the on-disk path to the bundled example script in the installed
# starburst package ("" if the package or file is not installed).
system.file("examples/grid-search.R", package = "starburst")
Run it with:
# Execute the bundled example script end-to-end.
source(system.file("examples/grid-search.R", package = "starburst"))
Related examples: - Feature Engineering - Parallel feature computation - Bootstrap CI - Model uncertainty estimation - Monte Carlo Simulation - Similar parallel pattern
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.