knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE )
staRburst makes it trivial to scale your parallel R code from your laptop to 100+ AWS workers. This vignette walks through setup and common usage patterns.
# Install from GitHub
remotes::install_github("yourname/starburst")
Before using staRburst, you need to configure AWS resources. This only needs to be done once.
library(starburst)

# Interactive setup wizard (takes ~2 minutes)
starburst_setup()
This will:

- Validate your AWS credentials
- Create an S3 bucket for data transfer
- Create an ECR repository for Docker images
- Set up ECS cluster and VPC resources
- Check Fargate quotas and offer to request increases
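If you want to double-check your AWS credentials outside the wizard, a minimal sketch using the paws SDK works (this is an assumption on our part; staRburst does not require paws, and the wizard performs its own validation):

library(paws)

# Ask STS which account/role the current credentials resolve to;
# this errors immediately if the credentials are missing or invalid
identity <- sts()$get_caller_identity()
identity$Account
identity$Arn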
The simplest way to use staRburst is with the furrr package:
library(furrr)
library(starburst)

# Define your work
expensive_simulation <- function(i) {
  # Some computation that takes a few minutes
  results <- replicate(1000, {
    x <- rnorm(10000)
    mean(x^2)
  })
  mean(results)
}

# Local execution (single core)
plan(sequential)
set.seed(42)  # same seed for both runs so the results can be compared
system.time({
  results_local <- future_map(1:100, expensive_simulation,
                              .options = furrr_options(seed = TRUE))
})
#> ~16 minutes on typical laptop

# Cloud execution (50 workers)
plan(future_starburst, workers = 50)
set.seed(42)
system.time({
  results_cloud <- future_map(1:100, expensive_simulation,
                              .options = furrr_options(seed = TRUE))
})
#> ~2 minutes (including 45s startup)
#> Cost: ~$0.85

# Results are identical
identical(results_local, results_cloud)
#> [1] TRUE
library(starburst)
library(furrr)

# Simulate portfolio returns
simulate_portfolio <- function(seed) {
  set.seed(seed)

  # Random walk for 252 trading days
  returns <- rnorm(252, mean = 0.0003, sd = 0.02)
  prices <- cumprod(1 + returns)

  list(
    final_value = prices[252],
    max_drawdown = max((cummax(prices) - prices) / cummax(prices)),
    sharpe_ratio = mean(returns) / sd(returns) * sqrt(252)
  )
}

# Run 10,000 simulations on 100 workers
plan(future_starburst, workers = 100)

results <- future_map(1:10000, simulate_portfolio,
                      .options = furrr_options(seed = TRUE))

# Analyze results
final_values <- sapply(results, `[[`, "final_value")

hist(final_values, breaks = 50,
     main = "Distribution of Portfolio Final Values")

# 95% confidence interval
quantile(final_values, c(0.025, 0.975))
Performance:

- Local (single core): ~4 hours
- Cloud (100 workers): ~3 minutes
- Cost: ~$1.80
library(starburst)
library(furrr)

# Your data
data <- read.csv("my_data.csv")

# Bootstrap function
bootstrap_regression <- function(i, data) {
  # Resample with replacement
  boot_indices <- sample(nrow(data), replace = TRUE)
  boot_data <- data[boot_indices, ]

  # Fit model
  model <- lm(y ~ x1 + x2 + x3, data = boot_data)

  # Return coefficients
  coef(model)
}

# Run 10,000 bootstrap samples
plan(future_starburst, workers = 50)

boot_results <- future_map(1:10000, bootstrap_regression, data = data,
                           .options = furrr_options(seed = TRUE))

# Convert to matrix
boot_coefs <- do.call(rbind, boot_results)

# 95% confidence intervals for each coefficient
apply(boot_coefs, 2, quantile, probs = c(0.025, 0.975))
library(starburst)
library(furrr)

# Process one sample (read_fastq, align_reads, and call_variants stand in
# for whatever genomics toolchain you use)
process_sample <- function(sample_id) {
  # Read from S3 (data already in cloud)
  fastq_path <- sprintf("s3://my-genomics-data/samples/%s.fastq", sample_id)
  data <- read_fastq(fastq_path)

  # Align reads
  aligned <- align_reads(data, reference = "hg38")

  # Call variants
  variants <- call_variants(aligned)

  # Return summary
  list(
    sample_id = sample_id,
    num_variants = nrow(variants),
    variants = variants
  )
}

# List sample IDs (base list.files() cannot read S3 directly; here we use
# aws.s3, but any bucket listing or a manifest file works)
keys <- aws.s3::get_bucket_df("my-genomics-data", prefix = "samples/")$Key
sample_ids <- sub("\\.fastq$", "", basename(grep("\\.fastq$", keys, value = TRUE)))

# Process 1000 samples on 100 workers
plan(future_starburst, workers = 100)

results <- future_map(sample_ids, process_sample, .progress = TRUE)

# Combine results
all_variants <- do.call(rbind, lapply(results, `[[`, "variants"))
Performance:

- Local (sequential): ~208 hours (8.7 days)
- Cloud (100 workers): ~2 hours
- Cost: ~$47
If your data is already in S3, workers can read it directly:
plan(future_starburst, workers = 50)

results <- future_map(file_list, function(file) {
  # Workers read directly from S3
  data <- read.csv(sprintf("s3://my-bucket/%s", file))
  process(data)
})
For smaller datasets, you can pass data as arguments:
# Load data locally
data <- read.csv("local_file.csv")

# staRburst automatically uploads to S3 and distributes
plan(future_starburst, workers = 50)

results <- future_map(1:1000, function(i) {
  # Each worker gets a copy of 'data'
  bootstrap_analysis(data, i)
})
For very large objects, pre-upload to S3:
# Upload once
large_data <- read.csv("huge_file.csv")
s3_path <- starburst_upload(large_data, "s3://my-bucket/large_data.rds")

# Workers read from S3
plan(future_starburst, workers = 100)

results <- future_map(1:1000, function(i) {
  # Read from S3 inside worker
  data <- readRDS(s3_path)
  process(data, i)
})
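If each worker handles many tasks, re-reading the object for every task is wasteful. A small sketch of per-worker caching follows; it assumes workers keep their R session alive between tasks, which staRburst does not guarantee, so treat it as an optimization to test rather than a built-in feature:

plan(future_starburst, workers = 100)

results <- future_map(1:1000, function(i) {
  # Cache the object in the worker's global environment so each worker
  # downloads it at most once, however many tasks it runs
  if (!exists(".starburst_cache", envir = globalenv())) {
    assign(".starburst_cache", readRDS(s3_path), envir = globalenv())
  }
  process(get(".starburst_cache", envir = globalenv()), i)
})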
# Check cost before running
plan(future_starburst, workers = 100, cpu = 4, memory = "8GB")
#> Estimated cost: ~$3.50/hour
# Set maximum cost per job
starburst_config(
  max_cost_per_job = 10,     # Don't start jobs that would cost >$10
  cost_alert_threshold = 5   # Warn when approaching $5
)

# Now jobs exceeding the limit will error before starting
plan(future_starburst, workers = 1000)  # Would cost ~$35/hour
#> Error: Estimated cost ($35/hr) exceeds limit ($10/hr)
plan(future_starburst, workers = 50)

results <- future_map(data, process)
#> Cluster runtime: 23 minutes
#> Total cost: $1.34
starburst_quota_status()
#> Fargate vCPU Quota: 100 / 100 used
#> Allows: ~25 workers with 4 vCPUs each
#>
#> Recommended: Request increase to 500 vCPUs
starburst_request_quota_increase(vcpus = 500)
#> Requesting Fargate vCPU quota increase:
#>   Current:   100 vCPUs
#>   Requested: 500 vCPUs
#>
#> ✓ Quota increase requested (Case ID: 12345678)
#> ✓ AWS typically approves within 1-24 hours
If you request more workers than your quota allows, staRburst automatically uses wave-based execution:
# Quota allows 25 workers, but you request 100
plan(future_starburst, workers = 100, cpu = 4)
#> ⚠ Requested: 100 workers (400 vCPUs)
#> ⚠ Current quota: 100 vCPUs (allows 25 workers max)
#>
#> 📋 Execution plan:
#>   • Running in 4 waves of 25 workers each
#>
#> 💡 Request quota increase to 500 vCPUs? [y/n]: y
#>
#> ✓ Quota increase requested
#> ⚡ Starting wave 1 (25 workers)...

results <- future_map(1:1000, expensive_function)
#> ⚡ Wave 1: 100% complete (250 tasks)
#> ⚡ Wave 2: 100% complete (500 tasks)
#> ⚡ Wave 3: 100% complete (750 tasks)
#> ⚡ Wave 4: 100% complete (1000 tasks)
# View logs from most recent cluster
starburst_logs()

# View logs from specific task
starburst_logs(task_id = "abc-123")

# View last 100 log lines
starburst_logs(last_n = 100)
starburst_status()
#> Active Clusters:
#>   • starburst-xyz123: 50 workers running
#>   • starburst-abc456: 25 workers running
Environment mismatch: Packages not found on workers
# Rebuild environment
starburst_rebuild_environment()
Task failures: Some tasks failing
# Check logs
starburst_logs(task_id = "failed-task-id")

# Often due to memory limits - increase worker memory
plan(future_starburst, workers = 50, memory = "16GB")  # Default is 8GB
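Before raising memory for every worker, it can help to estimate locally how much data a single task actually touches. A rough sketch using base R (the file name is hypothetical, and object.size() counts R objects only, not peak working memory during model fitting):

# Rough, local estimate of how much memory one task's inputs occupy
x <- read.csv("my_data.csv")
format(object.size(x), units = "MB")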
Slow data transfer: Large objects taking too long
# Use Arrow for data frames
library(arrow)

write_parquet(my_data, "s3://bucket/data.parquet")

# Workers read Arrow
results <- future_map(1:100, function(i) {
  data <- read_parquet("s3://bucket/data.parquet")
  process(data, i)
})
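Because Parquet is columnar, workers can also pull just the columns they need, which cuts transfer time further. This uses arrow's col_select argument; the column names here are hypothetical:

results <- future_map(1:100, function(i) {
  # col_select reads only the named columns from the Parquet file
  data <- read_parquet("s3://bucket/data.parquet",
                       col_select = c("id", "value"))
  process(data, i)
})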
✅ Good: Each task takes >5 minutes
# 100 tasks, each takes 10 minutes
# Local: 1000 minutes, Cloud: ~10 minutes
❌ Bad: Each task takes <1 minute
# 10000 tasks, each takes 30 seconds
# Startup overhead (45s) dominates
Instead of:
# 10,000 tiny tasks
results <- future_map(1:10000, small_function)
Do:
# 100 batches of 100 tasks each
batches <- split(1:10000, ceiling(seq_along(1:10000) / 100))

results <- future_map(batches, function(batch) {
  lapply(batch, small_function)
})

# Flatten results
results <- unlist(results, recursive = FALSE)
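If you would rather not manage batches yourself, furrr can chunk elements for you. This is standard furrr behaviour rather than a staRburst feature, so treat the chunk size as something to tune:

# Ship elements to workers in chunks of 100 instead of one at a time
results <- future_map(1:10000, small_function,
                      .options = furrr_options(chunk_size = 100))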
Don't:
big_data <- read.csv("10GB_file.csv")

# Upload for every task
results <- future_map(1:1000, function(i) process(big_data, i))
Do:
# Upload once to S3 (using the helper shown earlier; base write.csv()
# cannot write to an s3:// path directly)
s3_path <- starburst_upload(big_data, "s3://bucket/big_data.rds")

# Workers read from S3
results <- future_map(1:1000, function(i) {
  data <- readRDS(s3_path)
  process(data, i)
})
starburst_config(
  max_cost_per_job = 50,      # Prevent accidents
  cost_alert_threshold = 25   # Get warned early
)
# staRburst auto-cleans, but you can force it
plan(sequential)  # Switch back to local

# Old cluster resources are cleaned up automatically
# High CPU, low memory (CPU-bound work)
plan(future_starburst, workers = 50, cpu = 8, memory = "16GB")

# Low CPU, high memory (memory-bound work)
plan(future_starburst, workers = 25, cpu = 4, memory = "32GB")
# Increase timeout for long-running tasks (default 1 hour)
plan(future_starburst, workers = 10, timeout = 7200)  # 2 hours
# Use specific region (default from config)
plan(future_starburst, workers = 50, region = "us-west-2")