inst/doc/getting-started.R

## ----setup, include = FALSE---------------------------------------------------
# Vignette-wide knitr chunk defaults: collapse source and output into one
# block, prefix printed output with "#>", and skip evaluation of all chunks
# (the examples below require cloud credentials and are illustrative only).
chunk_defaults <- list(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)
do.call(knitr::opts_chunk$set, chunk_defaults)

## -----------------------------------------------------------------------------
# # Install from GitHub
# remotes::install_github("yourname/starburst")

## -----------------------------------------------------------------------------
# library(starburst)
# 
# # Interactive setup wizard (takes ~2 minutes)
# starburst_setup()

## -----------------------------------------------------------------------------
# library(furrr)
# library(starburst)
# 
# # Define your work
# expensive_simulation <- function(i) {
#   # Some computation that takes a few minutes
#   results <- replicate(1000, {
#     x <- rnorm(10000)
#     mean(x^2)
#   })
#   mean(results)
# }
# 
# # Local execution (single core)
# plan(sequential)
# system.time({
#   results_local <- future_map(1:100, expensive_simulation)
# })
# #> ~16 minutes on typical laptop
# 
# # Cloud execution (50 workers)
# plan(future_starburst, workers = 50)
# system.time({
#   results_cloud <- future_map(1:100, expensive_simulation)
# })
# #> ~2 minutes (including 45s startup)
# #> Cost: ~$0.85
# 
# # Results are identical
# identical(results_local, results_cloud)
# #> [1] TRUE

## -----------------------------------------------------------------------------
# library(starburst)
# library(furrr)
# 
# # Simulate portfolio returns
# simulate_portfolio <- function(seed) {
#   set.seed(seed)
# 
#   # Random walk for 252 trading days
#   returns <- rnorm(252, mean = 0.0003, sd = 0.02)
#   prices <- cumprod(1 + returns)
# 
#   list(
#     final_value = prices[252],
#     max_drawdown = max((cummax(prices) - prices) / cummax(prices)),
#     sharpe_ratio = mean(returns) / sd(returns) * sqrt(252)
#   )
# }
# 
# # Run 10,000 simulations on 100 workers
# plan(future_starburst, workers = 100)
# 
# results <- future_map(1:10000, simulate_portfolio, .options = furrr_options(seed = TRUE))
# 
# # Analyze results
# final_values <- sapply(results, `[[`, "final_value")
# hist(final_values, breaks = 50, main = "Distribution of Portfolio Final Values")
# 
# # 95% confidence interval
# quantile(final_values, c(0.025, 0.975))

## -----------------------------------------------------------------------------
# library(starburst)
# library(furrr)
# 
# # Your data
# data <- read.csv("my_data.csv")
# 
# # Bootstrap function
# bootstrap_regression <- function(i, data) {
#   # Resample with replacement
#   boot_indices <- sample(nrow(data), replace = TRUE)
#   boot_data <- data[boot_indices, ]
# 
#   # Fit model
#   model <- lm(y ~ x1 + x2 + x3, data = boot_data)
# 
#   # Return coefficients
#   coef(model)
# }
# 
# # Run 10,000 bootstrap samples
# plan(future_starburst, workers = 50)
# 
# boot_results <- future_map(1:10000, bootstrap_regression, data = data)
# 
# # Convert to matrix
# boot_coefs <- do.call(rbind, boot_results)
# 
# # 95% confidence intervals for each coefficient
# apply(boot_coefs, 2, quantile, probs = c(0.025, 0.975))

## -----------------------------------------------------------------------------
# library(starburst)
# library(furrr)
# 
# # Process one sample
# process_sample <- function(sample_id) {
#   # Read from S3 (data already in cloud)
#   fastq_path <- sprintf("s3://my-genomics-data/samples/%s.fastq", sample_id)
#   data <- read_fastq(fastq_path)
# 
#   # Align reads
#   aligned <- align_reads(data, reference = "hg38")
# 
#   # Call variants
#   variants <- call_variants(aligned)
# 
#   # Return summary
#   list(
#     sample_id = sample_id,
#     num_variants = nrow(variants),
#     variants = variants
#   )
# }
# 
# # Process 1000 samples on 100 workers
# sample_ids <- list.files("s3://my-genomics-data/samples/", pattern = "\\.fastq$")
# 
# plan(future_starburst, workers = 100)
# 
# results <- future_map(sample_ids, process_sample, .progress = TRUE)
# 
# # Combine results
# all_variants <- do.call(rbind, lapply(results, `[[`, "variants"))

## -----------------------------------------------------------------------------
# plan(future_starburst, workers = 50)
# 
# results <- future_map(file_list, function(file) {
#   # Workers read directly from S3
#   data <- read.csv(sprintf("s3://my-bucket/%s", file))
#   process(data)
# })

## -----------------------------------------------------------------------------
# # Load data locally
# data <- read.csv("local_file.csv")
# 
# # staRburst automatically uploads to S3 and distributes
# plan(future_starburst, workers = 50)
# 
# results <- future_map(1:1000, function(i) {
#   # Each worker gets a copy of 'data'
#   bootstrap_analysis(data, i)
# })

## -----------------------------------------------------------------------------
# # Upload once
# large_data <- read.csv("huge_file.csv")
# s3_path <- starburst_upload(large_data, "s3://my-bucket/large_data.rds")
# 
# # Workers read from S3
# plan(future_starburst, workers = 100)
# 
# results <- future_map(1:1000, function(i) {
#   # Read from S3 inside worker
#   data <- readRDS(s3_path)
#   process(data, i)
# })

## -----------------------------------------------------------------------------
# # Check cost before running
# plan(future_starburst, workers = 100, cpu = 4, memory = "8GB")
# #> Estimated cost: ~$3.50/hour

## -----------------------------------------------------------------------------
# # Set maximum cost per job
# starburst_config(
#   max_cost_per_job = 10,      # Don't start jobs that would cost >$10
#   cost_alert_threshold = 5     # Warn when approaching $5
# )
# 
# # Now jobs exceeding limit will error before starting
# plan(future_starburst, workers = 1000)  # Would cost ~$35/hour
# #> Error: Estimated cost ($35/hr) exceeds limit ($10/hr)

## -----------------------------------------------------------------------------
# plan(future_starburst, workers = 50)
# 
# results <- future_map(data, process)
# 
# #> Cluster runtime: 23 minutes
# #> Total cost: $1.34

## -----------------------------------------------------------------------------
# starburst_quota_status()
# #> Fargate vCPU Quota: 100 / 100 used
# #> Allows: ~25 workers with 4 vCPUs each
# #>
# #> Recommended: Request increase to 500 vCPUs

## -----------------------------------------------------------------------------
# starburst_request_quota_increase(vcpus = 500)
# #> Requesting Fargate vCPU quota increase:
# #>   Current: 100 vCPUs
# #>   Requested: 500 vCPUs
# #>
# #> ✓ Quota increase requested (Case ID: 12345678)
# #> ✓ AWS typically approves within 1-24 hours

## -----------------------------------------------------------------------------
# # Quota allows 25 workers, but you request 100
# plan(future_starburst, workers = 100, cpu = 4)
# 
# #> ⚠ Requested: 100 workers (400 vCPUs)
# #> ⚠ Current quota: 100 vCPUs (allows 25 workers max)
# #>
# #> 📋 Execution plan:
# #>   • Running in 4 waves of 25 workers each
# #>
# #> 💡 Request quota increase to 500 vCPUs? [y/n]: y
# #>
# #> ✓ Quota increase requested
# #> ⚡ Starting wave 1 (25 workers)...
# 
# results <- future_map(1:1000, expensive_function)
# 
# #> ⚡ Wave 1: 100% complete (250 tasks)
# #> ⚡ Wave 2: 100% complete (500 tasks)
# #> ⚡ Wave 3: 100% complete (750 tasks)
# #> ⚡ Wave 4: 100% complete (1000 tasks)

## -----------------------------------------------------------------------------
# # View logs from most recent cluster
# starburst_logs()
# 
# # View logs from specific task
# starburst_logs(task_id = "abc-123")
# 
# # View last 100 log lines
# starburst_logs(last_n = 100)

## -----------------------------------------------------------------------------
# starburst_status()
# #> Active Clusters:
# #>   • starburst-xyz123: 50 workers running
# #>   • starburst-abc456: 25 workers running

## -----------------------------------------------------------------------------
# # Rebuild environment
# starburst_rebuild_environment()

## -----------------------------------------------------------------------------
# # Check logs
# starburst_logs(task_id = "failed-task-id")
# 
# # Often due to memory limits - increase worker memory
# plan(future_starburst, workers = 50, memory = "16GB")  # Default is 8GB

## -----------------------------------------------------------------------------
# # Use Arrow for data frames
# library(arrow)
# write_parquet(my_data, "s3://bucket/data.parquet")
# 
# # Workers read Arrow
# results <- future_map(1:100, function(i) {
#   data <- read_parquet("s3://bucket/data.parquet")
#   process(data, i)
# })

## -----------------------------------------------------------------------------
# # 100 tasks, each takes 10 minutes
# # Local: 1000 minutes, Cloud: ~10 minutes

## -----------------------------------------------------------------------------
# # 10000 tasks, each takes 30 seconds
# # Startup overhead (45s) dominates

## -----------------------------------------------------------------------------
# # 10,000 tiny tasks
# results <- future_map(1:10000, small_function)

## -----------------------------------------------------------------------------
# # 100 batches of 100 tasks each
# batches <- split(1:10000, ceiling(seq_len(10000) / 100))
# 
# results <- future_map(batches, function(batch) {
#   lapply(batch, small_function)
# })
# 
# # Flatten results
# results <- unlist(results, recursive = FALSE)

## -----------------------------------------------------------------------------
# big_data <- read.csv("10GB_file.csv")  # Upload for every task
# results <- future_map(1:1000, function(i) process(big_data, i))

## -----------------------------------------------------------------------------
# # Upload once to S3
# s3_path <- "s3://bucket/big_data.csv"
# write.csv(big_data, s3_path)
# 
# # Workers read from S3
# results <- future_map(1:1000, function(i) {
#   data <- read.csv(s3_path)
#   process(data, i)
# })

## -----------------------------------------------------------------------------
# starburst_config(
#   max_cost_per_job = 50,           # Prevent accidents
#   cost_alert_threshold = 25        # Get warned early
# )

## -----------------------------------------------------------------------------
# # staRburst auto-cleans, but you can force it
# plan(sequential)  # Switch back to local
# # Old cluster resources are cleaned up automatically

## -----------------------------------------------------------------------------
# # High CPU, low memory (CPU-bound work)
# plan(future_starburst, workers = 50, cpu = 8, memory = "16GB")
# 
# # Low CPU, high memory (memory-bound work)
# plan(future_starburst, workers = 25, cpu = 4, memory = "32GB")

## -----------------------------------------------------------------------------
# # Increase timeout for long-running tasks (default 1 hour)
# plan(future_starburst, workers = 10, timeout = 7200)  # 2 hours

## -----------------------------------------------------------------------------
# # Use specific region (default from config)
# plan(future_starburst, workers = 50, region = "us-west-2")

Try the starburst package in your browser

Any scripts or data that you put into this service are public.

starburst documentation built on March 19, 2026, 5:08 p.m.