# Document-wide knitr chunk defaults: `collapse = TRUE` merges each chunk's
# source and its output into a single block, and `comment = "#>"` sets the
# prefix placed in front of every line of printed output.
knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
Generating personalized reports for many customers is a common bottleneck. This example demonstrates parallelizing RMarkdown/Quarto report generation using staRburst.
Use Case: Monthly customer reports, automated analytics, personalized dashboards, regulatory reporting
Computational Pattern: I/O-bound parallel processing with document rendering
You need to generate 50 customized monthly reports for different customers:

- Each report includes data analysis, visualizations, and summary statistics
- Each report takes 1-2 minutes to render
- Sequential generation would take 50-100 minutes
- Reports must be delivered by end of business day
library(starburst) library(rmarkdown)
Create a simple RMarkdown template:
# Create report template
#
# The template is a parameterized R Markdown document; `customer_id`,
# `customer_name`, `month` and `data` are supplied at render time through
# the `params` argument of rmarkdown::render().
#
# Notes on the string literal below:
# - Every code fence is written as \`\`\` (an escaped backtick is still a
#   plain backtick in an R string) so that this code can itself live inside
#   an R Markdown chunk without a fence at the start of a line closing it.
# - Each horizontal rule (---) in the body is surrounded by blank lines so
#   that it is neither read as a setext heading underline nor as the start
#   of a second YAML metadata block.
# - The "Top Products" chunk sorts by revenue BEFORE taking the first ten
#   rows; taking head() first would keep the first ten products in
#   aggregation order instead of the ten highest earners.
report_template <- '
---
title: "Monthly Analytics Report"
output: html_document
params:
  customer_id: ""
  customer_name: ""
  month: ""
  data: NULL
---

\`\`\`{r template-setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
\`\`\`

# Monthly Report for `r params$customer_name`

**Customer ID:** `r params$customer_id`

**Period:** `r params$month`

**Report Generated:** `r format(Sys.time(), "%Y-%m-%d %H:%M")`

---

## Executive Summary

This report summarizes activity for `r params$customer_name` during `r params$month`.

\`\`\`{r summary}
data <- params$data

cat(sprintf("Total transactions: %d\\n", nrow(data)))
cat(sprintf("Total revenue: $%.2f\\n", sum(data$revenue)))
cat(sprintf("Average order value: $%.2f\\n", mean(data$revenue)))
cat(sprintf("Active days: %d\\n", length(unique(data$date))))
\`\`\`

## Revenue Trend

\`\`\`{r revenue-plot, fig.width=8, fig.height=4}
daily_revenue <- aggregate(revenue ~ date, data, sum)

plot(daily_revenue$date, daily_revenue$revenue,
     type = "l", lwd = 2, col = "steelblue",
     main = "Daily Revenue Trend",
     xlab = "Date", ylab = "Revenue ($)")
grid()
\`\`\`

## Top Products

\`\`\`{r top-products}
product_revenue <- aggregate(revenue ~ product, data, sum)

# Rank by revenue first, then keep the ten best sellers.
top_products <- head(
  product_revenue[order(-product_revenue$revenue), ],
  10
)

knitr::kable(top_products, row.names = FALSE,
             format.args = list(big.mark = ","))
\`\`\`

## Summary Statistics

\`\`\`{r stats}
stats <- data.frame(
  Metric = c("Total Orders", "Avg Order Value", "Max Order",
             "Min Order", "Std Dev"),
  Value = c(
    nrow(data),
    round(mean(data$revenue), 2),
    round(max(data$revenue), 2),
    round(min(data$revenue), 2),
    round(sd(data$revenue), 2)
  )
)

knitr::kable(stats, format.args = list(big.mark = ","))
\`\`\`

---

*This report was automatically generated using staRburst parallel processing.*
'

# Save template to file (written to the current working directory, which is
# where generate_report() looks for it)
writeLines(report_template, "report_template.Rmd")
Create synthetic customer data:
# Function to generate data for one customer
#
# Builds a synthetic transaction log for January 2026.
#
# Args:
#   customer_id: Numeric customer identifier. It is used as the RNG seed, so
#     the same ID always yields the same data. Note that this resets the
#     session-wide random number stream as a side effect.
#
# Returns:
#   A data frame with one row per transaction (between 100 and 500 rows) and
#   the columns `customer_id`, `date` (sorted ascending), `product` and
#   `revenue`.
generate_customer_data <- function(customer_id) {
  set.seed(customer_id)

  n_transactions <- sample(100:500, 1)

  january_days <- seq.Date(
    from = as.Date("2026-01-01"),
    to = as.Date("2026-01-31"),
    by = "day"
  )
  dates <- sort(sample(january_days, n_transactions, replace = TRUE))

  products <- c("Product A", "Product B", "Product C", "Product D", "Product E")

  data.frame(
    customer_id = customer_id,
    date = dates,
    product = sample(products, n_transactions, replace = TRUE),
    revenue = round(rnorm(n_transactions, mean = 150, sd = 50), 2),
    stringsAsFactors = FALSE
  )
}

# Generate customer list
n_customers <- 50
customer_index <- seq_len(n_customers)

# Names cycle through the alphabet: "Company A 1" ... "Company Z 1",
# "Company A 2", ... The index is shifted to zero-based before the modulo and
# integer division so that the first customer gets letter A and the numeric
# suffix only increments after a full pass through all 26 letters.
customers <- data.frame(
  customer_id = sprintf("CUST%03d", customer_index),
  customer_name = paste(
    "Company",
    LETTERS[(customer_index - 1) %% 26 + 1],
    (customer_index - 1) %/% 26 + 1
  ),
  stringsAsFactors = FALSE
)

head(customers)
Define function to generate one report:
# Render the HTML report for a single customer.
#
# Args:
#   customer_info: A one-row data frame (or list) providing `customer_id`
#     (e.g. "CUST007") and `customer_name`.
#
# Returns:
#   A list describing the outcome. On success it holds `customer_id`,
#   `success = TRUE`, `output_file`, `file_size` and `render_time`. If
#   rendering raises an error it holds `customer_id`, `success = FALSE`,
#   `error` (the condition converted to text) and `render_time`.
generate_report <- function(customer_info) {
  customer_id <- customer_info$customer_id
  customer_name <- customer_info$customer_name

  # Generate data for this customer; the numeric part of the ID
  # ("CUST007" -> 7) is what generate_customer_data() expects.
  id_number <- as.numeric(gsub("CUST", "", customer_id))
  data <- generate_customer_data(id_number)

  # Output file path
  output_file <- paste0("report_", customer_id, ".html")

  report_params <- list(
    customer_id = customer_id,
    customer_name = customer_name,
    month = "January 2026",
    data = data
  )

  # Turn a rendering error into a failure record instead of aborting the
  # whole batch.
  on_render_error <- function(e) {
    list(
      customer_id = customer_id,
      success = FALSE,
      error = as.character(e),
      render_time = Sys.time()
    )
  }

  tryCatch(
    {
      # Render report
      rmarkdown::render(
        input = "report_template.Rmd",
        output_file = output_file,
        params = report_params,
        quiet = TRUE
      )

      list(
        customer_id = customer_id,
        success = TRUE,
        output_file = output_file,
        file_size = file.size(output_file),
        render_time = Sys.time()
      )
    },
    error = on_render_error
  )
}
Test with a few reports locally:
# Test with 5 reports
test_customers <- head(customers, 5)

# Derive the sample size from the data rather than repeating the literal 5,
# so the per-report average stays correct if the sample size changes or
# `customers` has fewer than five rows.
n_test <- nrow(test_customers)

cat(sprintf("Rendering %d reports locally...\n", n_test))

local_start <- Sys.time()

# One single-row data frame per customer, rendered sequentially.
local_results <- lapply(
  split(test_customers, seq_len(n_test)),
  generate_report
)

local_time <- as.numeric(difftime(Sys.time(), local_start, units = "secs"))
seconds_per_report <- local_time / n_test

cat(sprintf("✓ Completed in %.1f seconds\n", local_time))
cat(sprintf(" Average: %.1f seconds per report\n", seconds_per_report))
# Extrapolate the measured average to the full customer list, in minutes.
cat(sprintf(" Estimated time for %d reports: %.1f minutes\n\n",
            n_customers, (seconds_per_report * n_customers) / 60))
Typical output:
Rendering 5 reports locally... ✓ Completed in 23.4 seconds Average: 4.7 seconds per report Estimated time for 50 reports: 3.9 minutes
Render all reports in parallel:
cat(sprintf("Rendering %d reports on AWS...\n", n_customers))

# Convert data frame rows to list for starburst_map: one single-row data
# frame per customer. seq_len() keeps the split well-defined even when
# `customers` has no rows (1:0 would produce c(1, 0)).
customer_list <- split(customers, seq_len(nrow(customers)))

# Each list element is passed to generate_report(); the per-customer result
# lists are collected in `results`.
results <- starburst_map(
  customer_list,
  generate_report,
  workers = 25,
  cpu = 2,
  memory = "4GB"
)
Typical output:
🚀 Starting starburst cluster with 25 workers 💰 Estimated cost: ~$2.00/hour 📊 Processing 50 items with 25 workers 📦 Created 25 chunks (avg 2 items per chunk) 🚀 Submitting tasks... ✓ Submitted 25 tasks ⏳ Progress: 25/25 tasks (0.4 minutes elapsed) ✓ Completed in 0.4 minutes 💰 Actual cost: $0.01
Analyze the generation results:
# Check success rate
# Compute the success flag once with a type-stable vapply(); isTRUE() also
# counts a malformed or missing result as a failure instead of erroring.
succeeded <- vapply(results, function(x) isTRUE(x$success), logical(1))

n_reports <- length(results)
success_count <- sum(succeeded)
failure_count <- n_reports - success_count
# Avoid printing NaN when there are no results at all.
success_rate <- if (n_reports > 0) success_count / n_reports * 100 else 0

cat("\n=== Report Generation Summary ===\n\n")
cat(sprintf("Total reports: %d\n", n_reports))
cat(sprintf("Successfully generated: %d (%.1f%%)\n",
            success_count, success_rate))
cat(sprintf("Failed: %d\n\n", failure_count))

# File size summary
successful_results <- results[succeeded]
file_sizes <- vapply(
  successful_results,
  function(x) as.numeric(x$file_size),
  numeric(1)
)

cat("File size statistics:\n")
if (length(file_sizes) > 0) {
  cat(sprintf(" Total size: %.2f MB\n", sum(file_sizes) / 1024^2))
  cat(sprintf(" Average size: %.1f KB\n", mean(file_sizes) / 1024))
  cat(sprintf(" Range: %.1f - %.1f KB\n\n",
              min(file_sizes) / 1024,
              max(file_sizes) / 1024))
} else {
  # min()/max()/mean() are meaningless on an empty vector.
  cat(" No reports were generated successfully.\n\n")
}

# Show sample of generated reports
# head() returns at most ten entries; indexing with [1:10] would pad the
# list with NULLs whenever fewer than ten reports succeeded.
cat("Generated reports:\n")
report_files <- vapply(
  head(successful_results, 10),
  function(x) x$output_file,
  character(1)
)
print(report_files)
Typical output:
=== Report Generation Summary === Total reports: 50 Successfully generated: 50 (100.0%) Failed: 0 File size statistics: Total size: 2.45 MB Average size: 50.2 KB Range: 45.3 - 55.8 KB Generated reports: [1] "report_CUST001.html" "report_CUST002.html" [3] "report_CUST003.html" "report_CUST004.html" [5] "report_CUST005.html" "report_CUST006.html" [7] "report_CUST007.html" "report_CUST008.html" [9] "report_CUST009.html" "report_CUST010.html"
| Method | Reports | Time | Cost | Speedup |
|--------|---------|------|------|---------|
| Local | 50 | 3.9 min | $0 | 1x |
| staRburst | 50 (10 workers) | 1.2 min | $0.004 | 3.3x |
| staRburst | 50 (25 workers) | 0.4 min | $0.01 | 9.8x |
| staRburst | 50 (50 workers) | 0.3 min | $0.02 | 13x |
Key Insights:

- Speedup grows with worker count, but with diminishing returns (3.3x at 10 workers, 9.8x at 25, 13x at 50)
- Sweet spot: 25-50 workers for this workload
- Minimal cost ($0.01) for significant time savings
- Can easily scale to 500+ reports
Automatically distribute reports after generation:
# Render one customer's report and, if rendering succeeded, record where it
# was (or would be) uploaded.
#
# Args:
#   customer_info: A one-row data frame (or list) as accepted by
#     generate_report().
#
# Returns:
#   The result list from generate_report(). For a successful render it is
#   extended with `uploaded = TRUE` and `s3_url`, or with `uploaded = FALSE`
#   and `upload_error` if the upload step raised an error. A failed render is
#   returned unchanged.
generate_and_distribute <- function(customer_info) {
  # Generate report
  result <- generate_report(customer_info)

  # Nothing to distribute when rendering failed.
  if (!isTRUE(result$success)) {
    return(result)
  }

  s3_key <- sprintf("reports/2026-01/%s", result$output_file)

  # The upload outcome is RETURNED from tryCatch() and merged afterwards.
  # Assigning to `result` inside the error handler would only modify a copy
  # local to the handler function, so the failure would be silently lost.
  upload <- tryCatch(
    {
      # Upload to S3 (example)
      # paws::s3()$put_object(
      #   Bucket = "my-reports-bucket",
      #   Key = s3_key,
      #   Body = readBin(result$output_file, "raw",
      #                  file.size(result$output_file))
      # )
      list(
        uploaded = TRUE,
        s3_url = sprintf("s3://my-reports-bucket/%s", s3_key)
      )
    },
    error = function(e) {
      list(
        uploaded = FALSE,
        upload_error = as.character(e)
      )
    }
  )

  modifyList(result, upload)
}
Good fit:

- Many independent reports (> 10)
- Report rendering takes > 30 seconds
- Time-sensitive delivery requirements
- CPU or I/O intensive rendering
Not ideal:

- Very simple reports (< 10 seconds to render)
- Reports with shared state or dependencies
- Interactive report generation
- Real-time reporting
The complete runnable script is available at:
# Full path of the example script shipped with the installed starburst
# package; system.file() returns "" if the file cannot be found.
system.file("examples/reports.R", package = "starburst")
Run it with:
# Look up the bundled example script in the installed starburst package and
# execute it in the current session.
source(system.file("examples/reports.R", package = "starburst"))
Related examples:

- API Calls - Another I/O-bound parallel task
- Feature Engineering - Data processing patterns
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.