# Standard vignette chunk options: collapse source and output,
# and prefix printed output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
This guide covers security considerations for deploying and using staRburst in production environments.
Never hard-code AWS credentials in your code:
# NEVER DO THIS - credentials exposed in code
Sys.setenv(
  AWS_ACCESS_KEY_ID = "AKIA...",
  AWS_SECRET_ACCESS_KEY = "wJalr..."
)
Why it's dangerous: - Credentials in code can be committed to version control - Logs may capture environment variables - Code sharing exposes credentials
When running on AWS infrastructure (EC2, ECS, Lambda):
# No credentials needed - automatically uses instance/task IAM role
library(starburst)
plan(starburst, workers = 10)
Benefits: - No credential management required - Automatic credential rotation - Fine-grained permissions via IAM policies - Audit trail via CloudTrail
Setup: 1. Create IAM role with required permissions 2. Attach role to EC2 instance or ECS task 3. staRburst automatically discovers and uses role credentials
For local development, use AWS CLI profiles:
# Credentials stored in ~/.aws/credentials
Sys.setenv(AWS_PROFILE = "my-starburst-profile")

library(starburst)
plan(starburst, workers = 10)
Setup:
# Configure AWS CLI profile (interactive; stores keys in ~/.aws/credentials)
aws configure --profile my-starburst-profile
For cross-account access or enhanced security:
library(paws.security.identity)
sts <- paws.security.identity::sts()

# Assume role with MFA
credentials <- sts$assume_role(
  RoleArn = "arn:aws:iam::123456789012:role/StarburstRole",
  RoleSessionName = "starburst-session",
  SerialNumber = "arn:aws:iam::123456789012:mfa/user",
  TokenCode = "123456"  # MFA token
)

# Use temporary credentials
Sys.setenv(
  AWS_ACCESS_KEY_ID = credentials$Credentials$AccessKeyId,
  AWS_SECRET_ACCESS_KEY = credentials$Credentials$SecretAccessKey,
  AWS_SESSION_TOKEN = credentials$Credentials$SessionToken
)
Benefits: - Time-limited credentials (expire after 1-12 hours) - Can require MFA for sensitive operations - Separate credentials for different roles
Create an IAM policy for staRburst workers:
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "StarburstWorkerObjectAccess",
      "Effect": "Allow",
      "Action": [
        "s3:GetObject",
        "s3:PutObject",
        "s3:DeleteObject"
      ],
      "Resource": [
        "arn:aws:s3:::my-starburst-bucket/sessions/*",
        "arn:aws:s3:::my-starburst-bucket/tasks/*",
        "arn:aws:s3:::my-starburst-bucket/results/*"
      ]
    },
    {
      "Sid": "StarburstWorkerListAccess",
      "Effect": "Allow",
      "Action": "s3:ListBucket",
      "Resource": "arn:aws:s3:::my-starburst-bucket",
      "Condition": {
        "StringLike": {
          "s3:prefix": ["sessions/*", "tasks/*", "results/*"]
        }
      }
    }
  ]
}
Encrypt data at rest in S3:
# Option 1: Enable default encryption via AWS Console or CLI
library(paws.storage)
s3 <- paws.storage::s3()

s3$put_bucket_encryption(
  Bucket = "my-starburst-bucket",
  ServerSideEncryptionConfiguration = list(
    Rules = list(
      list(
        ApplyServerSideEncryptionByDefault = list(
          SSEAlgorithm = "AES256"  # Or "aws:kms" for KMS
        ),
        BucketKeyEnabled = TRUE
      )
    )
  )
)
Encryption Options: - AES256 - S3-managed keys (SSE-S3, free) - aws:kms - AWS KMS managed keys (SSE-KMS, more control, costs apply) - aws:kms:dsse - Dual-layer encryption for compliance (SSE-KMS-DSSE)
Protect against accidental deletion:
# Keep prior object versions so deletes/overwrites are recoverable
s3$put_bucket_versioning(
  Bucket = "my-starburst-bucket",
  VersioningConfiguration = list(Status = "Enabled")
)
Benefits: - Recover from accidental deletions - Rollback to previous versions - Required for certain compliance frameworks
Ensure bucket is never publicly accessible:
# Block every form of public access to the bucket
s3$put_public_access_block(
  Bucket = "my-starburst-bucket",
  PublicAccessBlockConfiguration = list(
    BlockPublicAcls = TRUE,
    IgnorePublicAcls = TRUE,
    BlockPublicPolicy = TRUE,
    RestrictPublicBuckets = TRUE
  )
)
Restrict access to specific IAM roles:
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "DenyUnencryptedObjectUploads",
"Effect": "Deny",
"Principal": "*",
"Action": "s3:PutObject",
"Resource": "arn:aws:s3:::my-starburst-bucket/*",
"Condition": {
"StringNotEquals": {
"s3:x-amz-server-side-encryption": "AES256"
}
}
},
{
"Sid": "DenyInsecureTransport",
"Effect": "Deny",
"Principal": "*",
"Action": "s3:*",
      "Resource": [
        "arn:aws:s3:::my-starburst-bucket",
        "arn:aws:s3:::my-starburst-bucket/*"
      ],
"Condition": {
"Bool": {
"aws:SecureTransport": "false"
}
}
}
]
}
Deploy workers in private subnets without internet access:
library(starburst)

# Configure to use private subnets
starburst_config(
  subnets = c("subnet-private-1a", "subnet-private-1b"),
  security_groups = c("sg-starburst-workers")
)

plan(starburst, workers = 10)
Requirements for Private Subnets: - VPC endpoints for S3, ECR, CloudWatch Logs - NAT Gateway if internet access needed for some operations
Minimal security group (no inbound, only outbound):
{
"GroupName": "starburst-workers",
"Description": "Security group for staRburst ECS tasks",
"VpcId": "vpc-...",
"SecurityGroupIngress": [],
"SecurityGroupEgress": [
{
"IpProtocol": "tcp",
"FromPort": 443,
"ToPort": 443,
"CidrIp": "0.0.0.0/0",
"Description": "HTTPS to AWS services"
}
]
}
Avoid internet traffic for AWS service calls:
S3 Gateway Endpoint (free):
# S3 gateway endpoint: free, attached via route tables
aws ec2 create-vpc-endpoint \
  --vpc-id vpc-... \
  --service-name com.amazonaws.us-east-1.s3 \
  --route-table-ids rtb-...
ECR Interface Endpoints (charges apply):
# ECR API endpoint
aws ec2 create-vpc-endpoint \
  --vpc-id vpc-... \
  --service-name com.amazonaws.us-east-1.ecr.api \
  --vpc-endpoint-type Interface \
  --subnet-ids subnet-... \
  --security-group-ids sg-...

# ECR Docker endpoint
aws ec2 create-vpc-endpoint \
  --vpc-id vpc-... \
  --service-name com.amazonaws.us-east-1.ecr.dkr \
  --vpc-endpoint-type Interface \
  --subnet-ids subnet-... \
  --security-group-ids sg-...
CloudWatch Logs Endpoint:
# CloudWatch Logs interface endpoint
aws ec2 create-vpc-endpoint \
  --vpc-id vpc-... \
  --service-name com.amazonaws.us-east-1.logs \
  --vpc-endpoint-type Interface \
  --subnet-ids subnet-... \
  --security-group-ids sg-...
Benefits: - All AWS API traffic stays within AWS network - Reduced data transfer costs - Better security posture (no internet exposure)
Create AWS Budget to monitor staRburst costs:
staRburst enforces maximum 500 workers per plan:
# This will error - prevents accidental huge deployments
plan(starburst, workers = 10000)
# Error: Workers must be <= 500

# Maximum allowed
plan(starburst, workers = 500)
Regularly check for orphaned sessions:
# List all sessions
sessions <- starburst_list_sessions()
print(sessions)

# Cleanup sessions older than a week
for (session_id in sessions$session_id) {
  created <- as.Date(sessions$created_at[sessions$session_id == session_id])
  age_days <- Sys.Date() - created
  if (age_days > 7) {
    cat(sprintf("Cleaning up old session: %s\n", session_id))
    session <- starburst_session_attach(session_id)
    session$cleanup(stop_workers = TRUE, force = TRUE)
  }
}
Prevent sessions from running indefinitely:
# Session auto-terminates after 24 hours
session <- starburst_session(
  workers = 10,
  absolute_timeout = 86400  # 24 hours in seconds
)
Before launching large jobs:
# Estimate Fargate cost before launching a large job
workers <- 100
cpu <- 4            # vCPUs per worker
memory_gb <- 8      # GiB per worker
runtime_hours <- 2

# Fargate pricing (us-east-1, 2026)
vcpu_price <- 0.04048     # $ per vCPU-hour
memory_price <- 0.004445  # $ per GiB-hour

cost_per_worker <- (cpu * vcpu_price) + (memory_gb * memory_price)
total_cost <- workers * cost_per_worker * runtime_hours

cat(sprintf("Estimated cost: $%.2f for %d hours\n", total_cost, runtime_hours))
# Estimated cost: $39.50 for 2 hours
Capture all API calls for security auditing:
What CloudTrail Captures: - Who launched staRburst workers (IAM user/role) - When tasks were created/stopped - S3 object access (if data events enabled) - API failures and authorization errors
staRburst automatically logs to CloudWatch:
Worker logs go to the log group /aws/ecs/starburst-worker. Increase retention for compliance:
library(paws.management)
logs <- paws.management::cloudwatchlogs()

# Keep worker logs for a fixed window instead of forever
logs$put_retention_policy(
  logGroupName = "/aws/ecs/starburst-worker",
  retentionInDays = 30  # or 90, 180, 365, etc.
)
Track all S3 bucket access:
# Deliver S3 server access logs to a separate logging bucket
s3$put_bucket_logging(
  Bucket = "my-starburst-bucket",
  BucketLoggingStatus = list(
    LoggingEnabled = list(
      TargetBucket = "my-logging-bucket",
      TargetPrefix = "starburst-access-logs/"
    )
  )
)
Access logs include: - Who accessed objects - When objects were accessed - What operations were performed - Source IP addresses
Regularly review logs for suspicious activity:
# CloudWatch Insights query for failed authentications
library(paws.management)
logs <- paws.management::cloudwatchlogs()

# Query for errors in last 24 hours
query <- "fields @timestamp, @message | filter @message like /ERROR|AccessDenied|Forbidden/ | sort @timestamp desc"

start_time <- as.integer(Sys.time() - 86400)  # 24 hours ago (epoch seconds)
end_time <- as.integer(Sys.time())

result <- logs$start_query(
  logGroupName = "/aws/ecs/starburst-worker",
  startTime = start_time,
  endTime = end_time,
  queryString = query
)
Only send necessary data to workers:
# ✅ Good - only send needed data
large_data <- read.csv("huge_dataset.csv")
sample_data <- large_data[1:1000, ]  # Use sample for testing

plan(starburst, workers = 10)
results <- future_map(1:10, function(i) {
  # sample_data automatically uploaded as global
  analyze(sample_data[i, ])
})

# ❌ Bad - sends entire large dataset
plan(starburst, workers = 10)
results <- future_map(1:10, function(i) {
  analyze(large_data[i, ])  # Uploads all of large_data
})
# Collect and immediately delete from S3
session <- starburst_session(workers = 10)
task_id <- session$submit(quote(sensitive_computation()))

# Wait for completion
results <- session$collect(wait = TRUE)

# Immediately cleanup (force = TRUE deletes S3 files)
session$cleanup(force = TRUE)

# Save results locally if needed
saveRDS(results, "local_results.rds")
For highly sensitive data:
# Encrypt before upload
library(sodium)

# Generate key (store securely, e.g., AWS Secrets Manager)
key <- random(32)

# Encrypt data before passing to future
sensitive_data <- serialize(my_data, NULL)
encrypted_data <- data_encrypt(sensitive_data, key)

# Run computation
plan(starburst, workers = 1)
result <- future({
  # Decrypt in worker
  # NOTE(review): `key` is shipped to the worker as a global alongside the
  # ciphertext; for stricter isolation fetch the key inside the worker
  # (e.g. from Secrets Manager) — confirm the intended threat model.
  decrypted <- data_decrypt(encrypted_data, key)
  my_data <- unserialize(decrypted)
  # Process...
})
staRburst uses multi-stage builds to minimize image size:
Enable ECR image scanning:
library(paws.compute)
ecr <- paws.compute::ecr()

# Scan every image for known CVEs as it is pushed
ecr$put_image_scanning_configuration(
  repositoryName = "starburst-worker",
  imageScanningConfiguration = list(scanOnPush = TRUE)
)
Automatically delete old images:
# Delete images older than 30 days
ecr$put_lifecycle_policy(
  repositoryName = "starburst-worker",
  lifecyclePolicyText = jsonlite::toJSON(
    list(
      rules = list(
        list(
          rulePriority = 1,
          description = "Delete old images",
          selection = list(
            tagStatus = "any",
            countType = "sinceImagePushed",
            countUnit = "days",
            countNumber = 30
          ),
          action = list(type = "expire")
        )
      )
    ),
    auto_unbox = TRUE
  )
)
For HIPAA-regulated workloads:
For EU data processing:
For SOC 2 compliance:
Immediate Actions: 1. Disable/rotate credentials immediately 2. Review CloudTrail logs for unauthorized activity 3. Check for new resources created by attacker 4. Terminate suspicious ECS tasks 5. Review S3 bucket access logs
# List all active sessions
sessions <- starburst_list_sessions()

# Check for sessions you didn't create
print(sessions)

# Cleanup suspicious sessions
# (suspicious_ids: character vector of session ids you flagged above)
for (session_id in suspicious_ids) {
  session <- starburst_session_attach(session_id)
  session$cleanup(stop_workers = TRUE, force = TRUE)
}
Before deploying staRburst to production:
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.