inst/examples/example.R

# =====================================
# RNA-Seq Analysis Example Workflow
# =====================================
#
# NOTE: This example uses tempdir() to comply with CRAN policies.
# In real usage, you would run this in your project directory.

# Clear environment
rm(list = ls())

library(Capsule)

# Create project directory in tempdir() (CRAN compliant)
project_dir <- file.path(tempdir(), "Capsule_full_test")
if (dir.exists(project_dir)) {
  unlink(project_dir, recursive = TRUE)
}
dir.create(project_dir, showWarnings = FALSE)

# Store original directory
original_dir <- getwd()
setwd(project_dir)

# Ensure we return to original directory on exit
on.exit({
  setwd(original_dir)
  # Clean up temp directory
  unlink(project_dir, recursive = TRUE)
}, add = TRUE)

cat("Working in temporary directory:", project_dir, "\n")

# ============================================================================
# Step 1: Initialize Capsule
# ============================================================================
cat("=== Step 1: Initializing Capsule ===\n")
init_capsule(
  use_renv = FALSE,  # Disable to avoid interactive prompts
  use_git = FALSE,
  create_gitignore = TRUE
)

cat("\nDirectory structure created:\n")
cat("✓ .capsule/\n")
cat("✓ .capsule/scripts/\n")
cat("✓ .capsule/snapshots/\n")

# ============================================================================
# Step 2: Set Random Seed
# ============================================================================
cat("\n=== Step 2: Setting Random Seed ===\n")
set_seed(42, analysis_name = "simulated_rnaseq_de",
         registry_file = ".capsule/seed_registry.json")

# ============================================================================
# Step 3: Generate Simulated Data
# ============================================================================
cat("\n=== Step 3: Generating Simulated RNA-seq Data ===\n")

set.seed(42)  # For reproducibility of simulation

# Parameters
n_genes <- 1000
n_samples <- 12

# Sample metadata
sample_ids <- paste0("Sample_", 1:n_samples)
conditions <- rep(c("Control", "Tumor"), each = 6)
batch <- rep(c("Batch1", "Batch2"), times = 6)

metadata <- data.frame(
  sample_id = sample_ids,
  condition = factor(conditions),
  batch = factor(batch),
  age = sample(40:70, n_samples, replace = TRUE),
  gender = sample(c("M", "F"), n_samples, replace = TRUE),
  row.names = sample_ids
)

# Generate count matrix
gene_names <- paste0("GENE_", sprintf("%04d", 1:n_genes))
counts_matrix <- matrix(
  rpois(n_genes * n_samples, lambda = 100),
  nrow = n_genes,
  ncol = n_samples,
  dimnames = list(gene_names, sample_ids)
)

# Add differential expression (100 true DE genes)
de_gene_idx <- sample(1:n_genes, 100)
fc <- runif(100, min = 2, max = 5)  # Fold changes between 2-5x
for (i in seq_len(100)) {
  counts_matrix[de_gene_idx[i], 7:12] <-
    round(counts_matrix[de_gene_idx[i], 7:12] * fc[i])
}

cat("Generated:", n_genes, "genes ×", n_samples, "samples\n")
cat("True DE genes:", length(de_gene_idx), "\n")

# ============================================================================
# Step 4: Save Data Files
# ============================================================================
cat("\n=== Step 4: Saving Data Files ===\n")
dir.create("data", showWarnings = FALSE)

write.csv(counts_matrix, "data/counts.csv")
write.csv(metadata, "data/metadata.csv")

# Save ground truth
ground_truth <- data.frame(
  gene = gene_names[de_gene_idx],
  true_fc = fc,
  stringsAsFactors = FALSE
)
write.csv(ground_truth, "data/ground_truth.csv", row.names = FALSE)

cat("✓ data/counts.csv\n")
cat("✓ data/metadata.csv\n")
cat("✓ data/ground_truth.csv\n")

# ============================================================================
# Step 5: Track Data with Capsule
# ============================================================================
cat("\n=== Step 5: Tracking Data Provenance ===\n")

track_data(
  "data/counts.csv",
  source = "generated",
  description = "Simulated RNA-seq count matrix",
  metadata = list(n_genes = n_genes, n_samples = n_samples),
  registry_file = ".capsule/data_registry.json"
)

track_data(
  "data/metadata.csv",
  source = "generated",
  description = "Sample metadata with clinical information",
  registry_file = ".capsule/data_registry.json"
)

track_data(
  "data/ground_truth.csv",
  source = "generated",
  description = "Ground truth DE genes for validation",
  registry_file = ".capsule/data_registry.json"
)

# ============================================================================
# Step 6: Track Analysis Parameters
# ============================================================================
cat("\n=== Step 6: Tracking Analysis Parameters ===\n")

analysis_params <- list(
  alpha = 0.05,
  lfc_threshold = log2(1.5),
  min_count = 10,
  p_adjust_method = "BH",
  design_formula = "~ batch + condition"
)

track_params(
  analysis_params,
  analysis_name = "simulated_rnaseq_de",
  description = "Differential expression analysis parameters",
  registry_file = ".capsule/param_registry.json"
)

# ============================================================================
# Step 7: Create Analysis Script
# ============================================================================
cat("\n=== Step 7: Creating Analysis Script ===\n")

analysis_code <- '#!/usr/bin/env Rscript
# RNA-seq Differential Expression Analysis
# Auto-generated by Capsule test

library(Capsule)

cat("Loading data...\\n")
counts <- read.csv("data/counts.csv", row.names = 1)
metadata <- read.csv("data/metadata.csv", row.names = 1)

cat("Samples:", ncol(counts), "\\n")
cat("Genes:", nrow(counts), "\\n")

# Filter low count genes
min_count <- 10
min_samples <- 3
keep <- rowSums(counts >= min_count) >= min_samples
counts_filtered <- counts[keep, ]
cat("After filtering:", nrow(counts_filtered), "genes\\n")

# Calculate means
control_idx <- metadata$condition == "Control"
tumor_idx <- metadata$condition == "Tumor"

control_mean <- rowMeans(counts_filtered[, control_idx])
tumor_mean <- rowMeans(counts_filtered[, tumor_idx])

# Log2 fold change
log2fc <- log2((tumor_mean + 1) / (control_mean + 1))

# T-test for each gene
cat("Running statistical tests...\\n")
pvalues <- apply(counts_filtered, 1, function(gene_counts) {
  tryCatch({
    t.test(gene_counts[control_idx], gene_counts[tumor_idx])$p.value
  }, error = function(e) 1.0)
})

# Adjust p-values
padj <- p.adjust(pvalues, method = "BH")

# Create results table
results <- data.frame(
  gene = rownames(counts_filtered),
  baseMean = (control_mean + tumor_mean) / 2,
  log2FoldChange = log2fc,
  pvalue = pvalues,
  padj = padj,
  significant = (padj < 0.05) & (abs(log2fc) > log2(1.5)),
  stringsAsFactors = FALSE
)

# Sort by p-value
results <- results[order(results$pvalue), ]

# Summary
n_sig <- sum(results$significant, na.rm = TRUE)
n_up <- sum(results$significant & results$log2FoldChange > 0, na.rm = TRUE)
n_down <- sum(results$significant & results$log2FoldChange < 0, na.rm = TRUE)

cat("\\nResults Summary:\\n")
cat("  Total genes tested:", nrow(results), "\\n")
cat("  Significant DE genes:", n_sig, "\\n")
cat("  Up-regulated:", n_up, "\\n")
cat("  Down-regulated:", n_down, "\\n")

# Save results
dir.create("results", showWarnings = FALSE)
write.csv(results, "results/de_results.csv", row.names = FALSE)

sig_results <- results[results$significant & !is.na(results$significant), ]
write.csv(sig_results, "results/significant_genes.csv", row.names = FALSE)

cat("\\nResults saved to results/ directory\\n")

# Track output
track_data("results/de_results.csv", source = "generated",
           description = "Complete DE analysis results",
           registry_file = ".capsule/data_registry.json")
track_data("results/significant_genes.csv", source = "generated",
           description = "Significant DE genes only",
           registry_file = ".capsule/data_registry.json")

cat("Analysis complete!\\n")
'

dir.create("scripts", showWarnings = FALSE)
writeLines(analysis_code, "scripts/de_analysis.R")
cat("✓ scripts/de_analysis.R\n")

# Make it executable (Unix-like systems)
if (.Platform$OS.type == "unix") {
  Sys.chmod("scripts/de_analysis.R", mode = "0755")
}

# ============================================================================
# Step 8: Run the Analysis
# ============================================================================
cat("\n=== Step 8: Running Analysis Script ===\n")
source("scripts/de_analysis.R")

# ============================================================================
# Step 9: Validate Results
# ============================================================================
cat("\n=== Step 9: Validating Against Ground Truth ===\n")

results <- read.csv("results/significant_genes.csv")
ground_truth <- read.csv("data/ground_truth.csv")

detected_genes <- results$gene
true_genes <- ground_truth$gene

tp <- sum(detected_genes %in% true_genes)
fp <- sum(!(detected_genes %in% true_genes))
fn <- sum(!(true_genes %in% detected_genes))

precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
f1 <- 2 * (precision * recall) / (precision + recall)

cat("\nValidation Metrics:\n")
cat("  True Positives:", tp, "\n")
cat("  False Positives:", fp, "\n")
cat("  False Negatives:", fn, "\n")
cat("  Precision:", round(precision, 3), "\n")
cat("  Recall:", round(recall, 3), "\n")
cat("  F1 Score:", round(f1, 3), "\n")

# ============================================================================
# Step 10: Verify Data Integrity
# ============================================================================
cat("\n=== Step 10: Verifying Data Integrity ===\n")
verify_result <- verify_data()

# ============================================================================
# Step 11: Create Workflow Snapshot
# ============================================================================
cat("\n=== Step 11: Creating Complete Workflow Snapshot ===\n")

snapshot_files <- snapshot_workflow(
  snapshot_name = "test_complete",
  analysis_name = "simulated_rnaseq_de",
  source_script = "scripts/de_analysis.R",
  description = "Complete RNA-seq DE analysis with validation",
  generate_docker = TRUE,
  generate_script = TRUE,
  generate_report = TRUE
)

# ============================================================================
# Step 12: Inspect What Was Created
# ============================================================================
cat("\n=== Step 12: Inspecting Generated Artifacts ===\n")

snapshot_dir <- ".capsule/snapshots/test_complete"

cat("\n1. Snapshot Directory Contents:\n")
if (dir.exists(snapshot_dir)) {
  files <- list.files(snapshot_dir, recursive = TRUE)
  for (f in files) {
    size <- file.info(file.path(snapshot_dir, f))$size
    cat(sprintf("  ✓ %-50s %8d bytes\n", f, size))
  }
} else {
  cat("  ✗ Snapshot directory not found!\n")
}

cat("\n2. Reproducible Script Preview:\n")
repro_script <- file.path(snapshot_dir, "simulated_rnaseq_de_reproducible.R")
if (file.exists(repro_script)) {
  lines <- readLines(repro_script, n = 25)
  cat("  First 20 lines:\n")
  cat("  ", paste(rep("-", 70), collapse = ""), "\n", sep = "")
  for (i in seq_len(min(20, length(lines)))) {
    cat("  ", lines[i], "\n", sep = "")
  }
  cat("  ", paste(rep("-", 70), collapse = ""), "\n", sep = "")
}

cat("\n3. Docker Configuration:\n")
dockerfile <- file.path(snapshot_dir, "docker/Dockerfile")
if (file.exists(dockerfile)) {
  lines <- readLines(dockerfile, n = 15)
  cat("  Dockerfile preview:\n")
  for (i in seq_len(min(10, length(lines)))) {
    cat("  ", lines[i], "\n", sep = "")
  }
}

cat("\n4. Reproducibility Report:\n")
report <- file.path(snapshot_dir, "reproducibility_report.md")
if (file.exists(report)) {
  cat("  ✓ Report generated at:", report, "\n")
  cat("  File size:", file.info(report)$size, "bytes\n")
}

cat("\n5. Registries:\n")
for (reg in c("data_registry.json", "param_registry.json", "seed_registry.json")) {
  reg_path <- file.path(".capsule", reg)
  if (file.exists(reg_path)) {
    cat("  ✓", reg, "-", file.info(reg_path)$size, "bytes\n")
  }
}

# ============================================================================
# Final Summary
# ============================================================================
cat("\n")
cat("=====================================================================\n")
cat("                  Capsule Test Successfully Completed              \n")
cat("=====================================================================\n")
cat("\n")
cat("Summary of Generated Artifacts:\n")
cat("\n")
cat("📁 Project Structure:\n")
cat("  ├── data/                    # Input data\n")
cat("  ├── scripts/                 # Analysis scripts\n")
cat("  ├── results/                 # Analysis outputs\n")
cat("  └── .capsule/              # Capsule tracking\n")
cat("      ├── *_registry.json      # Tracking registries\n")
cat("      └── snapshots/           # Complete snapshots\n")
cat("          └── test_complete/\n")
cat("              ├── *_reproducible.R    # Executable script\n")
cat("              ├── docker/             # Docker config\n")
cat("              └── *.json              # Metadata\n")
cat("\n")
cat("✓ Data provenance tracked\n")
cat("✓ Random seed recorded\n")
cat("✓ Parameters documented\n")
cat("✓ Analysis executed\n")
cat("✓ Results validated (F1 =", round(f1, 3), ")\n")
cat("✓ Complete snapshot created\n")
cat("\n")
cat("Next Steps:\n")
cat("1. View reproducibility report:\n")
cat("   cat", report, "\n")
cat("\n")
cat("2. Test the reproducible script:\n")
cat("   cd", project_dir, "\n")
cat("   Rscript", repro_script, "\n")
cat("\n")
cat("3. Use Docker for full reproducibility:\n")
cat("   cd", file.path(project_dir, snapshot_dir, "docker"), "\n")
cat("   docker-compose up\n")
cat("\n")
cat("=====================================================================\n")

cat("\nTest completed successfully! ✨\n\n")

Try the Capsule package in your browser

Any scripts or data that you put into this service are public.

Capsule documentation built on Nov. 11, 2025, 5:14 p.m.