Nothing
# =====================================
# RNA-Seq Analysis Example Workflow
# =====================================
#
# NOTE: This example uses tempdir() to comply with CRAN policies.
# In real usage, you would run this in your project directory.
# Clear environment
rm(list = ls())
library(Capsule)
# Create project directory in tempdir() (CRAN compliant)
project_dir <- file.path(tempdir(), "Capsule_full_test")
if (dir.exists(project_dir)) {
unlink(project_dir, recursive = TRUE)
}
dir.create(project_dir, showWarnings = FALSE)
# Store original directory
original_dir <- getwd()
setwd(project_dir)
# Ensure we return to original directory on exit
on.exit({
setwd(original_dir)
# Clean up temp directory
unlink(project_dir, recursive = TRUE)
}, add = TRUE)
cat("Working in temporary directory:", project_dir, "\n")
# ============================================================================
# Step 1: Initialize Capsule
# ============================================================================
cat("=== Step 1: Initializing Capsule ===\n")
init_capsule(
use_renv = FALSE, # Disable to avoid interactive prompts
use_git = FALSE,
create_gitignore = TRUE
)
cat("\nDirectory structure created:\n")
cat("✓ .capsule/\n")
cat("✓ .capsule/scripts/\n")
cat("✓ .capsule/snapshots/\n")
# ============================================================================
# Step 2: Set Random Seed
# ============================================================================
cat("\n=== Step 2: Setting Random Seed ===\n")
set_seed(42, analysis_name = "simulated_rnaseq_de",
registry_file = ".capsule/seed_registry.json")
# ============================================================================
# Step 3: Generate Simulated Data
# ============================================================================
cat("\n=== Step 3: Generating Simulated RNA-seq Data ===\n")
set.seed(42) # For reproducibility of simulation
# Parameters
n_genes <- 1000
n_samples <- 12
# Sample metadata
sample_ids <- paste0("Sample_", 1:n_samples)
conditions <- rep(c("Control", "Tumor"), each = 6)
batch <- rep(c("Batch1", "Batch2"), times = 6)
metadata <- data.frame(
sample_id = sample_ids,
condition = factor(conditions),
batch = factor(batch),
age = sample(40:70, n_samples, replace = TRUE),
gender = sample(c("M", "F"), n_samples, replace = TRUE),
row.names = sample_ids
)
# Generate count matrix
gene_names <- paste0("GENE_", sprintf("%04d", 1:n_genes))
counts_matrix <- matrix(
rpois(n_genes * n_samples, lambda = 100),
nrow = n_genes,
ncol = n_samples,
dimnames = list(gene_names, sample_ids)
)
# Add differential expression (100 true DE genes)
de_gene_idx <- sample(1:n_genes, 100)
fc <- runif(100, min = 2, max = 5) # Fold changes between 2-5x
for (i in seq_len(100)) {
counts_matrix[de_gene_idx[i], 7:12] <-
round(counts_matrix[de_gene_idx[i], 7:12] * fc[i])
}
cat("Generated:", n_genes, "genes ×", n_samples, "samples\n")
cat("True DE genes:", length(de_gene_idx), "\n")
# ============================================================================
# Step 4: Save Data Files
# ============================================================================
cat("\n=== Step 4: Saving Data Files ===\n")
dir.create("data", showWarnings = FALSE)
write.csv(counts_matrix, "data/counts.csv")
write.csv(metadata, "data/metadata.csv")
# Save ground truth
ground_truth <- data.frame(
gene = gene_names[de_gene_idx],
true_fc = fc,
stringsAsFactors = FALSE
)
write.csv(ground_truth, "data/ground_truth.csv", row.names = FALSE)
cat("✓ data/counts.csv\n")
cat("✓ data/metadata.csv\n")
cat("✓ data/ground_truth.csv\n")
# ============================================================================
# Step 5: Track Data with Capsule
# ============================================================================
cat("\n=== Step 5: Tracking Data Provenance ===\n")
track_data(
"data/counts.csv",
source = "generated",
description = "Simulated RNA-seq count matrix",
metadata = list(n_genes = n_genes, n_samples = n_samples),
registry_file = ".capsule/data_registry.json"
)
track_data(
"data/metadata.csv",
source = "generated",
description = "Sample metadata with clinical information",
registry_file = ".capsule/data_registry.json"
)
track_data(
"data/ground_truth.csv",
source = "generated",
description = "Ground truth DE genes for validation",
registry_file = ".capsule/data_registry.json"
)
# ============================================================================
# Step 6: Track Analysis Parameters
# ============================================================================
cat("\n=== Step 6: Tracking Analysis Parameters ===\n")
analysis_params <- list(
alpha = 0.05,
lfc_threshold = log2(1.5),
min_count = 10,
p_adjust_method = "BH",
design_formula = "~ batch + condition"
)
track_params(
analysis_params,
analysis_name = "simulated_rnaseq_de",
description = "Differential expression analysis parameters",
registry_file = ".capsule/param_registry.json"
)
# ============================================================================
# Step 7: Create Analysis Script
# ============================================================================
cat("\n=== Step 7: Creating Analysis Script ===\n")
analysis_code <- '#!/usr/bin/env Rscript
# RNA-seq Differential Expression Analysis
# Auto-generated by Capsule test
library(Capsule)
cat("Loading data...\\n")
counts <- read.csv("data/counts.csv", row.names = 1)
metadata <- read.csv("data/metadata.csv", row.names = 1)
cat("Samples:", ncol(counts), "\\n")
cat("Genes:", nrow(counts), "\\n")
# Filter low count genes
min_count <- 10
min_samples <- 3
keep <- rowSums(counts >= min_count) >= min_samples
counts_filtered <- counts[keep, ]
cat("After filtering:", nrow(counts_filtered), "genes\\n")
# Calculate means
control_idx <- metadata$condition == "Control"
tumor_idx <- metadata$condition == "Tumor"
control_mean <- rowMeans(counts_filtered[, control_idx])
tumor_mean <- rowMeans(counts_filtered[, tumor_idx])
# Log2 fold change
log2fc <- log2((tumor_mean + 1) / (control_mean + 1))
# T-test for each gene
cat("Running statistical tests...\\n")
pvalues <- apply(counts_filtered, 1, function(gene_counts) {
tryCatch({
t.test(gene_counts[control_idx], gene_counts[tumor_idx])$p.value
}, error = function(e) 1.0)
})
# Adjust p-values
padj <- p.adjust(pvalues, method = "BH")
# Create results table
results <- data.frame(
gene = rownames(counts_filtered),
baseMean = (control_mean + tumor_mean) / 2,
log2FoldChange = log2fc,
pvalue = pvalues,
padj = padj,
significant = (padj < 0.05) & (abs(log2fc) > log2(1.5)),
stringsAsFactors = FALSE
)
# Sort by p-value
results <- results[order(results$pvalue), ]
# Summary
n_sig <- sum(results$significant, na.rm = TRUE)
n_up <- sum(results$significant & results$log2FoldChange > 0, na.rm = TRUE)
n_down <- sum(results$significant & results$log2FoldChange < 0, na.rm = TRUE)
cat("\\nResults Summary:\\n")
cat(" Total genes tested:", nrow(results), "\\n")
cat(" Significant DE genes:", n_sig, "\\n")
cat(" Up-regulated:", n_up, "\\n")
cat(" Down-regulated:", n_down, "\\n")
# Save results
dir.create("results", showWarnings = FALSE)
write.csv(results, "results/de_results.csv", row.names = FALSE)
sig_results <- results[results$significant & !is.na(results$significant), ]
write.csv(sig_results, "results/significant_genes.csv", row.names = FALSE)
cat("\\nResults saved to results/ directory\\n")
# Track output
track_data("results/de_results.csv", source = "generated",
description = "Complete DE analysis results",
registry_file = ".capsule/data_registry.json")
track_data("results/significant_genes.csv", source = "generated",
description = "Significant DE genes only",
registry_file = ".capsule/data_registry.json")
cat("Analysis complete!\\n")
'
dir.create("scripts", showWarnings = FALSE)
writeLines(analysis_code, "scripts/de_analysis.R")
cat("✓ scripts/de_analysis.R\n")
# Make it executable (Unix-like systems)
if (.Platform$OS.type == "unix") {
Sys.chmod("scripts/de_analysis.R", mode = "0755")
}
# ============================================================================
# Step 8: Run the Analysis
# ============================================================================
cat("\n=== Step 8: Running Analysis Script ===\n")
source("scripts/de_analysis.R")
# ============================================================================
# Step 9: Validate Results
# ============================================================================
cat("\n=== Step 9: Validating Against Ground Truth ===\n")
results <- read.csv("results/significant_genes.csv")
ground_truth <- read.csv("data/ground_truth.csv")
detected_genes <- results$gene
true_genes <- ground_truth$gene
tp <- sum(detected_genes %in% true_genes)
fp <- sum(!(detected_genes %in% true_genes))
fn <- sum(!(true_genes %in% detected_genes))
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
f1 <- 2 * (precision * recall) / (precision + recall)
cat("\nValidation Metrics:\n")
cat(" True Positives:", tp, "\n")
cat(" False Positives:", fp, "\n")
cat(" False Negatives:", fn, "\n")
cat(" Precision:", round(precision, 3), "\n")
cat(" Recall:", round(recall, 3), "\n")
cat(" F1 Score:", round(f1, 3), "\n")
# ============================================================================
# Step 10: Verify Data Integrity
# ============================================================================
cat("\n=== Step 10: Verifying Data Integrity ===\n")
verify_result <- verify_data()
# ============================================================================
# Step 11: Create Workflow Snapshot
# ============================================================================
cat("\n=== Step 11: Creating Complete Workflow Snapshot ===\n")
snapshot_files <- snapshot_workflow(
snapshot_name = "test_complete",
analysis_name = "simulated_rnaseq_de",
source_script = "scripts/de_analysis.R",
description = "Complete RNA-seq DE analysis with validation",
generate_docker = TRUE,
generate_script = TRUE,
generate_report = TRUE
)
# ============================================================================
# Step 12: Inspect What Was Created
# ============================================================================
cat("\n=== Step 12: Inspecting Generated Artifacts ===\n")
snapshot_dir <- ".capsule/snapshots/test_complete"
cat("\n1. Snapshot Directory Contents:\n")
if (dir.exists(snapshot_dir)) {
files <- list.files(snapshot_dir, recursive = TRUE)
for (f in files) {
size <- file.info(file.path(snapshot_dir, f))$size
cat(sprintf(" ✓ %-50s %8d bytes\n", f, size))
}
} else {
cat(" ✗ Snapshot directory not found!\n")
}
cat("\n2. Reproducible Script Preview:\n")
repro_script <- file.path(snapshot_dir, "simulated_rnaseq_de_reproducible.R")
if (file.exists(repro_script)) {
lines <- readLines(repro_script, n = 25)
cat(" First 20 lines:\n")
cat(" ", paste(rep("-", 70), collapse = ""), "\n", sep = "")
for (i in seq_len(min(20, length(lines)))) {
cat(" ", lines[i], "\n", sep = "")
}
cat(" ", paste(rep("-", 70), collapse = ""), "\n", sep = "")
}
cat("\n3. Docker Configuration:\n")
dockerfile <- file.path(snapshot_dir, "docker/Dockerfile")
if (file.exists(dockerfile)) {
lines <- readLines(dockerfile, n = 15)
cat(" Dockerfile preview:\n")
for (i in seq_len(min(10, length(lines)))) {
cat(" ", lines[i], "\n", sep = "")
}
}
cat("\n4. Reproducibility Report:\n")
report <- file.path(snapshot_dir, "reproducibility_report.md")
if (file.exists(report)) {
cat(" ✓ Report generated at:", report, "\n")
cat(" File size:", file.info(report)$size, "bytes\n")
}
cat("\n5. Registries:\n")
for (reg in c("data_registry.json", "param_registry.json", "seed_registry.json")) {
reg_path <- file.path(".capsule", reg)
if (file.exists(reg_path)) {
cat(" ✓", reg, "-", file.info(reg_path)$size, "bytes\n")
}
}
# ============================================================================
# Final Summary
# ============================================================================
cat("\n")
cat("=====================================================================\n")
cat(" Capsule Test Successfully Completed \n")
cat("=====================================================================\n")
cat("\n")
cat("Summary of Generated Artifacts:\n")
cat("\n")
cat("📁 Project Structure:\n")
cat(" ├── data/ # Input data\n")
cat(" ├── scripts/ # Analysis scripts\n")
cat(" ├── results/ # Analysis outputs\n")
cat(" └── .capsule/ # Capsule tracking\n")
cat(" ├── *_registry.json # Tracking registries\n")
cat(" └── snapshots/ # Complete snapshots\n")
cat(" └── test_complete/\n")
cat(" ├── *_reproducible.R # Executable script\n")
cat(" ├── docker/ # Docker config\n")
cat(" └── *.json # Metadata\n")
cat("\n")
cat("✓ Data provenance tracked\n")
cat("✓ Random seed recorded\n")
cat("✓ Parameters documented\n")
cat("✓ Analysis executed\n")
cat("✓ Results validated (F1 =", round(f1, 3), ")\n")
cat("✓ Complete snapshot created\n")
cat("\n")
cat("Next Steps:\n")
cat("1. View reproducibility report:\n")
cat(" cat", report, "\n")
cat("\n")
cat("2. Test the reproducible script:\n")
cat(" cd", project_dir, "\n")
cat(" Rscript", repro_script, "\n")
cat("\n")
cat("3. Use Docker for full reproducibility:\n")
cat(" cd", file.path(project_dir, snapshot_dir, "docker"), "\n")
cat(" docker-compose up\n")
cat("\n")
cat("=====================================================================\n")
cat("\nTest completed successfully! ✨\n\n")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.