Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.width = 10,
fig.height = 7,
dpi = 300,
out.width = "100%"
)
## ----setup, message = FALSE---------------------------------------------------
library(evanverse)
library(dplyr)
library(ggplot2)
library(tidyr)
## ----gene-conversion-demo, eval = FALSE---------------------------------------
# # Example gene symbols commonly used in cancer research
# cancer_genes <- c("BRCA1", "BRCA2", "TP53", "EGFR", "MYC", "RAS", "PIK3CA", "AKT1")
#
# # Convert gene symbols to Ensembl IDs
# ensembl_ids <- convert_gene_id(
# genes = cancer_genes,
# from = "symbol",
# to = "ensembl",
# species = "human"
# )
#
# # Display conversion results
# conversion_table <- data.frame(
# Gene_Symbol = cancer_genes,
# Ensembl_ID = ensembl_ids
# )
#
# print(conversion_table)
## ----gene-conversion-mock-----------------------------------------------------
# Mock example for demonstration (since biomaRt requires internet)
cancer_genes <- c("BRCA1", "BRCA2", "TP53", "EGFR", "MYC", "KRAS", "PIK3CA", "AKT1")
# Simulated conversion results
mock_conversion <- data.frame(
Gene_Symbol = cancer_genes,
Ensembl_ID = c(
"ENSG00000012048", "ENSG00000139618", "ENSG00000141510",
"ENSG00000146648", "ENSG00000136997", "ENSG00000133703",
"ENSG00000171608", "ENSG00000142208"
),
Entrez_ID = c(672, 675, 7157, 1956, 4609, 3845, 5290, 207),
stringsAsFactors = FALSE
)
cat("𧬠Gene ID Conversion Example\n")
cat("=============================\n")
print(mock_conversion)
cat("\nš Conversion Summary:\n")
cat(" ⢠Input genes:", length(cancer_genes), "\n")
cat(" ⢠Successful conversions:", nrow(mock_conversion), "\n")
cat(" ⢠Success rate:", round(100 * nrow(mock_conversion) / length(cancer_genes), 1), "%\n")
## ----conversion-workflow------------------------------------------------------
# Simulate a real-world scenario with mixed identifier types
mixed_identifiers <- c(
"BRCA1", "ENSG00000139618", "7157", "EGFR",
"ENSG00000136997", "3845", "PIK3CA", "207"
)
# Function to detect identifier type
detect_id_type <- function(ids) {
sapply(ids, function(id) {
if (grepl("^ENSG", id)) return("ensembl")
if (grepl("^[0-9]+$", id)) return("entrez")
return("symbol")
})
}
id_types <- detect_id_type(mixed_identifiers)
cat("š Identifier Type Detection:\n")
print(data.frame(
Identifier = mixed_identifiers,
Detected_Type = id_types
))
# Group by identifier type for batch conversion
id_groups <- split(mixed_identifiers, id_types)
cat("\nš¦ Grouped Identifiers for Conversion:\n")
str(id_groups)
## ----gmt-processing-demo, eval = FALSE----------------------------------------
# # Example: Process a pathway GMT file
# # pathway_df <- gmt2df("path/to/c2.cp.kegg.v7.4.symbols.gmt")
# # pathway_list <- gmt2list("path/to/c2.cp.kegg.v7.4.symbols.gmt")
#
# # Display structure
# # head(pathway_df, 10)
# # length(pathway_list)
## ----gmt-mock-demo------------------------------------------------------------
# Create mock GMT data to demonstrate structure
mock_pathways <- list(
"KEGG_GLYCOLYSIS_GLUCONEOGENESIS" = c(
"HK1", "HK2", "GPI", "PFKL", "ALDOA", "TPI1", "GAPDH",
"PGK1", "PGAM1", "ENO1", "PKM", "LDHA", "PDK1"
),
"KEGG_CITRATE_CYCLE" = c(
"CS", "ACO1", "IDH1", "OGDH", "SUCLA2", "SDHA",
"FH", "MDH1", "PCK1", "PDK1", "DLAT"
),
"KEGG_FATTY_ACID_SYNTHESIS" = c(
"ACACA", "FASN", "ACLY", "ACC2", "ELOVL6", "SCD",
"FADS1", "FADS2", "ACSL1", "GPAM"
),
"KEGG_DNA_REPAIR" = c(
"BRCA1", "BRCA2", "TP53", "ATM", "CHEK1", "CHEK2",
"RAD51", "XRCC1", "PARP1", "MSH2", "MLH1"
)
)
# Convert list to data frame format (simulating gmt2df output)
mock_gmt_df <- do.call(rbind, lapply(names(mock_pathways), function(pathway) {
data.frame(
pathway = pathway,
gene = mock_pathways[[pathway]],
stringsAsFactors = FALSE
)
}))
cat("š GMT File Processing Results\n")
cat("==============================\n")
cat("Number of pathways:", length(mock_pathways), "\n")
cat("Total gene-pathway associations:", nrow(mock_gmt_df), "\n")
cat("Average genes per pathway:", round(mean(lengths(mock_pathways)), 1), "\n\n")
cat("Sample pathway data frame:\n")
print(head(mock_gmt_df, 12))
# Pathway size distribution
pathway_sizes <- lengths(mock_pathways)
cat("\nš Pathway Size Distribution:\n")
print(data.frame(
Pathway = names(pathway_sizes),
Gene_Count = pathway_sizes
))
## ----gene-set-overlap, fig.cap="Gene set overlap analysis showing relationships between biological pathways"----
# Analyze overlaps between pathways
pathway_genes <- mock_pathways[1:3] # Use first 3 pathways for Venn diagram
# Create Venn diagram for pathway overlaps
venn_plot <- plot_venn(
set1 = pathway_genes[[1]],
set2 = pathway_genes[[2]],
set3 = pathway_genes[[3]],
category.names = names(pathway_genes),
fill = get_palette("qual_vivid", type = "qualitative", n = 3),
title = "Metabolic Pathway Gene Overlaps"
)
print(venn_plot)
# Calculate detailed overlap statistics
all_genes <- unique(unlist(pathway_genes))
cat("\nš Detailed Overlap Analysis:\n")
cat("===============================\n")
cat("Total unique genes across pathways:", length(all_genes), "\n")
# Pairwise overlaps
pathway_names <- names(pathway_genes)
for (i in 1:(length(pathway_names) - 1)) {
for (j in (i + 1):length(pathway_names)) {
overlap <- length(intersect(pathway_genes[[i]], pathway_genes[[j]]))
cat(sprintf("%s ā© %s: %d genes\n",
gsub("KEGG_", "", pathway_names[i]),
gsub("KEGG_", "", pathway_names[j]),
overlap))
}
}
## ----rnaseq-workflow, fig.cap="Differential expression analysis visualization with volcano plot"----
# Simulate RNA-seq differential expression results
set.seed(123)
n_genes <- 2000
# Simulate log fold changes and p-values
gene_results <- data.frame(
Gene = paste0("Gene_", 1:n_genes),
LogFC = rnorm(n_genes, mean = 0, sd = 1.2),
PValue = rbeta(n_genes, shape1 = 1, shape2 = 10),
stringsAsFactors = FALSE
)
# Add some significant genes
significant_indices <- sample(1:n_genes, 200)
gene_results$LogFC[significant_indices] <- gene_results$LogFC[significant_indices] +
sample(c(-2, 2), 200, replace = TRUE)
gene_results$PValue[significant_indices] <- gene_results$PValue[significant_indices] * 0.01
# Calculate adjusted p-values
gene_results$FDR <- p.adjust(gene_results$PValue, method = "BH")
# Classify genes
gene_results$Regulation <- "Not Significant"
gene_results$Regulation[gene_results$FDR < 0.05 & gene_results$LogFC > 1] <- "Up-regulated"
gene_results$Regulation[gene_results$FDR < 0.05 & gene_results$LogFC < -1] <- "Down-regulated"
# Create volcano plot
volcano_colors <- c(
"Up-regulated" = get_palette("qual_vivid", type = "qualitative", n = 3)[1],
"Down-regulated" = get_palette("qual_vivid", type = "qualitative", n = 3)[2],
"Not Significant" = "#CCCCCC"
)
p1 <- ggplot(gene_results, aes(x = LogFC, y = -log10(FDR), color = Regulation)) +
geom_point(alpha = 0.6, size = 1.2) +
scale_color_manual(values = volcano_colors) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "#666666") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "#666666") +
labs(
title = "Differential Gene Expression Analysis",
subtitle = "Volcano plot showing treatment vs. control comparison",
x = "Logā Fold Change",
y = "-logāā(FDR-adjusted p-value)",
color = "Regulation"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
plot.subtitle = element_text(size = 11, color = "#666666"),
legend.position = "bottom"
)
print(p1)
# Summary statistics
regulation_summary <- table(gene_results$Regulation)
cat("\nš Differential Expression Summary:\n")
cat("===================================\n")
print(regulation_summary)
cat("\nTop 10 up-regulated genes (by fold change):\n")
top_up <- gene_results[gene_results$Regulation == "Up-regulated", ] %>%
arrange(desc(LogFC)) %>%
head(10)
print(top_up[, c("Gene", "LogFC", "FDR")])
## ----pathway-enrichment, fig.cap="Pathway enrichment analysis showing biological processes affected by treatment"----
# Simulate pathway enrichment analysis results
enrichment_results <- data.frame(
Pathway = c(
"Cell Cycle", "Apoptosis", "DNA Repair", "Inflammation",
"Metabolism", "Signaling", "Transport", "Development"
),
GeneRatio = c(0.15, 0.22, 0.18, 0.31, 0.09, 0.25, 0.12, 0.08),
FDR = c(0.001, 0.003, 0.008, 0.0001, 0.045, 0.002, 0.021, 0.089),
GeneCount = c(23, 34, 28, 48, 14, 39, 18, 12),
stringsAsFactors = FALSE
)
# Calculate enrichment score
enrichment_results$EnrichmentScore <- -log10(enrichment_results$FDR)
# Create enrichment plot
p2 <- ggplot(enrichment_results, aes(x = GeneRatio, y = reorder(Pathway, EnrichmentScore))) +
geom_point(aes(color = EnrichmentScore, size = GeneCount), alpha = 0.8) +
scale_color_gradientn(
colors = get_palette("seq_blush", type = "sequential", n = 4),
name = "-logāā(FDR)"
) +
scale_size_continuous(name = "Gene Count", range = c(3, 12)) +
geom_vline(xintercept = 0.1, linetype = "dashed", color = "#666666", alpha = 0.7) +
labs(
title = "Pathway Enrichment Analysis",
subtitle = "Biological processes enriched in differentially expressed genes",
x = "Gene Ratio (enriched genes / pathway total)",
y = "Biological Pathway"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
plot.subtitle = element_text(size = 11, color = "#666666"),
panel.grid.major.y = element_blank(),
legend.position = "right"
)
print(p2)
cat("\nšÆ Pathway Enrichment Summary:\n")
cat("==============================\n")
significant_pathways <- enrichment_results[enrichment_results$FDR < 0.05, ]
cat("Significant pathways (FDR < 0.05):", nrow(significant_pathways), "\n")
cat("Most enriched pathway:", significant_pathways$Pathway[which.max(significant_pathways$EnrichmentScore)], "\n")
cat("Total genes in significant pathways:", sum(significant_pathways$GeneCount), "\n")
## ----multiomics-integration, fig.cap="Multi-omics data integration showing genomic variants and expression changes"----
# Simulate multi-omics data integration
set.seed(456)
selected_genes <- c("BRCA1", "TP53", "EGFR", "MYC", "KRAS", "PIK3CA", "AKT1", "PTEN")
# Create integrated omics data
omics_data <- data.frame(
Gene = rep(selected_genes, each = 3),
DataType = rep(c("Mutation", "CNV", "Expression"), length(selected_genes)),
Value = c(
# Mutation frequencies (0-1)
c(0.12, 0.34, 0.08, 0.15, 0.22, 0.09, 0.06, 0.18),
# Copy number variations (-2 to 2)
c(-0.5, -1.2, 1.8, 0.3, 0.8, -0.8, 1.1, -1.5),
# Expression fold changes (-3 to 3)
c(-1.5, -2.8, 2.1, 1.8, -1.2, 2.3, -0.8, -2.1)
),
Patient_Group = rep(c("Group_A", "Group_B", "Group_C"), length(selected_genes))
)
# Normalize values for visualization
omics_data$Normalized_Value <- ave(omics_data$Value, omics_data$DataType,
FUN = function(x) scale(x)[,1])
# Create heatmap
p3 <- ggplot(omics_data, aes(x = DataType, y = Gene, fill = Normalized_Value)) +
geom_tile(color = "white", size = 0.5) +
scale_fill_gradientn(
colors = get_palette("div_contrast", type = "diverging"),
name = "Z-score",
limits = c(-2, 2),
breaks = c(-2, -1, 0, 1, 2)
) +
labs(
title = "Multi-omics Cancer Gene Analysis",
subtitle = "Integrated view of mutations, copy number, and expression",
x = "Data Type",
y = "Cancer-related Genes"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
plot.subtitle = element_text(size = 11, color = "#666666"),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)
)
print(p3)
# Summary by data type
cat("\n𧬠Multi-omics Data Summary:\n")
cat("============================\n")
summary_stats <- omics_data %>%
group_by(DataType) %>%
summarise(
Mean_Value = round(mean(Value), 3),
SD_Value = round(sd(Value), 3),
Min_Value = round(min(Value), 3),
Max_Value = round(max(Value), 3),
.groups = 'drop'
)
print(summary_stats)
## ----biomarker-discovery, fig.cap="Biomarker discovery showing gene expression patterns across clinical subtypes"----
# Simulate clinical biomarker data
set.seed(789)
n_patients <- 120
n_biomarkers <- 20
# Generate patient clinical data
clinical_data <- data.frame(
Patient_ID = paste0("P", 1:n_patients),
Subtype = sample(c("Luminal_A", "Luminal_B", "HER2+", "TNBC"), n_patients,
replace = TRUE, prob = c(0.4, 0.2, 0.15, 0.25)),
Stage = sample(c("I", "II", "III", "IV"), n_patients,
replace = TRUE, prob = c(0.3, 0.35, 0.25, 0.1)),
Age = round(rnorm(n_patients, 55, 12)),
Survival_Months = round(rexp(n_patients, rate = 0.02)),
stringsAsFactors = FALSE
)
# Generate biomarker expression data
biomarker_genes <- paste0("Biomarker_", 1:n_biomarkers)
expression_data <- matrix(rnorm(n_patients * n_biomarkers, mean = 5, sd = 2),
nrow = n_patients, ncol = n_biomarkers)
colnames(expression_data) <- biomarker_genes
rownames(expression_data) <- clinical_data$Patient_ID
# Add subtype-specific expression patterns
luminal_a_patients <- clinical_data$Patient_ID[clinical_data$Subtype == "Luminal_A"]
her2_patients <- clinical_data$Patient_ID[clinical_data$Subtype == "HER2+"]
tnbc_patients <- clinical_data$Patient_ID[clinical_data$Subtype == "TNBC"]
# Simulate subtype-specific biomarkers
expression_data[luminal_a_patients, "Biomarker_1"] <-
expression_data[luminal_a_patients, "Biomarker_1"] + 3
expression_data[her2_patients, "Biomarker_5"] <-
expression_data[her2_patients, "Biomarker_5"] + 4
expression_data[tnbc_patients, "Biomarker_12"] <-
expression_data[tnbc_patients, "Biomarker_12"] + 2.5
# Convert to long format for visualization
expression_long <- as.data.frame(expression_data) %>%
mutate(Patient_ID = rownames(.)) %>%
gather(Biomarker, Expression, -Patient_ID) %>%
left_join(clinical_data, by = "Patient_ID")
# Select top biomarkers for visualization
top_biomarkers <- c("Biomarker_1", "Biomarker_5", "Biomarker_12", "Biomarker_8")
plot_data <- expression_long %>%
filter(Biomarker %in% top_biomarkers)
# Create biomarker expression plot
p5 <- ggplot(plot_data, aes(x = Subtype, y = Expression, fill = Subtype)) +
geom_boxplot(alpha = 0.7, outlier.alpha = 0.5) +
geom_jitter(alpha = 0.3, width = 0.2, size = 0.8) +
scale_fill_manual(
values = get_palette("qual_vivid", type = "qualitative", n = 4)
) +
facet_wrap(~Biomarker, scales = "free_y", ncol = 2) +
labs(
title = "Biomarker Expression Across Cancer Subtypes",
subtitle = "Potential subtype-specific biomarkers for precision medicine",
x = "Cancer Subtype",
y = "Expression Level (log2 normalized)",
fill = "Subtype"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
plot.subtitle = element_text(size = 11, color = "#666666"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom",
strip.background = element_rect(fill = "#E3F2FD", color = NA)
)
print(p5)
# Statistical summary
cat("\nš Biomarker Analysis Summary:\n")
cat("==============================\n")
subtype_counts <- table(clinical_data$Subtype)
print(subtype_counts)
cat("\nMean expression by subtype for key biomarkers:\n")
biomarker_summary <- plot_data %>%
group_by(Biomarker, Subtype) %>%
summarise(
Mean_Expression = round(mean(Expression), 2),
SD = round(sd(Expression), 2),
.groups = 'drop'
) %>%
arrange(Biomarker, desc(Mean_Expression))
print(biomarker_summary)
## ----data-management, eval = FALSE--------------------------------------------
# # Example of downloading reference data
# # Note: These functions require internet connection and may take time
#
# # Download gene reference annotation
# gene_ref <- download_gene_ref(
# species = "human",
# build = "hg38",
# feature_type = "gene"
# )
#
# # Download GEO dataset
# geo_data <- download_geo_data(
# geo_id = "GSE123456",
# destdir = "data/geo_downloads"
# )
#
# # Download pathway databases
# pathway_url <- "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.4/c2.cp.kegg.v7.4.symbols.gmt"
# download_url(
# url = pathway_url,
# dest = "data/pathways/kegg_pathways.gmt"
# )
## ----data-management-demo-----------------------------------------------------
# Demonstrate file organization for bioinformatics projects
cat("š Recommended Project Structure for Bioinformatics:\n")
cat("==================================================\n")
cat("project/\n")
cat("āāā data/\n")
cat("ā āāā raw/ # Original data files\n")
cat("ā āāā processed/ # Cleaned/normalized data\n")
cat("ā āāā reference/ # Genome annotations, databases\n")
cat("ā āāā results/ # Analysis outputs\n")
cat("āāā scripts/\n")
cat("ā āāā preprocessing/ # Data cleaning scripts\n")
cat("ā āāā analysis/ # Statistical analysis\n")
cat("ā āāā visualization/ # Plotting scripts\n")
cat("āāā docs/ # Documentation, protocols\n")
cat("āāā reports/ # Final reports, publications\n\n")
# Demonstrate batch file handling
file_extensions <- c("fastq.gz", "bam", "vcf", "gmt", "gff3", "bed")
file_descriptions <- c(
"Raw sequencing reads",
"Aligned sequencing data",
"Variant calls",
"Gene set definitions",
"Gene annotations",
"Genomic intervals"
)
file_info <- data.frame(
Extension = file_extensions,
Description = file_descriptions,
stringsAsFactors = FALSE
)
cat("šļø Common Bioinformatics File Types:\n")
print(file_info)
## ----best-practices-----------------------------------------------------------
cat("š¬ BIOINFORMATICS BEST PRACTICES\n")
cat("================================\n\n")
cat("š Data Management:\n")
cat(" ⢠Use version control (Git) for all scripts\n")
cat(" ⢠Document data provenance and processing steps\n")
cat(" ⢠Implement checkpoints and intermediate file saves\n")
cat(" ⢠Use consistent file naming conventions\n\n")
cat("𧬠Gene Identifier Handling:\n")
cat(" ⢠Always validate gene ID conversions\n")
cat(" ⢠Store original identifiers alongside converted ones\n")
cat(" ⢠Document the genome build and annotation version\n")
cat(" ⢠Handle missing/ambiguous identifiers gracefully\n\n")
cat("š Statistical Analysis:\n")
cat(" ⢠Apply appropriate multiple testing corrections\n")
cat(" ⢠Set significance thresholds before analysis\n")
cat(" ⢠Report effect sizes along with p-values\n")
cat(" ⢠Validate results with independent datasets when possible\n\n")
cat("šØ Visualization Guidelines:\n")
cat(" ⢠Use color-blind friendly palettes\n")
cat(" ⢠Include appropriate scales and legends\n")
cat(" ⢠Provide clear titles and axis labels\n")
cat(" ⢠Consider publication requirements for figures\n")
## ----qc-checklist-------------------------------------------------------------
cat("ā
QUALITY CONTROL CHECKLIST\n")
cat("============================\n\n")
cat("š Data Quality:\n")
cat(" [ ] Check for missing values and outliers\n")
cat(" [ ] Verify sample sizes and statistical power\n")
cat(" [ ] Validate gene identifier mappings\n")
cat(" [ ] Assess data distribution and normalization\n\n")
cat("š Analysis Validation:\n")
cat(" [ ] Cross-validate results with different methods\n")
cat(" [ ] Perform sensitivity analyses\n")
cat(" [ ] Check for batch effects and confounders\n")
cat(" [ ] Compare with known biological expectations\n\n")
cat("š Results Reporting:\n")
cat(" [ ] Include sample sizes and effect sizes\n")
cat(" [ ] Report confidence intervals\n")
cat(" [ ] Document software versions and parameters\n")
cat(" [ ] Provide supplementary data and code\n")
## ----complete-pipeline--------------------------------------------------------
cat("š COMPLETE BIOINFORMATICS PIPELINE EXAMPLE\n")
cat("===========================================\n\n")
# Simulate a complete analysis workflow
pipeline_steps <- data.frame(
Step = 1:8,
Process = c(
"Data Import & Quality Control",
"Gene ID Conversion & Mapping",
"Differential Expression Analysis",
"Multiple Testing Correction",
"Pathway Enrichment Analysis",
"Gene Set Overlap Analysis",
"Visualization & Plotting",
"Results Export & Reporting"
),
evanverse_Functions = c(
"read_table_flex(), file_info()",
"convert_gene_id(), replace_void()",
"User analysis + evanverse utilities",
"Built-in R functions",
"gmt2df(), gmt2list()",
"plot_venn(), combine_logic()",
"get_palette(), plot_*() functions",
"write_xlsx_flex(), remind()"
),
Estimated_Time = c("5-10 min", "10-15 min", "30-60 min", "5 min",
"15-30 min", "10-20 min", "20-40 min", "10-15 min")
)
print(pipeline_steps)
cat("\nā±ļø Total Estimated Pipeline Time: 2-4 hours\n")
cat("šÆ Key Success Factors:\n")
cat(" ⢠Proper data validation at each step\n")
cat(" ⢠Consistent identifier handling\n")
cat(" ⢠Appropriate statistical methods\n")
cat(" ⢠Clear documentation and visualization\n")
## ----bio-quick-ref, eval = FALSE----------------------------------------------
# # Gene identifier conversion
# convert_gene_id(genes, from = "symbol", to = "ensembl", species = "human")
#
# # Pathway analysis
# pathways <- gmt2list("pathways.gmt")
# plot_venn(gene_sets, colors = get_palette("qual_vivid"))
#
# # Data visualization
# get_palette("div_contrast", type = "diverging")
# plot_venn(gene_sets)
#
# # Data management
# download_geo_data("GSE123456")
# read_table_flex("expression_data.txt")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.