Nothing
knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 6, fig.align = "center", out.width = "95%", dpi = 90, message = FALSE, warning = FALSE ) options(width = 100)
This case study demonstrates a complete analysis pipeline using missoNet for genomic data analysis, specifically for regional DNA methylation QTL (meQTL) mapping. We'll analyze how genetic variants (SNPs) influence DNA methylation levels across multiple CpG sites while accounting for missing data and network structure among CpG sites.
In epigenome-wide association studies (EWAS):
# Load required packages library(missoNet) library(ggplot2) library(reshape2) library(gridExtra) # Set ggplot theme theme_set(theme_minimal(base_size = 11))
# Set dimensions mimicking a genomic region n <- 600 # Number of samples p <- 80 # Number of SNPs in the region q <- 20 # Number of CpG sites # Create realistic correlation structure # SNPs: Linkage disequilibrium (LD) blocks create_ld_structure <- function(p, n_blocks = 4, within_block_cor = 0.7) { Sigma <- matrix(0, p, p) block_size <- p / n_blocks for (b in 1:n_blocks) { idx <- ((b-1)*block_size + 1):(b*block_size) for (i in idx) { for (j in idx) { Sigma[i, j] <- within_block_cor^abs(i - j) } } } diag(Sigma) <- 1 return(Sigma) } Sigma.X <- create_ld_structure(p, n_blocks = 4) # Variable missing rates (technical dropout patterns) # Higher missingness for CpG sites with extreme GC content gc_content <- runif(q, 0.3, 0.7) # Simulated GC content rho_vec <- 0.01 + 0.3 * (abs(gc_content - 0.5) / 0.2)^2 # U-shaped missing pattern # Generate the data sim <- generateData( n = n, p = p, q = q, rho = rho_vec, # Missing rates missing.type = "MAR", # Missing depends on technical factors Sigma.X = Sigma.X, Beta.row.sparsity = 0.15, # 15% of SNPs are meQTLs Beta.elm.sparsity = 0.4, # Each meQTL affects 40% of CpGs seed = 100 ) # Add meaningful variable names colnames(sim$X) <- sprintf("rs%d", 1000000 + 1:p) # SNP IDs colnames(sim$Y) <- sprintf("cg%d", 2000000 + 1:q) # CpG IDs colnames(sim$Z) <- colnames(sim$Y) rownames(sim$X) <- rownames(sim$Y) <- rownames(sim$Z) <- paste0("Sample", 1:n) { cat("\nDataset Summary:\n") cat("================\n") cat("Samples:", n, "\n") cat("SNPs:", p, "\n") cat("CpG sites:", q, "\n") cat("Overall missing rate:", sprintf("%.1f%%", mean(is.na(sim$Z)) * 100), "\n") cat("True meQTLs:", sum(rowSums(abs(sim$Beta)) > 0), "\n") }
# Analyze missing patterns miss_by_cpg <- colMeans(is.na(sim$Z)) miss_by_sample <- rowMeans(is.na(sim$Z)) # Create visualization par(mfrow = c(2, 2)) # 1. Missing rate by CpG plot(miss_by_cpg, type = "h", lwd = 2, col = "steelblue", xlab = "CpG Site", ylab = "Missing Rate", main = "Missing Data by CpG Site") abline(h = mean(miss_by_cpg), col = "red", lty = 2) # 2. Missing rate by sample hist(miss_by_sample, breaks = 20, col = "lightblue", xlab = "Missing Rate", main = "Distribution of Missing Rates (Samples)") abline(v = mean(miss_by_sample), col = "red", lwd = 2) # 3. Heatmap of missingness image(t(is.na(sim$Z[1:100, ])), col = c("white", "darkred"), xlab = "CpG Site", ylab = "Sample (first 100)", main = "Missing Data Pattern") # 4. Correlation of missingness with GC content plot(gc_content, miss_by_cpg, pch = 19, col = "darkblue", xlab = "GC Content", ylab = "Missing Rate", main = "Technical Dropout vs GC Content") lines(lowess(gc_content, miss_by_cpg), col = "red", lwd = 2)
# Use complete data for visualization Y_complete <- sim$Y cor_cpg <- cor(Y_complete) # Create enhanced heatmap library(RColorBrewer) colors <- colorRampPalette(brewer.pal(9, "RdBu"))(100) heatmap(cor_cpg, col = rev(colors), symm = TRUE, main = "CpG Methylation Correlation Structure", xlab = "CpG Sites", ylab = "CpG Sites", margins = c(8, 8)) # Identify CpG modules hc <- hclust(as.dist(1 - abs(cor_cpg))) modules <- cutree(hc, k = 3) cat("\nCpG modules identified:", table(modules), "\n")
fit_initial <- missoNet( X = sim$X, Y = sim$Z, GoF = "BIC", adaptive.search = TRUE, # Fast exploration verbose = 1 ) # Examine initial selection { cat("\nStep 1: Initial parameter exploration\n") cat("=====================================\n") cat(" Lambda.beta:", fit_initial$est.min$lambda.beta, "\n") cat(" Lambda.theta:", fit_initial$est.min$lambda.theta, "\n") cat(" Active SNPs:", sum(rowSums(abs(fit_initial$est.min$Beta)) > 1e-8), "\n") cat(" Network edges:", sum(abs(fit_initial$est.min$Theta[upper.tri(fit_initial$est.min$Theta)]) > 1e-8), "\n") }
# Define refined grid based on initial exploration lambda.beta.refined <- 10^(seq( log10(min(max(fit_initial$lambda.beta.seq) * 0.9, fit_initial$est.min$lambda.beta * 50)), log10(max(max(fit_initial$lambda.beta.seq) * 0.005, fit_initial$est.min$lambda.beta / 20)), length.out = 25 )) lambda.theta.refined <- 10^(seq( log10(min(max(fit_initial$lambda.theta.seq) * 0.9, fit_initial$est.min$lambda.theta * 50)), log10(max(max(fit_initial$lambda.theta.seq) * 0.005, fit_initial$est.min$lambda.theta / 20)), length.out = 25 )) # Perform 5-fold cross-validation cvfit <- cv.missoNet( X = sim$X, Y = sim$Z, kfold = 5, lambda.beta = lambda.beta.refined, lambda.theta = lambda.theta.refined, compute.1se = TRUE, verbose = 0, seed = 1000 )
# Compare different model choices models <- list( "CV Minimum" = cvfit$est.min, "1SE Beta" = cvfit$est.1se.beta, "1SE Theta" = cvfit$est.1se.theta, "Initial BIC" = fit_initial$est.min ) if (!is.null(models$`1SE Beta`) & !is.null(models$`1SE Theta`)) { # Ensure models exist comparison <- data.frame( Model = names(models), Lambda.Beta = sapply(models, function(x) x$lambda.beta), Lambda.Theta = sapply(models, function(x) x$lambda.theta), Active.SNPs = sapply(models, function(x) sum(rowSums(abs(x$Beta)) > 1e-8)), Total.Effects = sapply(models, function(x) sum(abs(x$Beta) > 1e-8)), Network.Edges = sapply(models, function(x) sum(abs(x$Theta[upper.tri(x$Theta)]) > 1e-8)) ) print(comparison, digits = 4) } # Select the more regularized model, fallback if NULL if (!is.null(models$`1SE Beta`)) { final_model <- cvfit$est.1se.beta } else final_model <- fit_initial$est.min
# Extract coefficients Beta <- final_model$Beta rownames(Beta) <- colnames(sim$X) colnames(Beta) <- colnames(sim$Z) # Identify significant associations threshold <- 1e-3 sig_meqtls <- which(abs(Beta) > threshold, arr.ind = TRUE) if (nrow(sig_meqtls) > 0) { meqtl_df <- data.frame( SNP = rownames(Beta)[sig_meqtls[,1]], CpG = colnames(Beta)[sig_meqtls[,2]], Effect = Beta[sig_meqtls], AbsEffect = abs(Beta[sig_meqtls]) ) meqtl_df <- meqtl_df[order(meqtl_df$AbsEffect, decreasing = TRUE), ] cat("Top 15 meQTL associations:\n") print(head(meqtl_df, 15), digits = 3) # Visualization top_snps <- unique(meqtl_df$SNP[1:min(30, nrow(meqtl_df))]) Beta_subset <- Beta[top_snps, , drop = FALSE] # Create heatmap par(mfrow = c(1, 1)) colors <- colorRampPalette(c("blue", "white", "red"))(100) heatmap(as.matrix(Beta_subset), col = colors, scale = "none", main = "Top meQTL Effects", xlab = "CpG Sites", ylab = "SNPs", margins = c(8, 8)) }
# Extract precision matrix and convert to partial correlations Theta <- final_model$Theta rownames(Theta) <- colnames(Theta) <- colnames(sim$Z) # Compute partial correlations partial_cor <- -cov2cor(Theta) diag(partial_cor) <- 0 # Network statistics edge_threshold <- 0.1 n_edges <- sum(abs(partial_cor[upper.tri(partial_cor)]) > edge_threshold) { cat("\nNetwork Statistics:\n") cat(" Total possible edges:", q * (q-1) / 2, "\n") cat(" Selected edges (|r| >", edge_threshold, "):", n_edges, "\n") cat(" Network density:", sprintf("%.1f%%", 100 * n_edges / (q * (q-1) / 2)), "\n") } # Identify hub CpGs degree <- colSums(abs(partial_cor) > edge_threshold) hub_cpgs <- names(sort(degree, decreasing = TRUE)[1:5]) cat("\nHub CpG sites (highest connectivity):\n") for (cpg in hub_cpgs) { cat(" ", cpg, ": degree =", degree[cpg], "\n") } # Visualize network if (requireNamespace("igraph", quietly = TRUE)) { library(igraph) # Create network from significant edges edges <- which(abs(partial_cor) > 0.15 & upper.tri(partial_cor), arr.ind = TRUE) if (nrow(edges) > 0) { edge_list <- data.frame( from = rownames(partial_cor)[edges[,1]], to = colnames(partial_cor)[edges[,2]], weight = abs(partial_cor[edges]) ) g <- graph_from_data_frame(edge_list, directed = FALSE) # Node properties V(g)$size <- 5 + sqrt(degree[V(g)$name]) * 3 V(g)$color <- ifelse(V(g)$name %in% hub_cpgs, "red", "lightblue") # Plot par(mfrow = c(1, 1)) plot(g, layout = layout_with_fr(g), vertex.label.cex = 0.7, edge.width = E(g)$weight * 3, main = "CpG Conditional Dependency Network") legend("topright", legend = c("Hub CpG", "Regular CpG"), pch = 21, pt.bg = c("red", "lightblue"), pt.cex = 2) } }
# Analyze how meQTLs relate to network structure active_snps <- which(rowSums(abs(Beta)) > threshold) if (length(active_snps) > 0) { cat("\nIntegration Analysis:\n") cat("=====================\n") # For each active SNP, check which CpGs it affects for (i in active_snps[1:min(5, length(active_snps))]) { affected_cpgs <- which(abs(Beta[i,]) > threshold) if (length(affected_cpgs) > 1) { # Check if affected CpGs are connected in the network subnet_partial <- partial_cor[affected_cpgs, affected_cpgs] mean_connection <- mean(abs(subnet_partial[upper.tri(subnet_partial)])) cat("\n", rownames(Beta)[i], "affects", length(affected_cpgs), "CpGs\n") cat("Mean network connection among affected CpGs:", round(mean_connection, 3), "\n") cat("Affected CpGs:", paste(colnames(Beta)[affected_cpgs], collapse = ", "), "\n") } } }
# Split data for validation n_train <- round(0.75 * n) train_idx <- sample(n, n_train) test_idx <- setdiff(1:n, train_idx) # Refit on training data cvfit_train <- cv.missoNet( X = sim$X[train_idx, ], Y = sim$Z[train_idx, ], kfold = 5, lambda.beta = lambda.beta.refined, lambda.theta = lambda.theta.refined, verbose = 0 ) # Predictions Y_pred <- predict(cvfit_train, newx = sim$X[test_idx, ]) Y_test <- sim$Y[test_idx, ] # True complete values # Calculate performance metrics mse_per_cpg <- colMeans((Y_pred - Y_test)^2) cor_per_cpg <- sapply(1:q, function(j) cor(Y_pred[,j], Y_test[,j])) # Visualization par(mfrow = c(1, 2)) # MSE vs missing rate plot(miss_by_cpg, mse_per_cpg, pch = 19, col = "darkblue", xlab = "Missing Rate", ylab = "Prediction MSE", main = "Prediction Error vs Missing Rate") lines(lowess(miss_by_cpg, mse_per_cpg), col = "red", lwd = 2) # Correlation vs missing rate plot(miss_by_cpg, cor_per_cpg, pch = 19, col = "darkgreen", xlab = "Missing Rate", ylab = "Prediction Correlation", main = "Prediction Accuracy vs Missing Rate") lines(lowess(miss_by_cpg, cor_per_cpg), col = "red", lwd = 2) { cat("\nPrediction Performance:\n") cat(" Mean MSE:", round(mean(mse_per_cpg), 4), "\n") cat(" Mean correlation:", round(mean(cor_per_cpg), 3), "\n") cat(" Worst CpG MSE:", round(max(mse_per_cpg), 4), "\n") cat(" Best CpG correlation:", round(max(cor_per_cpg), 3), "\n") }
# Bootstrap stability (simplified for demonstration) n_boot <- 10 selection_freq_beta <- matrix(0, p, q) selection_freq_theta <- matrix(0, q, q) for (b in 1:n_boot) { # Bootstrap sample boot_idx <- sample(n, replace = TRUE) # Fit model fit_boot <- missoNet( X = sim$X[boot_idx, ], Y = sim$Z[boot_idx, ], lambda.beta = final_model$lambda.beta, lambda.theta = final_model$lambda.theta, verbose = 0 ) # Track selections selection_freq_beta <- selection_freq_beta + (abs(fit_boot$est.min$Beta) > threshold) selection_freq_theta <- selection_freq_theta + (abs(fit_boot$est.min$Theta) > edge_threshold) } # Normalize selection_freq_beta <- selection_freq_beta / n_boot selection_freq_theta <- selection_freq_theta / n_boot # Identify stable features stable_meqtls <- which(selection_freq_beta > 0.8, arr.ind = TRUE) stable_edges <- which(selection_freq_theta > 0.8 & upper.tri(selection_freq_theta), arr.ind = TRUE) { cat("\nStability Results:\n") cat(" Stable meQTLs (>80% selection):", nrow(stable_meqtls), "\n") cat(" Stable network edges (>80% selection):", nrow(stable_edges), "\n") } if (nrow(stable_meqtls) > 0) { cat("\nMost stable meQTL associations:\n") stable_df <- data.frame( SNP = colnames(sim$X)[stable_meqtls[,1]], CpG = colnames(sim$Z)[stable_meqtls[,2]], Frequency = selection_freq_beta[stable_meqtls] ) print(head(stable_df[order(stable_df$Frequency, decreasing = TRUE), ], 10)) }
# Annotate SNPs with genes (simulated) active_snp_ids <- which(rowSums(abs(Beta)) > threshold) if (length(active_snp_ids) > 0) { gene_names <- paste0("GENE", sample(1:50, length(active_snp_ids), replace = TRUE)) snp_annotation <- data.frame( SNP = colnames(sim$X)[active_snp_ids], Gene = gene_names, Effect_Size = rowSums(abs(Beta[active_snp_ids, ])) ) cat("Genes with meQTLs:\n") gene_summary <- aggregate(Effect_Size ~ Gene, snp_annotation, sum) gene_summary <- gene_summary[order(gene_summary$Effect_Size, decreasing = TRUE), ] print(head(gene_summary, 10)) } # CpG annotation (simulated) cpg_annotation <- data.frame( CpG = colnames(sim$Z), Region = sample(c("Promoter", "Gene Body", "Enhancer", "Intergenic"), q, replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)), Island = sample(c("Island", "Shore", "Shelf", "Open Sea"), q, replace = TRUE, prob = c(0.3, 0.2, 0.2, 0.3)) ) { cat("\nCpG distribution by genomic region:\n") print(table(cpg_annotation$Region)) } { cat("\nCpG distribution by island status:\n") print(table(cpg_annotation$Island)) }
cat("\n========================================\n") cat(" ANALYSIS SUMMARY REPORT\n") cat("========================================\n\n") cat("\nDATA CHARACTERISTICS:\n") cat("--------------------\n") cat("• Samples analyzed:", n, "\n") cat("• SNPs tested:", p, "\n") cat("• CpG sites measured:", q, "\n") cat("• Missing data rate:", sprintf("%.1f%%", mean(is.na(sim$Z)) * 100), "\n") cat("• Missing pattern: MAR (technical dropout)\n\n") cat("\nMODEL SELECTION:\n") cat("---------------\n") cat("• Method: 5-fold cross-validation\n") cat("• Selection criterion: 1-SE.Beta CV error\n") cat("• Lambda (Beta):", sprintf("%.4f", final_model$lambda.beta), "\n") cat("• Lambda (Theta):", sprintf("%.4f", final_model$lambda.theta), "\n\n") cat("\nKEY FINDINGS:\n") cat("------------\n") cat("• meQTLs identified:", sum(rowSums(abs(Beta)) > threshold), "/", p, "SNPs\n") cat("• SNP-CpG associations:", sum(abs(Beta) > threshold), "\n") cat("• CpG network edges:", n_edges, "/", q*(q-1)/2, "possible\n") cat("• Hub CpGs identified:", length(hub_cpgs), "\n\n") cat("\nMODEL PERFORMANCE:\n") cat("-----------------\n") cat("• Mean prediction correlation:", sprintf("%.3f", mean(cor_per_cpg)), "\n") cat("• Mean prediction MSE:", sprintf("%.4f", mean(mse_per_cpg)), "\n") cat("• Stability (bootstrap):", sprintf("%.0f%%", 100 * nrow(stable_meqtls) / max(sum(abs(Beta) > threshold), 1)), "of associations stable\n\n") cat("\nBIOLOGICAL INSIGHTS (SIMULATED):\n") cat("-------------------\n") cat("• Primary affected regions: Promoters and gene bodies\n") cat("• Network structure suggests co-regulated CpG modules\n") cat("• Hub CpGs may represent key regulatory sites\n")
This case study demonstrated a complete genomic data analysis workflow using missoNet:
The missoNet framework successfully:
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.