## ----setup, include = FALSE-----------------------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  fig.align = "center",
  out.width = "90%",
  dpi = 90,
  message = FALSE,
  warning = FALSE
)
options(width = 100)
## -------------------------------------------------------------------------------------------------
library(missoNet)
## -------------------------------------------------------------------------------------------------
# Generate example data
sim <- generateData(n = 300, p = 30, q = 12, rho = 0.1,
                    missing.type = "MCAR")
# 5-fold cross-validation
cvfit <- cv.missoNet(
  X = sim$X,
  Y = sim$Z,
  kfold = 5,
  compute.1se = TRUE,  # Also compute 1-SE models
  seed = 123,          # For reproducibility
  verbose = 1
)
# ====== Cross-Validation Summary ======
{
  cat("\nNumber of folds:", cvfit$kfold, "\n")
  cat("Lambda pairs evaluated:", length(cvfit$param_set$cv.errors.mean), "\n")
  cat("Minimum CV error:", round(min(cvfit$param_set$cv.errors.mean), 4), "\n")
}
# ====== Optimal parameters (min CV error) ======
{
  cat("\nlambda.beta:", cvfit$est.min$lambda.beta, "\n")
  cat("lambda.theta:", cvfit$est.min$lambda.theta, "\n")
}
# ====== 1-SE parameters ======
{
  cat("\nlambda.beta (1-SE model along beta):", cvfit$est.1se.beta$lambda.beta, "\n")
  cat("lambda.theta (1-SE model along theta):", cvfit$est.1se.theta$lambda.theta, "\n")
}
## -------------------------------------------------------------------------------------------------
# Compare three models
models <- list(
  "Minimum CV" = cvfit$est.min,
  "1SE Beta"   = cvfit$est.1se.beta,
  "1SE Theta"  = cvfit$est.1se.theta
)
# Create comparison table
model_comparison <- data.frame(
  Model = names(models),
  Lambda.Beta = numeric(3),
  Lambda.Theta = numeric(3),
  Nonzero.Coef = numeric(3),
  Network.Edges = numeric(3)
)
for (i in 1:3) {
  if (!is.null(models[[i]])) {
    model_comparison$Lambda.Beta[i] <- models[[i]]$lambda.beta
    model_comparison$Lambda.Theta[i] <- models[[i]]$lambda.theta
    model_comparison$Nonzero.Coef[i] <- sum(abs(models[[i]]$Beta) > 1e-8)
    model_comparison$Network.Edges[i] <-
      sum(abs(models[[i]]$Theta[upper.tri(models[[i]]$Theta)]) > 1e-8)
  }
}
print(model_comparison, digits = 4)
## -------------------------------------------------------------------------------------------------
# Heatmap of CV errors
plot(cvfit, type = "heatmap", detailed.axes = FALSE)
## -------------------------------------------------------------------------------------------------
# 3D surface plot
plot(cvfit, type = "scatter", plt.surf = TRUE)
## -------------------------------------------------------------------------------------------------
# Extract and analyze CV errors
cv_errors <- cvfit$param_set$cv.errors.mean
# Approximate SE of the CV error, assuming the upper/lower bands span mean +/- 1 SE
cv_errors_se <- (cvfit$param_set$cv.errors.upper - cvfit$param_set$cv.errors.lower) / 2
# Create error surface data
lambda_beta_unique <- unique(cvfit$param_set$cv.grid.beta)
lambda_theta_unique <- unique(cvfit$param_set$cv.grid.theta)
# Find optimal region
min_error <- min(cv_errors)
near_optimal <- which(cv_errors <= min_error * 1.05) # Within 5% of minimum
# ====== Regularization path statistics ======
{
  cat("\n Error range: [", round(min(cv_errors), 4), ", ",
      round(max(cv_errors), 4), "]\n", sep = "")
  cat(" Models within 5% of minimum:", length(near_optimal), "\n")
  cat(" This suggests", ifelse(length(near_optimal) > 10, "stable", "sensitive"),
      "parameter selection\n")
}
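# A look at the near-optimal (lambda.beta, lambda.theta) pairs shows which
# direction the flat region extends in. A minimal sketch, assuming cv.grid.beta
# and cv.grid.theta are aligned element-wise with cv.errors.mean, as used above.
near_optimal_grid <- data.frame(
  lambda.beta  = cvfit$param_set$cv.grid.beta[near_optimal],
  lambda.theta = cvfit$param_set$cv.grid.theta[near_optimal],
  cv.error     = cv_errors[near_optimal]
)
head(near_optimal_grid[order(near_optimal_grid$cv.error), ])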
## ----eval = FALSE---------------------------------------------------------------------------------
# library(parallel)
#
# # Detect available cores
# n_cores <- detectCores()
# cat("Available cores:", n_cores, "\n")
#
# # Create a cluster (at most 2 workers here, for illustration)
# cl <- makeCluster(min(n_cores - 1, 2))
#
# # Run parallel cross-validation
# cvfit_parallel <- cv.missoNet(
#   X = sim$X,
#   Y = sim$Z,
#   kfold = 5,
#   parallel = TRUE,
#   cl = cl,
#   verbose = 1
# )
#
# # Clean up
# stopCluster(cl)
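#
# # A safer pattern stops the cluster even if cv.missoNet() errors. This is a
# # sketch; run_parallel_cv() is a hypothetical helper, not part of missoNet.
# run_parallel_cv <- function(X, Y, n.workers = 2, ...) {
#   cl <- makeCluster(n.workers)
#   on.exit(stopCluster(cl), add = TRUE)  # Guaranteed cleanup
#   cv.missoNet(X = X, Y = Y, parallel = TRUE, cl = cl, ...)
# }
# cvfit_parallel <- run_parallel_cv(sim$X, sim$Z, kfold = 5)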
## -------------------------------------------------------------------------------------------------
# Standard grid search
time_standard <- system.time({
  fit_standard <- missoNet(
    X = sim$X,
    Y = sim$Z,
    adaptive.search = FALSE,
    verbose = 0
  )
})
# Adaptive search (faster)
time_adaptive <- system.time({
  fit_adaptive <- missoNet(
    X = sim$X,
    Y = sim$Z,
    adaptive.search = TRUE,
    verbose = 0
  )
})
# ====== Computation time comparison ======
{
  cat("\n Standard search:", round(time_standard[3], 2), "seconds\n")
  cat(" Adaptive search:", round(time_adaptive[3], 2), "seconds\n")
  cat(" Speedup:", round(time_standard[3] / time_adaptive[3], 1), "x\n")
}
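# Speed matters only if the selected penalties are comparable. A quick check,
# assuming missoNet() returns the same est.min structure used elsewhere here:
{
  cat(" lambda.beta  (standard vs adaptive):",
      fit_standard$est.min$lambda.beta, "vs", fit_adaptive$est.min$lambda.beta, "\n")
  cat(" lambda.theta (standard vs adaptive):",
      fit_standard$est.min$lambda.theta, "vs", fit_adaptive$est.min$lambda.theta, "\n")
}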
## -------------------------------------------------------------------------------------------------
p <- ncol(sim$X)
q <- ncol(sim$Z)
# Scenario: Prior knowledge about important predictors (here p = 30)
# Group 1: Known important (predictors 1-10)     - light penalty
# Group 2: Possibly important (predictors 11-20) - standard penalty
# Group 3: Likely unimportant (predictors 21-30) - heavy penalty
beta.pen.factor <- matrix(1, p, q)
beta.pen.factor[1:10, ] <- 0.1   # Light penalty
beta.pen.factor[11:20, ] <- 1.0  # Standard penalty
beta.pen.factor[21:p, ] <- 10.0  # Heavy penalty
# Network: Encourage block structure
theta.pen.factor <- matrix(1, q, q)
# Block 1: responses 1-6
theta.pen.factor[1:6, 1:6] <- 0.5
# Block 2: responses 7-12
theta.pen.factor[7:12, 7:12] <- 0.5
# Between blocks: heavy penalty
theta.pen.factor[1:6, 7:12] <- 2.0
theta.pen.factor[7:12, 1:6] <- 2.0
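# Optional visual check of the block structure encoded in the penalty matrix
# (a quick base-graphics sketch; cells show the relative penalty weights).
image(theta.pen.factor, main = "theta.pen.factor block structure")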
# Fit with structured penalties
fit_structured <- missoNet(
  X = sim$X,
  Y = sim$Z,
  beta.pen.factor = beta.pen.factor,
  theta.pen.factor = theta.pen.factor,
  verbose = 0
)
# Analyze selection pattern
selected_groups <- c(
  Group1 = sum(rowSums(abs(fit_structured$est.min$Beta[1:10, ])) > 1e-8),
  Group2 = sum(rowSums(abs(fit_structured$est.min$Beta[11:20, ])) > 1e-8),
  Group3 = sum(rowSums(abs(fit_structured$est.min$Beta[21:p, ])) > 1e-8)
)
# If the penalty factors guided selection as intended, lightly penalized
# groups should retain more predictors. Selected predictors by group:
print(selected_groups)
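# Express selection as a proportion of each group's size for comparison
# (group sizes follow the penalty-factor blocks defined above).
group_sizes <- c(Group1 = 10, Group2 = 10, Group3 = p - 20)
round(selected_groups / group_sizes, 2)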
## ----eval = FALSE---------------------------------------------------------------------------------
# # Create focused grid around expected optimal region
# # Based on preliminary analysis or domain knowledge
#
# # Log-scale grid with concentration
# create_focused_grid <- function(center, width, n_points) {
#   log_center <- log10(center)
#   log_seq <- seq(log_center - width/2, log_center + width/2,
#                  length.out = n_points)
#   return(10^log_seq)
# }
#
# # Focused grids based on problem characteristics
# lambda.beta.focused <- create_focused_grid(
#   center = 0.05,  # Expected optimal region
#   width = 1.5,    # Log10 scale width
#   n_points = 30
# )
#
# lambda.theta.focused <- create_focused_grid(
#   center = 0.01,
#   width = 1.0,
#   n_points = 30
# )
#
# # Fit with focused grid
# fit_focused <- missoNet(
#   X = sim$X,
#   Y = sim$Z,
#   lambda.beta = lambda.beta.focused,
#   lambda.theta = lambda.theta.focused
# )
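#
# # Sanity check before fitting: the grid should bracket the expected optimum.
# # With center = 0.05 and width = 1.5 (log10 scale), the range is roughly
# # 10^-2.05 to 10^-0.55, i.e. about 0.009 to 0.28.
# range(lambda.beta.focused)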
## -------------------------------------------------------------------------------------------------
# Fit with detailed verbosity
fit_verbose <- missoNet(
  X = sim$X,
  Y = sim$Z,
  lambda.beta = 0.05,
  lambda.theta = 0.01,
  verbose = 2  # Detailed output
)
# Check convergence information
if (!is.null(fit_verbose$est.min$converged)) {
  cat("\nConvergence Status:\n")
  cat(" Beta optimization:",
      ifelse(fit_verbose$est.min$converged$Beta, "✓ Converged", "✗ Not converged"), "\n")
  cat(" Theta optimization:",
      ifelse(fit_verbose$est.min$converged$Theta, "✓ Converged", "✗ Not converged"), "\n")
}
if (!is.null(fit_verbose$est.min$iterations)) {
  cat("\nIterations used:\n")
  cat(" Beta:", fit_verbose$est.min$iterations$Beta, "\n")
  cat(" Theta:", fit_verbose$est.min$iterations$Theta, "\n")
}
## ----eval = FALSE---------------------------------------------------------------------------------
# # Simulate a difficult problem
# sim_difficult <- generateData(
#   n = 100,
#   p = 150,     # p > n (high-dimensional)
#   q = 20,
#   rho = 0.25,  # Higher missing rate
#   missing.type = "MAR"
# )
#
# # Strategy 1: Relax tolerances
# fit_relaxed <- missoNet(
#   X = sim_difficult$X,
#   Y = sim_difficult$Z,
#   beta.tol = 1e-3,
#   theta.tol = 1e-3,
#   verbose = 0
# )
#
# # Strategy 2: Increase iterations
# fit_more_iter <- missoNet(
#   X = sim_difficult$X,
#   Y = sim_difficult$Z,
#   beta.max.iter = 10000,
#   theta.max.iter = 10000,
#   verbose = 0
# )
#
# # Strategy 3: Use adaptive search for better initialization
# fit_adaptive_init <- missoNet(
#   X = sim_difficult$X,
#   Y = sim_difficult$Z,
#   adaptive.search = TRUE,
#   verbose = 0
# )
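#
# # Compare which strategies converged, reusing the est.min$converged flags
# # checked in the previous chunk (a sketch; the structure is assumed identical).
# sapply(
#   list(relaxed = fit_relaxed, more_iter = fit_more_iter, adaptive = fit_adaptive_init),
#   function(f) isTRUE(f$est.min$converged$Beta) && isTRUE(f$est.min$converged$Theta)
# )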
## -------------------------------------------------------------------------------------------------
# Fit with relaxed network option
fit_relax <- missoNet(
  X = sim$X,
  Y = sim$Z,
  relax.net = TRUE,  # Refit network without penalty
  verbose = 0
)
# Compare penalized vs relaxed estimates
if (!is.null(fit_relax$est.min$relax.net)) {
  Theta_pen <- fit_relax$est.min$Theta
  Theta_relax <- fit_relax$est.min$relax.net
  # Extract edges
  edges_pen <- Theta_pen[upper.tri(Theta_pen)]
  edges_relax <- Theta_relax[upper.tri(Theta_relax)]
  # Select non-zero edges from penalized model
  active_edges <- abs(edges_pen) > 1e-8
  # ====== Relaxed network estimation ======
  cat("\n Active edges:", sum(active_edges), "\n")
  cat(" Mean |edge| (penalized):",
      round(mean(abs(edges_pen[active_edges])), 4), "\n")
  cat(" Mean |edge| (relaxed):",
      round(mean(abs(edges_relax[active_edges])), 4), "\n")
  cat(" Relaxed estimates are typically larger (less shrinkage)\n")
}
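# Visual check of shrinkage on the active edges: points above the dashed
# identity line indicate the relaxed refit recovers larger magnitudes.
# Reuses the objects created inside the block above, hence the same guard.
if (!is.null(fit_relax$est.min$relax.net)) {
  plot(abs(edges_pen[active_edges]), abs(edges_relax[active_edges]),
       xlab = "|edge| (penalized)", ylab = "|edge| (relaxed)",
       main = "Shrinkage of active network edges")
  abline(0, 1, lty = 2)
}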
## -------------------------------------------------------------------------------------------------
# Nested CV for unbiased performance estimation
# Outer loop: Performance evaluation
# Inner loop: Parameter selection
evaluate_nested_cv <- function(X, Y, outer_folds = 5, inner_folds = 5) {
  n <- nrow(X)
  outer_fold_ids <- rep(1:outer_folds, length.out = n)
  outer_fold_ids <- sample(outer_fold_ids)
  test_errors <- numeric(outer_folds)
  for (k in 1:outer_folds) {
    # Split data
    test_idx <- which(outer_fold_ids == k)
    train_idx <- setdiff(1:n, test_idx)
    # Inner CV for parameter selection
    cvfit_inner <- cv.missoNet(
      X = X[train_idx, ],
      Y = Y[train_idx, ],
      kfold = inner_folds,
      verbose = 0
    )
    # Predict on test fold
    y_pred <- predict(cvfit_inner, newx = X[test_idx, ])
    y_true <- Y[test_idx, ]
    # Calculate error (ignoring missing values)
    mask <- !is.na(y_true)
    test_errors[k] <- mean((y_pred[mask] - y_true[mask])^2)
  }
  return(list(
    mean_error = mean(test_errors),
    se_error = sd(test_errors) / sqrt(outer_folds),
    fold_errors = test_errors
  ))
}
# Run nested CV (seed fixed so the random outer folds are reproducible)
set.seed(123)
nested_results <- evaluate_nested_cv(
  sim$X,
  sim$Z,
  outer_folds = 3,
  inner_folds = 3
)
{
  cat("\nNested Cross-Validation Results:\n")
  cat(" Mean test error:", round(nested_results$mean_error, 4), "\n")
  cat(" Standard error:", round(nested_results$se_error, 4), "\n")
  cat(" 95% CI: [",
      round(nested_results$mean_error - 1.96 * nested_results$se_error, 4), ", ",
      round(nested_results$mean_error + 1.96 * nested_results$se_error, 4), "]\n", sep = "")
}
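# Per-fold test errors show how stable the estimate is across the outer
# splits; a wide spread means the mean error above is itself uncertain.
round(nested_results$fold_errors, 4)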
## ----echo = FALSE, include = TRUE-----------------------------------------------------------------
{
  cat("=== Recommended missoNet Workflow ===\n\n")
  cat("1. Data Preparation:\n")
  cat(" - Check missing patterns\n")
  cat(" - Consider QC (usually recommended)\n")
  cat(" - Assess dimensionality (p vs n)\n\n")
  cat("2. Initial Exploration:\n")
  cat(" - Quick fit with adaptive.search = TRUE\n")
  cat(" - Examine selected lambda ranges\n")
  cat(" - Check convergence\n\n")
  cat("3. Cross-Validation:\n")
  cat(" - Use 5-10 folds depending on sample size\n")
  cat(" - Enable compute.1se = TRUE\n")
  cat(" - Consider parallel processing for large data\n\n")
  cat("4. Model Selection:\n")
  cat(" - Use eBIC for high-dimensional settings\n")
  cat(" - Consider 1-SE models for better generalization\n")
  cat(" - Validate on independent test set if available\n\n")
  cat("5. Diagnostics:\n")
  cat(" - Check convergence status\n")
  cat(" - Examine sparsity patterns\n")
  cat(" - Validate predictions\n")
}
## ----echo = FALSE, include = TRUE-----------------------------------------------------------------
# Create performance recommendation table
tips <- data.frame(
  Scenario = c(
    "Large n (>1000)",
    "Large p (>500)",
    "Large q (>50)",
    "High missing (>30%)",
    "Dense solutions expected"
  ),
  Recommendation = c(
    "Use parallel processing, adaptive search",
    "Reduce n.lambda.beta, use eBIC",
    "Reduce n.lambda.theta, consider diagonal penalty",
    "Increase tolerances, check MAR assumption",
    "Reduce lambda ranges, focus grid"
  )
)
print(tips, right = FALSE)