Cross-Validation and Advanced Features in missoNet

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  fig.align = "center",
  out.width = "90%",
  dpi = 90,
  message = FALSE,
  warning = FALSE
)
options(width = 100)

Introduction

This vignette covers advanced features of missoNet, including the cross-validation framework, parallel processing, advanced parameter tuning, convergence diagnostics, relaxed network estimation, and model selection strategies.

library(missoNet)

Cross-Validation Framework

Basic Cross-Validation

Cross-validation provides data-driven selection of regularization parameters:

# Generate example data
sim <- generateData(n = 300, p = 30, q = 12, rho = 0.1, 
                    missing.type = "MCAR")

# 5-fold cross-validation
cvfit <- cv.missoNet(
  X = sim$X,
  Y = sim$Z,
  kfold = 5,
  compute.1se = TRUE,  # Also compute 1-SE models
  seed = 123,          # For reproducibility
  verbose = 1
)


# ====== Cross-Validation Summary ======
{
  cat("\nNumber of folds:", cvfit$kfold, "\n")
  cat("Lambda pairs evaluated:", length(cvfit$param_set$cv.errors.mean), "\n")
  cat("Minimum CV error:", round(min(cvfit$param_set$cv.errors.mean), 4), "\n")
}

# ====== Optimal parameters (min CV error) ======
{
  cat("\nlambda.beta:", cvfit$est.min$lambda.beta, "\n")
  cat("lambda.theta:", cvfit$est.min$lambda.theta, "\n")
}

# ====== 1-SE parameters ======
{
  cat("\nlambda.beta:", cvfit$est.1se.beta$lambda.beta, "\n")
  cat("lambda.theta:", cvfit$est.1se.theta$lambda.theta, "\n")
}

Understanding the 1-SE Rule

The 1-SE rule selects a more parsimonious model whose cross-validation error lies within one standard error of the minimum:

# Compare three models
models <- list(
  "Minimum CV" = cvfit$est.min,
  "1SE Beta" = cvfit$est.1se.beta,
  "1SE Theta" = cvfit$est.1se.theta
)

# Create comparison table
model_comparison <- data.frame(
  Model = names(models),
  Lambda.Beta = numeric(3),
  Lambda.Theta = numeric(3),
  Nonzero.Coef = numeric(3),
  Network.Edges = numeric(3)
)

for (i in 1:3) {
  if (!is.null(models[[i]])) {
    model_comparison$Lambda.Beta[i] <- models[[i]]$lambda.beta
    model_comparison$Lambda.Theta[i] <- models[[i]]$lambda.theta
    model_comparison$Nonzero.Coef[i] <- sum(abs(models[[i]]$Beta) > 1e-8)
    model_comparison$Network.Edges[i] <- 
      sum(abs(models[[i]]$Theta[upper.tri(models[[i]]$Theta)]) > 1e-8)
  }
}

print(model_comparison, digits = 4)
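
The threshold behind the 1-SE rule can be sketched from the stored CV errors. The snippet below is an illustration of the rule, not the internal computation of cv.missoNet; it assumes the param_set fields used throughout this vignette and that cv.errors.upper equals the mean error plus one standard error.

# Illustrative reconstruction of the 1-SE threshold (not the internal code path)
cv_mean <- cvfit$param_set$cv.errors.mean
cv_se   <- cvfit$param_set$cv.errors.upper - cv_mean  # assumed: upper band = mean + SE

i_min <- which.min(cv_mean)
threshold <- cv_mean[i_min] + cv_se[i_min]

# Candidate models whose CV error falls within one SE of the minimum
candidates <- which(cv_mean <= threshold)
cat("Models within 1 SE of the minimum:", length(candidates), "\n")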

Visualizing CV Results

# Heatmap of CV errors
plot(cvfit, type = "heatmap", detailed.axes = FALSE)
# 3D surface plot
plot(cvfit, type = "scatter", plt.surf = TRUE)

CV Error Analysis

# Extract and analyze CV errors
cv_errors <- cvfit$param_set$cv.errors.mean
cv_errors_sd <- (cvfit$param_set$cv.errors.upper - cvfit$param_set$cv.errors.lower) / 2

# Unique lambda values defining the error surface grid
lambda_beta_unique <- unique(cvfit$param_set$cv.grid.beta)
lambda_theta_unique <- unique(cvfit$param_set$cv.grid.theta)

# Find optimal region
min_error <- min(cv_errors)
near_optimal <- which(cv_errors <= min_error * 1.05)  # Within 5% of minimum

# ====== Regularization path statistics ======
{
  cat("\n  Error range: [", round(min(cv_errors), 4), ", ", 
      round(max(cv_errors), 4), "]\n", sep = "")
  cat("  Models within 5% of minimum:", length(near_optimal), "\n")
  cat("  This suggests", ifelse(length(near_optimal) > 10, "stable", "sensitive"), 
      "parameter selection\n")
}
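
A one-dimensional profile of the error surface is also informative: fixing lambda.theta at its best value and plotting CV error against lambda.beta shows how flat the surface is along that direction. This sketch assumes the cv.grid.beta and cv.grid.theta vectors are aligned element-wise with cv.errors.mean.

# Profile of CV error along lambda.beta at the best lambda.theta (illustrative)
grid_beta  <- cvfit$param_set$cv.grid.beta
grid_theta <- cvfit$param_set$cv.grid.theta

best_theta  <- grid_theta[which.min(cv_errors)]
profile_idx <- which(grid_theta == best_theta)

plot(log10(grid_beta[profile_idx]), cv_errors[profile_idx],
     type = "b", pch = 19,
     xlab = "log10(lambda.beta)", ylab = "Mean CV error",
     main = "CV error profile at the optimal lambda.theta")
abline(h = min(cv_errors), lty = 2, col = "grey50")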

Parallel Processing

Setting Up Parallel CV

Parallel processing is most beneficial when individual fold fits are expensive, for example with large sample sizes, many folds, or fine lambda grids:

library(parallel)

# Detect available cores
n_cores <- detectCores()
cat("Available cores:", n_cores, "\n")

# Create a cluster (at most 2 workers for illustration)
cl <- makeCluster(max(1, min(n_cores - 1, 2)))

# Run parallel cross-validation
cvfit_parallel <- cv.missoNet(
  X = sim$X,
  Y = sim$Z,
  kfold = 5,
  parallel = TRUE,
  cl = cl,
  verbose = 1
)

# Clean up
stopCluster(cl)

Advanced Parameter Tuning

Adaptive Grid Search

# Standard grid search
time_standard <- system.time({
  fit_standard <- missoNet(
    X = sim$X,
    Y = sim$Z,
    adaptive.search = FALSE,
    verbose = 0
  )
})

# Adaptive search (faster)
time_adaptive <- system.time({
  fit_adaptive <- missoNet(
    X = sim$X,
    Y = sim$Z,
    adaptive.search = TRUE,
    verbose = 0
  )
})


# ====== Computation time comparison ======
{
  cat("\n  Standard search:", round(time_standard[3], 2), "seconds\n")
  cat("  Adaptive search:", round(time_adaptive[3], 2), "seconds\n")
  cat("  Speedup:", round(time_standard[3] / time_adaptive[3], 1), "x\n")
}
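
Speed is only useful if the adaptive search lands near the same solution. The check below compares the selected penalty pair and sparsity of the two fits; it assumes the est.min component is populated for plain missoNet() fits, as it is elsewhere in this vignette.

# Do standard and adaptive search select similar models? (sanity check)
fits <- list(Standard = fit_standard, Adaptive = fit_adaptive)
for (nm in names(fits)) {
  est <- fits[[nm]]$est.min
  if (!is.null(est)) {
    cat(nm, "search:",
        "lambda.beta =", signif(est$lambda.beta, 3),
        "| lambda.theta =", signif(est$lambda.theta, 3),
        "| nonzero Beta =", sum(abs(est$Beta) > 1e-8), "\n")
  }
}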

Custom Penalty Factors

p <- ncol(sim$X)
q <- ncol(sim$Z)

# Scenario: Prior knowledge about important predictors
# Group 1: Known important (predictors 1-10) - light penalty
# Group 2: Possibly important (predictors 11-20) - standard penalty
# Group 3: Likely unimportant (predictors 21-30) - heavy penalty

beta.pen.factor <- matrix(1, p, q)
beta.pen.factor[1:10, ] <- 0.1      # Light penalty
beta.pen.factor[11:20, ] <- 1.0     # Standard penalty
beta.pen.factor[21:p, ] <- 10.0     # Heavy penalty

# Network: Encourage block structure
theta.pen.factor <- matrix(1, q, q)
# Block 1: responses 1-6
theta.pen.factor[1:6, 1:6] <- 0.5
# Block 2: responses 7-12
theta.pen.factor[7:12, 7:12] <- 0.5
# Between blocks: heavy penalty
theta.pen.factor[1:6, 7:12] <- 2.0
theta.pen.factor[7:12, 1:6] <- 2.0

# Fit with structured penalties
fit_structured <- missoNet(
  X = sim$X,
  Y = sim$Z,
  beta.pen.factor = beta.pen.factor,
  theta.pen.factor = theta.pen.factor,
  verbose = 0
)

# Analyze selection pattern
selected_groups <- c(
  Group1 = sum(rowSums(abs(fit_structured$est.min$Beta[1:10, ])) > 1e-8),
  Group2 = sum(rowSums(abs(fit_structured$est.min$Beta[11:20, ])) > 1e-8),
  Group3 = sum(rowSums(abs(fit_structured$est.min$Beta[21:p, ])) > 1e-8)
)


# Number of selected predictors per group; lighter penalties should
# steer selection toward Group 1
print(selected_groups)
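
Because the theta penalty factors encode the expected block structure, it is also worth checking whether the estimated network respects it. The sketch below counts within-block versus between-block edges in fit_structured$est.min$Theta.

# Within-block vs between-block edges in the estimated network (illustrative check)
Theta_hat <- fit_structured$est.min$Theta
block <- rep(c(1, 2), each = 6)          # responses 1-6 and 7-12
adj <- abs(Theta_hat) > 1e-8
diag(adj) <- FALSE

same_block <- outer(block, block, "==")
within  <- sum(adj[upper.tri(adj)] & same_block[upper.tri(same_block)])
between <- sum(adj[upper.tri(adj)] & !same_block[upper.tri(same_block)])
cat("Within-block edges:", within, "| Between-block edges:", between, "\n")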

Multi-Resolution Lambda Grids

# Create focused grid around expected optimal region
# Based on preliminary analysis or domain knowledge

# Log-scale grid with concentration
create_focused_grid <- function(center, width, n_points) {
  log_center <- log10(center)
  log_seq <- seq(log_center - width/2, log_center + width/2, 
                 length.out = n_points)
  return(10^log_seq)
}

# Focused grids based on problem characteristics
lambda.beta.focused <- create_focused_grid(
  center = 0.05,   # Expected optimal region
  width = 1.5,     # Log10 scale width
  n_points = 30
)

lambda.theta.focused <- create_focused_grid(
  center = 0.01,
  width = 1.0,
  n_points = 30
)

# Fit with focused grid
fit_focused <- missoNet(
  X = sim$X,
  Y = sim$Z,
  lambda.beta = lambda.beta.focused,
  lambda.theta = lambda.theta.focused
)
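
When supplying a custom grid, a useful sanity check is that the selected penalties are not sitting on the boundary of the grid; if they are, the grid should be widened. This assumes fit_focused$est.min stores the selected lambda.beta and lambda.theta, as for the other fits in this vignette.

# Check that the selected penalties are interior to the supplied grid
sel_beta  <- fit_focused$est.min$lambda.beta
sel_theta <- fit_focused$est.min$lambda.theta

at_edge <- sel_beta  %in% range(lambda.beta.focused) |
           sel_theta %in% range(lambda.theta.focused)
if (at_edge) {
  cat("Selected lambda is on the grid boundary; consider widening the grid.\n")
} else {
  cat("Selected lambdas are interior to the focused grid.\n")
}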

Convergence Diagnostics

Monitoring Convergence

# Fit with detailed verbosity
fit_verbose <- missoNet(
  X = sim$X,
  Y = sim$Z,
  lambda.beta = 0.05,
  lambda.theta = 0.01,
  verbose = 2  # Detailed output
)


# Check convergence information
if (!is.null(fit_verbose$est.min$converged)) {
  cat("\nConvergence Status:\n")
  cat("  Beta optimization:", 
      ifelse(fit_verbose$est.min$converged$Beta, "✓ Converged", "✗ Not converged"), "\n")
  cat("  Theta optimization:", 
      ifelse(fit_verbose$est.min$converged$Theta, "✓ Converged", "✗ Not converged"), "\n")
}


if (!is.null(fit_verbose$est.min$iterations)) {
  cat("\nIterations used:\n")
  cat("  Beta:", fit_verbose$est.min$iterations$Beta, "\n")
  cat("  Theta:", fit_verbose$est.min$iterations$Theta, "\n")
}

Handling Convergence Issues

Strategies for difficult problems:

# Simulate a difficult problem
sim_difficult <- generateData(
  n = 100,
  p = 150,  # p > n (high-dimensional)
  q = 20,
  rho = 0.25,  # Higher missing rate
  missing.type = "MAR"
)

# Strategy 1: Relax tolerances
fit_relaxed <- missoNet(
  X = sim_difficult$X,
  Y = sim_difficult$Z,
  beta.tol = 1e-3,
  theta.tol = 1e-3,
  verbose = 0
)

# Strategy 2: Increase iterations
fit_more_iter <- missoNet(
  X = sim_difficult$X,
  Y = sim_difficult$Z,
  beta.max.iter = 10000,
  theta.max.iter = 10000,
  verbose = 0
)

# Strategy 3: Use adaptive search for better initialization
fit_adaptive_init <- missoNet(
  X = sim_difficult$X,
  Y = sim_difficult$Z,
  adaptive.search = TRUE,
  verbose = 0
)
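
A quick way to compare the three strategies is to inspect the convergence flags of each fit, when the fitted object exposes them as in the diagnostics above.

# Compare convergence across the three strategies (fields assumed as above)
strategies <- list("Relaxed tol" = fit_relaxed,
                   "More iter"   = fit_more_iter,
                   "Adaptive"    = fit_adaptive_init)
for (nm in names(strategies)) {
  conv <- strategies[[nm]]$est.min$converged
  if (!is.null(conv)) {
    cat(nm, "- Beta:", conv$Beta, "| Theta:", conv$Theta, "\n")
  } else {
    cat(nm, "- no convergence information stored\n")
  }
}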

Relaxed Network Estimation

De-biased Precision Matrix

# Fit with relaxed network option
fit_relax <- missoNet(
  X = sim$X,
  Y = sim$Z,
  relax.net = TRUE,  # Refit network without penalty
  verbose = 0
)

# Compare penalized vs relaxed estimates
if (!is.null(fit_relax$est.min$relax.net)) {
  Theta_pen <- fit_relax$est.min$Theta
  Theta_relax <- fit_relax$est.min$relax.net

  # Extract edges
  edges_pen <- Theta_pen[upper.tri(Theta_pen)]
  edges_relax <- Theta_relax[upper.tri(Theta_relax)]

  # Select non-zero edges from penalized model
  active_edges <- abs(edges_pen) > 1e-8

  # === Relaxed network estimation ===
  cat("\n  Active edges:", sum(active_edges), "\n")
  cat("  Mean |edge| (penalized):", 
      round(mean(abs(edges_pen[active_edges])), 4), "\n")
  cat("  Mean |edge| (relaxed):", 
      round(mean(abs(edges_relax[active_edges])), 4), "\n")
  cat("  Relaxed estimates are typically larger (less shrinkage)\n")
}
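
A scatter of the relaxed against the penalized edge estimates makes the de-biasing visible: points above the diagonal correspond to edges whose magnitude grows once the penalty is removed. This sketch re-extracts the edges outside the if block above and assumes relax.net is stored as shown there.

# Visual comparison of penalized vs relaxed edge estimates (illustrative)
if (!is.null(fit_relax$est.min$relax.net)) {
  e_pen   <- fit_relax$est.min$Theta[upper.tri(fit_relax$est.min$Theta)]
  e_relax <- fit_relax$est.min$relax.net[upper.tri(fit_relax$est.min$relax.net)]
  keep <- abs(e_pen) > 1e-8

  plot(abs(e_pen[keep]), abs(e_relax[keep]), pch = 19, col = "steelblue",
       xlab = "|edge| (penalized)", ylab = "|edge| (relaxed)",
       main = "Shrinkage of penalized edge estimates")
  abline(0, 1, lty = 2)  # points above the line were shrunk by the penalty
}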

Model Selection Strategies

Nested Cross-Validation

# Nested CV for unbiased performance estimation
# Outer loop: Performance evaluation
# Inner loop: Parameter selection

evaluate_nested_cv <- function(X, Y, outer_folds = 5, inner_folds = 5) {
  n <- nrow(X)
  outer_fold_ids <- rep(1:outer_folds, length.out = n)
  outer_fold_ids <- sample(outer_fold_ids)

  test_errors <- numeric(outer_folds)

  for (k in 1:outer_folds) {
    # Split data
    test_idx <- which(outer_fold_ids == k)
    train_idx <- setdiff(1:n, test_idx)

    # Inner CV for parameter selection
    cvfit_inner <- cv.missoNet(
      X = X[train_idx, ],
      Y = Y[train_idx, ],
      kfold = inner_folds,
      verbose = 0
    )

    # Predict on test fold
    y_pred <- predict(cvfit_inner, newx = X[test_idx, ])
    y_true <- Y[test_idx, ]

    # Calculate error (ignoring missing values)
    mask <- !is.na(y_true)
    test_errors[k] <- mean((y_pred[mask] - y_true[mask])^2)
  }

  return(list(
    mean_error = mean(test_errors),
    se_error = sd(test_errors) / sqrt(outer_folds),
    fold_errors = test_errors
  ))
}

# Run nested CV
nested_results <- evaluate_nested_cv(
  sim$X, 
  sim$Z,
  outer_folds = 3,
  inner_folds = 3
)

{
  cat("\nNested Cross-Validation Results:\n")
  cat("  Mean test error:", round(nested_results$mean_error, 4), "\n")
  cat("  Standard error:", round(nested_results$se_error, 4), "\n")
  cat("  95% CI: [", 
      round(nested_results$mean_error - 1.96 * nested_results$se_error, 4), ", ",
      round(nested_results$mean_error + 1.96 * nested_results$se_error, 4), "]\n", sep = "")
}

Best Practices Summary

Recommended Workflow

{
  cat("=== Recommended missoNet Workflow ===\n\n")

  cat("1. Data Preparation:\n")
  cat("   - Check missing patterns\n")
  cat("   - Consider QC (usually recommended)\n")
  cat("   - Assess dimensionality (p vs n)\n\n")

  cat("2. Initial Exploration:\n")
  cat("   - Quick fit with adaptive.search = TRUE\n")
  cat("   - Examine selected lambda ranges\n")
  cat("   - Check convergence\n\n")

  cat("3. Cross-Validation:\n")
  cat("   - Use 5-10 folds depending on sample size\n")
  cat("   - Enable compute.1se = TRUE\n")
  cat("   - Consider parallel processing for large data\n\n")

  cat("4. Model Selection:\n")
  cat("   - Use eBIC for high-dimensional settings\n")
  cat("   - Consider 1-SE models for better generalization\n")
  cat("   - Validate on independent test set if available\n\n")

  cat("5. Diagnostics:\n")
  cat("   - Check convergence status\n")
  cat("   - Examine sparsity patterns\n")
  cat("   - Validate predictions\n")
}
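
Step 4 mentions validating on an independent test set; a minimal holdout sketch is shown below, assuming predict() works on a cv.missoNet fit as in the nested-CV example.

# Minimal holdout validation sketch (assumes predict() as used above)
set.seed(123)
n <- nrow(sim$X)
test_idx  <- sample(n, size = floor(0.2 * n))
train_idx <- setdiff(seq_len(n), test_idx)

cv_train <- cv.missoNet(X = sim$X[train_idx, ], Y = sim$Z[train_idx, ],
                        kfold = 5, verbose = 0)
pred <- predict(cv_train, newx = sim$X[test_idx, ])

mask <- !is.na(sim$Z[test_idx, ])
cat("Holdout MSE:", round(mean((pred[mask] - sim$Z[test_idx, ][mask])^2), 4), "\n")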

Performance Tips

# Create performance recommendation table
tips <- data.frame(
  Scenario = c(
    "Large n (>1000)",
    "Large p (>500)", 
    "Large q (>50)",
    "High missing (>30%)",
    "Dense solutions expected"
  ),
  Recommendation = c(
    "Use parallel processing, adaptive search",
    "Reduce n.lambda.beta, use eBIC",
    "Reduce n.lambda.theta, consider diagonal penalty",
    "Increase tolerances, check MAR assumption",
    "Reduce lambda ranges, focus grid"
  )
)

print(tips, right = FALSE)
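
As a rough illustration of how some of these recommendations translate into a call, the sketch below combines a coarser lambda grid with parallel cross-validation for a large-p problem. The n.lambda.beta / n.lambda.theta argument names are taken from the tips table above and should be checked against ?cv.missoNet before use.

# Illustrative settings for a large-p problem (argument names assumed; see ?cv.missoNet)
cl <- parallel::makeCluster(2)
cvfit_large_p <- cv.missoNet(
  X = sim$X,
  Y = sim$Z,
  kfold = 5,
  n.lambda.beta = 20,    # coarser grid along the Beta penalty
  n.lambda.theta = 20,   # coarser grid along the Theta penalty
  parallel = TRUE,
  cl = cl,
  verbose = 0
)
parallel::stopCluster(cl)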

