inst/doc/missoNet-cross-validation.R

## ----setup, include = FALSE-----------------------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  fig.align = "center",
  out.width = "90%",
  dpi = 90,
  message = FALSE,
  warning = FALSE
)
options(width = 100)

## -------------------------------------------------------------------------------------------------
library(missoNet)

## -------------------------------------------------------------------------------------------------
# Generate example data
sim <- generateData(n = 300, p = 30, q = 12, rho = 0.1, 
                    missing.type = "MCAR")

# 5-fold cross-validation
cvfit <- cv.missoNet(
  X = sim$X,
  Y = sim$Z,
  kfold = 5,
  compute.1se = TRUE,  # Also compute 1-SE models
  seed = 123,          # For reproducibility
  verbose = 1
)


# ====== Cross-Validation Summary ======
{
  cat("\nNumber of folds:", cvfit$kfold, "\n")
  cat("Lambda pairs evaluated:", length(cvfit$param_set$cv.errors.mean), "\n")
  cat("Minimum CV error:", round(min(cvfit$param_set$cv.errors.mean), 4), "\n")
}

# ====== Optimal parameters (min CV error) ======
{
  cat("\nlambda.beta:", cvfit$est.min$lambda.beta, "\n")
  cat("lambda.theta:", cvfit$est.min$lambda.theta, "\n")
}

# ====== 1-SE parameters ======
{
  cat("\nlambda.beta:", cvfit$est.1se.beta$lambda.beta, "\n")
  cat("lambda.theta:", cvfit$est.1se.theta$lambda.theta, "\n")
}

## -------------------------------------------------------------------------------------------------
# Compare three models
models <- list(
  "Minimum CV" = cvfit$est.min,
  "1SE Beta" = cvfit$est.1se.beta,
  "1SE Theta" = cvfit$est.1se.theta
)

# Create comparison table
model_comparison <- data.frame(
  Model = names(models),
  Lambda.Beta = numeric(3),
  Lambda.Theta = numeric(3),
  Nonzero.Coef = numeric(3),
  Network.Edges = numeric(3)
)

for (i in 1:3) {
  if (!is.null(models[[i]])) {
    model_comparison$Lambda.Beta[i] <- models[[i]]$lambda.beta
    model_comparison$Lambda.Theta[i] <- models[[i]]$lambda.theta
    model_comparison$Nonzero.Coef[i] <- sum(abs(models[[i]]$Beta) > 1e-8)
    model_comparison$Network.Edges[i] <- 
      sum(abs(models[[i]]$Theta[upper.tri(models[[i]]$Theta)]) > 1e-8)
  }
}

print(model_comparison, digits = 4)

## -------------------------------------------------------------------------------------------------
# Heatmap of CV errors
plot(cvfit, type = "heatmap", detailed.axes = FALSE)

## -------------------------------------------------------------------------------------------------
# 3D surface plot
plot(cvfit, type = "scatter", plt.surf = TRUE)

## -------------------------------------------------------------------------------------------------
# Extract and analyze CV errors
cv_errors <- cvfit$param_set$cv.errors.mean
cv_errors_sd <- (cvfit$param_set$cv.errors.upper - cvfit$param_set$cv.errors.lower) / 2

# Create error surface data
lambda_beta_unique <- unique(cvfit$param_set$cv.grid.beta)
lambda_theta_unique <- unique(cvfit$param_set$cv.grid.theta)

# Find optimal region
min_error <- min(cv_errors)
near_optimal <- which(cv_errors <= min_error * 1.05)  # Within 5% of minimum

# ====== Regularization path statistics ======
{
  cat("\n  Error range: [", round(min(cv_errors), 4), ", ", 
      round(max(cv_errors), 4), "]\n", sep = "")
  cat("  Models within 5% of minimum:", length(near_optimal), "\n")
  cat("  This suggests", ifelse(length(near_optimal) > 10, "stable", "sensitive"), 
      "parameter selection\n")
}

## ----eval = FALSE---------------------------------------------------------------------------------
# library(parallel)
# 
# # Detect available cores
# n_cores <- detectCores()
# cat("Available cores:", n_cores, "\n")
# 
# # Create cluster, use 2 cores for illustration
# cl <- makeCluster(min(n_cores - 1, 2))
# 
# # Run parallel cross-validation
# cvfit_parallel <- cv.missoNet(
#   X = sim$X,
#   Y = sim$Z,
#   kfold = 5,
#   parallel = TRUE,
#   cl = cl,
#   verbose = 1
# )
# 
# # Clean up
# stopCluster(cl)

## -------------------------------------------------------------------------------------------------
# Standard grid search
time_standard <- system.time({
  fit_standard <- missoNet(
    X = sim$X,
    Y = sim$Z,
    adaptive.search = FALSE,
    verbose = 0
  )
})

# Adaptive search (faster)
time_adaptive <- system.time({
  fit_adaptive <- missoNet(
    X = sim$X,
    Y = sim$Z,
    adaptive.search = TRUE,
    verbose = 0
  )
})


# ====== Computation time comparison ======
{
  cat("\n  Standard search:", round(time_standard[3], 2), "seconds\n")
  cat("  Adaptive search:", round(time_adaptive[3], 2), "seconds\n")
  cat("  Speedup:", round(time_standard[3] / time_adaptive[3], 1), "x\n")
}

## -------------------------------------------------------------------------------------------------
p <- ncol(sim$X)
q <- ncol(sim$Z)

# Scenario: Prior knowledge about important predictors
# Group 1: Known important (predictors 1-10) - light penalty
# Group 2: Possibly important (predictors 11-30) - moderate penalty  
# Group 3: Likely unimportant (predictors 31-60) - heavy penalty

beta.pen.factor <- matrix(1, p, q)
beta.pen.factor[1:10, ] <- 0.1      # Light penalty
beta.pen.factor[11:20, ] <- 1.0     # Standard penalty
beta.pen.factor[21:p, ] <- 10.0     # Heavy penalty

# Network: Encourage block structure
theta.pen.factor <- matrix(1, q, q)
# Block 1: responses 1-6
theta.pen.factor[1:6, 1:6] <- 0.5
# Block 2: responses 7-12
theta.pen.factor[7:12, 7:12] <- 0.5
# Between blocks: heavy penalty
theta.pen.factor[1:6, 7:12] <- 2.0
theta.pen.factor[7:12, 1:6] <- 2.0

# Fit with structured penalties
fit_structured <- missoNet(
  X = sim$X,
  Y = sim$Z,
  beta.pen.factor = beta.pen.factor,
  theta.pen.factor = theta.pen.factor,
  verbose = 0
)

# Analyze selection pattern
selected_groups <- c(
  Group1 = sum(rowSums(abs(fit_structured$est.min$Beta[1:10, ])) > 1e-8),
  Group2 = sum(rowSums(abs(fit_structured$est.min$Beta[11:20, ])) > 1e-8),
  Group3 = sum(rowSums(abs(fit_structured$est.min$Beta[21:p, ])) > 1e-8)
)


# Penalty factors successfully guided selection toward prior knowledge
# Selected predictors by group:
print(selected_groups)

## ----eval = FALSE---------------------------------------------------------------------------------
# # Create focused grid around expected optimal region
# # Based on preliminary analysis or domain knowledge
# 
# # Log-scale grid with concentration
# create_focused_grid <- function(center, width, n_points) {
#   log_center <- log10(center)
#   log_seq <- seq(log_center - width/2, log_center + width/2,
#                  length.out = n_points)
#   return(10^log_seq)
# }
# 
# # Focused grids based on problem characteristics
# lambda.beta.focused <- create_focused_grid(
#   center = 0.05,   # Expected optimal region
#   width = 1.5,     # Log10 scale width
#   n_points = 30
# )
# 
# lambda.theta.focused <- create_focused_grid(
#   center = 0.01,
#   width = 1.0,
#   n_points = 30
# )
# 
# # Fit with focused grid
# fit_focused <- missoNet(
#   X = sim$X,
#   Y = sim$Z,
#   lambda.beta = lambda.beta.focused,
#   lambda.theta = lambda.theta.focused
# )

## -------------------------------------------------------------------------------------------------
# Fit with detailed verbosity
fit_verbose <- missoNet(
  X = sim$X,
  Y = sim$Z,
  lambda.beta = 0.05,
  lambda.theta = 0.01,
  verbose = 2  # Detailed output
)


# Check convergence information
if (!is.null(fit_verbose$est.min$converged)) {
  cat("\nConvergence Status:\n")
  cat("  Beta optimization:", 
      ifelse(fit_verbose$est.min$converged$Beta, "✓ Converged", "✗ Not converged"), "\n")
  cat("  Theta optimization:", 
      ifelse(fit_verbose$est.min$converged$Theta, "✓ Converged", "✗ Not converged"), "\n")
}


if (!is.null(fit_verbose$est.min$iterations)) {
  cat("\nIterations used:\n")
  cat("  Beta:", fit_verbose$est.min$iterations$Beta, "\n")
  cat("  Theta:", fit_verbose$est.min$iterations$Theta, "\n")
}

## ----eval = FALSE---------------------------------------------------------------------------------
# # Simulate a difficult problem
# sim_difficult <- generateData(
#   n = 100,
#   p = 150,  # p > n (high-dimensional)
#   q = 20,
#   rho = 0.25,  # Higher missing rate
#   missing.type = "MAR"
# )
# 
# # Strategy 1: Relax tolerances
# fit_relaxed <- missoNet(
#   X = sim_difficult$X,
#   Y = sim_difficult$Z,
#   beta.tol = 1e-3,
#   theta.tol = 1e-3,
#   verbose = 0
# )
# 
# # Strategy 2: Increase iterations
# fit_more_iter <- missoNet(
#   X = sim_difficult$X,
#   Y = sim_difficult$Z,
#   beta.max.iter = 10000,
#   theta.max.iter = 10000,
#   verbose = 0
# )
# 
# # Strategy 3: Use adaptive search for better initialization
# fit_adaptive_init <- missoNet(
#   X = sim_difficult$X,
#   Y = sim_difficult$Z,
#   adaptive.search = TRUE,
#   verbose = 0
# )

## -------------------------------------------------------------------------------------------------
# Fit with relaxed network option
fit_relax <- missoNet(
  X = sim$X,
  Y = sim$Z,
  relax.net = TRUE,  # Refit network without penalty
  verbose = 0
)

# Compare penalized vs relaxed estimates
if (!is.null(fit_relax$est.min$relax.net)) {
  Theta_pen <- fit_relax$est.min$Theta
  Theta_relax <- fit_relax$est.min$relax.net
  
  # Extract edges
  edges_pen <- Theta_pen[upper.tri(Theta_pen)]
  edges_relax <- Theta_relax[upper.tri(Theta_relax)]
  
  # Select non-zero edges from penalized model
  active_edges <- abs(edges_pen) > 1e-8
  
  # === Relaxed network estimation ===
  cat("\n  Active edges:", sum(active_edges), "\n")
  cat("  Mean |edge| (penalized):", 
      round(mean(abs(edges_pen[active_edges])), 4), "\n")
  cat("  Mean |edge| (relaxed):", 
      round(mean(abs(edges_relax[active_edges])), 4), "\n")
  cat("  Relaxed estimates are typically larger (less shrinkage)\n")
}

## -------------------------------------------------------------------------------------------------
# Nested CV for unbiased performance estimation
# Outer loop: Performance evaluation
# Inner loop: Parameter selection

evaluate_nested_cv <- function(X, Y, outer_folds = 5, inner_folds = 5) {
  n <- nrow(X)
  outer_fold_ids <- rep(1:outer_folds, length.out = n)
  outer_fold_ids <- sample(outer_fold_ids)
  
  test_errors <- numeric(outer_folds)
  
  for (k in 1:outer_folds) {
    # Split data
    test_idx <- which(outer_fold_ids == k)
    train_idx <- setdiff(1:n, test_idx)
    
    # Inner CV for parameter selection
    cvfit_inner <- cv.missoNet(
      X = X[train_idx, ],
      Y = Y[train_idx, ],
      kfold = inner_folds,
      verbose = 0
    )
    
    # Predict on test fold
    y_pred <- predict(cvfit_inner, newx = X[test_idx, ])
    y_true <- Y[test_idx, ]
    
    # Calculate error (ignoring missing values)
    mask <- !is.na(y_true)
    test_errors[k] <- mean((y_pred[mask] - y_true[mask])^2)
  }
  
  return(list(
    mean_error = mean(test_errors),
    se_error = sd(test_errors) / sqrt(outer_folds),
    fold_errors = test_errors
  ))
}

# Run nested CV
nested_results <- evaluate_nested_cv(
  sim$X, 
  sim$Z,
  outer_folds = 3,
  inner_folds = 3
)

{
  cat("\nNested Cross-Validation Results:\n")
  cat("  Mean test error:", round(nested_results$mean_error, 4), "\n")
  cat("  Standard error:", round(nested_results$se_error, 4), "\n")
  cat("  95% CI: [", 
      round(nested_results$mean_error - 1.96 * nested_results$se_error, 4), ", ",
      round(nested_results$mean_error + 1.96 * nested_results$se_error, 4), "]\n", sep = "")
}

## ----echo = FALSE, include = TRUE-----------------------------------------------------------------
{
  cat("=== Recommended missoNet Workflow ===\n\n")
  
  cat("1. Data Preparation:\n")
  cat("   - Check missing patterns\n")
  cat("   - Consider QC (usually recommended)\n")
  cat("   - Assess dimensionality (p vs n)\n\n")
  
  cat("2. Initial Exploration:\n")
  cat("   - Quick fit with adaptive.search = TRUE\n")
  cat("   - Examine selected lambda ranges\n")
  cat("   - Check convergence\n\n")
  
  cat("3. Cross-Validation:\n")
  cat("   - Use 5-10 folds depending on sample size\n")
  cat("   - Enable compute.1se = TRUE\n")
  cat("   - Consider parallel processing for large data\n\n")
  
  cat("4. Model Selection:\n")
  cat("   - Use eBIC for high-dimensional settings\n")
  cat("   - Consider 1-SE models for better generalization\n")
  cat("   - Validate on independent test set if available\n\n")
  
  cat("5. Diagnostics:\n")
  cat("   - Check convergence status\n")
  cat("   - Examine sparsity patterns\n")
  cat("   - Validate predictions\n")
}

## ----echo = FALSE, include = TRUE-----------------------------------------------------------------
# Create performance recommendation table
tips <- data.frame(
  Scenario = c(
    "Large n (>1000)",
    "Large p (>500)", 
    "Large q (>50)",
    "High missing (>30%)",
    "Dense solutions expected"
  ),
  Recommendation = c(
    "Use parallel processing, adaptive search",
    "Reduce n.lambda.beta, use eBIC",
    "Reduce n.lambda.theta, consider diagonal penalty",
    "Increase tolerances, check MAR assumption",
    "Reduce lambda ranges, focus grid"
  )
)

print(tips, right = FALSE)

Try the missoNet package in your browser

Any scripts or data that you put into this service are public.

missoNet documentation built on Sept. 9, 2025, 5:55 p.m.