R/stepOptim.R

# =====================================================================
# stepOptim.R - Two-Stage Pre-Optimization for missoNet
# =====================================================================
# Purpose: Efficient hyperparameter selection using a two-stage search
# Stage 1: Fix lambda.theta, search over lambda.beta
# Stage 2: Fix lambda.beta (from Stage 1), search over lambda.theta
# =====================================================================

stepOptim <- function(X, Y, init.obj, GoF, cv, eta, eps,
                      lamB.vec, lamTh.vec, n.lamB, n.lamTh,
                      lamB.min.ratio, lamTh.min.ratio,
                      lamB.scale.factor = NULL, lamTh.scale.factor = NULL,
                      Beta.maxit, Beta.thr, Theta.maxit, Theta.thr,
                      verbose) {
  
  # -------------------- Setup --------------------
  n <- nrow(X); p <- ncol(X); q <- ncol(Y)
  
  X <- robust_scale(X, init.obj$mx, init.obj$sdx)
  Y <- robust_scale(Y, init.obj$my, init.obj$sdy)
  Z <- Y; Z[is.na(Z)] <- 0
  
  # -------------------- Observation Probability Matrices --------------------
  # rho.mat.1: pxq matrix for X'Y scaling
  # rho.mat.2: qxq matrix for Y'Y scaling
  
  obs_prob <- init.obj$obs_prob
  rho.mat.1 <- matrix(obs_prob, nrow = p, ncol = q, byrow = TRUE)
  rho.mat.2 <- outer(obs_prob, obs_prob, `*`); diag(rho.mat.2) <- obs_prob
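  # Example: with obs_prob = c(0.9, 0.7), rho.mat.2 is
  #   0.90 0.63
  #   0.63 0.70
  # i.e. pairwise observation probabilities off-diagonal, marginal ones on the diagonal.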
  
  # -------------------- Precompute Sufficient Statistics --------------------
  # These are the unbiased moment estimators accounting for missing data
  
  xtx <- crossprod(X)
  xtx <- make_positive_definite(xtx)
  
  til.xty <- crossprod(X, Z) / rho.mat.1  # Unbiased X'Y
  til.ytx <- t(til.xty)
  til.yty <- crossprod(Z) / rho.mat.2     # Unbiased Y'Y
  til.yty <- make_positive_definite(til.yty)
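  
  # Illustrative sketch of the bias correction (commented out; the objects below are
  # hypothetical and not part of the fitting path). Zero-filling missing entries
  # shrinks crossprod(Z) by roughly the observation probabilities, so dividing by
  # rho.mat.2 approximately restores Y'Y:
  #   set.seed(1); y <- matrix(rnorm(2e4), 1e4, 2); pr <- c(0.7, 0.9)
  #   z <- y * sweep(matrix(runif(2e4), 1e4, 2), 2, pr, `<`)  # zero-out "missing" cells
  #   rho <- outer(pr, pr); diag(rho) <- pr
  #   crossprod(z) / rho    # close to crossprod(y) up to sampling noise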
  
  # -------------------- Lambda Grid Generation --------------------
  if (is.null(lamB.vec) || is.null(lamTh.vec)) {
    lam.obj <- InitLambda(
      lamB = lamB.vec, lamTh = lamTh.vec, init.obj = init.obj,
      n = n, p = p, q = q,
      n.lamB = n.lamB, n.lamTh = n.lamTh,
      lamB.min.ratio = lamB.min.ratio,
      lamTh.min.ratio = lamTh.min.ratio,
      lamB.scale.factor = lamB.scale.factor,    # pass user-supplied scale factors through
      lamTh.scale.factor = lamTh.scale.factor
    )
    lamB.vec <- sort(unique(lam.obj$lamB.vec), decreasing = TRUE)
    lamTh.vec <- sort(unique(lam.obj$lamTh.vec), decreasing = TRUE)
  } else {
    lamB.vec <- sort(unique(lamB.vec), decreasing = TRUE)
    lamTh.vec <- sort(unique(lamTh.vec), decreasing = TRUE)
  }
  
  # -------------------- Adaptive Tolerance Settings --------------------
  # For pre-optimization, we can use looser tolerances for speed
  # These will be tightened in the final model fitting
  
  n_eff <- init.obj$n_eff %||% (n * (1 - mean(init.obj$rho.vec)))  # Effective sample size
  high_dim <- (n_eff < p) || (n_eff < q)                           # Check if high-dimensional
  
  if (high_dim) {
    # High-dimensional: prioritize speed with looser tolerances
    beta_tol_fast <- min(0.01, Beta.thr * 10)
    theta_tol_fast <- min(0.01, Theta.thr * 10)
    beta_maxit_fast <- min(Beta.maxit, 500)
    theta_maxit_fast <- min(Theta.maxit, 500)
  } else {
    # Low-dimensional: can afford tighter tolerances
    beta_tol_fast <- min(0.001, Beta.thr * 5)
    theta_tol_fast <- min(0.001, Theta.thr * 5)
    beta_maxit_fast <- min(Beta.maxit, 1000)
    theta_maxit_fast <- min(Theta.maxit, 1000)
  }
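  # For example, with Beta.thr = 1e-4 the fast tolerance is min(0.01, 1e-3) = 1e-3
  # in the high-dimensional case and min(0.001, 5e-4) = 5e-4 otherwise.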
  
  # =====================================================================
  # Stage 1: Search over lambda.beta (with fixed lambda.theta)
  # =====================================================================
  
  GoF.vec <- numeric(length(lamB.vec))
  temp.list <- lapply(seq_along(lamB.vec), function(x) matrix(0, p, q))
  if (cv) {
    mse.vec <- numeric(length(lamB.vec))
  }
  
  # Initialize Theta with a conservative choice (largest lambda.theta)
  residual.cov <- til.yty / (n - 1)  # Initial residual covariance (Beta = 0)
  lamTh.mat <- lamTh.vec[1] * init.obj$lamTh.pf
  lamTh.mat[lamTh.mat == 0] <- 1e-12
  diag(lamTh.mat) <- 1e-12  # Don't penalize diagonal
  
  Theta.out <- tryCatch({
    run_glasso(S = residual.cov,
               rho = lamTh.mat,
               thr = theta_tol_fast,
               maxIt = theta_maxit_fast)
  }, error = function(e) {
    if (verbose > 0) cat("    Glasso failed in warm start, using diagonal approximation\n")
    list(wi = diag(pmin(pmax(1/diag(residual.cov), eps), 1/eps)))
  })
  Theta <- make_symmetric(Theta.out$wi)
  
  if (verbose > 0) {
    cat("\n  - Stage: 1/2 (Searching lambda.beta)\n")
    pb <- txtProgressBar(min = 0, max = length(lamB.vec), style = 3, width = 50, char = "=")
  }
  
  # Initialize Beta
  Beta <- matrix(0, p, q)
  
  for (i in seq_along(lamB.vec)) {
    # Update Beta with current lambda.beta
    lamB.mat <- lamB.vec[i] * init.obj$lamB.pf
    lamB.mat[lamB.mat == 0] <- 1e-12
    
    Beta <- updateBeta(
      Theta = Theta, 
      B0 = Beta,  # Warm start
      n = n, 
      xtx = xtx, 
      xty = til.xty, 
      lamB = lamB.mat,
      eta = eta, 
      tolin = beta_tol_fast, 
      maxitrin = beta_maxit_fast
    )$Bhat
    
    # Update residual covariance
    E <- Y - X %*% Beta
    residual.cov <- getResCov(E, n, rho.mat.2)
    
    # Update Theta
    Theta.out <- tryCatch({
      run_glasso(S = residual.cov,
                 rho = lamTh.mat, 
                 thr = theta_tol_fast,
                 maxIt = theta_maxit_fast)
    }, error = function(e) {
      list(wi = Theta)  # Keep previous Theta
    })
    Theta <- make_symmetric(Theta.out$wi)
    
    # Store evaluation metrics
    if (cv) {
      mse.vec[i] <- getEvalErr(
        yty = til.yty, 
        ytx = til.ytx, 
        xty = til.xty, 
        xtx = xtx, 
        Beta = Beta, 
        n = n
      )
    }
    
    GoF.vec[i] <- GoF_func(
      GoF = GoF, 
      gamma = 0.5, 
      n = n, 
      p = p, 
      q = q,
      Theta = Theta, 
      S = residual.cov, 
      Beta = Beta
    )
    
    temp.list[[i]] <- Beta
    
    if (verbose > 0) {
      setTxtProgressBar(pb, i)
    }
  }
  
  if (verbose > 0) {
    close(pb)
  }
  
  # Select optimal lambda.beta
  if (cv) {
    # Compromise between the MSE-optimal and GoF-optimal indices: take the midpoint
    # of the two minimizers along the (decreasing) lambda.beta path.
    # A rank-based weighted average is kept below for reference:
    # rank_mse <- rank(mse.vec, ties.method = "average")
    # rank_gof <- rank(GoF.vec, ties.method = "average")
    # combined_score <- 0.5 * rank_mse / length(mse.vec) + 0.5 * rank_gof / length(GoF.vec)
    # lamB.id <- which.min(combined_score)
    lamB.id <- ceiling((which.min(mse.vec) + which.min(GoF.vec)) / 2)
  } else {
    lamB.id <- which.min(GoF.vec)
  }
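  # For example, if which.min(mse.vec) == 4 and which.min(GoF.vec) == 9, then
  # lamB.id = ceiling((4 + 9) / 2) = 7, a point between the two optima on the path.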
  
  lamB.min <- lamB.vec[lamB.id]
  lamB.mat <- lamB.min * init.obj$lamB.pf
  lamB.mat[lamB.mat == 0] <- 1e-12
  Beta <- temp.list[[lamB.id]]
  
  # =====================================================================
  # Stage 2: Search over lambda.theta (with fixed lambda.beta)
  # =====================================================================
  
  GoF.vec <- numeric(length(lamTh.vec))
  if (cv) {
    mse.vec <- numeric(length(lamTh.vec))
  }
  
  # Compute residual covariance with optimal Beta
  E <- Y - X %*% Beta
  residual.cov <- getResCov(E, n, rho.mat.2)
  
  if (verbose > 0) {
    cat("  - Stage: 2/2 (Searching lambda.theta)\n")
    pb <- txtProgressBar(min = 0, max = length(lamTh.vec), style = 3, width = 50, char = "=")
  }
  
  # Re-initialize Theta for Stage 2
  # Use the first (largest) lambda.theta as starting point
  lamTh.mat <- lamTh.vec[1] * init.obj$lamTh.pf
  lamTh.mat[lamTh.mat == 0] <- 1e-12
  diag(lamTh.mat) <- 1e-12  # Don't penalize diagonal (as in Stage 1)
  
  Theta.out <- tryCatch({
    run_glasso(S = residual.cov,
               rho = lamTh.mat,
               thr = theta_tol_fast,
               maxIt = theta_maxit_fast)
  }, error = function(e) {
    if (verbose > 0) cat("    Glasso failed in warm start, using diagonal approximation\n")
    list(wi = diag(pmin(pmax(1/diag(residual.cov), eps), 1/eps)))
  })
  Theta <- make_symmetric(Theta.out$wi)
  
  for (i in seq_along(lamTh.vec)) {
    # Update Theta with current lambda.theta
    lamTh.mat <- lamTh.vec[i] * init.obj$lamTh.pf
    lamTh.mat[lamTh.mat == 0] <- 1e-12
    diag(lamTh.mat) <- 1e-12  # Don't penalize diagonal (as in Stage 1)
    
    Theta.out <- tryCatch({
      run_glasso(S = residual.cov,
                 rho = lamTh.mat, 
                 thr = theta_tol_fast,
                 maxIt = theta_maxit_fast)
    }, error = function(e) {
      list(wi = Theta)  # Keep previous Theta
    })
    Theta <- make_symmetric(Theta.out$wi)
    
    # Update Beta with fixed lambda.beta
    Beta <- updateBeta(
      Theta = Theta, 
      B0 = Beta,  # Warm start
      n = n, 
      xtx = xtx, 
      xty = til.xty, 
      lamB = lamB.mat,
      eta = eta, 
      tolin = beta_tol_fast, 
      maxitrin = beta_maxit_fast
    )$Bhat
    
    # Update residual covariance
    E <- Y - X %*% Beta
    residual.cov <- getResCov(E, n, rho.mat.2)
    
    # Store evaluation metrics
    if (cv) {
      mse.vec[i] <- getEvalErr(
        yty = til.yty, 
        ytx = til.ytx, 
        xty = til.xty, 
        xtx = xtx, 
        Beta = Beta, 
        n = n
      )
    }
    
    GoF.vec[i] <- GoF_func(
      GoF = GoF, 
      gamma = 0.5, 
      n = n, 
      p = p, 
      q = q,
      Theta = Theta, 
      S = residual.cov, 
      Beta = Beta
    )
    
    if (verbose > 0) {
      setTxtProgressBar(pb, i)
    }
  }
  
  if (verbose > 0) {
    close(pb); cat("\n")
  }
  
  # Select optimal lambda.theta
  if (cv) {
    # Compromise between the MSE-optimal and GoF-optimal indices, as in Stage 1.
    # A rank-based weighted average favoring GoF is kept below for reference:
    # rank_mse <- rank(mse.vec, ties.method = "average")
    # rank_gof <- rank(GoF.vec, ties.method = "average")
    # combined_score <- 0.3 * rank_mse / length(mse.vec) + 0.7 * rank_gof / length(GoF.vec)
    # lamTh.id <- which.min(combined_score)
    lamTh.id <- ceiling((which.min(mse.vec) + which.min(GoF.vec)) / 2)
  } else {
    lamTh.id <- which.min(GoF.vec)
  }
  
  lamTh.min <- lamTh.vec[lamTh.id]
  
  # -------------------- Return Results --------------------
  return(list(
    lamB.unique = lamB.vec,
    lamB.min = lamB.min,
    lamB.id = lamB.id,
    lamTh.unique = lamTh.vec,
    lamTh.min = lamTh.min,
    lamTh.id = lamTh.id
  ))
}
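
# ---------------------------------------------------------------------
# Hedged usage sketch (commented out). The exact structure of `init.obj`
# and the helpers used above (robust_scale, InitLambda, updateBeta,
# run_glasso, getResCov, getEvalErr, GoF_func, make_symmetric,
# make_positive_definite, `%||%`) are assumed to be supplied elsewhere in
# the package; the argument values below are placeholders, not defaults.
#
# step.obj <- stepOptim(
#   X = X, Y = Y, init.obj = init.obj,
#   GoF = "eBIC", cv = FALSE, eta = 0.8, eps = 1e-8,
#   lamB.vec = NULL, lamTh.vec = NULL, n.lamB = 40, n.lamTh = 40,
#   lamB.min.ratio = 0.01, lamTh.min.ratio = 0.01,
#   Beta.maxit = 1000, Beta.thr = 1e-4,
#   Theta.maxit = 1000, Theta.thr = 1e-4,
#   verbose = 1
# )
# step.obj$lamB.min   # pre-optimized lambda.beta
# step.obj$lamTh.min  # pre-optimized lambda.theta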


# GoF_func <- function(GoF, gamma, n, p, q, Theta, S, Beta) {
#   # Compute goodness-of-fit criterion
#   if (GoF == "eBIC") {
#     # Extended BIC
#     loglik <- -n/2 * (q * log(2 * pi) + determinant(solve(Theta), logarithm = TRUE)$modulus + 
#                         sum(diag(S %*% Theta)))
#     df_beta <- sum(Beta != 0)
#     df_theta <- sum(Theta[upper.tri(Theta)] != 0)
#     df <- df_beta + df_theta
#     
#     # eBIC with gamma parameter for high-dimensional adjustment
#     ebic <- -2 * loglik + log(n) * df + 4 * gamma * df * log(max(p, q))
#     return(ebic)
#   } else if (GoF == "AIC") {
#     # Standard AIC
#     loglik <- -n/2 * (q * log(2 * pi) + determinant(solve(Theta), logarithm = TRUE)$modulus + 
#                         sum(diag(S %*% Theta)))
#     df <- sum(Beta != 0) + sum(Theta[upper.tri(Theta)] != 0)
#     aic <- -2 * loglik + 2 * df
#     return(aic)
#   } else {
#     stop("GoF must be either 'eBIC' or 'AIC'")
#   }
# }
