R/generateData.R
In missoNet: Joint Sparse Regression & Network Learning with Missing Data

Documented in generateData

#' Generate synthetic data with missing values for missoNet
#'
#' @description
#' Generates synthetic data from a conditional Gaussian graphical model with 
#' user-specified missing data mechanisms. This function is designed for simulation 
#' studies and testing of the missoNet package, supporting three types of 
#' missingness: Missing Completely At Random (MCAR), Missing At Random (MAR), 
#' and Missing Not At Random (MNAR).
#'
#' @param n Integer. Sample size (number of observations). Must be at least 2.
#' @param p Integer. Number of predictor variables. Must be at least 1.
#' @param q Integer. Number of response variables. Must be at least 2.
#' @param rho Numeric scalar or vector of length \code{q}. Proportion of missing 
#'   values for each response variable. Values must be in [0, 1). If scalar, 
#'   the same missing rate is applied to all responses.
#' @param missing.type Character string specifying the missing data mechanism. 
#'   One of:
#'   \itemize{
#'     \item \code{"MCAR"} (default): Missing Completely At Random
#'     \item \code{"MAR"}: Missing At Random (depends on predictors)
#'     \item \code{"MNAR"}: Missing Not At Random (depends on response values)
#'   }
#' @param X Optional \code{n x p} matrix. User-supplied predictor matrix. If 
#'   \code{NULL} (default), predictors are simulated from a multivariate normal 
#'   distribution with mean zero and covariance \code{Sigma.X}.
#' @param Beta Optional \code{p x q} matrix. Regression coefficient matrix. If 
#'   \code{NULL} (default), a sparse coefficient matrix is generated with 
#'   sparsity controlled by \code{Beta.row.sparsity} and \code{Beta.elm.sparsity}.
#' @param E Optional \code{n x q} matrix. Error/noise matrix. If \code{NULL} 
#'   (default), errors are simulated from a multivariate normal distribution 
#'   with mean zero and precision matrix \code{Theta}.
#' @param Theta Optional \code{q x q} positive definite matrix. Precision matrix 
#'   (inverse covariance) for the response variables. If \code{NULL} (default), 
#'   a block-structured precision matrix is generated with four types of graph 
#'   structures. Only used when \code{E = NULL}.
#' @param Sigma.X Optional \code{p x p} positive definite matrix. Covariance 
#'   matrix for the predictors. If \code{NULL} (default), an AR(1) covariance 
#'   structure with correlation 0.7 is used. Only used when \code{X = NULL}.
#' @param Beta.row.sparsity Numeric in \[0, 1\]. Proportion of rows in Beta that 
#'   contain at least one non-zero element. Default is 0.2. Only used when 
#'   \code{Beta = NULL}.
#' @param Beta.elm.sparsity Numeric in \[0, 1\]. Proportion of non-zero elements 
#'   within active rows of Beta. Default is 0.2. Only used when \code{Beta = NULL}.
#' @param seed Optional integer. Random seed for reproducibility.
#'
#' @details
#' The function generates data through the following model:
#' \deqn{Y = XB + E}
#' where:
#' \itemize{
#'   \item \eqn{X \in \mathbb{R}^{n \times p}} is the predictor matrix
#'   \item \eqn{B \in \mathbb{R}^{p \times q}} is the coefficient matrix
#'   \item \eqn{E \sim \mathcal{MVN}(0, \Theta^{-1})} is the error matrix
#'   \item \eqn{Y \in \mathbb{R}^{n \times q}} is the complete response matrix
#' }
#' 
#' Missing values are then introduced to create \eqn{Z} (the observed response 
#' matrix with NAs) according to the specified mechanism:
#' 
#' \strong{MCAR}: Each element has probability \code{rho[j]} of being missing, 
#' independent of all variables.
#' 
#' \strong{MAR}: Missingness depends on the predictors through a logistic model:
#' \deqn{P(Z_{ij} = NA) = \mathrm{logit}^{-1}(XB)_{ij} \times c_j}
#' where \eqn{c_j} is calibrated to achieve the target missing rate.
#' 
#' \strong{MNAR}: The lowest \code{rho[j]} proportion of values in each column 
#' are set as missing.
#'
#' @return A list containing:
#' \item{X}{\code{n x p} matrix. Predictor matrix (either user-supplied or simulated).}
#' \item{Y}{\code{n x q} matrix. Complete response matrix without missing values.}
#' \item{Z}{\code{n x q} matrix. Response matrix with missing values (coded as NA).}
#' \item{Beta}{\code{p x q} matrix. Regression coefficient matrix used in generation.}
#' \item{Theta}{\code{q x q} matrix or NULL. Precision matrix (if used in generation).}
#' \item{rho}{Numeric vector of length \code{q}. Missing rates for each response.}
#' \item{missing.type}{Character string. The missing mechanism used.}
#'
#' @examples
#' # Example 1: Basic usage with default settings
#' sim.dat <- generateData(n = 300, p = 50, q = 20, rho = 0.1, seed = 857)
#' 
#' # Check dimensions and missing rate
#' dim(sim.dat$X)      # 300 x 50
#' dim(sim.dat$Z)      # 300 x 20
#' mean(is.na(sim.dat$Z))  # approximately 0.1
#' 
#' # Example 2: Variable missing rates with MAR mechanism
#' rho.vec <- seq(0.05, 0.25, length.out = 20)
#' sim.dat <- generateData(n = 300, p = 50, q = 20, 
#'                        rho = rho.vec, 
#'                        missing.type = "MAR")
#' 
#' # Example 3: High sparsity in coefficient matrix
#' sim.dat <- generateData(n = 500, p = 100, q = 30,
#'                        rho = 0.15,
#'                        Beta.row.sparsity = 0.1,  # 10% active predictors
#'                        Beta.elm.sparsity = 0.3)  # 30% active in each row
#' 
#' # Example 4: User-supplied matrices
#' n <- 300; p <- 50; q <- 20
#' X <- matrix(rnorm(n*p), n, p)
#' Beta <- matrix(rnorm(p*q) * rbinom(p*q, 1, 0.1), p, q)  # 10% non-zero
#' Theta <- diag(q) + 0.1  # Simple precision structure
#' 
#' sim.dat <- generateData(X = X, Beta = Beta, Theta = Theta,
#'                        n = n, p = p, q = q,
#'                        rho = 0.2, missing.type = "MNAR")
#' 
#' \donttest{
#' # Example 5: Use generated data with missoNet
#' library(missoNet)
#' sim.dat <- generateData(n = 400, p = 50, q = 10, rho = 0.15)
#' 
#' # Split into training and test sets
#' train.idx <- 1:300
#' test.idx <- 301:400
#' 
#' # Fit missoNet model
#' fit <- missoNet(X = sim.dat$X[train.idx, ], 
#'                Y = sim.dat$Z[train.idx, ],
#'                lambda.beta = 0.1, 
#'                lambda.theta = 0.1)
#' 
#' # Evaluate on test set
#' pred <- predict(fit, newx = sim.dat$X[test.idx, ])
#' }
#'
#' @author Yixiao Zeng \email{yixiao.zeng@@mail.mcgill.ca}, Celia M. T. Greenwood
#' 
#' @seealso 
#' \code{\link{missoNet}} for fitting models to data with missing values,
#' \code{\link{cv.missoNet}} for cross-validation
#'
#' @export
#' @importFrom mvtnorm rmvnorm
#' @importFrom stats rbinom runif rnorm quantile

generateData <- function(n, p, q, rho, missing.type = "MCAR",
                         X = NULL, Beta = NULL, E = NULL,
                         Theta = NULL, Sigma.X = NULL,
                         Beta.row.sparsity = 0.2, Beta.elm.sparsity = 0.2,
                         seed = NULL) {
  
  # ==================== Input Validation ====================
  
  # Validate dimensions
  if (!is.numeric(n) || length(n) != 1 || n < 2) {
    stop("'n' must be a single numeric value >= 2")
  }
  if (!is.numeric(p) || length(p) != 1 || p < 1) {
    stop("'p' must be a single numeric value >= 1")
  }
  if (!is.numeric(q) || length(q) != 1 || q < 2) {
    stop("'q' must be a single numeric value >= 2 (missoNet requires multiple responses)")
  }
  
  # Validate missing type
  missing.type <- match.arg(missing.type, c("MCAR", "MAR", "MNAR"))
  
  # Set seed if provided
  if (!is.null(seed)) {
    if (!is.numeric(seed) || length(seed) != 1) {
      stop("'seed' must be a single numeric value")
    }
    set.seed(seed)
  }
  
  # ==================== Generate/Validate X ====================
  
  if (is.null(X)) {
    X <- genX(n = n, p = p, Sigma.X = Sigma.X)
  } else {
    X <- as.matrix(X)
    if (!is.numeric(X)) stop("'X' must be numeric")
    if (nrow(X) != n) stop(sprintf("'X' has %d rows but n = %d", nrow(X), n))
    if (ncol(X) != p) {
      p <- ncol(X)
      message(sprintf("Note: Updated p to %d to match supplied X", p))
    }
    if (any(!is.finite(X))) stop("'X' contains non-finite values")
  }
  
  # ==================== Generate/Validate Beta ====================
  
  if (is.null(Beta)) {
    Beta <- genBeta(p = p, q = q, 
                    s.row = Beta.row.sparsity, 
                    s.elm = Beta.elm.sparsity)
  } else {
    Beta <- as.matrix(Beta)
    if (!is.numeric(Beta)) stop("'Beta' must be numeric")
    if (nrow(Beta) != p || ncol(Beta) != q) {
      stop(sprintf("'Beta' dimensions (%d x %d) don't match p x q (%d x %d)",
                   nrow(Beta), ncol(Beta), p, q))
    }
    if (any(!is.finite(Beta))) stop("'Beta' contains non-finite values")
  }
  
  # ==================== Generate Y ====================
  
  if (is.null(E)) {
    # Generate E from MVN with precision Theta
    if (is.null(Theta)) {
      Theta <- genTheta(q)
    } else {
      Theta <- as.matrix(Theta)
      if (!is.numeric(Theta)) stop("'Theta' must be numeric")
      if (nrow(Theta) != q || ncol(Theta) != q) {
        stop(sprintf("'Theta' must be a %d x %d matrix", q, q))
      }
      if (!isSymmetric(Theta, tol = 1e-8)) {
        stop("'Theta' must be symmetric")
      }
      eigvals <- eigen(Theta, symmetric = TRUE, only.values = TRUE)$values
      if (min(eigvals) <= 1e-8) {
        stop(sprintf("'Theta' must be positive definite (min eigenvalue = %.2e)", 
                     min(eigvals)))
      }
    }
    Y <- genY(X = X, Beta = Beta, Theta = Theta)
  } else {
    # Use user-supplied E
    E <- as.matrix(E)
    if (!is.numeric(E)) stop("'E' must be numeric")
    if (nrow(E) != n || ncol(E) != q) {
      stop(sprintf("'E' dimensions (%d x %d) don't match n x q (%d x %d)",
                   nrow(E), ncol(E), n, q))
    }
    if (any(!is.finite(E))) stop("'E' contains non-finite values")
    Y <- X %*% Beta + E
    Theta <- NULL  # Not used when E is supplied
  }
  
  # ==================== Process Missing Rates ====================
  
  if (!is.numeric(rho)) stop("'rho' must be numeric")
  if (length(rho) == 1) {
    rho <- rep(rho, q)
  } else if (length(rho) != q) {
    stop(sprintf("'rho' must be a scalar or vector of length %d", q))
  }
  if (any(rho < 0) || any(rho >= 1)) {
    stop("All elements of 'rho' must be in [0, 1)")
  }
  if (any(rho > 0.8)) {
    warning("High missing rates (>80%) may lead to estimation difficulties")
  }
  
  # ==================== Generate Missing Data ====================
  
  Z <- genZ(X = X, Beta = Beta, Y = Y, rho = rho, type = missing.type)
  
  # ==================== Return Results ====================
  
  structure(
    list(
      X = X,
      Y = Y,
      Z = Z,
      Beta = Beta,
      Theta = Theta,
      rho = rho,
      missing.type = missing.type
    ),
    class = "missoNet.sim"
  )
}


# ============================================================================
#                           INTERNAL HELPER FUNCTIONS
# ============================================================================

#' Generate sparse coefficient matrix Beta
#' @noRd
genBeta <- function(p, q, s.row, s.elm) {
  # Validate sparsity parameters
  if (s.row < 0 || s.row > 1) {
    stop("Beta.row.sparsity must be in [0, 1]")
  }
  if (s.elm < 0 || s.elm > 1) {
    stop("Beta.elm.sparsity must be in [0, 1]")
  }
  
  # Generate base coefficient matrix
  B <- matrix(rnorm(p * q, mean = 0, sd = 1), p, q)
  
  # Handle special cases
  if (s.row == 0 || s.elm == 0) {
    return(matrix(0, p, q))  # All zeros
  }
  if (s.row == 1 && s.elm == 1) {
    return(B)  # Fully dense
  }
  
  # Apply row sparsity (which rows are active)
  n_active_rows <- max(1, round(p * s.row))
  active_rows <- sort(sample.int(p, n_active_rows, replace = FALSE))
  
  # Apply element sparsity within active rows
  n_active_cols <- max(1, round(q * s.elm))
  B_sparse <- matrix(0, p, q)
  
  for (i in active_rows) {
    active_cols <- sample.int(q, n_active_cols, replace = FALSE)
    B_sparse[i, active_cols] <- B[i, active_cols]
  }
  
  return(B_sparse)
}


#' Generate block-structured precision matrix Theta
#' @noRd
genTheta <- function(q) {
  if (q < 10) {
    # Simple structure for small q
    return(diag(q) + 0.3)
  }
  
  # Allocate blocks roughly equally
  block_sizes <- rep(q %/% 4, 4)
  remainder <- q %% 4
  if (remainder > 0) {
    block_sizes[seq_len(remainder)] <- block_sizes[seq_len(remainder)] + 1
  }
  
  block_ends <- cumsum(block_sizes)
  block_starts <- c(1, block_ends[-4] + 1)
  
  Th <- matrix(0, q, q)
  
  # Block 1: Independent (diagonal)
  idx1 <- block_starts[1]:block_ends[1]
  Th[idx1, idx1] <- diag(length(idx1))
  
  # Block 2: Weak complete graph
  if (block_sizes[2] > 0) {
    idx2 <- block_starts[2]:block_ends[2]
    n2 <- length(idx2)
    for (i in 1:(n2-1)) {
      for (j in (i+1):n2) {
        Th[idx2[i], idx2[j]] <- Th[idx2[j], idx2[i]] <- runif(1, -0.3, -0.1)
      }
    }
    # Ensure positive definiteness
    diag(Th[idx2, idx2]) <- rowSums(abs(Th[idx2, idx2])) + 0.5
  }
  
  # Block 3: Strong complete graph  
  if (block_sizes[3] > 0) {
    idx3 <- block_starts[3]:block_ends[3]
    n3 <- length(idx3)
    for (i in 1:(n3-1)) {
      for (j in (i+1):n3) {
        Th[idx3[i], idx3[j]] <- Th[idx3[j], idx3[i]] <- runif(1, -0.7, -0.4)
      }
    }
    diag(Th[idx3, idx3]) <- rowSums(abs(Th[idx3, idx3])) + 0.8
  }
  
  # Block 4: Chain structure
  if (block_sizes[4] > 1) {
    idx4 <- block_starts[4]:block_ends[4]
    n4 <- length(idx4)
    for (i in 1:(n4-1)) {
      Th[idx4[i], idx4[i+1]] <- Th[idx4[i+1], idx4[i]] <- runif(1, -0.8, -0.5)
    }
    diag(Th[idx4, idx4]) <- rowSums(abs(Th[idx4, idx4])) + 0.5
  } else if (block_sizes[4] == 1) {
    Th[block_ends[4], block_ends[4]] <- 1
  }
  
  # Final positive definiteness check
  eigvals <- eigen(Th, symmetric = TRUE, only.values = TRUE)$values
  if (min(eigvals) <= 0) {
    Th <- Th + (abs(min(eigvals)) + 0.1) * diag(q)
  }
  
  return(Th)
}


#' Generate predictor matrix X
#' @noRd
genX <- function(n, p, Sigma.X = NULL) {
  if (is.null(Sigma.X)) {
    # AR(1) covariance with correlation 0.7
    Sigma.X <- outer(1:p, 1:p, function(i, j) 0.7^abs(i - j))
  } else {
    Sigma.X <- as.matrix(Sigma.X)
    if (nrow(Sigma.X) != p || ncol(Sigma.X) != p) {
      stop(sprintf("Sigma.X must be a %d x %d matrix", p, p))
    }
    eigvals <- eigen(Sigma.X, symmetric = TRUE, only.values = TRUE)$values
    if (min(eigvals) <= 1e-8) {
      stop("Sigma.X must be positive definite")
    }
  }
  
  mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = Sigma.X, checkSymmetry = FALSE)
}


#' Generate response matrix Y
#' @noRd
genY <- function(X, Beta, Theta) {
  n <- nrow(X)
  q <- ncol(Beta)
  
  # Compute covariance from precision
  Sigma <- solve(Theta)
  Sigma <- 0.5 * (Sigma + t(Sigma))  # Ensure exact symmetry
  
  # Generate errors
  E <- mvtnorm::rmvnorm(n, mean = rep(0, q), sigma = Sigma, checkSymmetry = FALSE)
  
  # Generate responses
  X %*% Beta + E
}


#' Introduce missing values according to specified mechanism
#' @noRd
genZ <- function(X, Beta, Y, rho, type) {
  n <- nrow(Y)
  q <- ncol(Y)
  Z <- Y  # Start with complete data
  
  if (type == "MCAR") {
    # Missing Completely At Random
    for (j in 1:q) {
      if (rho[j] > 0) {
        n_miss <- round(n * rho[j])
        if (n_miss > 0 && n_miss < n) {
          miss_idx <- sample.int(n, n_miss, replace = FALSE)
          Z[miss_idx, j] <- NA
        } else if (n_miss >= n) {
          Z[, j] <- NA  # All missing (edge case)
        }
      }
    }
    
  } else if (type == "MAR") {
    # Missing At Random (depends on X)
    XB <- X %*% Beta
    for (j in 1:q) {
      if (rho[j] > 0) {
        # Logistic probabilities
        linear_pred <- XB[, j]
        linear_pred <- pmax(pmin(linear_pred, 10), -10)  # Avoid overflow
        probs <- 1 / (1 + exp(-linear_pred))
        
        # Calibrate to achieve target rate
        current_mean <- mean(probs)
        if (current_mean > 0) {
          scale_factor <- min(rho[j] / current_mean, 0.99 / max(probs))
          miss_probs <- pmin(scale_factor * probs, 0.99)
          
          # Generate missingness
          miss_ind <- rbinom(n, size = 1, prob = miss_probs)
          Z[miss_ind == 1, j] <- NA
        }
      }
    }
    
  } else if (type == "MNAR") {
    # Missing Not At Random (depends on Y)
    for (j in 1:q) {
      if (rho[j] > 0) {
        # Missing for lowest values
        threshold <- quantile(Y[, j], probs = rho[j], na.rm = TRUE)
        Z[Y[, j] <= threshold, j] <- NA
      }
    }
  }
  
  return(Z)
}