Nothing
#' Generate synthetic data with missing values for missoNet
#'
#' @description
#' Generates synthetic data from a conditional Gaussian graphical model with
#' user-specified missing data mechanisms. This function is designed for simulation
#' studies and testing of the missoNet package, supporting three types of
#' missingness: Missing Completely At Random (MCAR), Missing At Random (MAR),
#' and Missing Not At Random (MNAR).
#'
#' @param n Integer. Sample size (number of observations). Must be at least 2.
#' @param p Integer. Number of predictor variables. Must be at least 1.
#' @param q Integer. Number of response variables. Must be at least 2.
#' @param rho Numeric scalar or vector of length \code{q}. Proportion of missing
#' values for each response variable. Values must be in [0, 1). If scalar,
#' the same missing rate is applied to all responses.
#' @param missing.type Character string specifying the missing data mechanism.
#' One of:
#' \itemize{
#' \item \code{"MCAR"} (default): Missing Completely At Random
#' \item \code{"MAR"}: Missing At Random (depends on predictors)
#' \item \code{"MNAR"}: Missing Not At Random (depends on response values)
#' }
#' @param X Optional \code{n x p} matrix. User-supplied predictor matrix. If
#' \code{NULL} (default), predictors are simulated from a multivariate normal
#' distribution with mean zero and covariance \code{Sigma.X}.
#' @param Beta Optional \code{p x q} matrix. Regression coefficient matrix. If
#' \code{NULL} (default), a sparse coefficient matrix is generated with
#' sparsity controlled by \code{Beta.row.sparsity} and \code{Beta.elm.sparsity}.
#' @param E Optional \code{n x q} matrix. Error/noise matrix. If \code{NULL}
#' (default), errors are simulated from a multivariate normal distribution
#' with mean zero and precision matrix \code{Theta}.
#' @param Theta Optional \code{q x q} positive definite matrix. Precision matrix
#' (inverse covariance) for the response variables. If \code{NULL} (default),
#' a block-structured precision matrix is generated with four types of graph
#' structures. Only used when \code{E = NULL}.
#' @param Sigma.X Optional \code{p x p} positive definite matrix. Covariance
#' matrix for the predictors. If \code{NULL} (default), an AR(1) covariance
#' structure with correlation 0.7 is used. Only used when \code{X = NULL}.
#' @param Beta.row.sparsity Numeric in \[0, 1\]. Proportion of rows in Beta that
#' contain at least one non-zero element. Default is 0.2. Only used when
#' \code{Beta = NULL}.
#' @param Beta.elm.sparsity Numeric in \[0, 1\]. Proportion of non-zero elements
#' within active rows of Beta. Default is 0.2. Only used when \code{Beta = NULL}.
#' @param seed Optional integer. Random seed for reproducibility.
#'
#' @details
#' The function generates data through the following model:
#' \deqn{Y = XB + E}
#' where:
#' \itemize{
#' \item \eqn{X \in \mathbb{R}^{n \times p}} is the predictor matrix
#' \item \eqn{B \in \mathbb{R}^{p \times q}} is the coefficient matrix
#' \item \eqn{E \sim \mathcal{MVN}(0, \Theta^{-1})} is the error matrix
#' \item \eqn{Y \in \mathbb{R}^{n \times q}} is the complete response matrix
#' }
#'
#' Missing values are then introduced to create \eqn{Z} (the observed response
#' matrix with NAs) according to the specified mechanism:
#'
#' \strong{MCAR}: Each element has probability \code{rho[j]} of being missing,
#' independent of all variables.
#'
#' \strong{MAR}: Missingness depends on the predictors through a logistic model:
#' \deqn{P(Z_{ij} = NA) = \mathrm{logit}^{-1}(XB)_{ij} \times c_j}
#' where \eqn{c_j} is calibrated to achieve the target missing rate.
#'
#' \strong{MNAR}: The lowest \code{rho[j]} proportion of values in each column
#' are set as missing.
#'
#' @return A list containing:
#' \item{X}{\code{n x p} matrix. Predictor matrix (either user-supplied or simulated).}
#' \item{Y}{\code{n x q} matrix. Complete response matrix without missing values.}
#' \item{Z}{\code{n x q} matrix. Response matrix with missing values (coded as NA).}
#' \item{Beta}{\code{p x q} matrix. Regression coefficient matrix used in generation.}
#' \item{Theta}{\code{q x q} matrix or NULL. Precision matrix (if used in generation).}
#' \item{rho}{Numeric vector of length \code{q}. Missing rates for each response.}
#' \item{missing.type}{Character string. The missing mechanism used.}
#'
#' @examples
#' # Example 1: Basic usage with default settings
#' sim.dat <- generateData(n = 300, p = 50, q = 20, rho = 0.1, seed = 857)
#'
#' # Check dimensions and missing rate
#' dim(sim.dat$X) # 300 x 50
#' dim(sim.dat$Z) # 300 x 20
#' mean(is.na(sim.dat$Z)) # approximately 0.1
#'
#' # Example 2: Variable missing rates with MAR mechanism
#' rho.vec <- seq(0.05, 0.25, length.out = 20)
#' sim.dat <- generateData(n = 300, p = 50, q = 20,
#' rho = rho.vec,
#' missing.type = "MAR")
#'
#' # Example 3: High sparsity in coefficient matrix
#' sim.dat <- generateData(n = 500, p = 100, q = 30,
#' rho = 0.15,
#' Beta.row.sparsity = 0.1, # 10% active predictors
#' Beta.elm.sparsity = 0.3) # 30% active in each row
#'
#' # Example 4: User-supplied matrices
#' n <- 300; p <- 50; q <- 20
#' X <- matrix(rnorm(n*p), n, p)
#' Beta <- matrix(rnorm(p*q) * rbinom(p*q, 1, 0.1), p, q) # 10% non-zero
#' Theta <- diag(q) + 0.1 # Simple precision structure
#'
#' sim.dat <- generateData(X = X, Beta = Beta, Theta = Theta,
#' n = n, p = p, q = q,
#' rho = 0.2, missing.type = "MNAR")
#'
#' \donttest{
#' # Example 5: Use generated data with missoNet
#' library(missoNet)
#' sim.dat <- generateData(n = 400, p = 50, q = 10, rho = 0.15)
#'
#' # Split into training and test sets
#' train.idx <- 1:300
#' test.idx <- 301:400
#'
#' # Fit missoNet model
#' fit <- missoNet(X = sim.dat$X[train.idx, ],
#' Y = sim.dat$Z[train.idx, ],
#' lambda.beta = 0.1,
#' lambda.theta = 0.1)
#'
#' # Evaluate on test set
#' pred <- predict(fit, newx = sim.dat$X[test.idx, ])
#' }
#'
#' @author Yixiao Zeng \email{yixiao.zeng@@mail.mcgill.ca}, Celia M. T. Greenwood
#'
#' @seealso
#' \code{\link{missoNet}} for fitting models to data with missing values,
#' \code{\link{cv.missoNet}} for cross-validation
#'
#' @export
#' @importFrom mvtnorm rmvnorm
#' @importFrom stats rbinom runif rnorm quantile
generateData <- function(n, p, q, rho, missing.type = "MCAR",
X = NULL, Beta = NULL, E = NULL,
Theta = NULL, Sigma.X = NULL,
Beta.row.sparsity = 0.2, Beta.elm.sparsity = 0.2,
seed = NULL) {
# ==================== Input Validation ====================
# Validate dimensions
if (!is.numeric(n) || length(n) != 1 || n < 2) {
stop("'n' must be a single numeric value >= 2")
}
if (!is.numeric(p) || length(p) != 1 || p < 1) {
stop("'p' must be a single numeric value >= 1")
}
if (!is.numeric(q) || length(q) != 1 || q < 2) {
stop("'q' must be a single numeric value >= 2 (missoNet requires multiple responses)")
}
# Validate missing type
missing.type <- match.arg(missing.type, c("MCAR", "MAR", "MNAR"))
# Set seed if provided
if (!is.null(seed)) {
if (!is.numeric(seed) || length(seed) != 1) {
stop("'seed' must be a single numeric value")
}
set.seed(seed)
}
# ==================== Generate/Validate X ====================
if (is.null(X)) {
X <- genX(n = n, p = p, Sigma.X = Sigma.X)
} else {
X <- as.matrix(X)
if (!is.numeric(X)) stop("'X' must be numeric")
if (nrow(X) != n) stop(sprintf("'X' has %d rows but n = %d", nrow(X), n))
if (ncol(X) != p) {
p <- ncol(X)
message(sprintf("Note: Updated p to %d to match supplied X", p))
}
if (any(!is.finite(X))) stop("'X' contains non-finite values")
}
# ==================== Generate/Validate Beta ====================
if (is.null(Beta)) {
Beta <- genBeta(p = p, q = q,
s.row = Beta.row.sparsity,
s.elm = Beta.elm.sparsity)
} else {
Beta <- as.matrix(Beta)
if (!is.numeric(Beta)) stop("'Beta' must be numeric")
if (nrow(Beta) != p || ncol(Beta) != q) {
stop(sprintf("'Beta' dimensions (%d x %d) don't match p x q (%d x %d)",
nrow(Beta), ncol(Beta), p, q))
}
if (any(!is.finite(Beta))) stop("'Beta' contains non-finite values")
}
# ==================== Generate Y ====================
if (is.null(E)) {
# Generate E from MVN with precision Theta
if (is.null(Theta)) {
Theta <- genTheta(q)
} else {
Theta <- as.matrix(Theta)
if (!is.numeric(Theta)) stop("'Theta' must be numeric")
if (nrow(Theta) != q || ncol(Theta) != q) {
stop(sprintf("'Theta' must be a %d x %d matrix", q, q))
}
if (!isSymmetric(Theta, tol = 1e-8)) {
stop("'Theta' must be symmetric")
}
eigvals <- eigen(Theta, symmetric = TRUE, only.values = TRUE)$values
if (min(eigvals) <= 1e-8) {
stop(sprintf("'Theta' must be positive definite (min eigenvalue = %.2e)",
min(eigvals)))
}
}
Y <- genY(X = X, Beta = Beta, Theta = Theta)
} else {
# Use user-supplied E
E <- as.matrix(E)
if (!is.numeric(E)) stop("'E' must be numeric")
if (nrow(E) != n || ncol(E) != q) {
stop(sprintf("'E' dimensions (%d x %d) don't match n x q (%d x %d)",
nrow(E), ncol(E), n, q))
}
if (any(!is.finite(E))) stop("'E' contains non-finite values")
Y <- X %*% Beta + E
Theta <- NULL # Not used when E is supplied
}
# ==================== Process Missing Rates ====================
if (!is.numeric(rho)) stop("'rho' must be numeric")
if (length(rho) == 1) {
rho <- rep(rho, q)
} else if (length(rho) != q) {
stop(sprintf("'rho' must be a scalar or vector of length %d", q))
}
if (any(rho < 0) || any(rho >= 1)) {
stop("All elements of 'rho' must be in [0, 1)")
}
if (any(rho > 0.8)) {
warning("High missing rates (>80%) may lead to estimation difficulties")
}
# ==================== Generate Missing Data ====================
Z <- genZ(X = X, Beta = Beta, Y = Y, rho = rho, type = missing.type)
# ==================== Return Results ====================
structure(
list(
X = X,
Y = Y,
Z = Z,
Beta = Beta,
Theta = Theta,
rho = rho,
missing.type = missing.type
),
class = "missoNet.sim"
)
}
# ============================================================================
# INTERNAL HELPER FUNCTIONS
# ============================================================================
#' Generate sparse coefficient matrix Beta
#' @noRd
genBeta <- function(p, q, s.row, s.elm) {
# Validate sparsity parameters
if (s.row < 0 || s.row > 1) {
stop("Beta.row.sparsity must be in [0, 1]")
}
if (s.elm < 0 || s.elm > 1) {
stop("Beta.elm.sparsity must be in [0, 1]")
}
# Generate base coefficient matrix
B <- matrix(rnorm(p * q, mean = 0, sd = 1), p, q)
# Handle special cases
if (s.row == 0 || s.elm == 0) {
return(matrix(0, p, q)) # All zeros
}
if (s.row == 1 && s.elm == 1) {
return(B) # Fully dense
}
# Apply row sparsity (which rows are active)
n_active_rows <- max(1, round(p * s.row))
active_rows <- sort(sample.int(p, n_active_rows, replace = FALSE))
# Apply element sparsity within active rows
n_active_cols <- max(1, round(q * s.elm))
B_sparse <- matrix(0, p, q)
for (i in active_rows) {
active_cols <- sample.int(q, n_active_cols, replace = FALSE)
B_sparse[i, active_cols] <- B[i, active_cols]
}
return(B_sparse)
}
#' Generate block-structured precision matrix Theta
#' @noRd
genTheta <- function(q) {
if (q < 10) {
# Simple structure for small q
return(diag(q) + 0.3)
}
# Allocate blocks roughly equally
block_sizes <- rep(q %/% 4, 4)
remainder <- q %% 4
if (remainder > 0) {
block_sizes[seq_len(remainder)] <- block_sizes[seq_len(remainder)] + 1
}
block_ends <- cumsum(block_sizes)
block_starts <- c(1, block_ends[-4] + 1)
Th <- matrix(0, q, q)
# Block 1: Independent (diagonal)
idx1 <- block_starts[1]:block_ends[1]
Th[idx1, idx1] <- diag(length(idx1))
# Block 2: Weak complete graph
if (block_sizes[2] > 0) {
idx2 <- block_starts[2]:block_ends[2]
n2 <- length(idx2)
for (i in 1:(n2-1)) {
for (j in (i+1):n2) {
Th[idx2[i], idx2[j]] <- Th[idx2[j], idx2[i]] <- runif(1, -0.3, -0.1)
}
}
# Ensure positive definiteness
diag(Th[idx2, idx2]) <- rowSums(abs(Th[idx2, idx2])) + 0.5
}
# Block 3: Strong complete graph
if (block_sizes[3] > 0) {
idx3 <- block_starts[3]:block_ends[3]
n3 <- length(idx3)
for (i in 1:(n3-1)) {
for (j in (i+1):n3) {
Th[idx3[i], idx3[j]] <- Th[idx3[j], idx3[i]] <- runif(1, -0.7, -0.4)
}
}
diag(Th[idx3, idx3]) <- rowSums(abs(Th[idx3, idx3])) + 0.8
}
# Block 4: Chain structure
if (block_sizes[4] > 1) {
idx4 <- block_starts[4]:block_ends[4]
n4 <- length(idx4)
for (i in 1:(n4-1)) {
Th[idx4[i], idx4[i+1]] <- Th[idx4[i+1], idx4[i]] <- runif(1, -0.8, -0.5)
}
diag(Th[idx4, idx4]) <- rowSums(abs(Th[idx4, idx4])) + 0.5
} else if (block_sizes[4] == 1) {
Th[block_ends[4], block_ends[4]] <- 1
}
# Final positive definiteness check
eigvals <- eigen(Th, symmetric = TRUE, only.values = TRUE)$values
if (min(eigvals) <= 0) {
Th <- Th + (abs(min(eigvals)) + 0.1) * diag(q)
}
return(Th)
}
#' Generate predictor matrix X
#' @noRd
genX <- function(n, p, Sigma.X = NULL) {
if (is.null(Sigma.X)) {
# AR(1) covariance with correlation 0.7
Sigma.X <- outer(1:p, 1:p, function(i, j) 0.7^abs(i - j))
} else {
Sigma.X <- as.matrix(Sigma.X)
if (nrow(Sigma.X) != p || ncol(Sigma.X) != p) {
stop(sprintf("Sigma.X must be a %d x %d matrix", p, p))
}
eigvals <- eigen(Sigma.X, symmetric = TRUE, only.values = TRUE)$values
if (min(eigvals) <= 1e-8) {
stop("Sigma.X must be positive definite")
}
}
mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = Sigma.X, checkSymmetry = FALSE)
}
#' Generate response matrix Y
#' @noRd
genY <- function(X, Beta, Theta) {
n <- nrow(X)
q <- ncol(Beta)
# Compute covariance from precision
Sigma <- solve(Theta)
Sigma <- 0.5 * (Sigma + t(Sigma)) # Ensure exact symmetry
# Generate errors
E <- mvtnorm::rmvnorm(n, mean = rep(0, q), sigma = Sigma, checkSymmetry = FALSE)
# Generate responses
X %*% Beta + E
}
#' Introduce missing values according to specified mechanism
#' @noRd
genZ <- function(X, Beta, Y, rho, type) {
n <- nrow(Y)
q <- ncol(Y)
Z <- Y # Start with complete data
if (type == "MCAR") {
# Missing Completely At Random
for (j in 1:q) {
if (rho[j] > 0) {
n_miss <- round(n * rho[j])
if (n_miss > 0 && n_miss < n) {
miss_idx <- sample.int(n, n_miss, replace = FALSE)
Z[miss_idx, j] <- NA
} else if (n_miss >= n) {
Z[, j] <- NA # All missing (edge case)
}
}
}
} else if (type == "MAR") {
# Missing At Random (depends on X)
XB <- X %*% Beta
for (j in 1:q) {
if (rho[j] > 0) {
# Logistic probabilities
linear_pred <- XB[, j]
linear_pred <- pmax(pmin(linear_pred, 10), -10) # Avoid overflow
probs <- 1 / (1 + exp(-linear_pred))
# Calibrate to achieve target rate
current_mean <- mean(probs)
if (current_mean > 0) {
scale_factor <- min(rho[j] / current_mean, 0.99 / max(probs))
miss_probs <- pmin(scale_factor * probs, 0.99)
# Generate missingness
miss_ind <- rbinom(n, size = 1, prob = miss_probs)
Z[miss_ind == 1, j] <- NA
}
}
}
} else if (type == "MNAR") {
# Missing Not At Random (depends on Y)
for (j in 1:q) {
if (rho[j] > 0) {
# Missing for lowest values
threshold <- quantile(Y[, j], probs = rho[j], na.rm = TRUE)
Z[Y[, j] <= threshold, j] <- NA
}
}
}
return(Z)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.