# R/dataGeneration.R In cmfilter: Coordinate-Wise Mediation Filter

#### Documented in generateMed

```#' Generate a high-dimensional mediation dataset
#'
#' This function generates a dataset from an x -> M -> y model, where M may
#' be of any size with any correlation matrix.
#'
#' @param n Sample size
#' @param a Vector of a path coefficients within 0 and 1
#' @param b Vector of b path coefficients within 0 and 1
#' @param r2y Proportion of explained variance in y. Set to
#' \code{b \%*\% Sigma \%*\% b} for \code{var(y) == 1}.
#' @param dir Direct path from x to y
#' @param Sigma Desired true covariance matrix between the mediators M
#' @param residual Whether Sigma indicates residual or marginal covariance
#' @param empirical Ensure observed data matrix has exactly the requested covmat
#' (only if Sigma is specified)
#' @param scaley Whether to standardise y (changes b path coefficients)
#' @param forma Functional form of the a paths. Function that accepts a matrix
#' as input and transforms each column to the desired form.
#' @param formb Functional form of the b paths. Function that accepts a vector.
#'
#' @return A data frame with columns x, M.1 - M.p, y
#'
#' @examples
#' # Generate a suppression dataset where M.2 is suppressed
#' sup <- generateMed(n = 100,
#'                    a = c(-0.4, 0.4),
#'                    b = c(0.8, 0.48),
#'                    Sigma = matrix(c(1, -0.6, -0.6, 1), 2))
#'
#' @importFrom stats rnorm cov
#' @import Matrix
#'
#' @export
generateMed <- function(n = 1e2L,
a = 0.3, b = 0.3,
r2y = 0.5, dir = 0,
Sigma, residual = FALSE, empirical = FALSE,
scaley = FALSE,
forma = identity, formb = identity) {

# Input checking
if (class(a) != "dsparseVector") {
# user knows what they're doing if sparse vectors, skip input check
if (r2y > 1 || r2y <= 0) {
stop("Arg r2y should be in range (0,1]")
}
if (length(a) != length(b)) {
stop("There should be as many a paths as b paths!")
}
if (!missing(Sigma)) {
if (!is.matrix(Sigma) ||
ncol(Sigma) != nrow(Sigma) ||
!isSymmetric(Sigma)) {
stop("Sigma needs to be a square symmetric covariance matrix.")
}
if (ncol(Sigma) != length(a)) {
stop("Sigma should have as many cols & rows as a & b paths.")
}
}
}

if (missing(Sigma)) {
if (any(c(a > 1, a < -1))) {
stop("Elements in 'a' out of range, should all be between 0 and 1",
"if Sigma is not specified. (Assume variance of 1 for M)")
}

# No residual covariance, fast to generate
# Calculate residual variance of y
vary <- b %*% b + dir^2 # variance of y = sum(b^2) because M std & no cov
resy <- vary/r2y - vary # residual variance of y

# Simulate the residuals of M
resM <- sapply(1 - a^2, function(x) rnorm(n, sd = x))

} else {

# There is residual covariance

# Switch between residual and marginal covariance of M.
# This uses the Schur complement which simplifies to Sigma +/- tcrossprod(a)
# because C = B' and A = 1. See Abadir & Magnus p. 102.
if (residual) {
psi <- Sigma
Sigma <- psi + tcrossprod(a)
} else {
psi <- Sigma - tcrossprod(a)
}

# Marginal covariance matrix of x & M
if (class(Sigma) == "dgCMatrix") {
# use sparse matrices. empirical unavailable
if (empirical) stop("Empirical not available with sparse matrices")

# generate residuals of M
prec <- Matrix::solve(psi, sparse = TRUE)
resM <- sparseMVN::rmvn.sparse(n, numeric(length(a)), Cholesky(prec))

# calculate residual variance of y
Sigma2 <- cbind(as.matrix(a), Sigma)
SigmaXM <- rbind(t(as.matrix(Matrix::c.sparseVector(1, a))), Sigma2)
pathsToY <- Matrix::c.sparseVector(dir, b)
vary <- as.numeric(t(pathsToY) %*% SigmaXM %*% pathsToY)
resy <- vary/r2y - vary # residual variance of y

} else {

sigmaXM <- diag(ncol(Sigma) + 1)
sigmaXM[-1,-1] <- Sigma
sigmaXM[ 1,-1] <- a
sigmaXM[-1, 1] <- a

# Calculate residual variance of y
pathsToY <- c(dir, b)
vary <- t(pathsToY) %*% sigmaXM %*% pathsToY # propagation of error
resy <- vary/r2y - vary # residual variance of y

# Simulate the residuals of M
if (!empirical) resM <- MASS::mvrnorm(n, numeric(ncol(psi)), psi)

}
}

# simulate the dataset
if (!empirical) {
x <- rnorm(n)
M <- forma(x %*% t(a)) + resM
y <- formb(M %*% b) + formb(dir * x) + rnorm(n, sd = sqrt(resy))
if (scaley) y <- scale(y)
if (class(M) == "dgeMatrix") M <- matrix(M, n)
if (class(y) == "dgeMatrix") y <- as.numeric(y)
return(data.frame(x = x, M = M, y = y))
} else {
if (missing(Sigma)) stop("Empirical not allowed without Sigma")
# calculate full sigma
nvars <- length(a) + 2
fullSigma <- diag(nvars)
fullSigma[-nvars, -nvars] <- sigmaXM
fullSigma[nvars,-nvars] <- fullSigma[-nvars, nvars] <- pathsToY %*% sigmaXM
fullSigma[nvars, nvars] <- vary + resy

# now it's simple to generate
dmat <- matrix(rnorm(n*nvars), n)

d <- as.data.frame(empiricalTransform(dmat, fullSigma))
colnames(d) <- c("x", paste0("M.", 1:length(a)), "y")
return(d)
}
}

#' @keywords internal
empiricalTransform <- function(X, Sigma) {
# Force covariance matrix Sigma on X as per Mair, Satorra et al.
return(X %*% expm::sqrtm(solve(cov(X))) %*% expm::sqrtm(Sigma))
}
```

## Try the cmfilter package in your browser

Any scripts or data that you put into this service are public.

cmfilter documentation built on April 11, 2019, 5:07 p.m.