Nothing
#' Simulate data with hierarchical subspaces
#'
#' @description Simulates data with hierarchical subspaces. Data are generated with two factors that induce heterogeneity
#'
#' @param nobs positive integer for the sample size per subpopulation
#' @param nvars positive integer for the dimension
#' @param x.type variable type for covariates, either \code{"continuous"} (where the covariates are multivariate normal with a variance-matrix
#' with AR-1 form with parameter \code{rho}) or \code{"some_categorical"} (where half covariates are continuous and
#' the other half are binary with dependencies on the continuous covariates)
#' @param sd.y standard deviation of responsee
#' @param rho correlation parameter for AR-1 covariance structure for continuous covariates
#' @param model model number used, either "1", "2", or "3", each corresponds to a different outcome model setting
#' @importFrom stats rnorm var
#' @importFrom MASS mvrnorm
#' @return A list with the following elements
#' \itemize{
#' \item x a matrix of covariates with number of rows equal to the total sample size and columns equal to the number of variables
#' \item z a matrix with number of rows equal to the total sample size and columns as dummy variables indicating presence of a stratifying factor
#' \item y a vector of all responses
#' \item beta a list of the true sufficient dimension reduction matrices, one for each subpopulation
#' \item z.combinations all possible combinations of the stratifying factors \code{z}
#' \item snr scalar the observed signal-to-noise ratio for the response
#' \item d.correct the true dimensions of the dimension reduction spaces
#' }
#' @export
#' @examples
#'
#' library(hierSDR)
#'
#' set.seed(123)
#' dat <- simulate_data(nobs = 100, nvars = 6,
#' x.type = "some_categorical",
#' sd.y = 1, model = 2)
#'
#' x <- dat$x ## covariates
#' z <- dat$z ## factor indicators
#' y <- dat$y ## response
#'
#' dat$beta ## true coefficients that generate the subspaces
#'
#' dat$snr ## signal-to-noise ratio
#'
#' str(x)
#' str(z)
#'
#' dat$z.combinations ## what combinations of z represent different subpops
#'
#' ## correct structural dimensions:
#' dat$d.correct
#'
#'
simulate_data <- function(nobs,
nvars,
x.type = c("continuous", "some_categorical"),
sd.y = 1,
rho = 0.5,
model = c("1", "2", "3"))
{
# stopifnot(
# "need nfactors > 1" = nfactors > 1,
# "need nfactors < 6" = nfactors < 6,
# "need n > 9" = n > 9,
# "need p > 0" = p > 0
# )
nfactors <- 2
stopifnot(
nfactors > 1,
nfactors < 6,
nobs > 9,
nvars > 1,
nvars < 101,
rho < 1,
rho > -1,
sd.y > 0
)
ndata <- 2 ^ nfactors
model.num <- as.numeric(model)
if (x.type == "continuous")
{
cov.mat <- rho ^ abs(outer(1:(nvars), 1:(nvars), "-"))
x.list <- replicate(ndata, list( mvrnorm(n = nobs, mu = rep(0, nvars), Sigma = cov.mat) ))
} else if (x.type == "some_categorical")
{
if (nvars <= 10)
{
subtr <- floor(nvars / 2)
} else
{
subtr <- 10
}
cov.mat <- rho ^ abs(outer(1:(nvars - subtr), 1:(nvars - subtr), "-"))
x.list <- vector(mode = "list", length = ndata)
for (g in 1:ndata)
{
x.tmp <- mvrnorm(n = nobs, mu = rep(0, nvars - subtr), Sigma = cov.mat)
if (subtr == 10)
{
x.tmp2 <- apply(x.tmp[,1:5], 2, function(col) rbinom(length(col), 1, 1 / (1 + exp(-col))))
x.tmp3 <- apply(x.tmp[,6:10], 2, function(col) 0.1 * col ^ 2 + rnorm(length(col), sd = 1))
} else
{
x.tmp2 <- apply(x.tmp[,1:floor(subtr/2),drop=FALSE], 2, function(col) rbinom(length(col), 1, 1 / (1 + exp(-col))))
x.tmp3 <- apply(x.tmp[,(floor(subtr/2) + 1):subtr,drop=FALSE], 2, function(col) 0.1 * col ^ 2 + rnorm(length(col), sd = 1))
}
x.list[[g]] <- data.matrix(cbind(x.tmp, x.tmp2, x.tmp3))
}
}
# put all subpops into one tall matrix
x.tall <- do.call(rbind, x.list)
# factor dummy variables
z <- cbind(c(rep(0,nrow(x.list[[1]])), rep(1,nrow(x.list[[2]])), rep(0, nrow(x.list[[3]]) ), rep(1, nrow(x.list[[4]]) ) ),
c(rep(0,nrow(x.list[[1]])), rep(0,nrow(x.list[[2]])), rep(1, nrow(x.list[[3]]) ), rep(1, nrow(x.list[[4]]) ) ))
eta.b2.models <- c(3)
eta.zero.models <- c(2, 3)
## correct structural dimensions
if (model.num == 1)
{
d.correct <- c(1, 0, 0, 0)
} else if (model.num == 2)
{
d.correct <- c(1, 0, 0, 1)
} else if (model.num == 3)
{
d.correct <- c(1, 1, 0, 0)
}
#
# beta_none - common for all. in this sim will only have 1 column
#
# beta_a - will have dimension 1 for model _____
#
# beta_b - will always be of dimension zero (ie we won't have it)
#
# beta_ab - will have dimension 1 for model _____
#
## randomly generate true coefficients
beta.none <- matrix(c(runif(floor(nvars / 2), min = 0, max = 0.25) * (2 * rbinom(floor(nvars / 2), 1, 0.5) - 1),
runif(ceiling(nvars / 2), min = 0, max = 0.5) * (2 * rbinom(ceiling(nvars / 2), 1, 0.5) - 1)), ncol = 1)
eta.a <- matrix(c(runif(floor(nvars / 2), min = 0, max = 0.25) * (2 * rbinom(floor(nvars / 2), 1, 0.5) - 1),
runif(ceiling(nvars / 2), min = 0, max = 0.5) * (2 * rbinom(ceiling(nvars / 2), 1, 0.5) - 1)), ncol = 1)
eta.ab <- matrix(c(runif(floor(nvars / 2), min = 0, max = 0.25) * (2 * rbinom(floor(nvars / 2), 1, 0.5) - 1),
runif(ceiling(nvars / 2), min = 0, max = 0.5) * (2 * rbinom(ceiling(nvars / 2), 1, 0.5) - 1)), ncol = 1)
beta.none[1:ncol(beta.none),] <- 0.5*diag(ncol(beta.none))
eta.a[1:ncol(eta.a),] <- 0.5*diag(ncol(eta.a))
eta.ab[1:ncol(eta.ab),] <- 0.5*diag(ncol(eta.ab))
beta.none <- grassmannify(beta.none)$beta
eta.a <- grassmannify(eta.a)$beta
eta.ab <- grassmannify(eta.ab)$beta
if (model.num == 1)
{
beta.a <- beta.b <- beta.ab <- beta.none
} else if (model.num == 2)
{
beta.a <- beta.b <- beta.none
beta.ab <- cbind(beta.none, eta.ab)
} else if (model.num == 3)
{
beta.a <- cbind(beta.none, eta.a)
beta.b <- beta.none
beta.ab <- cbind(beta.none, eta.a)
}
beta.true.list <- list(none = beta.none, a = beta.a, b = beta.b, ab = beta.ab)
if (model.num == 1)
{
y.true.none <- 2 * ((x.list[[1]] %*% beta.none[,1]) ^ 2)
y.true.a <- 0.5 * (x.list[[2]] %*% beta.a[,1]) ^ 3
y.true.b <- 2 * (x.list[[3]] %*% beta.b[,1])
y.true.ab <- (x.list[[4]] %*% beta.ab[,1]) / (0.5 + (x.list[[4]] %*% beta.ab[,1] + 1.5) ^ 2)
y.true <- c(y.true.none, y.true.a, y.true.b, y.true.ab)
y <- y.true + rnorm(length(y.true), sd = sd.y)
} else if (model.num == 2)
{
y.true.none <- 2 * ((x.list[[1]] %*% beta.none[,1]) ^ 2)
y.true.a <- 1 / (0.1 + 0.5 * drop(x.list[[2]] %*% beta.a[,1]) ^ 2) - 0.5 * drop(x.list[[2]] %*% beta.a[,1]) ^ 2
y.true.b <- 2 * ((x.list[[3]] %*% beta.b[,1]) ^ 2)# ^ 2
y.true.ab <- 2 * ((x.list[[4]] %*% beta.ab[,1]) ^ 2) +
(x.list[[4]] %*% beta.ab[,1]) * (x.list[[4]] %*% beta.ab[,2])
y.true <- c(y.true.none, y.true.a, y.true.b, y.true.ab)
y <- y.true + rnorm(length(y.true), sd = sd.y)
} else if (model.num == 3)
{
y.true.none <- 1 * abs(x.list[[1]] %*% beta.none[,1])^2
y.true.a <- 0.5 * (x.list[[2]] %*% beta.a[,1]) * ((x.list[[2]] %*% beta.a[,2])) ^ 2
y.true.b <- 1 * apply(exp( (x.list[[3]] %*% beta.b) ) , 1, sum)
y.true.ab <- 5 * drop(exp(-(x.list[[4]] %*% beta.ab[,1])^2 )) * drop(x.list[[4]] %*% beta.ab[,2])
y.true <- c(y.true.none, y.true.a, y.true.b, y.true.ab)
y <- y.true + rnorm(length(y.true), sd = sd.y)
}
snr <- sqrt(var(y.true)) / sqrt(var(y - y.true))
z.combinations <- subpop.struct(2L, incl.none = TRUE)
colnames(z.combinations) <- colnames(z) <- c("a", "b")
list(x = x.tall,
z = z,
y = drop(y),
beta = beta.true.list,
snr = snr,
z.combinations = z.combinations,
d.correct = d.correct)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.