Nothing
#'
#' Generate synthetic data from a dedicated factor model
#'
#' This function simulates data from a dedicated factor model. The parameters of
#' the model are either passed by the user or simulated by the function.
#'
#' @param N
#' Number of observations in data set.
#' @param dedic
#' Vector of indicators. The number of manifest variables is equal to the
#' length of this vector, and the number of factors is equal to the
#' number of unique nonzero elements. Each integer element indicates on
#' which latent factor the corresponding variable loads uniquely.
#' @param alpha
#' Vector of factor loadings, should be of same length as \code{dedic}.
#' If missing, values are simulated (see details below).
#' @param sigma
#' Idiosyncratic variances, should be of same length as \code{dedic}.
#' If missing, values are simulated (see details below).
#' @param R
#' Covariance matrix of the latent factors.
#' If missing, values are simulated (see details below).
#' @param R.corr
#' If TRUE, covariance matrix \code{R} is rescaled to be a correlation
#' matrix.
#' @param max.corr
#' Maximum correlation allowed between the latent factors.
#' @param R.max.trial
#' Maximum number of trials allowed to sample from the truncated
#' distribution of the covariance matrix of the latent factors
#' (accept/reject sampling scheme, to make sure \code{max.corr} is not
#' exceeded).
#'
#' @details The function simulates data from the following dedicated factor
#' model, for \eqn{i = 1, ..., N}:
#' \deqn{Y_i = \alpha \theta_i + \epsilon_i}
#' \deqn{\theta_i \sim \mathcal{N}(0, R)}{\theta_i ~ N(0, R)}
#' \deqn{\epsilon_i \sim \mathcal{N}(0, \Sigma)}{\epsilon_i ~ N(0, \Sigma)}
#' where the \eqn{K}-vector \eqn{\theta_i} contains the latent factors, and
#' \eqn{\alpha} is the \eqn{(M \times K)}{(M*K)}-matrix of factor loadings. Each
#' row \eqn{m} of \eqn{\alpha} contains only zeros, besides its element
#' indicated by the \eqn{m}th element of \code{dedic} that is equal to the
#' \eqn{m}th element of \code{alpha} (denoted \eqn{\alpha_m^\Delta} below).
#' The \eqn{M}-vector \eqn{\epsilon_i} is the vector of error terms, with
#' \eqn{\Sigma = diag(}\code{sigma}\eqn{)}. \eqn{M} is equal to the length of
#' the vector \code{dedic}, and \eqn{K} is equal to the maximum value of this
#' vector.
#'
#' Only \code{N} and \code{dedic} are required, all the other parameters can be
#' missing, completely or partially. Missing values (\code{NA}) are
#' independently sampled from the following distributions, for each manifest
#' variable \eqn{m = 1, ..., M}:
#'
#' Factor loadings:
#' \deqn{\alpha_m^\Delta = (-1)^{\phi_m}\sqrt{a_m}}{
#' \alpha_m^\Delta = (-1)^\phi_m\sqrt(a_m)}
#' \deqn{\phi_m \sim \mathcal{B}er(0.5)}{\phi_m ~ Ber(0.5)}
#' \deqn{a_m \sim \mathcal{U}nif (0.04, 0.64)}{a_m ~ Unif (0.04, 0.64)}
#'
#' Idiosyncratic variances:
#' \deqn{\sigma^2_m \sim \mathcal{U}nif (0.2, 0.8)}{
#' \sigma^2_m ~ Unif (0.2, 0.8)}
#'
#' For the variables that do not load on any factors (i.e., for which the
#' corresponding elements of \code{dedic} are equal to 0), it is specified that
#' \eqn{\alpha_m^\Delta = 0} and \eqn{\sigma^2_m = 1}.
#'
#' Covariance matrix of the latent factors:
#' \deqn{\Omega \sim \mathcal{I}nv-\mathcal{W}ishart(K+5, I_K)}{
#' \Omega ~ Inv-Wishart(K+5, I_K)}
#' which is rescaled to be a correlation matrix if \code{R.corr = TRUE}:
#' \deqn{R = \Lambda^{-1/2} \Omega \Lambda^{-1/2}}{
#' R = \Lambda^-1/2 \Omega \Lambda^-1/2}
#' \deqn{\Lambda = diag(\Omega)}
#'
#' Note that the distribution of the covariance matrix is truncated such that
#' all the off-diagonal elements of the implied correlation matrix \eqn{R} are
#' below \code{max.corr} in absolute value. The truncation is also applied if
#' the covariance matrix is used instead of the correlation matrix (i.e., if
#' \code{R.corr = FALSE}).
#'
#' The distributions and the corresponding default values used to simulate the
#' model parameters are specified as in the Monte Carlo study of CFSHP, see
#' section 4.1 (p.43).
#'
#' @return The function returns a data frame with \code{N} observations
#' simulated from the corresponding dedicated factor model.
#' The parameters used to generate the data are saved as attributes:
#' \code{dedic}, \code{alpha}, \code{sigma} and \code{R}.
#'
#' @author Rémi Piatek \email{remi.piatek@@gmail.com}
#'
#' @references
#' G. Conti, S. Frühwirth-Schnatter, J.J. Heckman,
#' R. Piatek (2014): ``Bayesian Exploratory Factor Analysis'',
#' \emph{Journal of Econometrics}, 183(1), pages 31-57,
#' \doi{10.1016/j.jeconom.2014.06.008}.
#'
#' @examples
#' # generate 1000 observations from model with 4 factors and 20 variables
#' # (5 variables loading on each factor)
#' dat <- simul.dedic.facmod(N = 1000, dedic = rep(1:4, each = 5))
#'
#' # generate data set with 5000 observations from the following model:
#' dedic <- rep(1:3, each = 4) # 3 factors and 12 manifest variables
#' alpha <- rep(c(1, NA, NA, NA), 3) # set first loading to 1 for each factor,
#' # sample remaining loadings from default
#' sigma <- rep(0.5, 12) # idiosyncratic variances all set to 0.5
#' R <- toeplitz(c(1, .6, .3)) # Toeplitz matrix
#' dat <- simul.dedic.facmod(N = 5000, dedic, alpha, sigma, R)
#'
#' @export simul.dedic.facmod
#' @import checkmate
#' @importFrom stats rWishart rnorm runif cov2cor
simul.dedic.facmod <- function(N, dedic, alpha, sigma, R, R.corr = TRUE,
max.corr = 0.85, R.max.trial = 1000)
{
# sanity checks for N and dedic, relabel appropriately
assertInt(N, lower = 1)
assertIntegerish(dedic, lower = 0, any.missing = FALSE, min.len = 3)
dedic <- relabel.dedic(dedic)
nvar <- length(dedic)
nfac <- max(dedic)
# factor loading matrix
if (missing(alpha)) {
alpha <- rep(NA, nvar)
}
assertNumeric(alpha, finite = TRUE, len = nvar)
nna <- sum(is.na(alpha))
if (nna > 0) {
alpha[is.na(alpha)] <- sample(c(-1, 1), nna, replace = TRUE) *
sqrt(runif(nna, min = 0.04, max = 0.64))
}
alpha.mat <- matrix(0, nvar, nfac)
for (i in 1:nvar) {
if (dedic[i] > 0)
alpha.mat[i, dedic[i]] <- alpha[i]
}
# error terms
if (missing(sigma)) sigma <- rep(NA, nvar)
assertNumeric(sigma, lower = 0.001, finite = TRUE, len = nvar)
nna <- sum(is.na(sigma))
if (nna > 0) sigma[is.na(sigma)] <- runif(nna, min = 0.2, max = 0.8)
sigma[dedic == 0] <- 1
eps <- matrix(rnorm(N * nvar), N, nvar) %*% diag(sqrt(sigma))
# latent factors
assertLogical(R.corr, len = 1, any.missing = FALSE)
assertNumber(max.corr, lower = 0, upper = 1)
assertInt(R.max.trial, lower = 1, upper = 10^6)
if (missing(R)) {
for (i in 1:R.max.trial) {
Omega <- rWishart(n = 1, df = nfac + 5, Sigma = diag(nfac))
R <- cov2cor(Omega[, , 1])
rho <- max(abs(R[lower.tri(R)]))
if (rho <= max.corr)
break
}
if (rho > max.corr)
stop("R.max.trial exceeded, could not sample R")
if (!R.corr)
R <- Omega
} else {
if (!is.pos.semidefinite.matrix(R))
stop("R is not positive semidefinite.")
if (R.corr & !all(diag(R) == 1))
stop("R is not a correlation matrix")
}
theta <- matrix(rnorm(N * nfac), N, nfac) %*% chol(R)
# manifest variables
Y <- theta %*% t(alpha.mat) + eps
colnames(Y) <- paste0("Y", 1:nvar)
rownames(Y) <- 1:N
# label parameters
names(dedic) <- paste0("Y", 1:nvar)
names(alpha) <- paste0("alpha:", 1:nvar)
names(sigma) <- paste0("sigma:", 1:nvar)
rownames(R) <- colnames(R) <- paste0("R:", 1:nfac)
# return simulated data
output <- as.data.frame(Y)
attr(output, "dedic") <- dedic
attr(output, "alpha") <- alpha
attr(output, "sigma") <- sigma
attr(output, "R") <- R
return(output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.