#' Simulate Data
#'
#' Simulate data for the CLCM
#' @param N integer specifying the sample size
#' @param number.timepoints integer specify the number of timepoints, 1 or 2
#' @param Q the Q-matrix, a matrix of 1s and 0s specifying the factor loading structure.
#' The default is 1 factor (K=1), which forms two latent classes
#' @param item.type character vector specifying the type of item to be modeled
#' \itemize{
#' \item `Ordinal` - Ordinal model, 1 slope parameter, C-1 intercept parameters,
#' where C is the number of categories. The slope parameter distinguishes the latent classes.
#' \item `Nominal` - Nominal or Multinomial model, 2*(C-1) parameters. The slope parameters
#' distinguish the latent classes.
#' \item `Poisson` - Poisson model, 2 parameters.
#' The model is paramterized using lambda.
#' See ?rpois for details.
#' The lambda parameter is stratified on the latent classes.
#' The slope parameter distinguishes the latent classes.
#'
#' \item `Neg_Binom` - Negative Binomial model, 3 parameters.
#' The model is parameterized using mu and size.
#' See ?rnbinom for details on parameters.
#' The mu parameter is stratified on the latent classes; the size parameter is common to
#' all latent classes.
#'The slope parameter distinguishes the latent classes.
#'
#'
#' \item `ZINB` - Zero-Inflated Negative Binomial model, 4 parameters.
#' The model is parameterized as mu, size, and zi (zero inflation parameter).
#' The mu parameter is stratified on the latent classes;
#' the size and zi parameters are common to all latent classes.
#' The slope parameter distinguishes the latent classes.
#'
#'
#' \item `ZIP`- Zero-Inflated Poisson model, 3 parameters.
#' The model is parameterized using lambda and zi (zero inflation parameter).
#' The lambda parameter is stratified on latent classes.
#' The zi parameter is common to all latent classes.
#' The slope parameter distinguishes the latent classes.
#'
#'
#' \item `Normal` - Normal distribution model, 2 parameters.
#' The model is parameterized using a mean and variance
#' The mean is stratified on latent classes.
#' The variance is common to all latent classes (pooled across latent classes)
#' The slope parameter distinguishes the latent classes.
#'
#'
#' \item `Beta` - Beta distribution model, 3 parameters.
#' Support ranges from 0 to 1.
#' The model is parameterized using shape1 and shape2 paramters.
#' See ?rbeta for details
#' The shape1 parameter is stratified on latent classes
#' The shape 2 parameter is common to all latent classes
#' The slope parameter distinguishes the latent classes.
#' }
#'
#' @param categories.j numeric vector specifying the number of categories of each item.
#' For 'Normal' or 'Beta' item types, the value should be NA
#'
#' @param lc.prop list of the latent class proportions at each timepoint.
#' For example, `lc.prop = list(c('Time_1' = c(0, 1), 'Time_2' = c(0.6, 0.4))`
#' specifies that at timepoint 1 all subjects are in latent class 2,
#' and at timepoint 2, 40% are in latent class 2, with the remainder in latent class 1
#' @param transition.matrix a 2^K by 2^K numeric matrix that specifies the transition probabilities.
#' This is used in conjunction with the lc.prop at timepoint 1.
#' See Vignettes for a detailed example.
#'
#' @param post a matrix of the true posterior distributions - long data format.
#' Generate the posterior distributions according to user-preference, then pass posterior distributions
#' to the function.
#' This is the preferred way to specify the latent class proportions and transition probabilities because it
#' offers maximum control and flexibility.
#' See Vignettes for detailed examples on generating posterior distributions.
#'
#' @param param list of item parameters, default is to use the values in the function
#' @param item.names character vector of item names
#' @return Returns simulated item responses and posterior distributions.
#' @export
#' @examples
#' \dontrun{
#' set.seed(3112021)
#' simulate_clcm(N=50, number.timepoints = 1,
#' item.type = rep('Ordinal', 5),
#' categories.j = rep(4, 5),
#' lc.prop = list('Time_1' = c(0.5, 0.5)) )
#' }
#'
simulate_clcm <- function(N,
number.timepoints,
Q = NULL,
item.type = NULL,
categories.j = NULL,
transition.matrix = NULL,
lc.prop = NULL,
post = NULL,
param = NULL,
item.names = NULL){
# Check:
if(is.null(item.type)){ stop('Specify Item Type - see documentation for options') }
if(is.null(categories.j)){ stop('Specify the number of categories per item - see documentation for details') }
if( is.null(lc.prop) & is.null(post) ){ stop('Pass latent class structure - see documentation for details') }
if( !is.null(lc.prop) & !is.null(post) ){ stop('Pass **either** latent class proportions or matrices')}
if(is.null(Q)){ print('No Q-matrix passed to estimation function; default is two latent classes');
Q <- matrix(1, nrow = length(item.type), ncol = 1, dimnames = list(paste0('Item_', 1:length(item.type)), NULL))
}
# Requires Q-matrix - create alpha and eta from Q-matrix
# TODO extend eta to other types of condensation rules - currently only conjunctive
# All condensation rules are equivalent for single attribute items
K <- ncol(Q)
alpha <- pattern(K)
eta <- alpha %*% t(Q)
eta <- ifelse(eta == matrix(1,2^K,1) %*% colSums(t(Q)),1, 0 )
# pass eta to all functions
dat <- data.frame(
'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')), length.out= N*number.timepoints),
'Time' = rep(paste0('Time_', 1:number.timepoints), each = N),
stringsAsFactors=F)
# Initialize
J <- length(item.type)
# Likely will not pass item parameters, but is an option
if(is.null(param)){
param <- vector(mode = "list", length = J)
names(param) <- paste0('Item_', 1:J, ' ', item.type)
# Loop to set item parameter values:
for(j in 1:J){
if(item.type[j] == 'Ordinal'){
tmp1 <- 2
names(tmp1) <- 'slope'
tmp2 <- seq(1, -2, length.out = categories.j[j] - 1)
names(tmp2) <- paste0('intercept_', 1:(categories.j[j] - 1))
tmp <- c(tmp1, tmp2)
param[[j]] <- tmp
}
if(item.type[j] == 'Neg_Binom'){
tmp <- c(2, 1, 1)
names(tmp) <- c('slope', 'intercept', 'size')
param[[j]] <- tmp
}
if(item.type[j] == 'Poisson'){
tmp <- c(2, 1)
names(tmp) <- c('slope', 'intercept')
param[[j]] <- tmp
}
if(item.type[j] == 'ZINB'){
tmp <- c(2, 1, 1, 1)
names(tmp) <- c('slope', 'intercept', 'size', 'zi')
param[[j]] <- tmp
}
if(item.type[j] == 'ZIP'){
tmp <- c(2, 1, 1)
names(tmp) <- c('slope', 'intercept', 'zi')
param[[j]] <- tmp
}
if(item.type[j] == 'Normal'){
tmp <- c(2, 0, 1)
names(tmp) <- c('slope', 'intercept', 'sigma')
param[[j]] <- tmp
}
if(item.type[j] == 'Beta'){
tmp <- c(2, 2, 2)
names(tmp) <- c('slope', 'intercept', 'shape2')
param[[j]] <- tmp
}
if(item.type[j] == 'Nominal'){
# Matrix
tmp1 <- seq(-2, -1, length.out = categories.j[j] - 1)
tmp2 <- seq(1, 2, length.out = categories.j[j] - 1)
tmp <- rbind(tmp1, tmp2)
rownames(tmp) <- c('intercept', 'slope')
colnames(tmp) <- paste0('k_', 1:(categories.j[j]-1))
param[[j]] <- tmp
}
}# end loop to set parameter values
} else { # end if else for optional item parameter - likely will require generation in the above
param <- param # if you pass item parameters,just use those
}
# Joint Distribution:
# If passed the latent class proportions, then sample from those proportions
if(!is.null(post)){
# if passed a matrix of subject-level latent classifications, use those
# Note that can use latent regression to generate these - see Vignettes
post <- post
} else {
if(!is.null(transition.matrix)){
post <- matrix(0, nrow = nrow(dat), ncol = 2 ^ K)
# First time point:
tt <- sort(unique(dat$Time))[1]
ii <- which(dat[ , 'Time'] == tt)
post.1 <- post[ ii, ]
post.1[ cbind(1:length(ii), sample( c(1:2^K), size = length(ii), replace = T, prob = lc.prop[[ tt ]]))] <- 1
post[ ii , ] <- post.1
# Second time point:
tt <- sort(unique(dat$Time))[2]
ii <- which(dat[ , 'Time'] == tt)
post.2 <- post.1 %*% tau
# Posterior Distributions all sum to 1?
if(!all(apply(post.2, 1, sum) == 1)){ stop("Posterior distributions don't sum to 1; check latent class proportions and transition matrix") }
p2 <- t(apply(post.2, 1, cumsum))
lca2 <- apply(matrix(runif(n = nrow(p2)), nrow = nrow(p2), ncol = ncol(p2), byrow = T) > p2, 1, sum) + 1
post.2[] <- 0
post.2[ cbind(1:length(lca2), lca2) ] <- 1
post[ ii , ] <- post.2
} else {
post <- matrix(0, nrow = nrow(dat), ncol = 2 ^ K)
for(tt in unique(dat$Time) ){
ii <- which(dat[ , 'Time'] == tt)
post.t <- post[ ii, ]
post.t[ cbind(1:length(ii), sample( c(1:2^K), size = length(ii), replace = T, prob = lc.prop[[ tt ]]))] <- 1
post[ ii , ] <- post.t
}# end tt
}
} # end Generation of Joint Distribution
# Initialize Item Responses:
X <- matrix(NA, nrow = nrow(dat), ncol = J)
# Generate Item Responses:
j <- 1
for(j in 1:J){
# Reduced Posterior Distribution
# from 2^K down to 2, conjunctive models
# use eta_j to create it
post.reduced <- post %*% cbind(1- eta[ , j], eta[, j])
if(item.type[j] == 'Beta'){
Pj <- Return_logPj(item.type_j = item.type[j], param_j = param[[j]], j = j, eta = eta, categories.j_j = NA)
shape2 <- Pj['shape2']
Pj <- matrix(Pj[grep('shape1', names(Pj))], ncol = 1)
## shape1 <- post %*% Pj
shape1 <- post.reduced %*% Pj
X[ , j] <- rbeta(n = nrow(shape1), shape1 = shape1, shape2 = shape2)
}# End Beta
if(item.type[j] == 'Normal'){
Pj <- Return_logPj(item.type_j = item.type[j], param_j = param[[j]], j = j, eta = eta, categories.j_j = NA)
sigma <- Pj['sigma']
Pj <- matrix(Pj[grep('mean', names(Pj))], ncol = 1)
## mean.l <- post %*% Pj
mean.l <- post.reduced %*% Pj
X[, j] <- rnorm(n = nrow(mean.l), mean = mean.l, sd = sigma) #
}# End Normal
if(item.type[j] == 'Ordinal'){
logPj <- Return_logPj(item.type_j = item.type[j], param_j = param[[j]], j = j, eta = eta, categories.j_j = NA)
Pj <- exp(logPj)
## tmp <- post %*% t(Pj)
tmp <- post.reduced %*% t(Pj)
X[, j] <- apply(runif(n = N) > t(apply(tmp, 1, cumsum)), 1, sum) #+ 1 TODO: TEST! 2.20.21
}# End Ordinal
if(item.type[j] %in% c('Nominal', 'Neg_Binom', 'Poisson', 'ZIP', 'ZINB') ){
logPj <- Return_logPj(item.type_j = item.type[j], param_j = param[[j]], j = j, eta = eta,
categories.j_j = 0:(categories.j[j]-1))
Pj <- exp(logPj)
## Pij <- post %*% t(Pj)
Pij <- post.reduced %*% t(Pj)
for(i in 1:nrow(Pij)){
X[i, j] <- sample(x = 0:(categories.j[j]-1), size = 1, prob = Pij[i, ])
}
}# End Neg Binom, Poisson, ZIP, ZINB
}# End Item Generation Loop
# Item Names
if(is.null(item.names)){ item.names <- paste0('Item_', 1:J) }
colnames(X) <- item.names
post.names <- paste0('true_post_LC_', apply(alpha, 1, paste0, collapse = '')) # 3.6.21, fixed
colnames(post) <- post.names
lca <- apply(post, 1, which.max)
#names(lca) <- 'true_lca'
dat <- cbind.data.frame(dat, X, post, 'true_lca' = lca) # 3.8.21: added in lca for ease
return(list('dat' = dat,
'item.responses' = X,
'post' = post,
'lca' = lca,
'param' = param,
'item.type' = item.type,
'item.names' = item.names,
'categories' = categories.j,
'lc.prop' = lc.prop,
'Q' = Q,
'alpha' = alpha,
'K' = K,
'eta' = eta))
} #end Simulate_Data function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.