# R/simulate.R In truthiness: Illusory Truth Longitudinal Study

#### Documented in derive_fixed

#' @importFrom stats qnorm
#' @importFrom stats rnorm
NULL

#' Derive Fixed-Effects Parameters from Phase-by-Phase Effects
#'
#' @param phase_eff A four-element vector specifying the illusory
#'   truth effect at each of the four testing phases (in log odds
#'   units).
#'
#' @return Vector with eight elements containing the fixed effects
#'   coefficients for deviation-coded predictors.
#'
#' @export
derive_fixed <- function(phase_eff) {
if (length(phase_eff) != 4L)
stop("'phase_eff' must be a vector of length 4.")

a <- phase_eff
b <- phase_eff
c <- phase_eff
d <- phase_eff

c((Intercept) = (a + b + c + d) / 8,
R             = (a + b + c + d) / 4,
I1            = (b - a) / 2,
I2            = (c - a) / 2,
I3            = (d - a) / 2,
R:I1        = (b - a),
R:I2        = (c - a),
R:I3        = (d - a))
}

#' Simulate Truth Rating Data
#'
#' @param nsubj Number of subjects. Because of counterbalancing, must
#'   be a multiple of 8.
#'
#' @param phase_eff A four-element vector giving the size of the
#'   illusory truth effect at each of the four phases (on the log odds
#'   scale). Use \code{rep(0, 4)} for testing Type I error rate. A
#'   value of .14 gives an effect of approximately 1/10 of a scale
#'   point.
#'
#' @param thresh Cut-points (thresholds) for the seven point scale
#'   (must be a six-element vector).
#'
#' @param subj_rfx A 4x4 covariance matrix with by-subject variance
#'   components for the intercept, main effect of repetition, main
#'   effect of interval, and repetition-by-interval interaction. Only
#'   the variances (elements on the diagonal) are used in the
#'   simulation (see Details).
#' @param item_rfx A 4x4 covariance matrix with by-statement variance
#'   components for the intercept, main effect of repetition, main
#'   effect of interval, and repetition-by-interval interaction. Only
#'   the variances (elements on the diagonal) are used in the
#'   simulation (see Details).
#' @param dropout A vector encoding assumptions about the proportion
#'   of subjects dropping out of the study over the four testing
#'   intervals (immediate, 1 day, 1 week, 1 month). The first element
#'   represents the proportion of subjects who completed the first
#'   phase (immediate) but who drop out before the next interval one
#'   day later. The second element represents the proportion of the
#'   remaining participants dropping out after 1 day and before 1
#'   week. The third and final element represents the proportion of
#'   remaining participants dropping out after 1 week and before 1
#'   month. For example, the default values of \code{c(.05, .1, .1)}
#'   encode dropout rates of 5\%, 10\%, and 10\%.
#'
#' @details By default, the thresholds and parameter estimates for
#'   variance components used in the simulation are from the
#'   data. Only the variances from the by-subject and by-item
#'   covariance matrices are used. Unlike Nadarevic and Erdfelder, who
#'   only had two testing intervals, the simulated study assumes four
#'   intervals, coded by three predictors for the main effect and
#'   three for the interaction with repetition. The code below depicts
#'   how the four-element variance vector from the original study is
#'   translated into the eight variances needed for the simulated
#'   data.
#'
#' \code{newvar_subj <- rep(diag(subj_rfx), c(1, 1, 3, 3))}
#'
#' \code{newvar_item <- rep(diag(item_rfx), c(1, 1, 3, 3))}
#'
#' The simulated data includes ratings for 128 stimulus items for each
#'   subject. Half of the statements are repeated (old) and half are
#'   new.  A quarter of the items (32) are tested at each phase.
#'
#' It is assumed that the key effect present in the data is the
#' interaction term, which is designed to represent an illusory-truth
#' effect that first appears at the second testing interval (1 day)
#' and remains over the subsequent two intervals without changing
#' size. All other fixed effects in the model (main effect of R and
#' three effects encoding the main effect of interval) are driven by
#' the interaction term.
#'
#' @return A data frame, with \code{nsubj * 128} rows and 11 variables, where:
#'
#' \describe{
#'
#' \item{\code{subj_id}}{Unique subject identifier.}
#'
#' \item{\code{list_id}}{Which set of statements the subject received.}
#'
#' \item{\code{stim_id}}{Unique stimulus (statement) identifier.}
#'
#' \item{\code{repetition}}{Whether the statement was old or new.}
#'
#' \item{\code{interval}}{Testing interval (immediate, 1 day, 1 week, 1 month).}
#'
#' \item{\code{eta}}{The simulated response tendency, on the log odds scale.}
#'
#' \item{\code{trating}}{The simulated rating value.}
#'
#' \item{\code{R}}{Deviation-coded predictor for repetition (old = 1/2,
#'   new = -1/2).}
#'
#' \item{\code{I1}}{Deviation-coded predictor for interval comparing baseline
#'   (immediate) to 1 day.}
#'
#' \item{\code{I2}}{Deviation-coded predictor for interval comparing
#'   baseline (immediate) to 1 week.}
#'
#' \item{\code{I3}}{Deviation-coded predictor for interval comparing
#' baseline (immediate) to 1 month.}
#'
#' }
#'
#' @examples
#' # demonstrate how to convert from four variances to eight
#' four_var <- diag(ordinal::VarCorr(clmm_maximal)$subj_id) #' four_var #' rep(four_var, c(1, 1, 3, 3)) #' #' # basic usage #' dat <- gen_data(256) #' #' # demonstrate deviation coding #' dat %>% dplyr::distinct(repetition, interval, R, I1, I2, I3) #' #' # demonstrate dropouts #' dat %>% dplyr::distinct(subj_id, interval) %>% dplyr::count(interval) #' #' @seealso \code{\link{clmm_maximal}}, \code{\link{NE_exp1}} #' @export gen_data <- function(nsubj, phase_eff = rep(0, 4), thresh = alpha_6_to_7(truthiness::clmm_maximal$alpha),
subj_rfx =
ordinal::VarCorr(truthiness::clmm_maximal)$subj_id, item_rfx = ordinal::VarCorr(truthiness::clmm_maximal)$item_id,
dropout = c(.05, .1, .1)) {
list_id <- n <- repetition <- interval <- NULL
(Intercept).s <- (Intercept).i <- R.s <- R.i <- R <- NULL
I1.s <- I1.i <- I1 <- I2.s <- I2.i <- I2 <- I3.s <- I3.i <- I3 <- NULL
R:I1.s <- R:I2.s <- R:I3.s <- R:I1.i <- R:I2.i <- R:I3.i <- NULL
eta <- subj_id <- stim_id <- trating <- NULL

if (nsubj %% 8)
stop("'nsubj' must be multiple of 8 (the number of stimulus lists)")

if (length(thresh) != 6L)
stop("'thresh' must be a six-element vector")

nitem <- truthiness::stimulus_conditions %>%
dplyr::count(list_id) %>% dplyr::pull(n) %>% unique()
stopifnot(length(nitem) == 1L)

betas <- derive_fixed(phase_eff)

## variance-covariance matrices
## covariances set to zero
subj_mx <- matrix(0, nrow = 8, ncol = 8,
dimnames = list(names(betas), names(betas)))
diag(subj_mx) <- rep(diag(subj_rfx), c(1, 1, 3, 3))

item_mx <- matrix(0, nrow = 8, ncol = 8,
dimnames = list(names(betas), names(betas)))
diag(item_mx) <- rep(diag(item_rfx), c(1, 1, 3, 3))

## generate random effects for subjects
mus <- rep(0, length(names(betas)))
names(mus) <- names(betas)
sfx <- MASS::mvrnorm(nsubj, mus, subj_mx) %>%
tibble::as_tibble() %>%
dplyr::mutate(subj_id = factor(dplyr::row_number()))

## generate random effects for items
ifx <- MASS::mvrnorm(nitem, mus, item_mx) %>%
tibble::as_tibble() %>%
dplyr::mutate(stim_id = factor(dplyr::row_number()))

subj <- tibble::tibble(subj_id = factor(seq_len(nsubj)),
list_id = factor(rep(1:8, nsubj / 8L)))

trials <- dplyr::inner_join(subj, truthiness::stimulus_conditions,
"list_id") %>%
dplyr::inner_join(sfx, "subj_id") %>%
dplyr::inner_join(ifx, "stim_id", suffix = c(".s", ".i")) %>%
dplyr::mutate(
R = dplyr::if_else(repetition == "repeated", 1/2, -1/2),
I1 = dplyr::if_else(interval == "1 day", 3/4, -1/4),
I2 = dplyr::if_else(interval == "1 week", 3/4, -1/4),
I3 = dplyr::if_else(interval == "1 month", 3/4, -1/4),
eta =
(Intercept).s + (Intercept).i +
(R.s + R.i + betas["R"]) * R +
(I1.s + I1.i + betas["I1"]) * I1 +
(I2.s + I2.i + betas["I2"]) * I2 +
(I3.s + I3.i + betas["I3"]) * I3 +
(R:I1.s + R:I1.i + betas["R:I1"]) * R * I1 +
(R:I2.s + R:I2.i + betas["R:I2"]) * R * I2 +
(R:I3.s + R:I3.i + betas["R:I3"]) * R * I3,
trating = factor(eta2resp(eta, thresh + betas["(Intercept)"]),
levels = 1:7,
ordered = TRUE)) %>%
dplyr::select(subj_id, list_id, stim_id, repetition, interval, eta,
trating, R, I1, I2, I3)

## now drop participants
## nremaining * dropout rate
ndrop <- ceiling(dropout * nsubj)
ndrop <- ceiling((nsubj - ndrop) * dropout)
ndrop <- ceiling((nsubj - ndrop - ndrop) * dropout)

all_sphase <- dplyr::distinct(trials, subj_id, interval)

remaining <- dplyr::distinct(trials, subj_id) %>% dplyr::pull() %>%
as.integer()
lvls <- trials %>% dplyr::pull(interval) %>% levels()
res <- list()
for (.i in seq_along(ndrop)) {
dropped <- sample(remaining, ndrop[.i])
res[[.i]] <- tidyr::crossing(subj_id = dropped,
interval = lvls[(.i + 1L):length(lvls)])
remaining <- dplyr::setdiff(remaining, dropped)
}

stopifnot((dplyr::intersect(
dplyr::intersect(dplyr::distinct(res[], subj_id),
dplyr::distinct(res[], subj_id)),
dplyr::distinct(res[], subj_id)) %>% nrow()) == 0L)

discard <- dplyr::bind_rows(res[], res[], res[]) %>%
dplyr::mutate(subj_id = factor(subj_id,
levels = levels(trials[["subj_id"]])),
interval = factor(interval, levels = lvls))