Nothing
#' @importFrom MASS mvrnorm
#' @importFrom stats rnorm rgamma rbeta
######## GENERATE A DATAFRAME WITH SIMULATED DATA ###########################################
# Sven Hohenstein, Reinhold Kliegl, 2009-2015
# Version 0.6.3 (June 2015)
#
# changes:
# - removed all `cat`s
################
# INPUT:
#
# B: vector of between-subject factors levels (it's length equals the number of between-subject factors)
# (you can use B = c() or B = NULL if there are no between-subject factors)
# W: vector of within-subject factors levels (it's length equals the number of within-subject factors)
# (you can use W = c() or W = NULL if there are no within-subject factors)
# n: number of observations within each between-subjects factor level combination (natural number)
# (the total number of observations equals n * prod(B); the minimum n equals prod(W), but can not be lower than 2)
# M: Matrix specifying the cell means of crossing between- and within-subject factors (dimensions: prod(B)-by-prod(W) matrix)
# (for pure within-subjects designs, it is possible to input a vector of size prod(W) as an alternative to a 1-by-prod(W) matrix)
# OR
# a scalar (single number) that will be the mean of all cells
# SD: Matrix specifying the cell standard deviations of crossing between- and within-subject factors (dimensions: prod(B)-by-prod(W) matrix)
# (for pure within-subjects designs, it is possible to input a vector of size prod(W) as an alternative to a 1-by-prod(W) matrix)
# OR
# a scalar (single number) that will be the standard deviation of all cells
# R: List of matrices specifying the correlations of the witin-subject factor level combinations
# (list length: prod(B); dimensions of matrices: prod(W)-by-prod(W))
# OR
# a singe matrix that will be used for the correlation of all within-subject factor level combinations
# OR
# a scalar (single number) that will be the correlation of all within-subject factor level combinations (except the diagonal)
# OR
# a list of length prod(B) containing only single numbers or single numbers and matrices
# Note: R is is ignored if there are no within-subject factors
# miss: Matrix specifying the proportion of random data loss within each cell of crossing between- and within-subject factors
# (dimensions: prod(B)-by-prod(W) matrix)
# (for pure within-subjects designs, it is possible to input a vector of size prod(W) as an alternative to a 1-by-prod(W) matrix)
# OR
# a scalar (single number), [0, 1], that will be the relative amount of data loss in each cell
# long: if TRUE each line of the resulting data frame contains of one value of the dependent variable
# if FALSE all (within-subject) measures of one subject are in one line of the resulting data frame
# family: string specifying the output distribution; "gaussian", "gamma", or "beta"
# empirical: if TRUE the random values will match the specified properties exactly (at least for gaussian data)
# if FALSE all values will be drawn randomly from a distribution with the specified properties
################
################
# OUTPUT:
#
# data frame containing the labels for the between- and-within factors, the subject (id) numbers, and the measurements obtained randomly from a normal distribution
# (the values of the dependent variable are random BUT they match the mean, standard deviation, and correlation values of the input)
################
################
# Please note:
# This works perfectly for guassian distributions.
# But note two suboptimal instances occuring with gamma and beta distributions:
# 1) Obtained means and standard deviations will slightly differ from the input. The deviation decreases with the number of subjects (observations).
# 2) Obtained correlations will slightly differ from the input. The deviation decreases with the number of subjects (observations).
# Whereas the classical Pearson correlation statistic is informative for gaussian distributions, it is hardly convenient for gamma and beta distributions.
# If a gamma or beta distribution is used, the Spearman rank correlation is the appropriate statistic. This method is used if cor() is called with the parameter method="spearman".
# The obtained Spearman correlations will be more close to the input values than Pearson correlations.
################
mixedDesign <- function(B = NULL, W = NULL, n = ifelse(is.null(W), 2, prod(W)), M = 0, SD = 1, R = 0, miss = 0, family = "gaussian", long = FALSE, empirical = TRUE) {
##### catch wrong input, e.g., B <- c(1, 2, 3)
if (any(B < 2)) { # remove illegal conditions (less than 2 conditions per factor)
warning("Removing illegal between-subject factors.")
ix <- which(B < 2)
B <- B[- ix]
}
nB <- length(B)
if (any(W < 2)) { # remove illegal conditions (less than 2 conditions per factor)
warning("Removing illegal within-subject factors.")
ix <- which(W < 2)
W <- W[- ix]
}
nW <- length(W) # number of within-subject factors
if (nB || nW) {
nBcomb <- 1 # number of within-subject factor level combinations
# this start value remains constant (1) for cases of pure within-subject designs since there is no "X by 0" design
if (nB) { # this segment is ignored if there are no between-subject factors
Blabel <- NULL # list with item labels
for (i in 1:nB) {
Blabel[[i]] <- paste(LETTERS[i], 1:B[i], sep = "")
nBcomb <- nBcomb * B[i]
}
}
nWcomb <- 1 # number of within-subject factor level combinations
# this start value remains constant (1) for cases of pure between-subject designs since there is no "X by 0" design
if (nW) { # this segment is ignored if there are no within-subject factors
Wlabel <- NULL # list with item labels
Wlabel_comb <- NULL # combination of factor level items
for (i in 1:nW) {
Wlabel_comb <- rep(Wlabel_comb, each = W[[i]])
Wlabel[[i]] <- paste(letters[i], 1:W[i], sep = "")
if (i == 1) Wlabel_comb <- Wlabel[[i]] else Wlabel_comb <- paste(Wlabel_comb, rep(Wlabel[[i]], nWcomb), sep = "_") # arrangement of combinations
nWcomb <- nWcomb * W[i]
}
# add "W" to the labels of the within factors
Wlabel_comb <- paste("W_", Wlabel_comb, sep ="")
}
if (n < 2) stop("Not enough subjects for the given design")
if (n < nWcomb) { # if subjects are not sufficient to produce data frame, additional subjects are added and later removed
warning("Not enough subjects for the given design:\n the resulting means, standard deviations, and correlations may differ from the specified ones")
n.dif <- nWcomb - n # subjects to remove
n <- nWcomb
} else n.dif <- 0 # no subjects to remove
# Means: allow vectors as an alternative to 1-by-X matrices
if (!nB && !is.matrix(M)) {
M <- as.matrix(M) # necessary for comparing dimensions
dim(M) <- rev(dim(M)) # change vector to 1-by-X matrix
}
if (!all(dim(as.matrix(M)) == c(nBcomb, nWcomb))) {
#generate matrix of mean values)
if (!all(dim(as.matrix(M)) == c(1, 1))) warning("Inanpropriate matrix of means. Using first value for all cell means.") # more than one element
CellMeans <- matrix(M[1], nBcomb, nWcomb)
} else CellMeans <- M
# SDs: allow vectors as an alternative to 1-by-X matrices
if (!nB && !is.matrix(SD)) {
SD <- as.matrix(SD) # necessary for comparing dimensions
dim(SD) <- rev(dim(SD)) # change vector to 1-by-X matrix
}
if (!all(dim(as.matrix(SD)) == c(nBcomb, nWcomb))) {
#generate matrix of sd values)
if (!all(dim(as.matrix(SD)) == c(1, 1))) warning("Inanpropriate dimensions of matrix of SDs. Using first value for all cell SDs.")
CellSDs <- matrix(SD[1], nBcomb, nWcomb)
} else CellSDs <- SD
if (any(CellSDs < 0)) stop("Negative values are not allowed for standard deviation parameter.")
if (!nB && !is.matrix(miss)) {
miss <- as.matrix(miss) # necessary for comparing dimensions
dim(miss) <- rev(dim(miss)) # change vector to 1-by-X matrix
}
if (!all(dim(as.matrix(miss)) == c(nBcomb, nWcomb))) {
#generate matrix of data loss proportions)
if (!all(dim(as.matrix(miss)) == c(1, 1))) warning("Inanpropriate dimensions of matrix of data loss proportions. Using first value for all cell data loss proportions.")
Cell.loss <- matrix(miss[1], nBcomb, nWcomb)
} else Cell.loss <- miss
# checking data loss values, must be between 0 (no data loss) and 1 (complete data loss)
if (any(Cell.loss < 0 | Cell.loss > 1)) {
stop("Inappropriate values for data loss proportions. All values must be between 0 and 1.") }
# check correlation matrices
if (any(unlist(R) < -1 | unlist(R) > 1) & nW) {
stop("Inappropriate values for correlation matrix. All values must be between -1 and 1.")
}
N <- n*nBcomb # total number of subjects
# ... generate dataframe compatible with means, standard deviation, and cors for w-s factor
dat.w <- matrix(rnorm(nWcomb * N), N)
# check correlation matrix
if (nW) {
# put single matrix in list (if there are no between-subject factors)
if (!is.list(R)) {
if (!nB) {
R <- list(R)
} else {
# use single matrix for all correlation matrices
R <- replicate(nBcomb, list(R))
warning("Using identical correlation matrix for all groups.")
oneCorMat <- TRUE # one correlation matrix (or one single number) for all correlations
}
} else { # test length of correlation list R
if (nB & length(R) != nBcomb) stop("Inappropriate matrix list 'R' specified. 'R' must be a list with a length according to the number of (between-subject) groups or a single matrix or a single number.")
oneCorMat <- FALSE # multiple correlation matrices (or single values)
}
for (group in 1 : nBcomb) {
if (!all(dim(as.matrix(R[[group]])) == c(nWcomb, nWcomb))) {
# does the inanpropriate matrix contain more than a single value?
if (!all(dim(as.matrix(R[[group]])) == c(1, 1))) {
warning(paste("Inanpropriate dimensions of ", group, ". correlation matrix. Using first value for all correlations (except the diagonal).", sep = ""))
} else if (nB) if (!oneCorMat) warning("Using ", R[[group]][1], " as value for all correlations in group ", group, ".\n", sep ="") # not if there is a single matrix/value
# create correlation matrix
R[[group]] <- matrix(R[[group]][1], nWcomb, nWcomb)
diag(R[[group]]) <- 1
} else {
# since only the values below the diagonal are relevant, the values above are replaced by them
R[[group]][upper.tri(R[[group]])] <- R[[group]][lower.tri(R[[group]])]
diag(R[[group]]) <- 1
}
}
} else R <- replicate(nBcomb, list(1)) # if there are no correlations (no within-subject factors)
for (group in 1 : nBcomb) {
# group index
ix <- (n * (group - 1) + 1) : (n * group)
# function mvrnorm() from MASS package
dat.w[ix, ] <- MASS::mvrnorm(n = n, mu = rep(0, nWcomb), Sigma = R[[group]], empirical = empirical)
}
# remove additional subjects (if n < prod(W))
if (n.dif != 0) {
ix.rem <- NULL #lines to be removed
for (i in 1:nBcomb) {
ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines)
ix <- ix[(n - n.dif + 1) : n] # lines with additional subjects (to be removed)
ix.rem <- c(ix.rem, ix)
}
dat.w <- dat.w[setdiff(1:N, ix.rem), ] # remove additional subject lines
n <- n - n.dif # give n its actual value back
N <- n * prod(B) # the actual value of N
}
# ... between-group effect
if (nB) {
#dat.b <- expand.grid(id=1:n, group=Blabel)
dat.b_small <- expand.grid(rev(Blabel))
dat.b <- data.frame(matrix(NA, N, nB))
colnames(dat.b) <- paste("B", LETTERS[1:nB], sep ="_")
for (i in nB:1) {
dat.b[ , nB + 1 - i] <- rep(dat.b_small[, i], each = n)
}
dat.b$id <- factor(1:N)
#dat.b$id <- factor(paste(dat.b$group, dat.b$id, sep=""))
} else {
dat.b <- data.frame(id = factor(1:N)) # a data frame containing the subject id
}
# do operations according distribution family
if (tolower(family) == "gaussian") {
# ... put mean and standard deviation in data
for (i in 1:nBcomb) {
ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines)
for (j in 1:n) {
dat.w[ix[j], ] <- dat.w[ix[j], ] * CellSDs[i, ]
dat.w[ix[j], ] <- dat.w[ix[j], ] + CellMeans[i, ]
}
}
} else if (tolower(family) == "gamma") {
if (any(CellMeans <= 0)) stop("Inappropriate mean for gamma distribution specified. Choose M > 0.")
if (any(CellSDs <= 0)) stop("Inappropriate standard deviation for gamma distribution specified. Choose SD > 0.")
# generate random beta distributed values
for (i in 1:nBcomb) {
ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines)
for (k in 1:nWcomb) {
# compute rank of gaussian distributed data of one factor level combination (between and within)
rg <- rank(dat.w[ix, k])
# calculate alpha (shape parameter) and theta (scale parameter) to obtain the desired mean and standard deviation
alpha <- CellMeans[i, k]^2 / CellSDs[i, k]^2
theta <- CellSDs[i, k]^2 / CellMeans[i, k]
# create random gamma data
dat.w[ix, k] <- rgamma(n, shape = alpha, scale = theta)
# apply rank of gaussian distributed values to ordered gamma distributed values
dat.w[ix, k] <- dat.w[ix, k][order(dat.w[ix, k])][rg]
}
}
} else if (tolower(family) == "beta") {
if (any(0 >= CellMeans | CellMeans >= 1)) stop("Mean of beta distribution must be in the closed interval (0, 1).")
if (any(0 >= CellSDs | CellSDs >= 0.5)) stop("Standard deviation of beta distribution must be in the closed interval (0, 0.5).")
# generate random beta distributed values
for (i in 1:nBcomb) {
ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines)
for (k in 1:nWcomb) {
# compute rank of gaussian distributed data of one factor level combination (between and within)
rg <- rank(dat.w[ix, k])
# calculate alpha1 (shape parameter) and alpha2 (shape parameter) to obtain the desired mean and standard deviation
alpha <- -((1 / CellSDs[i, k]^2) * CellMeans[i, k] * (CellMeans[i, k]^2 - CellMeans[i, k] + CellSDs[i, k]^2))
beta <- (1 / CellSDs[i, k]^2) * (CellMeans[i, k] - 1) * (CellMeans[i, k]^2 - CellMeans[i, k] + CellSDs[i, k]^2)
# create random beta data
dat.w[ix, k] <- rbeta(n, alpha, beta)
if (any(is.na(dat.w[ix, k]))) {
stop(paste("Failed to create beta distribution with mean ", CellMeans[i, k], " and standard deviation ", CellSDs[i, k], ". Choose different parameters.", sep = "")) }
# apply rank of gaussian distributed values to ordered beta distributed values
dat.w[ix, k] <- dat.w[ix, k][order(dat.w[ix, k])][rg]
}
}
} else stop("Unknown distribution family; possible are gaussian, gamma, and beta distributions.")
# apply data loss randomly
# 1) change proportion to absolute value
Cell.loss <- round(Cell.loss * n)
# 2) apply within each cell of n subjects
for (i in 1:nBcomb) {
ix <- (n * (i - 1) + 1) : (n * i)
for (j in 1:nWcomb) {
remove <- sample(1:n, Cell.loss[i, j], replace = FALSE)
dat.w[ix, j][remove] <- NA
}
}
# ... combine dat.b and dat.w (or dat.l)
if (long | !nW) { # there is no wide format for between-subjects designs since each subject is associated with exactly 1 value (DV)
dat.l <- as.vector(as.matrix(dat.w)) #long format
dat <- NULL # this will be the final data frame
for (i in 1:nWcomb) {
dat <- rbind(dat, dat.b)
}
if (nW) {
# ... combine with information of within-subject factor levels combinations
Wcond <- rev(expand.grid(rev(Wlabel)))
names(Wcond) <- paste("W_", letters[1:nW], sep = "")
Wcond.l <- NULL
for (i in 1:N) { # span a data frame
Wcond.l <- rbind(Wcond.l, Wcond)
}
for (i in 1:nWcomb) { # modify data frame (each row of Wcond is repeated N times)
ix <- (N * (i - 1) + 1) : (N * i)
Wcond.l[ix, ] <- Wcond[i, ]
}
dat <- cbind(dat, Wcond.l) # combine between-subject information and within-subject labels
}
dat <- cbind(dat, dat.l) # combine (between-subject information and within-subject labels) and (measures (DV))
names(dat)[2 + nB + nW] <- "DV"
# order data frame along id
dat <- data.frame(dat[order(dat$id), ], row.names = NULL)
} else {
colnames(dat.w) <- Wlabel_comb
dat <- cbind(dat.b, dat.w)
}
} else dat <- NULL # if there are neither between- nor within-subject factors
return(dat)
} # end of function
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.