Nothing
#' Add correlated data to existing data.table
#'
#' @param dtOld Data table that is the new columns will be appended to.
#' @param idname Character name of id field, defaults to "id".
#' @param mu A vector of means. The length of mu must be nvars.
#' @param sigma Standard deviation of variables. If standard deviation differs
#' for each variable, enter as a vector with the same length as the mean vector
#' mu. If the standard deviation is constant across variables, as single value
#' can be entered.
#' @param corMatrix Correlation matrix can be entered directly. It must be
#' symmetrical and positive semi-definite. It is not a required field; if a
#' matrix is not provided, then a structure and correlation coefficient rho must
#' be specified.
#' @param rho Correlation coefficient, -1 <= rho <= 1. Use if corMatrix is not
#' provided.
#' @param corstr Correlation structure of the variance-covariance matrix
#' defined by sigma and rho. Options include "ind" for an independence
#' structure, "cs" for a compound symmetry structure, and "ar1" for an
#' autoregressive structure.
#' @param cnames Explicit column names. A single string with names separated
#' by commas. If no string is provided, the default names will be V#, where #
#' represents the column.
#' @return The original data table with the additional correlated columns
#' @examples
#' def <- defData(varname = "xUni", dist = "uniform", formula = "10;20", id = "myID")
#' def <- defData(def,
#' varname = "xNorm", formula = "xUni * 2", dist = "normal",
#' variance = 8
#' )
#'
#' dt <- genData(250, def)
#'
#' mu <- c(3, 8, 15)
#' sigma <- c(1, 2, 3)
#'
#' dtAdd <- addCorData(dt, "myID",
#' mu = mu, sigma = sigma,
#' rho = .7, corstr = "cs"
#' )
#' dtAdd
#'
#' round(var(dtAdd[, .(V1, V2, V3)]), 3)
#' round(cor(dtAdd[, .(V1, V2, V3)]), 2)
#'
#' dtAdd <- addCorData(dt, "myID",
#' mu = mu, sigma = sigma,
#' rho = .7, corstr = "ar1"
#' )
#' round(cor(dtAdd[, .(V1, V2, V3)]), 2)
#'
#' corMat <- matrix(c(1, .2, .8, .2, 1, .6, .8, .6, 1), nrow = 3)
#'
#' dtAdd <- addCorData(dt, "myID",
#' mu = mu, sigma = sigma,
#' corMatrix = corMat
#' )
#' round(cor(dtAdd[, .(V1, V2, V3)]), 2)
#' @concept correlated
#' @export
addCorData <- function(dtOld, idname, mu, sigma, corMatrix = NULL,
rho, corstr = "ind", cnames = NULL) {
# dtName must contain id for now
dtTemp <- copy(dtOld)
data.table::setkeyv(dtTemp, idname)
n <- nrow(dtTemp)
dtNew <- simstudy::genCorData(
n = n, mu = mu, sigma = sigma,
corMatrix = corMatrix, rho = rho,
corstr = corstr, cnames = cnames,
idname = idname
)
data.table::setkeyv(dtNew, idname)
dtTemp <- mergeData(dtTemp, dtNew, idname)
return(dtTemp[])
}
#' Create multivariate (correlated) data - for general distributions
#'
#' @param dt Data table that will be updated.
#' @param defs Field definition table created by function `defDataAdd`.
#' @param rho Correlation coefficient, -1 <= rho <= 1. Use if corMatrix is not
#' provided.
#' @param tau Correlation based on Kendall's tau. If tau is specified, then it
#' is used as the correlation even if rho is specified. If tau is NULL, then the
#' specified value of rho is used, or rho defaults to 0.
#' @param corstr Correlation structure of the variance-covariance matrix defined
#' by sigma and rho. Options include "cs" for a compound symmetry structure
#' and "ar1" for an autoregressive structure. Defaults to "cs".
#' @param corMatrix Correlation matrix can be entered directly. It must be
#' symmetrical and positive semi-definite. It is not a required field; if a
#' matrix is not provided, then a structure and correlation coefficient rho must
#' be specified.
#' @param envir Environment the data definitions are evaluated in.
#' Defaults to [base::parent.frame].
#' @return data.table with added column(s) of correlated data
#' @examples
#' defC <- defData(
#' varname = "nInds", formula = 50, dist = "noZeroPoisson",
#' id = "idClust"
#' )
#'
#' dc <- genData(10, defC)
#' #### Normal only
#'
#' dc <- addCorData(dc,
#' mu = c(0, 0, 0, 0), sigma = c(2, 2, 2, 2), rho = .2,
#' corstr = "cs", cnames = c("a", "b", "c", "d"),
#' idname = "idClust"
#' )
#'
#' di <- genCluster(dc, "idClust", "nInds", "id")
#'
#' defI <- defDataAdd(
#' varname = "A", formula = "-1 + a", variance = 3,
#' dist = "normal"
#' )
#' defI <- defDataAdd(defI,
#' varname = "B", formula = "4.5 + b", variance = .5,
#' dist = "normal"
#' )
#' defI <- defDataAdd(defI,
#' varname = "C", formula = "5*c", variance = 3,
#' dist = "normal"
#' )
#' defI <- defDataAdd(defI,
#' varname = "D", formula = "1.6 + d", variance = 1,
#' dist = "normal"
#' )
#'
#' #### Generate new data
#'
#' di <- addCorFlex(di, defI, rho = 0.4, corstr = "cs")
#'
#' # Check correlations by cluster
#'
#' for (i in 1:nrow(dc)) {
#' print(cor(di[idClust == i, list(A, B, C, D)]))
#' }
#'
#' # Check global correlations - should not be as correlated
#' cor(di[, list(A, B, C, D)])
#' @concept correlated
#' @md
#' @export
addCorFlex <- function(dt, defs, rho = 0, tau = NULL, corstr = "cs",
corMatrix = NULL, envir = parent.frame()) {
# "Declare" vars to avoid R CMD warning
X <- NULL
Unew <- NULL
param1 <- NULL
param2 <- NULL
id <- NULL
period <- NULL
dist <- NULL
formula <- NULL
link <- NULL
variance <- NULL
#### Check args
## Other checks? ##
# check that names are not already used
###
if (!all(defs[, dist] %in% c("normal", "gamma", "binary", "poisson", "negBinomial"))) {
stop("Only implemented for the following distributions: binary, normal, poisson, gamma, and negative binomial")
}
####
dtCopy <- copy(dt)
n <- nrow(dtCopy)
corDefs <- copy(defs)
nvars <- nrow(corDefs)
### Start generating data (first, using copula)
### Convert tau to rho
if (!is.null(tau)) {
rho <- sin(tau * pi / 2)
}
###
dx <- .genQuantU(nvars, n, rho, corstr, corMatrix)
dFinal <- dx[period == 0, list(id)]
for (i in 1:nvars) {
dTemp <- dx[period == (i - 1)]
dTemp <- dTemp[dtCopy]
iDist <- corDefs[i, dist]
iFormula <- corDefs[i, formula]
iLink <- corDefs[i, link]
if (iDist == "binary") {
params <- .getBinaryMean(dTemp,
formula = iFormula,
size = 1,
link = iLink,
envir = envir
)
V <- dTemp[, stats::qbinom(Unew, 1, params[[1]])]
} else if (iDist == "poisson") {
param1 <- .getPoissonMean(
dtSim = dTemp,
formula = iFormula,
link = iLink,
envir = envir
)
V <- dTemp[, stats::qpois(Unew, param1)]
} else if (iDist == "gamma") {
mn <- .getGammaMean(
dtSim = dTemp,
formula = iFormula,
link = iLink,
envir = envir
)
### Gamma parameters need to be transformed
sr <- gammaGetShapeRate(mn, corDefs[i, variance])
param1 <- sr[[1]]
param2 <- sr[[2]]
V <- dTemp[, stats::qgamma(Unew, param1, param2)]
} else if (iDist == "negBinomial") {
mn <- .getNBmean(dTemp, formula = iFormula, link = iLink, envir = envir)
### NB parameters need to be transformed
sp <- negbinomGetSizeProb(mn, corDefs[i, variance])
param1 <- sp[[1]]
param2 <- sp[[2]]
V <- dTemp[, stats::qnbinom(Unew, param1, param2)]
} else if (iDist == "normal") {
param1 <- .getNormalMean(dtSim = dTemp, formula = iFormula, envir = envir)
param2 <- sqrt(corDefs[i, variance])
V <- dTemp[, stats::qnorm(Unew, param1, param2)]
}
dFinal <- cbind(dFinal, V)
setnames(dFinal, "V", corDefs$varname[i])
}
dFinal <- dtCopy[dFinal]
return(dFinal[])
}
#' Create multivariate (correlated) data - for general distributions
#'
#' @param dtOld The data set that will be augmented. If the data set includes a
#' single record per id, the new data table will be created as a "wide" data set.
#' If the original data set includes multiple records per id, the new data set will
#' be in "long" format.
#' @param nvars The number of new variables to create for each id. This is only applicable
#' when the data are generated from a data set that includes one record per id.
#' @param idvar String variable name of column represents individual level id for correlated
#' data.
#' @param dist A string indicating "normal", "binary", "poisson" or "gamma".
#' @param rho Correlation coefficient, -1 <= rho <= 1. Use if corMatrix is not provided.
#' @param corstr Correlation structure of the variance-covariance matrix
#' defined by sigma and rho. Options include "cs" for a compound symmetry structure
#' and "ar1" for an autoregressive structure.
#' @param corMatrix Correlation matrix can be entered directly. It must be symmetrical and
#' positive semi-definite. It is not a required field; if a matrix is not provided, then a
#' structure and correlation coefficient rho must be specified.
#' @param param1 A string that represents the column in dtOld that contains the parameter
#' for the mean of the distribution. In the case of the uniform distribution the column
#' specifies the minimum.
#' @param param2 A string that represents the column in dtOld that contains a possible second
#' parameter for the distribution. For the normal distribution, this will be the variance;
#' for the gamma distribution, this will be the dispersion; and for the uniform distribution,
#' this will be the maximum.
#' @param cnames Explicit column names. A single string with names separated
#' by commas. If no string is provided, the default names will be V#, where #
#' represents the column.
#' @param method Two methods are available to generate correlated data. (1) "copula" uses
#' the multivariate Gaussian copula method that is applied to all other distributions; this
#' applies to all available distributions. (2) "ep" uses an algorithm developed by
#' Emrich and Piedmonte (1991).
#' @param ... May include additional arguments that have been deprecated and are
#' no longer used.
#' @details The original data table can come in one of two formats: a single row
#' per **idvar** (where data are *ungrouped*) or multiple rows per **idvar** (in which
#' case the data are *grouped* or clustered). The structure of the arguments
#' depends on the format of the data.
#'
#' In the case of *ungrouped* data, there are two ways to specify the number of
#' correlated variables and the covariance matrix. In approach (1),
#' **nvars** needs to be specified along with **rho** and **corstr**.
#' In approach (2), **corMatrix** may be specified by identifying a single square
#' *n* x *n* covariance matrix. The number of new variables generated for each
#' record will be *n*. If **nvars**, **rho**,
#' **corstr**, and **corMatrix** are all specified, the data will be
#' generated based on the information provided in the covariance matrix alone.
#' In both (1) and (2), the data will be returned in a wide format.
#'
#' In the case of *grouped* data, where there are *G* groups, there are also two
#' ways to proceed. In both cases,
#' the number of new variables to be generated may vary by group, and will be determined by the
#' number of records in each group, \eqn{n_i, i \in \{1,...,G\}} (i.e., the number of records that share the same
#' value of *idvar*). **nvars** is not used in grouped data.
#' In approach (1), the arguments **rho** and **corstr** may both be specified
#' to determine the structure of the covariance
#' matrix. In approach (2), the argument **corMatrix** may be specified.
#' **corMatrix** can be a single matrix with dimensions \eqn{n \ \text{x} \ n} if
#' \eqn{n_i = n} for all *i*. However, if the sample sizes of each group vary
#' (i.e., \eqn{n_i \ne n_j} for some groups *i* and *j*), **corMatrix** must be a list
#' of covariance matrices with a length *G*; each
#' covariance matrix in the list will have dimensions
#' \eqn{n_i \ \text{x} \ n_i, \ i \in \{1,...,G\}}. In the case of *grouped* data, the
#' new data will be returned in *long* format (i.e., one new column only).
#'
#' @return Original data.table with added column(s) of correlated data
#' @references Emrich LJ, Piedmonte MR. A Method for Generating High-Dimensional
#' Multivariate Binary Variates. The American Statistician 1991;45:302-4.
#' @examples
#' # Ungrouped data
#'
#' cMat <- genCorMat(nvars = 4, rho = .2, corstr = "ar1", nclusters = 1)
#'
#' def <-
#' defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma") |>
#' defData(varname = "lambda", formula = ".5 + .1*xbase", dist = "nonrandom", link = "log") |>
#' defData(varname = "n", formula = 3, dist = "noZeroPoisson")
#'
#' dd <- genData(101, def, id = "cid")
#'
#' ## Specify with nvars, rho, and corstr
#'
#' addCorGen(
#' dtOld = dd, idvar = "cid", nvars = 3, rho = .7, corstr = "cs",
#' dist = "poisson", param1 = "lambda"
#' )
#'
#' ## Specify with covMatrix
#'
#' addCorGen(
#' dtOld = dd, idvar = "cid", corMatrix = cMat,
#' dist = "poisson", param1 = "lambda"
#' )
#'
#' # Grouped data
#'
#' cMats <- genCorMat(nvars = dd$n, rho = .5, corstr = "cs", nclusters = nrow(dd))
#'
#' dx <- genCluster(dd, "cid", "n", "id")
#'
#' ## Specify with nvars, rho, and corstr
#'
#' addCorGen(
#' dtOld = dx, idvar = "cid", rho = .8, corstr = "ar1", dist = "poisson", param1 = "xbase"
#' )
#'
#' ## Specify with covMatrix
#'
#' addCorGen(
#' dtOld = dx, idvar = "cid", corMatrix = cMats, dist = "poisson", param1 = "xbase"
#' )
#'
#' @concept correlated
#' @export
#' @md
addCorGen <- function(dtOld, nvars=NULL, idvar = "id", rho=NULL, corstr=NULL, corMatrix = NULL,
dist, param1, param2 = NULL, cnames = NULL,
method = "copula", ...) {
### can deprecate formSpec - no longer relevant
### can deprecate periodvar - no longer relevant
# "Declare" vars to avoid R CMD warning
.id <- NULL
N <- NULL
.U <- NULL
Unew <- NULL
.XX <- NULL
X <- NULL
.param1 <- NULL
.param2 <- NULL
seq_ <- NULL
period <- NULL
p <- NULL
corM <- NULL
seqid <- NULL
####
.genByID <- function(p, rho, corstr, corMatrix) {
corM <- .buildCorMat(nvars = length(p), corMatrix, corstr, rho)
dtM <- .genBinEP(1, p, corM)
dtM <- dtM[, list(X, seq_ = seqid)]
dtM[]
}
#### Check args
assertNotMissing(dtOld = missing(dtOld),
dist = missing(dist), param1 = missing(param1))
assertClass(dtOld = dtOld, class = "data.table")
assertOption(dist = dist,
options = c("poisson", "binary", "gamma", "uniform", "negBinomial", "normal"))
if (!is.null(param2)) {
assertInDataTable(vars = c(idvar, param1, param2), dt = dtOld)
} else assertInDataTable(vars = c(idvar, param1), dt = dtOld)
assertOption(method = method, options = c("copula", "ep"))
nparams <- as.numeric(!is.null(param1)) + as.numeric(!is.null(param2))
if (((nparams > 1) & (dist %in% c("poisson", "binary")))) {
stop(paste0("Too many parameters (", nparams, ") for ", dist))
}
if (((nparams < 2) & (dist %in% c("gamma", "uniform", "normal", "negBinomial")))) {
stop(paste0("Too few parameters (", nparams, ") for ", dist))
}
if (dist != "binary" & method == "ep") {
stop("Method `ep` applies only to binary data generation")
}
# wide(ness) is determined by incoming data structure.
maxN <- dtOld[, .N, by = idvar][, max(N)]
if (maxN == 1) {
wide <- TRUE
if ((is.null(nvars) | is.null(rho) | is.null(corstr)) & (is.null(corMatrix))) {
stop("Either nvars, rho, and corstr all must be provided or corMatrix must be provided.")
}
if (is.null(corMatrix)) { # that means that we are using nvars/rho/corstr
assertAtLeast(nvars = nvars, minVal = 2)
}
} else if (maxN > 1) {
wide <- FALSE
if ((is.null(rho) | is.null(corstr)) & (is.null(corMatrix))) {
stop("Either both rho and corstr must be provided or corMatrix must be provided.")
}
}
####
if (!is.null(cnames)) {
nnames <- trimws(unlist(strsplit(cnames, split = ",")))
lnames <- length(nnames)
if (!wide) {
if (lnames > 1) stop(paste("Long format can have only 1 name.", lnames, "have been provided."))
} else if (wide) {
if (lnames != nvars) stop(paste0("Number of names (", lnames, ") not equal to specified nvars (", nvars, ")."))
}
}
####
dtOrig <- copy(dtOld)
setnames(dtOrig, idvar, ".id")
dtTemp <- copy(dtOrig)
# check corMatrix
if (!is.null(corMatrix)) {
if (is.list(corMatrix)) {
# check if corMatrix is a numeric matrix
test <-
sapply(corMatrix, function(corMatrix) assertNumericMatrix(corMatrix = corMatrix))
# check if there are the same number of correlation matrices as there are clusters
dn <- dtTemp[, .N, keyby = .id]
assertLength(corMatrix = corMatrix, length = nrow(dn))
# check if the dimensions of corr matrices match cluster sizes
dn$dim <- sapply(corMatrix, function(x) nrow(x))
compare_cluster_size <- dn[, sum(N != dim)]
if (compare_cluster_size != 0) {
stop("Dimensions of correlation matrices in corMatrix not equal to cluster sizes!")
}
} else { # not a list
assertNumericMatrix(corMatrix = corMatrix)
# check if the dimensions of corr matrix matches (equal) cluster size
if (!wide) {
dn <- dtTemp[, .N, keyby = .id]
dn[, dim := nrow(corMatrix)]
compare_cluster_size <- dn[, sum(N != dim)]
if (compare_cluster_size != 0) {
stop("Dimensions of corMatrix not equal to cluster sizes!")
}
}
}
}
if (wide) { # Convert to long form temporarily
if ( is.null(nvars) ) nvars <- nrow(corMatrix)
dtTemp <- addPeriods(dtTemp, nPeriods = nvars, idvars = ".id")
}
dtTemp[, seq_ := 1:.N, keyby = .id]
# nvars <- dtTemp[.id == 1, .N] # only permits case where number of records per id is the same
counts <- dtTemp[, .N, by = .id][, N]
same_nvar <- all(counts == counts[1])
if (!wide) { # multiple record per id
if (is.null(corMatrix)) {
if (same_nvar) {
corMatrix <- genCorMat(nvars = counts[1] , rho = rho, corstr = corstr, nclusters = 1)
} else {
corMatrix <- genCorMat(nvars = counts , rho = rho, corstr = corstr, nclusters = length(counts))
}
}
} else { # single record per id
if (is.null(corMatrix)) {
corMatrix <- genCorMat(nvars = nvars , rho = rho, corstr = corstr, nclusters = 1)
}
}
if (method == "copula") {
if (is.list(corMatrix)) {
ns <- split(dtTemp[, .N, keyby = .id], by = ".id")
dtM <- rbindlist(
lapply(ns, function(x) .genQuantU(x$N, 1, rho, corstr, corMatrix[[x$.id]]))
)
dtTemp[, .U := dtM$Unew]
} else {
nvars <- nrow(corMatrix)
ns <- nrow(dtTemp[, .N, keyby = .id])
Unew <- c(t(mvnfast::rmvn(n = ns, mu = rep(0, nvars), sigma = corMatrix)))
dtTemp[, .U := stats::pnorm(Unew)]
}
if (dist == "poisson") {
setnames(dtTemp, param1, ".param1")
dtTemp[, .XX := stats::qpois(p = .U, lambda = .param1)]
} else if (dist == "binary") {
setnames(dtTemp, param1, ".param1")
dtTemp[, .XX := stats::qbinom(p = .U, size = 1, prob = .param1)]
} else if (dist == "negBinomial") {
setnames(dtTemp, param1, ".param1")
setnames(dtTemp, param2, ".param2")
sp <- negbinomGetSizeProb(dtTemp$.param1, dtTemp$.param2)
dtTemp[, .param1 := sp[[1]]]
dtTemp[, .param2 := sp[[2]]]
dtTemp[, .XX := stats::qnbinom(p = .U, size = .param1, prob = .param2)]
} else if (dist == "uniform") {
setnames(dtTemp, param1, ".param1")
setnames(dtTemp, param2, ".param2")
dtTemp[, .XX := stats::qunif(p = .U, min = .param1, max = .param2)]
} else if (dist == "gamma") {
setnames(dtTemp, param1, ".param1")
setnames(dtTemp, param2, ".param2")
sr <- gammaGetShapeRate(dtTemp$.param1, dtTemp$.param2)
dtTemp[, .param1 := sr[[1]]]
dtTemp[, .param2 := sr[[2]]]
dtTemp[, .XX := stats::qgamma(p = .U, shape = .param1, rate = .param2)]
} else if (dist == "normal") {
setnames(dtTemp, param1, ".param1")
setnames(dtTemp, param2, ".param2")
dtTemp[, .XX := stats::qnorm(p = .U, mean = .param1, sd = sqrt(.param2))]
}
dX <- dtTemp[, list(.id, seq_, .XX)]
} else if (method == "ep") {
if (is.list(corMatrix)) {
dX <- dtTemp[, .genByID(p = get(param1), rho, corstr, corMatrix[[.id]]), keyby = .id]
} else {
dX <- dtTemp[, .genByID(p = get(param1), rho, corstr, corMatrix), keyby = .id]
}
setnames(dX, "X", ".XX")
} # end (if ep)
if (wide) {
setkey(dX, .id, seq_)
dWide <- dcast(dX, .id ~ seq_, value.var = ".XX")
setnames(dWide, c(".id", paste0("V", 1:nvars)))
setkey(dWide, ".id")
setkey(dtOrig, ".id")
dtTemp <- dtOrig[dWide]
if (!is.null(cnames)) {
setnames(dtTemp, paste0("V", 1:nvars), nnames)
}
} else if (!wide) {
dtTemp <- copy(dtOld)
setnames(dtTemp, c(idvar), c(".id"))
dtTemp[, seq_ := 1:.N, keyby = .id]
setkey(dX, .id, seq_)
setkey(dtTemp, .id, seq_)
dtTemp <- dtTemp[dX]
if (!is.null(cnames)) {
setnames(dtTemp, ".XX", cnames)
} else {
setnames(dtTemp, ".XX", "X")
}
dtTemp[, seq_ := NULL]
} # end if !wide
setnames(dtTemp, ".id", idvar)
return(dtTemp[])
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.