R/add_correlated_data.R

Defines functions addCorGen addCorFlex addCorData

Documented in addCorData addCorFlex addCorGen

#' Add correlated data to existing data.table
#'
#' @param dtOld Data table that is the new columns will be appended to.
#' @param idname Character name of id field, defaults to "id".
#' @param mu A vector of means. The length of mu must be nvars.
#' @param sigma Standard deviation of variables. If standard deviation differs
#' for each variable, enter as a vector with the same length as the mean vector
#' mu. If the standard deviation is constant across variables, as single value
#' can be entered.
#' @param corMatrix Correlation matrix can be entered directly. It must be
#' symmetrical and positive semi-definite. It is not a required field; if a
#' matrix is not provided, then a structure and correlation coefficient rho must
#' be specified.
#' @param rho Correlation coefficient, -1 <= rho <= 1. Use if corMatrix is not
#' provided.
#' @param corstr Correlation structure of the variance-covariance matrix
#' defined by sigma and rho. Options include "ind" for an independence
#' structure, "cs" for a compound symmetry structure, and "ar1" for an
#' autoregressive structure.
#' @param cnames Explicit column names. A single string with names separated
#' by commas. If no string is provided, the default names will be V#, where #
#' represents the column.
#' @return The original data table with the additional correlated columns
#' @examples
#' def <- defData(varname = "xUni", dist = "uniform", formula = "10;20", id = "myID")
#' def <- defData(def,
#'   varname = "xNorm", formula = "xUni * 2", dist = "normal",
#'   variance = 8
#' )
#'
#' dt <- genData(250, def)
#'
#' mu <- c(3, 8, 15)
#' sigma <- c(1, 2, 3)
#'
#' dtAdd <- addCorData(dt, "myID",
#'   mu = mu, sigma = sigma,
#'   rho = .7, corstr = "cs"
#' )
#' dtAdd
#'
#' round(var(dtAdd[, .(V1, V2, V3)]), 3)
#' round(cor(dtAdd[, .(V1, V2, V3)]), 2)
#'
#' dtAdd <- addCorData(dt, "myID",
#'   mu = mu, sigma = sigma,
#'   rho = .7, corstr = "ar1"
#' )
#' round(cor(dtAdd[, .(V1, V2, V3)]), 2)
#'
#' corMat <- matrix(c(1, .2, .8, .2, 1, .6, .8, .6, 1), nrow = 3)
#'
#' dtAdd <- addCorData(dt, "myID",
#'   mu = mu, sigma = sigma,
#'   corMatrix = corMat
#' )
#' round(cor(dtAdd[, .(V1, V2, V3)]), 2)
#' @concept correlated
#' @export
addCorData <- function(dtOld, idname, mu, sigma, corMatrix = NULL,
                       rho, corstr = "ind", cnames = NULL) {
  # dtName must contain id for now
  dtTemp <- copy(dtOld)
  data.table::setkeyv(dtTemp, idname)

  n <- nrow(dtTemp)

  dtNew <- simstudy::genCorData(
    n = n, mu = mu, sigma = sigma,
    corMatrix = corMatrix, rho = rho,
    corstr = corstr, cnames = cnames,
    idname = idname
  )

  data.table::setkeyv(dtNew, idname)

  dtTemp <- mergeData(dtTemp, dtNew, idname)

  return(dtTemp[])
}

#' Create multivariate (correlated) data - for general distributions
#'
#' @param dt Data table that will be updated.
#' @param defs Field definition table created by function `defDataAdd`.
#' @param rho Correlation coefficient, -1 <= rho <= 1. Use if corMatrix is not
#' provided.
#' @param tau Correlation based on Kendall's tau. If tau is specified, then it
#' is used as the correlation even if rho is specified. If tau is NULL, then the
#' specified value of rho is used, or rho defaults to 0.
#' @param corstr Correlation structure of the variance-covariance matrix defined
#' by sigma and rho. Options include "cs" for a compound symmetry structure
#' and "ar1" for an autoregressive structure. Defaults to "cs".
#' @param corMatrix Correlation matrix can be entered directly. It must be
#' symmetrical and positive semi-definite. It is not a required field; if a
#' matrix is not provided, then a structure and correlation coefficient rho must
#' be specified.
#' @param envir Environment the data definitions are evaluated in.
#'  Defaults to [base::parent.frame].
#' @return data.table with added column(s) of correlated data
#' @examples
#' defC <- defData(
#'   varname = "nInds", formula = 50, dist = "noZeroPoisson",
#'   id = "idClust"
#' )
#'
#' dc <- genData(10, defC)
#' #### Normal only
#'
#' dc <- addCorData(dc,
#'   mu = c(0, 0, 0, 0), sigma = c(2, 2, 2, 2), rho = .2,
#'   corstr = "cs", cnames = c("a", "b", "c", "d"),
#'   idname = "idClust"
#' )
#'
#' di <- genCluster(dc, "idClust", "nInds", "id")
#'
#' defI <- defDataAdd(
#'   varname = "A", formula = "-1 + a", variance = 3,
#'   dist = "normal"
#' )
#' defI <- defDataAdd(defI,
#'   varname = "B", formula = "4.5 + b", variance = .5,
#'   dist = "normal"
#' )
#' defI <- defDataAdd(defI,
#'   varname = "C", formula = "5*c", variance = 3,
#'   dist = "normal"
#' )
#' defI <- defDataAdd(defI,
#'   varname = "D", formula = "1.6 + d", variance = 1,
#'   dist = "normal"
#' )
#'
#' #### Generate new data
#'
#' di <- addCorFlex(di, defI, rho = 0.4, corstr = "cs")
#'
#' # Check correlations by cluster
#'
#' for (i in 1:nrow(dc)) {
#'   print(cor(di[idClust == i, list(A, B, C, D)]))
#' }
#'
#' # Check global correlations - should not be as correlated
#' cor(di[, list(A, B, C, D)])
#' @concept correlated
#' @md
#' @export
addCorFlex <- function(dt, defs, rho = 0, tau = NULL, corstr = "cs",
                       corMatrix = NULL, envir = parent.frame()) {

  # "Declare" vars to avoid R CMD warning

  X <- NULL
  Unew <- NULL
  param1 <- NULL
  param2 <- NULL
  id <- NULL
  period <- NULL
  dist <- NULL
  formula <- NULL
  link <- NULL
  variance <- NULL

  #### Check args

  ## Other checks? ##

  # check that names are not already used

  ###

  if (!all(defs[, dist] %in% c("normal", "gamma", "binary", "poisson", "negBinomial"))) {
    stop("Only implemented for the following distributions: binary, normal, poisson, gamma, and negative binomial")
  }

  ####

  dtCopy <- copy(dt)
  n <- nrow(dtCopy)

  corDefs <- copy(defs)
  nvars <- nrow(corDefs)

  ### Start generating data (first, using copula)

  ### Convert tau to rho

  if (!is.null(tau)) {
    rho <- sin(tau * pi / 2)
  }

  ###

  dx <- .genQuantU(nvars, n, rho, corstr, corMatrix)

  dFinal <- dx[period == 0, list(id)]

  for (i in 1:nvars) {
    dTemp <- dx[period == (i - 1)]
    dTemp <- dTemp[dtCopy]

    iDist <- corDefs[i, dist]
    iFormula <- corDefs[i, formula]
    iLink <- corDefs[i, link]

    if (iDist == "binary") {
      params <- .getBinaryMean(dTemp,
        formula = iFormula,
        size = 1,
        link = iLink,
        envir = envir
      )

      V <- dTemp[, stats::qbinom(Unew, 1, params[[1]])]
    } else if (iDist == "poisson") {
      param1 <- .getPoissonMean(
        dtSim = dTemp,
        formula = iFormula,
        link = iLink,
        envir = envir
      )

      V <- dTemp[, stats::qpois(Unew, param1)]
    } else if (iDist == "gamma") {
      mn <- .getGammaMean(
        dtSim = dTemp,
        formula = iFormula,
        link = iLink,
        envir = envir
      )

      ### Gamma parameters need to be transformed

      sr <- gammaGetShapeRate(mn, corDefs[i, variance])
      param1 <- sr[[1]]
      param2 <- sr[[2]]

      V <- dTemp[, stats::qgamma(Unew, param1, param2)]
    } else if (iDist == "negBinomial") {
      mn <- .getNBmean(dTemp, formula = iFormula, link = iLink, envir = envir)

      ### NB parameters need to be transformed

      sp <- negbinomGetSizeProb(mn, corDefs[i, variance])
      param1 <- sp[[1]]
      param2 <- sp[[2]]

      V <- dTemp[, stats::qnbinom(Unew, param1, param2)]
    } else if (iDist == "normal") {
      param1 <- .getNormalMean(dtSim = dTemp, formula = iFormula, envir = envir)
      param2 <- sqrt(corDefs[i, variance])

      V <- dTemp[, stats::qnorm(Unew, param1, param2)]
    }

    dFinal <- cbind(dFinal, V)
    setnames(dFinal, "V", corDefs$varname[i])
  }

  dFinal <- dtCopy[dFinal]

  return(dFinal[])
}

#' Create multivariate (correlated) data - for general distributions
#'
#' @param dtOld The data set that will be augmented. If the data set includes a
#' single record per id, the new data table will be created as a "wide" data set.
#' If the original data set includes multiple records per id, the new data set will 
#' be in "long" format.
#' @param nvars The number of new variables to create for each id. This is only applicable
#' when the data are generated from a data set that includes one record per id.
#' @param idvar String variable name of column represents individual level id for correlated
#' data.
#' @param dist A string indicating "normal", "binary", "poisson" or "gamma".
#' @param rho Correlation coefficient, -1 <= rho <= 1. Use if corMatrix is not provided.
#' @param corstr Correlation structure of the variance-covariance matrix
#' defined by sigma and rho. Options include "cs" for a compound symmetry structure
#' and "ar1" for an autoregressive structure.
#' @param corMatrix Correlation matrix can be entered directly. It must be symmetrical and
#' positive semi-definite. It is not a required field; if a matrix is not provided, then a
#' structure and correlation coefficient rho must be specified.
#' @param param1  A string that represents the column in dtOld that contains the parameter
#' for the mean of the distribution. In the case of the uniform distribution the column
#' specifies the minimum.
#' @param param2 A string that represents the column in dtOld that contains a possible second
#' parameter for the distribution. For the normal distribution, this will be the variance;
#' for the gamma distribution, this will be the dispersion; and for the uniform distribution,
#' this will be the maximum.
#' @param cnames Explicit column names. A single string with names separated
#' by commas. If no string is provided, the default names will be V#, where #
#' represents the column.
#' @param method Two methods are available to generate correlated data. (1) "copula" uses
#' the multivariate Gaussian copula method that is applied to all other distributions; this
#' applies to all available distributions. (2) "ep" uses an algorithm developed by
#' Emrich and Piedmonte (1991).
#' @param ... May include additional arguments that have been deprecated and are
#' no longer used.
#' @return Original data.table with added column(s) of correlated data
#' @references Emrich LJ, Piedmonte MR. A Method for Generating High-Dimensional
#' Multivariate Binary Variates. The American Statistician 1991;45:302-4.
#' @examples
#' # Wide example
#'
#' def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid")
#' def <- defData(def, varname = "lambda", formula = ".5 + .1*xbase", dist = "nonrandom", link = "log")
#'
#' dt <- genData(100, def)
#'
#' addCorGen(
#'   dtOld = dt, idvar = "cid", nvars = 3, rho = .7, corstr = "cs",
#'   dist = "poisson", param1 = "lambda"
#' )
#'
#' # Long example
#'
#' def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid")
#'
#' def2 <- defDataAdd(
#'   varname = "p", formula = "-3+.2*period + .3*xbase",
#'   dist = "nonrandom", link = "logit"
#' )
#'
#' dt <- genData(100, def)
#'
#' dtLong <- addPeriods(dt, idvars = "cid", nPeriods = 3)
#' dtLong <- addColumns(def2, dtLong)
#' 
#' addCorGen(
#'   dtOld = dtLong, idvar = "cid", nvars = NULL, rho = .7, corstr = "cs",
#'   dist = "binary", param1 = "p"
#' )
#'
#' @concept correlated
#' @export
addCorGen <- function(dtOld, nvars=NULL, idvar = "id", rho=NULL, corstr=NULL, corMatrix = NULL,
                      dist, param1, param2 = NULL, cnames = NULL,
                      method = "copula", ...) {

  ### can deprecate formSpec - no longer relevant
  ### can deprecate periodvar - no longer relevant
  
  # "Declare" vars to avoid R CMD warning

  .id <- NULL
  N <- NULL
  .U <- NULL
  Unew <- NULL
  .XX <- NULL
  X <- NULL
  .param1 <- NULL
  .param2 <- NULL
  seq_ <- NULL
  period <- NULL
  p <- NULL
  corM <- NULL
  seqid <- NULL
  
  ####
  
  .genByID <- function(p, rho, corstr, corMatrix) {

    corM <- .buildCorMat(nvars = length(p), corMatrix, corstr, rho)
    dtM <- .genBinEP(1, p, corM)
    dtM <- dtM[, list(X, seq_ = seqid)]
    dtM[]
    
  }
  
  #### Check args

  assertNotMissing(dtOld = missing(dtOld), 
                   dist = missing(dist), param1 = missing(param1))
  
  assertClass(dtOld = dtOld, class = "data.table")
  
  assertOption(dist = dist, 
    options = c("poisson", "binary", "gamma", "uniform", "negBinomial", "normal"))
  
  if (!is.null(param2)) {
    assertInDataTable(vars = c(idvar, param1, param2), dt = dtOld)
  } else assertInDataTable(vars = c(idvar, param1), dt = dtOld)
  
  assertOption(method = method, options = c("copula", "ep"))

  nparams <- as.numeric(!is.null(param1)) + as.numeric(!is.null(param2))

  if (((nparams > 1) & (dist %in% c("poisson", "binary")))) {
    stop(paste0("Too many parameters (", nparams, ") for ", dist))
  }

  if (((nparams < 2) & (dist %in% c("gamma", "uniform", "normal", "negBinomial")))) {
    stop(paste0("Too few parameters (", nparams, ") for ", dist))
  }

  if (dist != "binary" & method == "ep") {
    stop("Method `ep` applies only to binary data generation")
  }
  

  # wide(ness) is determined by incoming data structure.

  maxN <- dtOld[, .N, by = idvar][, max(N)]
  if (maxN == 1) {
    wide <- TRUE
    assertNotMissing(nvars = missing(nvars))
    assertAtLeast(nvars = nvars, minVal = 2)
  } else if (maxN > 1) {
    wide <- FALSE
  }
  
  ####
  
  if (!is.null(cnames)) {
    nnames <- trimws(unlist(strsplit(cnames, split = ",")))
    lnames <- length(nnames)
    if (!wide) {
      if (lnames > 1) stop(paste("Long format can have only 1 name.", lnames, "have been provided."))
    } else if (wide) {
      if (lnames != nvars) stop(paste0("Number of names (", lnames, ") not equal to specified nvars (", nvars, ")."))
    }
  }

  ####
  
  dtOrig <- copy(dtOld)
  setnames(dtOrig, idvar, ".id")
  dtTemp <- copy(dtOrig)
  
  # check corMatrix
  
  if (!is.null(corMatrix)) {
    if (is.list(corMatrix)) {
      
      # check if corMatrix is a numeric matrix
      
      test <- 
        sapply(corMatrix, function(corMatrix) assertNumericMatrix(corMatrix = corMatrix))
      
      # check if there are the same number of correlation matrices as there are clusters
      
      dn <- dtTemp[, .N, keyby = .id]
      assertLength(corMatrix = corMatrix, length = nrow(dn))
      
      # check if the dimensions of corr matrices match cluster sizes
      
      dn$dim <- sapply(corMatrix, function(x) nrow(x))
      compare_cluster_size <- dn[, sum(N != dim)]
      if (compare_cluster_size != 0) {
        stop("Dimensions of correlation matrices in corMatrix not equal to cluster sizes!")
      }
    
    } else { # not a list
      assertNumericMatrix(corMatrix = corMatrix)
      
      # check if the dimensions of corr matrix matches (equal) cluster size
      
      dn <- dtTemp[, .N, keyby = .id]
      dn[, dim := nrow(corMatrix)]
      compare_cluster_size <- dn[, sum(N != dim)]
      if (compare_cluster_size != 0) {
        stop("Dimensions of corMatrix not equal to cluster sizes!")
      }
    }
  }
  
  
  if (wide) { # Convert to long form temporarily
    dtTemp <- addPeriods(dtTemp, nPeriods = nvars, idvars = ".id")
  }
  
  dtTemp[, seq_ := 1:.N, keyby = .id]
  nvars <- dtTemp[.id == 1, .N] # only permits case where number of records per id is the same

  ####

  if (method == "copula") {
    
    if (is.list(corMatrix)) {
      ns <- split(dtTemp[, .N, keyby = .id], by = ".id")
      dtM <- rbindlist(
        lapply(ns, function(x) .genQuantU(x$N, 1, rho, corstr, corMatrix[[x$.id]])) 
      )
      dtTemp[, .U := dtM$Unew]
    } else {
      if (is.null(corMatrix)) {
        corMatrix <- .buildCorMat(nvars, corMatrix = NULL, rho = rho, corstr = corstr)
      }
      ns <- nrow(dtTemp[, .N, keyby = .id])
      Unew <- c(t(mvnfast::rmvn(n = ns, mu = rep(0, nvars), sigma = corMatrix)))
    
      dtTemp[, .U := stats::pnorm(Unew)]
    }
    
    # dtTemp[, seq := dtM$seq]
    
    if (dist == "poisson") {
      setnames(dtTemp, param1, ".param1")
      dtTemp[, .XX := stats::qpois(p = .U, lambda = .param1)]
    } else if (dist == "binary") {
      setnames(dtTemp, param1, ".param1")
      dtTemp[, .XX := stats::qbinom(p = .U, size = 1, prob = .param1)]
    } else if (dist == "negBinomial") {
      setnames(dtTemp, param1, ".param1")
      setnames(dtTemp, param2, ".param2")
      sp <- negbinomGetSizeProb(dtTemp$.param1, dtTemp$.param2)
      dtTemp[, .param1 := sp[[1]]]
      dtTemp[, .param2 := sp[[2]]]
      dtTemp[, .XX := stats::qnbinom(p = .U, size = .param1, prob = .param2)]
    } else if (dist == "uniform") {
      setnames(dtTemp, param1, ".param1")
      setnames(dtTemp, param2, ".param2")
      dtTemp[, .XX := stats::qunif(p = .U, min = .param1, max = .param2)]
    } else if (dist == "gamma") {
      setnames(dtTemp, param1, ".param1")
      setnames(dtTemp, param2, ".param2")
      sr <- gammaGetShapeRate(dtTemp$.param1, dtTemp$.param2)
      dtTemp[, .param1 := sr[[1]]]
      dtTemp[, .param2 := sr[[2]]]
      dtTemp[, .XX := stats::qgamma(p = .U, shape = .param1, rate = .param2)]
    } else if (dist == "normal") {
      setnames(dtTemp, param1, ".param1")
      setnames(dtTemp, param2, ".param2")
      dtTemp[, .XX := stats::qnorm(p = .U, mean = .param1, sd = sqrt(.param2))]
    }
    
    dX <- dtTemp[, list(.id, seq_, .XX)]

  } else if (method == "ep") {
    
    if (is.list(corMatrix)) {
      dX <- dtTemp[, .genByID(p = get(param1), rho, corstr, corMatrix[[.id]]), keyby = .id]
    } else {
      dX <- dtTemp[, .genByID(p = get(param1), rho, corstr, corMatrix), keyby = .id]
    }
    
    setnames(dX, "X", ".XX")
    
  } # end (if ep)

  if (wide) {
    
    setkey(dX, .id, seq_)

    dWide <- dcast(dX, .id ~ seq_, value.var = ".XX")
    setnames(dWide, c(".id", paste0("V", 1:nvars)))
    
    setkey(dWide, ".id")
    setkey(dtOrig, ".id")
    
    dtTemp <- dtOrig[dWide]

    if (!is.null(cnames)) {
      setnames(dtTemp, paste0("V", 1:nvars), nnames)
    }

  } else if (!wide) {
    
    dtTemp <- copy(dtOld)
    setnames(dtTemp, c(idvar), c(".id"))
    dtTemp[, seq_ := 1:.N, keyby = .id]
    
    setkey(dX, .id, seq_)
    setkey(dtTemp, .id, seq_)
    
    dtTemp <- dtTemp[dX]
    
    if (!is.null(cnames)) {
      setnames(dtTemp, ".XX", cnames)
    } else {
      setnames(dtTemp, ".XX", "X")
    }
    
    dtTemp[, seq_ := NULL]
    
  } # end if !wide
  
  setnames(dtTemp, ".id", idvar)
  return(dtTemp[])
}

Try the simstudy package in your browser

Any scripts or data that you put into this service are public.

simstudy documentation built on Nov. 23, 2023, 1:06 a.m.