R/generate_data.R
In simstudy: Simulation of Study Data

Documented in genData genDataDensity genDummy genFactor genFormula genMarkov genMultiFac genOrdCat genSpline genSurv genSynthetic

#' Calling function to simulate data
#'
#' @param n the number of observations required in the data set.
#' @param dtDefs name of definitions data.table/data.frame. If no definitions
#'  are provided
#' a data set with ids only is generated.
#' @param id The string defining the id of the record. Will override previously
#'  set id name with a warning (unless the old value is 'id'). If the
#' id attribute in dtDefs is NULL will default to 'id'.
#' @param envir Environment the data definitions are evaluated in.
#'  Defaults to [base::parent.frame].
#' @return A data.table that contains the simulated data.
#' @export
#' @md
#' @concept generate_data
#' @examples
#' genData(5)
#' genData(5, id = "grpID")
#'
#' def <- defData(
#'   varname = "xNr", dist = "nonrandom", formula = 7,
#'   id = "idnum"
#' )
#' def <- defData(def,
#'   varname = "xUni", dist = "uniform",
#'   formula = "10;20"
#' )
#' def <- defData(def,
#'   varname = "xNorm", formula = "xNr + xUni * 2",
#'   dist = "normal", variance = 8
#' )
#' def <- defData(def,
#'   varname = "xPois", dist = "poisson",
#'   formula = "xNr - 0.2 * xUni", link = "log"
#' )
#' def <- defData(def,
#'   varname = "xCat", formula = "0.3;0.2;0.5",
#'   dist = "categorical"
#' )
#' def <- defData(def,
#'   varname = "xGamma", dist = "gamma", formula = "5+xCat",
#'   variance = 1, link = "log"
#' )
#' def <- defData(def,
#'   varname = "xBin", dist = "binary", formula = "-3 + xCat",
#'   link = "logit"
#' )
#' def
#'
#' genData(5, def)
genData <- function(n, dtDefs = NULL, id = "id", envir = parent.frame()) {
  assertNotMissing(n = missing(n))
  assertValue(n = n, id = id)
  assertType(id = id, type = "character")
  assertNumeric(n = n)

  if (is.null(dtDefs)) {
    dt <- data.table::data.table(x = 1:n)
    data.table::setnames(dt, id)
    data.table::setkeyv(dt, id)
  } else { # existing definitions
    assertClass(dtDefs = dtDefs, class = "data.table")

    oldId <- attr(dtDefs, "id")
    if (!is.null(oldId) && id != oldId && !missing(id)) {
      if (oldId != "id") {
        valueWarning(
          var = oldId,
          names = id,
          msg = list(
            "Previously defined 'id'-column found: '{var}'. ",
            "The current specification '{names}' will override it."
          )
        )
      }
    } else {
      id <- oldId %||% id
    }

    dfSimulate <- data.table::data.table(x = 1:n)
    data.table::setnames(dfSimulate, id)
    data.table::setkeyv(dfSimulate, id)
    iter <- nrow(dtDefs) # generate a column of data for each row of dtDefs

    for (i in (1:iter)) {
      dfSimulate <- .generate(
        args = dtDefs[i, ],
        n = n,
        dfSim = dfSimulate,
        idname = id,
        envir = envir
      )
    }

    dt <- data.table::data.table(dfSimulate)
    data.table::setkeyv(dt, id)
  }

  return(dt[])
}



#' Create dummy variables from a factor or integer variable
#'
#' @param dtName Data table with column
#' @param varname Name of factor
#' @param sep Character to be used in creating new name for dummy fields.
#' Valid characters include all letters and "_". Will default to ".". If
#' an invalid character is provided, it will be replaced by default.
#' @param replace If replace is set to TRUE (defaults to FALSE) the field
#' referenced varname will be removed.
#'
#' @examples
#'
#' # First example:
#'
#' def <- defData(varname = "cat", formula = ".2;.3;.5", dist = "categorical")
#' def <- defData(def, varname = "x", formula = 5, variance = 2)
#'
#' dx <- genData(200, def)
#' dx
#'
#' dx <- genFactor(dx, "cat", labels = c("one", "two", "three"), replace = TRUE)
#' dx <- genDummy(dx, varname = "fcat", sep = "_")
#'
#' dx
#'
#' # Second example:
#'
#' dx <- genData(15)
#' dx <- trtAssign(dtName = dx, 3, grpName = "arm")
#' dx <- genDummy(dx, varname = "arm")
#' dx
#' @export
#' @concept generate_data
genDummy <- function(dtName, varname, sep = ".", replace = FALSE) {

  # Initial data checks
  
  assertNotMissing(dtName = missing(dtName), varname = missing(varname))

  # Check if data table exists

  if (!exists(deparse(substitute(dtName)), envir = parent.frame())) {
    c <- condition(c("simstudy::dtNotExist", "error"),
                   "Data Table does not exist!")
    stop(c)
  }
  
  #assertDataTableExists(deparse(substitute(dtName)))

  # Check if varname exists
  
  assertInDataTable(varname, dtName)

  # Check if field is integer or factor

  x <- dtName[, get(varname)]
  
  if (!(is.integer(x) | is.factor(x))) {
    c <- condition(c("simstudy::notIntegerOrFactor", "error"),
                   "Variable must be a factor or integer")
    stop(c)
  }


  if (is.integer(x)) x <- factor(x)

  # if sep is invalid, defaults to "."
  dummy.names <- make.names(paste0(varname, sep, levels(x)))
  nlevels <- length(levels(x))

  # Check to see if new field names exist

  for (i in 1:nlevels) {
    assertNotInDataTable(dummy.names[i], dtName)
  }

  # Create dummies for each level of factor

  dummies <- NULL

  for (i in (1:nlevels)) {
    dummies <- cbind(dummies, as.integer(x == levels(x)[i]))
  }

  dummies <- data.table(dummies)

  setnames(dummies, dummy.names)

  if (replace == TRUE) dtName[, (varname) := NULL]

  return(cbind(dtName, dummies))
}

#' Create factor variable from an existing (non-double) variable
#'
#' @param dtName Data table with columns.
#' @param varname Name of field(s) to be converted.
#' @param labels Factor level labels. If not provided, the generated factor
#' levels will be used as the labels. Can be a vector (if only one new factor or
#' all factors have the same labels) or a list of character vectors of the same
#' length as varname.
#' @param prefix By default, the new field name will be a concatenation of "f"
#' and the old field name. A prefix string can be provided.
#' @param replace If replace is set to TRUE (defaults to FALSE) the field
#' referenced varname will be removed.
#'
#' @examples
#'
#' # First example:
#'
#' def <- defData(varname = "cat", formula = ".2;.3;.5", dist = "categorical")
#' def <- defData(def, varname = "x", formula = 5, variance = 2)
#'
#' dx <- genData(200, def)
#' dx
#'
#' dx <- genFactor(dx, "cat", labels = c("one", "two", "three"))
#' dx
#'
#' # Second example:
#'
#' dx <- genData(10)
#' dx <- trtAssign(dtName = dx, 2, grpName = "studyArm")
#' dx <- genFactor(dx, varname = "studyArm", labels = c("control", "treatment"), prefix = "t_")
#' dx
#' @export
#' @concept generate_data
genFactor <- function(dtName,
                      varname,
                      labels = NULL,
                      prefix = "f",
                      replace = FALSE) {
  assertNotMissing(dtName = missing(dtName), varname = missing(varname))
  assertValue(dtName = dtName, varname = varname)
  assertClass(dtName = dtName, class = "data.table")
  assertType(varname = varname, type = "character")
  assertUnique(varname = varname)
  assertInDataTable(vars = varname, dt = dtName)
  assertInteger(columns2Convert = dtName[, ..varname])

  ..varname <- NULL
  # Create new field name (check to see if it exists)

  fname <- make.names(glue("{prefix}{varname}"))
  assertNotInDataTable(vars = fname, dt = dtName)

  # Create new column as factor
  if (!is.null(labels)) {
    assertType(labels = labels, type = "character")

    if (is.list(labels)) {
      assertLength(labels = labels, length = length(varname))

      for (i in seq_len(length(varname))) {
        dtName[[fname[i]]] <- factor(dtName[[varname[i]]],
          labels = labels[[i]]
        )
      }
    } else {
      dtName[, (fname) := lapply(.SD, factor, labels = labels),
        .SDcols = varname
      ]
    }
  } else {
    dtName[, (fname) := lapply(.SD, factor),
      .SDcols = varname
    ]
  }

  if (replace == TRUE) dtName[, (varname) := NULL]

  dtName[]
}

#' @title Generate a linear formula
#' @description Formulas for additive linear models can be generated
#' with specified coefficient values and variable names.
#' @param coefs A vector that contains the values of the
#' coefficients. Coefficients can also be defined as character for use with 
#' double dot notation. If length(coefs) == length(vars), then no intercept
#' is assumed. Otherwise, an intercept is assumed.
#' @param vars A vector of strings that specify the names of the
#' explanatory variables in the equation.
#' @return A string that represents the desired formula
#' @examples
#'
#' genFormula(c(.5, 2, 4), c("A", "B", "C"))
#' genFormula(c(.5, 2, 4), c("A", "B"))
#' 
#' genFormula(c(.5, "..x", 4), c("A", "B", "C"))
#' genFormula(c(.5, 2, "..z"), c("A", "B"))
#'
#' changeX <- c(7, 10)
#' genFormula(c(.5, 2, changeX[1]), c("A", "B"))
#' genFormula(c(.5, 2, changeX[2]), c("A", "B"))
#' genFormula(c(.5, 2, changeX[2]), c("A", "B", "C"))
#'
#' newForm <- genFormula(c(-2, 1), c("A"))
#'
#' def1 <- defData(varname = "A", formula = 0, variance = 3, dist = "normal")
#' def1 <- defData(def1, varname = "B", formula = newForm, dist = "binary", link = "logit")
#'
#' set.seed(2001)
#' dt <- genData(500, def1)
#' summary(glm(B ~ A, data = dt, family = binomial))
#' @export
#' @concept generate_data
genFormula <- function(coefs, vars) {
  lcoef <- length(coefs)
  lvars <- length(vars)

  if (!(lcoef == lvars | lcoef == lvars + 1)) {
    c <- condition(c("simstudy::coeffVar", "error"),
                   "Coefficients or variables not properly specified!")
    stop(c)
  }
  
  
  if (is.character(coefs)) {
    for (cf in coefs) {
      if (suppressWarnings(is.na(as.integer(cf)))) {
        if (substr(cf, start = 1, stop = 2) != "..") {
          c <- condition(c("simstudy::doubleDot", "error"),
                         "non-numerical coefficients must be specified with double dot notation")
          
          stop(c)
        }
      }
    }
  }

  assertType(var1 = vars, type = "character")

  if (lcoef != lvars) { # Intercept

    form <- paste0(coefs[1])
    coefs <- coefs[-1]
  } else { # no intercept

    form <- paste(coefs[1], "*", vars[1])
    coefs <- coefs[-1]
    vars <- vars[-1]
  }

  for (i in 1:(lcoef - 1)) {
    form <- paste(form, "+", coefs[i], "*", vars[i])
  }

  return(form)
}

#' @title  Generate Markov chain
#' @description Generate a Markov chain for n individuals or units by
#' specifying a transition matrix.
#' @param n number of individual chains to generate
#' @param transMat Square transition matrix where the sum of each row
#' must equal 1. The dimensions of the matrix equal the number of possible
#' states.
#' @param chainLen Length of each chain that will be generated for each
#' chain; minimum chain length is 2.
#' @param wide Logical variable (TRUE or FALSE) indicating whether the
#' resulting data table should be returned in wide or long format. The
#' wide format includes all elements of a chain on a single row; the long
#' format includes each element of a chain in its own row. The default is
#' wide = FALSE, so the long format is returned by default.
#' @param id Character string that represents name of "id" field.
#' Defaults to "id".
#' @param pername Character string that represents the variable name of the
#' chain sequence in the long format. Defaults "period",
#' @param varname Character string that represents the variable name of the
#' state in the long format. Defaults to "state".
#' @param widePrefix Character string that represents the variable name
#' prefix for the state fields in the wide format. Defaults to "S".
#' @param trimvalue Integer value indicating end state. If trimvalue is not NULL,
#' all records after the first instance of state = trimvalue will be deleted.
#' @param startProb A string that contains the probability distribution of the 
#' starting state, separated by a ";". Length of start probabilities must match
#' the number of rows of the transition matrix.
#' @return A data table with n rows if in wide format, or n by chainLen rows
#' if in long format.
#' @examples
#'
#' # Transition matrix P
#'
#' P <- t(matrix(c(
#'   0.7, 0.2, 0.1,
#'   0.5, 0.3, 0.2,
#'   0.0, 0.1, 0.9
#' ), nrow = 3, ncol = 3))
#'
#' d1 <- genMarkov(n = 10, transMat = P, chainLen = 5)
#' d2 <- genMarkov(n = 10, transMat = P, chainLen = 5, wide = TRUE)
#' d3 <- genMarkov(
#'   n = 10, transMat = P, chainLen = 5,
#'   pername = "seq", varname = "health",
#'   trimvalue = 3
#' )
#' @export
#' @concept generate_data
genMarkov <- function(n, transMat, chainLen, wide = FALSE, id = "id",
                      pername = "period", varname = "state",
                      widePrefix = "S", trimvalue = NULL, startProb = NULL) {

  # 'declare' vars created in data.table
  variable <- NULL

  # check transMat is matrix
  if (!is.matrix(transMat)) {
    c <- condition(c("simstudy::typeMatrix", "error"),
                   "transMat is not a matrix!")
    stop(c)
  }
    
  # check transMat is square matrix
  if ((length(dim(transMat)) != 2) |
      (dim(transMat)[1] != dim(transMat)[2])) {
    c <- condition(c("simstudy::squareMatrix", "error"),
                   "transMat is not a square matrix!")
    stop(c)
  }
  
  # check transMat row sums = 1
  if (!all(round(apply(transMat, 1, sum), 5) == 1)) {
    c <- condition(c("simstudy::rowSums1", "error"),
                   "transMat rows do not sum to 1!")
    stop(c)
  }
  
  # check chainLen greater than 1
  if (chainLen <= 1) {
    c <- condition(c("simstudy::chainLen", "error"),
                   "chainLen must be greater than 1!")
    stop(c)
  }
  
  # if startProb defined, check it sums to 1
  if (!is.null(startProb)) {
    s <- as.numeric(unlist(strsplit(startProb, split = ";")))
    ssum <- sum(s)
    if (ssum != 1) {
      c <- condition(c("simstudy::notEqual", "error"),
                     "startProb must sum to 1!")
      stop(c)
    }
    
  }
  
  # if startProb defined, check it has length == number of matrix rows
  if (!is.null(startProb)) {
    s <- unlist(strsplit(startProb, split = ";"))
    r <- dim(transMat)[1]
      assertLength(var1 = s, length = r)
    
  }

  ####

  if (!is.null(startProb)) {
    dprob <- defData(varname = "prob", formula = startProb, dist = "categorical")
    dd <- genData(n = n, dprob, id = id)
    dd <- addMarkov(dd, transMat, chainLen, wide, id,
      pername, varname, widePrefix,
      start0lab = "prob",
      trimvalue = trimvalue
    )
    
    dd$prob <- NULL
  } else {
    dd <- genData(n = n, id = id)
    dd <- addMarkov(dd, transMat, chainLen, wide, id,
                    pername, varname, widePrefix,
                    trimvalue = trimvalue
    )
  }

  dd[]
}

#' Generate multi-factorial data
#'
#' @param nFactors Number of factors (columns) to generate.
#' @param each Number of replications for each combination of factors. Must be specified.
#' @param levels Vector or scalar. If a vector is specified, it must be
#' the same length as nFatctors. Each value of the vector represents the
#' number of levels of each corresponding factor. If a scalar is specified,
#' each factor will have the same number of levels. The default is 2 levels
#' for each factor.
#' @param coding String value to specify if "dummy" or "effect" coding is used.
#' Defaults to "dummy".
#' @param colNames A vector of strings, with a length of nFactors. The strings
#' represent the name for each factor.
#' @param idName A string that specifies the id of the record. Defaults to "id".
#' @return A data.table that contains the added simulated data. Each column contains
#' an integer.
#' @examples
#' genMultiFac(nFactors = 2, each = 5)
#' genMultiFac(nFactors = 2, each = 4, levels = c(2, 3))
#' genMultiFac(
#'   nFactors = 3, each = 1, coding = "effect",
#'   colNames = c("Fac1", "Fac2", "Fac3"), id = "block"
#' )
#' @export
#' @concept generate_data
genMultiFac <- function(nFactors, each, levels = 2, coding = "dummy", colNames = NULL, idName = "id") {
  # check nFactors are integers
  assertInteger(var1 = nFactors)
  
  # check length nFactors greater than 2
  if(nFactors < 2) {
    c <- condition(c("simstudy::greaterThan", "error"),
                   "nFactors must be greater than 2!")
    stop(c)
  }
  
  # check number of levels matches factors
  if (length(levels) > 1) {
    assertLength(var1 = levels, length = nFactors)
  }
  
  # check coding == 'effect' or 'dummy'
  if(!(coding == "effect" | coding == "dummy")) {
    c <- condition(c("simstudy::codingVal", "error"),
                   "coding must equal 'effect' or 'dummy'!")
    stop(c)
  }

  x <- list()

  if (all(levels == 2)) {
    if (coding == "effect") {
      opts <- c(-1, 1)
    } else {
      opts <- c(0, 1)
    }

    for (i in 1:nFactors) {
      x[[i]] <- opts
    }
  } else {
    if (length(levels) == 1) levels <- rep(levels, nFactors)

    for (i in 1:nFactors) x[[i]] <- c(1:levels[i])
  }

  dt <- data.table(as.data.frame(lapply(expand.grid(x), function(x) rep(x, each = each))))

  if (!is.null(colNames)) setnames(dt, colNames)

  origNames <- copy(names(dt))

  dt[, (idName) := 1:.N]

  setcolorder(dt, c(idName, origNames))
  setkeyv(dt, idName)

  return(dt[])
}

#' @title Generate ordinal categorical data
#' @description Ordinal categorical data is added to an existing data set.
#' Correlations can be added via correlation matrix or `rho` and `corstr`.
#' @param dtName Name of complete data set
#' @param adjVar Adjustment variable  name in dtName - determines
#' logistic shift. This is specified assuming a cumulative logit
#' link.
#' @param baseprobs Baseline probability expressed as a vector or matrix of
#' probabilities. The values (per row) must sum to <= 1. If `rowSums(baseprobs)
#' < 1`, an additional category is added with probability `1 -
#' rowSums(baseprobs)`. The number of rows represents the number of new
#' categorical variables. The number of columns represents the number of
#' possible responses - if an particular category has fewer possible responses,
#' assign zero probability to non-relevant columns.
#' @param catVar Name of the new categorical field. Defaults to "cat". Can be a
#' character vector with a name for each new variable defined via `baseprobs`.
#' Will be overridden by `prefix` if more than one variable is defined and
#' `length(catVar) == 1`.
#' @param asFactor If `asFactor == TRUE` (default), new field is returned
#' as a factor. If `asFactor == FALSE`, new field is returned as an integer.
#' @param idname Name of the id column in `dtName`.
#' @param prefix A string. The names of the new variables will be a
#' concatenation of the prefix and a sequence of integers indicating the
#' variable number.
#' @param rho Correlation coefficient, -1 < rho < 1. Use if corMatrix is not
#' provided.
#' @param corstr Correlation structure of the variance-covariance matrix defined
#' by sigma and rho. Options include "ind" for an independence structure, "cs"
#' for a compound symmetry structure, and "ar1" for an autoregressive structure.
#' @param corMatrix Correlation matrix can be entered directly. It must be
#' symmetrical and positive definite. It is not a required field; if a matrix is
#' not provided, then a structure and correlation coefficient rho must be
#' specified. (The matrix created via `rho` and `corstr` must also be positive
#' definite.)
#' @param npVar Vector of variable names that indicate which variables are to
#' violate the proportionality assumption.
#' @param npAdj Matrix with a row for each npVar and a column for each category.
#' Each value represents the deviation from the proportional odds assumption on
#' the logistic scale.
#' @return Original data.table with added categorical field.
#' @examples
#' # Ordinal Categorical Data ----
#'
#' def1 <- defData(
#'   varname = "male",
#'   formula = 0.45, dist = "binary", id = "idG"
#' )
#' def1 <- defData(def1,
#'   varname = "z",
#'   formula = "1.2*male", dist = "nonrandom"
#' )
#' def1
#'
#' ## Generate data
#'
#' set.seed(20)
#'
#' dx <- genData(1000, def1)
#'
#' probs <- c(0.40, 0.25, 0.15)
#'
#' dx <- genOrdCat(dx,
#'   adjVar = "z", idname = "idG", baseprobs = probs,
#'   catVar = "grp"
#' )
#' dx
#'
#' # Correlated Ordinal Categorical Data ----
#'
#' baseprobs <- matrix(c(
#'   0.2, 0.1, 0.1, 0.6,
#'   0.7, 0.2, 0.1, 0,
#'   0.5, 0.2, 0.3, 0,
#'   0.4, 0.2, 0.4, 0,
#'   0.6, 0.2, 0.2, 0
#' ),
#' nrow = 5, byrow = TRUE
#' )
#'
#' set.seed(333)
#' dT <- genData(1000)
#'
#' dX <- genOrdCat(dT,
#'   adjVar = NULL, baseprobs = baseprobs,
#'   prefix = "q", rho = .125, corstr = "cs", asFactor = FALSE
#' )
#' dX
#'
#' dM <- data.table::melt(dX, id.vars = "id")
#' dProp <- dM[, prop.table(table(value)), by = variable]
#' dProp[, response := c(1:4, 1:3, 1:3, 1:3, 1:3)]
#'
#' data.table::dcast(dProp, variable ~ response,
#'   value.var = "V1", fill = 0
#' )
#'
#' # proportional odds assumption violated
#'
#' d1 <- defData(varname = "rx", formula = "1;1", dist = "trtAssign")
#' d1 <- defData(d1, varname = "z", formula = "0 - 1.2*rx", dist = "nonrandom")
#'
#' dd <- genData(1000, d1)
#'
#' baseprobs <- c(.4, .3, .2, .1)
#' npAdj <- c(0, 1, 0, 0)
#'
#' dn <- genOrdCat(
#'   dtName = dd, adjVar = "z",
#'   baseprobs = baseprobs,
#'   npVar = "rx", npAdj = npAdj
#' )
#'
#' @export
#' @md
#' @concept generate_data
#' @concept categorical
#' @concept correlated
genOrdCat <- function(dtName,
                      adjVar = NULL,
                      baseprobs,
                      catVar = "cat",
                      asFactor = TRUE,
                      idname = "id",
                      prefix = "grp",
                      rho = 0,
                      corstr = "ind",
                      corMatrix = NULL,
                      npVar = NULL,
                      npAdj = NULL) {

  # "declares" to avoid global NOTE
  cat <- NULL
  logisZ <- NULL
  period <- NULL

  assertNotMissing(dtName = missing(dtName), baseprobs = missing(baseprobs))
  assertValue(
    dtName = dtName,
    baseprobs = baseprobs,
    catVar = catVar,
    asFactor = asFactor,
    idname = idname,
    prefix = prefix,
    rho = rho,
    corstr = corstr
  )
  assertClass(dtName = dtName, class = "data.table")
  assertClass(rho = rho, class = "numeric")
  assertClass(
    catVar = catVar,
    prefix = prefix,
    corstr = corstr,
    idname = idname,
    class = "character"
  )
  assertInDataTable(c(adjVar, idname, npVar), dtName)
  corstr <- ensureOption(
    corstr = corstr,
    options = c("ind", "cs", "ar1"),
    default = "ind"
  )

  baseprobs <- ensureMatrix(baseprobs)
  baseprobs <- .adjustProbs(baseprobs)

  nCats <- nrow(baseprobs)

  ensureLength(catVar = catVar, n = nCats)

  if (!is.null(adjVar)) {
    adjVar <- ensureLength(
      adjVar = adjVar,
      n = nCats, msg = list(
        "Number of categories implied",
        " by baseprobs and adjVar do not match. ",
        "{ dots$names[[1]] } should be",
        " either length 1 or { n } but",
        " is { length(var) }!"
      )
    )
  }

  if (!is.null(npAdj) & is.null(npVar)) {
    mismatchError("npAdj", "npVar")
  } else if (is.null(npAdj) & !is.null(npVar)) {
    mismatchError("npVar", "npAdj")
  } else if (is.null(npAdj) & is.null(npVar)) {
    npAdj <- matrix(rep(0, ncol(baseprobs)), nrow = 1)
  } else if (!is.null(npAdj) & !is.null(npVar)) {
    b_len <- ncol(baseprobs)
    v_len <- length(npVar)

    npAdj <- ensureMatrix(npAdj)

    if (nrow(npAdj) != v_len) {
      msg <- list(
        "Number of rows for npAdj ({nrow(npAdj)})",
        " does not match with the number of",
        " adjustment variables specified",
        " in npVar ({v_len})!"
      )
      stop(do.call(glue, msg))
    }

    if (ncol(npAdj) != b_len) {
      msg <- list(
        "Number of categories implied",
        " by baseprobs and npAdj do not match. ",
        "npAdj should have {b_len}",
        " columns but has { ncol(npAdj) }!"
      )
      stop(do.call(glue, msg))
    }
  }

  if (nCats > 1 && length(catVar) != nCats) {
    catVar <- glue("{prefix}{i}", i = zeroPadInts(1:nCats))
  }

  dt <- copy(dtName)
  n <- nrow(dt)
  zs <- .genQuantU(nCats, n, rho = rho, corstr, corMatrix = corMatrix)
  zs[, logisZ := stats::qlogis(p = zs$Unew)]
  cprop <- t(apply(baseprobs, 1, cumsum))
  quant <- t(apply(cprop, 1, stats::qlogis))

  mycat <- list()

  for (i in 1:nCats) {
    iLogisZ <- zs[period == i - 1, logisZ]
    n_obs_i <- length(iLogisZ)
    matlp <- matrix(rep(quant[i, ], n),
      ncol = ncol(cprop),
      byrow = TRUE
    )

    if (!is.null(adjVar)) {
      z <- dt[, adjVar[i], with = FALSE][[1]]
    } else {
      z <- rep(0, n_obs_i)
    }

    if (!is.null(npVar)) {
      npVar_mat <- as.matrix(dt[, npVar, with = FALSE])
    } else {
      npVar_mat <- matrix(rep(0, n_obs_i))
    }

    npmat <- npVar_mat %*% npAdj # npAdj is #npVAR X
    matlp <- matlp - npmat - z

    if (chkNonIncreasing(matlp)) {
      stop("Overlapping thresholds. Check adjustment values!")
    }

    locateGrp <- (iLogisZ > cbind(-Inf, matlp))
    assignGrp <- apply(locateGrp, 1, sum)
    mycat[[i]] <- data.table(
      id = dt[, idname, with = FALSE][[1]],
      var = catVar[[i]],
      cat = assignGrp
    )
  }

  dcat <- data.table::rbindlist(mycat)
  cats <- data.table::dcast(dcat, id ~ var, value.var = "cat")

  setnames(cats, "id", idname)
  setkeyv(cats, idname)
  dt <- dt[cats]

  if (asFactor) {
    dt <- genFactor(dt, catVar, replace = TRUE)
    data.table::setnames(dt, glue("f{catVar}"), catVar)
  }

  dt[]
}

#' Generate spline curves
#'
#' @param dt data.table that will be modified
#' @param newvar Name of new variable to be created
#' @param predictor Name of field in old data.table that is predicting new value
#' @param theta A vector or matrix of values between 0 and 1. Each column of the matrix
#' represents the weights/coefficients that will be applied to the basis functions
#' determined by the knots and degree. Each column of theta represents a separate
#' spline curve.
#' @param knots A vector of values between 0 and 1, specifying quantile
#' cut-points for splines. Defaults to c(0.25, 0.50, 0.75).
#' @param degree Integer specifying polynomial degree of curvature.
#' @param newrange Range of the spline function , specified as a string
#' with two values separated by a semi-colon. The first value represents the
#' minimum, and the second value represents the maximum. Defaults to NULL, which
#' sets the range to be between 0 and 1.
#' @param noise.var Add to normally distributed noise to observation - where mean
#' is value of spline curve.
#' @return A modified data.table with an added column named newvar.
#' @examples
#' ddef <- defData(varname = "age", formula = "0;1", dist = "uniform")
#'
#' theta1 <- c(0.1, 0.8, 0.6, 0.4, 0.6, 0.9, 0.9)
#' knots <- c(0.25, 0.5, 0.75)
#'
#' viewSplines(knots = knots, theta = theta1, degree = 3)
#'
#' set.seed(234)
#' dt <- genData(1000, ddef)
#'
#' dt <- genSpline(
#'   dt = dt, newvar = "weight",
#'   predictor = "age", theta = theta1,
#'   knots = knots, degree = 3,
#'   noise.var = .025
#' )
#'
#' dt
#' @export
#' @concept splines
#' @concept generate_data
genSpline <- function(dt, newvar, predictor, theta,
                      knots = c(0.25, 0.50, 0.75), degree = 3,
                      newrange = NULL, noise.var = 0) {

  # "declares" to avoid global NOTE
  y.spline <- NULL

  # Check arguments

  assertNotMissing(
    dt = missing(dt), 
    newvar = missing(newvar), 
    predictor = missing(predictor), 
    theta = missing(theta)
  )
  
  assertClass(dt = dt, class = "data.table")
  assertInDataTable(predictor, dt)
  assertClass(newvar = newvar, class = "character")

  if (!(is.null(newrange))) {
    
    newrange <- unlist(strsplit(as.character(newrange), split = ";", fixed = TRUE))
    
    assertLength(newrange = newrange, length = 2)
    newrange <- tryCatch(as.numeric(newrange), warning = function(x) NULL)
    assertClass(newrange = newrange, class = "numeric")

    newmin <- min(newrange)
    newmax <- max(newrange)
  }

  ### All checks passed

  x <- dt[, predictor, with = FALSE][[1]]

  if (!(min(x) >= 0 & max(x) <= 1)) {
    x.normalize <- (x - min(x)) / (max(x) - min(x))
  } else {
    x.normalize <- copy(x)
  }

  qknots <- stats::quantile(x = x.normalize, probs = knots)

  sdata <- .genbasisdt(x.normalize, qknots, degree, theta) # Call internal function

  if (is.null(newrange)) {
    newy <- sdata$dt[, y.spline]
  } else {
    newy <- sdata$dt[, y.spline * (newmax - newmin) + newmin] # de-normalize
  }

  newy <- stats::rnorm(length(newy), newy, sqrt(noise.var)) # add noise

  dt[, newy := newy]
  data.table::setnames(dt, "newy", newvar)

  return(dt[])
}

#' @title Generate survival data
#' @description Survival data is added to an existing data set.
#' @param dtName Name of data set
#' @param survDefs Definitions of survival
#' @param digits Number of digits for rounding
#' @param timeName A string to indicate the name of a combined competing risk
#' time-to-event outcome that reflects the minimum observed value of all 
#' time-to-event outcomes. Defaults to NULL, indicating that each time-to-event
#' outcome will be included in dataset.
#' @param censorName The name of a time to event variable that is the censoring
#' variable. Will be ignored if timeName is NULL.
#' @param eventName The name of the new numeric/integer column representing the
#' competing event outcomes. If censorName is specified, the integer value for
#' that event will be 0. Defaults to "event", but will be ignored 
#' if timeName is NULL.
#' @param typeName The name of the new character column that will indicate the
#' event type. The type will be the unique variable names in survDefs. Defaults
#' to "type", but will be ignored if timeName is NULL.
#' @param keepEvents Indicator to retain original "events" columns. Defaults
#' to FALSE.
#' @param idName Name of id field in existing data set.
#' @param envir Optional environment, defaults to current calling environment.
#' @return Original data table with survival time
#' @examples
#' # Baseline data definitions
#'
#' def <- defData(varname = "x1", formula = .5, dist = "binary")
#' def <- defData(def, varname = "x2", formula = .5, dist = "binary")
#' def <- defData(def, varname = "grp", formula = .5, dist = "binary")
#'
#' # Survival data definitions
#'
#' sdef <- defSurv(
#'   varname = "survTime", formula = "1.5*x1",
#'   scale = "grp*50 + (1-grp)*25", shape = "grp*1 + (1-grp)*1.5"
#' )
#'
#' sdef <- defSurv(sdef, varname = "censorTime", scale = 80, shape = 1)
#'
#' sdef
#'
#' # Baseline data definitions
#'
#' dtSurv <- genData(300, def)
#'
#' # Add survival times
#'
#' dtSurv <- genSurv(dtSurv, sdef)
#'
#' head(dtSurv)
#' @export
#' @concept generate_data
#'
# source: Bender, Augustin, & Blettner, Generating survival times to simulate Cox
# proportional hazard models, SIM, 2005;24;1713-1723.
#
genSurv <- function(dtName, survDefs, digits = 3, 
  timeName = NULL, censorName = NULL, eventName = "event", 
  typeName = "type", keepEvents = FALSE, idName = "id", envir = parent.frame()) {
  
  # For double-dot notation
  
  assertNotMissing(
    dtName = missing(dtName),
    survDefs = missing(survDefs),
    call = sys.call(-1)
  )
  
  assertClass(
    dtName = dtName, 
    survDefs = survDefs, 
    class = "data.table",
    call = sys.call(-1)
  )
  
  events <- unique(survDefs$varname)
  
  assertNotInDataTable(events, dtName)
  
  assertClass(digits = digits, class="numeric")
  assertLength(digits = digits, length = 1, call = sys.call(-1))
  assertInDataTable(idName, dtName)

  # 'declare
  varname <- NULL
  formula <- NULL
  N <- NULL
  V1 <- NULL 
  event  <- NULL
  survx  <- NULL
  time  <- NULL
  type <- NULL
  id <- NULL
  
  dtSurv <- copy(dtName)
  setnames(dtSurv, idName, "id")
  
  for (i in (seq_along(events))) {
    
    nlogu <- -log(stats::runif(nrow(dtSurv), min = 0, max = 1))
    
    subDef <- survDefs[varname == events[i]]

    formshape <- subDef[1, shape]
    shape <- as.vector(.evalWith(formshape, .parseDotVars(formshape,  envir = parent.frame()), dtSurv, envir = envir))

    formscale <- subDef[1, scale]
    scale <- as.vector(.evalWith(formscale, .parseDotVars(formscale, envir = parent.frame()), dtSurv, envir = envir))
    
    formulas <- subDef[, formula]
    form1 <- as.vector(.evalWith(formulas[1], .parseDotVars(formulas[1], envir = parent.frame()), dtSurv, envir = envir))
    
    if (nrow(subDef) > 1) {
      
      if (subDef[2, shape] != subDef[1, shape]) {
        warning("Shape definitions over periods are different. Only first definition will be used.")
      }
      
      if (subDef[2, scale] != subDef[1, scale]) {
        warning("Scale definitions over periods are different. Only first definition will be used.")
      }
      
      transition <- subDef[2, transition]
      t_adj <- transition ^ (1/shape)
     
      form2 <- as.vector(.evalWith(formulas[2], .parseDotVars(formulas[2], envir = parent.frame()), dtSurv, envir = envir))
      
      threshold <- exp(form1) * t_adj
      period <- 1*(nlogu < threshold) + 2*(nlogu >= threshold)
      
      tempdt <- data.table(nlogu, form1, form2, period, shape, scale, t_adj)
      
      tempdt[period == 1, survx := (nlogu/((1/scale)*exp(form1)))^shape]
      tempdt[period == 2, survx := ((nlogu - (1/scale)*exp(form1)*t_adj + (1/scale)*exp(form2)*t_adj)/((1/scale)*exp(form2)))^shape]
      
      newColumn <- tempdt[, list(survx = round(survx, digits))]
      
    } else {
    
      tempdt <- data.table(nlogu, form1)
      newColumn <-  
        tempdt[, list(survx = round((nlogu/((1/scale)*exp(form1)))^shape, digits))]

    }
    
    dtSurv <- data.table::data.table(dtSurv, newColumn)
    data.table::setnames(dtSurv, "survx", as.character(subDef[1, varname]))
    
  }
  
  if (!is.null(timeName)) {
    
    dtSurv <- addCompRisk(dtSurv, events, timeName, censorName, 
      eventName, typeName, keepEvents)
    
  }
    
  setnames(dtSurv, "id", idName)
  return(dtSurv[])
  
}

#' @title Generate synthetic data 
#' @description Synthetic data is generated from an existing data set
#' @param dtFrom Data table that contains the source data
#' @param n Number of samples to draw from the source data. The default
#' is number of records that are in the source data file.
#' @param vars A vector of string names specifying the fields that will be
#' sampled. The default is that all variables will be selected.
#' @param id A string specifying the field that serves as the record id. The
#' default field is "id".
#' @return A data table with the generated data
#' @examples
#' ### Create fake "real" data set
#'
#' d <- defData(varname = "a", formula = 3, variance = 1, dist = "normal")
#' d <- defData(d, varname = "b", formula = 5, dist = "poisson")
#' d <- defData(d, varname = "c", formula = 0.3, dist = "binary")
#' d <- defData(d, varname = "d", formula = "a + b + 3*c", variance = 2, dist = "normal")
#' 
#' A <- genData(100, d, id = "index")
#' 
#' ### Create synthetic data set from "observed" data set A:
#' 
#' def <- defDataAdd(varname = "x", formula = "2*b + 2*d", variance = 2)
#' 
#' S <- genSynthetic(dtFrom = A, n = 120, vars = c("b", "d"), id = "index")
#' S <- addColumns(def, S)
#'
#' @export
#' @concept generate_data

genSynthetic <- function(dtFrom, n = nrow(dtFrom),  
  vars = NULL, id = "id") {
  
  assertNotMissing(
    dtFrom = missing(dtFrom),
    call = sys.call(-1)
  )
  
  assertClass(
    dtFrom = dtFrom, 
    class = "data.table",
    call = sys.call(-1)
  )
  
  if (is.null(vars)) { vars <- names(dtFrom)[names(dtFrom) != id] }

  assertClass(
    vars= vars,
    id = id,
    class = "character",
    call = sys.call(-1)
  )  
  
  assertInteger(n = n)
  
  assertInDataTable(vars = vars, dt = dtFrom)
  assertInDataTable(vars = id, dt = dtFrom)
  
  assertNotInVector(id, vars)
  
  dx <- copy(dtFrom)
  
  getVars <- c(id, vars)
  dx <- dx[, getVars, with = FALSE]  
  
  setnames(dx, id, "id")
  ids <- dx[, sample(id, n, replace = TRUE)]
  dx <- dx[ids]
  dx[, id := 1:n]
  setnames(dx, "id", id)
  
  dx[]
  
}

#' @title Generate data from a density defined by a vector of integers
#' @description Data are generated from an a density defined by a vector of integers
#' @param n Number of samples to draw from the density.
#' @param dataDist Vector that defines the desired density
#' @param varname Name of variable name
#' @param uselimits Indicator to use minimum and maximum of input data vector as 
#' limits for sampling. Defaults to FALSE, in which case a smoothed density that
#' extends beyond the limits is used.
#' @param id A string specifying the field that serves as the record id. The
#' default field is "id".
#' @return A data table with the generated data
#' @examples
#' data_dist <- data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10)
#' 
#' genDataDensity(500, data_dist, varname = "x1", id = "id")
#' genDataDensity(500, data_dist, varname = "x1", uselimits = TRUE, id = "id")
#' @export
#' @concept generate_data

genDataDensity <- function(n, dataDist, varname, uselimits = FALSE, id = "id") {
  
  assertNotMissing(n = missing(n), dataDist = missing(dataDist), varname = missing(varname))
  
  dataDist <- round(dataDist, 0)

  .dd <- genData(n, id = id)
  addDataDensity(.dd, dataDist, varname, uselimits)[]
  
}
Any scripts or data that you put into this service are public.
simstudy documentation built on Sept. 11, 2024, 6:38 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
simstudy
Simulation of Study Data

R/generate_data.R
In simstudy: Simulation of Study Data

Defines functions genDataDensity genSynthetic genSurv genSpline genOrdCat genMultiFac genMarkov genFormula genFactor genDummy genData

Documented in genData genDataDensity genDummy genFactor genFormula genMarkov genMultiFac genOrdCat genSpline genSurv genSynthetic

Try the simstudy package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

simstudy Simulation of Study Data

R/generate_data.R In simstudy: Simulation of Study Data

Defines functions genDataDensity genSynthetic genSurv genSpline genOrdCat genMultiFac genMarkov genFormula genFactor genDummy genData

Documented in genData genDataDensity genDummy genFactor genFormula genMarkov genMultiFac genOrdCat genSpline genSurv genSynthetic

Try the simstudy package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

simstudy
Simulation of Study Data

R/generate_data.R
In simstudy: Simulation of Study Data