R/add_data.R

Defines functions addSynthetic addMultiFac addMarkov addCondition addColumns

Documented in addColumns addCondition addMarkov addMultiFac addSynthetic

#' Add columns to existing data set
#'
#' @param dtDefs name of definitions for added columns
#' @param dtOld name of data table that is to be updated
#' @param envir Environment the data definitions are evaluated in.
#'  Defaults to [base::parent.frame].
#' @return an updated data.table that contains the added simulated data
#' @examples
#' # New data set
#'
#' def <- defData(varname = "xNr", dist = "nonrandom", formula = 7, id = "idnum")
#' def <- defData(def, varname = "xUni", dist = "uniform", formula = "10;20")
#'
#' dt <- genData(10, def)
#'
#' # Add columns to dt
#'
#' def2 <- defDataAdd(varname = "y1", formula = 10, variance = 3)
#' def2 <- defDataAdd(def2, varname = "y2", formula = .5, dist = "binary")
#' def2
#'
#' dt <- addColumns(def2, dt)
#' dt
#' @concept generate_data
#' @md
#' @export
addColumns <- function(dtDefs, dtOld, envir = parent.frame()) {

  # "declares" varname to avoid global NOTE
  varname <- NULL
  formula <- NULL
  dist <- NULL

  assertNotMissing(dtDefs = missing(dtDefs), dtOld = missing(dtOld))
  assertClass(dtDefs = dtDefs, dtOld = dtOld, class = "data.table")

  for (i in seq_len(nrow(dtDefs))) {
    if (i == 1) {
      chkVars <- names(dtOld)
    } else { # check all previously defined vars

      chkVars <- c(dtDefs[1:(i - 1), varname], names(dtOld))
    }

    .evalDef(
      newvar = dtDefs[i, varname],
      newform = dtDefs[i, formula],
      newdist = dtDefs[i, dist],
      defVars = chkVars
    )
  }

  oldkey <- data.table::key(dtOld)

  iter <- nrow(dtDefs)
  n <- nrow(dtOld)
  for (i in (1:iter)) {
    dtOld <- .generate(
      args = dtDefs[i, ],
      n = n,
      dfSim = dtOld,
      idname = oldkey,
      envir = envir
    )
  }

  dtOld <- data.table::data.table(dtOld)
  data.table::setkeyv(dtOld, oldkey)

  return(dtOld[])
}

#' Add a single column to existing data set based on a condition
#'
#' @param condDefs Name of definitions for added column
#' @param dtOld Name of data table that is to be updated
#' @param newvar Name of new column to add
#' @param envir Environment the data definitions are evaluated in.
#'  Defaults to [base::parent.frame].
#' @return An updated data.table that contains the added simulated data
#' @examples
#'
#' # New data set
#'
#' def <- defData(varname = "x", dist = "categorical", formula = ".33;.33")
#' def <- defData(def, varname = "y", dist = "uniform", formula = "-5;5")
#'
#' dt <- genData(1000, def)
#'
#' # Define conditions
#'
#' defC <- defCondition(
#'   condition = "x == 1", formula = "5 + 2*y-.5*y^2",
#'   variance = 1, dist = "normal"
#' )
#' defC <- defCondition(defC,
#'   condition = "x == 2",
#'   formula = "3 - 3*y + y^2", variance = 2, dist = "normal"
#' )
#' defC <- defCondition(defC,
#'   condition = "x == 3",
#'   formula = "abs(y)", dist = "poisson"
#' )
#'
#' # Add column
#'
#' dt <- addCondition(defC, dt, "NewVar")
#'
#' # Plot data
#'
#' library(ggplot2)
#'
#' ggplot(data = dt, aes(x = y, y = NewVar, group = x)) +
#'   geom_point(aes(color = factor(x)))
#' @export
#' @md
#' @concept generate_data
#' @concept condition
addCondition <- function(condDefs, dtOld, newvar, envir = parent.frame()) {

  # 'declare' vars
  varname <- NULL
  formula <- NULL
  dist <- NULL

  assertNotMissing(
    condDefs = missing(condDefs),
    dtOld = missing(dtOld),
    newvar = missing(newvar)
  )
  assertClass(
    condDefs = condDefs,
    dtOld = dtOld,
    class = "data.table"
  )

  cDefs <- copy(condDefs)
  cDefs[, varname := newvar]

  chkVars <- names(dtOld)

  # Check to make sure both formulas are appropriate and reference valid data

  for (i in seq_len(nrow(condDefs))) {
    .evalDef(
      newvar = newvar,
      newform = cDefs[i, formula],
      newdist = cDefs[i, dist],
      defVars = chkVars
    )

    .evalDef(
      newvar = newvar,
      newform = cDefs[i, condition],
      newdist = "nonrandom",
      defVars = chkVars
    )
  }

  oldkey <- data.table::key(dtOld)

  iter <- nrow(cDefs)

  dtNew <- data.table()
  dtTemp <- data.table()

  # Loop through each condition

  for (i in (1:iter)) {
    condition <- cDefs[, condition][i]
    formula <- cDefs[, formula][i]

    dtTemp <- dtOld[eval(parse(text = condition))]
    n <- nrow(dtTemp)

    if (n > 0) {
      dtTemp <- .generate(
        args = cDefs[i, ],
        n = n,
        dfSim = dtTemp,
        idname =  oldkey,
        envir = envir
      )

      dtTemp <- data.table::data.table(dtTemp)
      dtTemp <- dtTemp[, list(get(oldkey), get(newvar))]

      dtNew <- rbind(dtNew, dtTemp)
    }
  }

  setnames(dtNew, c(oldkey, newvar))
  data.table::setkeyv(dtNew, oldkey)

  dtNew <- dtNew[dtOld]

  return(dtNew)
}

#' @title  Add Markov chain
#' @description Generate a Markov chain for n individuals or units by
#' specifying a transition matrix.
#' @param dd data.table with a unique identifier
#' @param transMat Square transition matrix where the sum of each row
#' must equal 1. The dimensions of the matrix equal the number of possible
#' states.
#' @param chainLen Length of each chain that will be generated for each
#' chain; minimum chain length is 2.
#' @param wide Logical variable (TRUE or FALSE) indicating whether the
#' resulting data table should be returned in wide or long format. The
#' wide format includes all elements of a chain on a single row; the long
#' format includes each element of a chain in its own row. The default is
#' wide = FALSE, so the long format is returned by default.
#' @param id Character string that represents name of "id" field.
#' Defaults to "id".
#' @param pername Character string that represents the variable name of the
#' chain sequence in the long format. Defaults "period",
#' @param varname Character string that represents the variable name of the
#' state in the long format. Defaults to "state".
#' @param widePrefix Character string that represents the variable name
#' prefix for the state fields in the wide format. Defaults to "S".
#' @param start0lab Character string that represents name of the integer
#' field containing starting state (State 0) of the chain for each individual.
#' If it is NULL, starting state defaults to 1. Default is NULL.
#' @param trimvalue Integer value indicating end state. If trimvalue is not NULL,
#' all records after the first instance of state = trimvalue will be deleted.
#' @return A data table with n rows if in wide format, or n by chainLen rows
#' if in long format.
#' @examples
#' def1 <- defData(varname = "x1", formula = 0, variance = 1)
#' def1 <- defData(def1, varname = "x2", formula = 0, variance = 1)
#' def1 <- defData(def1,
#'   varname = "S0", formula = ".6;.3;.1",
#'   dist = "categorical"
#' )
#'
#' dd <- genData(20, def1)
#'
#' # Transition matrix P
#'
#' P <- t(matrix(c(
#'   0.7, 0.2, 0.1,
#'   0.5, 0.3, 0.2,
#'   0.0, 0.7, 0.3
#' ),
#' nrow = 3
#' ))
#'
#' d1 <- addMarkov(dd, P, chainLen = 3)
#' d2 <- addMarkov(dd, P, chainLen = 5, wide = TRUE)
#' d3 <- addMarkov(dd, P, chainLen = 5, wide = TRUE, start0lab = "S0")
#' d4 <- addMarkov(dd, P, chainLen = 5, start0lab = "S0", trimvalue = 3)
#' @export
#' @concept generate_data
addMarkov <- function(dd, transMat, chainLen, wide = FALSE, id = "id",
                      pername = "period", varname = "state",
                      widePrefix = "S", start0lab = NULL,
                      trimvalue = NULL) {

  # 'declare' vars created in data.table
  variable <- NULL
  .e <- NULL
  
  ######
  # check transMat is matrix
  if (!is.matrix(transMat)) {
    c <- condition(c("simstudy::typeMatrix", "error"),
                   "transMat is not a matrix!")
    stop(c)
  }

  # check transMat is square matrix
  if ((length(dim(transMat)) != 2) |
      (dim(transMat)[1] != dim(transMat)[2])) {
    c <- condition(c("simstudy::squareMatrix", "error"),
                   "transMat is not a square matrix!")
    stop(c)
  }

  # check transMat row sums = 1
  if (!all(round(apply(transMat, 1, sum), 5) == 1)) {
    c <- condition(c("simstudy::rowSums1", "error"),
                   "transMat rows do not sum to 1!")
    stop(c)
  }

  # check chainLen greater than 1
  if (chainLen <= 1) {
    c <- condition(c("simstudy::chainLen", "error"),
                   "chainLen must be greater than 1!")
    stop(c)
  }
  
  # if start0lab defined, check that it is defined in dd
  if (!is.null(start0lab)) {
    assertInDataTable(vars = start0lab, dt = dd)
    
  }

  # if start0lab defined, check that it exists in the transition matrix
  if (!is.null(start0lab)) {
    if (any(1 > dd[, start0lab, with = FALSE] | dd[, start0lab, with = FALSE] > dim(transMat)[1])) {
      c <- condition(c("simstudy::start0probNotInTransMat", "error"),
                     "all start states in start0prob must exist in the transistion matrix!")
      stop(c)
    }

  }
  ######

  # verify id is in data.table dd

  #if (!(id %in% names(dd))) stop(paste(id, "is not in data table"))
  assertInDataTable(vars = id, dt = dd)

  ####

  n <- nrow(dd)

  if (is.null(start0lab)) {
    s0 <- rep(1, n)
  } else {
    s0 <- dd[, get(start0lab)]
  }

  idlab <- id
  ids <- dd[, get(idlab)]
  xmat <- markovChains(n, transMat, chainLen, s0)

  dx <- data.table::data.table(id = ids, xmat)
  data.table::setnames(dx, "id", ".id") # changed 8/19

  defnames <- paste0("V", seq(1:chainLen))
  tempnames <- paste0(".V", seq(1:chainLen))
  data.table::setnames(dx, defnames, tempnames)

  dx <- merge(dd, dx, by.x = id, by.y = ".id")

  if (wide == TRUE) {
    defnames <- paste0(".V", seq(1:chainLen))
    newnames <- paste0(widePrefix, seq(1:chainLen))
    data.table::setnames(dx, defnames, newnames)
    setkeyv(dx, id)
  } else { # wide = FALSE, so long format

    dx <- data.table::melt(dx,
      id.vars = names(dd),
      value.name = varname, variable.factor = TRUE
    )

    dx[, variable := as.integer(variable)]
    data.table::setnames(dx, "variable", pername)
    setkeyv(dx, id)

    if (!is.null(trimvalue)) {
      dx[, .e := as.integer(get(varname) == trimvalue)]
      dx <- trimData(dx, pername, eventvar = ".e", id)
      dx[, .e := NULL]
    }
  }

  dx[]
}

#' Add multi-factorial data
#'
#' @param dtOld data.table that is to be modified
#' @param nFactors Number of factors (columns) to generate.
#' @param levels Vector or scalar. If a vector is specified, it must be
#' the same length as nFatctors. Each value of the vector represents the
#' number of levels of each corresponding factor. If a scalar is specified,
#' each factor will have the same number of levels. The default is 2 levels
#' for each factor.
#' @param coding String value to specify if "dummy" or "effect" coding is used.
#' Defaults to "dummy".
#' @param colNames A vector of strings, with a length of nFactors. The strings
#' represent the name for each factor.
#' @return A data.table that contains the added simulated data. Each new column contains
#' an integer.
#' @examples
#' defD <- defData(varname = "x", formula = 0, variance = 1)
#'
#' DT <- genData(360, defD)
#' DT <- addMultiFac(DT, nFactors = 3, levels = c(2, 3, 3), colNames = c("A", "B", "C"))
#' DT
#' DT[, .N, keyby = .(A, B, C)]
#'
#' DT <- genData(300, defD)
#' DT <- addMultiFac(DT, nFactors = 3, levels = 2)
#' DT[, .N, keyby = .(Var1, Var2, Var3)]
#' @export
#' @concept generate_data
addMultiFac <- function(dtOld, nFactors, levels = 2, coding = "dummy", colNames = NULL) {

  # 'declare' vars
  count <- NULL

  if (nFactors < 2) stop("Must specify at least 2 factors")
  if (length(levels) > 1 & (length(levels) != nFactors)) stop("Number of levels does not match factors")

  if (is.null(colNames)) {
    cn <- paste0("Var", 1:nFactors)
    if (any(cn %in% names(dtOld))) stop("Default column name(s) already in use")
  } else {
    if (any(colNames %in% names(dtOld))) stop("At least one column name already in use")
  }

  if (length(levels) == 1) {
    combos <- prod(rep(levels, nFactors))
  } else {
    combos <- prod(levels)
  }

  each <- ceiling(nrow(dtOld) / combos)
  extra <- nrow(dtOld) %% combos

  x <- list()

  if (all(levels == 2)) {
    if (coding == "effect") {
      opts <- c(-1, 1)
    } else if (coding == "dummy") {
      opts <- c(0, 1)
    } else {
      stop("Need to specify 'effect' or 'dummy' coding")
    }

    for (i in 1:nFactors) {
      x[[i]] <- opts
    }
  } else {
    if (length(levels) == 1) levels <- rep(levels, nFactors)

    for (i in 1:nFactors) x[[i]] <- c(1:levels[i])
  }

  dnew <- data.table(as.data.frame(lapply(
    expand.grid(x),
    function(x) rep(x, each = each)
  )))
  dnew[, count := rep(c(1:each), length.out = .N)]
  neworder <- sample(1:nrow(dnew), nrow(dnew), replace = FALSE)
  dnew <- dnew[neworder]

  if (extra > 0) {
    full <- dnew[count < each]
    partial <- dnew[count == each][1:extra]

    all <- rbind(full, partial)
  } else {
    all <- copy(dnew)
  }

  all <- all[, -"count"]

  if (!is.null(colNames)) setnames(all, colNames)

  origNames <- copy(names(all))
  dreturn <- cbind(dtOld, all)

  return(dreturn[])
}

#' Add synthetic data
#' @title Add synthetic data to existing data set
#' @description This function generates synthetic data from an existing 
#' data.table and adds it to another (simstudy) data.table.
#' @param dtOld data.table that is to be modified
#' @param dtFrom Data table that contains the source data
#' @param vars A vector of string names specifying the fields that will be
#' sampled. The default is that all variables will be selected.
#' @param id A string specifying the field that serves as the record id. The
#' default field is "id".
#' @return A data.table that contains the added synthetic data.
#' @examples
#' ### Create fake "real" data set - this is the source of the synthetic data
#' 
#' d <- defData(varname = "a", formula = 3, variance = 1, dist = "normal")
#' d <- defData(d, varname = "b", formula = 5, dist = "poisson")
#' d <- defData(d, varname = "c", formula = 0.3, dist = "binary")
#' d <- defData(d, varname = "d", formula = "a + b + 3*c", variance = 2, dist = "normal")
#' 
#' ### Create synthetic data set from "observed" data set A (normally this
#' ### would be an actual external data set):
#' 
#' A <- genData(1000, d)
#' 
#' ### Generate new simstudy data set (using 'def')
#' 
#' def <- defData(varname = "x", formula = 0, variance = 5)
#' S <- genData(120, def)
#' 
#' ### Create synthetic data from 'A' and add to simulated data in 'S'
#' 
#' S <- addSynthetic(dtOld = S, dtFrom = A, vars = c("b", "d"))
#' @export
#' @concept generate_data
addSynthetic <- function(dtOld, dtFrom, 
  vars = NULL, id = "id") {
  
  assertNotMissing(
    dtOld = missing(dtOld),
    dtFrom = missing(dtFrom),
    call = sys.call(-1)
  )
  
  assertClass(
    dtOld = dtOld,
    dtFrom = dtFrom, 
    class = "data.table",
    call = sys.call(-1)
  )
  
  if (is.null(vars)) { vars <- names(dtFrom)[names(dtFrom) != id] }
  
  assertInDataTable(vars = id, dt = dtOld)
  assertInDataTable(vars = id, dt = dtFrom)
  assertNotInDataTable(vars = vars, dt = dtOld)
  
  n <- nrow(dtOld)
  dS <- genSynthetic(dtFrom = dtFrom, n = n, vars = vars, id = id)
  dS <- dtOld[dS, on = id]
  dS[]
  
}

Try the simstudy package in your browser

Any scripts or data that you put into this service are public.

simstudy documentation built on Nov. 23, 2023, 1:06 a.m.