#' @importFrom stats runif setNames
#' @importFrom methods is
#' @importFrom tibble tribble
NULL
#' Sentinel Values for Recipes
#' @export
#' @rdname sentinelS
no_key <- character()
#' @export
#' @rdname sentinelS
no_deps <- character()
#' @export
#' @rdname sentinelS
no_rec <- list()
#' @export
#' @rdname sentinelS
no_args <- list()
#' Generate variables in a table
#'
#'
#' @param N numeric(1). Number of rows to generate. Defaults to 400, or the number of rows in \code{df} if provided.
#' @param recipe tibble. A recipe for generating variables into the dataset. see \code{Details}.
#' @param miss_recipe tibble. A recipe for generating missingness positions, or \code{NULL} (the default).
#' @param df data.frame/tibble. Existing partial data which new variables should be added to, or \code{NULL} (the
#' default).
#' @param df_keepcols logical. which columns from \code{df} should be retained in the resulting dataset (by position).
#' Defaults to all columns present in \code{df}.
#'
#' @details
#'
#' The \code{recipe} parameter should be a tibble made up of one or more rows which define variable recipes via the following
#' columns:
#' \describe{
#' \item{variables}{(list column as needed) names of variables generated by that row. No empty/length 0 entries allowed}
#' \item{dependencies}{(list column). Names of variables which must have already been populated for the the variables in this row to be synthesized}
#' \item{func}{(list column) A character value which can be used to look up a function, or the function object itself, which accepts n, .df, and ... and returns either an atomic vector of length \code{n}, or a data.frame/tibble with n rows}
#' \item{func_args}{(list column) a list of arguments which should be passed to \code{func} in addition to \code{n} and \code{.df}}
#' }
#'
#' The algorithm for synthesizing the table from the recipe is as follows:
#' \enumerate{
#' \item{Columns of synthesized data are generated according to recipe rows which have no dependencies in the order they appear in the recipe tibble and appended to the dataset with names for the variables generated}
#' \item{Recipe rows containing dependencies are checked in the order they appear in the recipe table for whether their dependencies are met, and if so data is synthesized for the corresponding variables and added to the dataset. This step is repeated until all recipe rows have been resolved, or until a full pass through the unresolved recipe rows does not lead to any new data synthesis.}
#' \item{After all data synthesis is complete, columns are then reordered based on any columns of \code{df} first, followed by newly synthesized variables in the order the appear in the recipe table's \code{variables} column.}
#' }
#'
#' @examples
#' library(tibble)
#' dat <- cbind(model = row.names(mtcars), as_tibble(mtcars))
#' recipe <- tribble(~variables, ~dependencies, ~func, ~func_args,
#' "id", no_deps, "seq_n", NULL,
#' "specialid", "id", function(n, .df) paste0("special-", .df$id), NULL)
#'
#' gen_table_data(10, recipe)
#'
#'@export
gen_table_data <- function(N = if(is.null(df)) 400 else NROW(df),
recipe,
df = NULL,
df_keepcols = if(is.null(df)) character() else names(df),
miss_recipe = NULL) {
if(!("keep" %in% names(recipe))) {
recipe$keep <- I(lapply(recipe$variables, function(x) rep(TRUE, length(x))))
}
## if(length(df_keepcols) && !is.null(df)) {
## df_keepcols <- names(df)[names(df) %in% unlist(recipe$dependencies)]
## }
## avoid infinite recursion since currently validate_recipe_deps calls gen_table_data
if(N > 1 && !all(sapply(recipe$func, identical, noop_func)))
validate_recipe_deps(recipe, seed_df = df)
deps <- recipe$dependencies
nodep <- vapply(deps, function(x) identical(x, no_deps), NA)
## do independent variables first
indepdf <- recipe[nodep,]
depdf <- recipe[!nodep,]
if(is.null(df) && NROW(indepdf) == 0)
stop("No independent recipe components found, unable to generate data.")
collst <- lapply(seq_len(NROW(indepdf)),
function(i) {
fun <- lookup_fun(indepdf$func[[i]])
args <- indepdf$func_args[[i]]
if(!is.null(df))
args <- c(args, list(.df = df))
vars <- indepdf$variables[[i]]
ret <- do.call(fun, c(list(n=N), args))
})
patdf <- do.call(cbind.data.frame, c(if(!is.null(df)) list(df), collst))
names(patdf) <- c(if(!is.null(df)) names(df), unlist(indepdf$variables))
lastnrow <- nrow(recipe) + 1 ## incase there were no indep rows because we had a seeder df
## now generate variables that are contingent on other variables
while(NROW(depdf) > 0 & NROW(depdf) < lastnrow) {
## we pass through the entire (remaining) depdf each time
lastnrow <- NROW(depdf)
for(di in 1:NROW(depdf)) {
if(all(depdf$dependencies[[di]] %in% names(patdf))) {
func <- depdf$func[[di]]
args <- depdf$func_args[[di]]
newcols <- do.call(func, c(list(n = N, .df = patdf), args))
oldnms <- names(patdf)
patdf <- cbind.data.frame(patdf, newcols)
names(patdf) <- c(oldnms, depdf$variables[[di]])
}
}
donerows <- vapply(depdf$variables, function(vn) all(vn %in% names(patdf)), NA)
## remove rows we are now done with. if this does not remove any rows,
## there is an invalid/circular dependenc
depdf <- depdf[!donerows,]
}
if(NROW(depdf) > 0)
stop(sprintf("Unable to generate some dependent variables: %s\n\t Some of these likely need to be jointly generated.",
paste(unlist(depdf$variables), collapse = ", ")))
## put them in the order they appear in the recipe, this ultimately may not be what we want
patdf <- patdf[,c(names(df), unlist(recipe$variables)), drop = FALSE]
if("keep" %in% names(recipe)) {
if(length(df_keepcols) == 0 || all(is.na(df_keepcols)) || isTRUE(df_keepcols)) {
df_keepcols <- names(df)
}
keep <- c(df_keepcols,
unlist(mapply(function(vars, keep) {vars[keep]},
vars = recipe$variables,
keep = recipe$keep,
SIMPLIFY=FALSE)))
## unlist(lapply(seq_len(NROW(recipe)),
## function(i) {
## rep(recipe$keep[[i]], length.out = length(recipe$variables[[i]]))
## })))
patdf <- patdf[, keep, drop = FALSE]
}
if(!is.null(miss_recipe))
patdf <- inject_nas(patdf, miss_recipe)
patdf
}
#' Generate synthetic data relationally-linked to existing data
#'
#'
#' @param joinrec tibble. Recipe for synthesizing core/seed data based of a foreign key present in an existing table
#' within \code{db}
#' @param tblrec tibble. Recipe for generating the remainder of the new table, via \code{\link{gen_table_data}},
#' building on initial table generated using \code{joinrec}.
#' @param miss_recipe tibble or NULL. A missingness recipe, if desired, to be applied after data generation via
#' \code{\link{inject_nas}}.
#' @param db list. A named list of existing tibbles/data.frames. The names will be used to resolve foreign table
#' references in \code{joinrec}.
#' @param keep TODO
#' @return The newly synthesized data table.
#'
#' @details
#' In relational database terms, this function synthesizes
#' new data in a table which has a foreign key in a table existing
#' already within \code{db}. Typically it will not generate data
#' in the same dimension as the foreign table (as in that case the
#' new data could simply be columns added to the existing
#' table). Instead, it generally has the possibility of multiple
#' rows for a particular foreign-key value, the possibility a
#' foreign key value is not present at all, or both. A concrete
#' example of this is Adverse Events being mapped to patients
#' (USUBJID in CDISC terms). Some patients will have multiple
#' adverse events, while many will have none at all.
#'
#' This is done via 3 steps:
#'
#' 1. Applying the relational join recipe. The "relational join recipe"
#' step should be considered primarily as the mechanism for defining
#' the \emph{dimensions} of the new data table.
#'
#' 2. The main data synthesis step, which is done by applying the
#' \code{tblrec} recipe on the scaffolding provided by the newly
#' dimensioned table generated in step 1.
#'
#' 3. Injecting missingness (optional) using \code{missrec}.
#'
#' @seealso \code{\link{reljoin_funcs}}
#' @export
gen_reljoin_table <- function(joinrec, tblrec, miss_recipe = NULL, db, keep = NA_character_) {
## fdat <- db[unlist(joinrec$foreign_tbls)]
func <- lookup_fun(joinrec$func[[1]])
df <- do.call(joinrec$func[[1]], c(joinrec$func_args[[1]], list(.db = db)))
## if(identical(keep, NA_character_))
## keep <- names(df)
tab <- gen_table_data(recipe = tblrec, df = df, miss_recipe = miss_recipe, df_keepcols = keep)
tab
}
#' Apply random or systematic missingness to existing data according to recipe
#' @param tbl data.frame/tibble. The already generated data to inject missingness into
#' @param recipe tibble. A recipe for generating missingness positions
#' @details Unlike in data-generation recipes, the \code{func} column in a missingness recipe must return a logical
#' vector of length n or an n x k logical matrix, where n is the number of rows in \code{.df} and \code{k} is the
#' number of variables listed in this row of the recipe. A vector is only allowed when only one variable is listed in
#' the recipe row (ie \code{k=1}). \code{TRUE} in the return value indicates that position in the column being
#' processed should be set to missing (\code{NA}), while \code{FALSE} indicates the value already there should remain
#' unchanged.
#'
#' @return \code{tbl}, with missingness injected into it as laid out by \code{recipe}
#' @examples
#' library(tibble)
#' missrec <- tibble(variables = "wt", func = list(function(.df) rep(c(TRUE, FALSE), times = c(3, NROW(.df) - 3))), func_args = list(NULL))
#' newdat <- inject_nas(mtcars, missrec)
#' head(newdat)
#'
#' @export
inject_nas <- function(tbl, recipe) {
for(i in seq_len(NROW(recipe))) {
rw <- recipe[i,]
missfunc <- lookup_fun(rw$func[[1]])
funcargs <- rw$func_args[[1]]
posmatrix <- do.call(missfunc, c(list(.df = tbl), funcargs))
tbl[,rw$variables[[1]]][posmatrix] <- NA
## for(var in rw$variables[[1]]) {
## inds <-
## tbl[inds, var] <- NA
## }
}
tbl
}
valid_cookbook <- function(recs) {
stopifnot(is(recs, "data.frame") &&
identical(names(recs),
c("table", "scaff_rec", "data_rec", "na_rec")))
TRUE
}
#' Generate full db from a Cookbook of Recipes
#'
#' @param cbook tribble. Columns: table, scaff_ref, data_rec, na_rec
#' @param db list. Named list of already existing tables
#' @param ns named numeric. Ns for use when generating independent tables.
#' @return A named list of tables of generated data.
#' @export
gen_data_db <- function(cbook, db = list(), ns = setNames(rep(500, NROW(cbook)), cbook$table)) {
valid_cookbook(cbook)
cbook$dependencies <- lapply(cbook$scaff_rec,
function(x) {
if(NROW(x) > 0)
x$foreign_tbl[[1]]
else
no_deps
})
nodep <- vapply(cbook$dependencies,
function(x) identical(x, no_deps),
NA)
## do independent variables first
indepdf <- cbook[nodep,]
depdf <- cbook[!nodep,]
if(any(nodep)) {
indeptabs <- lapply(seq_len(nrow(indepdf)),
function(i) {
cbrow <- indepdf[i,]
tname <- cbrow$table
gen_table_data(N = ns[tname],
recipe = cbrow$data_rec[[i]],
miss_recipe = cbrow$na_rec[[i]])
})
names(indeptabs) <- indepdf$table
db <- c(db, indeptabs)
}
lastnrow <- nrow(cbook) + 1 ## incase there were no indep rows because we had a seeder df
## now generate variables that are contingent on other variables
while(NROW(depdf) > 0 & NROW(depdf) < lastnrow) {
## we pass through the entire (remaining) depdf each time
lastnrow <- NROW(depdf)
for(di in 1:NROW(depdf)) {
if(all(depdf$dependencies[[di]] %in% names(db))) {
db[[ depdf$table[[di]] ]] <- gen_reljoin_table(depdf$scaff_rec[[di]],
depdf$data_rec[[di]],
depdf$na_rec[[di]],
db = db)
}
}
donerows <- depdf$table %in% names(db)
## remove rows we are now done with. if this does not remove any rows,
## there is an invalid/circular dependenc
depdf <- depdf[!donerows,]
}
if(NROW(depdf) > 0)
stop(sprintf("Unable to generate some dependent variables: %s\n\t Some of these likely need to be jointly generated.",
paste(unlist(depdf$table), collapse = ", ")))
if(requireNamespace("dm", quietly = TRUE)) {
db <- make_dm(db, cbook)
}
db
}
make_dm <- function(db, cbook) {
if(!requireNamespace("dm", quielty = TRUE)) {
stop("This requires the dm package, please install it")
}
retdm <- dm::as_dm(db)
keep <- !vapply(cbook$scaff_rec, identical, NA, y = no_rec)
cbook <- cbook[keep,]
for(i in seq_len(NROW(cbook))) {
scaff <- cbook$scaff_rec[[i]]
reft <- scaff$foreign_tbl
key <- scaff$foreign_key
ft <- cbook$table[[i]]
retdm <- dm::dm_add_pk(retdm,
!!reft, !!key)
retdm <- dm::dm_add_fk(retdm, !!ft, !!key, !!reft, !!key)
}
retdm
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.