R/create_data.R

Defines functions make_dm gen_data_db valid_cookbook inject_nas gen_reljoin_table gen_table_data

Documented in gen_data_db gen_reljoin_table gen_table_data inject_nas

#' @importFrom stats runif setNames
#' @importFrom methods is
#' @importFrom tibble tribble
NULL

#' Sentinel Values for Recipes
#' @export
#' @rdname sentinelS
no_key <- character()
#' @export
#' @rdname sentinelS
no_deps <- character()
#' @export
#' @rdname sentinelS
no_rec <- list()
#' @export
#' @rdname sentinelS
no_args <- list()



#' Generate variables in a table
#'
#'
#' @param N numeric(1). Number of rows to generate. Defaults to 400, or the number of rows in \code{df} if provided.
#' @param recipe tibble. A recipe for generating variables into the dataset. see \code{Details}.
#' @param miss_recipe tibble. A recipe for generating missingness positions, or \code{NULL} (the default).
#' @param df data.frame/tibble. Existing partial data which new variables should be added to, or \code{NULL} (the
#'   default).
#' @param df_keepcols logical. which columns from \code{df} should be retained in the resulting dataset (by position).
#'   Defaults to all columns present in \code{df}.
#'
#' @details
#'
#' The \code{recipe} parameter should be a tibble made up of one or more rows which define variable recipes via the following
#' columns:
#' \describe{
#' \item{variables}{(list column as needed) names of variables generated by that row. No empty/length 0 entries allowed}
#' \item{dependencies}{(list column). Names of variables which must have already been populated for the the variables in this row to be synthesized}
#' \item{func}{(list column) A character value which can be used to look up a function, or the function object itself, which accepts n, .df, and ... and returns either an atomic vector of length \code{n}, or a data.frame/tibble with n rows}
#' \item{func_args}{(list column) a list of arguments which should be passed to \code{func} in addition to \code{n} and \code{.df}}
#' }
#'
#' The algorithm for synthesizing the table from the recipe is as follows:
#' \enumerate{
#' \item{Columns of synthesized data are generated according to recipe rows which have no dependencies in the order they appear in the recipe tibble and appended to the dataset with names for the variables generated}
#' \item{Recipe rows containing dependencies are checked in the order they appear in the recipe table for whether their dependencies are met, and if so data is synthesized for the corresponding variables and added to the dataset. This step is repeated until all recipe rows have been resolved, or until a full pass through the unresolved recipe rows does not lead to any new data synthesis.}
#' \item{After all data synthesis is complete, columns are then reordered based on any columns of \code{df} first, followed by newly synthesized variables in the order the appear in the recipe table's \code{variables} column.}
#' }
#'
#' @examples
#' library(tibble)
#' dat <- cbind(model = row.names(mtcars), as_tibble(mtcars))
#' recipe <- tribble(~variables, ~dependencies, ~func, ~func_args,
#'         "id", no_deps, "seq_n", NULL,
#'         "specialid", "id", function(n, .df) paste0("special-", .df$id), NULL)
#'
#' gen_table_data(10, recipe)
#'
#'@export
gen_table_data <- function(N = if(is.null(df)) 400 else NROW(df),
                           recipe,
                           df = NULL,
                           df_keepcols = if(is.null(df)) character() else names(df),
                           miss_recipe = NULL) {
    if(!("keep" %in% names(recipe))) {
        recipe$keep <- I(lapply(recipe$variables, function(x) rep(TRUE, length(x))))
    }
    ## if(length(df_keepcols) && !is.null(df)) {
    ##     df_keepcols <- names(df)[names(df) %in% unlist(recipe$dependencies)]
    ## }

    ## avoid infinite recursion since currently validate_recipe_deps calls gen_table_data
    if(N > 1 && !all(sapply(recipe$func, identical, noop_func)))
        validate_recipe_deps(recipe, seed_df = df)

    deps <- recipe$dependencies
    nodep <- vapply(deps, function(x) identical(x, no_deps), NA)

    ## do independent variables first
    indepdf <- recipe[nodep,]
    depdf <- recipe[!nodep,]

    if(is.null(df) && NROW(indepdf) == 0)
        stop("No independent recipe components found, unable to generate data.")

    collst <- lapply(seq_len(NROW(indepdf)),
                     function(i) {
        fun <- lookup_fun(indepdf$func[[i]])
        args <- indepdf$func_args[[i]]
        if(!is.null(df))
            args <- c(args, list(.df = df))
        vars <- indepdf$variables[[i]]
        ret <- do.call(fun, c(list(n=N), args))
    })


    patdf <- do.call(cbind.data.frame, c(if(!is.null(df)) list(df), collst))
    names(patdf) <- c(if(!is.null(df)) names(df), unlist(indepdf$variables))

    lastnrow <- nrow(recipe) + 1 ## incase there were no indep rows because we had a seeder df
    ## now generate variables that are contingent on other variables
    while(NROW(depdf) > 0 & NROW(depdf) < lastnrow) {
        ## we pass through the entire (remaining) depdf each time
        lastnrow <- NROW(depdf)
        for(di in 1:NROW(depdf)) {
            if(all(depdf$dependencies[[di]] %in% names(patdf))) {
                func <- depdf$func[[di]]
                args <- depdf$func_args[[di]]
                newcols <- do.call(func, c(list(n = N, .df = patdf), args))
                oldnms <- names(patdf)
                patdf <- cbind.data.frame(patdf, newcols)
                names(patdf) <- c(oldnms, depdf$variables[[di]])
            }
        }
        donerows <- vapply(depdf$variables, function(vn) all(vn %in% names(patdf)), NA)
        ## remove rows we are now done with. if this does not remove any rows,
        ## there is an invalid/circular dependenc
        depdf <- depdf[!donerows,]
    }
    if(NROW(depdf) > 0)
        stop(sprintf("Unable to generate some dependent variables: %s\n\t Some of these likely need to be jointly generated.",
                     paste(unlist(depdf$variables), collapse = ", ")))

    ## put them in the order they appear in the recipe, this ultimately may not be what we want
    patdf <- patdf[,c(names(df), unlist(recipe$variables)), drop = FALSE]
    if("keep" %in% names(recipe)) {
        if(length(df_keepcols) == 0 || all(is.na(df_keepcols)) || isTRUE(df_keepcols)) {
            df_keepcols <- names(df)
        }
        keep <- c(df_keepcols,
                  unlist(mapply(function(vars, keep) {vars[keep]},
                                vars = recipe$variables,
                                keep = recipe$keep,
                                SIMPLIFY=FALSE)))

                  ## unlist(lapply(seq_len(NROW(recipe)),
                  ##                            function(i) {
                  ##                  rep(recipe$keep[[i]], length.out = length(recipe$variables[[i]]))
        ##              })))
        patdf <- patdf[, keep, drop = FALSE]
    }
    if(!is.null(miss_recipe))
        patdf <- inject_nas(patdf, miss_recipe)
    patdf

}


#' Generate synthetic data relationally-linked to existing data
#'
#'
#' @param joinrec tibble. Recipe for synthesizing core/seed data based of a foreign key present in an existing table
#'   within \code{db}
#' @param tblrec tibble. Recipe for generating the remainder of the new table, via \code{\link{gen_table_data}},
#'   building on initial table generated using \code{joinrec}.
#' @param miss_recipe tibble or NULL. A missingness recipe, if desired, to be applied after data generation via
#'   \code{\link{inject_nas}}.
#' @param db list. A named list of existing tibbles/data.frames. The names will be used to resolve foreign table
#'   references in \code{joinrec}.
#' @param keep TODO
#' @return The newly synthesized data table.
#'
#' @details
#' In relational  database terms, this  function synthesizes
#' new data in a table which has a foreign key in a table existing
#' already within  \code{db}. Typically it will  not generate data
#' in the same dimension as the foreign table (as in that case the
#' new  data  could  simply  be  columns  added  to  the  existing
#' table). Instead,  it generally has the  possibility of multiple
#' rows  for a  particular  foreign-key value,  the possibility  a
#' foreign key  value is not present  at all, or both.  A concrete
#' example  of this  is Adverse  Events being  mapped to  patients
#' (USUBJID  in CDISC  terms).  Some patients  will have  multiple
#' adverse events, while many will have none at all.
#'
#' This is done via 3 steps:
#'
#' 1. Applying the relational join recipe. The "relational join recipe"
#' step should be considered primarily as the mechanism  for defining
#' the  \emph{dimensions} of the  new data table.
#'
#' 2. The main data synthesis step, which is done by applying the
#' \code{tblrec} recipe on the scaffolding provided by the newly
#' dimensioned table generated in step 1.
#'
#' 3. Injecting missingness (optional) using \code{missrec}.
#'
#' @seealso \code{\link{reljoin_funcs}}
#' @export
gen_reljoin_table <- function(joinrec, tblrec, miss_recipe = NULL, db, keep = NA_character_) {
    ## fdat <- db[unlist(joinrec$foreign_tbls)]
    func <- lookup_fun(joinrec$func[[1]])
    df <- do.call(joinrec$func[[1]], c(joinrec$func_args[[1]], list(.db = db)))
    ## if(identical(keep, NA_character_))
    ##     keep <- names(df)
    tab <- gen_table_data(recipe = tblrec, df = df, miss_recipe = miss_recipe, df_keepcols = keep)
    tab
}


#' Apply random or systematic missingness to existing data according to recipe
#' @param tbl data.frame/tibble. The already generated data to inject missingness into
#' @param recipe tibble. A recipe for generating missingness positions
#' @details Unlike in data-generation recipes, the \code{func} column in a missingness recipe must return a logical
#'   vector of length n or  an n x k logical matrix, where n is the number of rows in \code{.df} and \code{k} is the
#'   number of variables listed in this row of the recipe. A vector is only allowed when only one variable is listed in
#'   the recipe row (ie \code{k=1}). \code{TRUE} in the return value indicates that position in the column being
#'   processed should be set to missing (\code{NA}), while \code{FALSE} indicates the value already there should remain
#'   unchanged.
#'
#' @return \code{tbl}, with missingness injected into it as laid out by \code{recipe}
#' @examples
#' library(tibble)
#' missrec <- tibble(variables = "wt", func = list(function(.df) rep(c(TRUE, FALSE), times = c(3, NROW(.df) - 3))), func_args = list(NULL))
#' newdat <- inject_nas(mtcars, missrec)
#' head(newdat)
#'
#' @export
inject_nas <- function(tbl, recipe) {
    for(i in seq_len(NROW(recipe))) {
        rw <- recipe[i,]
        missfunc <- lookup_fun(rw$func[[1]])
        funcargs <- rw$func_args[[1]]
        posmatrix <- do.call(missfunc, c(list(.df = tbl), funcargs))
        tbl[,rw$variables[[1]]][posmatrix] <- NA

        ## for(var in rw$variables[[1]]) {
        ##     inds <-
        ##     tbl[inds, var] <- NA
        ## }
    }
    tbl
}




valid_cookbook <- function(recs) {
    stopifnot(is(recs, "data.frame") &&
              identical(names(recs),
                        c("table", "scaff_rec", "data_rec", "na_rec")))
    TRUE
}

#' Generate full db from a Cookbook of Recipes
#'
#' @param cbook tribble. Columns: table, scaff_ref, data_rec, na_rec
#' @param db list. Named list of already existing tables
#' @param ns named numeric. Ns for use when generating independent tables.
#' @return A named list of tables of generated data.
#' @export
gen_data_db <- function(cbook, db = list(), ns = setNames(rep(500, NROW(cbook)), cbook$table)) {
    valid_cookbook(cbook)
    cbook$dependencies <- lapply(cbook$scaff_rec,
                                 function(x) {
        if(NROW(x) > 0)
            x$foreign_tbl[[1]]
        else
            no_deps
    })


    nodep <- vapply(cbook$dependencies,
                    function(x) identical(x, no_deps),
                    NA)

    ## do independent variables first
    indepdf <- cbook[nodep,]
    depdf <- cbook[!nodep,]

    if(any(nodep)) {
        indeptabs <- lapply(seq_len(nrow(indepdf)),
                            function(i) {
            cbrow <- indepdf[i,]
            tname <- cbrow$table
            gen_table_data(N = ns[tname],
                           recipe = cbrow$data_rec[[i]],
                           miss_recipe = cbrow$na_rec[[i]])
        })
        names(indeptabs) <- indepdf$table
        db <- c(db, indeptabs)
    }

    lastnrow <- nrow(cbook) + 1 ## incase there were no indep rows because we had a seeder df
    ## now generate variables that are contingent on other variables
    while(NROW(depdf) > 0 & NROW(depdf) < lastnrow) {
        ## we pass through the entire (remaining) depdf each time
        lastnrow <- NROW(depdf)
        for(di in 1:NROW(depdf)) {
            if(all(depdf$dependencies[[di]] %in% names(db))) {
                db[[ depdf$table[[di]] ]] <- gen_reljoin_table(depdf$scaff_rec[[di]],
                                                       depdf$data_rec[[di]],
                                                       depdf$na_rec[[di]],
                                                       db = db)
            }
        }
        donerows <- depdf$table %in% names(db)
        ## remove rows we are now done with. if this does not remove any rows,
        ## there is an invalid/circular dependenc
        depdf <- depdf[!donerows,]
    }
    if(NROW(depdf) > 0)
        stop(sprintf("Unable to generate some dependent variables: %s\n\t Some of these likely need to be jointly generated.",
                     paste(unlist(depdf$table), collapse = ", ")))

    if(requireNamespace("dm", quietly = TRUE)) {
        db <- make_dm(db, cbook)
    }
    db
}



make_dm <- function(db, cbook) {
    if(!requireNamespace("dm", quielty = TRUE)) {
        stop("This requires the dm package, please install it")
    }
    retdm <- dm::as_dm(db)

    keep <- !vapply(cbook$scaff_rec, identical, NA, y = no_rec)
    cbook <- cbook[keep,]

    for(i in seq_len(NROW(cbook))) {
        scaff <- cbook$scaff_rec[[i]]
        reft <- scaff$foreign_tbl
        key <- scaff$foreign_key
        ft <- cbook$table[[i]]
        retdm <- dm::dm_add_pk(retdm,
                           !!reft, !!key)
        retdm <- dm::dm_add_fk(retdm, !!ft, !!key, !!reft, !!key)
    }
    retdm
}
Roche/respectables documentation built on Oct. 2, 2024, 8:57 p.m.