dm
Packagedata.frame
s into relational databasedm
is great so why are we here?dm
assumes data already existsWhen simulating artificial data
respectables
tbl_df
containing the following list columns:
variables
-variables to generatedependencies
- variable depenedenciesfunc
- function to generatate datafunc_args
- [list] arguments for func
keep
- [logical] should variables
be keptrespectables
determines run order based on dependency declarationID
- unique customer IDSTUFFLEVEL
- randomly sampled from high
and low
valuesKEYSIZE
- normally distributed with mean depending on STUFFLEVEL
ACCT_OPEN
- date account was openedACCT_CLOSED
- date account was closed (or NA if still open)keysize <- function(n, .df, low = 20, high = 50) { levmean <- ifelse(.df$STUFFLEVEL == "high", high, low) rnorm(nrow(.df), mean = levmean, sd = 15) } keysize(.df = data.frame(STUFFLEVEL = rep("high", 2)))
acct_dates <- function(n, firstopen, lastclose = Sys.Date(), p_closed = .15) { starts <- rand_posixct(n = n, start = firstopen, end = lastclose) ends <- rand_posixct(n = n, start = starts, end = lastclose) if(p_closed > 0) { nainds <- sample(seq_len(n), n - floor(n * p_closed), replace = FALSE) ends[nainds] <- NA } data.frame(ACCT_OPEN=starts, ACCT_CLOSED=ends) } acct_dates(2, firstopen = "2010-01-01")
acctdate_vars <- c("ACCT_OPEN", "ACCT_CLOSED") recipe <- tribble(~variables, ~dependencies, ~func, ~func_args, ~keep, "ID", no_deps, "subjid_func", list(prefix="ID", sep=""), TRUE, "STUFFLEVEL", no_deps, sample_fct, list(x = c("high", "low")), TRUE, "KEYSIZE", "STUFFLEVEL", "keysize", NULL, TRUE, acctdate_vars, no_deps, "acct_dates", list(firstopen = "2015-01-01", lastclose = Sys.Date()), TRUE)
sillydata <- gen_table_data(N = 500, recipe = recipe) head(sillydata)
dplyr
Waylibrary(dplyr) sillydata2 <- data.frame(ID = subjid_func(n = 500, prefix = "ID", sep = "")) %>% mutate(STUFFLEVEL = sample_fct(n = n(), x = c("high", "low")), KEYSIZE = rnorm(n(), mean = ifelse(STUFFLEVEL == "high", 50, 20), sd = 15), ACCT_OPEN = rand_posixct(n = n(), start = "2015-01-01", end = Sys.Date()), ACCT_CLOSE = rand_posixct(n = n(), start = ACCT_OPEN, end = Sys.Date()), ACCT_CLOSE = na_if(ACCT_CLOSE, runif(n()) <= .15)) head(sillydata2)
recipe2 <- recipe recipe2$func_args[[3]] <- list(high=20, low = 50) sillydata2 <- gen_table_data(N= 500, recipe = recipe2) head(sillydata2)
Sometimes you really do want to jointly generate 2+ variables
mutate()
land anymorefunction(x, ...) cbind(x, real_logic(x, ...))
knitr::include_graphics("scaffold_figure.png")
Scaffolding - Determine dimensions based on foreign table
Data Generation - Normal
rand_per_key
function factorykeyvar
, mincount
, maxcount
, and prop_present
sjfun <- rand_per_key("ID", mincount = 0, maxcount = 5, prop_present = 1) scaff <- sjfun(.dbtab = sillydata) table(table(scaff$ID))
length(unique(scaff$ID))
sjrecipe <- tribble(~foreign_tbl, ~foreign_key, ~func, ~func_args, "sillydata", "ID", sjfun, list())
## lookup table for products and when they were available for purchase products <- tribble(~PRODUCTID, ~DESC, ~AVAIL_ST, ~AVAIL_END, "PROD1", "Doodad", "2010-01-01", Sys.time(), "PROD2", "HLight Fluid", "2010-01-01", Sys.time(), "PROD3", "Mcguffin", "2017-01-01", as.POSIXct("2017-12-31"), "PROD4", "Spanner", "2010-01-01", Sys.time(), "PROD5", "Thingbat", "2015-03-03", Sys.time(), "PROD6", "Thingbat v2", "2018-03-03", Sys.time())
We define buydate_func
which
ID
, ACCT_OPEN
and ACCT_CLOSE
BUYDATE
and BUYID
such thatBUYID
are sequental globallyBUYDATE
are sequential within ID
buycols <- c("BUYID", "BUYDATE") buydate_func <- function(n, .df, end = Sys.time()) { n <- nrow(.df) starts <- .df$ACCT_OPEN ends <- .df$ACCT_CLOSED ends[is.na(end)] <- end dates <- rand_posixct(start = starts, end = ends, n = n) ids <- .df$ID odates <- order(ids, dates) revit <- order(order(dates)) toret <- data.frame(BUYID = seq_len(n)[revit], BUYDATE = dates, stringsAsFactors = FALSE) toret[odates,] }
prods_func
.df
containing BUYDATE
BUYDATE
for each transactionprodcols <- c("PRODUCTID", "DESC") prods_func <- function(n, .df) { n <- NROW(.df) BUYDATE <- .df$BUYDATE pstarts <- as.POSIXct(products$AVAIL_ST) pends <- products$AVAIL_END rows <- lapply(BUYDATE, function(bdt) { pids <- products$PRODUCTID[pstarts <= bdt & bdt <= pends] bought <- sample(pids, 1) products[products$PRODUCTID==bought,] }) do.call(rbind, rows)[,prodcols] }
buycoldeps <- c("ID", acctdate_vars) buys_rec <- tribble(~variables, ~dependencies, ~func, ~func_args, ~keep, buycols, buycoldeps, "buydate_func", NULL, TRUE, prodcols, "BUYDATE", "prods_func", NULL, TRUE) buysdf <- gen_reljoin_table(sjrecipe, buys_rec, db = list(sillydata = sillydata), ) head(buysdf[, c("ID", buycols, prodcols)])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.