add.surrogate.key = function(x, idcol){
nm = copy(names(x))
if(missing(idcol)) idcol = paste(strsplit(nm[1L], "_", fixed = TRUE)[[1L]][1L], "id", sep="_") else stopifnot(is.character(idcol), length(idcol)==1L)
qj = as.call(lapply(c(":=", idcol, ".I"), as.symbol))
x[, eval(qj), c(nm[1L])]
setkeyv(setcolorder(x, c(idcol, nm)), idcol)[]
}
#' @title Populate star schema tables
#' @param N integer count of rows in fact table before sub-aggregation to all dimensions.
#' @param Y integer vector of year range (scalar or length 2) to generate \emph{time} dimension, default \code{c(2010L, 2014L)} results in 365 dim cardinality.
#' @param surrogate.keys logical if integer sequence column should be used or the lowest granularity natural key.
#' @param hierarchies logical default FALSE, if TRUE the third element in list will be returned with hierarchy as list of character column names. List can be used when creating data.cube.
#' @param seed integer used for \code{set.seed} when producing fact table from dimensions. Default fixed to \code{1L}.
#' @description Populates example sales data based on \emph{mtcars}, \emph{state}, \emph{HairEyeColor} datasets.
#' @return List of two list named \emph{fact} and \emph{dims}. The \emph{fact} list keeps single fact data.table sub-aggregated to all dimensions. The \emph{dims} list keeps five dimension data.tables.
populate_star = function(N = 1e5L, Y = c(2010L,2014L), surrogate.keys = FALSE, hierarchies = TRUE, seed = 1L) {
stopifnot(length(Y) %in% 1:2)
`.` = prod_name = cyl = vs = am = gear = Hair = Eye = Sex = time_date = time_month = time_quarter = NULL
# produce dimension based on mtcars dataset
product = as.data.table(datasets::mtcars, keep.rownames = "prod_name")[, .(prod_name, prod_cyl = as.integer(cyl), prod_vs = as.integer(vs), prod_am = as.integer(am), prod_gear = as.integer(gear))]
# customer dimension based on HairEyeColor dataset
customer = as.data.table.array(datasets::HairEyeColor)[, .(cust_profile = as.character(.I), cust_hair = Hair, cust_eye = Eye, cust_sex = Sex)]
# geography dimension based on state.* dataset
geography = data.table(geog_abb = datasets::state.abb, geog_name = datasets::state.name, geog_division_name = as.character(datasets::state.division), geog_region_name = as.character(datasets::state.region))
# time dimension based on sequence of Dates
monthday = c("01-01","12-31")
dates = as.Date(paste(sort(Y), monthday, sep="-"))
time = data.table(time_date = seq(dates[1L], dates[2L], by = "1 day"))
time[,`:=`(time_weekday = weekdays(time_date),
time_week = week(time_date),
time_month = month(time_date),
time_quarter = as.POSIXlt(time_date)$mon %/% 3L + 1L,
time_year = year(time_date))
][,`:=`(time_month_name = month.name[time_month],
time_quarter_name = paste0("Q",time_quarter))
]
setcolorder(time, c("time_date", "time_weekday", "time_week", "time_month", "time_month_name", "time_quarter", "time_quarter_name", "time_year"))
# currency taken from: https://github.com/jangorecki/Rbitcoin/blob/cf142d34e8018d135a526998aeddc4e28d548f5c/R/dictionaries.R#L139
ct.dict = list(
crypto = c('BTC','LTC','NMC','FTC','NVC','PPC','TRC','XPM','XDG','XRP','XVN'),
fiat = c('USD','EUR','GBP','KRW','PLN','RUR','JPY','CHF','CAD','AUD','NZD','CNY','INR',
'TRY','SYP','GEL','AZN','IRR','KZT','NOK','SEK','ISK','MYR','DKK','BGN','HRK',
'CZK','HUF','LTL','RON','UAH','IDR','IQD','MNT','BRL','ARS','VEF','MXN')
)
currency = rbindlist(lapply(1:length(ct.dict), function(i) data.table(curr_name = ct.dict[[i]], curr_type = names(ct.dict[i]))))
# wrap dimensions into list
dims = list(product=product, customer=customer, currency=currency, geography=geography, time=time)
# add surrogate keys if needed
if(surrogate.keys){
lapply(dims, add.surrogate.key)
}
# add key on first col
lapply(dims, function(x) setkeyv(x, names(x)[1L]))
# build fact table
keys = sapply(dims, key)
set.seed(seed)
sales = setDT(lapply(setNames(names(keys), keys), function(dim) sample(dims[[dim]][[1L]], N, TRUE)))
sales[, `:=`(amount = round(runif(N, max=1e2), 2),
value = round(runif(N,max=1e4),4))]
sales = sales[, lapply(.SD, sum),, c(keys)]
# - [x] metadata for data.cube class
if(isTRUE(hierarchies)){
if(isTRUE(surrogate.keys)) stop("Hierarchies with surrogate keys not available.")
hierarchies = list(
product = list(
list(
"prod_vs" = character(),
"prod_am" = character(),
"prod_gear" = character(),
"prod_cyl" = character(),
"prod_name" = c("prod_cyl","prod_vs","prod_am","prod_gear")
)
),
customer = list(
list(
"cust_sex" = character(),
"cust_hair" = character(),
"cust_eye" = character(),
"cust_profile" = c("cust_eye","cust_hair","cust_sex")
)
),
currency = list(
list(
"curr_type" = character(),
"curr_name" = c("curr_type")
)
),
geography = list(
list(
"geog_region_name" = character(),
"geog_division_name" = c("geog_region_name"),
"geog_abb" = c("geog_name","geog_division_name","geog_region_name")
)
),
time = list(
"monthly" = list(
"time_year" = character(),
"time_quarter" = c("time_quarter_name"),
"time_month" = c("time_month_name"),
"time_date" = c("time_month","time_quarter","time_year")
),
"weekly" = list(
"time_year" = character(),
"time_week" = character(),
"time_weekday" = character(),
"time_date" = c("time_weekday","time_week","time_year")
)
)
)
list(fact = list(sales=sales), dims = dims, hierarchies = hierarchies)
} else {
list(fact = list(sales=sales), dims = dims)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.