Set up.
library("vtreat") packageVersion("vtreat") packageVersion("data.table") # data.table needed for fast row binding in vtreat 1.2.0 and newer. options('vtreat.use_data.table_binding' = TRUE) # vtreat 1.2.0 fails to set this in some cicrumstances getOption('vtreat.use_data.table_binding', FALSE) ncores <- parallel::detectCores() parallelCluster <- parallel::makeCluster(ncores) parallelCluster
Make example.
n_row <- 500000 n_cat_var <- 15 n_cat_var_levels <- c(10, 100, 50000) # mix of cat sizes, smaller ones more likely to create indicators cat_effect_strength <- 0.1 n_num_var <- 15 num_effect_strength <- 0.1 na_rate <- 0.01 set.seed(3252) d <- data.frame(id = seq_len(n_row), group = seq_len(n_row) %% 10, yN = rnorm(n_row)) for(i in seq_len(n_num_var)) { vi <- paste0("nv", i) d[[vi]] <- rnorm(n_row) d[[vi]][runif(n_row)<=na_rate] <- NA_real_ d$yN <- d$yN + num_effect_strength*ifelse(is.na(d[[vi]]), 0.5, d[[vi]]) } for(i in seq_len(n_cat_var)) { vi <- paste0("cv", i) veci <- sample.int(n_cat_var_levels[1 + (i %% length(n_cat_var_levels))], n_row, replace = TRUE) d[[vi]] <- sample(paste0("lev_", veci)) d[[vi]][runif(n_row)<=na_rate] <- NA_character_ d$yN <- d$yN + cat_effect_strength*ifelse(is.na(d[[vi]]), 0.5, ifelse((veci %% 2) == 1, 1, -1)) } d$yC <- ifelse(d$yN>0, "YES", "NO") vars <- setdiff(colnames(d), c("id", "yN", "yC"))
Do the work (and time it).
base::date() system.time( ctpc <- mkCrossFrameCExperiment(d, vars, "yC", "YES", parallelCluster = parallelCluster) ) base::date() system.time( tpc <- designTreatmentsC(d, vars, "yC", "YES", parallelCluster = parallelCluster) ) base::date() system.time( tpc <- designTreatmentsC(d, vars, "yC", "YES") ) base::date()
base::date() system.time( ctpn <- mkCrossFrameNExperiment(d, vars, "yN", parallelCluster = parallelCluster) ) base::date() system.time( tpn <- designTreatmentsN(d, vars, "yN", parallelCluster = parallelCluster) ) base::date() system.time( tpn <- designTreatmentsN(d, vars, "yN") ) base::date()
Note a major cost is production of indicator columns (which leads to a large result). Setting minFraction
to something larger (like 0.1
or 0.2
) can help there.
Some timings of prepare()
. Note: one uses prepare()
on new data, for the
variable design data you use ctpc$crossFrame
to reduce nested model bias.
system.time(r <- prepare(ctpc$treatments, d, extracols = "id")) system.time(r <- prepare(ctpc$treatments, d, extracols = "id", parallelCluster = parallelCluster)) rqplan <- as_rquery_plan(list(ctpc$treatments)) system.time(r <- rqdatatable_prepare(rqplan, d, extracols = "id")) system.time(r <- rqdatatable_prepare(rqplan, d, extracols = "id", partition_column = "group", parallelCluster = parallelCluster)) system.time(r <- rqdatatable_prepare(rqplan, d, extracols = "id", non_join_mapping = TRUE))
Clean up.
parallel::stopCluster(parallelCluster) rm(list = "parallelCluster")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.