Set up.

library("vtreat")
packageVersion("vtreat")
packageVersion("data.table") # data.table needed for fast row binding in vtreat 1.2.0 and newer.
options('vtreat.use_data.table_binding' = TRUE) # vtreat 1.2.0 fails to set this in some cicrumstances

getOption('vtreat.use_data.table_binding', FALSE)

ncores <- parallel::detectCores()
parallelCluster <- parallel::makeCluster(ncores)
parallelCluster

Make example.

n_row <- 500000
n_cat_var <- 15
n_cat_var_levels <- c(10, 100, 50000) # mix of cat sizes, smaller ones more likely to create indicators
cat_effect_strength <- 0.1
n_num_var <- 15 
num_effect_strength <- 0.1
na_rate <- 0.01

set.seed(3252)
d <- data.frame(id = seq_len(n_row),
                group = seq_len(n_row) %% 10,
                yN = rnorm(n_row))
for(i in seq_len(n_num_var)) {
  vi <- paste0("nv", i)
  d[[vi]] <- rnorm(n_row)
  d[[vi]][runif(n_row)<=na_rate] <- NA_real_
  d$yN <- d$yN + num_effect_strength*ifelse(is.na(d[[vi]]), 0.5, d[[vi]])
}
for(i in seq_len(n_cat_var)) {
  vi <- paste0("cv", i)
  veci <- sample.int(n_cat_var_levels[1 + (i %% length(n_cat_var_levels))], 
                     n_row, 
                     replace = TRUE)
  d[[vi]] <- sample(paste0("lev_", veci))
  d[[vi]][runif(n_row)<=na_rate] <- NA_character_
  d$yN <- d$yN + cat_effect_strength*ifelse(is.na(d[[vi]]), 0.5, ifelse((veci %% 2) == 1, 1, -1))
}
d$yC <- ifelse(d$yN>0, "YES", "NO")
vars <- setdiff(colnames(d), c("id", "yN", "yC"))

Do the work (and time it).

base::date()
system.time(
  ctpc <- mkCrossFrameCExperiment(d, vars, "yC", "YES",
                                  parallelCluster = parallelCluster)
)

base::date()

system.time(
  tpc <- designTreatmentsC(d, vars, "yC", "YES",
                           parallelCluster = parallelCluster)
)

base::date()

system.time(
  tpc <- designTreatmentsC(d, vars, "yC", "YES")
)

base::date()
base::date()
system.time(
  ctpn <- mkCrossFrameNExperiment(d, vars, "yN", 
                                  parallelCluster = parallelCluster)
)

base::date()

system.time(
  tpn <- designTreatmentsN(d, vars, "yN",
                           parallelCluster = parallelCluster)
)

base::date()

system.time(
  tpn <- designTreatmentsN(d, vars, "yN")
)

base::date()

Note a major cost is production of indicator columns (which leads to a large result). Setting minFraction to something larger (like 0.1 or 0.2) can help there.


Some timings of prepare(). Note: one uses prepare() on new data, for the variable design data you use ctpc$crossFrame to reduce nested model bias.

system.time(r <- prepare(ctpc$treatments, d, 
                         extracols = "id"))

system.time(r <- prepare(ctpc$treatments, d, 
                         extracols = "id", 
                         parallelCluster = parallelCluster))

rqplan <- as_rquery_plan(list(ctpc$treatments))

system.time(r <- rqdatatable_prepare(rqplan, d, 
                                     extracols = "id"))

system.time(r <- rqdatatable_prepare(rqplan, d, 
                                     extracols = "id",
                                     partition_column = "group",
                                     parallelCluster = parallelCluster))

system.time(r <- rqdatatable_prepare(rqplan, d, 
                                     extracols = "id",
                                     non_join_mapping = TRUE))

Clean up.

parallel::stopCluster(parallelCluster)
rm(list = "parallelCluster")


WinVector/vtreat documentation built on Aug. 29, 2023, 4:49 a.m.