Win-Vector LLC 2/15/2018
library("vtreat")
packageVersion("vtreat")
## [1] '1.0.3'
# Switch for the timing runs: when TRUE a local cluster is created further
# down and handed to vtreat through its `parallelCluster` argument.
useParallel <- TRUE
#' Build a synthetic data frame for the vtreat timing experiments.
#'
#' All columns start out as independent standard-normal draws. The
#' categorical columns are then overwritten with level names sampled with
#' replacement, and `yC` is turned into a two-valued "Y"/"N" outcome.
#'
#' @param n_rows Number of rows to generate.
#' @param n_cat_columns Number of categorical columns (`var_cat_1`, ...).
#' @param n_num_columns Number of numeric columns (`var_num_1`, ...).
#' @param n_irrel_columns Number of extra numeric columns (`irrel_1`, ...).
#' @param n_cat_levels_a Size of the first level pool (`lev_a_1`, ...).
#' @param n_cat_levels_b Size of the second level pool (`lev_b_1`, ...).
#'   When positive, each row of each categorical column takes its level from
#'   pool a or pool b according to a `runif() >= 0.5` draw; when zero only
#'   pool a is used.
#' @return A data.frame with the categorical, numeric and irrelevant columns
#'   (in that order) followed by `yC` ("Y"/"N") and the numeric `yN`.
mkEx <- function(n_rows,
                 n_cat_columns, n_num_columns, n_irrel_columns,
                 n_cat_levels_a, n_cat_levels_b) {
  # Numbered names for one family of columns/levels; NULL for an empty family.
  numbered <- function(prefix, count) {
    if (count > 0) paste0(prefix, seq_len(count)) else NULL
  }

  # Two extra columns hold the outcomes yC and yN.
  total_cols <- n_cat_columns + n_num_columns + n_irrel_columns + 2
  noise <- rnorm(n_rows * total_cols)
  d <- as.data.frame(matrix(data = noise, nrow = n_rows, ncol = total_cols))

  cat_cols <- numbered("var_cat_", n_cat_columns)
  colnames(d) <- c(
    cat_cols,
    numbered("var_num_", n_num_columns),
    numbered("irrel_", n_irrel_columns),
    "yC", "yN"
  )

  # Threshold the noise at zero to get the classification outcome.
  d$yC <- ifelse(d$yC >= 0, "Y", "N")

  # Pool a is built unconditionally; pool b only when it was requested.
  pool_a <- paste0("lev_a_", seq_len(n_cat_levels_a))
  pool_b <- numbered("lev_b_", n_cat_levels_b)
  use_pool_b <- n_cat_levels_b > 0

  for (col in cat_cols) {
    # Rows marked TRUE take a level from pool a, the rest from pool b.
    from_a <- if (use_pool_b) runif(n_rows) >= 0.5 else rep(TRUE, n_rows)
    count_a <- sum(from_a)
    count_b <- n_rows - count_a
    if (count_a > 0) {
      d[[col]][from_a] <- sample(pool_a, count_a, replace = TRUE)
    }
    if (count_b > 0) {
      d[[col]][!from_a] <- sample(pool_b, count_b, replace = TRUE)
    }
  }

  d
}
# Optional parallel back end. `parallelCluster` stays NULL when `useParallel`
# is FALSE; otherwise a local cluster with one worker per detected core is
# started.
parallelCluster <- NULL
if (useParallel) {
  ncores <- parallel::detectCores()
  # detectCores() is documented to return NA when the core count cannot be
  # determined; makeCluster() cannot start from that, so use a single worker.
  if (is.na(ncores) || ncores < 1L) {
    ncores <- 1L
  }
  parallelCluster <- parallel::makeCluster(ncores)
}

# Row count used for every synthetic data set generated below.
n_rows <- 2000000
Get a base timing of a moderately large task.
# Baseline data: two categorical and two numeric variables, ten irrelevant
# columns, five levels per categorical variable, no second level pool.
d <- mkEx(
  n_rows = n_rows,
  n_cat_columns = 2,
  n_num_columns = 2,
  n_irrel_columns = 10,
  n_cat_levels_a = 5,
  n_cat_levels_b = 0
)
yName <- "yC"
yTarget <- "Y"
# Only columns whose name starts with "var" are given to vtreat; the irrel_*
# columns stay in `d` but are not listed as variables.
varNames <- grep("^var", colnames(d), value = TRUE)
system.time({
  tplan <- vtreat::mkCrossFrameCExperiment(
    d, varNames, yName, yTarget,
    parallelCluster = parallelCluster
  )
})
## user system elapsed
## 168.960 18.400 349.123
knitr::kable(tplan$treatments$scoreFrame)
| varName | varMoves | rsq| sig| needsSplit | extraModelDegrees| origName | code |
|:------------------------------|:---------|--------:|----------:|:-----------|------------------:|:------------|:------|
| var_cat_1_catP | TRUE | 0.0e+00| 0.9009002| TRUE | 4| var_cat_1 | catP |
| var_cat_1_catB | TRUE | 1.9e-06| 0.0229055| TRUE | 4| var_cat_1 | catB |
| var_cat_2_catP | TRUE | 0.0e+00| 0.7288237| TRUE | 4| var_cat_2 | catP |
| var_cat_2_catB | TRUE | 1.4e-06| 0.0509270| TRUE | 4| var_cat_2 | catB |
| var_num_1_clean | TRUE | 2.8e-06| 0.0053804| FALSE | 0| var_num_1 | clean |
| var_num_2_clean | TRUE | 0.0e+00| 0.8547693| FALSE | 0| var_num_2 | clean |
| var_cat_1_lev_x.lev_a_1 | TRUE | 0.0e+00| 0.8974030| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_2 | TRUE | 0.0e+00| 0.7900162| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_3 | TRUE | 3.0e-07| 0.3603173| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_4 | TRUE | 3.5e-06| 0.0018322| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_5 | TRUE | 1.2e-06| 0.0707593| FALSE | 0| var_cat_1 | lev |
| var_cat_2_lev_x.lev_a_1 | TRUE | 0.0e+00| 0.9510616| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_2 | TRUE | 8.0e-07| 0.1346614| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_3 | TRUE | 5.0e-07| 0.2197869| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_4 | TRUE | 1.3e-06| 0.0571191| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_5 | TRUE | 1.8e-06| 0.0256345| FALSE | 0| var_cat_2 | lev |
Measure the effect of irrelevant columns.
# Same variables and level counts as the baseline run, but with 100
# irrelevant columns instead of 10.
d <- mkEx(
  n_rows = n_rows,
  n_cat_columns = 2,
  n_num_columns = 2,
  n_irrel_columns = 100,
  n_cat_levels_a = 5,
  n_cat_levels_b = 0
)
yName <- "yC"
yTarget <- "Y"
# The irrel_* columns are present in `d` but are not listed as variables.
varNames <- grep("^var", colnames(d), value = TRUE)
system.time({
  tplan <- vtreat::mkCrossFrameCExperiment(
    d, varNames, yName, yTarget,
    parallelCluster = parallelCluster
  )
})
## user system elapsed
## 170.460 19.340 339.995
knitr::kable(tplan$treatments$scoreFrame)
| varName | varMoves | rsq| sig| needsSplit | extraModelDegrees| origName | code |
|:------------------------------|:---------|--------:|----------:|:-----------|------------------:|:------------|:------|
| var_cat_1_catP | TRUE | 0.0e+00| 0.9667085| TRUE | 4| var_cat_1 | catP |
| var_cat_1_catB | TRUE | 1.7e-06| 0.0302954| TRUE | 4| var_cat_1 | catB |
| var_cat_2_catP | TRUE | 1.0e-07| 0.6423609| TRUE | 4| var_cat_2 | catP |
| var_cat_2_catB | TRUE | 0.0e+00| 0.8919384| TRUE | 4| var_cat_2 | catB |
| var_num_1_clean | TRUE | 1.0e-07| 0.5618168| FALSE | 0| var_num_1 | clean |
| var_num_2_clean | TRUE | 0.0e+00| 0.7254961| FALSE | 0| var_num_2 | clean |
| var_cat_1_lev_x.lev_a_1 | TRUE | 0.0e+00| 0.7633458| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_2 | TRUE | 1.0e-07| 0.7063396| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_3 | TRUE | 0.0e+00| 0.9856348| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_4 | TRUE | 1.4e-06| 0.0516303| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_5 | TRUE | 2.5e-06| 0.0082229| FALSE | 0| var_cat_1 | lev |
| var_cat_2_lev_x.lev_a_1 | TRUE | 3.0e-07| 0.3690146| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_2 | TRUE | 1.0e-07| 0.6876347| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_3 | TRUE | 1.0e-07| 0.7067748| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_4 | TRUE | 1.0e-07| 0.6948953| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_5 | TRUE | 6.0e-07| 0.1878351| FALSE | 0| var_cat_2 | lev |
# Drop the references to the data and the treatment plan, then run the
# garbage collector before the next experiment.
d <- tplan <- NULL
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 446145 23.9 1508222 80.6 2945748 157.4
## Vcells 2844488 21.8 573857663 4378.2 716997619 5470.3
Measure the effect of more levels (both common and uncommon).
# Categorical variables now draw from two pools: 10 levels in pool a and
# 50000 levels in pool b, with each row picking a pool by a coin flip.
d <- mkEx(
  n_rows = n_rows,
  n_cat_columns = 2,
  n_num_columns = 2,
  n_irrel_columns = 10,
  n_cat_levels_a = 10,
  n_cat_levels_b = 50000
)
yName <- "yC"
yTarget <- "Y"
# The irrel_* columns are present in `d` but are not listed as variables.
varNames <- grep("^var", colnames(d), value = TRUE)
system.time({
  tplan <- vtreat::mkCrossFrameCExperiment(
    d, varNames, yName, yTarget,
    parallelCluster = parallelCluster
  )
})
## user system elapsed
## 228.568 25.516 445.929
knitr::kable(tplan$treatments$scoreFrame)
| varName | varMoves | rsq| sig| needsSplit | extraModelDegrees| origName | code |
|:-------------------------------|:---------|--------:|----------:|:-----------|------------------:|:------------|:------|
| var_cat_1_catP | TRUE | 1.0e-07| 0.5365186| TRUE | 50009| var_cat_1 | catP |
| var_cat_1_catB | TRUE | 1.0e-07| 0.5412263| TRUE | 50009| var_cat_1 | catB |
| var_cat_2_catP | TRUE | 9.0e-07| 0.1183884| TRUE | 50009| var_cat_2 | catP |
| var_cat_2_catB | TRUE | 1.6e-06| 0.0360525| TRUE | 50009| var_cat_2 | catB |
| var_num_1_clean | TRUE | 3.0e-07| 0.3435448| FALSE | 0| var_num_1 | clean |
| var_num_2_clean | TRUE | 2.0e-07| 0.4481928| FALSE | 0| var_num_2 | clean |
| var_cat_1_lev_x.lev_a_1 | TRUE | 2.0e-07| 0.4506758| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_10 | TRUE | 3.0e-07| 0.3302589| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_2 | TRUE | 6.0e-07| 0.1804519| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_3 | TRUE | 1.0e-07| 0.6931947| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_4 | TRUE | 1.8e-06| 0.0264180| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_5 | TRUE | 7.0e-07| 0.1785925| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_6 | TRUE | 1.0e-07| 0.5360518| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_7 | TRUE | 0.0e+00| 0.7648720| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_8 | TRUE | 4.0e-07| 0.2906581| FALSE | 0| var_cat_1 | lev |
| var_cat_1_lev_x.lev_a_9 | TRUE | 3.0e-07| 0.3594626| FALSE | 0| var_cat_1 | lev |
| var_cat_2_lev_x.lev_a_1 | TRUE | 0.0e+00| 0.7183708| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_10 | TRUE | 2.0e-07| 0.4245337| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_2 | TRUE | 1.0e-07| 0.6788411| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_3 | TRUE | 0.0e+00| 0.8151242| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_4 | TRUE | 1.0e-07| 0.6457424| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_5 | TRUE | 2.0e-07| 0.5014217| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_6 | TRUE | 1.0e-07| 0.5968353| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_7 | TRUE | 1.4e-06| 0.0525395| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_8 | TRUE | 1.0e-07| 0.5537246| FALSE | 0| var_cat_2 | lev |
| var_cat_2_lev_x.lev_a_9 | TRUE | 0.0e+00| 0.9167634| FALSE | 0| var_cat_2 | lev |
# Drop the references to the data and the treatment plan, then run the
# garbage collector before the next experiment.
d <- tplan <- NULL
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 446191 23.9 1761155 94.1 2945748 157.4
## Vcells 2845031 21.8 208192011 1588.4 716997619 5470.3
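Check whether the indicator variables account for the extra run time.
Repeat the more-levels experiment (both common and uncommon levels), this time with `minFraction = 2.0` so that no level is frequent enough to receive its own indicator column.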
# Same data shape as the more-levels run: 10 levels in pool a and 50000
# levels in pool b for each categorical variable.
d <- mkEx(
  n_rows = n_rows,
  n_cat_columns = 2,
  n_num_columns = 2,
  n_irrel_columns = 10,
  n_cat_levels_a = 10,
  n_cat_levels_b = 50000
)
yName <- "yC"
yTarget <- "Y"
# The irrel_* columns are present in `d` but are not listed as variables.
varNames <- grep("^var", colnames(d), value = TRUE)
system.time({
  tplan <- vtreat::mkCrossFrameCExperiment(
    d, varNames, yName, yTarget,
    # A level frequency can never reach 2.0, so this setting is expected to
    # suppress every per-level indicator column (the score frame printed
    # below contains no `lev` rows).
    minFraction = 2.0,
    parallelCluster = parallelCluster
  )
})
## user system elapsed
## 105.892 6.592 187.766
knitr::kable(tplan$treatments$scoreFrame)
| varName | varMoves | rsq| sig| needsSplit | extraModelDegrees| origName | code |
|:-------------------|:---------|--------:|----------:|:-----------|------------------:|:------------|:------|
| var_cat_1_catP | TRUE | 1.0e-07| 0.5950522| TRUE | 50009| var_cat_1 | catP |
| var_cat_1_catB | TRUE | 1.0e-06| 0.0888238| TRUE | 50009| var_cat_1 | catB |
| var_cat_2_catP | TRUE | 0.0e+00| 0.8322227| TRUE | 50009| var_cat_2 | catP |
| var_cat_2_catB | TRUE | 0.0e+00| 0.9061606| TRUE | 50009| var_cat_2 | catB |
| var_num_1_clean | TRUE | 0.0e+00| 0.8226784| FALSE | 0| var_num_1 | clean |
| var_num_2_clean | TRUE | 1.1e-06| 0.0757819| FALSE | 0| var_num_2 | clean |
# Drop the references to the data and the treatment plan, then run the
# garbage collector.
d <- tplan <- NULL
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 446193 23.9 1637824 87.5 2945748 157.4
## Vcells 2845234 21.8 122937441 938.0 716997619 5470.3
# Shut down the worker processes if a cluster was started, and clear the
# handle so it cannot be used again.
if (!is.null(parallelCluster)) {
  parallel::stopCluster(parallelCluster)
  parallelCluster <- NULL
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.