library("vtreat") packageVersion("vtreat") useParallel <- TRUE mkEx <- function(n_rows, n_cat_columns, n_num_columns, n_irrel_columns, n_cat_levels_a, n_cat_levels_b) { n_cols <- n_cat_columns + n_num_columns + n_irrel_columns + 2 d <- as.data.frame(matrix(data = rnorm(n_rows * n_cols), nrow = n_rows, ncol = n_cols)) cat_names <- NULL num_names <- NULL irrel_names <- NULL if(n_cat_columns>0) { cat_names <- paste0('var_cat_', seq_len(n_cat_columns)) } if(n_num_columns>0) { num_names <- paste0('var_num_', seq_len(n_num_columns)) } if(n_irrel_columns>0) { irrel_names <- paste0('irrel_', seq_len(n_irrel_columns)) } y_names <- c("yC", "yN") colnames(d) <- c(cat_names, num_names, irrel_names, y_names) d$yC <- ifelse(d$yC>=0, "Y", "N") levels_a <- paste0("lev_a_", seq_len(n_cat_levels_a)) levels_b <- NULL if(n_cat_levels_b>0) { levels_b <- paste0("lev_b_", seq_len(n_cat_levels_b)) } for(ci in cat_names) { a_set <- rep(TRUE, n_rows) if(n_cat_levels_b>0) { a_set <- runif(n_rows)>=0.5 } na <- sum(a_set) nb <- n_rows - na if(na>0) { d[[ci]][a_set] <- sample(levels_a, na, replace = TRUE) } if(nb>0) { d[[ci]][!a_set] <- sample(levels_b, nb, replace = TRUE) } } d } parallelCluster <- NULL if(useParallel) { ncores <- parallel::detectCores() parallelCluster <- parallel::makeCluster(ncores) } n_rows <- 500000
Get a base timing of a moderately large task.
d <- mkEx(n_rows = n_rows, n_cat_columns = 2, n_num_columns = 2, n_irrel_columns = 10, n_cat_levels_a = 5, n_cat_levels_b = 0) yName <- "yC" yTarget <- "Y" varNames <- colnames(d)[grep("^var", colnames(d))] system.time( tplan <- vtreat::mkCrossFrameCExperiment( d, varNames, yName, yTarget, parallelCluster = parallelCluster)) knitr::kable(tplan$treatments$scoreFrame)
Measure the effect of irrelevant columns.
d <- mkEx(n_rows = n_rows, n_cat_columns = 2, n_num_columns = 2, n_irrel_columns = 300, n_cat_levels_a = 5, n_cat_levels_b = 0) yName <- "yC" yTarget <- "Y" varNames <- colnames(d)[grep("^var", colnames(d))] system.time( tplan <- vtreat::mkCrossFrameCExperiment( d, varNames, yName, yTarget, parallelCluster = parallelCluster)) knitr::kable(tplan$treatments$scoreFrame)
Measure the effect of more levels (both common and uncommon).
d <- mkEx(n_rows = n_rows, n_cat_columns = 2, n_num_columns = 2, n_irrel_columns = 10, n_cat_levels_a = 10, n_cat_levels_b = 50000) yName <- "yC" yTarget <- "Y" varNames <- colnames(d)[grep("^var", colnames(d))] system.time( tplan <- vtreat::mkCrossFrameCExperiment( d, varNames, yName, yTarget, parallelCluster = parallelCluster)) knitr::kable(tplan$treatments$scoreFrame)
See if it is the indicators.
Measure the effect of more levels (both common and uncommon).
d <- mkEx(n_rows = n_rows, n_cat_columns = 2, n_num_columns = 2, n_irrel_columns = 10, n_cat_levels_a = 10, n_cat_levels_b = 50000) yName <- "yC" yTarget <- "Y" varNames <- colnames(d)[grep("^var", colnames(d))] system.time( tplan <- vtreat::mkCrossFrameCExperiment( d, varNames, yName, yTarget, minFraction = 2.0, parallelCluster = parallelCluster)) knitr::kable(tplan$treatments$scoreFrame)
if(!is.null(parallelCluster)) { parallel::stopCluster(parallelCluster) parallelCluster <- NULL }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.