extras/LargeExample.md

Large Example

Win-Vector LLC 2/17/2018

vtreat is an R 'data.frame' processor/conditioner that prepares real-world data for predictive modeling in a statistically sound manner. vtreat prepares variables so that data has fewer exceptional cases, making it easier to safely use models in production. Common problems vtreat defends against: 'Inf', 'NA', too many categorical levels, rare categorical levels, and new categorical levels (levels seen during application, but not during training). Reference: "vtreat: a data.frame Processor for Predictive Modeling", Zumel, Mount, 2016, DOI:10.5281/zenodo.1173314.

vtreat::mkCrossFrameCExperiment() and vtreat::mkCrossFrameNExperiment() are the most statistically efficient methods vtreat supplies, so we often recommend them, especially for data with few rows. However, they have non-trivial computational cost. For data with very many rows we suggest the more computationally efficient vtreat::designTreatmentsC() and vtreat::designTreatmentsN().

Here is an example (based on vtreat issue 12, perf tests, and perf tests2).

Example data (in this case no variables are truly related to the outcome to be predicted).

# Load vtreat and print the installed version used for this run.
library("vtreat")
packageVersion("vtreat")
## [1] '1.0.3'
# When TRUE, a parallel cluster is created further down and passed to the
# vtreat calls; when FALSE, parallelCluster stays NULL.
useParallel <- TRUE

# Build a synthetic data.frame for the timing experiment.
#
# Every column starts out as independent rnorm() noise. The categorical
# columns are then overwritten with level names drawn uniformly at random,
# so no column is constructed from the outcomes.
#
# Arguments:
#   n_rows           number of rows to generate.
#   n_cat_columns    number of categorical columns ("var_cat_<i>").
#   n_num_columns    number of numeric columns ("var_num_<i>").
#   n_irrel_columns  number of extra numeric columns ("irrel_<i>").
#   n_cat_levels_a   size of the first level pool ("lev_a_<i>").
#   n_cat_levels_b   size of the second level pool ("lev_b_<i>"); when it is
#                    not positive, only the first pool is used.
#
# Returns a data.frame with the columns above followed by "yC" (character,
# "Y" when the underlying noise is >= 0, otherwise "N") and "yN" (numeric).
mkEx <- function(n_rows,
                 n_cat_columns, n_num_columns, n_irrel_columns,
                 n_cat_levels_a, n_cat_levels_b) {
  # "<prefix>1" .. "<prefix>k", or NULL when k is not positive.
  numbered_names <- function(prefix, k) {
    if (k > 0) {
      paste0(prefix, seq_len(k))
    } else {
      NULL
    }
  }

  # Two extra columns hold the outcomes yC and yN.
  total_cols <- n_cat_columns + n_num_columns + n_irrel_columns + 2
  noise <- matrix(data = rnorm(n_rows * total_cols),
                  nrow = n_rows, ncol = total_cols)
  frame <- as.data.frame(noise)

  cat_cols <- numbered_names("var_cat_", n_cat_columns)
  num_cols <- numbered_names("var_num_", n_num_columns)
  irrel_cols <- numbered_names("irrel_", n_irrel_columns)
  colnames(frame) <- c(cat_cols, num_cols, irrel_cols, c("yC", "yN"))

  # Categorical outcome: the sign of the noise in the yC column.
  frame[["yC"]] <- ifelse(frame[["yC"]] >= 0, "Y", "N")

  pool_a <- paste0("lev_a_", seq_len(n_cat_levels_a))
  pool_b <- numbered_names("lev_b_", n_cat_levels_b)

  for (cat_col in cat_cols) {
    # With a second pool available, each row picks pool a or b by a fair
    # coin flip; otherwise every row uses pool a.
    from_a <- if (n_cat_levels_b > 0) {
      runif(n_rows) >= 0.5
    } else {
      rep(TRUE, n_rows)
    }
    count_a <- sum(from_a)
    count_b <- n_rows - count_a
    if (count_a > 0) {
      frame[[cat_col]][from_a] <- sample(pool_a, count_a, replace = TRUE)
    }
    if (count_b > 0) {
      frame[[cat_col]][!from_a] <- sample(pool_b, count_b, replace = TRUE)
    }
  }

  frame
}

# Cluster handle handed to the vtreat calls; it stays NULL unless
# useParallel is TRUE.
parallelCluster <- NULL
if (useParallel) {
  # One worker per core reported by detectCores().
  ncores <- parallel::detectCores()
  parallelCluster <- parallel::makeCluster(spec = ncores)
}

# Number of rows in the synthetic example data (two million).
n_rows <- 2e6

Convert the large cardinality categorical variables into new single column sub-models ready to be used in later modeling.

# Synthetic data: two categorical columns (each drawing from 10 "a" levels
# and 50000 "b" levels), two numeric columns and ten "irrel_" columns.
d <- mkEx(
  n_rows = n_rows,
  n_cat_columns = 2,
  n_num_columns = 2,
  n_irrel_columns = 10,
  n_cat_levels_a = 10,
  n_cat_levels_b = 50000
)

# Outcome column and the outcome value passed to designTreatmentsC as target.
yName <- "yC"
yTarget <- "Y"

# Candidate input variables: every column whose name starts with "var".
varNames <- grep("^var", colnames(d), value = TRUE)

# Treatment code types the design is restricted to.
codeTargets <- c("catB", "catP", "clean", "isBAD")

# Time the design step and the prepare step together.
system.time({
  # Random two-way split: group 1 is used to design the treatments and
  # group 2 is the data the treatments are applied to.
  splitGroup <- sample.int(2, nrow(d), replace = TRUE)

  tplan <- vtreat::designTreatmentsC(
    d[splitGroup == 1, , drop = FALSE],
    varNames,
    yName,
    yTarget,
    codeRestriction = codeTargets,
    parallelCluster = parallelCluster
  )

  scoreFrame <- tplan[["scoreFrame"]]
  print(scoreFrame[, c("varName", "rsq", "sig", "extraModelDegrees", "origName")])

  # All derived variables are kept here. A significance filter is the
  # commented-out alternative:
  # newVars <- scoreFrame$varName[scoreFrame$sig < 1/nrow(scoreFrame)]
  newVars <- scoreFrame[["varName"]]

  trainFrame <- vtreat::prepare(
    tplan,
    d[splitGroup == 2, , drop = FALSE],
    varRestriction = newVars,
    parallelCluster = parallelCluster
  )
})
## [1] "designing treatments Sat Feb 17 06:08:54 2018"
## [1] "designing treatments Sat Feb 17 06:08:54 2018"
## [1] " have initial level statistics Sat Feb 17 06:08:58 2018"
## [1] " scoring treatments Sat Feb 17 06:09:11 2018"
## [1] "have treatment plan Sat Feb 17 06:09:29 2018"
## [1] "rescoring complex variables Sat Feb 17 06:09:29 2018"
## [1] "done rescoring complex variables Sat Feb 17 06:10:18 2018"
##           varName          rsq        sig extraModelDegrees  origName
## 1  var_cat_1_catP 3.714442e-09 0.94281274             50005 var_cat_1
## 2  var_cat_1_catB 4.046618e-06 0.01789792             50005 var_cat_1
## 3  var_cat_2_catP 3.124530e-07 0.51058560             50005 var_cat_2
## 4  var_cat_2_catB 4.306401e-07 0.43987781             50005 var_cat_2
## 5 var_num_1_clean 9.964966e-07 0.24001112                 0 var_num_1
## 6 var_num_2_clean 1.724072e-06 0.12223092                 0 var_num_2

##    user  system elapsed 
##  30.953   2.600  88.468
# If a cluster was started, stop it and reset the handle to NULL.
if(!is.null(parallelCluster)) {
  parallel::stopCluster(parallelCluster)
  parallelCluster <- NULL
}


WinVector/vtreat documentation built on Aug. 29, 2023, 4:49 a.m.