Conditioner

The Python vtreat steps from "Cross-Methods are a Leak/Variance Trade-Off" re-worked in R vtreat.

For the purpose of each of these steps, please refer to the original article. The purpose of this note is to show the related vtreat steps in R (and how similar they are to the Python steps).

# https://CRAN.R-project.org/package=wrapr
library(wrapr)

# https://CRAN.R-project.org/package=vtreat
library(vtreat)

# https://github.com/WinVector/vtreat/blob/master/Examples/CrossVal/LeakTradeOff/break_cross_val.R
source("break_cross_val.R")

# https://CRAN.R-project.org/package=ggplot2
library(ggplot2)

# Fix the random seed so the simulated example is reproducible.
set.seed(2020)

# Simulate one training sample. mk_data() is presumably supplied by the
# sourced break_cross_val.R; the `d` and `y` entries of its result are
# unpacked into two separate variables.
unpack[d_example_s = d, y_example_s = y] <- mk_data(
  nrow = 100,
  n_noise_var = 10,
  n_noise_level = 50,
  n_signal_var = 5
)

# Attach the outcome as a column named "y" so it can be referred to by name.
d_example_s[["y"]] <- y_example_s

# Preview the first few rows as a table.
d_example_s %.>%
  head(.) %.>%
  knitr::kable(.)

# https://github.com/WinVector/vtreat/blob/master/Examples/Regression/Regression_FP.md

# Describe the treatment: numeric outcome "y", every other column offered as
# an input, and the produced encodings restricted to the 'catN' code.
vtreat_coder <- vtreat::NumericOutcomeTreatment(
  outcome_name = "y",
  var_list = setdiff(colnames(d_example_s), "y"),
  params = vtreat::regression_parameters(list(codeRestriction = "catN"))
)

# Fit the treatment on the example data, keeping both the fitted plan
# (`treatments`) and the cross-frame (`cross_frame`) from the result.
unpack[treatment_plan = treatments, vtreat_cross_frame = cross_frame] <-
  fit_prepare(vtreat_coder, d_example_s)

# Preview the frame of cross-encoded variables.
vtreat_cross_frame %.>%
  head(.) %.>%
  knitr::kable(.)

# the per-variable score frame
cols_to_display <- c(
  "varName", "origName", "rsq", "sig", "code",
  "default_threshold", "recommended"
)

sf <- treatment_plan$get_score_frame()

knitr::kable(sf[, cols_to_display, drop = FALSE])

# show conditional distribution of estimated significances

# Variable type is the part of the original column name before the first "_".
sf$variable_type <- gsub("_.*$", "", sf$origName)

# One facet per variable type, each with its own y-axis range. The
# facet_wrap() argument is spelled `scales` in full rather than relying on
# partial argument matching. The x aesthetic is the `sig` column, so the
# title names significance.
ggplot(data = sf,
       aes(x = sig, fill = variable_type)) +
  geom_histogram(aes(y = ..density..), bins = 10, alpha = 0.5) +
  geom_line(aes(y = ..density.., color = variable_type), stat = "density") +
  facet_wrap(~ variable_type, ncol = 1, scales = "free_y") +
  ggtitle("distribution of estimated significance grouped by variable type")

# Fit an ordinary linear model on the re-encoded (cross-frame) data.
# The formula has "y" as the outcome and every other cross-frame column as
# an explanatory variable.
formula <- wrapr::mk_formula(
  "y",
  setdiff(colnames(vtreat_cross_frame), "y")
)
print(formula)

good_fit <- lm(formula, data = vtreat_cross_frame)
summary(good_fit)

# Draw a fresh test sample, apply the already-fit treatment plan and linear
# model to it, and return the R2 reported by sigr::wrapFTest() for the
# predictions against the true outcome.
#
# Any arguments are accepted and ignored, which lets f be used directly as a
# vapply() callback. Reads the globals `treatment_plan` and `good_fit`.
f <- function(...) {
  # Same generator settings as the training example.
  unpack[test_x = d, test_y = y] <- mk_data(
    nrow = 100,
    n_noise_var = 10,
    n_noise_level = 50,
    n_signal_var = 5
  )

  # Encode the new data with the existing plan, then score it with the model.
  scored <- prepare(treatment_plan, test_x)
  scored$y <- test_y
  scored$pred <- predict(good_fit, newdata = scored)

  sigr::wrapFTest(scored, "pred", "y")$R2
}


# R-squared from each of 100 independent test draws (one call to f per draw).
test_r2 <- vapply(seq_len(100), f, numeric(1))

df <- data.frame(R2 = test_r2)

# Histogram with a density curve overlaid, both on the density scale.
ggplot(df, aes(x = R2)) +
  geom_histogram(
    aes(y = ..density..),
    bins = 10,
    fill = "blue",
    alpha = 0.5
  ) +
  geom_line(aes(y = ..density..), stat = "density", color = "blue") +
  ggtitle("distribution of test R2 under repeated draws")