extras/rquery_vtreat.md

rquery vtreat

John Mount, Win-Vector LLC 2018-08-01

vtreat transforms can be hosted on rquery. This allows transforms at scale.

library("vtreat")

eval_examples <- requireNamespace("rquery", quietly = TRUE)
eval_rqdt <- eval_examples && requireNamespace("rqdatatable", quietly = TRUE)
eval_db <- eval_examples &&
  requireNamespace("DBI", quietly = TRUE) &&
  requireNamespace("RSQLite", quietly = TRUE)
db <- NULL
if(eval_db) {
  db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
}

Classification example.

dTrainC <- data.frame(x= c('a', 'a', 'a', 'b' ,NA , 'b'),
                      z= c(1, 2, NA, 4, 5, 6),
                      y= c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE),
                      stringsAsFactors = FALSE)
dTrainC$id <- seq_len(nrow(dTrainC))
treatmentsC <- designTreatmentsC(dTrainC, c("x", "z"), 'y', TRUE)
## [1] "vtreat 1.3.1 inspecting inputs Wed Aug  1 18:44:44 2018"
## [1] "designing treatments Wed Aug  1 18:44:44 2018"
## [1] " have initial level statistics Wed Aug  1 18:44:44 2018"
## [1] " scoring treatments Wed Aug  1 18:44:44 2018"
## [1] "have treatment plan Wed Aug  1 18:44:44 2018"
## [1] "rescoring complex variables Wed Aug  1 18:44:44 2018"
## [1] "done rescoring complex variables Wed Aug  1 18:44:44 2018"
prepare(treatmentsC, dTrainC) %.>%
  knitr::kable(.)

| x_catP| x_catB| z_clean| z_isBAD| x_lev_NA| x_lev_x_a| x_lev_x_b| y | |----------:|-----------:|---------:|---------:|-----------:|-------------:|-------------:|:------| | 0.5000000| -0.6930972| 1.0| 0| 0| 1| 0| FALSE | | 0.5000000| -0.6930972| 2.0| 0| 0| 1| 0| FALSE | | 0.5000000| -0.6930972| 3.6| 1| 0| 1| 0| TRUE | | 0.3333333| 0.0000000| 4.0| 0| 0| 0| 1| FALSE | | 0.1666667| 9.2104404| 5.0| 0| 1| 0| 0| TRUE | | 0.3333333| 0.0000000| 6.0| 0| 0| 0| 1| TRUE |

rqplan <- as_rquery_plan(list(treatmentsC))
source_data <- rquery::rq_copy_to(db, "dTrainC", dTrainC,
                                  overwrite = TRUE, temporary = TRUE)

rest <- rquery_prepare(db, rqplan, source_data, "dTreatedC",
                       extracols = "id")
resd <- DBI::dbReadTable(db, rest$table_name)
resd  %.>%
  knitr::kable(.)

| id| x_catB| x_catP| x_lev_NA| x_lev_x_a| x_lev_x_b| y| z_clean| z_isBAD| |----:|-----------:|----------:|-----------:|-------------:|-------------:|----:|---------:|---------:| | 1| -0.6930972| 0.5000000| 0| 1| 0| 0| 1.0| 0| | 2| -0.6930972| 0.5000000| 0| 1| 0| 0| 2.0| 0| | 3| -0.6930972| 0.5000000| 0| 1| 0| 1| 3.6| 1| | 4| 0.0000000| 0.3333333| 0| 0| 1| 0| 4.0| 0| | 5| 9.2104404| 0.1666667| 1| 0| 0| 1| 5.0| 0| | 6| 0.0000000| 0.3333333| 0| 0| 1| 1| 6.0| 0|

rquery::rq_remove_table(db, source_data$table_name)
## [1] TRUE
rquery::rq_remove_table(db, rest$table_name)
## [1] TRUE

Regression example.

dTrainR <- data.frame(x= c('a', 'a', 'a', 'b' ,NA , 'b'),
                      z= c(1, 2, NA, 4, 5, 6),
                      y= as.numeric(c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE)),
                      stringsAsFactors = FALSE)
dTrainR$id <- seq_len(nrow(dTrainR))
treatmentsN <- designTreatmentsN(dTrainR, c("x", "z"), 'y')
## [1] "vtreat 1.3.1 inspecting inputs Wed Aug  1 18:44:45 2018"
## [1] "designing treatments Wed Aug  1 18:44:45 2018"
## [1] " have initial level statistics Wed Aug  1 18:44:45 2018"
## [1] " scoring treatments Wed Aug  1 18:44:45 2018"
## [1] "have treatment plan Wed Aug  1 18:44:45 2018"
## [1] "rescoring complex variables Wed Aug  1 18:44:45 2018"
## [1] "done rescoring complex variables Wed Aug  1 18:44:45 2018"
prepare(treatmentsN, dTrainR)  %.>%
  knitr::kable(.)

| x_catP| x_catN| x_catD| z_clean| z_isBAD| x_lev_NA| x_lev_x_a| x_lev_x_b| y| |----------:|-----------:|----------:|---------:|---------:|-----------:|-------------:|-------------:|----:| | 0.5000000| -0.1666667| 0.5773503| 1.0| 0| 0| 1| 0| 0| | 0.5000000| -0.1666667| 0.5773503| 2.0| 0| 0| 1| 0| 0| | 0.5000000| -0.1666667| 0.5773503| 3.6| 1| 0| 1| 0| 1| | 0.3333333| 0.0000000| 0.7071068| 4.0| 0| 0| 0| 1| 0| | 0.1666667| 0.5000000| 0.7071068| 5.0| 0| 1| 0| 0| 1| | 0.3333333| 0.0000000| 0.7071068| 6.0| 0| 0| 0| 1| 1|

rqplan <- as_rquery_plan(list(treatmentsN))
source_data <- rquery::rq_copy_to(db, "dTrainR", dTrainR,
                                  overwrite = TRUE, temporary = TRUE)

if(FALSE) {
  ops <- rquery_prepare(db, rqplan, source_data, "dTreatedN",
                       extracols = "id", return_ops = TRUE)
  cat(format(ops))
  ops %.>%
    rquery::op_diagram(.) %.>%
    DiagrammeR::grViz(.)
  # sql <- rquery::to_sql(ops, db)
  # cat(sql)
}

rest <- rquery_prepare(db, rqplan, source_data, "dTreatedN",
                       extracols = "id")
resd <- DBI::dbReadTable(db, rest$table_name)
resd %.>%
  knitr::kable(.)

| id| x_catD| x_catN| x_catP| x_lev_NA| x_lev_x_a| x_lev_x_b| y| z_clean| z_isBAD| |----:|----------:|-----------:|----------:|-----------:|-------------:|-------------:|----:|---------:|---------:| | 1| 0.5773503| -0.1666667| 0.5000000| 0| 1| 0| 0| 1.0| 0| | 2| 0.5773503| -0.1666667| 0.5000000| 0| 1| 0| 0| 2.0| 0| | 3| 0.5773503| -0.1666667| 0.5000000| 0| 1| 0| 1| 3.6| 1| | 4| 0.7071068| 0.0000000| 0.3333333| 0| 0| 1| 0| 4.0| 0| | 5| 0.0000000| 0.5000000| 0.1666667| 1| 0| 0| 1| 5.0| 0| | 6| 0.7071068| 0.0000000| 0.3333333| 0| 0| 1| 1| 6.0| 0|

rquery::rq_remove_table(db, source_data$table_name)
## [1] TRUE
rquery::rq_remove_table(db, rest$table_name)
## [1] TRUE

y-free example.

dTrainZ <- data.frame(x= c('a', 'a', 'a', 'b' ,NA , 'b'),
                      z= c(1, 2, NA, 4, 5, 6),
                      stringsAsFactors = FALSE)
dTrainZ$id <- seq_len(nrow(dTrainZ))
treatmentsZ <- designTreatmentsZ(dTrainZ, c("x", "z"))
## [1] "vtreat 1.3.1 inspecting inputs Wed Aug  1 18:44:45 2018"
## [1] "designing treatments Wed Aug  1 18:44:45 2018"
## [1] " have initial level statistics Wed Aug  1 18:44:45 2018"
## [1] " scoring treatments Wed Aug  1 18:44:45 2018"
## [1] "have treatment plan Wed Aug  1 18:44:45 2018"
prepare(treatmentsZ, dTrainZ)  %.>%
  knitr::kable(.)

| x_catP| z_clean| z_isBAD| x_lev_NA| x_lev_x_a| x_lev_x_b| |----------:|---------:|---------:|-----------:|-------------:|-------------:| | 0.5000000| 1.0| 0| 0| 1| 0| | 0.5000000| 2.0| 0| 0| 1| 0| | 0.5000000| 3.6| 1| 0| 1| 0| | 0.3333333| 4.0| 0| 0| 0| 1| | 0.1666667| 5.0| 0| 1| 0| 0| | 0.3333333| 6.0| 0| 0| 0| 1|

rqplan <- as_rquery_plan(list(treatmentsZ))
source_data <- rquery::rq_copy_to(db, "dTrainZ", dTrainZ,
                                  overwrite = TRUE, temporary = TRUE)

rest <- rquery_prepare(db, rqplan, source_data, "dTreatedZ",
                       extracols = "id")
resd <- DBI::dbReadTable(db, rest$table_name)
resd  %.>%
  knitr::kable(.)

| id| x_catP| x_lev_NA| x_lev_x_a| x_lev_x_b| z_clean| z_isBAD| |----:|----------:|-----------:|-------------:|-------------:|---------:|---------:| | 1| 0.5000000| 0| 1| 0| 1.0| 0| | 2| 0.5000000| 0| 1| 0| 2.0| 0| | 3| 0.5000000| 0| 1| 0| 3.6| 1| | 4| 0.3333333| 0| 0| 1| 4.0| 0| | 5| 0.1666667| 1| 0| 0| 5.0| 0| | 6| 0.3333333| 0| 0| 1| 6.0| 0|

rquery::rq_remove_table(db, source_data$table_name)
## [1] TRUE
rquery::rq_remove_table(db, rest$table_name)
## [1] TRUE
if(!is.null(db)) {
  DBI::dbDisconnect(db)
}


WinVector/vtreat documentation built on Aug. 29, 2023, 4:49 a.m.