John Mount, Win-Vector LLC 2018-08-01
vtreat
transforms can be hosted on rquery
. This allows transforms at scale.
library("vtreat")
eval_examples <- requireNamespace("rquery", quietly = TRUE)
eval_rqdt <- eval_examples && requireNamespace("rqdatatable", quietly = TRUE)
eval_db <- eval_examples &&
requireNamespace("DBI", quietly = TRUE) &&
requireNamespace("RSQLite", quietly = TRUE)
db <- NULL
if(eval_db) {
db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
}
Classification example.
dTrainC <- data.frame(x= c('a', 'a', 'a', 'b' ,NA , 'b'),
z= c(1, 2, NA, 4, 5, 6),
y= c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE),
stringsAsFactors = FALSE)
dTrainC$id <- seq_len(nrow(dTrainC))
treatmentsC <- designTreatmentsC(dTrainC, c("x", "z"), 'y', TRUE)
## [1] "vtreat 1.3.1 inspecting inputs Wed Aug 1 18:44:44 2018"
## [1] "designing treatments Wed Aug 1 18:44:44 2018"
## [1] " have initial level statistics Wed Aug 1 18:44:44 2018"
## [1] " scoring treatments Wed Aug 1 18:44:44 2018"
## [1] "have treatment plan Wed Aug 1 18:44:44 2018"
## [1] "rescoring complex variables Wed Aug 1 18:44:44 2018"
## [1] "done rescoring complex variables Wed Aug 1 18:44:44 2018"
prepare(treatmentsC, dTrainC) %.>%
knitr::kable(.)
| x_catP| x_catB| z_clean| z_isBAD| x_lev_NA| x_lev_x_a| x_lev_x_b| y | |----------:|-----------:|---------:|---------:|-----------:|-------------:|-------------:|:------| | 0.5000000| -0.6930972| 1.0| 0| 0| 1| 0| FALSE | | 0.5000000| -0.6930972| 2.0| 0| 0| 1| 0| FALSE | | 0.5000000| -0.6930972| 3.6| 1| 0| 1| 0| TRUE | | 0.3333333| 0.0000000| 4.0| 0| 0| 0| 1| FALSE | | 0.1666667| 9.2104404| 5.0| 0| 1| 0| 0| TRUE | | 0.3333333| 0.0000000| 6.0| 0| 0| 0| 1| TRUE |
rqplan <- as_rquery_plan(list(treatmentsC))
source_data <- rquery::rq_copy_to(db, "dTrainC", dTrainC,
overwrite = TRUE, temporary = TRUE)
rest <- rquery_prepare(db, rqplan, source_data, "dTreatedC",
extracols = "id")
resd <- DBI::dbReadTable(db, rest$table_name)
resd %.>%
knitr::kable(.)
| id| x_catB| x_catP| x_lev_NA| x_lev_x_a| x_lev_x_b| y| z_clean| z_isBAD| |----:|-----------:|----------:|-----------:|-------------:|-------------:|----:|---------:|---------:| | 1| -0.6930972| 0.5000000| 0| 1| 0| 0| 1.0| 0| | 2| -0.6930972| 0.5000000| 0| 1| 0| 0| 2.0| 0| | 3| -0.6930972| 0.5000000| 0| 1| 0| 1| 3.6| 1| | 4| 0.0000000| 0.3333333| 0| 0| 1| 0| 4.0| 0| | 5| 9.2104404| 0.1666667| 1| 0| 0| 1| 5.0| 0| | 6| 0.0000000| 0.3333333| 0| 0| 1| 1| 6.0| 0|
rquery::rq_remove_table(db, source_data$table_name)
## [1] TRUE
rquery::rq_remove_table(db, rest$table_name)
## [1] TRUE
Regression example.
dTrainR <- data.frame(x= c('a', 'a', 'a', 'b' ,NA , 'b'),
z= c(1, 2, NA, 4, 5, 6),
y= as.numeric(c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE)),
stringsAsFactors = FALSE)
dTrainR$id <- seq_len(nrow(dTrainR))
treatmentsN <- designTreatmentsN(dTrainR, c("x", "z"), 'y')
## [1] "vtreat 1.3.1 inspecting inputs Wed Aug 1 18:44:45 2018"
## [1] "designing treatments Wed Aug 1 18:44:45 2018"
## [1] " have initial level statistics Wed Aug 1 18:44:45 2018"
## [1] " scoring treatments Wed Aug 1 18:44:45 2018"
## [1] "have treatment plan Wed Aug 1 18:44:45 2018"
## [1] "rescoring complex variables Wed Aug 1 18:44:45 2018"
## [1] "done rescoring complex variables Wed Aug 1 18:44:45 2018"
prepare(treatmentsN, dTrainR) %.>%
knitr::kable(.)
| x_catP| x_catN| x_catD| z_clean| z_isBAD| x_lev_NA| x_lev_x_a| x_lev_x_b| y| |----------:|-----------:|----------:|---------:|---------:|-----------:|-------------:|-------------:|----:| | 0.5000000| -0.1666667| 0.5773503| 1.0| 0| 0| 1| 0| 0| | 0.5000000| -0.1666667| 0.5773503| 2.0| 0| 0| 1| 0| 0| | 0.5000000| -0.1666667| 0.5773503| 3.6| 1| 0| 1| 0| 1| | 0.3333333| 0.0000000| 0.7071068| 4.0| 0| 0| 0| 1| 0| | 0.1666667| 0.5000000| 0.7071068| 5.0| 0| 1| 0| 0| 1| | 0.3333333| 0.0000000| 0.7071068| 6.0| 0| 0| 0| 1| 1|
rqplan <- as_rquery_plan(list(treatmentsN))
source_data <- rquery::rq_copy_to(db, "dTrainR", dTrainR,
overwrite = TRUE, temporary = TRUE)
if(FALSE) {
ops <- rquery_prepare(db, rqplan, source_data, "dTreatedN",
extracols = "id", return_ops = TRUE)
cat(format(ops))
ops %.>%
rquery::op_diagram(.) %.>%
DiagrammeR::grViz(.)
# sql <- rquery::to_sql(ops, db)
# cat(sql)
}
rest <- rquery_prepare(db, rqplan, source_data, "dTreatedN",
extracols = "id")
resd <- DBI::dbReadTable(db, rest$table_name)
resd %.>%
knitr::kable(.)
| id| x_catD| x_catN| x_catP| x_lev_NA| x_lev_x_a| x_lev_x_b| y| z_clean| z_isBAD| |----:|----------:|-----------:|----------:|-----------:|-------------:|-------------:|----:|---------:|---------:| | 1| 0.5773503| -0.1666667| 0.5000000| 0| 1| 0| 0| 1.0| 0| | 2| 0.5773503| -0.1666667| 0.5000000| 0| 1| 0| 0| 2.0| 0| | 3| 0.5773503| -0.1666667| 0.5000000| 0| 1| 0| 1| 3.6| 1| | 4| 0.7071068| 0.0000000| 0.3333333| 0| 0| 1| 0| 4.0| 0| | 5| 0.0000000| 0.5000000| 0.1666667| 1| 0| 0| 1| 5.0| 0| | 6| 0.7071068| 0.0000000| 0.3333333| 0| 0| 1| 1| 6.0| 0|
rquery::rq_remove_table(db, source_data$table_name)
## [1] TRUE
rquery::rq_remove_table(db, rest$table_name)
## [1] TRUE
y-free example.
dTrainZ <- data.frame(x= c('a', 'a', 'a', 'b' ,NA , 'b'),
z= c(1, 2, NA, 4, 5, 6),
stringsAsFactors = FALSE)
dTrainZ$id <- seq_len(nrow(dTrainZ))
treatmentsZ <- designTreatmentsZ(dTrainZ, c("x", "z"))
## [1] "vtreat 1.3.1 inspecting inputs Wed Aug 1 18:44:45 2018"
## [1] "designing treatments Wed Aug 1 18:44:45 2018"
## [1] " have initial level statistics Wed Aug 1 18:44:45 2018"
## [1] " scoring treatments Wed Aug 1 18:44:45 2018"
## [1] "have treatment plan Wed Aug 1 18:44:45 2018"
prepare(treatmentsZ, dTrainZ) %.>%
knitr::kable(.)
| x_catP| z_clean| z_isBAD| x_lev_NA| x_lev_x_a| x_lev_x_b| |----------:|---------:|---------:|-----------:|-------------:|-------------:| | 0.5000000| 1.0| 0| 0| 1| 0| | 0.5000000| 2.0| 0| 0| 1| 0| | 0.5000000| 3.6| 1| 0| 1| 0| | 0.3333333| 4.0| 0| 0| 0| 1| | 0.1666667| 5.0| 0| 1| 0| 0| | 0.3333333| 6.0| 0| 0| 0| 1|
rqplan <- as_rquery_plan(list(treatmentsZ))
source_data <- rquery::rq_copy_to(db, "dTrainZ", dTrainZ,
overwrite = TRUE, temporary = TRUE)
rest <- rquery_prepare(db, rqplan, source_data, "dTreatedZ",
extracols = "id")
resd <- DBI::dbReadTable(db, rest$table_name)
resd %.>%
knitr::kable(.)
| id| x_catP| x_lev_NA| x_lev_x_a| x_lev_x_b| z_clean| z_isBAD| |----:|----------:|-----------:|-------------:|-------------:|---------:|---------:| | 1| 0.5000000| 0| 1| 0| 1.0| 0| | 2| 0.5000000| 0| 1| 0| 2.0| 0| | 3| 0.5000000| 0| 1| 0| 3.6| 1| | 4| 0.3333333| 0| 0| 1| 4.0| 0| | 5| 0.1666667| 1| 0| 0| 5.0| 0| | 6| 0.3333333| 0| 0| 1| 6.0| 0|
rquery::rq_remove_table(db, source_data$table_name)
## [1] TRUE
rquery::rq_remove_table(db, rest$table_name)
## [1] TRUE
if(!is.null(db)) {
DBI::dbDisconnect(db)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.