vtreat transforms can be hosted on rquery, which allows the transforms to be applied at scale.

library("vtreat")

# Optional-dependency flags: each records whether the packages needed by a
# group of examples can be loaded (requireNamespace() returns TRUE/FALSE).
eval_examples <- requireNamespace("rquery", quietly = TRUE)
eval_rqdt <- FALSE
eval_db <- FALSE
if (eval_examples) {
  # These packages are only probed when rquery itself is available.
  eval_rqdt <- requireNamespace("rqdatatable", quietly = TRUE)
  eval_db <- requireNamespace("DBI", quietly = TRUE) &&
    requireNamespace("RSQLite", quietly = TRUE)
}

# Connect to an in-memory SQLite database when the database packages are
# present; otherwise db stays NULL.
db <- if (eval_db) {
  DBI::dbConnect(RSQLite::SQLite(), ":memory:")
} else {
  NULL
}

Classification example.

# Small classification training frame: a character column containing an NA,
# a numeric column containing an NA, and a logical outcome column y.
dTrainC <- data.frame(x = c("a", "a", "a", "b", NA, "b"),
                      z = c(1, 2, NA, 4, 5, 6),
                      y = c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE),
                      stringsAsFactors = FALSE)
# Row identifier; named as an extra column in the rquery_prepare() call below.
dTrainC$id <- seq_len(nrow(dTrainC))

# Design the treatment plan for variables x and z against outcome y.
# NOTE(review): the 4th positional argument (TRUE) is presumably the outcome
# value treated as the target class -- confirm against designTreatmentsC docs.
treatmentsC <- designTreatmentsC(dTrainC, c("x", "z"), "y", TRUE)

# Apply the plan in memory and show the treated frame as a table.
prepare(treatmentsC, dTrainC) %.>%
  knitr::kable(.)

# The database steps need the connection opened during setup. `db` is NULL
# whenever eval_db is FALSE, so skip them instead of failing on a NULL handle.
if (eval_db) {
  rqplan <- as_rquery_plan(list(treatmentsC))
  source_data <- rquery::rq_copy_to(db, "dTrainC", dTrainC,
                                    overwrite = TRUE, temporary = TRUE)

  rest <- rquery_prepare(db, rqplan, source_data, "dTreatedC",
                         extracols = "id")
  resd <- DBI::dbReadTable(db, rest$table_name)
  # Last expression of the block, so its value is still auto-printed.
  resd %.>%
    knitr::kable(.)
}

# Drop the tables created above. Kept as separate top-level statements so
# each call's value is auto-printed exactly as it would be without the guard.
if (eval_db) {
  rquery::rq_remove_table(db, source_data$table_name)
}
if (eval_db) {
  rquery::rq_remove_table(db, rest$table_name)
}

Regression example.

# Small regression training frame: same x and z columns as the classification
# example, with the outcome y converted to numeric (0/1).
dTrainR <- data.frame(x = c("a", "a", "a", "b", NA, "b"),
                      z = c(1, 2, NA, 4, 5, 6),
                      y = as.numeric(c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE)),
                      stringsAsFactors = FALSE)
# Row identifier; named as an extra column in the rquery_prepare() calls below.
dTrainR$id <- seq_len(nrow(dTrainR))

# Design the treatment plan for variables x and z against numeric outcome y.
treatmentsN <- designTreatmentsN(dTrainR, c("x", "z"), "y")

# Apply the plan in memory and show the treated frame as a table.
prepare(treatmentsN, dTrainR) %.>%
  knitr::kable(.)

# The database steps need the connection opened during setup. `db` is NULL
# whenever eval_db is FALSE, so skip them instead of failing on a NULL handle.
if (eval_db) {
  rqplan <- as_rquery_plan(list(treatmentsN))
  source_data <- rquery::rq_copy_to(db, "dTrainR", dTrainR,
                                    overwrite = TRUE, temporary = TRUE)
}

# Deliberately disabled diagnostics: with return_ops = TRUE the call is asked
# for the operator pipeline, which is then printed and drawn as a diagram.
if (FALSE) {
  ops <- rquery_prepare(db, rqplan, source_data, "dTreatedN",
                        extracols = "id", return_ops = TRUE)
  cat(format(ops))
  ops %.>%
    rquery::op_diagram(.) %.>%
    DiagrammeR::grViz(.)
  # sql <- rquery::to_sql(ops, db)
  # cat(sql)
}

if (eval_db) {
  rest <- rquery_prepare(db, rqplan, source_data, "dTreatedN",
                         extracols = "id")
  resd <- DBI::dbReadTable(db, rest$table_name)
  # Last expression of the block, so its value is still auto-printed.
  resd %.>%
    knitr::kable(.)
}

# Drop the tables created above. Kept as separate top-level statements so
# each call's value is auto-printed exactly as it would be without the guard.
if (eval_db) {
  rquery::rq_remove_table(db, source_data$table_name)
}
if (eval_db) {
  rquery::rq_remove_table(db, rest$table_name)
}

y-free example (treatment design without an outcome column).

# Small training frame with no outcome column: a character column containing
# an NA and a numeric column containing an NA.
dTrainZ <- data.frame(x = c("a", "a", "a", "b", NA, "b"),
                      z = c(1, 2, NA, 4, 5, 6),
                      stringsAsFactors = FALSE)
# Row identifier; named as an extra column in the rquery_prepare() call below.
dTrainZ$id <- seq_len(nrow(dTrainZ))

# Design the treatment plan for variables x and z; no outcome name is given.
treatmentsZ <- designTreatmentsZ(dTrainZ, c("x", "z"))

# Apply the plan in memory and show the treated frame as a table.
prepare(treatmentsZ, dTrainZ) %.>%
  knitr::kable(.)

# The database steps need the connection opened during setup. `db` is NULL
# whenever eval_db is FALSE, so skip them instead of failing on a NULL handle.
if (eval_db) {
  rqplan <- as_rquery_plan(list(treatmentsZ))
  source_data <- rquery::rq_copy_to(db, "dTrainZ", dTrainZ,
                                    overwrite = TRUE, temporary = TRUE)

  rest <- rquery_prepare(db, rqplan, source_data, "dTreatedZ",
                         extracols = "id")
  resd <- DBI::dbReadTable(db, rest$table_name)
  # Last expression of the block, so its value is still auto-printed.
  resd %.>%
    knitr::kable(.)
}

# Drop the tables created above. Kept as separate top-level statements so
# each call's value is auto-printed exactly as it would be without the guard.
if (eval_db) {
  rquery::rq_remove_table(db, source_data$table_name)
}
if (eval_db) {
  rquery::rq_remove_table(db, rest$table_name)
}

# Close the connection if one was opened during setup.
if (!is.null(db)) {
  DBI::dbDisconnect(db)
}


WinVector/vtreat documentation built on Aug. 29, 2023, 4:49 a.m.