#' splits train and test data
#'
#' @param x a data.frame or data.table
#' @param prop proportion of data to be split into the test frame
#' @return a list with train and test data.frames or data.tables
split_data <- function(x, prop = 0.8){
rand_split <- runif(nrow(x))
out <- list()
out$train <- x[rand_split <= prop, ]
out$test <- x[rand_split > prop, ]
out
}
#' categorical variable treatment using cross frames
#'
#' @param mdl a list with two data.frames or tables from split_data. List should have train and test named data.frames/data.tables
#' @param target_var the target variable for target encoding
#' @param ncross the number of cross frames to generate for variable treatment
#' @param ncore should this be parallelized?
#' @param var_types_return what variable types should be returned from vtreat
#' @return a list with treated frames, and a treatment plan
treat_numeric_data <- function(mdl,
target_var,
ncross = 5,
multicore = T,
var_types_return = c('clean', 'isBAD', 'catN')){
mdl <- tryCatch({
if(multicore == T){
ncores <- parallel::detectCores()
workers <- parallel::makeCluster(ncores, type="SOCK")
doParallel::registerDoParallel(workers)
}
treatment <- vtreat::mkCrossFrameNExperiment(mdl$train,
varlist = setdiff(colnames(mdl$train), target_var),
outcomename = target_var,
ncross = ncross,
codeRestriction = var_types_return)
test <- vtreat::prepare(treatmentplan = treatment$treatments, dframe = mdl$test)
#take care of cluster
if(multicore == T){
parallel::stopCluster(workers)
}
mdl$train <- treatment$crossFrame
mdl$test <- test
mdl$treatments <- treatment$treatments
mdl
}, error = function(e){
parallel::stopCluster(workers)
})
mdl
}
#' categorical variable treatment using cross frames
#'
#' @param mdl a list with two data.frames or tables from split_data. List should have train and test named data.frames/data.tables
#' @param target_var the target variable for target encoding
#' @param var_keep if var_keep is null all variables are retained, otherwise they are subset based on columnnames specified in var_keep
#' @return a list with data.frames for modeling
prep_mdl_data <- function(mdl, target_var, var_keep = NULL){
#setup training and test data
if(is.null(var_keep)){
mdl$train_x <- mdl$train[,!(colnames(mdl$train) %in% target_var)] %>% as.matrix()
mdl$train_y <- mdl$train[,colnames(mdl$train) %in% target_var]
mdl$test_x <- mdl$test[,!(colnames(mdl$test) %in% target_var)]
mdl$test_y <- mdl$test[,colnames(mdl$test) %in% target_var]
} else {
#subset training
train <- mdl$train[, !(colnames(mdl$train) %in% target_var)]
train <- train[,colnames(train) %in% var_keep]
mdl$train_x <- train %>% as.matrix()
mdl$train_y <- mdl$train[,colnames(mdl$train) %in% target_var]
#subset test
test <- mdl$test[, !(colnames(mdl$train) %in% target_var)]
test <- test[,colnames(test) %in% var_keep]
mdl$test_x <- test %>% as.matrix()
mdl$test_y <- mdl$test[,colnames(test) %in% target_var]
}
return(mdl)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.