R/data_prep.R

Defines functions split_data treat_numeric_data prep_mdl_data

#' splits train and test data
#'
#' @param x a data.frame or data.table
#' @param prop proportion of data to be split into the test frame
#' @return a list with train and test data.frames or data.tables

split_data <- function(x, prop = 0.8){
  rand_split <- runif(nrow(x))
  out <- list()
  out$train <- x[rand_split <= prop, ]
  out$test <- x[rand_split > prop, ]
  out
}

#' categorical variable treatment using cross frames
#'
#' @param mdl a list with two data.frames or tables from split_data. List should have train and test named data.frames/data.tables
#' @param target_var the target variable for target encoding
#' @param ncross the number of cross frames to generate for variable treatment
#' @param ncore should this be parallelized?
#' @param var_types_return what variable types should be returned from vtreat
#' @return a list with treated frames, and a treatment plan

treat_numeric_data <- function(mdl,
                               target_var,
                               ncross           = 5,
                               multicore        = T,
                               var_types_return = c('clean', 'isBAD', 'catN')){

  mdl <- tryCatch({
    if(multicore == T){
      ncores <- parallel::detectCores()
      workers <- parallel::makeCluster(ncores, type="SOCK")
      doParallel::registerDoParallel(workers)
    }

    treatment <- vtreat::mkCrossFrameNExperiment(mdl$train,
                                                 varlist         = setdiff(colnames(mdl$train), target_var),
                                                 outcomename     = target_var,
                                                 ncross          = ncross,
                                                 codeRestriction = var_types_return)
    test <- vtreat::prepare(treatmentplan = treatment$treatments, dframe = mdl$test)

    #take care of cluster
    if(multicore == T){
      parallel::stopCluster(workers)
    }
    mdl$train <- treatment$crossFrame
    mdl$test <- test
    mdl$treatments <- treatment$treatments
    mdl
  }, error = function(e){
    parallel::stopCluster(workers)
  })
  mdl
}


#' categorical variable treatment using cross frames
#'
#' @param mdl a list with two data.frames or tables from split_data. List should have train and test named data.frames/data.tables
#' @param target_var the target variable for target encoding
#' @param var_keep if var_keep is null all variables are retained, otherwise they are subset based on columnnames specified in var_keep
#' @return a list with data.frames for modeling

prep_mdl_data <- function(mdl, target_var, var_keep = NULL){
  #setup training and test data
  if(is.null(var_keep)){
    mdl$train_x <- mdl$train[,!(colnames(mdl$train) %in% target_var)] %>% as.matrix()
    mdl$train_y <- mdl$train[,colnames(mdl$train) %in% target_var]
    mdl$test_x  <- mdl$test[,!(colnames(mdl$test) %in% target_var)]
    mdl$test_y  <- mdl$test[,colnames(mdl$test) %in% target_var]
  } else {
    #subset training
    train            <- mdl$train[, !(colnames(mdl$train) %in% target_var)]
    train            <- train[,colnames(train) %in% var_keep]
    mdl$train_x <- train %>% as.matrix()
    mdl$train_y <- mdl$train[,colnames(mdl$train) %in% target_var]
    #subset test
    test            <- mdl$test[, !(colnames(mdl$train) %in% target_var)]
    test            <- test[,colnames(test) %in% var_keep]
    mdl$test_x <- test %>% as.matrix()
    mdl$test_y <- mdl$test[,colnames(test) %in% target_var]
  }
  return(mdl)
}
prescient/opinionated_pipelines documentation built on Dec. 10, 2019, 12:15 a.m.