R/data_prep.R

Defines functions prep_numeric prep_bin

Documented in prep_bin prep_numeric

#' Prep a numeric target with vtreat cross frames
#'
#' Designs a vtreat treatment plan for a numeric outcome using cross frames,
#' applies it to the training and testing data, and cleans the resulting
#' column names with janitor. Both input data.frames must contain a column
#' named "target".
#'
#' @param df_train Training data.frame. Must contain a "target" column; every
#'   other column is passed to vtreat as a candidate variable.
#' @param df_test Testing data.frame. Must contain a "target" column.
#' @param ncross Number of cross validation frames in treatment design.
#' @param use_parallel Logical, if TRUE use parallel methods.
#' @param var_types_return Specifies what types of variables to produce
#'   (character array of level codes, NULL means no restriction).
#' @return A list with three elements: `df_train`, the cross frame built from
#'   the training data with cleaned column names; `df_test`, the testing data
#'   prepared with the fitted treatments, with cleaned column names; and
#'   `treatment_plan`, the full object returned by
#'   `vtreat::mkCrossFrameNExperiment()` (not only its `treatments` element).

prep_numeric <- function(df_train,
                         df_test,
                         ncross = 5,
                         use_parallel = FALSE,
                         var_types_return = c("clean", "isBAD", "catN")) {

  # Scalar checks, so use the short-circuiting `&&` rather than elementwise `&`.
  testit::assert("Training and testing frames are not data.frames.",
                 is.data.frame(df_train) && is.data.frame(df_test))
  testit::assert('df_train is missing "target" column.',
                 "target" %in% colnames(df_train))
  testit::assert('df_test is missing "target" columns.',
                 "target" %in% colnames(df_test))

  cf_exp <- vtreat::mkCrossFrameNExperiment(
    dframe          = df_train,
    varlist         = setdiff(colnames(df_train), "target"),
    outcomename     = "target",
    ncross          = ncross,
    use_parallel    = use_parallel,
    codeRestriction = var_types_return
  )

  # Preference vs vtreat's naming conventions. Plain nested calls are used
  # instead of `%>%` so the function does not depend on magrittr's pipe being
  # attached or imported.
  train_prepped <- janitor::clean_names(cf_exp$crossFrame)
  test_prepped  <- janitor::clean_names(
    vtreat::prepare(treatmentplan = cf_exp$treatments,
                    dframe        = df_test)
  )

  list(
    df_train       = train_prepped,
    df_test        = test_prepped,
    treatment_plan = cf_exp
  )
}


#' Prep a binary target with vtreat cross frames
#'
#' Designs a vtreat treatment plan for a binary (categorical) outcome using
#' cross frames, applies it to the training and testing data, and cleans the
#' resulting column names with janitor. Both input data.frames must contain a
#' column named "target".
#'
#' @param df_train Training data.frame. Must contain a "target" column; every
#'   other column is passed to vtreat as a candidate variable.
#' @param df_test Testing data.frame. Must contain a "target" column.
#' @param outcome_target Value/level of the "target" column to be considered
#'   "success". It is passed to vtreat as `outcometarget`; there must be a cut
#'   such that `df_train$target == outcome_target` at least twice and
#'   `df_train$target != outcome_target` at least twice.
#' @param ncross Number of cross validation frames in treatment design.
#' @param use_parallel Logical, if TRUE use parallel methods.
#' @param var_types_return Specifies what types of variables to produce
#'   (character array of level codes, NULL means no restriction).
#' @return A list with three elements: `df_train`, the cross frame built from
#'   the training data with cleaned column names; `df_test`, the testing data
#'   prepared with the fitted treatments, with cleaned column names; and
#'   `treatment_plan`, the full object returned by
#'   `vtreat::mkCrossFrameCExperiment()` (not only its `treatments` element).

prep_bin <- function(df_train,
                     df_test,
                     outcome_target,
                     ncross = 5,
                     use_parallel = FALSE,
                     var_types_return = c("clean", "isBAD", "catP")) {

  # Scalar checks, so use the short-circuiting `&&` rather than elementwise `&`.
  testit::assert("Training and testing frames are not data.frames.",
                 is.data.frame(df_train) && is.data.frame(df_test))
  testit::assert('df_train is missing "target" column.',
                 "target" %in% colnames(df_train))
  testit::assert('df_test is missing "target" columns.',
                 "target" %in% colnames(df_test))

  cf_exp <- vtreat::mkCrossFrameCExperiment(
    dframe          = df_train,
    varlist         = setdiff(colnames(df_train), "target"),
    outcomename     = "target",
    outcometarget   = outcome_target,
    ncross          = ncross,
    use_parallel    = use_parallel,
    codeRestriction = var_types_return
  )

  # Preference vs vtreat's naming conventions. Plain nested calls are used
  # instead of `%>%` so the function does not depend on magrittr's pipe being
  # attached or imported.
  train_prepped <- janitor::clean_names(cf_exp$crossFrame)
  test_prepped  <- janitor::clean_names(
    vtreat::prepare(treatmentplan = cf_exp$treatments,
                    dframe        = df_test)
  )

  list(
    df_train       = train_prepped,
    df_test        = test_prepped,
    treatment_plan = cf_exp
  )
}
prescient/modelpipe documentation built on Dec. 25, 2019, 3:20 a.m.