#' Backwards variable selection using random forest importance (ranger)
#'
#' Repeatedly fits a ranger random forest and removes the least important
#' predictors until at most `num_vars` predictors remain.
#'
#' @param df_train Training data.frame with a column called "target" used as
#'   the response. All predictor columns should be numeric and prepared with a
#'   package like vtreat or using a modelpipe prep_numeric or prep_bin
#'   function call. If you are working with categorical data you should
#'   convert "target" to a factor.
#' @param num_vars Number of variables to retain.
#' @param num_trees Number of trees to be used in Ranger.
#' @param importance_type Specifies importance type. Valid values are one of
#'   'impurity', 'impurity_corrected', 'permutation'. The default is
#'   'permutation'. ('none' is rejected because importance scores are
#'   required to rank variables for removal.)
#' @param removal_rate Number of variables to remove at a time. Removal is
#'   capped so the count never drops below `num_vars`.
#' @param verbose TRUE prints an update each time variables are removed.
#' @return A character vector of the selected (retained) predictor names.
eliminate_features <- function(df_train,
                               num_vars = 15,
                               num_trees = 500,
                               importance_type = 'permutation',
                               removal_rate = 1,
                               verbose = TRUE) {
  if (!"target" %in% names(df_train)) {
    stop("df_train must contain a column named 'target'", call. = FALSE)
  }
  # Importance scores drive the elimination, so 'none' cannot work here:
  # ranger::importance() errors when the model stores no importance values.
  if (identical(importance_type, "none")) {
    stop("importance_type 'none' is not supported; importance scores are ",
         "needed to rank variables", call. = FALSE)
  }

  # Track the current predictor set explicitly so the function also works
  # when no elimination is needed (the original errored on undefined `vars`
  # whenever ncol(df_train) - 1 <= num_vars).
  vars <- setdiff(names(df_train), "target")
  remaining_vars <- length(vars)

  while (remaining_vars > num_vars) {
    # Fit a forest solely to obtain variable importance scores.
    mdl <- ranger::ranger(target ~ .,
                          data = df_train,
                          num.trees = num_trees,
                          importance = importance_type)
    imp <- ranger::importance(mdl)

    # Rank ascending so the least important variables come first.
    ranked <- names(imp)[order(imp)]

    # Cap the removal so we never overshoot below num_vars when
    # removal_rate > 1 (the original could remove too many).
    n_remove <- min(removal_rate, remaining_vars - num_vars)
    vars <- ranked[(n_remove + 1):length(ranked)]

    # drop = FALSE keeps a data.frame even in degenerate one-column cases.
    df_train <- df_train[, c("target", vars), drop = FALSE]

    # Update the counter for the loop condition.
    remaining_vars <- length(vars)
    if (verbose) {
      flush.console()
      print(paste0(remaining_vars, " of max ", num_vars,
                   " predictor variables remaining"))
    }
  }
  vars
}
# NOTE(review): the two lines below are leftover boilerplate from the web page
# this code was copied from (rdrr.io-style embed instructions), not R code.
# They are commented out so the file parses; they can be deleted entirely.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.