#' Backwards variable selection using random forest importance (ranger)
#'
#' Repeatedly fits a ranger random forest and removes the least important
#' predictors until at most `num_vars` predictors remain.
#'
#' @param df_train Training data.frame with a column called "target" used as
#'   the response. All predictor columns should be numeric and prepared with a
#'   package like vtreat or using a modelpipe prep_numeric or prep_bin
#'   function call. If you are working with categorical data you should
#'   convert "target" to a factor.
#' @param num_vars Number of variables to retain.
#' @param num_trees Number of trees to be used in Ranger.
#' @param importance_type Specifies importance type. Valid values are one of
#'   'impurity', 'impurity_corrected', 'permutation'. The default is
#'   'permutation'. ('none' is rejected because importance scores are
#'   required to rank variables for removal.)
#' @param removal_rate Number of variables to remove at a time. Removal is
#'   capped so the count never drops below `num_vars`.
#' @param verbose TRUE prints an update each time variables are removed.
#' @return A character vector of the selected (retained) predictor names.
eliminate_features <- function(df_train,
                               num_vars = 15,
                               num_trees = 500,
                               importance_type = 'permutation',
                               removal_rate = 1,
                               verbose = TRUE) {
  if (!"target" %in% names(df_train)) {
    stop("df_train must contain a column named 'target'", call. = FALSE)
  }
  # Importance scores drive the elimination, so 'none' cannot work here:
  # ranger::importance() errors when the model stores no importance values.
  if (identical(importance_type, "none")) {
    stop("importance_type 'none' is not supported; importance scores are ",
         "needed to rank variables", call. = FALSE)
  }

  # Track the current predictor set explicitly so the function also works
  # when no elimination is needed (the original errored on undefined `vars`
  # whenever ncol(df_train) - 1 <= num_vars).
  vars <- setdiff(names(df_train), "target")
  remaining_vars <- length(vars)

  while (remaining_vars > num_vars) {
    # Fit a forest solely to obtain variable importance scores.
    mdl <- ranger::ranger(target ~ .,
                          data = df_train,
                          num.trees = num_trees,
                          importance = importance_type)
    imp <- ranger::importance(mdl)

    # Rank ascending so the least important variables come first.
    ranked <- names(imp)[order(imp)]

    # Cap the removal so we never overshoot below num_vars when
    # removal_rate > 1 (the original could remove too many).
    n_remove <- min(removal_rate, remaining_vars - num_vars)
    vars <- ranked[(n_remove + 1):length(ranked)]

    # drop = FALSE keeps a data.frame even in degenerate one-column cases.
    df_train <- df_train[, c("target", vars), drop = FALSE]

    # Update the counter for the loop condition.
    remaining_vars <- length(vars)
    if (verbose) {
      flush.console()
      print(paste0(remaining_vars, " of max ", num_vars,
                   " predictor variables remaining"))
    }
  }
  vars
}
# NOTE(review): the two lines below are leftover boilerplate from the web page
# this code was copied from (rdrr.io-style embed instructions), not R code.
# They are commented out so the file parses; they can be deleted entirely.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.