#' Variable Selection with Regression RF
#'
#' Similar approach to varSelRF, but for regression. A random forest is first
#' fit on all predictors and the predictors are ranked by the first column of
#' \code{importance()} for that fit. With \code{importance = TRUE} passed
#' through \code{...} that column is \%IncMSE; otherwise it is the node
#' impurity measure (IncNodePurity), which is the default ranking.
#'
#' The function then steps through the ranked predictors, most important
#' first, refitting a random forest on the growing set of predictors and
#' recording the error of each fit. The first model recorded uses the two
#' most important predictors; the last uses all of them.
#'
#' @param y response, a vector
#' @param x predictors, a data.frame. Columns are selected by name from
#'   \code{x} itself, so non-syntactic names and a predictor called \code{y}
#'   are handled correctly.
#' @param prog logical; if \code{TRUE}, print the percentage of predictors
#'   processed after every third predictor.
#' @param ... options to pass to randomForest (used for the initial fit and
#'   for every stepwise fit)
#' @return A list with one entry per stepwise model:
#'   \describe{
#'     \item{mse}{numeric vector, last element of each model's \code{mse}}
#'     \item{rsq}{numeric vector, last element of each model's \code{rsq}}
#'     \item{num_var}{integer vector, number of predictors in each model}
#'     \item{vars}{list of character vectors, predictor names in each model}
#'   }
#'   When \code{x} has fewer than two predictors all components are empty.
#' @export
#' @import randomForest
#' @examples
#' data(LakeTrophicModelling)
#' predictors_all <- predictors_all[predictors_all!="DATE_COL"]
#' all_dat <- data.frame(ltmData[predictors_all],LogCHLA=log10(ltmData$CHLA))
#' all_dat <- all_dat[complete.cases(all_dat),]
#' x<-varsel_regression_rf(all_dat$LogCHLA,all_dat[,names(all_dat)!="LogCHLA"],
#' ntree=100,prog=TRUE, importance = TRUE)
varsel_regression_rf <- function(y, x, prog = FALSE, ...) {
  # Rank predictors with a forest fit on the full predictor set. x and y are
  # used directly (no intermediate data.frame) so predictor names are never
  # altered and can never collide with the response.
  init_rf <- randomForest(y = y, x = x, ...)
  init_imp <- importance(init_rf)
  var_sort <- rownames(init_imp)[order(init_imp[, 1], decreasing = TRUE)]

  # One model per predictor after the first: model k uses the top k + 1.
  n_vars <- length(var_sort)
  n_models <- max(n_vars - 1L, 0L)
  out <- list(
    mse = numeric(n_models),
    rsq = numeric(n_models),
    num_var = integer(n_models),
    vars = vector("list", n_models)
  )

  for (idx in seq_len(n_models)) {
    vars <- var_sort[seq_len(idx + 1L)]
    # drop = FALSE keeps x two-dimensional whatever the number of columns.
    vars_rf <- randomForest(y = y, x = x[, vars, drop = FALSE], ...)
    out$mse[idx] <- vars_rf$mse[length(vars_rf$mse)]
    out$rsq[idx] <- vars_rf$rsq[length(vars_rf$rsq)]
    out$num_var[idx] <- length(vars)
    out$vars[[idx]] <- vars

    # Number of ranked predictors consumed so far (including the first one,
    # which never gets a model of its own).
    processed <- idx + 1L
    if (isTRUE(prog) && processed %% 3L == 0L) {
      print(paste0(round(processed / n_vars * 100, 1), "% completed"))
    }
  }
  out
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.