R/trainModelVarSel.R

#' Extract cross-validated important variables
#' 
#' @description 
#' This function approaches the identification of important variables from
#' \code{\link[caret]{rfe}} more conservatively than \strong{caret}. It uses
#' the standard deviation (or standard error) of the cross-validated error 
#' metric to identify important variables.
#' 
#' @param model a rfe model. See \code{\link[caret]{rfe}}
#' @param metric the metric to be used. Note this needs to be the metric used 
#' to calculate the \code{\link[caret]{rfe}} model
#' @param maximize logical: Is a higher value of the metric favourable
#'  (e.g metric = Rsquared) or not (e.g metric = RMSE). maximize=TRUE is 
#'  determined automatically as long as metric is either Rsquared, ROC, Accuracy.
#'  maximize =FALSE is used for all other metrics. Set this manually if you
#'  use an other metric where higher values are favourable.
#' @param sderror If TRUE then standard error is calculated. If FALSE then
#' standard deviations are used
#' 
#' @return
#' a character vector of the variable names
#' 
#' @examples
#' # Not run
trainModelVarSelSD <- function (model, metric = model$metric, maximize=FALSE,
                       sderror=TRUE){
  
  if (metric=="Rsquared"||metric=="ROC"||metric=="Accuracy"){
    maximize=TRUE
  } else {
    maximize =FALSE
  }
  
  data <- as.data.frame(model$resample)
  
  sdv <- c()
  means <- c()
  
  for (i in unique(data$Variables)) {
    if(!sderror){
      sdv <- c(sdv,
               sd(eval(parse(text = paste("data$",
                                          metric)))[data$Variables == i]))
    }
    if(sderror){
      sdv <- c(sdv,helpStandardError(eval(parse(text = paste("data$",
                                              metric)))[data$Variables == i]))
    }
    means <- c(means,
               mean(eval(parse(text=paste("data$",
                                          metric)))[data$Variables==i]))
  }
  if (maximize)  {
    upr <- means - sdv
  } else {
    upr <- means + sdv
  }
  #start_var <- min(unique(data$Variables))
  #start_offset <- abs(start_var - 1)
  
  #   print(means)
  #   print(upr)
  #   print(upr[model$bestSubset - start_offset])
  #   print(means < upr[model$bestSubset - start_offset])
  if (maximize){
    n_vars <- unique(data$Variables)[which(means >
                                             upr[which(unique(data$Variables)==
                                                         model$bestSubset)])[1]]
  } else{
    n_vars <- unique(data$Variables)[which(means <
                                             upr[which(unique(data$Variables)==
                                                         model$bestSubset)])[1]]
  }
  
  subset <- model$variables[model$variables$Variables==n_vars,]
  uniqueVars <- unique(subset$var)
  bestVar<-c()
  for (i in 1:length(uniqueVars)){
    bestVar[i]=sum(subset$Overall[subset$var==uniqueVars[i]])/10
  }
  names(bestVar) <- uniqueVars
  
  if(is.na(names(bestVar))){
    bestVar<-model$optVariables
  } else {
    bestVar<-sort(bestVar,decreasing=TRUE)[1:n_vars]
  }
  
  #bestVar <- model$control$functions$selectVar(model$variables, n_vars)
  #return(model$optVariables[1:n_vars])
  return(bestVar)
}
environmentalinformatics-marburg/gpm documentation built on July 11, 2020, 11:12 a.m.