R/deprecated_ffs.R

#' Forward feature selection
#' @description A simple forward feature selection algorithm
#' @param predictors see \code{\link{train}}
#' @param response see \code{\link{train}}
#' @param method see \code{\link{train}}
#' @param metric see \code{\link{train}}
#' @param maximize see \code{\link{train}}
#' @param withinSD Logical Models are only selected if they are better than the 
#' currently best models Standard error
#' @param trControl see \code{\link{train}}
#' @param tuneLength see \code{\link{train}}
#' @param tuneGrid see \code{\link{train}}
#' @param seed A random number
#' @param runParallel Logical
#' @param ... arguments passed to the classification or regression routine 
#' (such as randomForest). Errors will occur if values for tuning parameters are 
#' passed here.
#' @return A list of class train. Beside of the usual train contentm
#' the object contains the vector "selectedvars" and "selectedvars_perf"
#' that give the order of the variables selected as well as their corresponding
#' performance (starting from the first two variables)
#' @details Models with two predictors are first trained using all possible 
#' pairs of predictor variables. The best model of these initial models is kept.
#' On the basis of this best model the predictor variables are iteratively
#' increased and each of the remaining variables is tested for its improvement
#' of the currently best model. The process stops if none of the remaining 
#' variables increases the model performance when added to the current best model.
#' 
#' The internal cross validation can be run in parallel. See information
#' on parallel processing of carets train functions for details.
#' 
#' Using withinSE will favour models with less variables and
#' probably shorten the calculation time 
#' 
#' @note This validation is particulary suitable for 
#' leave-one-station-out cross validations where variable selection
#' MUST be based on the performance of the model on the hold out station.
#' A computational time expesnsive alternative is the best subset
#' selection (\code{bss}).
#' @author Hanna Meyer
#' @seealso \code{\link{train}},  \code{bss}, 
#' \code{\link{trainControl}},\code{\link{rfe}}
#' @examples
#'  \dontrun{
#' data(iris)
#' ffsmodel <- ffs(iris[,1:4],iris$Species)
#' ffsmodel$selectedvars
#' ffsmodel$selectedvars_perf 
#' }
#' @name ffs-deprecated
#' @usage ffs(predictors, response, method = "rf"
#' , metric = ifelse(is.factor(response), "Accuracy", "RMSE")
#' , maximize = ifelse(metric == "RMSE", FALSE, TRUE), withinSD = FALSE
#' , trControl = trainControl(), tuneLength = 3, tuneGrid = NULL
#' , seed = sample(1:1000, 1), runParallel = FALSE, ...)     
#' @seealso \code{\link{Rsenal-deprecated}}  
#' @keywords internal
NULL

#' @rdname Rsenal-deprecated
#' @section \code{ffs}:
#' \code{ffs} is deprecated (no further details available).
#' 
#' @export 
ffs <- function (predictors, 
                 response, 
                 method = "rf",
                 metric = ifelse(is.factor(response), "Accuracy", "RMSE"),
                 maximize = ifelse(metric == "RMSE", FALSE, TRUE),
                 withinSD = FALSE,
                 trControl = trainControl(),
                 tuneLength = 3,
                 tuneGrid = NULL,
                 seed = sample(1:1000, 1),
                 runParallel = FALSE,
                 ...){
  if(runParallel){
    cl <- parallel::makeCluster(parallel::detectCores()-2)
    doParallel::registerDoParallel(cl)
    on.exit(parallel::stopCluster(cl))
  }
  n <- length(names(predictors))
  acc <- 0
  if(maximize) evalfunc <- function(x){max(x,na.rm=T)}
  if(!maximize) evalfunc <- function(x){min(x,na.rm=T)}
  isBetter <- function (actmodelperf,bestmodelperf,
                        bestmodelperfSD=NULL,
                        maximization=FALSE,
                        withinSE=FALSE){
    if(withinSE){
      result <- ifelse (!maximization, actmodelperf < bestmodelperf-bestmodelperfSD,
                        actmodelperf > bestmodelperf+bestmodelperfSD)
    }else{
      result <- ifelse (!maximization, actmodelperf < bestmodelperf,
                        actmodelperf > bestmodelperf)
    }
    return(result)
  }
  #### chose initial best model from all combinations of two variables
  twogrid <- t(data.frame(combn(names(predictors),2)))
  for (i in 1:nrow(twogrid)){
    set.seed(seed)
    model <- train(predictors[,twogrid[i,]],
                   response,
                   method=method,
                   trControl=trControl,
                   tuneLength = tuneLength,
                   tuneGrid = tuneGrid)
    ### compare the model with the currently best model
    actmodelperf <- evalfunc(model$results[,names(model$results)==metric])
    if(withinSD){
      actmodelperfSD <- Rsenal::se(
        sapply(unique(model$resample$Resample),
               FUN=function(x){mean(model$resample[model$resample$Resample==x,
                                                   metric])}))
      
    }
    if (i == 1){
      bestmodelperf <- actmodelperf
      if(withinSD){
        bestmodelperfSD <- actmodelperfSD
      }
      bestmodel <- model
    } else{
      if (isBetter(actmodelperf,bestmodelperf,maximization=maximize,withinSE=FALSE)){
        bestmodelperf <- actmodelperf 
        if(withinSD){
          bestmodelperfSD <- actmodelperfSD
        }
        bestmodel <- model
      }
    }
    acc <- acc+1
    print(paste0("maxmimum number of models that still need to be trained: ",
                 (((n-1)^2)+n-1)/2 + (((n-2)^2)+n-2)/2 - acc))
  }
  #### increase the number of predictors by one (try all combinations) 
  #and test if model performance increases
  selectedvars <- names(bestmodel$trainingData)[-which(
    names(bestmodel$trainingData)==".outcome")]
  if (maximize){
    selectedvars_perf <- max(bestmodel$results[,metric])
  } else{
    selectedvars_perf <- min(bestmodel$results[,metric])
  }
  print(paste0(paste0("vars selected: ",paste(selectedvars, collapse = ',')), 
               " with ",metric," ",round(selectedvars_perf,3)))
  for (k in 1:(length(names(predictors))-2)){
    startvars <- names(bestmodel$trainingData)[-which(
      names(bestmodel$trainingData)==".outcome")]
    nextvars <- names(predictors)[-which(
      names(predictors)%in%startvars)]
    if (length(startvars)<(k+1)){
      message(paste0("Note: No increase in performance found using more than ",
                     length(startvars), " variables"))
      bestmodel$selectedvars <- selectedvars
      bestmodel$selectedvars_perf <- selectedvars_perf[-length(selectedvars_perf)]
      return(bestmodel)
      break()
    }
    for (i in 1:length(nextvars)){
      set.seed(seed)
      model <- train(predictors[,c(startvars,nextvars[i])],
                     response,
                     method = method,
                     trControl = trControl,
                     tuneLength = tuneLength,
                     tuneGrid = tuneGrid)
      actmodelperf <- evalfunc(model$results[,names(model$results)==metric])
      if(withinSD){
        actmodelperfSD <- Rsenal::se(
          sapply(unique(model$resample$Resample),
                 FUN=function(x){mean(model$resample[model$resample$Resample==x,
                                                     metric])}))
      }
      if(isBetter(actmodelperf,bestmodelperf,bestmodelperfSD,
                  maximization=maximize,withinSE=withinSD)){
        bestmodelperf <- actmodelperf 
        if(withinSD){
          bestmodelperfSD <- actmodelperfSD
        }
        bestmodel <- model
        
        
      }
      acc <- acc+1
      print(paste0("maxmimum number of models that still need to be trained: ",
                   (((n-1)^2)+n-1)/2 + (((n-2)^2)+n-2)/2 - acc))
    }
    selectedvars <- c(selectedvars,names(bestmodel$trainingData)[-which(
      names(bestmodel$trainingData)%in%c(".outcome",selectedvars))])
    if (maximize){
      selectedvars_perf <- c(selectedvars_perf,max(bestmodel$results[,metric]))
      print(paste0(paste0("vars selected: ",paste(selectedvars, collapse = ',')), 
                   " with ", metric," ",round(max(bestmodel$results[,metric]),3)))
    }
    if (!maximize){
      selectedvars_perf <- c(selectedvars_perf,min(bestmodel$results[,metric]))
      print(paste0(paste0("vars selected: ",paste(selectedvars, collapse = ',')), 
                   " with ",metric," ",round(min(bestmodel$results[,metric]),3)))
    }
  }
  bestmodel$selectedvars <- selectedvars
  bestmodel$selectedvars_perf <- selectedvars_perf
  return(bestmodel)
}
environmentalinformatics-marburg/Rsenal documentation built on July 28, 2023, 6:09 a.m.