R/regression.R

#' Fit Regression
#'
#' This function performs the training of the chosen regressor.
#' @param df.train Training data frame
#' @param formula A formula of the form y ~ x1 + x2 + ... If no formula is supplied, the first column is used as the outcome (y) and the remaining columns as the predictors (x1, x2, ..., xn)
#' @param preprocess Character vector of pre-processing methods passed to caret::train (for example "center", "scale")
#' @param regressor Choice of regressor used to train the model. Uses the algorithm (method) names from the caret package.
#' @param resample_ Resampling method: 'boot', 'boot632', 'optimism_boot', 'boot_all', 'cv', 'repeatedcv', 'LOOCV', 'LGOCV', 'none', 'oob', 'timeslice', 'adaptive_cv', 'adaptive_boot' or 'adaptive_LGOCV'
#' @param nfolds Number of folds (or resampling iterations) used in cross-validation
#' @param repeats Number of complete sets of folds to compute (used only by resampling methods such as 'repeatedcv')
#' @param index A list with elements for each resampling iteration, passed to caret::trainControl
#' @param cpu_cores Number of CPU cores to be used in parallel processing; use 0 to run sequentially
#' @param tune_length Number of levels that should be generated by train for each tuning parameter
#' @param metric Metric used to evaluate model fit. For a numeric outcome: "RMSE" or "Rsquared"
#' @param seeds Optional seeds passed to caret::trainControl (NA, NULL, or a list of integer vectors) to make resampling reproducible
#' @param verbose If TRUE, prints the elapsed time and the resampled training performance
#' @keywords Train regression RMSE Rsquared
#' @importFrom parallel makePSOCKcluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom caret trainControl train getTrainPerf
#' @importFrom stats as.formula
#' @importFrom foreach registerDoSEQ
#' @author Elpidio Filho, \email{elpidio@ufv.br}
#' @details Model training is delegated to caret::train. When cpu_cores > 0, a PSOCK cluster is created and registered for parallel processing and stopped when the function exits. If training fails, NULL is returned.
#' @export
#' @examples
#' \dontrun{
#' regression(df.train = df, regressor = "rf", metric = "Rsquared")
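#'
#' # Fuller call (a sketch): assumes a data frame 'df' whose first column is
#' # the outcome; the regressor, core count and tuning settings are illustrative.
#' regression(df.train = df, formula = NULL, regressor = "svmRadial",
#'            resample_ = "repeatedcv", nfolds = 10, repeats = 3,
#'            cpu_cores = 2, tune_length = 5, metric = "RMSE", verbose = TRUE)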
#' }


regression <- function(df.train, formula = NULL, preprocess = NULL,
                       regressor = "rf", resample_ = 'cv', nfolds = 10,
                       repeats =  NA,  index = NULL, cpu_cores = 0,
                       tune_length = 5, metric = "Rsquared",
                       seeds = NULL, verbose = FALSE) {
  resample_methods <- c('boot', 'boot632', 'optimism_boot', 'boot_all', 'cv',
                        'repeatedcv', 'LOOCV', 'LGOCV', 'none', 'oob',
                        'timeslice', 'adaptive_cv', 'adaptive_boot',
                        'adaptive_LGOCV')

  # Stop early if the requested resampling method is not supported
  if (!(resample_ %in% resample_methods)) {
    stop(paste("resample method", resample_, "does not exist"))
  }
  #lb = caret::getModelInfo(regressor, regex = FALSE)[[1]]$library
#  if (is.null(lb) == FALSE){
#    print(paste("loading library", lb))
#    suppressPackageStartupMessages(library(lb, character.only = TRUE))
#  }

  inicio <- Sys.time()

  # Resampling setup shared by both training interfaces below
  tc <- caret::trainControl(method = resample_, number = nfolds,
                            repeats = repeats, index = index,
                            seeds = seeds)

  # Register a parallel backend when more than 0 cores are requested;
  # the cluster is stopped automatically when the function exits
  if (cpu_cores > 0) {
    cl <- parallel::makePSOCKcluster(cpu_cores)
    doParallel::registerDoParallel(cl)
    on.exit(parallel::stopCluster(cl))
  } else {
    cl <- NULL
  }

  if (is.null(formula)) {
    # No formula supplied: the first column of df.train is the outcome,
    # the remaining columns are the predictors
    fit <- tryCatch({
      caret::train(x = df.train[, -1], y = df.train[, 1],
                   method = regressor, metric = metric,
                   trControl = tc, tuneLength = tune_length,
                   preProcess = preprocess)},
      error = function(e) {NULL})
  } else {
    # Formula interface: accepts a formula object or a character string
    fit <- tryCatch({
      caret::train(stats::as.formula(formula), data = df.train,
                   method = regressor, metric = metric,
                   trControl = tc, tuneLength = tune_length,
                   preProcess = preprocess)},
      error = function(e) {NULL})
  }

  if (!is.null(cl)) {
    # Return foreach to sequential execution after parallel training
    foreach::registerDoSEQ()
  }
  if (verbose == TRUE) {
    # Report elapsed time and resampled performance when requested
    print(paste("time elapsed :", format(Sys.time() - inicio)))
    if (!is.null(fit)) {
      print(caret::getTrainPerf(fit))
    }
  }
  return(fit)
}
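
# Minimal usage sketch (not run when the file is sourced): assumes the built-in
# 'mtcars' data set and that the package backing caret's "rf" method
# (e.g. randomForest) is installed. The if (FALSE) guard keeps it inert at
# package load time; copy the body into an interactive session to try it.
if (FALSE) {
  fit_rf <- regression(df.train = mtcars, formula = mpg ~ .,
                       regressor = "rf", resample_ = "repeatedcv",
                       nfolds = 5, repeats = 3, cpu_cores = 2,
                       tune_length = 3, metric = "RMSE", verbose = TRUE)
  if (!is.null(fit_rf)) {
    print(caret::getTrainPerf(fit_rf))  # resampled RMSE / Rsquared / MAE
  }
}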