R/run_models.R

#'  Run Models
#'
#'
#' This function run many models using the same data
#' @param df   Training dataframe
#' @param formula   A formula of the form y ~ x1 + x2 + ... If users don't inform formula, the first column will be used as Y values and the others columns with x1,x2....xn
#' @param preprocess pre process
#' @param index  Users cross validation folds. Default = NULL
#' @param models chosen models to be used to train model. Uses  algortims names from Caret package.
#' @param rsample resample method 'boot', 'boot632', 'optimism_boot', 'boot_all', 'cv', 'repeatedcv', 'LOOCV', 'LGOCV','none', 'oob', 'timeslice', 'adaptive_cv', 'adaptive_boot', 'adaptive_LGOCV'
#' @param nfolds   Number of folds to be build in crossvalidation
#' @param repeats number of repeats to resample method repeatedcv
#' @param cpu_cores  Number of CPU cores to be used in parallel processing
#' @param tune_length  This argument is the number of levels for each tuning parameters that should be generated by train
#' @param search search option "grid" or  "random"
#' @param metric metric used to evaluate model fit. For numeric outcome ('RMSE', 'Rsquared)
#' @param seeds  generate random seeds to allow reproductible results
#' @param verbose  prints results during the execution of the function
#' @importFrom utils flush.console
#' @importFrom caret getModelInfo
#' @importFrom utils install.packages installed.packages
#' @keywords  models
#' @author Elpidio Filho, \email{elpidio@ufv.br}
#' @details details
#' @export
#' @examples
#' \dontrun{
#' models = c('ridge', 'rf', 'cubist','pls','pcr','foba','gbm','glmboost')
#' fit_models = run_models(df,models = models)
#' }

run_models <- function(df, models = 'rf',
                     formula = NULL,
                     preprocess = NULL,
                     index = NULL,
                     rsample = "cv",
                     nfolds = 10,
                     repeats = NA,
                     tune_length = 5,
                     search = "grid",
                     cpu_cores = 0,
                     metric = NULL,
                     seeds = NULL,
                     verbose = TRUE) {
  if (class(df) != "data.frame") stop("df is not a data frame.")
  if (is.null(formula) == FALSE) {
    if (class(formula) != 'formula') {
      formula =as.formula(formula)
    }
  }
  mod = is_factor_income(df, formula)
#  if (is.factor(df[, 1]) == TRUE) {
#    mod <- 1
#  } else {
#    mod <- 0
#  }

  vlib <- character()
  for (i in seq_along(models)) {
    md <- caret::getModelInfo(models[i], regex = FALSE)[[1]]
    vlib <- c(vlib, md$library)
    if (length(md) == 0) {
      stop(paste(
        "Model", models[i],
        "is not in caret's built-in library"
      ), call. = FALSE)
    } else {
      if (mod == 0) {
        if (!("Regression" %in% md$type)) {
          stop(paste(
            "Model", models[i],
            "is not in a regression model"
          ), call. = FALSE)
        }
      } else {
        if (!("Classification" %in% md$type)) {
          stop(paste(
            "Model", models[i],
            "is not in a classification model"
          ), call. = FALSE)
        }
      }
    }
  }

  plataforma <- .Platform$OS.type

  remove <- c("NA", " ")
  idx <- which(vlib %in% remove)
  if (length(idx) > 0) {
    vlib <- vlib[-idx]
  }
  pkglist <- unique(vlib)
  inst <- vlib %in% utils::installed.packages()
  if (length(pkglist[!inst]) > 0) {
    np <- paste(pkglist[!inst], collapse = ", ")
#    if (plataforma == "windows") {
#      print(paste("packages ", np, " will be installed"))
#      utils::install.packages(pkglist[!inst], dep = TRUE)
#    } else {
      print(paste("Warning : packages ", np, " needs to installed"))
    #}
  }

  package.inicio <- search()[ifelse(unlist(
    gregexpr("package:", search())
  ) == 1, TRUE, FALSE)]

  if (verbose == TRUE) {
    label1 <- models[i]
  }

  list.model <- vector("list")
  cont <- 1
  failed <- character()

  inicio_total <- Sys.time()
  for (j in seq_along(models)) {
    if (verbose == TRUE) {

      inicio <- Sys.time()
      if (j == 1) {
        nm <- length(models)
        width <- 40
        label1 <- stringr::str_pad(models[j], 15, "right")
        cat("\r", label1)
      }
    }

    if (mod == 0) {
      if (is.null(metric)) { metric = 'Rsquared'}
      fit.reg <- tryCatch({
          regression(
            df.train = df,
            formula = formula,
            index = index,
            search = search,
            rsample = rsample,
            regressor = models[j],
            preprocess = preprocess,
            nfolds = nfolds,
            cpu_cores = cpu_cores,
            repeats = repeats,
            metric = metric,
            tune_length = tune_length,
            seeds = vector_seeds(seeds, repeats, nfolds)
          )
        },
        error = function(e) {
          print(paste("Error:", conditionMessage(e)))
          return(NULL)
        }
      )

      if (is.null(fit.reg) == FALSE) {
        list.model[cont] <- list(fit.reg)
        names(list.model)[cont] <- models[j]
        cont <- cont + 1
      } else {
        failed <- c(failed, models[j])
      }
    } else {
      if (is.null(metric)) { metric = 'Kappa'}
      fit.class <- tryCatch({
          classification(
            df.train = df,
            formula = formula,
            rsample = rsample,
            index = index,
            search = search,
            classifier = models[j],
            preprocess = preprocess,
            nfolds = nfolds,
            cpu_cores = cpu_cores,
            repeats = repeats,
            metric = metric,
            tune_length = tune_length,
            seeds = vector_seeds(seeds, repeats, nfolds)
          )
        },
        error = function(e) {
          print(" ")
          print(e)

        }
      )
      if (is.null(fit.class) == FALSE) {
        list.model[cont] <- list(fit.class)
        names(list.model)[cont] <- models[j]
        cont <- cont + 1
      }
    }
    if (verbose == TRUE) {
      nm <- length(models)
      width <- 40
      tdif <- until_now(inicio)
      tdif_total <- until_now(inicio_total)
      label1 <- stringr::str_pad(models[j], 15, "right")
      s1 <- paste0(rep("#", j / nm * width), collapse = "")
      s2 <- paste0(rep(".", (nm - j) / nm * width), collapse = "")
      cat("\r", label1, s1, s2, tdif, tdif_total)
      if (j == nm) cat("\n")
    }
  }
  package.fim <- search()[ifelse(unlist(gregexpr(
    "package:",
    search()
  )) == 1, TRUE, FALSE)]
  package.list <- setdiff(package.fim, package.inicio)
  if (length(package.list) > 0) {
    for (package in package.list)
      detach(package, character.only = TRUE)
  }
  list.model <- list.model[!sapply(list.model, is.null)]
  if (verbose == TRUE) {
    if (length(failed) > 0) {
      print("failed models")
      print(failed)
    }
  }
  return(list.model)
}


vector_seeds <- function(seeds, repeats, nfolds) {
  if (is.null(seeds)) {
    vseed <- NULL
  } else {
    set.seed(seeds)
    if (is.na(repeats) == FALSE) {
      nel <- (nfolds * repeats) + 1
    } else {
      nel <- nfolds + 1
    }
    vseed <- vector(mode = "list", length = nel)
    for (i in 1:nel) vseed[[i]] <- sample.int(n = 100000, 1000)
    vseed[[nel + 1]] <- sample.int(100000, 1)
  }
  return(vseed)
}


run_nested_models <- function(df, models ,
                              formula = NULL,
                              preprocess = NULL,
                              index = NULL,
                              rsample = "cv",
                              nfolds = 10,
                              repeats = NA,
                              tune_length = 5,
                              cpu_cores = 0,
                              metric = ifelse(is.factor(df[, 1]),
                                              "Kappa", "Rsquared"
                              ),
                              seeds = NULL,
                              verbose = T)
{


}


is_factor_income <- function(d, formula = NULL){
  if (is.null(formula)) {
    if (is.factor(d[, 1]) == TRUE) {
      mod <- 1
    } else {
      mod <- 0
    }
  } else {
    if (class(formula) != 'formula') {
      formula =as.formula(formula)
    }
    v = all.vars(formula)[1]
    qc = d %>% dplyr::select(one_of(v)) %>%  dplyr::pull() %>% class
    if (qc == 'factor') {
      mod <- 1
    } else {
      mod <- 0
    }
  }
  return(mod)
}
elpidiofilho/labgeo documentation built on May 14, 2019, 9:35 a.m.