R/bm_Tuning.R

Defines functions .scope .bm_Tuning.check.args bm_Tuning

Documented in bm_Tuning

###################################################################################################
##' @name bm_Tuning
##' @author Frank Breiner, Maya Guéguen, Hélène Blancheteau
##' 
##' @title Tune models parameters
##' 
##' @description This internal \pkg{biomod2} function allows to tune single model parameters and 
##' select more efficient ones based on an evaluation metric.
##' 
##'
##' @param model a \code{character} corresponding to the  algorithm to be tuned, must be either 
##' \code{ANN}, \code{CTA}, \code{DNN}, \code{FDA}, \code{GAM}, \code{GBM}, \code{GLM}, \code{MARS}, 
##' \code{MAXENT}, \code{MAXNET}, \code{RF}, \code{RFd}, \code{SRE}, \code{XGBOOST}
##' @param tuning.fun a \code{character} corresponding to the model function name to be called 
##' through \code{\link[caret]{train}} function for tuning parameters (see \code{\link{ModelsTable}} 
##' dataset)
##' @param do.formula (\emph{optional, default} \code{FALSE}) \cr  
##' A \code{logical} value defining whether formula is to be optimized or not
##' @param do.stepAIC (\emph{optional, default} \code{FALSE}) \cr  
##' A \code{logical} value defining whether variables selection is to be performed for 
##' \code{GLM} and \code{GAM} models or not
##' @param bm.options a \code{\link{BIOMOD.options.default}} or \code{\link{BIOMOD.options.dataset}} 
##' object returned by the \code{\link{bm_ModelingOptions}} function
##' @param bm.format a \code{\link{BIOMOD.formated.data}} or \code{\link{BIOMOD.formated.data.PA}} 
##' object returned by the \code{\link{BIOMOD_FormatingData}} function
##' @param calib.lines (\emph{optional, default} \code{NULL}) \cr
##' A \code{data.frame} object returned by \code{\link{get_calib_lines}} or 
##' \code{\link{bm_CrossValidation}} functions
##' @param metric.eval a \code{character} corresponding to the evaluation metric to be used, must 
##' be either \code{AUC}, \code{Kappa} or \code{TSS} for \code{SRE} only ; \code{auc.val.avg}, 
##' \code{auc.diff.avg}, \code{or.mtp.avg}, \code{or.10p.avg}, \code{AICc} for \code{MAXENT} only ; 
##' \code{ROC} or \code{TSS} for all other models
##' @param metric.AIC a \code{character} corresponding to the AIC metric to be used, must 
##' be either \code{AIC} or \code{BIC}
##' @param weights (\emph{optional, default} \code{NULL}) \cr 
##' A \code{vector} of \code{numeric} values corresponding to observation weights (one per 
##' observation, see Details)
##' @param ctrl.train (\emph{optional, default} \code{NULL}) \cr 
##' A \code{\link[caret]{trainControl}} object
##' @param params.train a \code{list} containing values of model parameters to be tested 
##' (see Details)
##' 
##' 
##' @return 
##' 
##' A \code{\link{BIOMOD.models.options}} object (see \code{\link{bm_ModelingOptions}}) with 
##' optimized parameters
##' 
##' 
##' @details 
##' 
##' \bold{Concerning \code{ctrl.train} parameter :}
##' 
##' Set by default to : \cr
##' 
##' \code{ctrl.train <- caret::trainControl(method = "repeatedcv", repeats = 3, number = 10,} \cr
##' \code{summaryFunction = caret::twoClassSummary,} \cr
##' \code{classProbs = TRUE, returnData = FALSE)} \cr \cr
##' 
##' 
##' \bold{Concerning \code{params.train} parameter :}
##' 
##' All elements of the \code{list} must have names matching \code{model.parameter_name} format, 
##' \code{parameter_name} being one of the parameter of the \code{tuning.fun} function called by 
##' \code{caret} package and that can be found through the \code{\link[caret]{getModelInfo}} 
##' function.
##' 
##' Currently, the available parameters to be tuned are the following :
##' \describe{
##'   \item{ANN}{\code{size}, \code{decay}, \code{bag}}
##'   \item{CTA}{\code{maxdepth}}
##'   \item{DNN}{\code{hidden} , \code{bias}, \code{lambda}, \code{alpha}, \code{lr}, \code{batchsize}, \code{150}}
##'   \item{FDA}{\code{degree}, \code{nprune}}
##'   \item{GAM.gam}{\code{span}, \code{degree}}
##'   \item{GAM.mgcv}{\code{select}, \code{method}}
##'   \item{GBM}{\code{n.trees}, \code{interaction.depth}, \code{shrinkage}, \code{n.minobsinnode}}
##'   \item{MARS}{\code{degree}, \code{nprune}}
##'   \item{MAXENT}{\code{algorithm},\code{tune.args}, \code{parallel}, \code{partitions}, \code{kfolds}, 
##'   \code{user.grp}}
##'   \item{RF}{\code{mtry}}
##'   \item{RFd}{\code{mtry}}
##'   \item{SRE}{\code{quant}}
##'   \item{XGBOOST}{\code{nrounds}, \code{max_depth}, \code{eta}, \code{gamma}, 
##'   \code{colsampl_bytree}, \code{min_child_weight}, \code{subsample}}
##' }
##' 
##' 
##' The \code{\link{expand.grid}} function is used to build a \code{matrix} containing all 
##' combinations of parameters to be tested.
##' 
##' @note 
##' \itemize{
##'   \item No tuning for \code{GLM} and \code{MAXNET}
##'   \item \code{MAXENT} is tuned through \code{\link[ENMeval]{ENMevaluate}} function which is
##'   calling either :
##'   \itemize{
##'     \item maxnet (by defining \code{MAXENT.algorithm = 'maxnet'}) (\emph{default})
##'     \item Java version of Maxent defined in \pkg{dismo} package (by defining 
##'     \code{MAXENT.algorithm = 'maxent.jar'})
##'   }
##'   \item \code{DNN} is tuned through \code{\link[cito]{tune}} function. 
##'   The values include in \code{params.train} are the lower or upper range which hyperparameters are sampled.
##'   If there is only one value, the hyperparameter is fixed by biomod2 (inclunding the width and depth of \code{hidden} parameters.) 
##'   \item \code{SRE} is tuned through \code{\link{bm_SRE}} function
##'   \item All other models are tuned through \code{\link[caret]{train}} function
##'   \item No optimization of formula for \code{DNN}, \code{MAXENT}, \code{MAXNET}, \code{SRE} and 
##'   \code{XGBOOST}
##'   \item No interaction included in formula for \code{CTA}
##'   \item Variables selection only for \code{GAM.gam} and \code{GLM}
##' }
##' 
##' 
##' @seealso \code{\link[caret]{trainControl}}, \code{\link[caret]{train}}, 
##' \code{\link[ENMeval]{ENMevaluate}}, 
##' \code{\link{ModelsTable}}, \code{\link{BIOMOD.models.options}}, 
##' \code{\link{bm_ModelingOptions}}, \code{\link{BIOMOD_Modeling}}
##' @family Secondary functions
##' 
##' 
##' @examples
##' library(terra)
##' 
##' # Load species occurrences (6 species available)
##' data(DataSpecies)
##' head(DataSpecies)
##' 
##' # Select the name of the studied species
##' myRespName <- 'GuloGulo'
##' 
##' # Get corresponding presence/absence data
##' myResp <- as.numeric(DataSpecies[, myRespName])
##' 
##' # Get corresponding XY coordinates
##' myRespXY <- DataSpecies[, c('X_WGS84', 'Y_WGS84')]
##' 
##' # Load environmental variables extracted from BIOCLIM (bio_3, bio_4, bio_7, bio_11 & bio_12)
##' data(bioclim_current)
##' myExpl <- terra::rast(bioclim_current)
##' 
##' \dontshow{
##' myExtent <- terra::ext(0,30,45,70)
##' myExpl <- terra::crop(myExpl, myExtent)
##' }
##' 
##' # --------------------------------------------------------------- #
##' # Format Data with true absences
##' myBiomodData <- BIOMOD_FormatingData(resp.name = myRespName,
##'                                      resp.var = myResp,
##'                                      resp.xy = myRespXY,
##'                                      expl.var = myExpl)
##' 
##' 
##' # --------------------------------------------------------------- #
##' # List of all models currently available in `biomod2` (and their related package and function)
##' # Some of them can be tuned through the `train` function of the `caret` package 
##' # (and corresponding training function to be used is indicated)
##' data(ModelsTable)
##' ModelsTable
##' 
##' allModels <- c('ANN', 'CTA', 'FDA', 'GAM', 'GBM', 'GLM'
##'                , 'MARS', 'MAXENT', 'MAXNET', 'RF', 'SRE', 'XGBOOST')
##' 
##' # default parameters
##' opt.d <- bm_ModelingOptions(data.type = 'binary',
##'                             models = allModels,
##'                             strategy = 'default')
##'                             
##' # tune parameters for Random Forest model
##' tuned.rf <- bm_Tuning(model = 'RF',
##'                       tuning.fun = 'rf', ## see in ModelsTable
##'                       do.formula = FALSE,
##'                       bm.options = opt.d@options$RF.binary.randomForest.randomForest,
##'                       bm.format = myBiomodData)
##' tuned.rf
##' 
##' \dontrun{
##' # tune parameters for GAM (from mgcv package) model
##' tuned.gam <- bm_Tuning(model = 'GAM',
##'                        tuning.fun = 'gam', ## see in ModelsTable
##'                        do.formula = TRUE,
##'                        do.stepAIC = TRUE,
##'                        bm.options = opt.d@options$GAM.binary.mgcv.gam,
##'                        bm.format = myBiomodData)
##' tuned.gam
##' }                  
##' 
##' 
##' 
##' @importFrom foreach foreach %do% %:%
##' @importFrom stats aggregate formula 
##' @importFrom PresenceAbsence optimal.thresholds presence.absence.accuracy
##' 
##' 
##' @export
##' 
##' 
###################################################################################################


bm_Tuning <- function(model,
                      tuning.fun,
                      do.formula = FALSE,
                      do.stepAIC = FALSE,
                      bm.options,
                      bm.format,
                      calib.lines = NULL,
                      metric.eval = 'TSS',
                      metric.AIC = 'AIC',
                      weights = NULL,
                      ctrl.train = NULL,
                      params.train = list(ANN.size = c(2, 4, 6, 8),
                                          ANN.decay = c(0.01, 0.05, 0.1),
                                          ANN.bag = FALSE,
                                          DNN.hidden = list(depth = 3, width = 100),
                                          DNN.bias = TRUE,
                                          DNN.lambda = 0.001,
                                          DNN.alpha = 1, 
                                          DNN.lr = c(0.0001, 0.1),
                                          DNN.batchsize = 100,
                                          DNN.epochs = 150,
                                          FDA.degree = 1:2, 
                                          FDA.nprune = 2:25,
                                          GAM.select = c(TRUE, FALSE),
                                          GAM.method = c('GCV.Cp', 'GACV.Cp', 'REML', 'P-REML', 'ML', 'P-ML'),
                                          GAM.span = c(0.3, 0.5, 0.7),
                                          GAM.degree = 1,
                                          GBM.n.trees = c(500, 1000, 2500),
                                          GBM.interaction.depth = seq(2, 8, by = 3),
                                          GBM.shrinkage = c(0.001, 0.01, 0.1),
                                          GBM.n.minobsinnode = 10,
                                          MARS.degree = 1:2, 
                                          MARS.nprune = 2:max(21, 2 * ncol(bm.format@data.env.var) + 1),
                                          MAXENT.algorithm = 'maxnet',
                                          MAXENT.parallel = TRUE,
                                          MAXENT.tune.args = list(rm = seq(0.5, 1, 0.5), fc = c('L')),
                                          MAXENT.partitions = 'randomkfold',
                                          MAXENT.kfolds = 10,
                                          MAXENT.user.grp = NULL,
                                          RF.mtry = 1:min(10, ncol(bm.format@data.env.var)),
                                          RFd.mtry = 1:min(10, ncol(bm.format@data.env.var)),
                                          SRE.quant = c(0, 0.0125, 0.025, 0.05, 0.1),
                                          XGBOOST.nrounds = 50,
                                          XGBOOST.max_depth = 1,
                                          XGBOOST.eta = c(0.3, 0.4),
                                          XGBOOST.gamma = 0,
                                          XGBOOST.colsample_bytree = c(0.6, 0.8),
                                          XGBOOST.min_child_weight = 1,
                                          XGBOOST.subsample = 0.5))
{
  ## 0. Check arguments ---------------------------------------------------------------------------
  args <- .bm_Tuning.check.args(model = model, tuning.fun = tuning.fun
                                , do.formula = do.formula, do.stepAIC = do.stepAIC
                                , bm.options = bm.options, bm.format = bm.format
                                , metric.eval = metric.eval, metric.AIC = metric.AIC
                                , weights = weights, params.train = params.train)
  for (argi in names(args)) { assign(x = argi, value = args[[argi]]) }
  rm(args)
  
  ## LOOP OVER CALIB LINES 
  if (is.null(calib.lines)) {
    calib.lines <- data.frame(rep(TRUE, length(bm.format@data.species)))
    colnames(calib.lines) <- "_allData_allRun"
  }
  ## LOOP OVER PA DATASETS
  if (inherits(bm.format, "BIOMOD.formated.data.PA")) {
    PA.lines <- colnames(bm.format@PA.table)
  } else {
    PA.lines <- "_allData_allRun"
  }
  ## LOOP OVER ALL COMBINED
  test_forPA <- sapply(PA.lines, function(xx) any(grepl(xx, colnames(calib.lines))))
  if (inherits(bm.format, "BIOMOD.formated.data.PA") && 
      sum(test_forPA) == length(PA.lines)) {
    combi <- expand.grid(PA = NA, calib = colnames(calib.lines), stringsAsFactors = FALSE)
    combi$PA <- sapply(combi$calib, function(ii) strsplit(ii, "_")[[1]][2])
    if (length(which(combi$calib == "_allData_allRun")) > 0) {
      combi$PA[which(combi$calib == "_allData_allRun")] <- "_allData_allRun"
    }
    combi$name_dataset <- combi$calib
  } else {
    combi <- expand.grid(PA = PA.lines, calib = colnames(calib.lines), stringsAsFactors = FALSE)
    combi$name_dataset <- sapply(1:nrow(combi), function(ii) {
      tmp1 <- combi$PA[ii]
      tmp2 <- combi$calib[ii]
      if (!grepl("PA", tmp2) || (grepl("PA", tmp2) && grepl(tmp1, tmp2))) {
        if (tmp1 == "_allData_allRun") tmp1 <- "allData"
        tmp2 <- strsplit(tmp2, "_")[[1]][3]
        return(paste0("_", tmp1, "_", tmp2))
      } else {
        return(NA)
      }
    })
    combi <- na.exclude(combi)
  }
  
  
  if (model != "MAXENT" && is.null(ctrl.train)) {
    ## check control
    if (bm.format@data.type == "binary") {
      ctrl.train <- caret::trainControl(method = "repeatedcv",
                                        repeats = 3,
                                        number = 10,
                                        summaryFunction = caret::twoClassSummary,
                                        classProbs = TRUE,
                                        returnData = FALSE)
    } else {
      ctrl.train <- caret::trainControl(method = "repeatedcv",
                                        repeats = 3,
                                        number = 10,
                                        summaryFunction = caret::defaultSummary,
                                        classProbs = FALSE,
                                        returnData = FALSE)
    }
  }
  
  
  argsval <- foreach(PA.i = combi$PA, calib.i = combi$calib, dataset.i = combi$name_dataset) %do%
    {
      cat(paste0("\n\t\t> Dataset ", dataset.i))
      
      if(inherits(bm.options, "BIOMOD.options.dataset") && !is.null(bm.options@args.values[[dataset.i]])){
        argstmp <- bm.options@args.values[[dataset.i]]
      } else {
        argstmp <- bm.options@args.default
      }
      
      
      if (model == "MAXNET") {
        warning("No tuning available for that model. Sorry.")
      } else if (model == "XGBOOST") {
        warning("Due to upgrade of xgboost package, currently, no tuning available for that model. Sorry.")
      } else {
        ## 1. SPECIFIC CASE OF MAXENT OR SRE ------------------------------------------------------------
        if (model %in% c("MAXENT", "SRE", "DNN")) {
          cat("\n\t\t\t> Tuning parameters...")
          
          ## create dataset ---------------------------------------------------------
          mySpExpl <- get_species_data(bm.format)
          mySpExpl[["_allData_allRun"]] <- TRUE
          mySpExpl <- mySpExpl[which(calib.lines[, calib.i] == TRUE), ]
          mySpExpl <- mySpExpl[which(mySpExpl[, PA.i] == TRUE), ]
          
          ## SRE case
          myResp <- mySpExpl[, 1]
          myExpl <- mySpExpl[, 4:(3 + ncol(bm.format@data.env.var))]
          
          ## MAXENT case
          if (params.train$MAXENT.algorithm == "maxnet") {
            mySpExpl[["_allData_allRun"]] <- NULL
            mySpExpl[, 1] <- ifelse(mySpExpl[, 1] == 1 & !is.na(mySpExpl[, 1]), 1, 0)
            mySpExpl <- mySpExpl[, 1:(3 + ncol(bm.format@data.env.var))]
          }
          
          if (model == "MAXENT") { # ------------------------------------------#
            
            try(tune.MAXENT <- ENMeval::ENMevaluate(occs = mySpExpl[mySpExpl[, 1] == 1 & !is.na(mySpExpl[, 1]), ],
                                                    bg = mySpExpl[mySpExpl[, 1] == 0 | is.na(mySpExpl[, 1]), ],
                                                    tune.args = params.train$MAXENT.tune.args,
                                                    algorithm = params.train$MAXENT.algorithm,
                                                    partitions = params.train$MAXENT.partitions,
                                                    partition.settings = list(kfolds = params.train$MAXENT.kfolds),
                                                    user.grp = params.train$MAXENT.user.grp,
                                                    doClamp = TRUE, ## allow to change or not ?
                                                    parallel = params.train$MAXENT.parallel,
                                                    numCores = NULL, ## default to 1 or NULL (all available cores used then) ?
                                                    categoricals = NULL))
            
            if (!is.null(tune.MAXENT)) {
              if (metric.eval == 'auc.val.avg') {
                tmp <- which.max(tune.MAXENT@results[, metric.eval])
              } else {
                tmp <- which.min(tune.MAXENT@results[, metric.eval])
              }
              argstmp$linear <- grepl("L", tune.MAXENT@results[tmp, "fc"])
              argstmp$quadratic <- grepl("Q", tune.MAXENT@results[tmp, "fc"])
              argstmp$hinge <- grepl("H", tune.MAXENT@results[tmp, "fc"])
              argstmp$product <- grepl("P", tune.MAXENT@results[tmp, "fc"])
              argstmp$threshold <- grepl("T", tune.MAXENT@results[tmp, "fc"])
              argstmp$betamultiplier <- tune.MAXENT@results[tmp, "rm"]
            }
          } else if (model == "SRE") { # -------------------------------------- #
            myResp <- sapply(myResp, function(xx) ifelse(xx == 0 || is.na(xx), 0, 1))
            
            tune.SRE <- foreach(rep = 1:ctrl.train$repeats, .combine = "rbind") %do%
              {
                fold <- dismo::kfold(myResp, by = myResp, k = ctrl.train$number) ## by = to keep prevalence
                RES <- foreach (quant = params.train$SRE.quant, .combine = "rbind") %:%
                  foreach (i = 1:ctrl.train$number, .combine = "rbind") %do%
                  {
                    DATA <- cbind(1:sum(fold == i)
                                  , myResp[fold == i]
                                  , bm_SRE(resp.var = myResp[fold != i],
                                           expl.var = myExpl[fold != i, ],
                                           new.env = myExpl[fold == i, ],
                                           quant = quant,
                                           do.extrem = FALSE))
                    thresh <- as.vector(optimal.thresholds(DATA, opt.methods = 3)[2], mode = "numeric")
                    RES <- presence.absence.accuracy(DATA, threshold = thresh)
                    return(data.frame(RES, quant = quant))
                  }
                return(data.frame(RES, rep = rep))
              }
            tune.SRE$TSS <- tune.SRE$sensitivity + tune.SRE$specificity - 1
            tmp <- aggregate(tune.SRE[, c("sensitivity", "specificity", "Kappa", "AUC", "TSS")]
                             , by = list(quant = tune.SRE$quant), mean)
            argstmp$quant <- tmp[which.max(tmp[, metric.eval]), "quant"]
            
          } else { ## DNN case  # -------------------------------------- #

            ## Preparation of data 
            scale_data <- scale(myExpl)
            argstmp$data <- cbind(myResp, as.data.frame(scale_data))
            argstmp$formula <- as.formula( "myResp ~.")
            
            ## Preparation of the tune parameters
            params.train = params.train[grep(paste0(model,"\\."), names(params.train))]
            for (param.n in names(params.train)) {
              real.name <- unlist(strsplit(param.n, split = "\\."))[2]
              
              if (real.name == "hidden") {
                if (length(params.train$DNN.hidden$depth) == 1 &&
                    length(params.train$DNN.hidden$width) == 1) {
                  argstmp$hidden <- rep(params.train$DNN.hidden$width, params.train$DNN.hidden$depth)
                } else if (length(params.train$DNN.hidden$depth) == 1) {
                  argstmp$hidden <- cito::tune(params.train$DNN.hidden$width, fixed = 'depth')
                } else if (length(params.train$DNN.hidden$width) == 1) {
                  argstmp$hidden <- cito::tune(params.train$DNN.hidden$depth, fixed = 'width')
                } else {
                  argstmp$hidden <- cito::tune(params.train$DNN.hidden$depth, params.train$DNN.hidden$width)
                }
              } else {
                if (length(params.train[[param.n]]) == 1) {
                  argstmp[[real.name]] <- params.train[[param.n]]
                } else {
                  argstmp[[real.name]] <- cito::tune(values = params.train[[param.n]])
                }
              }
            }
            argstmp$tuning <- cito::config_tuning(steps = 5)
            
            ## Tune model
            tune.DNN <- do.call(cito::dnn, argstmp)
            
            ## Keep the tuned parameters
            argstmp$hidden = tune.DNN$model_properties$hidden
            argstmp$bias = tune.DNN$model_properties$bias
            argstmp$lambda = tune.DNN$training_properties$lambda
            argstmp$alpha = tune.DNN$training_properties$alpha 
            argstmp$lr = as.numeric(tune.DNN$training_properties$lr)
            argstmp$batchsize = tune.DNN$training_properties$batchsize
            argstmp$epochs = tune.DNN$training_properties$epochs
          }
          
        } else {
          ## 2. ALL OTHER MODELS ------------------------------------------------------------------------
          
          ## create dataset
          mySpExpl <- get_species_data(bm.format)
          mySpExpl[["_allData_allRun"]] <- 1
          current.weights <- weights[which(calib.lines[, calib.i] == TRUE & mySpExpl[, PA.i] == TRUE)]
          mySpExpl <- mySpExpl[which(calib.lines[, calib.i] == TRUE), ]
          mySpExpl <- mySpExpl[which(mySpExpl[, PA.i] == TRUE), ]
          
          if (bm.format@data.type == "binary") {
            mySpExpl[, 1] <- as.factor(ifelse(mySpExpl[, 1] == 1 & !is.na(mySpExpl[, 1]), "presence", "absence"))
          } 
          myResp <- mySpExpl[, 1]
          myExpl <- mySpExpl[, 4:(3 + ncol(bm.format@data.env.var))]
          
          ## run tuning -------------------------------------------------------
          
          cmd.tuning <- "caret::train(x = myExpl, y = myResp, method = tuning.fun, tuneGrid = tuning.grid,"
          if (bm.format@data.type == "binary") {
            cmd.tuning <- paste0(cmd.tuning, " trControl = ctrl.train, metric = 'ROC',")
          } else {
            cmd.tuning <- paste0(cmd.tuning, " trControl = ctrl.train, metric ='", metric.eval, "',")
          }
          
          if (tuning.fun %in% c("fda", "rpart")) { ## add weights
            cmd.tuning <- paste0(cmd.tuning, " weights = current.weights,")
          }
          if (tuning.fun == "avNNet") {
            maxit = 500
            maxnwts = 10 * (ncol(myExpl) + 1) + 10 + 1
            ## Automatically standardize data prior to modeling and prediction
            cmd.tuning <- paste0(cmd.tuning, " preProc = c('center', 'scale'), linout = TRUE, trace = FALSE,")
            cmd.tuning <- paste0(cmd.tuning, " MaxNWts.ANN = maxnwts, maxit = maxit))")
          } else if (tuning.fun %in% c("earth", "bam", "fda", "rpart", "glm")) { ## remove verbose
            cmd.tuning <- paste0(cmd.tuning, " tuneLength = tuning.length))")
          } else {
            cmd.tuning <- paste0(cmd.tuning, " tuneLength = tuning.length, verbose = FALSE))")
          }
          
          if (model != "GLM") {
            cat("\n\t\t\t> Tuning parameters...")
            eval(parse(text = paste0("try(tuned.mod <- ", cmd.tuning)))
            
            ## GET tuned parameter values -------------------------------------------------------------
            if (!is.null(tuned.mod)) {
              tmp <- tuned.mod$results
              if (bm.format@data.type == 'binary') { tmp$TSS <- tmp$Sens + tmp$Spec - 1 }
              
              if (metric.eval == "RMSE") {
                selected <- which.min(tmp[, metric.eval])
              } else {
                selected <- which.max(tmp[, metric.eval])
              }
              
              if (model == "XGBOOST") {
                ## TO CHECK
                # for (param in train.params$params) {
                #   if (is.null(argstmp[[param]])){
                #     argstmp$params[[param]] <- tmp[selected, param]
                #   } else {
                #     argstmp[[param]] <- tmp[selected, param]}
                # }
              } else {
                for (param in train.params$params) {
                  argstmp[[param]] <- tmp[selected, param]
                }
              }
              
              tuning.form <- tuning.grid[selected, ]
              
              if (model %in% c("RF", "RFd")) {
                tuning.form <- data.frame(mtry = tuning.grid[selected, ])
              }
              
              if (model == "CTA") {
                tuning.fun = "rpart2"
                eval(parse(text = paste0("try(tuned.mod <- ", cmd.tuning)))
                tuning.fun = "rpart" # needed to reset the tuning function in non parallel mode
                if (!is.null(tuned.mod)) {
                  tmp = tuned.mod$results
                  if (bm.format@data.type == 'binary') { tmp$TSS = tmp$Sens + tmp$Spec - 1 }
                  if(metric.eval == "RMSE"){
                    argstmp[["maxdepth"]] <- tmp[which.min(tmp[, metric.eval]), "maxdepth"]
                  } else {
                    argstmp[["maxdepth"]] <- tmp[which.max(tmp[, metric.eval]), "maxdepth"]
                  }
                }
              }
            }
          } else { tuning.form <- tuning.grid }
          
          ## run formula selection ------------------------------------------------------------------
          if (do.formula) {
            cat("\n\t\t\t> Tuning formula...")
            
            typ.vec = c('simple', 'quadratic', 'polynomial', 's_smoother')
            max.intlev <- min(ncol(myExpl) - 1, 3)
            if (model == "CTA") { max.intlev <- 0}
            if (model == "FDA") { 
              typ.vec = c('simple', 's_smoother') 
              max.intlev <- 0
              argstmp$method <- eval(parse(text = paste0("quote(", argstmp$method, ")")))
            }
            if (model %in% c("RF", "RFd", "GBM")) { typ.vec = c('simple', 'quadratic', 'polynomial') }
            
            if (bm.format@data.type == "binary") {
              myObs <- as.numeric(factor(myResp, labels = c("absence" = 0, "presence" = 1))) - 1
            } else {
              myObs <- myResp
            }
            
            data <- cbind(myExpl, resp = myObs)
            if (bm.format@data.type == "binary" && model %in% c("RF", "RFd")) {
              data <- data %>% mutate_at("resp", factor)
              argstmp$strata <- data[, "resp"]
              argstmp$sampsize <- unlist(ifelse(!is.null(argstmp$sampsize), list(argstmp$sampsize), nrow(data)))
            }
            argstmp$data <- data
            
            
            TMP <- foreach (typ = typ.vec, .combine = "rbind") %:%
              foreach (intlev = 0:max.intlev, .combine = "rbind") %do%
              {
                tuned.form <- NULL
                model.call <- paste0(bm.options@package, "::", bm.options@func)
                formu <- bm_MakeFormula(resp.name = "resp", expl.var = myExpl, type = typ, interaction.level = intlev)
                argstmp$formula <- formu
                argstmp <- argstmp[c("formula", "data", names(argstmp)[which(!(names(argstmp) %in% c("formula", "data")))])]
                tuned.form <- try(do.call(eval(parse(text = model.call)), argstmp), silent = TRUE)
                
                if (!inherits(tuned.form, "try-error") && !inherits(tuned.form, "data.frame")) {
                  fit <- predict(tuned.form)
                  if (bm.format@data.type %in% c("multiclass","ordinal")) {
                    if (model %in% c("GAM", "GLM")) { 
                      fit <- .threshold_ordinal(myObs, fit, metric.bm)$fit_factor 
                    } else if (model %in% c("CTA", "MARS")) {
                      fit <- predict(tuned.form, type = "class")
                    }
                  }
                  if (bm.format@data.type == "binary" && model %in% c("CTA", "FDA", "RF", "RFd")) {
                    fit <- predict(tuned.form, type = "class")
                    fit <- as.numeric(fit) - 1
                  }
                  if (model == "MARS") {
                    if (bm.format@data.type %in% c("multiclass","ordinal")) {
                      fit <- as.factor(fit[, 1])
                    } else if (bm.format@data.type == "binary") {
                      fit <- as.numeric(fit[, 1])
                    }
                  }
                  tmp <- bm_FindOptimStat(metric.bm, obs = myObs, fit = fit)[, "best.stat"]
                  formu <- paste0(bm.format@sp.name, "~" , as.character(formu)[3])
                  return(data.frame(stat = tmp, type = typ, interaction.level = intlev, formula = formu))
                }
              }
            
            if (metric.eval == "RMSE") {
              argstmp$formula <- formula(TMP[which.min(TMP[, 'stat']), "formula"])
            } else {
              argstmp$formula <- formula(TMP[which.max(TMP[, 'stat']), "formula"])
            }
          } else {
            if (length(argstmp$formula) <= 1 && model %in% c("CTA", "FDA", "GAM", "GBM", "GLM")) {
              argstmp$formula <- bm_MakeFormula(resp.name = bm.format@sp.name
                                                , expl.var = myExpl
                                                , type = 'simple'
                                                , interaction.level = 0)
            }
          }
          
          ## run variable selection -----------------------------------------------------------------
          if (do.stepAIC &&
              (model == "GLM" || 
               (model == "GAM" && bm.options@package == "gam")) ) {
            cat("\n\t\t\t> Tuning variables (AIC)...")
            
            if (model == "GLM") {
              glmStart <- glm(as.formula(paste0(bm.format@sp.name, " ~ 1")),
                              data = mySpExpl, 
                              family = argstmp$family, 
                              control = argstmp$control, 
                              weights = current.weights,
                              mustart = rep(ifelse(!is.null(argstmp$mustart) & nchar(argstmp$mustart) > 0
                                                   , argstmp$mustart, 0.5), length(myResp)),
                              model = TRUE)
              
              try(tuned.AIC <- MASS::stepAIC(glmStart,
                                             scope = list(upper = argstmp$formula, lower = ~1), ##upper = (sub(".*~", "~", argstmp$formula))
                                             k = criteria.AIC,
                                             direction = "both",
                                             trace = FALSE,
                                             steps = 10000))
              
              if (!is.null(tuned.AIC)) { argstmp$formula <- deparse(tuned.AIC$formula) }
              
            } else if (model == "GAM") {
              gamStart <- do.call(gam::gam, list(formula = as.formula(paste0(bm.format@sp.name, " ~ 1")),
                                                 data = mySpExpl, 
                                                 family = argstmp$family,
                                                 control = argstmp$control,
                                                 weights = current.weights))          
              tuned.AIC <- NULL
              try(tuned.AIC <-gam::step.Gam(gamStart,
                                            scope = .scope(head(myExpl), "gam::s", 6),
                                            direction = "both",
                                            trace = FALSE))
              if (!is.null(tuned.AIC)) { argstmp$formula <- formula(deparse(tuned.AIC$formula)) }
            }
          }
        }
      }
      return(argstmp)
    }
  names(argsval) <- combi$name_dataset
  return(argsval)
}


###################################################################################################

.bm_Tuning.check.args <- function(model, tuning.fun, do.formula, do.stepAIC
                                  , bm.options, bm.format, metric.eval, metric.AIC
                                  , weights = NULL, params.train)
{
  ## check model --------------------------------------------------------------
  .fun_testIfIn(TRUE, "model", model, c("ANN", "CTA", "DNN", "FDA", "GAM", "GBM", "GLM"
                                        , "MARS", "MAXENT", "MAXNET", "RF", "RFd", "SRE", "XGBOOST"))
  
  ## check namespace ----------------------------------------------------------
  if (!isNamespaceLoaded("caret")) { 
    if (!requireNamespace('caret', quietly = TRUE)) stop("Package 'caret' not found")
  }
  if (model == "MAXENT" && !isNamespaceLoaded('ENMeval')) { 
    if (!requireNamespace('ENMeval', quietly = TRUE)) stop("Package 'ENMeval' not found")
  } else if (model == "SRE" && !isNamespaceLoaded('dismo')) { 
    if (!requireNamespace('dismo', quietly = TRUE)) stop("Package 'dismo' not found")
  }
  if (do.formula == TRUE) {
    if (!requireNamespace('gam', quietly = TRUE)) stop("Package 'gam' not found")
  }
  if (do.stepAIC == TRUE) {
    if (!requireNamespace('MASS', quietly = TRUE)) stop("Package 'MASS' not found")
  }
  
  ## check bm.options ----------------------------------------------------------
  .fun_testIfInherits(TRUE, "bm.options", bm.options, c("BIOMOD.options.default", "BIOMOD.options.dataset"))
  ## check bm.format ----------------------------------------------------------
  .fun_testIfInherits(TRUE, "bm.format", bm.format, c("BIOMOD.formated.data", "BIOMOD.formated.data.PA"))
  ## check params.train -------------------------------------------------------
  params.train_init = list(ANN.size = c(2, 4, 6, 8),
                           ANN.decay = c(0.01, 0.05, 0.1),
                           ANN.bag = FALSE,
                           DNN.hidden = list(depth = 3, width = 100),
                           DNN.bias = TRUE,
                           DNN.lambda = 0.001,
                           DNN.alpha = 1, 
                           DNN.lr = c(0.0001, 0.1),
                           DNN.batchsize = 100,
                           DNN.epochs = 150,
                           FDA.degree = 1:2, 
                           FDA.nprune = 2:25,
                           GAM.select = c(TRUE, FALSE),
                           GAM.method = c('GCV.Cp', 'GACV.Cp', 'REML', 'P-REML', 'ML', 'P-ML'),
                           GAM.span = c(0.3, 0.5, 0.7),
                           GAM.degree = 1,
                           GBM.n.trees = c(500, 1000, 2500),
                           GBM.interaction.depth = seq(2, 8, by = 3),
                           GBM.shrinkage = c(0.001, 0.01, 0.1),
                           GBM.n.minobsinnode = 10,
                           MARS.degree = 1:2, 
                           MARS.nprune = 2:max(21, 2 * ncol(bm.format@data.env.var) + 1),
                           MAXENT.algorithm = 'maxnet',
                           MAXENT.tune.args = list(rm = seq(0.5, 1, 0.5), fc = c("L")),
                           MAXENT.parallel = TRUE,
                           MAXENT.partitions = 'randomkfold',
                           MAXENT.kfolds = 10,
                           MAXENT.user.grp = NULL,
                           RF.mtry = 1:min(10, ncol(bm.format@data.env.var)),
                           RFd.mtry = 1:min(10, ncol(bm.format@data.env.var)),
                           SRE.quant = c(0, 0.0125, 0.025, 0.05, 0.1),
                           XGBOOST.nrounds = 50,
                           XGBOOST.max_depth = 1,
                           XGBOOST.eta = c(0.3, 0.4),
                           XGBOOST.gamma = 0,
                           XGBOOST.colsample_bytree = c(0.6, 0.8),
                           XGBOOST.min_child_weight = 1,
                           XGBOOST.subsample = 0.5)
  
  for (i in names(params.train)) {
    if (i %in% names(params.train_init)) {
      params.train_init[[i]] = params.train[[i]]
    }
  }
  params.train = params.train_init
  
  ## check evaluation metric --------------------------------------------------
  metric.bm <- NULL
  if (model == "MAXENT") {
    .fun_testIfIn(TRUE, "metric.eval", metric.eval, c("auc.val.avg", "auc.diff.avg", "or.mtp.avg", "or.10p.avg", "AICc"))
    .fun_testIfIn(TRUE, "params.train$MAXENT.algorithm", params.train$MAXENT.algorithm, c("maxent.jar", "maxnet"))
  } else if (model == "SRE") {
    .fun_testIfIn(TRUE, "metric.eval", metric.eval, c("AUC", "Kappa", "TSS"))
    sapply(params.train$SRE.quant, FUN = .fun_testIf01, test = TRUE, objName =  "params.train$SRE.quant")
  } else if (bm.format@data.type == "binary") {
    .fun_testIfIn(TRUE, "metric.eval", metric.eval, c("ROC", "TSS"))
    metric.bm <- metric.eval
  } else if (bm.format@data.type %in% c("multiclass","ordinal")) {
    .fun_testIfIn(TRUE, "metric.eval", metric.eval, c("Accuracy"))
    metric.bm <- "Accuracy"
  } else {
    .fun_testIfIn(TRUE, "metric.eval", metric.eval, c("RMSE", "Rsquared"))
    metric.bm <- ifelse(metric.eval == "RMSE", "RMSE", "Rsquared")
  }
  
  ## check weights ------------------------------------------------------------
  if (model %in% c("CTA", "FDA", "GAM", "GLM") && is.null(weights)) { 
    weights = rep(1, length(bm.format@data.species))
  }
  
  ## get tuning function and parameters ---------------------------------------
  all.fun <- c('avNNet', 'rpart', 'rpart2', 'fda', 'gamLoess', 'bam', 'gam', 'gbm', 'glm', 'earth', 'rf', 'xgbTree')
  all.params <- foreach (fi = all.fun) %do% {
    params <- caret::getModelInfo(model = fi)
    return(list(pkg = params[[fi]]$library, params = params[[fi]]$parameters$parameter))
  }
  names(all.params) <- all.fun
  
  .fun_testIfIn(TRUE, "tuning.fun", tuning.fun, c(all.fun, "bm_SRE", "ENMevaluate", "maxnet", "tune"))
  .fun_testIfIn(TRUE, "tuning.fun", tuning.fun, unique(ModelsTable$train[which(ModelsTable$model == model)]))
  train.params <- all.params[[tuning.fun]]
  
  ## get tuning grid through params.train -------------------------------------
  tuning.grid <- NULL
  if (model %in% c("ANN", "FDA", "GAM", "GBM", "MARS", "RF", "RFd", "XGBOOST")) {
    if (!(model == "GAM")) {
      params.train = params.train[grep(paste0(model,"\\."), names(params.train))]
      .fun_testIfIn(TRUE, "names(params.train)", names(params.train), paste0(model, ".", train.params$params))
    } else if (tuning.fun == "gamLoess"){
      params.train = params.train[c('GAM.span', "GAM.degree")]
    } else {
      params.train = params.train[c('GAM.select', 'GAM.method')]
    }
    names(params.train) = sub(model, "", names(params.train))
    tuning.grid <- do.call(expand.grid, params.train)
  }
  
  ## get tuning length --------------------------------------------------------
  tuning.length <- 1
  if (model == "CTA") tuning.length <- 30
  if (model == "RF" | model == "RFd") tuning.length <- min(30, ncol(bm.format@data.env.var))
  
  ## Do formula ---------------------------------------------------------------
  if (model %in% c("DNN", "MAXENT", "MAXNET", "SRE", "XGBOOST") && do.formula == TRUE) {
    do.formula <- FALSE 
    cat("\n No optimization of formula for", model)
  }
  
  ## get criteria -------------------------------------------------------------
  if (do.stepAIC &&
      (model == "GLM" || (model == "GAM" && bm.options@package == "gam")) && 
      !bm.format@data.type %in% c("multiclass", "ordinal", "relative")) {
    .fun_testIfIn(TRUE, "metric.AIC", metric.AIC, c("AIC", "BIC"))
    if (metric.AIC == "AIC") { criteria.AIC <- 2 }
    if (metric.AIC == "BIC")  { criteria.AIC <- log(ncol(bm.format@data.env.var)) }
  } else {
    do.stepAIC <- FALSE
    criteria.AIC <- NA
  }
  
  return(list(weights = weights
              , do.formula = do.formula
              , do.stepAIC = do.stepAIC
              , criteria.AIC = criteria.AIC
              , tuning.fun = tuning.fun
              , train.params = train.params
              , tuning.length = tuning.length
              , tuning.grid = tuning.grid
              , params.train = params.train
              , metric.bm = metric.bm))
}


###################################################################################################

.scope <- function(enviroTrain, Smoother, degree)
{
  XXX <- enviroTrain
  deg <- degree
  vnames <- names(XXX[])
  step.list <- as.list(vnames)
  names(step.list) <- vnames
  NbVar <- dim(enviroTrain)[2]
  i <- 1
  while (i <= NbVar)
  {
    vname <- names(XXX)[i]
    # loops through independent variable names
    junk <- paste0("1 + ", vname)
    # minimum scope
    if (is.numeric(XXX[, i])) {
      junk <- c(junk, paste0(Smoother, "(", vname, ",", deg, ")"))
      junk <- eval(parse(text = paste("~", paste(junk, collapse = "+"))))
    } else if (is.factor(XXX[, i])) {
      junk <- c(junk, vname)
      junk <- eval(parse(text = paste("~", paste(junk, collapse = "+"))))
    }
    step.list[[vname]] <- junk
    i <- i + 1
  }
  
  return(step.list)
}

Try the biomod2 package in your browser

Any scripts or data that you put into this service are public.

biomod2 documentation built on Jan. 30, 2026, 5:08 p.m.