#' tuneRanger
#'
#' Automatic tuning of random forests from the \code{\link[ranger]{ranger}} package with one line of code.
#'
#' @param task The mlr task created by \code{\link[mlr]{makeClassifTask}}, \code{\link[mlr]{makeRegrTask}} or \code{\link[mlr]{makeSurvTask}}.
#' @param measure Performance measure to evaluate/optimize. Default is the Brier score for classification and the mean squared error (mse) for regression. Can be changed to accuracy, AUC or logarithmic loss by setting it to \code{list(acc)}, \code{list(auc)} or \code{list(logloss)} (see the examples below). Other possible performance measures from mlr can be looked up in the \href{https://mlr.mlr-org.com/articles/tutorial/measures.html}{mlr tutorial}.
#' @param iters Number of iterations. Default is 70.
#' @param iters.warmup Number of iterations for the warmup. Default is 30.
#' @param time.budget Running time budget in seconds. Note that the actual mbo run can take longer, since the termination condition is only checked after each iteration. The default \code{NULL} means that no time budget is set.
#' @param num.threads Number of threads. Default is number of CPUs available.
#' @param num.trees Number of trees. Default is 1000.
#' @param parameters Optional list of fixed named parameters that should be passed to \code{\link[ranger]{ranger}}.
#' @param tune.parameters Optional character vector of parameters that should be tuned.
#' Default is mtry, min.node.size and sample.fraction. Additionally, replace and respect.unordered.factors can be
#' included in the tuning process.
#' @param save.file.path File in the current working directory to which interim results are saved (e.g. "optpath.RData").
#' Default is NULL, which means that no interim results are saved. If a file was specified and one iteration fails, the
#' algorithm can be restarted with \code{\link{restartTuneRanger}}.
#' @param build.final.model [\code{logical(1)}]\cr
#' Should the best found model be fitted on the complete dataset?
#' Default is \code{TRUE}.
#' @param show.info Verbose mlrMBO output on console? Default is \code{TRUE}.
#' @import ranger mlr mlrMBO ParamHelpers BBmisc stats smoof lhs parallel
#' @importFrom DiceKriging km predict.km
#' @return A list with elements
#' \item{\code{recommended.pars}}{Recommended hyperparameters.}
#' \item{\code{results}}{A data.frame with all evaluated hyperparameters and performance and time results for each run.}
#' \item{\code{model}}{The final model if \code{build.final.model} is set to \code{TRUE}.}
#' @details Model-based optimization is used as the tuning strategy and the three parameters min.node.size, sample.fraction and mtry are tuned at once. Out-of-bag predictions are used for evaluation, which makes it much faster than other packages and tuning strategies that use, for example, 5-fold cross-validation. Classification as well as regression is supported.
#' The measure that should be optimized can be chosen from the list of measures in mlr: \href{https://mlr.mlr-org.com/articles/tutorial/measures.html}{mlr tutorial}.
#' @seealso \code{\link{estimateTimeTuneRanger}} for time estimation and \code{\link{restartTuneRanger}} for continuing the algorithm if there was an error.
#' @export
#' @examples
#' \dontrun{
#' library(tuneRanger)
#' library(mlr)
#'
#' # An mlr task has to be created in order to use the package
#' data(iris)
#' iris.task = makeClassifTask(data = iris, target = "Species")
#'
#' # Estimate runtime
#' estimateTimeTuneRanger(iris.task)
#' # Tuning
#' res = tuneRanger(iris.task, measure = list(multiclass.brier), num.trees = 1000,
#' num.threads = 2, iters = 70, save.file.path = NULL)
#'
#' # Mean of best 5 % of the results
#' res
#' # Model with the new tuned hyperparameters
#' res$model
#' # Prediction
#' predict(res$model, newdata = iris[1:10,])}
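#'
#' # Further illustrative sketches: a non-default measure, fixed ranger parameters
#' # and a save file (so a failed run could be resumed with restartTuneRanger),
#' # plus a regression task; the datasets and settings here are arbitrary choices.
#' \dontrun{
#' res2 = tuneRanger(iris.task, measure = list(logloss), num.trees = 1000,
#' num.threads = 2, iters = 70, parameters = list(replace = TRUE),
#' save.file.path = "optpath.RData")
#'
#' # Regression works the same way; mse is the default measure
#' regr.task = makeRegrTask(data = mtcars, target = "mpg")
#' res3 = tuneRanger(regr.task, num.trees = 500, num.threads = 2, iters = 30)
#' }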
tuneRanger = function(task, measure = NULL, iters = 70, iters.warmup = 30, time.budget = NULL, num.threads = NULL, num.trees = 1000,
parameters = list(replace = FALSE, respect.unordered.factors = "order"),
tune.parameters = c("mtry", "min.node.size", "sample.fraction"), save.file.path = NULL,
build.final.model = TRUE, show.info = getOption("mlrMBO.show.info", TRUE)) {
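# Save interim results on disk after each iteration only if a file path was given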
if(is.null(save.file.path)) {
save.on.disk.at = NULL
} else {
save.on.disk.at = 1:(iters + 1)
}
fixed.param.in.tune = names(parameters) %in% tune.parameters
if(any(fixed.param.in.tune))
BBmisc::stopf("Fixed parameter %s cannot be a tuning parameter at the same time.", names(parameters)[fixed.param.in.tune][1])
type = getTaskType(task)
size = getTaskSize(task)
NFeats = getTaskNFeats(task)
predict.type = ifelse(type == "classif", "prob", "response")
if(is.null(measure)) {
if(type == "classif") {
cls.levels = getTaskClassLevels(task)
if(length(cls.levels) == 2) {
measure = list(brier)
} else {
measure = list(multiclass.brier)
}
}
if(type == "regr") {
measure = list(mse)
}
}
measure.name = measure[[1]]$id
minimize = measure[[1]]$minimize
# Set the number of threads if not given by user
if(is.null(num.threads))
num.threads = parallel::detectCores()
# Evaluation function: train ranger with the proposed hyperparameters and score the out-of-bag predictions with the chosen measure
performan = function(x) {
par.vals = c(x, num.trees = num.trees, num.threads = num.threads, parameters)
lrn = makeLearner(paste0(type, ".ranger"), par.vals = par.vals, predict.type = predict.type)
mod = catchOrderWarning(mlr::train(lrn, task))
preds = getOOBPreds(mod, task)
performance(preds, measures = measure)
}
# Transformation of min.node.size: map [0, 1] to integer node sizes between 1 and 20 % of the number of observations
trafo_nodesize = function(x) ceiling((size * 0.2)^x)
#trafo_mtry = function(x) round(NFeats^x)
# Its ParamSet
ps = makeParamSet(
#makeNumericParam("mtry", lower = 0, upper = 1, trafo = trafo_mtry),
makeIntegerParam("mtry", lower = 1, upper = NFeats),
makeNumericParam("min.node.size", lower = 0, upper = 1, trafo = trafo_nodesize),
makeNumericParam("sample.fraction", lower = 0.2, upper = 0.9),
makeLogicalParam(id = "replace", default = FALSE),
makeDiscreteParam("respect.unordered.factors", values = c("ignore", "order", "partition"), default = "order")
)
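# Keep only the parameters that were selected for tuning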
tunable.parameters = c("mtry", "min.node.size", "sample.fraction", "replace", "respect.unordered.factors")
ps$pars = ps$pars[tunable.parameters %in% tune.parameters]
# Budget: total number of evaluations = warmup design size + mbo iterations
f.evals = iters + iters.warmup
mbo.init.design.size = iters.warmup
# Focus search settings for optimizing the infill criterion
infill.opt = "focussearch"
mbo.focussearch.points = iters + iters.warmup
mbo.focussearch.maxit = 3
mbo.focussearch.restarts = 3
# The final SMOOF objective function
objFun = smoof::makeMultiObjectiveFunction(
name = "reg",
fn = performan,
par.set = ps,
has.simple.signature = FALSE,
noisy = TRUE,
n.objectives = 1,
minimize = minimize
)
# Build the control object
method = "parego"
if (method == "parego") {
mbo.prop.points = 1
mbo.crit = "cb"
parego.crit.cb.pi = 0.5
}
control = makeMBOControl(n.objectives = 1L, propose.points = mbo.prop.points, # impute.y.fun = function(x, y, opt.path) 0.7,
save.on.disk.at = save.on.disk.at, save.file.path = save.file.path)
control = setMBOControlTermination(control, max.evals = f.evals, iters = iters, time.budget = time.budget)
control = setMBOControlInfill(control, #opt = infill.opt,
opt.focussearch.maxit = mbo.focussearch.maxit,
opt.focussearch.points = mbo.focussearch.points,
opt.restarts = mbo.focussearch.restarts)
design = generateDesign(mbo.init.design.size, getParamSet(objFun), fun = lhs::maximinLHS)
#mbo.learner = makeLearner("regr.randomForest", predict.type = "se")
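# Kriging surrogate (Matern 3/2 covariance, estimated nugget) used as the MBO surrogate model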
mbo.learner = makeLearner("regr.km", covtype = "matern3_2", optim.method = "BFGS", nugget.estim = TRUE,
jitter = TRUE, predict.type = "se", config = list(show.learner.output = FALSE))
result = mbo(fun = objFun, design = design, learner = mbo.learner, control = control, show.info = show.info)
res = data.frame(result$opt.path)
if("min.node.size" %in% tune.parameters)
res$min.node.size = trafo_nodesize_end(res$min.node.size, size)
#if("mtry" %in% tune.parameters)
# res$mtry = trafo_mtry_end(res$mtry, NFeats)
colnames(res)[colnames(res) == "y"] = measure.name
res = res[, c(tune.parameters, measure.name, "exec.time")]
if (minimize) {
recommended.pars = lapply(res[res[, measure.name] <= stats::quantile(res[, measure.name], 0.05),], summaryfunction)
} else {
recommended.pars = lapply(res[res[, measure.name] >= stats::quantile(res[, measure.name], 0.95),], summaryfunction)
}
recommended.pars = data.frame(recommended.pars)
recommended.pars[colnames(res) %in% c("min.node.size", "mtry")] = round(recommended.pars[colnames(res) %in% c("min.node.size", "mtry")])
# Fit the final model with the recommended hyperparameters on the complete dataset
mod = if(build.final.model) {
ln.rec.pars = length(recommended.pars)
x = as.list(recommended.pars[-c(ln.rec.pars - 1, ln.rec.pars)])
x = c(x, num.trees = num.trees, num.threads = num.threads, parameters)
lrn = mlr::makeLearner(paste0(type, ".ranger"), par.vals = x, predict.type = predict.type)
catchOrderWarning(mlr::train(lrn, task))
} else {
NULL
}
out = list(recommended.pars = recommended.pars, results = res, model = mod)
class(out) = "tuneRanger"
return(out)
}
#' @export
print.tuneRanger = function(x, ...) {
cat("Recommended parameter settings:", "\n")
ln = length(x$recommended.pars)
print(x$recommended.pars[-c(ln-1, ln)])
cat("Results:", "\n")
print(x$recommended.pars[c(ln-1, ln)])
}
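# Back-transformations of the hyperparameter values logged in the optimization path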
trafo_mtry_end = function(x, NFeats) round(NFeats^x)
trafo_nodesize_end = function(x, size) ceiling((size * 0.2)^x)
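# Column summary for the best runs: mean for numeric/integer columns, most frequent value otherwise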
summaryfunction = function(x) ifelse(class(x) %in% c("numeric", "integer"), mean(x),
names(sort(table(x), decreasing = TRUE)[1]))
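# Muffle ranger's warning about the experimental 'order' mode for multiclass classification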
catchOrderWarning = function(code) {
withCallingHandlers(code,
warning = function(w) {
if (grepl("Warning: The 'order' mode for unordered factor handling for multiclass classification is experimental.", w$message))
invokeRestart("muffleWarning")
})
}