In jakob-r/mlrHyperopt: Easy Hyperparameteroptimization with mlr and mlrMBO

knitr::opts_chunk$set(echo = TRUE)

Caret Tuning

It is quite hidden

Many people are confused why trainig takes so much longer https://www.google.de/search?q=r+caret+slow

Increasing tuning budget does not necessarily increase fineness of tuning

Example: Boosting

library(caret)
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 3)
gbmFit1 = train(Species ~ ., data = iris, method = "gbm", trControl = fitControl, verbose = FALSE, tuneLength = 5)
summary(gbmFit1$results)
gbmFit1$bestTune
gbmFit2 = train(Species ~ ., data = iris, method = "gbm", trControl = fitControl, verbose = FALSE, tuneLength = 10)
gbmFit2$results
gbmFit2$bestTune

Note: tuneLength Different settings for n.trees where calculated with the submodel trick.

Tuning Space defined in caret for gbm:

grid = function(x, y, len = NULL, search = "grid") {
  if(search == "grid") {
    out <- expand.grid(
      interaction.depth = seq(1, len), # <-- Why so dependent from tuning budget?
      n.trees = floor((1:len) * 50),
      shrinkage = .1, # <-- Why set fixed here?
      n.minobsinnode = 10)
  } else {
    out <- data.frame(
      n.trees = floor(runif(len, min = 1, max = 5000)),
      interaction.depth = sample(1:10, replace = TRUE, size = len),         
      shrinkage = runif(len, min = .001, max = .6),
      n.minobsinnode = sample(5:25, replace = TRUE, size = len) )
    out <- out[!duplicated(out),]
  }
  out
}

Tuning Space defined in caret for boosting (aka AdaBoost):

grid = function(x, y, len = NULL, search = "grid") {
  types <- c("Breiman", "Freund", "Zhu")
  if(search == "grid") {
    out <- expand.grid(mfinal = floor((1:len) * 50),
                       maxdepth = seq(1, len), # <-- WHAT????
                       coeflearn = types)
  } else {
    out <- data.frame(mfinal = sample(1:1000, replace = TRUE, size = len),
                      maxdepth = sample(1:30, replace = TRUE, size = len),
                      coeflearn = sample(types, replace = TRUE, size = len))
  }
  out
},

Tuning Space defined in caret for C5.0.default:

grid = function(x, y, len = NULL, search = "grid") {
  if(search == "grid") {
    c5seq <- if(len == 1)  1 else  c(1, 10*((2:min(len, 11)) - 1))
    out <- expand.grid(trials = c5seq, model = c("tree", "rules"), winnow = c(TRUE, FALSE))
  } else {
    out <- data.frame(trials = sample(1:100, replace = TRUE, size = len),
                      model = sample(c("tree", "rules"), replace = TRUE, size = len), # <-- that parameter does not affect the performance at all!
                      winnow = sample(c(TRUE, FALSE), replace = TRUE, size = len))
  }
  out
},

Proof (kind of):

library("mlr")
set.seed(1)
ra = resample(learner = makeLearner("classif.C50", rules = FALSE), task = iris.task, resampling = cv10)
set.seed(1)
rb = resample(learner = makeLearner("classif.C50", rules = TRUE), task = iris.task, resampling = cv10)
ra$measures.test == rb$measures.test

caret does some things without making it very transparent:

grid = function(x, y, len = NULL, search = "grid"){
  if(search == "grid") {
    out <- expand.grid(mtry = caret::var_seq(p = ncol(x), 
                                             classification = is.factor(y), 
                                             len = len),
                       coefReg = seq(0.01, 1, length = len),
                       coefImp = seq(0, 1, length = len))
  } else {
    out <- data.frame(mtry = sample(1:ncol(x), size = len, replace = TRUE),
                      coefReg = runif(len, min = 0, max = 1),
                      coefImp = runif(len, min = 0, max = 1))
  }
  out
},
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
  theDots <- list(...)
  theDots$importance <- TRUE
  args <- list(x = x, y = y, mtry = param$mtry)
  args <- c(args, theDots)                       
  firstFit <- do.call("randomForest", args)
  firstImp <- randomForest:::importance(firstFit)
  if(is.factor(y))
  {
    firstImp <- firstImp[,"MeanDecreaseGini"]/max(firstImp[,"MeanDecreaseGini"])
  } else firstImp <- firstImp[,"%IncMSE"]/max(firstImp[,"%IncMSE"])
  firstImp <- ((1 - param$coefImp) * param$coefReg) + (param$coefImp * firstImp)

  RRF(x, y, mtry = param$mtry, coefReg = firstImp, ...)
}

Improve tuning space with a better definition:

library(ParamHelpers)
ps.gbm.grid = makeParamSet(
  makeIntegerParam("interaction.depth", lower = 1, upper = expression(p)),
  makeIntegerParam("n.trees", lower = 1, upper = floor(sqrt(5000)), trafo = function(x) x^2),
  makeDiscreteParam("shrinkage", values = 0.001),
  makeDiscreteParam("n.minobsinnode", values = 10),
  keys = c("p")
)

ps.gbm.random = makeParamSet(
  makeIntegerParam("interaction.depth", lower = 1, upper = expression(p)),
  makeIntegerParam("n.trees", lower = 1, upper = floor(sqrt(5000)), trafo = function(x) x^2),
  makeNumericParam("shrinkage", lower = 0.001, upper = 0.6),
  makeIntegerParam("n.minobsinnode", lower = 5, upper = 25),
  keys = c("p")
)
ps.gbm.grid = evaluateParamExpressions(ps.gbm.grid, dict = list(p = ncol(iris)-1))
grid.tune.gbm = generateGridDesign(ps.gbm.grid, resolution = 5, )
grid.tune.gbm$shrinkage = as.numeric(grid.tune.gbm$shrinkage)
grid.tune.gbm$n.minobsinnode = as.numeric(grid.tune.gbm$n.minobsinnode)
gbmFit3 = train(Species ~ ., data = iris, method = "gbm", trControl = fitControl, verbose = FALSE, tuneGrid = grid.tune.gbm)
max(gbmFit3$results$Accuracy, na.rm = TRUE)
max(gbmFit2$results$Accuracy, na.rm = TRUE)