```r
knitr::opts_chunk$set(echo = TRUE)
```
Many people are confused about why training with caret takes so long: https://www.google.de/search?q=r+caret+slow
Example: Boosting
```r
library(caret)

fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 3)

gbmFit1 = train(Species ~ ., data = iris, method = "gbm",
                trControl = fitControl, verbose = FALSE, tuneLength = 5)
summary(gbmFit1$results)
gbmFit1$bestTune

gbmFit2 = train(Species ~ ., data = iris, method = "gbm",
                trControl = fitControl, verbose = FALSE, tuneLength = 10)
gbmFit2$results
gbmFit2$bestTune
```
Note: with `tuneLength`, the different settings for `n.trees` were calculated with the submodel trick: only one gbm model (with the maximum number of trees) is fit per configuration of the remaining parameters, and predictions for the smaller `n.trees` values are extracted from it.
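The trick relies on gbm itself: predictions for any smaller number of trees can be extracted from a single fit. A minimal sketch of the underlying idea (the object names here are illustrative, not caret internals):

```r
library(gbm)

# Fit once with the maximum number of trees ...
fit <- gbm(Species ~ ., data = iris, distribution = "multinomial",
           n.trees = 250, interaction.depth = 1)

# ... then extract predictions for several smaller n.trees values from the
# same model: predict.gbm accepts a vector of tree counts.
preds <- predict(fit, newdata = iris, n.trees = c(50, 100, 150, 200, 250),
                 type = "response")
dim(preds)  # 150 x 3 x 5: observations x classes x tree counts
```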
Tuning Space defined in caret for gbm:
```r
grid = function(x, y, len = NULL, search = "grid") {
  if (search == "grid") {
    out <- expand.grid(
      interaction.depth = seq(1, len),  # <-- Why so dependent on the tuning budget?
      n.trees = floor((1:len) * 50),
      shrinkage = .1,                   # <-- Why fixed here?
      n.minobsinnode = 10)
  } else {
    out <- data.frame(
      n.trees = floor(runif(len, min = 1, max = 5000)),
      interaction.depth = sample(1:10, replace = TRUE, size = len),
      shrinkage = runif(len, min = .001, max = .6),
      n.minobsinnode = sample(5:25, replace = TRUE, size = len))
    out <- out[!duplicated(out), ]
  }
  out
}
```
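These internal definitions are not hidden: they can be inspected directly via `caret::getModelInfo()`. For example, this reproduces the grid that `tuneLength = 5` generates for gbm (the same works for the other methods shown below, e.g. "AdaBoost.M1", "C5.0", "RRF"):

```r
library(caret)

gbm_info <- getModelInfo("gbm", regex = FALSE)[[1]]
# Reproduce the grid that tuneLength = 5 produces:
gbm_info$grid(x = iris[, 1:4], y = iris$Species, len = 5)
```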
Tuning Space defined in caret for boosting (aka AdaBoost):
```r
grid = function(x, y, len = NULL, search = "grid") {
  types <- c("Breiman", "Freund", "Zhu")
  if (search == "grid") {
    out <- expand.grid(
      mfinal = floor((1:len) * 50),
      maxdepth = seq(1, len),  # <-- again coupled to the tuning budget!
      coeflearn = types)
  } else {
    out <- data.frame(
      mfinal = sample(1:1000, replace = TRUE, size = len),
      maxdepth = sample(1:30, replace = TRUE, size = len),
      coeflearn = sample(types, replace = TRUE, size = len))
  }
  out
}
```
Tuning Space defined in caret for C5.0.default:
```r
grid = function(x, y, len = NULL, search = "grid") {
  if (search == "grid") {
    c5seq <- if (len == 1) 1 else c(1, 10 * ((2:min(len, 11)) - 1))
    out <- expand.grid(
      trials = c5seq,
      model = c("tree", "rules"),
      winnow = c(TRUE, FALSE))
  } else {
    out <- data.frame(
      trials = sample(1:100, replace = TRUE, size = len),
      model = sample(c("tree", "rules"),  # <-- this parameter does not affect performance at all!
                     replace = TRUE, size = len),
      winnow = sample(c(TRUE, FALSE), replace = TRUE, size = len))
  }
  out
}
```
Proof (kind of):
library("mlr") set.seed(1) ra = resample(learner = makeLearner("classif.C50", rules = FALSE), task = iris.task, resampling = cv10) set.seed(1) rb = resample(learner = makeLearner("classif.C50", rules = TRUE), task = iris.task, resampling = cv10) ra$measures.test == rb$measures.test
caret also does some things without making them very transparent. For the RRF method (regularized random forest), the fit function below first trains a complete randomForest just to turn its variable importances into regularization coefficients for the actual RRF model:
```r
grid = function(x, y, len = NULL, search = "grid") {
  if (search == "grid") {
    out <- expand.grid(
      mtry = caret::var_seq(p = ncol(x), classification = is.factor(y), len = len),
      coefReg = seq(0.01, 1, length = len),
      coefImp = seq(0, 1, length = len))
  } else {
    out <- data.frame(
      mtry = sample(1:ncol(x), size = len, replace = TRUE),
      coefReg = runif(len, min = 0, max = 1),
      coefImp = runif(len, min = 0, max = 1))
  }
  out
},
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
  theDots <- list(...)
  theDots$importance <- TRUE
  args <- list(x = x, y = y, mtry = param$mtry)
  args <- c(args, theDots)
  # A complete randomForest is fit first, only to obtain variable importances ...
  firstFit <- do.call("randomForest", args)
  firstImp <- randomForest:::importance(firstFit)
  if (is.factor(y)) {
    firstImp <- firstImp[, "MeanDecreaseGini"] / max(firstImp[, "MeanDecreaseGini"])
  } else {
    firstImp <- firstImp[, "%IncMSE"] / max(firstImp[, "%IncMSE"])
  }
  firstImp <- ((1 - param$coefImp) * param$coefReg) + (param$coefImp * firstImp)
  # ... which are then used as per-feature regularization coefficients
  # for the actual RRF fit.
  RRF(x, y, mtry = param$mtry, coefReg = firstImp, ...)
}
```
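To make the hidden cost explicit, here is a standalone sketch of that two-stage fit (assuming the randomForest and RRF packages; the parameter values are arbitrary examples from caret's ranges):

```r
library(randomForest)
library(RRF)

x <- iris[, 1:4]
y <- iris$Species
coefReg <- 0.5  # example values within caret's grid above
coefImp <- 0.5

# Stage 1: a full randomForest, grown only to obtain scaled importances.
rf <- randomForest(x, y, importance = TRUE)
imp <- importance(rf)[, "MeanDecreaseGini"]
imp <- imp / max(imp)

# Stage 2: the actual RRF fit, regularized by the blended coefficients.
coefs <- (1 - coefImp) * coefReg + coefImp * imp
fit <- RRF(x, y, coefReg = coefs)
```

So every candidate configuration pays for two forest fits per resampling iteration.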
Let's improve the tuning space with a better definition:
```r
library(ParamHelpers)

ps.gbm.grid = makeParamSet(
  makeIntegerParam("interaction.depth", lower = 1, upper = expression(p)),
  makeIntegerParam("n.trees", lower = 1, upper = floor(sqrt(5000)),
                   trafo = function(x) x^2),
  makeDiscreteParam("shrinkage", values = 0.001),
  makeDiscreteParam("n.minobsinnode", values = 10),
  keys = c("p"))

ps.gbm.random = makeParamSet(
  makeIntegerParam("interaction.depth", lower = 1, upper = expression(p)),
  makeIntegerParam("n.trees", lower = 1, upper = floor(sqrt(5000)),
                   trafo = function(x) x^2),
  makeNumericParam("shrinkage", lower = 0.001, upper = 0.6),
  makeIntegerParam("n.minobsinnode", lower = 5, upper = 25),
  keys = c("p"))

ps.gbm.grid = evaluateParamExpressions(ps.gbm.grid, dict = list(p = ncol(iris) - 1))
# trafo = TRUE applies the x^2 transformation for n.trees:
grid.tune.gbm = generateGridDesign(ps.gbm.grid, resolution = 5, trafo = TRUE)
# Discrete parameters come back as factors, so convert via character:
grid.tune.gbm$shrinkage = as.numeric(as.character(grid.tune.gbm$shrinkage))
grid.tune.gbm$n.minobsinnode = as.numeric(as.character(grid.tune.gbm$n.minobsinnode))

gbmFit3 = train(Species ~ ., data = iris, method = "gbm",
                trControl = fitControl, verbose = FALSE, tuneGrid = grid.tune.gbm)
max(gbmFit3$results$Accuracy, na.rm = TRUE)
max(gbmFit2$results$Accuracy, na.rm = TRUE)
```
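The improved random-search space can be plugged into caret the same way; a sketch (the design size and the name gbmFit4 are illustrative):

```r
ps.gbm.random = evaluateParamExpressions(ps.gbm.random, dict = list(p = ncol(iris) - 1))
# trafo = TRUE again applies the x^2 transformation for n.trees:
random.tune.gbm = generateRandomDesign(n = 25, par.set = ps.gbm.random, trafo = TRUE)

gbmFit4 = train(Species ~ ., data = iris, method = "gbm",
                trControl = fitControl, verbose = FALSE,
                tuneGrid = random.tune.gbm)
max(gbmFit4$results$Accuracy, na.rm = TRUE)
```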
Still, aside from grid search and random search, caret offers no effective tuning method.
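mlr (already used above) goes further: the same kind of parameter set can drive model-based optimization via mlrMBO. A rough sketch, assuming the mlrMBO package is installed (bounds and budget are arbitrary choices here):

```r
library(mlr)

ps.mbo = makeParamSet(
  makeIntegerParam("interaction.depth", lower = 1, upper = 4),
  makeIntegerParam("n.trees", lower = 1, upper = 70, trafo = function(x) x^2),
  makeNumericParam("shrinkage", lower = 0.001, upper = 0.6),
  makeIntegerParam("n.minobsinnode", lower = 5, upper = 25))

ctrl = makeTuneControlMBO(budget = 25)
res = tuneParams(makeLearner("classif.gbm"), task = iris.task,
                 resampling = cv3, par.set = ps.mbo, control = ctrl)
res$x
res$y
```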