# References on XGBoost hyperparameter tuning:
# https://towardsdatascience.com/doing-xgboost-hyper-parameter-tuning-the-smart-way-part-1-of-2-f6d255a45dde
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
# https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e
# https://www.kaggle.com/general/17120
# https://www.analyticsvidhya.com/blog/2016/01/xgboost-algorithm-easy-steps/
# https://www.slideshare.net/ShangxuanZhang/kaggle-winning-solution-xgboost-algorithm-let-us-learn-from-its-author
# https://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1
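#' Hyperparameter search for the XGBoost ensemble model
#'
#' Runs Bayesian optimization (via \code{rBayesianOptimization}) over the
#' XGBoost hyperparameters, scoring each candidate on a held-out fold of
#' \code{dataset}. Every evaluation is saved to \code{save_filename}, so an
#' interrupted search can be resumed through \code{resume_filename}.
#'
#' @param dataset A list of series whose \code{avg_naive2_errors} attribute
#'   has been set (see \code{process_owa_errors}).
#' @param objective \code{"averaging"} scores the weighted combination error,
#'   \code{"selection"} the error of the single selected method.
#' @param rand_points Number of random initialization points.
#' @param n_iter Number of Bayesian optimization iterations after the random
#'   points.
#' @param n.cores Number of threads used when training each XGBoost model.
#' @param save_filename File where partial results are saved after each
#'   evaluation.
#' @param resume_filename Optional save file of a previous run to resume from.
#' @return A \code{data.frame} with one row per evaluated configuration; its
#'   \code{Value} column holds the mean error (lower is better).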
#' @export
hyperparameter_search <- function(dataset, objective = c("averaging", "selection"),
                                  rand_points = 50,
                                  n_iter = 50, n.cores = 1,
                                  save_filename = "bayes_hyper_search.RData",
                                  resume_filename = NULL) {
  if (is.null(save_filename)) {
    stop("hyperparameter_search requires a file to store temporary results, please set the save_filename parameter")
  }
  # if we are not resuming, remove any existing save file so the search
  # does not accidentally continue from stale results
  if (!is.null(save_filename)) {
    if (is.null(resume_filename) || (save_filename != resume_filename)) {
      if (file.exists(save_filename)) {
        message("Overwriting existing save file...")
        file.remove(save_filename)
      }
    }
  }
  if (length(dataset) < 10) {
    stop("Not enough data to do the cross-validation!")
  }
  if (is.null(attr(dataset, "avg_naive2_errors"))) {
    stop("Need to calculate the average naive2 OWA errors with process_owa_errors")
  }
  type_objective <- match.arg(objective)
  N_THREAD <- n.cores
  whole_dataset <- dataset
  # prepare the cross-validation folds
  folds <- rBayesianOptimization::KFold(1:length(whole_dataset), nfolds = 5)
  train_ds <- NULL
  test_ds <- NULL
  train_feat <- NULL
  test_feat <- NULL
  for (i in 1:length(folds)) {
    train_ds[[i]] <- whole_dataset[-folds[[i]]]
    train_feat[[i]] <- create_feat_classif_problem(train_ds[[i]])
    test_ds[[i]] <- whole_dataset[folds[[i]]]
    test_feat[[i]] <- create_feat_classif_problem(test_ds[[i]])
  }
  bay_results <- NULL
  # objective function for the Bayesian optimization: trains an XGBoost model
  # with the proposed hyperparameters and returns the negated holdout error
  bayes_xgb <- function(max_depth, eta, gamma, min_child_weight,
                        subsample, colsample_bytree, nrounds) {
    param_bay <- list(max_depth = max_depth, eta = eta,
                      gamma = gamma,
                      min_child_weight = min_child_weight,
                      subsample = subsample,
                      colsample_bytree = colsample_bytree,
                      nrounds = nrounds)
    final_error <- NULL
    final_preds <- NULL
    for (i in 1:1) { # note: only the first fold is evaluated
      bst <- .train_data_from_bayes_res(train_feat[[i]], param_bay, N_THREAD)
      preds <- predict_selection_ensemble(bst, test_feat[[i]]$data)
      attr(test_ds[[i]], "avg_naive2_errors") <- attr(dataset, "avg_naive2_errors")
      er <- summary_performance(preds,
                                test_ds[[i]],
                                print.summary = FALSE)
      # TODO: avoid calculating both errors when only one is needed
      er <- switch(type_objective,
                   selection = er$selected_error,
                   averaging = er$weighted_error)
      final_error <- c(final_error, er)
      final_preds <- rbind(final_preds, preds)
    }
    bay_results <- rbind(NULL, c(max_depth, eta, gamma, min_child_weight,
                                 subsample, colsample_bytree, nrounds, mean(final_error)))
    try({colnames(bay_results) <- c("max_depth", "eta", "gamma", "min_child_weight",
                                    "subsample", "colsample_bytree", "nrounds", "Value")})
    bay_results <- data.frame(bay_results)
    if (!is.null(save_filename)) {
      # append this evaluation to the results already on disk
      oldres <- NULL
      try(oldres <- readRDS(save_filename))
      bay_results <- rbind(oldres, bay_results)
      saveRDS(bay_results, file = save_filename)
    }
    # BayesianOptimization maximizes Score, so the mean error is negated
    list(Score = -mean(final_error), Pred = 0)
  }
  precalc_grid <- NULL
  if (!is.null(resume_filename)) {
    message("Resuming hyperparameter search")
    precalc_grid <- readRDS(resume_filename)
    bay_results <- precalc_grid
    # when resuming, compute how many random points and iterations remain,
    # assuming the first saved points came from the random initialization
    n_iter <- n_iter + min(rand_points - nrow(bay_results), 0)
    rand_points <- max(rand_points - nrow(bay_results), 0)
    # the saved Value column stores the positive mean error, while the
    # objective returns Score = -error; flip the sign so the resumed grid
    # matches what BayesianOptimization maximizes
    precalc_grid$Value <- -precalc_grid$Value
  }
  if (rand_points + n_iter > 0) {
    k <- 2 # Matern kernel smoothness: nu = (2*k + 1)/2 = 5/2
    rBayesianOptimization::BayesianOptimization(
      bayes_xgb,
      bounds = list(max_depth = c(2L, 50L),
                    eta = c(0.001, 1.0),
                    gamma = c(0.00001, 2.0),
                    min_child_weight = c(0.00001, 5.0),
                    subsample = c(0.5, 1.0),
                    colsample_bytree = c(0.5, 1.0),
                    nrounds = c(1L, 500L)),
      init_grid_dt = precalc_grid,
      init_points = rand_points,
      kappa = 2.576,
      n_iter = n_iter,
      kernel = list(type = "matern", nu = (2 * k + 1) / 2),
      verbose = FALSE)
  }
  # return everything evaluated during the search, as stored on disk
  bay_results <- readRDS(save_filename)
  bay_results
}
# from a bay_results row (or a list of hyperparameters), train the model
.train_data_from_bayes_res <- function(data, bayes_res, n.cores) {
  param <- as.list(bayes_res)
  param$nthread <- n.cores
  # nrounds is passed separately to the training function, and Value
  # (if present) is the search score, not an XGBoost parameter
  nrounds <- param$nrounds
  param$nrounds <- NULL
  param$Value <- NULL
  train_selection_ensemble(data$data, data$errors, param, nrounds)
}
# convenience wrapper: build the classification features from the dataset,
# then train with the given hyperparameters
.train_from_bayes_res <- function(dataset, bayes_res, n.cores) {
  data <- create_feat_classif_problem(dataset)
  .train_data_from_bayes_res(data, bayes_res, n.cores)
}
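
# Minimal usage sketch (kept inside `if (FALSE)` so it never runs on load).
# `my_dataset` is illustrative: a list of series already processed with
# process_owa_errors so the avg_naive2_errors attribute is set.
if (FALSE) {
  # run the search, persisting every evaluation to disk
  results <- hyperparameter_search(my_dataset,
                                   objective = "averaging",
                                   rand_points = 20, n_iter = 30,
                                   n.cores = 2,
                                   save_filename = "bayes_hyper_search.RData")
  # an interrupted search can be resumed from the same file:
  # results <- hyperparameter_search(my_dataset,
  #                                  save_filename = "bayes_hyper_search.RData",
  #                                  resume_filename = "bayes_hyper_search.RData")
  # Value is the mean holdout error, so lower is better
  best <- results[which.min(results$Value), ]
  # train the final model on the full dataset with the best configuration
  model <- .train_from_bayes_res(my_dataset, best, n.cores = 2)
}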