#' XGBoost SuperLearner wrapper
#'
#' Supports the Extreme Gradient Boosting (XGBoost) package, a variant of
#' gradient boosted machines (GBM), for use within SuperLearner.
#'
#' The performance of XGBoost, like GBM, is sensitive to the hyperparameter
#' settings. It is therefore best to create multiple configurations using
#' create.SL.xgboost and allow SuperLearner to choose the best weights based
#' on cross-validated performance.
#'
#' If you run into errors, please first try installing the latest version of
#' XGBoost from drat as described here:
#' \url{http://xgboost.readthedocs.io/en/latest/build.html}
#'
#' @param Y Outcome variable
#' @param X Covariate dataframe
#' @param newX Dataframe of new observations for which to predict the outcome
#' @param obsWeights Optional observation-level weights (supported but not tested)
#' @param id Optional id to group observations from the same unit (not used
#' currently).
#' @param family "gaussian" for regression, "binomial" for binary
#' classification, "multinomial" for multiclass classification (not yet
#' supported).
#' @param ntrees How many trees to fit. Low numbers may underfit but high
#' numbers may overfit, depending also on the shrinkage.
#' @param max_depth How deep each tree can be. A depth of 1 allows no
#' interactions (decision stumps).
#' @param shrinkage How much to shrink the predictions, in order to reduce
#' overfitting.
#' @param minobspernode Minimum number of observations allowed per tree node;
#' below this, no further splitting will occur.
#' @param params Many other parameters can be customized. See
#' \url{http://xgboost.readthedocs.io/en/latest/parameter.html}
#' @param nthread How many threads (cores) should xgboost use. Generally we want
#' to keep this to 1 so that XGBoost does not compete with SuperLearner
#' parallelization.
#' @param save_period How often (in tree iterations) to save the current model
#' to disk during training. If NULL, the model is not saved during training;
#' if 0, the model is saved only at the end.
#' @param verbose Verbosity of XGB fitting.
#' @param ... Any remaining arguments (ignored).
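#'
#' @examples
#' \dontrun{
#' # A minimal sketch of calling the wrapper directly, assuming the xgboost
#' # package is installed. The simulated dataset and variable names here are
#' # purely illustrative.
#' set.seed(1)
#' n = 100
#' X = data.frame(x1 = rnorm(n), x2 = rnorm(n))
#' Y = X$x1 + X$x2 + rnorm(n)
#' fit = SL.xgboost(Y = Y, X = X, newX = X, family = gaussian(),
#'                  obsWeights = rep(1, n), id = seq_len(n))
#' head(fit$pred)
#' }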
#'
#' @export
SL.xgboost = function(Y, X, newX, family, obsWeights, id, ntrees = 1000,
max_depth = 4, shrinkage = 0.1, minobspernode = 10,
params = list(),
nthread = 1,
verbose = 0,
save_period = NULL,
...) {
.SL.require("xgboost")
if(packageVersion("xgboost") < "0.6") stop("SL.xgboost requires xgboost version >= 0.6, try help(\'SL.xgboost\') for details")
# X needs to be converted to a matrix first, then an xgb.DMatrix.
if (!is.matrix(X)) {
X = model.matrix(~ . - 1, X)
}
# Convert to an xgboost compatible data matrix, using the sample weights.
xgmat = xgboost::xgb.DMatrix(data = X, label = Y, weight = obsWeights)
# TODO: support early stopping, which requires a "watchlist". See ?xgb.train
if (family$family == "gaussian") {
# reg:linear was deprecated in version 1.1.1.1, changed to reg:squarederror
if(packageVersion("xgboost") >= "1.1.1.1") {
objective <- 'reg:squarederror'
} else {
objective <- 'reg:linear'
}
model = xgboost::xgboost(data = xgmat, objective=objective, nrounds = ntrees,
max_depth = max_depth, min_child_weight = minobspernode, eta = shrinkage,
verbose = verbose, nthread = nthread, params = params,
save_period = save_period)
}
if (family$family == "binomial") {
model = xgboost::xgboost(data = xgmat, objective="binary:logistic", nrounds = ntrees,
max_depth = max_depth, min_child_weight = minobspernode, eta = shrinkage,
verbose = verbose, nthread = nthread, params = params,
save_period = save_period, eval_metric = "logloss")
}
if (family$family == "multinomial") {
# TODO: test this.
model = xgboost::xgboost(data = xgmat, objective="multi:softmax", nrounds = ntrees,
max_depth = max_depth, min_child_weight = minobspernode, eta = shrinkage,
verbose = verbose, num_class = length(unique(Y)), nthread = nthread,
params = params,
save_period = save_period)
}
# Newdata needs to be converted to a matrix first, then an xgb.DMatrix.
if (!is.matrix(newX)) {
newX = model.matrix(~ . - 1, newX)
}
pred = predict(model, newdata = newX)
fit = list(object = model)
class(fit) = c("SL.xgboost")
out = list(pred = pred, fit = fit)
return(out)
}
#' XGBoost prediction on new data
#' @param object Model fit object from SuperLearner
#' @param newdata Dataframe of new observations, which will be converted to a
#' matrix for prediction
#' @param family Outcome family ("gaussian" or "binomial"); not used by this
#' method but accepted for compatibility with the SuperLearner predict API
#' @param ... Any remaining arguments (ignored).
#' @export
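#' @examples
#' \dontrun{
#' # A minimal sketch, assuming `fit` is the list returned by the SL.xgboost()
#' # example above and `new_df` is a dataframe with the same columns as the
#' # training data; both names are illustrative.
#' preds = predict(fit$fit, newdata = new_df, family = gaussian())
#' }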
predict.SL.xgboost <- function(object, newdata, family, ...) {
.SL.require("xgboost")
if(packageVersion("xgboost") < "0.6") stop("SL.xgboost requires xgboost version >= 0.6, try help(\'SL.xgboost\') for details")
# newdata needs to be converted to a matrix first
if (!is.matrix(newdata)) {
newdata = model.matrix(~ . - 1, newdata)
}
pred = predict(object$object, newdata = newdata)
return(pred)
}
#' Factory for XGBoost SL wrappers
#'
#' Create multiple configurations of XGBoost learners based on the desired combinations of hyperparameters.
#'
#' @param tune List of hyperparameter settings to test. If specified, all four
#' hyperparameters (ntrees, max_depth, shrinkage, minobspernode) must be
#' defined.
#' @param detailed_names Set to TRUE to have the function names include the
#' parameter configurations.
#' @param env Environment in which to create the SL.xgboost functions. Defaults to the global environment.
#' @param name_prefix The prefix string for the name of each function that is generated.
#'
#' @examples
#'
#' # Create a new environment to store the learner functions.
#' # This keeps the global environment organized.
#' sl_env = new.env()
#' # Create 2 * 2 * 1 * 3 = 12 combinations of hyperparameters.
#' tune = list(ntrees = c(100, 500), max_depth = c(1, 2), minobspernode = 10,
#' shrinkage = c(0.1, 0.01, 0.001))
#' # Generate a separate learner for each combination.
#' xgb_grid = create.SL.xgboost(tune = tune, env = sl_env)
#' # Review the function configurations.
#' xgb_grid
#' # Attach the environment so that the custom learner functions can be accessed.
#' attach(sl_env)
#' \dontrun{
#' sl = SuperLearner(Y = Y, X = X, SL.library = xgb_grid$names)
#' }
#' detach(sl_env)
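#' # Optionally, embed the hyperparameter values in the generated function
#' # names (the first would be "SL.xgb.100.1.0.1.10"); this sketch reuses the
#' # tune list and environment from above.
#' xgb_grid_detailed = create.SL.xgboost(tune = tune, detailed_names = TRUE,
#'                                       env = sl_env)
#' xgb_grid_detailed$names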
#' @export
create.SL.xgboost = function(tune = list(ntrees = c(1000), max_depth = c(4), shrinkage = c(0.1),
                                         minobspernode = c(10)), detailed_names = FALSE, env = .GlobalEnv,
name_prefix = "SL.xgb") {
# Create all combinations of hyperparameters, for grid-like search.
  tuneGrid = expand.grid(tune, stringsAsFactors = FALSE)
names = rep("", nrow(tuneGrid))
  for (i in seq_len(nrow(tuneGrid))) {
g = tuneGrid[i,]
if (detailed_names) {
name = paste(name_prefix, g$ntrees, g$max_depth, g$shrinkage, g$minobspernode, sep=".")
} else {
name = paste(name_prefix, i, sep=".")
}
names[i] = name
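    # Dynamically define a learner function in `env` whose argument defaults
    # are this grid row's hyperparameter values; the generated function simply
    # forwards to SL.xgboost().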
    fn_def = paste0(name, " = function(..., ntrees = ", g$ntrees,
                    ", max_depth = ", g$max_depth,
                    ", shrinkage = ", g$shrinkage,
                    ", minobspernode = ", g$minobspernode,
                    ") SL.xgboost(..., ntrees = ntrees, max_depth = max_depth,",
                    " shrinkage = shrinkage, minobspernode = minobspernode)")
    eval(parse(text = fn_def), envir = env)
}
results = list(grid = tuneGrid, names = names)
invisible(results)
}