#' Extreme Gradient Boosting Models
#'
#' Fits models with an efficient implementation of the gradient boosting
#' framework from Chen & Guestrin.
#'
#' @rdname XGBModel
#'
#' @param nrounds number of boosting iterations.
#' @param ... model parameters as described below and in the XGBoost
#' \href{https://xgboost.readthedocs.io/en/latest/parameter.html}{documentation}
#' and arguments passed to \code{XGBModel} from the other constructors.
#' @param verbose numeric value controlling the amount of output printed
#' during model fitting, such that 0 = none, 1 = performance information, and
#' 2 = additional information.
#' @param print_every_n numeric value designating the fitting iterations at
#' which to print output when \code{verbose > 0}.
#' @param eta shrinkage of variable weights at each iteration to prevent
#' overfitting.
#' @param gamma minimum loss reduction required to split a tree node.
#' @param max_depth maximum tree depth.
#' @param min_child_weight minimum sum of observation weights required of nodes.
#' @param subsample subsample ratio of the training observations.
#' @param colsample_bytree,colsample_bylevel,colsample_bynode subsample ratio of
#' variables for each tree, level, or split.
#' @param rate_drop rate at which to drop trees during the dropout procedure.
#' @param one_drop integer indicating whether to drop at least one tree during
#' the dropout procedure.
#' @param skip_drop probability of skipping the dropout procedure during a
#' boosting iteration.
#' @param alpha,lambda L1 and L2 regularization terms for variable weights.
#' @param max_delta_step,tree_method,sketch_eps,scale_pos_weight,updater,refresh_leaf,process_type,grow_policy,max_leaves,max_bin,num_parallel_tree
#' other tree booster parameters.
#' @param sample_type,normalize_type type of sampling and normalization
#' algorithms.
#' @param feature_selector,top_k character string specifying the feature
#' selection and ordering method, and number of top variables to select in the
#' \code{"greedy"} and \code{"thrifty"} feature selectors.
#' @param objective optional character string defining the learning task and
#' objective. Set automatically if not specified according to the following
#' values available for supported response variable types.
#' \describe{
#' \item{\code{factor}:}{\code{"multi:softprob"}, \code{"binary:logistic"}
#' (2 levels only)}
#' \item{\code{numeric}:}{\code{"reg:squarederror"}, \code{"reg:logistic"},
#' \code{"reg:gamma"}, \code{"reg:tweedie"}, \code{"rank:pairwise"},
#' \code{"rank:ndcg"}, \code{"rank:map"}}
#' \item{\code{PoissonVariate}:}{\code{"count:poisson"}}
#' \item{\code{Surv}:}{\code{"survival:aft"}, \code{"survival:cox"}}
#' }
#' The first values listed are the defaults for the corresponding response
#' types.
#' @param aft_loss_distribution character string specifying a distribution for
#' the accelerated failure time objective (\code{"survival:aft"}) as
#' \code{"extreme"}, \code{"logistic"}, or \code{"normal"}.
#' @param aft_loss_distribution_scale numeric scaling parameter for the
#' accelerated failure time distribution.
#' @param base_score initial prediction score of all observations, global bias.
#'
#' @details
#' \describe{
#' \item{Response types:}{\code{factor}, \code{numeric},
#' \code{PoissonVariate}, \code{Surv}}
#' \item{\link[=TunedModel]{Automatic tuning} of grid parameters:}{
#' \itemize{
#' \item XGBModel: \code{NULL}
#' \item XGBDARTModel: \code{nrounds}, \code{eta}*, \code{gamma}*,
#' \code{max_depth}, \code{min_child_weight}*, \code{subsample}*,
#' \code{colsample_bytree}*, \code{rate_drop}*, \code{skip_drop}*
#' \item XGBLinearModel: \code{nrounds}, \code{alpha}, \code{lambda}
#' \item XGBTreeModel: \code{nrounds}, \code{eta}*, \code{gamma}*,
#' \code{max_depth}, \code{min_child_weight}*, \code{subsample}*,
#' \code{colsample_bytree}*
#' }
#' }
#' }
#' * excluded from grids by default
#'
#' The booster-specific constructor functions \code{XGBDARTModel},
#' \code{XGBLinearModel}, and \code{XGBTreeModel} are special cases of
#' \code{XGBModel} which automatically set the XGBoost \code{booster}
#' \href{https://xgboost.readthedocs.io/en/latest/parameter.html}{parameter}.
#' These are called directly in typical usage unless \code{XGBModel} is needed
#' to specify a more general model.
#'
#' Default argument values and further model details can be found in the source
#' See Also link below.
#'
#' In calls to \code{\link{varimp}} for \code{XGBTreeModel}, argument
#' \code{type} may be specified as \code{"Gain"} (default) for the fractional
#' contribution of each predictor to the total gain of its splits, as
#' \code{"Cover"} for the number of observations related to each predictor, or
#' as \code{"Frequency"} for the percentage of times each predictor is used in
#' the trees. Variable importance is automatically scaled to range from 0 to
#' 100. To obtain unscaled importance values, set \code{scale = FALSE}. See
#' example below.
#'
#' @return \code{MLModel} class object.
#'
#' @seealso \code{\link[xgboost:xgb.train]{xgboost}}, \code{\link{fit}},
#' \code{\link{resample}}
#'
#' @examples
#' \donttest{
#' ## Requires prior installation of suggested package xgboost to run
#'
#' model_fit <- fit(Species ~ ., data = iris, model = XGBTreeModel)
#' varimp(model_fit, method = "model", type = "Frequency", scale = FALSE)
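#'
#' ## Gain-based importance (the default type), unscaled
#' varimp(model_fit, method = "model", scale = FALSE)
#'
#' ## A sketch of automatic tuning over the default XGBTreeModel grid; the
#' ## tuned parameters are listed in Details (may take a while to run)
#' tuned_fit <- fit(Species ~ ., data = iris, model = TunedModel(XGBTreeModel))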
#' }
#'
XGBModel <- function(
nrounds = 100, ..., objective = character(), aft_loss_distribution = "normal",
aft_loss_distribution_scale = 1, base_score = 0.5, verbose = 0,
print_every_n = 1
) {
params <- as.list(environment())
MLModel(
name = "XGBModel",
label = "Extreme Gradient Boosting",
packages = "xgboost (>= 1.3.0)",
response_types = c("factor", "numeric", "PoissonVariate", "Surv"),
weights = TRUE,
predictor_encoding = "model.matrix",
na.rm = "response",
params = new_params(c(params[1], ..., params[-1])),
fit = function(
formula, data, weights, nrounds, verbose, print_every_n, ...
) {
x <- model.matrix(data, intercept = FALSE)
y <- response(data)
y_levels <- levels(y)
params <- list(...)
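      # Candidate objectives for the response type; when no objective is
      # specified, match.arg() below selects the first (default) choice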
choices <- switch_class(y,
"factor" = {
y <- as.numeric(y) - 1
c("multi:softprob", if (length(y_levels) <= 2) "binary:logistic")
},
"numeric" = c("reg:squarederror", "reg:logistic", "reg:gamma",
"reg:tweedie", "rank:pairwise", "rank:ndcg", "rank:map"),
"PoissonVariate" = "count:poisson",
"Surv" = {
throw(check_censoring(y, "right"))
y_time <- y[, "time"]
y_event <- y[, "status"] == 1
c("survival:aft", "survival:cox")
}
)
params$objective <- match.arg(params$objective, choices)
dmat <- xgboost::xgb.DMatrix(x)
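      # The AFT objective uses interval-censored labels: right-censored
      # observations get an upper bound of Inf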
if (params$objective == "survival:aft") {
y_lower <- y_upper <- y_time
y_upper[!y_event] <- Inf
xgboost::setinfo(dmat, "label_lower_bound", y_lower)
xgboost::setinfo(dmat, "label_upper_bound", y_upper)
} else {
params$aft_loss_distribution <- NULL
params$aft_loss_distribution_scale <- NULL
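        # The Cox objective encodes right-censored observations as negative
        # event times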
if (params$objective == "survival:cox") {
y <- ifelse(y_event, y_time, -y_time)
}
xgboost::setinfo(dmat, "label", y)
}
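      # num_class is required by xgboost for the multinomial objective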
params$num_class <- if (params$objective == "multi:softprob") {
length(y_levels)
}
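      # Drop scale_pos_weight for objectives where it does not apply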
scalable <- c("binary:logistic", "reg:logistic", "reg:squarederror")
if (!(params$objective %in% scalable)) {
params$scale_pos_weight <- NULL
}
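      # top_k applies only to the "greedy" and "thrifty" feature selectors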
if (!isTRUE(params$feature_selector %in% c("greedy", "thrifty"))) {
params$top_k <- NULL
}
res <- xgboost::xgboost(
dmat, weight = weights, params = params, nrounds = nrounds,
verbose = verbose, print_every_n = print_every_n
)
attr(res, ".MachineShop") <- list(y_levels = y_levels)
res
},
predict = function(object, newdata, times, .MachineShop, ...) {
input <- .MachineShop$input
newx <- model.matrix(newdata, intercept = FALSE)
xgb_predict <- function(newdata = newx, lp = FALSE) {
predict(object, newdata = newdata, outputmargin = lp)
}
switch(object$params$objective,
"multi:softprob" = {
matrix(xgb_predict(), nrow(newx), byrow = TRUE)
},
"survival:aft" = {
distr <- object$params$aft_loss_distribution
if (distr == "normal") distr <- "gaussian"
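          # Survival probabilities are computed from the AFT linear predictor:
          # standardized quantiles (log(t) - lp) / scale are evaluated under
          # the specified error distribution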
if (length(times)) {
pred <- xgb_predict(lp = TRUE)
log_times <- matrix(log(times), length(pred), length(times),
byrow = TRUE)
scale <- object$params$aft_loss_distribution_scale
quants <- (log_times - pred) / scale
surv_probs <- switch(distr,
"extreme" = exp(-exp(quants)),
"gaussian" = 1 - pnorm(quants),
"logistic" = 1 / (1 + exp(quants)),
throw(Error("Unsupported aft loss distribution \"", distr, "\"."))
)
SurvProbs(surv_probs, times = times, distr = distr)
} else {
SurvTimes(xgb_predict(), distr = distr)
}
},
"survival:cox" = {
x <- model.matrix(PredictorFrame(input), intercept = FALSE)
lp <- xgb_predict(x, lp = TRUE)
new_lp <- xgb_predict(newx, lp = TRUE)
predict(response(input), lp, new_lp, times = times,
weights = case_weights(input), ...)
},
xgb_predict()
)
},
varimp = function(
object, type = c("Gain", "Cover", "Frequency"), .MachineShop, ...
) {
vi <- xgboost::xgb.importance(model = object, ...)
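      # gblinear models report coefficients (Weight, possibly one column per
      # class); tree boosters report Gain, Cover, or Frequency importance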
structure(
if (!is.null(vi$Weight)) {
metric <- "coefficient"
if (!is.null(vi$Class)) {
vi_wide <- reshape(
vi, idvar = "Feature", timevar = "Class", v.names = "Weight",
varying = list(.MachineShop$y_levels), direction = "wide"
)
rownames(vi_wide) <- vi_wide[[1]]
vi_wide[-1]
} else {
structure(vi$Weight, names = vi$Feature)
}
} else {
type <- match.arg(type)
metric <- tolower(type)
structure(vi[[type]], names = vi$Feature)
},
metric = metric
)
}
)
}
MLModelFunction(XGBModel) <- NULL
#' @rdname XGBModel
#'
XGBDARTModel <- function(
eta = 0.3, gamma = 0, max_depth = 6, min_child_weight = 1,
max_delta_step = .(0.7 * is(y, "PoissonVariate")), subsample = 1,
colsample_bytree = 1, colsample_bylevel = 1, colsample_bynode = 1,
alpha = 0, lambda = 1, tree_method = "auto", sketch_eps = 0.03,
scale_pos_weight = 1, refresh_leaf = 1, process_type = "default",
grow_policy = "depthwise", max_leaves = 0, max_bin = 256,
num_parallel_tree = 1, sample_type = "uniform", normalize_type = "tree",
rate_drop = 0, one_drop = 0, skip_drop = 0, ...
) {
.XGBModel(name = "XGBDARTModel", label = "Extreme Gradient Boosting (DART)",
model = "dart", envir = environment(), ...)
}
MLModelFunction(XGBDARTModel) <- NULL
#' @rdname XGBModel
#'
XGBLinearModel <- function(
alpha = 0, lambda = 0, updater = "shotgun", feature_selector = "cyclic",
top_k = 0, ...
) {
.XGBModel(
name = "XGBLinearModel", label = "Extreme Gradient Boosting (Linear)",
model = "gblinear", envir = environment(), ...
)
}
MLModelFunction(XGBLinearModel) <- NULL
#' @rdname XGBModel
#'
XGBTreeModel <- function(
eta = 0.3, gamma = 0, max_depth = 6, min_child_weight = 1,
max_delta_step = .(0.7 * is(y, "PoissonVariate")), subsample = 1,
colsample_bytree = 1, colsample_bylevel = 1, colsample_bynode = 1,
alpha = 0, lambda = 1, tree_method = "auto", sketch_eps = 0.03,
scale_pos_weight = 1, refresh_leaf = 1, process_type = "default",
grow_policy = "depthwise", max_leaves = 0, max_bin = 256,
num_parallel_tree = 1, ...
) {
.XGBModel(name = "XGBTreeModel", label = "Extreme Gradient Boosting (Tree)",
model = "gbtree", envir = environment(), ...)
}
MLModelFunction(XGBTreeModel) <- NULL
.XGBModel <- function(name, label, model, envir, ...) {
params <- c(as.list(envir), ...)
params$booster <- model
model <- do.call(XGBModel, params, quote = TRUE)
model@name <- name
model@label <- label
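  # Grid definitions used by TunedModel: one value-generating function per
  # tunable parameter, with `default` flagging those included in grids by
  # default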
gridinfo <- new_gridinfo(
param = c("nrounds", "eta", "gamma", "max_depth", "min_child_weight",
"subsample", "colsample_bytree", "rate_drop", "skip_drop",
"alpha", "lambda"),
get_values = c(
function(n, ...) round_int(seq_range(0, 50, c(1, 1000), n + 1)),
function(n, ...) seq(0.001, 0.6, length = n),
function(n, ...) seq(0, 10, length = n),
function(n, ...) seq_len(min(n, 10)),
function(n, data, ...) seq(0, min(20, nrow(data)), length = n),
function(n, ...) seq(0.25, 1, length = n),
function(n, ...) seq(0.3, 0.8, length = n),
function(n, ...) seq(0.01, 0.50, length = n),
function(n, ...) seq(0.05, 0.95, length = n),
function(n, ...) c(10^-seq_inner(0, 5, n - 1), 0),
function(n, ...) c(10^-seq_inner(0, 5, n - 1), 0)
),
default = c(TRUE, rep(FALSE, 2), TRUE, rep(FALSE, 5), rep(TRUE, 2))
)
grid_params <- switch(params$booster,
"dart" = c("nrounds", "eta", "gamma", "max_depth", "min_child_weight",
"subsample", "colsample_bytree", "rate_drop", "skip_drop"),
"gblinear" = c("nrounds", "alpha", "lambda"),
"gbtree" = c("nrounds", "eta", "gamma", "max_depth", "min_child_weight",
"subsample", "colsample_bytree")
)
model@gridinfo <- gridinfo[gridinfo$param %in% grid_params, ]
model
}