#' Wrapper around xgb.cv and xgb.train
#' @param formula,data Formula and data.frame from which to create the model.matrix and response.
#' @param n.trees Number of trees to build.
#' @param interaction.depth,learning.rate Interaction depth (6) and learning rate (0.1).
#' @param weight Optional vector of observation weights, for example exposure
#' when modelling Poisson rates (NULL).
#' @param bag.fraction Proportion of the data used to grow each tree (0.5).
#' If \code{bag.fraction = 1}, the whole data set is used as-is.
#' @param col.fraction Proportion of columns of features to use. Defaults to
#' \code{col.fraction = 1} and the only reason you'd want to reduce it is due
#' to memory or processing time issues. This is passed as \code{colsample_bynode}
#' for no reason other than that's what random forest does. I've no idea if
#' one method is better than another.
#' @param cv.folds Number of cross-validation folds (10).
#' @param cv.class.stratify Whether to stratify cross-validation by the response
#' values (FALSE).
#' @param n.cores Number of cores to use (1).
#' @param n.minobsinnode Minimum number of observations allowed in a tree node (3).
#' @param leaf.penalty Penalty factor for the total number of leaves in trees (0).
#' @param weight.penalty.L1,weight.penalty.L2 L1 and L2 penalties for leaf weights
#' (both 0 by default).
#' @param early.stopping.trees Passed through as \code{early_stopping_rounds} and
#' defaults to 100.
#' @param distribution The only values allowed are "gaussian", "huber" (which
#' uses pseudo-Huber loss), "binomial", "multinomial", "poisson", "quantile"
#' or "coxph". Others should be added as the (my) need arises. There is no
#' default because experience suggests that a default leads too easily to mistakes.
#' @param quant Quantile to be modelled when \code{distribution = "quantile"}. Defaults
#' to \code{quant = 0.5}.
#' @param event Only used when \code{distribution = "coxph"}. Indicates if the
#' observational unit has an event (1) or is censored (0).
#' @param verbose Control printing (100). Use \code{verbose = 0} for no printing.
#' @param fail.if.not.converged Whether to stop with an error if cross-validation
#' has not converged within \code{n.trees} (TRUE). If FALSE, a warning is issued
#' instead.
#'
#' @details The function takes on the job of turning the data and formula into
#' the model matrix and \code{xgb.DMatrix} that \code{xgboost} favours, and applies
#' sensible metrics given the distribution: that is, it does maximum likelihood
#' when a likelihood function is available (everything but quantile regression).
#'
#' The response (and any other) variable should be transformed prior to
#' using \code{xgbm}, if necessary. An apparent bug, somewhere or other,
#' means that if the transformation is done via the formula, relative
#' influence goes wrong.
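#'
#' As a minimal sketch of what is meant (\code{mydata} and its columns are
#' hypothetical names), transform first rather than writing \code{log(y) ~ .}:
#' \preformatted{
#' mydata$log.y <- log(mydata$y)
#' fit <- xgbm(log.y ~ x1 + x2, data = mydata,
#'             distribution = "gaussian", n.trees = 500)
#' }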
#'
#' For \code{distribution = "coxph"}, you need to use the \code{event}
#' argument.
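#'
#' A hypothetical sketch (\code{surv.df}, \code{time} and \code{status} are
#' made-up names; \code{status} must be a column of \code{surv.df} and is
#' given unquoted):
#' \preformatted{
#' fit <- xgbm(time ~ ., data = surv.df,
#'             distribution = "coxph", event = status, n.trees = 1000)
#' }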
#'
#' For \code{distribution = "poisson"}, if you need an offset, divide the
#' response by the exposure and pass exposure in using the weight argument.
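#'
#' For example (\code{claims}, with columns \code{n.claims} and
#' \code{exposure}, is a hypothetical data set):
#' \preformatted{
#' claims$rate <- claims$n.claims / claims$exposure
#' fit <- xgbm(rate ~ age + region, data = claims,
#'             distribution = "poisson", weight = claims$exposure,
#'             n.trees = 500)
#' }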
#'
#' Some of the code is quite inefficient and probably annoying. One of the
#' reasons is that it's best to fail quickly rather than wait for a lot of
#' processing to be done and then fail.
#' @export
xgbm <- function(formula, data, n.trees=100, interaction.depth=6, learning.rate=.1,
weight = NULL,
bag.fraction=.5, col.fraction = 1,
cv.folds=10, cv.class.stratify=FALSE, n.cores = NULL,
n.minobsinnode = 3, leaf.penalty = 0,
weight.penalty.L1 = 0, weight.penalty.L2 = 0,
early.stopping.trees = 100,
distribution = NULL,
quant = .5, event = NULL,
verbose =100, fail.if.not.converged = TRUE){
theCall <- match.call()
if (is.null(distribution)){
stop("distribution not specified and I daren't guess")
}
nms <- make.names(names(data))
if (any(names(data) != nms)){
stop("Do names(data) <- make.names(names(data))")
}
formula <- as.formula(formula)
if (length(as.character(formula[[2]])) != 1){
stop("You need to do transformations prior to calling xgbm, and the response appears to use a transformation")
}
distribution <- tolower(distribution)
#if (!is.null(metrics) & !is.function(metrics) & length(metrics) != 1){
# stop("metrics should be a string (not vector of strings) or a function")
#}
if (is.null(n.cores)){
n.cores <- parallel::detectCores() - 1
}
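# For coxph, the (unquoted) event column is looked up in the data,
# kept aside as the event indicator, and dropped from the predictors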
if (distribution == "coxph"){
event <- deparse(substitute(event))
if(event == "NULL"){
stop("For coxph, you need to specify which column indicates an event (unquoted).")
}
if (!(event %in% names(data))){
stop("Specified event column is not in the data.")
}
trueevent <- data[, event]
data <- data[, names(data) != event]
}
if (!is.logical(verbose) && verbose > 0 && verbose < Inf){
print_every_n <- verbose
verbose <- TRUE
} else {
print_every_n <- 1
verbose <- FALSE
}
# Get column classes
ytxt <- as.character(formula)[2]
cc <- sapply(data[, names(data) != ytxt], class)
if ("character" %in% cc){
if (verbose) message("converting character columns to factors")
data[sapply(data, is.character)] <- lapply(data[sapply(data, is.character)], as.factor)
}
data <- droplevels(data) # otherwise xgb.importance can fail
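# Build the model matrix and work out which predictors the formula refers to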
X <- big.model.matrix(formula, data, n.cores = n.cores)
lp <- as.character(formula)[3]
if (lp == "."){
Xnames <- names(data)[names(data) != ytxt]
} else {
Xnames <- strsplit(as.character(formula)[3], " \\+ ")[[1]]
}
xdata <- data[, Xnames]
data <- data[, c(ytxt, Xnames)]
xlevels <- lapply(xdata, function(X){
if (is.factor(X) | is.ordered(X)){
levels(X)
} else {
NULL
}
})
xlevels[sapply(xlevels, is.null)] <- NULL
y <- data[, ytxt]
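# Recode the response as xgboost expects: zero-based classes for
# multinomial/binomial, and negative times for censored coxph observations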
if (distribution == "multinomial"){
y <- as.numeric(y) - 1
} else if (distribution %in% c("binomial", "bernoulli")){
if (inherits(y, "character")){
y <- as.numeric(as.factor(y)) - 1
} else if (inherits(y, "factor")){
y <- as.numeric(y) - 1
} else {
y <- as.numeric(y)
}
} else if (distribution == "coxph"){
y <- ifelse(trueevent, y, -y)
}
if (any(is.na(y))){
stop("missing values in the response are not allowed")
}
xd <- xgb.DMatrix(data = as.matrix(X), info = list(label = y))
if (!is.null(weight)){
# attach observation weights to the DMatrix so both xgb.cv and xgb.train use them
setinfo(xd, "weight", weight)
}
if (is.character(distribution)){
if (length(unique(y)) == 2 || distribution %in% c("binomial", "bernoulli")){
distribution <- "reg:logistic"
metrics <- "logloss"
} else if (distribution == "gaussian"){
distribution <- "reg:squarederror"
metrics <- "rmse"
} else if (distribution == "huber") {
distribution <- "reg:pseudohubererror"
metrics <- "mphe"
} else if (distribution == "quantile"){
distribution <- function(preds, dtrain){
qntregobj(preds, dtrain, quant = quant)
}
metrics <- list(function(preds, dtrain) qnterr(preds, dtrain, quant=quant))
} else if (distribution == "multinomial"){
distribution <- "multi:softprob"
metrics <- "mlogloss"
num_class <- length(unique(y))
} else if (distribution == "poisson"){
distribution <- "count:poisson"
metrics <- "poisson-nloglik"
} else if (distribution == "coxph"){
event <- deparse(substitute(event))
if(event == "NULL"){
stop("For coxph, you need to specify which column indicates an event (unquoted).")
}
distribution <- "survival:cox"
metrics <- "cox-nloglik"
} else {
stop("you're going to have to implement that distribution before you can use it")
}
}
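# Map the gbm-style arguments onto xgboost's parameter names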
params <- list(eta = learning.rate, max_depth = interaction.depth,
nthread = n.cores, subsample = bag.fraction,
colsample_bynode = col.fraction,
min_child_weight = n.minobsinnode,
gamma = leaf.penalty, alpha = weight.penalty.L1,
lambda = weight.penalty.L2,
objective = distribution,
eval_metric = metrics)
if (distribution == "multi:softprob"){
params <- c(params, num_class = num_class)
}
xm <- xgb.cv(xd, params=params, nfold=cv.folds, nrounds = n.trees,
early_stopping_rounds = early.stopping.trees,
stratified = cv.class.stratify,
print_every_n = print_every_n,
prediction = TRUE,
verbose = verbose)
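# Standardise the evaluation column names on "error" so that
# bestCViter() can find them regardless of the metric used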
nms <- names(xm$evaluation_log)
nms <- gsub("rmse", "error", nms)
nms <- gsub("logloss", "error", nms)
names(xm$evaluation_log) <- nms
if (bestCViter(list(cv = xm)) == n.trees){
msg <- "Not enough trees for CV convergence"
if (fail.if.not.converged){
stop(msg)
} else {
warning(msg)
}
}
m <- xgb.train(xd, params=params, nrounds=n.trees, print_every_n = print_every_n,
verbose = verbose)
res <- list(call = theCall, model = m, cv = xm, cv.folds = cv.folds,
xnames = Xnames, X = X, y = y, data = data,
xlevels=xlevels, n.trees = n.trees)
class(res) <- "xgbm"
res
}