#' @title CompBoost Regression Learner
#'
#' @name mlr_learners_regr.compboost
#'
#' @description
#' Componentwise boosting
#' Calls [compboost::Compboost()] from package \CRANpkg{compboost}.
#'
#' @template section_dictionary_learner
#' @templateVar id regr.compboost
#'
#' @export
#' @template seealso_learner
#' @template example
LearnerRegrCompboost = R6Class("LearnerRegrCompboost",
inherit = LearnerRegr,
public = list(
#' @description
#' Create a `LearnerRegrCompboost` object.
initialize = function() {
ps = ParamSet$new(
params = list(
ParamDbl$new(id = "df", default = 4, lower = 1, tags = "train"),
ParamDbl$new(id = "df_cat", default = 4, lower = 1, tags = "train"),
ParamInt$new(id = "iters_max_univariate", default = 10000L, lower = 1L, tags = "train"),
ParamInt$new(id = "iters_max_interactions", default = 10000L, lower = 1L, tags = "train"),
# ParamDbl$new(id = "learning_rate_univariate", default = 0.01, lower = 0),
# ParamDbl$new(id = "learning_rate_interactions", default = 0.1, lower = 0),
ParamDbl$new(id = "n_knots_univariate", default = 20L, lower = 4, tags = "train"),
ParamDbl$new(id = "n_knots_interactions", default = 10L, lower = 4, tags = "train"),
ParamLgl$new(id = "use_components", default = TRUE, tags = "train"),
ParamInt$new(id = "ncores", default = 1L, lower = 1L, upper = parallel::detectCores() - 1L, tags = c("train", "test")),
ParamDbl$new(id = "stop_epsylon_for_break", default = 0.00001, lower = 0, upper = 1, tags = "train"),
ParamDbl$new(id = "stop_patience", default = 10L, lower = 1L, tags = "train"),
ParamDbl$new(id = "val_fraction", default = 0.33, lower = 0, upper = 1, tags = "train"),
ParamDbl$new(id = "top_interactions", default = 0.02, lower = 0.01, upper = 1, tags = "train"),
ParamDbl$new(id = "n_min_interactions", default = 10L, lower = 0, tags = "train"),
ParamLgl$new(id = "use_early_stopping", default = TRUE, tags = "train"),
ParamLgl$new(id = "show_output", default = FALSE, tags = "train"),
ParamLgl$new(id = "just_univariate", default = FALSE, tags = "train"),
ParamLgl$new(id = "add_deeper_interactions", default = FALSE, tags = "train"),
ParamInt$new(id = "iters_deeper_interactions", default = 500, lower = 0, tags = "train"),
#ParamDbl$new(id = "learning_rate_deeper_interactions", default = 0.2, lower = 0, upper = 1),
ParamInt$new(id = "train_time_total", default = 0, lower = 0, tags = "train"),
ParamInt$new(id = "n_threshold_binning", default = 4900, lower = 0, tags = "train"),
ParamDbl$new(id = "learning_rate", default = 0.01, lower = 0, tags = "train")
))
ps$values = list(
# General pars:
df = 6,
n_threshold_binning = 4000L,
use_components = TRUE,
show_output = FALSE,
learning_rate = 0.01,
ncores = parallel::detectCores() - 1L,
# Univariate model:
#learning_rate_univariate = 0.1,
n_knots_univariate = 15,
iters_max_univariate = 50000L,
# Interaction model (tensor splines):
just_univariate = FALSE,
top_interactions = 0.02,
#learning_rate_interactions = 0.15,
n_knots_interactions = 8,
iters_max_interactions = 50000L,
# Control deeper interactions (trees):
add_deeper_interactions = TRUE,
iters_deeper_interactions = 500L,
#learning_rate_deeper_interactions = 0.15,
# Control early stopping:
use_early_stopping = TRUE,
stop_patience = 10L,
stop_epsylon_for_break = 1e-6,
train_time_total = 120
) # Restrict the training to 2 hours
super$initialize(
id = "regr.compboost",
packages = "compboost",
feature_types = c("numeric", "factor", "integer", "character"),
predict_types = "response",
param_set = ps
)
},
getRiskStages = function(log_entry = "train_risk") {
if (is.null(self$model))
stop("Train learner first to extract risk values.")
out = data.frame(stage = c("featureless", "univariate", "pairwise-interactions", "deep-interactions"),
value = rep(NA_real_, 4L), explained = rep(NA_real_, 4L), percentage = rep(NA_real_, 4L),
iterations = rep(NA_integer_, 4L))
if ("univariate" %in% names(self$model)) {
logs = self$model$univariate$getLoggerData()
if (! log_entry %in% names(logs))
stop("No log entry", log_entry, "found in compboost log.")
out$value[1] = logs[[log_entry]][logs$baselearner == "intercept"]
out$value[2] = tail(logs[[log_entry]], 1)
out$iterations[1] = 0
out$iterations[2] = max(logs[["_iterations"]])
} else {
stop("At least univariate model must be given!")
}
if ("interactions" %in% names(self$model)) {
logs = self$model$interactions$getLoggerData()
if (! log_entry %in% names(logs))
stop("No log entry", log_entry, "found in compboost log.")
out$value[3] = tail(logs[[log_entry]], 1)
out$iterations[3] = max(logs[["_iterations"]])
}
if ("deeper_interactions" %in% names(self$model)) {
vals = vapply(X = self$model$deeper_interactions$trees, FUN = function(tree) {
if (! log_entry %in% names(tree))
stop("Cannot find log entry", log_entry, "in tree.")
return(tree[[log_entry]])
}, FUN.VALUE = numeric(1L))
if (length(vals) == 0)
out$value[4] = NA_real_
else
out$value[4] = tail(vals, 1L)
out$iterations[4] = length(vals)
}
out$explained = c(0, -diff(out$value))
out$percentage = out$explained / sum(out$explained, na.rm = TRUE)
return(out)
}
),
private = list(
.train = function(task) {
time0 = proc.time()
squaredLoss = function(truth, pred) (truth - pred)^2
squaredRisk = function(truth, pred) mean(squaredLoss(truth, pred))
### Set params with default and user defined ones:
pdefaults = self$param_set$default
pars = self$param_set$values
self$param_set$values = mlr3misc::insert_named(pdefaults, pars)
if (task$nrow >= self$param_set$values$n_threshold_binning)
bin_roots = c(seq(2, 1, length.out = 4)[-4], 0)
else
bin_roots = 0L
### Get stop arguments and train/test indices:
stop_args = list(eps_for_break = self$param_set$values$stop_epsylon_for_break,
patience = self$param_set$values$stop_patience)
if (self$param_set$values$use_early_stopping) {
test_idx = as.integer(sample(seq_len(task$nrow), trunc(task$nrow * self$param_set$values$val_fraction)))
} else {
test_idx = NULL
}
train_idx = setdiff(seq_len(task$nrow), test_idx)
for (bin_root in bin_roots) {
optimizer = compboost::OptimizerCoordinateDescent$new(self$param_set$values$ncores)
### Define compboost model for univariate features:
loss = compboost::LossQuadratic$new()
cboost_uni = Compboost$new(data = task$data(), target = task$target_names,
loss = loss, learning_rate = self$param_set$values$learning_rate,
# loss = loss, learning_rate = self$param_set$values$learning_rate_univariate,
optimizer = optimizer, test_idx = test_idx, stop_args = stop_args,
use_early_stopping = self$param_set$values$use_early_st
)
### If a maximum time is given, the logger is used for stopping. Otherwise the time is just logged:
if (self$param_set$values$train_time_total > 0) {
cboost_uni$addLogger(LoggerTime, TRUE, "minutes", self$param_set$values$train_time_total, "minutes")
} else {
cboost_uni$addLogger(LoggerTime, FALSE, "minutes", 0, "minutes")
}
### Add base-learner/components (linear + centered spline):
e = try({
nuisance = lapply(task$feature_names, function(nm) {
if (is.numeric(task$data()[[nm]])) {
if (self$param_set$values$use_components) {
cboost_uni$addComponents(nm, n_knots = self$param_set$values$n_knots_univariate,
df = self$param_set$values$df, bin_root = bin_root)
} else {
cboost_uni$addBaselearner(nm, "spline", BaselearnerPSpline,
n_knots = self$param_set$values$n_knots_univariat,
df = self$param_set$values$df, bin_root = bin_root)
}
} else {
cboost_uni$addBaselearner(nm, "category", BaselearnerCategoricalRidge,
df = self$param_set$values$df_cat)
}
})
### Train model:
if (self$param_set$values$show_output) {
cboost_uni$train(self$param_set$values$iters_max_univariate)
} else {
nuisance = capture.output(cboost_uni$train(self$param_set$values$iters_max_univariate))
}
}, silent = TRUE)
if (class(e) == "try-error") {
if (grepl("chol()", attr(e, "condition")) && (bin_root > 0)) {
i = which(bin_root == bin_roots)
msg = paste0("Trying to catch Cholesky decomposition error.",
"This may appear due to too aggressive binning with a root of ",
round(bin_root, 2))
if (i == (length(bin_roots) - 1)) {
msg = paste0(msg, ". Trying to fit model without binning.")
} else {
msg = paste0(msg, ". Now trying with a smaller root of ", round(bin_roots[i+1]), ".")
}
warning(msg)
} else {
stop(e)
}
} else {
break
}
}
# ### Check if model was early stopped, if so, set it to iters - patience - 1 to set the
# ### optimal stopping iter:
# ld = cboost_uni$getLoggerData()
# was_early_stopped = (max(ld[["_iterations"]]) < self$param_set$values[["iters_max_univariate"]]) &&
# (max(ld[["minutes"]]) < self$param_set$values[["train_time_total"]])
# if (was_early_stopped && (cboost_uni$getCurrentIteration() > self$param_set$values$stop_patience + 2))
# cboost_uni$train(cboost_uni$getCurrentIteration() - self$param_set$values$stop_patience - 1)
out = list()
out[["univariate"]] = cboost_uni
if (self$param_set$values$just_univariate) return(out)
### Create new task for interaction detection:
df_new = task$data()
df_new[[task$target_names]] = NULL
# Access predictions:
truth = task$truth()
pred_uni = cboost_uni$predict(task$data())
pseudo_uni = as.vector(loss$calculatePseudoResiduals(cbind(truth), pred_uni))
# Define new task with 'residuals' as target
df_new$residuals = pseudo_uni
tsk_new = TaskRegr$new(id = "residuals", backend = df_new, target = "residuals")
### Extract interactions based on random forest:
extracted_interactions = na.omit(po("extract_interactions", degree = 2)$train(list(tsk_new))$output)
ninteractions = nrow(extracted_interactions)
ntopinteractions = ceiling(ninteractions * self$param_set$values$top_interactions)
if (ntopinteractions < self$param_set$values$n_min_interactions)
ntopinteractions = min(self$param_set$values$n_min_interactions, ninteractions)
### STOP if no interactions were detected:
if (ntopinteractions == 0) {
if (self$param_set$values$show_output)
warning("No interactions were selected! Algorithm is now stopped!")
return(out)
}
### Just use the top interactions (defined by the user, too much interactions makes the model too slow):
top_interactions = seq_len(ntopinteractions)
### Define optimizer and loss with predictions as offset to continue training instead of
### start from all over again:
optimizer_int = compboost::OptimizerCoordinateDescent$new(self$param_set$values$ncores)
loss_int_inbag = compboost::LossQuadratic$new(cbind(pred_uni[train_idx]), TRUE)
if (self$param_set$values$use_early_stopping)
loss_int_oob = compboost::LossQuadratic$new(cbind(pred_uni[test_idx]), TRUE)
else
loss_int_oob = compboost::LossQuadratic$new()
### Define interaction model based on tensor splines:
cboost_int = Compboost$new(data = task$data(), target = task$target_names, loss = loss_int_inbag,
#learning_rate = self$param_set$values$learning_rate_interactions, optimizer = optimizer_int,
learning_rate = self$param_set$values$learning_rate, optimizer = optimizer_int,
test_idx = test_idx, stop_args = c(stop_args, loss_oob = loss_int_oob),
use_early_stopping = self$param_set$values$use_early_stopping
)
### Add tensor splines. (FIXME: Atm just for numeric-numeric interactions):
nuisance = lapply(top_interactions, function(i) {
#Check if numeric! Interactions between ridge and spline needs to be tested first!
e = try({
cboost_int$addTensor(extracted_interactions$feat1[i], extracted_interactions$feat2[i],
n_knots = self$param_set$values$n_knots_interactions, df = self$param_set$values$df)
}, silent = TRUE)
})
### Check if train is used as logging. If so calculate remaining budget:
if (self$param_set$values$train_time_total > 0) {
tint = proc.time() - time0
} else {
tint = Inf
}
### STOP if time is exhausted:
if (self$param_set$values$train_time_total <= (tint[3] / 60)) return(out)
if (self$param_set$values$train_time_total > 0) {
tintuse = ceiling(tint[3] / 60)
cboost_int$addLogger(LoggerTime, TRUE, "minutes", self$param_set$values$train_time_total - tintuse, "minutes")
} else {
cboost_int$addLogger(LoggerTime, FALSE, "minutes", 0, "minutes")
tintuse = Inf
}
if (length(cboost_int$getBaselearnerNames()) > 0) {
### Train interaction model:
if (self$param_set$values$show_output)
cboost_int$train(self$param_set$values$iters_max_interactions)
else
nuisance = capture.output(cboost_int$train(self$param_set$values$iters_max_interactions))
### Post check if model was really trained on the same data::
ch1 = all.equal(cboost_uni$data, cboost_int$data)
if (!ch1) stop("Check failed! Data for both models is not equal!")
if (self$param_set$values$use_early_stopping) {
ch2 = all.equal(cboost_uni$response_oob$getResponse(), cboost_int$response_oob$getResponse())
if (!ch2) stop("Check failed! Response for both models is not equal!")
}
out[["interactions"]] = cboost_int
} else {
warning("No interactions were included! Cannot train interaction model.")
}
### STOP if no deeper interactions are specified:
if (! self$param_set$values$add_deeper_interactions) return(out)
# Calculate pseudo residuals from the fitted univariate model:
pred_int = cboost_uni$predict(task$data())
if (! is.null(cboost_int$model))
pred_int = pred_int + cboost_int$predict(task$data())
pseudo_int = as.vector(loss$calculatePseudoResiduals(cbind(truth), pred_int))
# Define new task with 'residuals' as target
df_new$residuals = pseudo_int
tsk_new = TaskRegr$new(id = "residuals", backend = df_new, target = "residuals")
#residual_booster = boostRpart(tsk_new, lr = self$param_set$values$learning_rate_deeper_interactions,
residual_booster = boostRpartRegr(tsk_new, lr = self$param_set$values$learning_rate,
iters = self$param_set$values$iters_deeper_interactions,
patience = stop_args$patience, eps_for_break = stop_args$eps_for_break,
use_es = self$param_set$values$use_early_stopping, idx_train = train_idx,
idx_test = test_idx, Risk = squaredRisk, truth = truth,
prediction_offset = pred_int, pseudoResiduals = loss$calculatePseudoResiduals,
max_time = self$param_set$values$train_time_total - (proc.time() - time0)[3] / 60)
out[["deeper_interactions"]] = residual_booster
return(out)
},
.predict = function(task) {
newdata = task$data(cols = task$feature_names)
lin_pred = self$model$univariate$predict(newdata)
if (! is.null(self$model$interactions))
lin_pred = lin_pred + self$model$interactions$predict(newdata)
if (! is.null(self$model$deeper_interactions)) {
df_new = task$data()
df_new$residuals = 0
tsk_new = TaskRegr$new(id = "residuals", backend = df_new, target = "residuals")
lin_pred = lin_pred + predict(self$model$deeper_interactions, tsk_new)
#lin_pred = lin_pred + self$model$deeper_interactions$lrn$predict(tsk_new)$response
}
return(list(response = lin_pred))
}
)
)
if (FALSE) {
devtools::load_all()
lr1 = lrn("regr.compboost",
df = 4, show_output = TRUE, top_interactions = 0.02,
learning_rate_univariate = 0.01, learning_rate_interactions = 0.01,
train_time_total = 5,
iters_max_univariate = 50000L, iters_max_interactions = 50000L,
n_knots_univariate = 10, n_knots_interactions = 10,
use_early_stopping = TRUE, stop_patience = 10L, stop_epsylon_for_break = 1e-7)
task = tsk("boston_housing")
lr1$train(task)
pred = lr1$predict(task)
pred$score(msrs(c("regr.mse", "regr.mae")))
lr1$model$univariate$plotBlearnerTraces(n_legend = 20L)
lr1$model$interactions$plotBlearnerTraces()
inbag1 = lr1$model$univariate$getInbagRisk()
inbag2 = lr1$model$interactions$getInbagRisk()
oob1 = lr1$model$univariate$getLoggerData()$oob_risk
oob2 = lr1$model$interactions$getLoggerData()$oob_risk
cutoff = lr1$model$univariate$getCurrentIteration()
yrg = c(min(inbag2, oob2), max(inbag1, oob1))
plot(c(inbag1, inbag2), type = "l", col = "red", ylim = yrg)
lines(c(oob1, oob2), col = "blue")
abline(v = cutoff, lty = 2, col = "dark grey")
legend("topright", lty = 1, col = c("red", "blue"), legend = c("Train risk", "Validation risk"))
text(x = cutoff + 0.01 * (length(c(inbag1, inbag2))), y = max(c(inbag1, oob1)) - 0.05 * (yrg[2] - yrg[1]),
labels = "Switch from univariate\nto interaction model", adj = c(0, 0))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.