Nothing
#' @title Ensembling Base Class
#'
#' @usage NULL
#' @format Abstract [`R6Class`][R6::R6Class] inheriting from [`PipeOp`].
#'
#' @description
#' Parent class for [`PipeOp`]s that aggregate predictions. Implements the `private$.train()` and `private$.predict()` methods necessary
#' for a `PipeOp` and requires deriving classes to create the `private$weighted_avg_predictions()` function.
#'
#' @section Construction:
#' Note: This object is typically constructed via a derived class, e.g. [`PipeOpClassifAvg`] or [`PipeOpRegrAvg`].
#' ```
#' PipeOpEnsemble$new(innum = 0, collect_multiplicity = FALSE, id, param_set = ps(), param_vals = list(), packages = character(0), prediction_type = "Prediction")
#' ```
#'
#' * `innum` :: `numeric(1)`\cr
#' Determines the number of input channels.
#' If `innum` is 0 (default), a vararg input channel is created that can take an arbitrary number of inputs.
#' * `collect_multiplicity` :: `logical(1)`\cr
#' If `TRUE`, the input is a [`Multiplicity`] collecting channel. This means, a
#' [`Multiplicity`] input, instead of multiple normal inputs, is accepted and the members are aggregated. This requires `innum` to be 0.
#' Default is `FALSE`.
#' * `id` :: `character(1)`\cr
#' Identifier of the resulting object.
#' * `param_set` :: [`ParamSet`][paradox::ParamSet]\cr
#' ("Hyper"-)Parameters in form of a [`ParamSet`][paradox::ParamSet] for the resulting [`PipeOp`].
#' * `param_vals` :: named `list`\cr
#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`.
#' * `packages` :: `character`\cr
#' Set of packages required for this `PipeOp`. These packages are loaded during `$train()` and `$predict()`, but not attached.
#' Default `character(0)`.
#' * `prediction_type` :: `character(1)`\cr
#' The `predict` entry of the `$input` and `$output` type specifications.
#' Should be `"Prediction"` (default) or one of its subclasses, e.g. `"PredictionClassif"`, and correspond to the type accepted by
#' `private$.train()` and `private$.predict()`.
#'
#' @section Input and Output Channels:
#' [`PipeOpEnsemble`] has multiple input channels depending on the `innum` construction argument, named `"input1"`, `"input2"`, ...
#' if `innum` is nonzero; if `innum` is 0, there is only one *vararg* input channel named `"..."`.
#' All input channels take only `NULL` during training and take a [`Prediction`][mlr3::Prediction] during prediction.
#'
#' [`PipeOpEnsemble`] has one output channel named `"output"`, producing `NULL` during training and a [`Prediction`][mlr3::Prediction] during prediction.
#'
#' The output during prediction is in some way a weighted averaged representation of the input.
#'
#' @section State:
#' The `$state` is left empty (`list()`).
#'
#' @section Parameters:
#' * `weights` :: `numeric`\cr
#' Relative weights of input predictions. If this has length 1, it is ignored and weighs all inputs equally. Otherwise it must have
#' length equal to the number of connected inputs. Initialized to 1 (equal weights).
#'
#' @section Internals:
#' The commonality of ensemble methods using [`PipeOpEnsemble`] is that they take a `NULL`-input during training and save an empty `$state`. They can be
#' used following a set of [`PipeOpLearner`] [`PipeOp`]s to perform (possibly weighted) prediction averaging. See e.g.
#' [`PipeOpClassifAvg`] and [`PipeOpRegrAvg`] which both inherit from this class.
#'
#' Should it be necessary to use the output of preceding [`Learner`][mlr3::Learner]s
#' during the "training" phase, then [`PipeOpEnsemble`] should not be used. In fact, if training time behaviour of a [`Learner`][mlr3::Learner] is important, then
#' one should use a [`PipeOpLearnerCV`] instead of a [`PipeOpLearner`], and the ensemble can be created with a [`Learner`][mlr3::Learner] encapsulated by a [`PipeOpLearner`].
#' See [`LearnerClassifAvg`] and [`LearnerRegrAvg`] for examples.
#'
#' @section Fields:
#' Only fields inherited from [`PipeOp`].
#'
#' @section Methods:
#' Methods inherited from [`PipeOp`] as well as:
#' * `weighted_avg_prediction(inputs, weights, row_ids, truth)`\cr
#' (`list` of [`Prediction`][mlr3::Prediction], `numeric`, `integer` | `character`, `list`) -> `NULL`\cr
#' Create [`Prediction`][mlr3::Prediction]s that correspond to the weighted average of incoming [`Prediction`][mlr3::Prediction]s. This is
#' called by `private$.predict()` with cleaned and sanity-checked values: `inputs` are guaranteed to fit together,
#' `row_ids` and `truth` are guaranteed to be the same as each one in `inputs`, and `weights` is guaranteed to have the same length as `inputs`.\cr
#' This method is abstract, it must be implemented by deriving classes.
#'
#' @family PipeOps
#' @family Multiplicity PipeOps
#' @family Ensembles
#' @template seealso_pipeopslist
#' @include PipeOp.R
#' @export
PipeOpEnsemble = R6Class("PipeOpEnsemble",
inherit = PipeOp,
public = list(
initialize = function(innum = 0, collect_multiplicity = FALSE, id, param_set = ps(), param_vals = list(), packages = character(0), prediction_type = "Prediction", tags = NULL) {
assert_integerish(innum, lower = 0)
if (paradox_info$is_old) {
paux = ps(weights = p_uty(check_weights(innum), tags = "predict"))
param_set$add(paux$params$weights)
} else {
param_set = c(param_set, ps(weights = p_uty(check_weights(innum), tags = "predict")))
}
param_set$values$weights = 1
inname = if (innum) rep_suffix("input", innum) else "..."
intype = c("NULL", prediction_type)
private$.collect = assert_flag(collect_multiplicity)
if (collect_multiplicity) {
if (innum) {
stop("collect_multiplicity only works with innum == 0.")
}
inname = "[...]"
intype = sprintf("[%s]", intype)
}
super$initialize(id, param_set = param_set, param_vals = param_vals, packages = packages,
input = data.table(name = inname, train = intype[[1]], predict = intype[[2]]),
output = data.table(name = "output", train = "NULL", predict = prediction_type),
tags = c(tags, "ensemble")
)
}
),
private = list(
weighted_avg_predictions = function(inputs, weights, row_ids, truth) stop("Abstract."),
.train = function(inputs) {
self$state = list()
list(NULL)
},
.predict = function(inputs) {
if (private$.collect) {
inputs = unclass(inputs[[1]])
}
weights = self$param_set$values$weights
row_ids = inputs[[1]]$row_ids
map(inputs, function(x) assert_true(identical(row_ids, x$row_ids)))
truth = inputs[[1]]$truth
if (length(weights) == 1) weights = rep(1, length(inputs))
weights = weights / sum(weights)
assert_numeric(weights, any.missing = FALSE, len = length(inputs))
# Drop zero-weights for efficiency
# FIXME: this is not numerically stable
# Note: this is the behaviour of stats:::weighted.mean.default
inputs = inputs[weights != 0]
weights = weights[weights != 0]
list(private$weighted_avg_predictions(inputs, weights, row_ids, truth))
},
.collect = NULL,
.additional_phash_input = function() {
retval = list(private$.collect, self$input$name)
if (is.null(self$initialize)) return(retval)
initformals <- names(formals(args(self$initialize)))
if (!test_subset(initformals, c("id", "param_vals", "innum", "collect_multiplicity", "..."))) {
warningf("PipeOp %s has construction arguments besides 'id', 'param_vals', 'innum', 'collect_multiplicity' or '...' but .additional_phash_input was not overloaded.
This warning will become an error in the future.", class(self)[[1]], class(self)[[1]])
}
retval
}
)
)
# Check function for ParamUty: Check that "weight" parameter
# is a numeric vector and
# has either length 1 or length `innum`. `innum` can be 0 (vararg),
# in which case any length is accepted.
#
# It is necessary to put this function in top level because ParamUty does not
# handle function environments well.
check_weights = function(innum) {
if (innum == 0) {
function(x) check_numeric(x, any.missing = FALSE)
} else {
function(x) {
tests = c(test_numeric(x, len = innum, any.missing = FALSE), test_numeric(x, len = 1L, any.missing = FALSE))
if (sum(tests) == 0L) {
sprintf("Must be of type 'numeric', and length 1 or %s", innum)
} else {
TRUE
}
}
}
}
# Weighted sum of `matrices`
# @param matrices [`list` of `matrix`]: matrices to sum up, must be the same shape
# @param weights [`numeric`]: weights, same length as `matrices`
# @return `matrix`
weighted_matrix_sum = function(matrices, weights) {
accmat = matrices[[1]] * weights[1]
for (idx in seq_along(matrices)[-1]) {
accmat = accmat + matrices[[idx]] * weights[idx]
}
accmat
}
# For a set of n `factor` vectors each of length l with the same k levels and a
# numeric weight vector of length n, returns a matrix of dimension l times k.
# Each cell contains the weighted relative frequency of the respective factor
# level being present at the respective positional index over all n factors.
# If `factors` is a data.frame/data.table and the weights are uniform (1 / l),
# the output would simply be t(apply(factors, 1, table) / l).
# @param factors [`list` of `factor`]: must have the same length and levels
# @param weights [`numeric`]: weights, same length as the list of `factors`
# @return `matrix`
# FIXME: probably should be renamed
weighted_factor_mean = function(factors, weights, alllevels) {
accmat = matrix(0, nrow = length(factors[[1]]), ncol = length(alllevels))
colnames(accmat) = alllevels
for (idx in seq_along(factors)) {
rdf = data.frame(x = factor(factors[[idx]], levels = alllevels))
curmat = stats::model.matrix(~ 0 + x, rdf) * weights[idx]
accmat = accmat + curmat
}
accmat
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.