Nothing
#' @title Piecewise Linear Encoding Base Class
#'
#' @usage NULL
#' @format Abstract [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @description
#' Abstract base class for piecewise linear encoding.
#'
#' Piecewise linear encoding works by splitting values of features into distinct bins, through an algorithm implemented
#' in `private$.get_bins()`, and then creating new feature columns through a continuous alternative to one-hot encoding.
#' Here, one new feature per bin is constructed, with values being either
#' * `0`, if the original value was below the lower bin boundary,
#' * `1`, if the original value was above or equal to the upper bin boundary, or
#' * a scaled value between `0` and `1`, if the original value was inside the bin boundaries. Scaling is done by
#' offsetting the original value by the lower bin boundary and dividing by the bin width.
#'
#' [`PipeOp`]s inheriting from this encode columns of type `numeric` and `integer`. Use the [`PipeOpTaskPreproc`]
#' `$affect_columns` functionality to only encode a subset of columns, or only encode columns of a certain type, etc.
#'
#' @section Construction:
#' ```
#' PipeOpEncodePL$new(id = "encodepl", param_set = ps(), param_vals = list(), packages = character(0), task_type = "Task")
#' ```
#' * `id` :: `character(1)`\cr
#' Identifier of resulting object. See `$id` slot of [`PipeOp`].
#' * `param_set` :: [`ParamSet`][paradox::ParamSet]\cr
#' Parameter space description. This should be created by the subclass and given to `super$initialize()`.
#' * `param_vals` :: named `list`\cr
#' List of hyperparameter settings, overwriting the hyperparameter settings given in `param_set`. The
#' subclass should have its own `param_vals` parameter and pass it on to `super$initialize()`. Default `list()`.
#' * `packages` :: `character`\cr
#' Set of all required packages for the [`PipeOp`]'s `private$.train()` and `private$.predict()` methods. See `$packages` slot.
#' Default is `character(0)`.
#' * `task_type` :: `character(1)`\cr
#' The class of [`Task`][mlr3::Task] that should be accepted as input and will be returned as output. This
#' should generally be a `character(1)` identifying a type of [`Task`][mlr3::Task], e.g. `"Task"`, `"TaskClassif"` or
#' `"TaskRegr"` (or another subclass introduced by other packages). Default is `"Task"`.
#'
#' @section Input and Output Channels:
#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
#'
#' The output is the input [`Task`][mlr3::Task] with all affected `numeric` and `integer` columns encoded using piecewise linear encoding.
#'
#' @section State:
#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as:
#' * `bins` :: named `list`\cr
#' Named list of numeric vectors. Each element corresponds to and is named after one of the affected feature columns
#' and contains the bin boundaries derived through `private$.get_bins()`.
#'
#' @section Parameters:
#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`].
#'
#' @section Internals:
#' `PipeOpEncodePL` is an abstract class inheriting from [`PipeOpTaskPreprocSimple`] that allows easier implementation
#' of different binning algorithms for piecewise linear encoding. The respective binning algorithm should be implemented
#' as `private$.get_bins()`.
#'
#' @section Fields:
#' Only fields inherited from [`PipeOp`].
#'
#' @section Methods:
#' Methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`] as well as
#' * `.get_bins(task, cols)`\cr
#' ([`Task`][mlr3::Task], `character`) -> named `list` \cr
#' Abstract method for splitting the value range of a feature column into distinct bins. The argument `cols` should
#' give the names of the feature columns of the `task` for which bins should be derived. Returns a named list of
#' numeric vectors containing the bin boundaries for each affected feature column, named by that corresponding feature
#' column.
#'
#' @references
#' `r format_bib("gorishniy_2022")`
#'
#' @family PipeOps
#' @family Piecewise Linear Encoding PipeOps
#' @template seealso_pipeopslist
#' @include PipeOpTaskPreproc.R
#' @export
PipeOpEncodePL = R6Class("PipeOpEncodePL",
inherit = PipeOpTaskPreprocSimple,
public = list(
initialize = function(id, param_set = ps(), param_vals = list(), packages = character(0), task_type = "Task") {
super$initialize(id = id, param_set = param_set, param_vals = param_vals, packages = packages,
task_type = task_type, tags = "encode", feature_types = c("numeric", "integer"))
}
),
private = list(
.get_bins = function(task, cols) {
stop("Abstract.")
},
.get_state = function(task) {
cols = private$.select_cols(task)
if (!length(cols)) {
return(list(bins = named_list())) # early exit
}
list(bins = private$.get_bins(task, cols))
},
.transform = function(task) {
bins = self$state$bins
cols = names(bins)
if (!length(cols)) {
return(task) # early exit
}
dt = task$data(cols = cols)
res = imap_dtc(dt, function(d, col) encode_piecewise_linear(d, bins[[col]]))
task$select(setdiff(task$feature_names, cols))$cbind(res)
}
)
)
# Helper function to implement piecewise linear encoding.
# * column: numeric vector
# * bins as numeric vector of boundaries
encode_piecewise_linear = function(column, bins) {
n_bins = length(bins) - 1
dt = data.table(matrix(0, length(column), n_bins))
setnames(dt, paste0("bin", seq_len(n_bins)))
for (t in seq_len(n_bins)) {
lower = bins[[t]]
upper = bins[[t + 1]]
colname = colnames(dt)[[t]]
dt[column >= upper, (colname) := 1]
indices = !is.na(column) & column < upper & column >= lower
dt[indices, (colname) := (column[indices] - lower) / (upper - lower)]
# Filling NAs back in
dt[is.na(column), (colname) := NA]
}
dt
}
#' @title Piecewise Linear Encoding using Quantiles
#'
#' @usage NULL
#' @name mlr_pipeops_encodeplquantiles
#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpEncodePL`]/[`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @description
#' Encodes `numeric` and `integer` feature columns using piecewise lienar encoding. For details, see documentation of
#' [`PipeOpEncodePL`] or Gorishniy et al. (2022).
#'
#' Bins are constructed by taking the quantiles of the respective feature column as bin boundaries. The first and
#' last boundaries are set to the minimum and maximum value of the feature, respectively. The number of bins can be
#' controlled with the `numsplits` hyperparameter.
#' Affected feature columns may contain `NA`s. These are ignored when calculating quantiles.
#'
#' @section Construction:
#' ```
#' PipeOpEncodePLQuantiles$new(id = "encodeplquantiles", param_vals = list())
#' ```
#' * `id` :: `character(1)`\cr
#' Identifier of resulting object, default `"encodeplquantiles"`.
#' * `param_vals` :: named `list`\cr
#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`.
#'
#' @section Input and Output Channels:
#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
#'
#' The output is the input [`Task`][mlr3::Task] with all affected `numeric` and `integer` columns encoded using piecewise
#' linear encoding with bins being derived from the quantiles of the respective original feature column.
#'
#' @section State:
#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpEncodePL`]/[`PipeOpTaskPreproc`].
#'
#' @section Parameters:
#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as:
#' * `numsplits` :: `integer(1)` \cr
#' Number of bins to create. Initialized to `2`.
#' * `type` :: `integer(1)`\cr
#' Method used to calculate sample quantiles. See help of [`stats::quantile`]. Default is `7`.
#'
#' @section Internals:
#' This overloads the `private$.get_bins()` method of [`PipeOpEncodePL`] and uses the [`stats::quantile`] function
#' to derive the bins used for piecewise linear encoding.
#'
#' @section Fields:
#' Only fields inherited from [`PipeOp`].
#'
#' @section Methods:
#' Only methods inherited from [`PipeOpEncodePL`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @references
#' `r format_bib("gorishniy_2022")`
#'
#' @family PipeOps
#' @family Piecewise Linear Encoding PipeOps
#' @template seealso_pipeopslist
#' @include PipeOpTaskPreproc.R
#' @export
#' @examples
#' library(mlr3)
#'
#' task = tsk("iris")$select(c("Petal.Width", "Petal.Length"))
#' pop = po("encodeplquantiles")
#'
#' train_out = pop$train(list(task))[[1L]]
#' # Calculated bin boundaries per feature
#' pop$state$bins
#' # Each feature was split into two encoded features using piecewise linear encoding
#' train_out$head()
#'
#' # Prediction works the same as training, using the bins learned during training
#' predict_out = pop$predict(list(task))[[1L]]
#' predict_out$head()
#'
#' # Binning into three bins per feature
#' # Using the nearest even order statistic for caluclating quantiles
#' pop$param_set$set_values(numsplits = 4, type = 3)
#'
#' train_out = pop$train(list(task))[[1L]]
#' # Calculated bin boundaries per feature
#' pop$state$bins
#' # Each feature was split into three encoded features using
#' # piecewise linear encoding
#' train_out$head()
PipeOpEncodePLQuantiles = R6Class("PipeOpEncodePLQuantiles",
inherit = PipeOpEncodePL,
public = list(
initialize = function(id = "encodeplquantiles", param_vals = list()) {
ps = ps(
numsplits = p_int(lower = 2, init = 2, tags = c("train", "predict", "required")),
type = p_int(lower = 1, upper = 9, default = 7, tags = c("train", "predict"))
)
super$initialize(id, param_set = ps, param_vals = param_vals, packages = "stats")
}
),
private = list(
.get_bins = function(task, cols) {
numsplits = self$param_set$values$numsplits
# Defaulting to default value in stats::quantile, i.e. method 7
type = self$param_set$values$type %??% 7
lapply(task$data(cols = cols), function(d) {
stats::quantile(d, seq(0, 1, length.out = numsplits + 1), na.rm = TRUE, names = FALSE, type = type)
})
}
)
)
mlr_pipeops$add("encodeplquantiles", PipeOpEncodePLQuantiles)
#' @title Piecewise Linear Encoding using Decision Trees
#'
#' @usage NULL
#' @name mlr_pipeops_encodepltree
#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpEncodePL`]/[`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @description
#' Encodes `numeric` and `integer` feature columns using piecewise lienar encoding. For details, see documentation of
#' [`PipeOpEncodePL`] or Gorishniy et al. (2022).
#'
#' Bins are constructed by trainig one decision tree [`Learner`][mlr3::Learner] per feature column, taking the target
#' column into account, and using decision boundaries as bin boundaries.
#'
#' @section Construction:
#' ```
#' PipeOpEncodePLTree$new(task_type, id = "encodepltree", param_vals = list())
#' ```
#' * `task_type` :: `character(1)`\cr
#' The class of [`Task`][mlr3::Task] that should be accepted as input, given as a `character(1)`. This is used to
#' construct the appropriate [`Learner`][mlr3::Learner] to be used for obtaining the bins for piecewise linear
#' encoding. Supported options are `"TaskClassif"`for [`LearnerClassifRpart`][mlr3::LearnerClassifRpart] or
#' `"TaskRegr"`for [`LearnerRegrRpart`][mlr3::LearnerRegrRpart].
#' * `id` :: `character(1)`\cr
#' Identifier of resulting object, default `"encodeplquantiles"`.
#' * `param_vals` :: named `list`\cr
#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`.
#'
#' @section Input and Output Channels:
#' Input and output channels are inherited from [`PipeOpTaskPreproc`]. Instead of a [`Task`][mlr3::Task], a
#' [`TaskClassif`][mlr3::TaskClassif] or [`TaskRegr`][mlr3::TaskRegr] is used as input and output during training and
#' prediction, depending on the `task_type` construction argument.
#'
#' The output is the input [`Task`][mlr3::Task] with all affected `numeric` and `integer` columns encoded using piecewise
#' linear encoding with bins being derived from a decision tree [`Learner`][mlr3::Learner] trained on the respective feature column.
#'
#' @section State:
#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpEncodePL`]/[`PipeOpTaskPreproc`].
#'
#' @section Parameters:
#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as the parameters of
#' the [`Learner`][mlr3::Learner] used for obtaining the bins for piecewise linear encoding.
#'
#' @section Internals:
#' This overloads the `private$.get_bins()` method of [`PipeOpEncodePL`]. To derive the bins for each feature, the
#' [`Task`][mlr3::Task] is split into smaller [`Tasks`][mlr3::Task] with only the target and respective feature as columns.
#' On these [`Tasks`][mlr3::Task] either a [`LearnerClassifRpart`][mlr3::LearnerClassifRpart] or
#' [`LearnerRegrRpart`][mlr3::LearnerRegrRpart] gets trained and the respective splits extracted as bin boundaries used
#' for piecewise linear encodings.
#'
#' @section Fields:
#' Only fields inherited from [`PipeOp`].
#'
#' @section Methods:
#' Only methods inherited from [`PipeOpEncodePL`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @references
#' `r format_bib("gorishniy_2022")`
#'
#' @family PipeOps
#' @family Piecewise Linear Encoding PipeOps
#' @template seealso_pipeopslist
#' @include PipeOpTaskPreproc.R
#' @export
#' @examplesIf requireNamespace("rpart")
#' library(mlr3)
#'
#' # For classification task
#' task = tsk("iris")$select(c("Petal.Width", "Petal.Length"))
#' pop = po("encodepltree", task_type = "TaskClassif")
#' train_out = pop$train(list(task))[[1L]]
#'
#' # Calculated bin boundaries per feature
#' pop$state$bins
#' # Each feature was split into three encoded features using piecewise linear encoding
#' train_out$head()
#'
#' # Prediction works the same as training, using the bins learned during training
#' predict_out = pop$predict(list(task))[[1L]]
#' predict_out$head()
#'
#' # Controlling behavior of the tree learner, here: setting minimum number of
#' # observations per node for a split to be attempted
#' pop$param_set$set_values(minsplit = 5)
#'
#' train_out = pop$train(list(task))[[1L]]
#' # feature "hp" now gets split into five encoded features instead of three
#' pop$state$bins
#' train_out$head()
#'
#' # For regression task
#' task = tsk("mtcars")$select(c("cyl", "hp"))
#' pop = po("encodepltree", task_type = "TaskRegr")
#' train_out = pop$train(list(task))[[1L]]
#'
#' # Calculated bin boundaries per feature
#' pop$state$bins
#' # First feature was split into three encoded features,
#' # second into two, using piecewise linear encoding
#' train_out$head()
PipeOpEncodePLTree = R6Class("PipeOpEncodePLTree",
inherit = PipeOpEncodePL,
public = list(
initialize = function(task_type, id = "encodepltree", param_vals = list()) {
assert_choice(task_type, mlr_reflections$task_types$task)
if (task_type == "TaskRegr") {
private$.tree_learner = LearnerRegrRpart$new()
} else if (task_type == "TaskClassif") {
private$.tree_learner = LearnerClassifRpart$new()
} else {
stopf("Task type %s not supported.", task_type)
}
super$initialize(id, param_set = alist(private$.tree_learner$param_set), param_vals = param_vals,
packages = private$.tree_learner$packages, task_type = task_type)
}
),
private = list(
.tree_learner = NULL,
.get_bins = function(task, cols) {
learner = private$.tree_learner
bins = list()
for (col in cols) {
t = task$clone(deep = TRUE)$select(col)
model = learner$train(t)$model
frame = model$frame
# Extracting splits according to https://github.com/cran/rpart/blob/983544575b80df286bf8ce238faf4afa145872cd/R/labels.rpart.R#L26
# NOTE: `frame$nsurrogate` and `frame$ncompete` seem to be always 0 in our case, since competing and surrogate
# splits only exist when there are more than one feature. Leaving it in for completeness.
# - surrogates could only be used, if multiple variables exist (cannot use a non-existing variable as surrogate
# if the first considered variables has NAs)
# - competes could not occur with one feature, since no alternative splits are possible (this means alternative
# splits with other variables)
if (nrow(frame) > 1L) { # do splits exist?
is_leaf = frame$var == "<leaf>"
frame = frame[!is_leaf, , drop = FALSE]
index = cumsum(c(1, frame$nsurrogate + frame$ncompete + 1))
# remove last entry introduced by prepending 1 in cumsum
index = index[-length(index)]
splits = model$splits[index, "index"]
boundaries = unname(sort(splits))
} else {
# No boundaries if no splits exist (only root in frame)
boundaries = numeric(0)
}
d = task$data(cols = col)
bins[[col]] = c(min(d, na.rm = TRUE), boundaries, max(d, na.rm = TRUE))
}
bins
},
.additional_phash_input = function() private$.tree_learner$phash
)
)
# Registering with "TaskClassif", however both "TaskRegr" and "TaskClassif" are acceptable, see #869.
mlr_pipeops$add("encodepltree", PipeOpEncodePLTree, list(task_type = "TaskClassif"))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.