#' @title Wrapper-based Ensemble Feature Selection
#'
#' @include CallbackBatchFSelect.R
#'
#' @description
#' Ensemble feature selection using multiple learners.
#' The ensemble feature selection method is designed to identify the most predictive features from a given dataset by leveraging multiple machine learning models and resampling techniques.
#' Returns an [EnsembleFSResult].
#'
#' @details
#' The method begins by applying an initial resampling technique specified by the user, to create **multiple subsamples** from the original dataset (train/test splits).
#' This resampling process helps in generating diverse subsets of data for robust feature selection.
#'
#' For each subsample (train set) generated in the previous step, the method performs **wrapper-based feature selection** ([auto_fselector]) using each provided learner, the given inner resampling method, inner performance measure and optimization algorithm.
#' This process generates 1) the best feature subset and 2) a final trained model using these best features, for each combination of subsample and learner.
#' The final models are then scored on their ability to predict on the resampled test sets.
#'
#' Results are stored in an [EnsembleFSResult].
#'
#' The result object also includes the performance scores calculated during the inner resampling of the training sets, using models with the best feature subsets.
#' These scores are stored in a column named `{measure_id}_inner`.
#'
#' @note
#' The **active measure** of performance is the one applied to the test sets.
#' This is preferred, as inner resampling scores on the training sets are likely to be overestimated when using the final models.
#' Users can change the active measure by using the `set_active_measure()` method of the [EnsembleFSResult].
#'
#' @param learners (list of [mlr3::Learner])\cr
#' The learners to be used for feature selection.
#' @param init_resampling ([mlr3::Resampling])\cr
#' The initial resampling strategy of the data, from which each train set will be passed on to the [auto_fselector] to optimize the learners and perform feature selection.
#' Each test set will be used for prediction on the final models returned by [auto_fselector].
#' Can only be [mlr3::ResamplingSubsampling] or [mlr3::ResamplingBootstrap].
#' @param inner_resampling ([mlr3::Resampling])\cr
#' The inner resampling strategy used by the [FSelector].
#' @param inner_measure ([mlr3::Measure])\cr
#' The inner optimization measure used by the [FSelector].
#' @param measure ([mlr3::Measure])\cr
#' Measure used to score each trained learner on the test sets generated by `init_resampling`.
#' @param store_benchmark_result (`logical(1)`)\cr
#' Whether to store the benchmark result in [EnsembleFSResult] or not.
#' @param store_models (`logical(1)`)\cr
#' Whether to store models in [auto_fselector] or not.
#' @param callbacks (Named list of lists of [CallbackBatchFSelect])\cr
#' Callbacks to be used for each learner.
#' The lists must be named by the learner ids.
#'
#' @template param_fselector
#' @template param_task
#' @template param_terminator
#'
#' @returns an [EnsembleFSResult] object.
#'
#' @source
#' `r format_bib("saeys2008", "abeel2010", "pes2020")`
#' @export
#' @examples
#' \donttest{
#' efsr = ensemble_fselect(
#' fselector = fs("random_search"),
#' task = tsk("sonar"),
#' learners = lrns(c("classif.rpart", "classif.featureless")),
#' init_resampling = rsmp("subsampling", repeats = 2),
#' inner_resampling = rsmp("cv", folds = 3),
#' inner_measure = msr("classif.ce"),
#' measure = msr("classif.acc"),
#' terminator = trm("evals", n_evals = 10)
#' )
#' efsr
#' }
ensemble_fselect = function(
  fselector,
  task,
  learners,
  init_resampling,
  inner_resampling,
  inner_measure,
  measure,
  terminator,
  callbacks = NULL,
  store_benchmark_result = TRUE,
  store_models = FALSE
  ) {
  # ---- input validation ----
  assert_task(task)
  # Keep the coerced list: everything below iterates over `learners`, so the
  # result of `as_learners()` must be what gets iterated, not the raw argument.
  learners = as_learners(learners)
  assert_learners(learners, task = task)
  assert_resampling(init_resampling)
  # exact class match on the first class entry: only these two resamplings are accepted
  assert_choice(class(init_resampling)[1], choices = c("ResamplingBootstrap", "ResamplingSubsampling"))
  assert_resampling(inner_resampling)
  assert_measure(inner_measure, task = task)
  assert_measure(measure, task = task)
  # each element of `callbacks` is converted and checked separately;
  # with the default `NULL` this yields an empty list
  callbacks = map(callbacks, function(cbs) assert_callbacks(as_callbacks(cbs)))
  # callback names must refer to ids of the supplied learners
  if (length(callbacks)) assert_names(names(callbacks), subset.of = map_chr(learners, "id"))
  assert_flag(store_benchmark_result)
  assert_flag(store_models)

  # create auto_fselector for each learner
  afss = map(learners, function(learner) {
    auto_fselector(
      fselector = fselector,
      learner = learner,
      resampling = inner_resampling,
      measure = inner_measure,
      terminator = terminator,
      store_models = store_models,
      # `NULL` when no callbacks were supplied for this learner id
      callbacks = callbacks[[learner$id]]
    )
  })

  # one benchmark run per combination of auto_fselector and init_resampling iteration
  design = benchmark_grid(
    tasks = task,
    learners = afss,
    resamplings = init_resampling
  )

  # models are always stored here because the fitted auto_fselectors are read below
  bmr = benchmark(design, store_models = TRUE)

  # score once on the test sets; the `learner` column of the score table holds
  # the auto_fselectors used for the extraction steps below
  scores = bmr$score(measure)
  afss = scores$learner

  # extract features
  features = map(afss, function(afs) {
    afs$fselect_result$features[[1]]
  })

  # extract n_features
  n_features = map_int(afss, function(afs) {
    afs$fselect_result$n_features[[1]]
  })

  # extract inner scores
  inner_scores = map_dbl(afss, function(afs) {
    afs$fselect_instance$archive$best()[, inner_measure$id, with = FALSE][[1]]
  })

  # remove `bmr_score` class
  class(scores) = c("data.table", "data.frame")

  # add the extracted columns by reference
  set(scores, j = "features", value = features)
  set(scores, j = "n_features", value = n_features)
  set(scores, j = sprintf("%s_inner", inner_measure$id), value = inner_scores)
  setnames(scores, "iteration", "resampling_iteration")

  # remove R6 objects
  set(scores, j = "learner", value = NULL)
  set(scores, j = "task", value = NULL)
  set(scores, j = "resampling", value = NULL)
  set(scores, j = "prediction_test", value = NULL)
  # remove further bookkeeping columns
  set(scores, j = "task_id", value = NULL)
  set(scores, j = "nr", value = NULL)
  set(scores, j = "resampling_id", value = NULL)
  set(scores, j = "uhash", value = NULL)

  # extract importance scores if RFE optimization was used
  # (exact class match on the first class entry of `fselector`)
  if (class(fselector)[1] == "FSelectorBatchRFE") {
    imp_scores = map(afss, function(afs) {
      afs$fselect_result$importance[[1]]
    })
    set(scores, j = "importance", value = imp_scores)
  }

  EnsembleFSResult$new(
    result = scores,
    features = task$feature_names,
    # `if` without `else` evaluates to `NULL` when the flag is `FALSE`
    benchmark_result = if (store_benchmark_result) bmr,
    measure = measure,
    inner_measure = inner_measure
  )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.