mlr3fselect: Feature Selection for 'mlr3'

Documented in ensemble_fselect

#' @title Wrapper-based Ensemble Feature Selection
#'
#' @include CallbackBatchFSelect.R
#'
#' @description
#' Ensemble feature selection using multiple learners.
#' The ensemble feature selection method is designed to identify the most predictive features from a given dataset by leveraging multiple machine learning models and resampling techniques.
#' Returns an [EnsembleFSResult].
#'
#' @details
#' The method begins by applying an initial resampling technique, specified by the user, to create **multiple subsamples** from the original dataset (train/test splits).
#' This resampling process helps in generating diverse subsets of data for robust feature selection.
#'
#' For each subsample (train set) generated in the previous step, the method performs **wrapper-based feature selection** ([auto_fselector]) using each provided learner, the given inner resampling method, inner performance measure and optimization algorithm.
#' This process generates 1) the best feature subset and 2) a final trained model using these best features, for each combination of subsample and learner.
#' The final models are then scored on their ability to predict on the resampled test sets.
#'
#' Results are stored in an [EnsembleFSResult].
#'
#' The result object also includes the performance scores calculated during the inner resampling of the training sets, using models with the best feature subsets.
#' These scores are stored in a column named `{measure_id}_inner`.
#'
#' @note
#' The **active measure** of performance is the one applied to the test sets.
#' This is preferred, as inner resampling scores on the training sets are likely to be overestimated when using the final models.
#' Users can change the active measure by using the `set_active_measure()` method of the [EnsembleFSResult].
#'
#' @param learners (list of [mlr3::Learner])\cr
#'  The learners to be used for feature selection.
#' @param init_resampling ([mlr3::Resampling])\cr
#'  The initial resampling strategy of the data, from which each train set will be passed on to the [auto_fselector] to optimize the learners and perform feature selection.
#'  Each test set will be used for prediction on the final models returned by [auto_fselector].
#'  Can only be [mlr3::ResamplingSubsampling] or [mlr3::ResamplingBootstrap].
#' @param inner_resampling ([mlr3::Resampling])\cr
#'  The inner resampling strategy used by the [FSelector].
#' @param inner_measure ([mlr3::Measure])\cr
#'  The inner optimization measure used by the [FSelector].
#' @param measure ([mlr3::Measure])\cr
#'  Measure used to score each trained learner on the test sets generated by `init_resampling`.
#' @param store_benchmark_result (`logical(1)`)\cr
#'  Whether to store the benchmark result in [EnsembleFSResult] or not.
#' @param store_models (`logical(1)`)\cr
#'  Whether to store models in [auto_fselector] or not.
#' @param callbacks (Named list of lists of [CallbackBatchFSelect])\cr
#'  Callbacks to be used for each learner.
#'  The lists must be named by the learner ids.
#'
#' @template param_fselector
#' @template param_task
#' @template param_terminator
#'
#' @returns an [EnsembleFSResult] object.
#'
#' @source
#' `r format_bib("saeys2008", "abeel2010", "pes2020")`
#' @export
#' @examples
#' \donttest{
#'   efsr = ensemble_fselect(
#'     fselector = fs("random_search"),
#'     task = tsk("sonar"),
#'     learners = lrns(c("classif.rpart", "classif.featureless")),
#'     init_resampling = rsmp("subsampling", repeats = 2),
#'     inner_resampling = rsmp("cv", folds = 3),
#'     inner_measure = msr("classif.ce"),
#'     measure = msr("classif.acc"),
#'     terminator = trm("evals", n_evals = 10)
#'   )
#'   efsr
#' }
ensemble_fselect = function(
  fselector,
  task,
  learners,
  init_resampling,
  inner_resampling,
  inner_measure,
  measure,
  terminator,
  callbacks = NULL,
  store_benchmark_result = TRUE,
  store_models = FALSE
  ) {
  assert_task(task)
  # Keep the converted learners: the id lookup and the map() below iterate over
  # `learners`, so they must operate on the list produced by as_learners()
  # rather than on the raw input (which may be a single learner).
  learners = as_learners(learners)
  assert_learners(learners, task = task)
  assert_resampling(init_resampling)
  # only these two resampling classes are accepted for the outer split
  assert_choice(class(init_resampling)[1], choices = c("ResamplingBootstrap", "ResamplingSubsampling"))
  assert_resampling(inner_resampling)
  assert_measure(inner_measure, task = task)
  assert_measure(measure, task = task)
  # `callbacks` is a list keyed by learner id; each entry is converted and checked on its own
  callbacks = map(callbacks, function(learner_callbacks) assert_callbacks(as_callbacks(learner_callbacks)))
  if (length(callbacks) > 0L) assert_names(names(callbacks), subset.of = map_chr(learners, "id"))
  assert_flag(store_benchmark_result)
  assert_flag(store_models)

  # create auto_fselector for each learner
  afss = map(learners, function(learner) {
    auto_fselector(
      fselector = fselector,
      learner = learner,
      resampling = inner_resampling,
      measure = inner_measure,
      terminator = terminator,
      store_models = store_models,
      # NULL when no callbacks were supplied for this learner id
      callbacks = callbacks[[learner$id]]
    )
  })

  design = benchmark_grid(
    tasks = task,
    learners = afss,
    resamplings = init_resampling
  )

  # models are always stored here, independent of `store_models`, because the
  # feature selection results are read from the trained auto_fselectors below
  bmr = benchmark(design, store_models = TRUE)

  # score on the test sets once; the same table also provides the trained
  # auto_fselectors, one per row (learner x resampling iteration)
  scores = bmr$score(measure)
  afss = scores$learner
  # remove `bmr_score` class
  class(scores) = c("data.table", "data.frame")

  # extract features
  features = map(afss, function(afs) {
    afs$fselect_result$features[[1]]
  })

  # extract n_features
  n_features = map_int(afss, function(afs) {
    afs$fselect_result$n_features[[1]]
  })

  # extract inner scores
  inner_scores = map_dbl(afss, function(afs) {
    afs$fselect_instance$archive$best()[, inner_measure$id, with = FALSE][[1]]
  })

  set(scores, j = "features", value = features)
  set(scores, j = "n_features", value = n_features)
  set(scores, j = sprintf("%s_inner", inner_measure$id), value = inner_scores)
  setnames(scores, "iteration", "resampling_iteration")

  # remove R6 objects and identifier columns that are not part of the result
  drop_cols = c(
    "learner", "task", "resampling", "prediction_test",
    "task_id", "nr", "resampling_id", "uhash"
  )
  set(scores, j = drop_cols, value = NULL)

  # extract importance scores if RFE optimization was used
  # inherits() also matches subclasses of FSelectorBatchRFE
  if (inherits(fselector, "FSelectorBatchRFE")) {
    imp_scores = map(afss, function(afs) {
      afs$fselect_result$importance[[1]]
    })
    set(scores, j = "importance", value = imp_scores)
  }

  EnsembleFSResult$new(
    result = scores,
    features = task$feature_names,
    # `if` without `else` yields NULL, i.e. no benchmark result is stored
    benchmark_result = if (store_benchmark_result) bmr,
    measure = measure,
    inner_measure = inner_measure
  )
}