#' @title Calculates feature importance values for trained models.
#'
#' @description
#' For some learners it is possible to calculate a feature importance measure.
#' \code{getFeatureImportance} extracts those values from trained models.
#' See below for a list of supported learners.
#'
#'
#' \itemize{
#' \item{boosting} \cr
#' {Measure which accounts the gain of Gini index given by a feature
#' in a tree and the weight of that tree.}
#' \item{cforest} \cr
#' {Permutation principle of the 'mean decrease in accuracy' principle
#' in randomForest. If \code{auc=TRUE} (only for binary classification),
#' area under the curve is used as measure. The algorithm used for the survival
#' learner is 'extremely slow and experimental; use at your own risk'.
#' See \link[party:varimp]{varimp} for details and further parameters.}
#' \item{gbm} \cr
#' {Estimation of relative influence for each feature. See
#' \link[gbm:relative.influence]{relative.influence}
#' for details and further parameters.}
#' \item{randomForest} \cr
#' {For \code{type = 2} (the default) the 'MeanDecreaseGini' is measured,
#' which is based on the Gini impurity index used for the calculation of the nodes.
#' Alternatively, you can set \code{type} to 1, then the measure is the mean
#' decrease in accuracy calculated on OOB data. Note, that in this case
#' the learner's parameter \code{importance} needs to be set to be able to compute
#' feature importance values.
#' See \link[randomForest:importance]{importance} for details.}
#' \item{RRF} \cr
#' {This is identical to randomForest.}
#' \item{randomForestSRC} \cr
#' {This method can calculate feature importance for
#' various measures. By default the Breiman-Cutler permutation method is used.
#' See \link[randomForestSRC:vimp]{vimp} for details.}
#' \item{ranger} \cr
#' {Supports both measures mentioned above for the randomForest
#' learner. Note that you need to explicitly set the learner's parameter
#' \code{importance} to be able to compute feature importance measures.
#' See \link[ranger:importance]{importance} and
#' \link[ranger:ranger]{ranger} for details.}
#' \item{rpart} \cr
#' {Sum of decrease in impurity for each of the surrogate variables at each node.}
#' \item{xgboost} \cr
#' {The value implies the relative contribution of the corresponding feature to the model
#' calculated by taking each feature's contribution for each tree in the model. The exact
#' computation of the importance in xgboost is undocumented.}
#' }
#'
#' @param object [\code{\link{WrappedModel}}]\cr
#' Wrapped model, result of \code{\link{train}}.
#' @param ... [any]\cr
#' Additional parameters, which are passed to the underlying importance value
#' generating function.
#' @return [\code{FeatureImportance}] An object containing a \code{data.frame} of the variable importances and further information.
#' @export
getFeatureImportance = function(object, ...) {
  assertClass(object, classes = "WrappedModel")
  lrn = checkLearner(object$learner, props = "featimp")
  imp = getFeatureImportanceLearner(lrn, object, ...)

  # checkmate's check_*() functions return TRUE on success or an error-message
  # *string* on failure, so the result must go through isTRUE() before logical
  # negation (`!"msg"` is a type error in R). Either failed check -- not a
  # uniquely-named numeric vector, or names outside the task features --
  # invalidates the result, hence `||` rather than `&&`.
  if (!isTRUE(check_numeric(imp, names = "unique")) ||
      !isTRUE(check_subset(names(imp), object$features))) {
    stop("getFeatureImportanceLearner did not return a named vector with names of the task features.")
  }

  # Features the learner did not report get an importance of zero, then the
  # vector is reordered to match the task's feature order.
  imp[setdiff(object$features, names(imp))] = 0
  imp = imp[object$features]
  # As documented, NA importances are treated as zero.
  imp[is.na(imp)] = 0
  # Convert the named vector to a one-row data.frame (one column per feature).
  imp = as.data.frame(t(imp))
  rownames(imp) = NULL

  makeS3Obj("FeatureImportance",
    res = imp,
    task.desc = getTaskDesc(object),
    learner = lrn,
    measure = NA,
    contrast = NA,
    aggregation = identity,
    nmc = NA,
    replace = NA,
    local = FALSE)
}
#' @title Calculates feature importance values for a given learner.
#'
#' @description
#'
#' This function is mostly for internal usage. To calculate feature importance use \code{\link{getFeatureImportance}}.
#'
#' The return value is a named numeric vector. There does not need to be one value for each feature in the dataset.
#' In \code{\link{getFeatureImportance}} missing features will get an importance of zero and if the vector contains \code{NA}
#' they will also be replaced with zero.
#'
#' @param .learner [\code{\link{Learner}} | \code{character(1)}]\cr
#' The learner.
#' @param .model [\code{\link{WrappedModel}}]\cr
#' The model.
#' @param ... [any]\cr
#' Additional parameters, which are passed to the underlying importance value
#' generating function.
#' @return [\code{numeric}] A named vector of variable importance.
#' @export
#' @keywords internal
getFeatureImportanceLearner = function(.learner, .model, ...) {
  # S3 generic: dispatches on the class of .learner. The actual extraction
  # logic lives in the class-specific methods (e.g. the BaseWrapper method
  # below and per-learner implementations elsewhere in the package).
  UseMethod("getFeatureImportanceLearner")
}
#' @export
getFeatureImportanceLearner.BaseWrapper = function(.learner, .model, ...) {
  # A wrapper has no importance measure of its own: unwrap one layer and
  # re-dispatch on the wrapped learner.
  inner.lrn = .learner$next.learner
  getFeatureImportanceLearner(inner.lrn, .model = .model, ...)
}