DALEXtra: Extension for 'DALEX' Package

Documented in funnel_measure print.funnel_measure

#' Caluculate difference in performance in models across different categories
#'
#' Function \code{funnel_measure} allows users to compare two models based on their explainers. It partitions dataset on which models were built
#' and creates categories according to quantiles of columns in \code{parition data}. \code{nbins} parameter determines number of quantiles.
#' For each category difference in provided measure is being calculated. Positive value of that difference means that Champion model
#' has better performance in specified category, while negative value means that one of the Challengers was better. Function allows
#' to compare multiple Challengers at once.
#'
#' @param champion - explainer of champion model.
#' @param challengers - explainer of challenger model or list of explainers.
#' @param measure_function - measure function that calculates performance of model based on true observation and prediction.
#'                           Order of parameters is important and should be (y, y_hat). The measure calculated by the function
#'                           should have the property that lower score value indicates better model. If NULL, RMSE will be used for regression,
#'                           one minus auc for classification and crossentropy for multiclass classification.
#' @param nbins - Number of quantiles (partition points) for numeric columns. In case when more than one quantile have the same value, there will be less partition points.
#' @param partition_data - Data by which test dataset will be partitioned for computation. Can be either data.frame or character vector.
#'                         When second is passed, it has to indicate names of columns that will be extracted from test data.
#'                         By default full test data. If data.frame, number of rows has to be equal to number of rows in test data.
#' @param cutoff - Threshold for categorical data. Entries less frequent than specified value will be merged into one category.
#' @param cutoff_name - Name for new category that arised after merging entries less frequent than \code{cutoff}
#' @param factor_conversion_threshold - Numeric columns with lower number of unique values than value of this parameter will be treated as factors
#' @param categories - a named list of variable names that will be plotted in a different colour. By default it is partitioned on Explanatory, External and Target.
#' @param show_info - Logical value indicating if progress bar should be shown.
#'
#' @return An object of the class \code{funnel_measure}
#'
#' It is a named list containing following fields:
#' \itemize{
#' \item \code{data} data.frame that consists of columns:
#'    \itemize{
#'    \item \code{Variable} Variable according to which partitions were made
#'    \item \code{Measure} Difference in measures. Positive value indicates that champion was better, while negative that challenger.
#'    \item \code{Label} String that defines subset of \code{Variable} values (partition rule).
#'    \item \code{Challenger} Label of challenger explainer that was used in \code{Measure}
#'    \item \code{Category} a category of the variable passed to function
#'    }
#' \item \code{models_info} data.frame containing information about models used in analysis
#' }
#'
#' @rdname funnel_measure
#' @export
#'
#' @importFrom utils setTxtProgressBar txtProgressBar
#' @importFrom stats quantile
#'
#' @examples
#' \donttest{
#' library("mlr")
#' library("DALEXtra")
#' task <- mlr::makeRegrTask(
#'   id = "R",
#'   data = apartments,
#'   target = "m2.price"
#' )
#' learner_lm <- mlr::makeLearner(
#'   "regr.lm"
#' )
#' model_lm <- mlr::train(learner_lm, task)
#' explainer_lm <- explain_mlr(model_lm, apartmentsTest, apartmentsTest$m2.price, label = "LM")
#'
#' learner_rf <- mlr::makeLearner(
#'   "regr.ranger"
#' )
#' model_rf <- mlr::train(learner_rf, task)
#' explainer_rf <- explain_mlr(model_rf, apartmentsTest, apartmentsTest$m2.price, label = "RF")
#'
#' learner_gbm <- mlr::makeLearner(
#'   "regr.gbm"
#' )
#' model_gbm <- mlr::train(learner_gbm, task)
#' explainer_gbm <- explain_mlr(model_gbm, apartmentsTest, apartmentsTest$m2.price, label = "GBM")
#'
#'
#' plot_data <- funnel_measure(explainer_lm, list(explainer_rf, explainer_gbm),
#'                             nbins = 5, measure_function = DALEX::loss_root_mean_square)
#' plot(plot_data)
#' }


funnel_measure <-
  function(champion,
           challengers,
           measure_function = NULL,
           nbins = 5,
           partition_data = champion$data,
           cutoff = 0.01,
           cutoff_name = "Other",
           factor_conversion_threshold = 7,
           show_info = TRUE,
           categories = NULL) {
    data <- champion$data

    if (inherits(challengers, "explainer")) {
      challengers <- list(challengers)
    }

    switch (class(partition_data),
      "character" = {
        partition_data <- data[partition_data]
      },
      "data.frame" = {},
      stop("Wrong format of partition_data. Has to be either character vector indicating columns from Champion's data or a data.frame")
    )

    if (any(sapply(challengers, function(x) {
      !inherits(x, "explainer") 
    })) | !inherits(champion, "explainer")) {
      stop("Champion and all of challengers has to be explainer objects")
    }

    if (is.null(measure_function)) {
      measure_function <- set_measure_function(champion, challengers)
    }


    models_info <- data.frame(label = champion$label, class = class(champion$model)[1], type = "Champion", stringsAsFactors = FALSE)
    for (e in challengers) {
      models_info <- rbind(models_info,
                           list(label = e$label, class = class(e$model)[1], type = "Challenger"),
                           stringsAsFactors = FALSE)
    }
    #It iterates progress bar and col_names
    col_index <- 1
    y <- champion$y
    ret <- data.frame()
    col_names <- colnames(partition_data)
    if (show_info) {
      pb = txtProgressBar(
        min = 0,
        max = ncol(partition_data),
        initial = 0,
        style = 3
      )
    }
    for (col in partition_data) {
      if (is.character(col)) {
        col <- as.factor(col)
      }
      if (length(unique(col)) < factor_conversion_threshold) {
        col <- as.factor(col)
      }
      if (is.numeric(col)) {
        #Separate case for first entry as it has to be equal on the both sides of compratment due to semi constant variables
        quantiles <-
          round(quantile(col, probs = seq(0, 1, length.out = nbins+1)), 2)
        scoring_data <-
          data[(quantiles[1] <= col & col <= quantiles[2]),]
        scoring_y <-
          y[(quantiles[1] <= col & col <= quantiles[2])]
        champion_pred <- predict(champion, scoring_data)
        challengers_pred <- lapply(challengers, function(x) {
          predict(x, scoring_data)
        })
        diff <- lapply(challengers_pred, function(x) {
          measure_function(scoring_y, champion_pred) - measure_function(scoring_y, x)
        })
        for (d in 1:length(diff)) {
          ret <-
            rbind(
              ret,
              list(
                "Variable" = col_names[col_index],
                # Negative diff in order to have dots where champion is better on the right side
                "Measure" = -diff[[d]],
                "Label" = ifelse(
                  quantiles[1] == quantiles[2],
                  quantiles[1],
                  paste("[", quantiles[1], ", ", quantiles[2], "]", sep = "")
                ),
                "Challenger" = challengers[[d]]$label
              ),
              stringsAsFactors = FALSE
            )
        }
        #From 2 becasue first qunatile was in upper case. To (length(quantiles) - 1) becasue we have i+1 in the loop.
        for (i in 2:(length(quantiles) - 1)) {
          scoring_data <-
            data[(quantiles[i] < col & col <= quantiles[i + 1]),]
          scoring_y <-
            y[(quantiles[i] < col & col <= quantiles[i + 1])]
          #In case of empty compartment
          if (length(scoring_y) == 0)
            next()
          champion_pred <- predict(champion, scoring_data)
          challengers_pred <- lapply(challengers, function(x) {
            predict(x, scoring_data)
          })
          diff <- lapply(challengers_pred, function(x) {
            measure_function(scoring_y, champion_pred) - measure_function(scoring_y, x)
          })
          for (d in 1:length(diff)) {
            ret <-
              rbind(
                ret,
                list(
                  "Variable" = col_names[col_index],
                  # Negative diff in order to have dots where champion is better on the right side
                  "Measure" = -diff[[d]],
                  "Label" = ifelse(
                    quantiles[i] == quantiles[i + 1],
                    quantiles[i] ,
                    paste("(", quantiles[i], ", ", quantiles[i+1], "]", sep = "")
                  ),
                  "Challenger" = challengers[[d]]$label
                ),
                stringsAsFactors = FALSE
              )
          }
        }
      } else if (is.factor(col)) {
        col <- as.character(col)

        if (length(unique(col)) > 4) {
          freq <- table(col) / length(col)
          names_to_cut <- names(freq[freq < cutoff])
          for (name in names_to_cut) {
            col[col == name] <- cutoff_name
          }
        }
        for (level in sort(unique(col))) {
          scoring_data <-
            data[col == level,]
          scoring_y <-
            y[col == level]
          champion_pred <- predict(champion, scoring_data)
          challengers_pred <- lapply(challengers, function(x) {
            predict(x, scoring_data)
          })
          diff <- lapply(challengers_pred, function(x) {
            measure_function(scoring_y, champion_pred) - measure_function(scoring_y, x)
          })
          for (d in 1:length(diff)) {
            ret <-
              rbind(
                ret,
                list(
                  "Variable" = col_names[col_index],
                  # Negative diff in order to have dots where champion is better on the right side
                  "Measure" = -diff[[d]],
                  "Label" = level,
                  "Challenger" = challengers[[d]]$label
                ),

                stringsAsFactors = FALSE
              )
          }
        }
      } else {
        stop(paste("Not recognizable column type"), col_names[col_index])
      }

      if (show_info) setTxtProgressBar(pb, col_index)

      col_index <- col_index + 1
    }
    if (is.null(categories)) {
      data_names <- names(partition_data)
      features <- data_names[data_names %in% names(champion$data)]
      target_check <-  apply(partition_data, 2, function(x) {
        all(as.character(x) == as.character(champion$y))
      })
      target <- NULL
      if (any(target_check)) {
        target <- data_names[target_check]
      }
      other_variables <- setdiff(data_names, c(features, target))
      categories <- list(Explanatory = features, Target = target, External = other_variables)
    }
    ret$Category <- ""
    for (i in 1:length(categories)) {
      if (length(categories[[i]]) > 0){
        for (variable in categories[[i]]) {
          ret[ret$Variable == variable,]$Category <- names(categories)[i]
        }
      }
    }
    # Check if they are Variables without category
    if (length(ret[ret$Category == "",]$Category) > 0){
      ret[ret$Category == "",]$Category <- "Other Variables"
    }
    ret <- list(data = ret, models_info = models_info)
    names(ret$data$Label) <- NULL
    class(ret) <- "funnel_measure"
    ret
  }

#' Print funnel_measure object
#'
#' @param x an object of class \code{funnel_measure}
#' @param ... other parameters
#'
#' @export
#' @examples
#' \donttest{
#' library("DALEXtra")
#' library("mlr")
#' task <- mlr::makeRegrTask(
#'   id = "R",
#'   data = apartments,
#'   target = "m2.price"
#' )
#' learner_lm <- mlr::makeLearner(
#'   "regr.lm"
#' )
#' model_lm <- mlr::train(learner_lm, task)
#' explainer_lm <- explain_mlr(model_lm, apartmentsTest, apartmentsTest$m2.price, label = "LM")
#'
#' learner_rf <- mlr::makeLearner(
#'   "regr.ranger"
#' )
#' model_rf <- mlr::train(learner_rf, task)
#' explainer_rf <- explain_mlr(model_rf, apartmentsTest, apartmentsTest$m2.price, label = "RF")
#'
#' learner_gbm <- mlr::makeLearner(
#'   "regr.gbm"
#' )
#' model_gbm <- mlr::train(learner_gbm, task)
#' explainer_gbm <- explain_mlr(model_gbm, apartmentsTest, apartmentsTest$m2.price, label = "GBM")
#'
#' plot_data <- funnel_measure(explainer_lm, list(explainer_rf, explainer_gbm),
#'                             nbins = 5, measure_function = DALEX::loss_root_mean_square)
#' print(plot_data)
#' }

print.funnel_measure <- function(x, ...) {
  cat("Funnel measure head:\n")
  print(head(x$data))
  cat("Models Info\n")
  print(head(x$models_info))
}