R/plot_ind_contr.R

Defines functions plotIndividualContribution

Documented in plotIndividualContribution

#' @title Decompose the predicted value based on the given features
#'
#' @description
#' This function visualizes the contribution of each feature regarding the predicted value.
#' By default, multiple base learners defined on one feature are aggregated. If you
#' want to show the contribution of single base learner, then set `aggregate = FALSE`.
#'
#' @return `ggplot` object containing the graphic.
#' @param cboost ([Compboost])\cr
#'   A trained [Compboost] object.
#' @param newdata (`data.frame()`)\cr
#'   Data frame containing exactly one row holding the new observations.
#' @param aggregate (`logical(1L)`)\cr
#'   Number of colored base learners added to the legend.
#' @param colbreaks (`numeric()`)\cr
#'   Breaks to visualize/highlight different predicted values. Default creates different
#'   colors for positive and negative score values. If set to `NULL` no coloring
#'   is applied.
#' @param collabels (`character(length(colbreaks) - 1)`)\cr
#'   Labels for the color breaks. If set to `NULL` intervals are used as labels.
#' @param nround (`integer(1L)`)\cr
#'   Digit passed to [round] for labels (default is `nround = 2L`).
#' @param offset (`logical(1L)`)\cr
#'   Flag to indicate whether the offset should be added to the figure or not.
#' @examples
#' dat = mtcars
#' fnum = c("cyl", "disp", "hp", "drat", "wt", "qsec")
#' fcat = c("vs", "am", "gear", "carb")
#' for (fn in fcat) dat[[fn]] = as.factor(dat[[fn]])
#'
#' cboost = Compboost$new(data = dat, target = "mpg",
#'   loss = LossQuadratic$new())
#'
#' for (fn in fnum) cboost$addComponents(fn, df = 3)
#' for (fn in fcat) cboost$addBaselearner(fn, "ridge", BaselearnerCategoricalRidge)
#' cboost$train(500L)
#' cbreaks = c(-Inf, -0.1, 0.1, Inf)
#' clabs   = c("bad", "middle", "good")
#' plotIndividualContribution(cboost, dat[10, ], colbreaks = cbreaks,
#'   collabels = clabs)
#' plotIndividualContribution(cboost, dat[10, ], offset = FALSE,
#'   colbreaks = cbreaks, collabels = clabs)
#' @export
plotIndividualContribution = function(cboost, newdata, aggregate = TRUE, colbreaks = c(-Inf, 0, Inf),
  collabels = c("negative", "positive"), nround = 2L, offset = TRUE) {

  if (! requireNamespace("ggplot2", quietly = TRUE)) stop("Please install ggplot2 to create plots.")
  checkmate::assertClass(cboost, "Compboost")
  checkmate::assertLogical(aggregate, len = 1L)

  if (! is.null(colbreaks))
    checkmate::assertNumeric(colbreaks, finite = FALSE, min.len = 2L)

  if (! is.null(collabels)) {
    if (is.null(colbreaks))
      checkmate::assertCharacter(collabels)
    else
      checkmate::assertCharacter(collabels, len = length(colbreaks) - 1)
  }

  checkmate::assertIntegerish(nround, len = 1L)
  checkmate::assertLogical(offset, len = 1L)

  if (is.null(cboost$model))
    stop("Model has not been trained!")

  if (! cboost$model$isTrained())
    stop("Model has not been trained!")

  feats = vapply(cboost$model$getFactoryMap(), FUN.VALUE = character(1), FUN = function(blf) {
    paste(unique(blf$getFeatureName()), collapse = "_")
  })
  blnames = names(feats)
  if (offset)
    df_bls  = data.frame(bl = c(blnames, "offset"), feat = c(feats, "offset"))
  else
    df_bls  = data.frame(bl = blnames, feat = feats)

  nuisance = lapply(unique(cboost$bl_factory_list$getDataNames()), function(fn) {
    checkmate::assertChoice(fn, choices = colnames(newdata))
  })
  checkmate::assertDataFrame(newdata, nrows = 1L)
  ll_ds    = cboost$prepareData(newdata)
  ll_preds = c(cboost$model$predictIndividual(ll_ds), offset = cboost$model$getOffset())
  df_preds = data.frame(bl = names(ll_preds), value = unname(unlist(ll_preds)))

  df_plt = merge(df_bls, df_preds, by = "bl", all.x = TRUE)
  df_plt$value[is.na(df_plt$value)] = 0

  fval = paste0("(", vapply(X = newdata, FUN.VALUE = character(1L), FUN = function(x) {
    if (is.numeric(x)) return(as.character(round(x, nround)))
    return(as.character(x))
  }), ")")

  df_fval = do.call(rbind, lapply(cboost$baselearner_list, function(f) {
    fn = f$factory$getFeatureName()
    fl = vapply(fn, function(f) {
      x = newdata[[f]]
      if (is.numeric(x)) return(as.character(round(x, nround)))
      return(as.character(x))
    }, character(1))
    out = data.frame(feat = paste(fn, collapse = "_"), label = paste(sprintf("%s (%s)", fn, fl), collapse = ", "))
    return(out)
  }))
  rownames(df_fval) = NULL

  if (offset) {
    df_fval = rbind(df_fval, data.frame(feat = "offset", label = "offset"))
  }

  if (aggregate) {
    df_plt = aggregate(x = df_plt$value, by = list(df_plt$feat), FUN = sum)
    names(df_plt) = c("feat", "value")
    df_plt        = merge(df_plt, df_fval, by = "feat")
  } else {
    df_plt$feat  = df_plt$bl
    df_plt$label = df_plt$feat
  }
  df_plt        = df_plt[order(df_plt$value, decreasing = TRUE), ]
  df_plt$bl_num = rev(seq_len(nrow(df_plt)))

  if (! is.null(colbreaks)) {
    if (! is.null(collabels))
      df_plt$cbreak = cut(df_plt$value, breaks = colbreaks, labels = collabels)
    else
      df_plt$cbreak = cut(df_plt$value, breaks = colbreaks)
  }

  pred = round(sum(df_plt$value), nround)
  subtitle = paste0("Score: ", pred)

  .data = ggplot2::.data
  if (is.null(colbreaks[1]))
    gg = ggplot2::ggplot(df_plt, ggplot2::aes(x = .data$value, y = .data$bl_num))
  else
    gg = ggplot2::ggplot(df_plt, ggplot2::aes(x = .data$value, y = .data$bl_num,
      color = .data$cbreak, fill = .data$cbreak))

  gg = gg +
    ggplot2::geom_vline(xintercept = 0, color = "dark grey", alpha = 0.6) +
    ggplot2::geom_segment(ggplot2::aes(xend = 0, yend = .data$bl_num)) +
    ggplot2::geom_point() +
    ggplot2::ylab("") +
    ggplot2::xlab("Contribution to predicted value") +
    ggplot2::labs(color = "", fill = "") +
    ggplot2::scale_y_continuous(labels = df_plt$label, breaks = df_plt$bl_num) +
    ggplot2::ggtitle("Prediction", subtitle)
  return(gg)
}
schalkdaniel/compboost documentation built on April 15, 2023, 9:03 p.m.