Nothing
#' Update "EffectData" Object
#'
#' @description
#' Updates an "EffectData" object by
#' - turning discrete values to factor (especially useful with the next option),
#' - collapsing levels of categorical variables with many levels,
#' - dropping empty bins,
#' - dropping small bins,
#' - dropping bins with missing name, or
#' - sorting the variables by their importance, see [effect_importance()]-
#'
#' Except for `sort_by`, all arguments are vectorized, i.e., you can
#' pass a vector or list of the same length as `object`.
#'
#' @param object Object of class "EffectData".
#' @param sort_by By which statistic ("pd", "pred_mean", "y_mean", "resid_mean", "ale")
#' should the results be sorted? The default is "no" (no sorting). Calculated
#' after all other update steps, e.g., after collapsing or dropping rare levels.
#' @param to_factor Should discrete features be treated as factors?
#' In combination with `collapse_m`, this can be used to collapse rare values of
#' discrete numeric features.
#' @param collapse_m If a factor or character feature has more than `collapse_m` levels,
#' rare levels are collapsed into a new level "other p". Standard deviations are
#' collapsed via root of the weighted average variances. The default is 15.
#' Set to `Inf` for no collapsing.
#' @param collapse_by How to determine "rare" levels in `collapse_m`?
#' Either "weight" (default) or "N". Only matters in situations with case weights `w`.
#' @param drop_empty Drop empty bins. Equivalent to `drop_below_n = 1`.
#' The default is `FALSE`.
#' @param drop_below_n Drop bins with N below this value. Applied after collapsing.
#' The default is 0.
#' @param drop_below_weight Drop bins with weight below this value. Applied after
#' collapsing. The default is 0.
#' @param na.rm Should missing bin centers be dropped? Default is `FALSE`.
#' @param ... Currently not used.
#' @returns A modified object of class "EffectData".
#' @seealso
#' [feature_effects()], [average_observed()], [average_predicted()],
#' [partial_dependence()], [ale()], [bias()], [effect_importance()]
#' @export
#' @examples
#' fit <- lm(Sepal.Length ~ ., data = iris)
#' xvars <- colnames(iris)[-1]
#' feature_effects(fit, v = xvars, data = iris, y = "Sepal.Length", breaks = 5) |>
#' update(sort = "pd", collapse_m = 2) |>
#' plot()
update.EffectData <- function(
object,
sort_by = c("no", "pd", "pred_mean", "y_mean", "resid_mean", "ale"),
to_factor = FALSE,
collapse_m = 15L,
collapse_by = c("weight", "N"),
drop_empty = FALSE,
drop_below_n = 0,
drop_below_weight = 0,
na.rm = FALSE,
...
) {
sort_by <- match.arg(sort_by)
collapse_by <- match.arg(collapse_by)
stopifnot(collapse_m >= 2L)
out <- mapply(
update_one,
x = object,
to_factor = to_factor,
collapse_m = collapse_m,
collapse_by = collapse_by,
drop_empty = drop_empty,
drop_below_n = drop_below_n,
drop_below_weight = drop_below_weight,
na.rm = na.rm,
SIMPLIFY = FALSE
)
class(out) <- "EffectData"
if (sort_by == "no") out else out[order(-effect_importance(out, by = sort_by))]
}
# Helper functions
update_one <- function(
x,
to_factor,
collapse_m,
collapse_by,
drop_empty,
drop_below_n,
drop_below_weight,
na.rm
) {
discrete <- is_discrete(x)
if (to_factor && discrete && !is.factor(x$bin_mid)) {
x$bin_mid <- x$bin_mean <- factor(x$bin_mean)
}
if (drop_empty) {
x <- droplevels(subset(x, N > 0L))
}
if ((is.factor(x$bin_mid) || is.character(x$bin_mid)) && collapse_m < nrow(x)) {
x <- .collapse_m(x, m = collapse_m, by = collapse_by)
}
if (drop_below_n > 0) {
x <- droplevels(subset(x, N >= drop_below_n))
}
if (drop_below_weight > 0) {
x <- droplevels(subset(x, weight >= drop_below_weight))
}
if (isTRUE(na.rm)) {
x <- subset(x, !is.na(bin_mid))
}
attr(x, "discrete") <- discrete
return(x)
}
.collapse_m <- function(x, m, by) {
stopifnot(
m >= 2L,
m < nrow(x),
is.factor(x$bin_mid) || is.character(x$bin_mid)
)
# Split into large and small levels
ind <- order(x[[by]], decreasing = TRUE)
x_keep <- x[sort(ind[1:(m - 1L)]), ] # largest m - 1 categories in original order
x_agg <- x[ind[m:nrow(x)], ] # the rest
# Derive name of new value: "other" or "other1" etc.
fact <- is.factor(x_keep$bin_mid)
if (fact) {
x_keep <- droplevels(x_keep)
}
lvl <- stats::na.omit(as.character(x_keep$bin_mid))
oth <- paste("other", nrow(x_agg))
if (fact) {
levels(x_keep$bin_mid) <- levels(x_keep$bin_mean) <- c(lvl, oth)
}
# Collapse other rows
w <- x_agg$weight
x_new <- data.frame(
bin_mid = oth, bin_width = 0.7, bin_mean = oth, N = sum(x_agg$N), weight = sum(w)
)
m_cols <- intersect(colnames(x), c("pred_mean", "y_mean", "resid_mean", "pd", "ale"))
if (length(m_cols)) {
x_new[, m_cols] <- collapse::fmean(x_agg[m_cols], w = w, drop = FALSE, na.rm = TRUE)
}
s_cols <- intersect(colnames(x), c("pred_sd", "y_sd", "resid_sd"))
if (length(s_cols)) {
x_new[, s_cols] <- sqrt(
collapse::fmean(x_agg[s_cols]^2, w = w, drop = FALSE, na.rm = TRUE)
)
}
rbind(x_keep, x_new) # Column order of x_new does not matter
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.