Nothing
#' Plot Dataset Level Model Performance Explanations
#'
#' @param x a model to be explained, preprocessed by the \code{\link{explain}} function
#' @param ... other parameters
#' @param geom either \code{"prc"}, \code{"roc"}, \code{"ecdf"}, \code{"boxplot"}, \code{"gain"}, \code{"lift"} or \code{"histogram"} determines how residuals shall be summarized
#' @param loss_function function that calculates the loss for a model based on model residuals. By default it's the root mean square. NOTE that this argument was called \code{lossFunction}.
#' @param lossFunction alias for \code{loss_function} held for backwards compatibility.
#' @param show_outliers number of largest residuals to be presented (only when geom = boxplot).
#' @param ptlabel either \code{"name"} or \code{"index"} determines the naming convention of the outliers
#'
#' @return An object of the class \code{model_performance}.
#'
#' @export
#' @examples
#' \donttest{
#' library("ranger")
#' titanic_ranger_model <- ranger(survived~., data = titanic_imputed, num.trees = 50,
#' probability = TRUE)
#' explainer_ranger <- explain(titanic_ranger_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived)
#' mp_ranger <- model_performance(explainer_ranger)
#' plot(mp_ranger)
#' plot(mp_ranger, geom = "boxplot", show_outliers = 1)
#'
#' titanic_ranger_model2 <- ranger(survived~gender + fare, data = titanic_imputed,
#' num.trees = 50, probability = TRUE)
#' explainer_ranger2 <- explain(titanic_ranger_model2, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived,
#' label = "ranger2")
#' mp_ranger2 <- model_performance(explainer_ranger2)
#' plot(mp_ranger, mp_ranger2, geom = "prc")
#' plot(mp_ranger, mp_ranger2, geom = "roc")
#' plot(mp_ranger, mp_ranger2, geom = "lift")
#' plot(mp_ranger, mp_ranger2, geom = "gain")
#' plot(mp_ranger, mp_ranger2, geom = "boxplot")
#' plot(mp_ranger, mp_ranger2, geom = "histogram")
#' plot(mp_ranger, mp_ranger2, geom = "ecdf")
#'
#' titanic_glm_model <- glm(survived~., data = titanic_imputed, family = "binomial")
#' explainer_glm <- explain(titanic_glm_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived, label = "glm",
#' predict_function = function(m,x) predict.glm(m,x,type = "response"))
#' mp_glm <- model_performance(explainer_glm)
#' plot(mp_glm)
#'
#' titanic_lm_model <- lm(survived~., data = titanic_imputed)
#' explainer_lm <- explain(titanic_lm_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived, label = "lm")
#' mp_lm <- model_performance(explainer_lm)
#' plot(mp_lm)
#'
#' plot(mp_ranger, mp_glm, mp_lm)
#' plot(mp_ranger, mp_glm, mp_lm, geom = "boxplot")
#' plot(mp_ranger, mp_glm, mp_lm, geom = "boxplot", show_outliers = 1)
#' }
#'
#
plot.model_performance <- function(x, ..., geom = "ecdf", show_outliers = 0, ptlabel = "name", lossFunction = loss_function, loss_function = function(x) sqrt(mean(x^2))) {
if (!(ptlabel %in% c("name", "index"))){
stop("The plot.model_performance() function requires label to be name or index.")
}
# lossFunction is deprecated
# if (methods::hasArg("lossFunction")) {
# warning("lossFunction is deprecated, please use loss_function instead")
# loss_function <- list(...)[["lossFunction"]]
# }
# extract residuals
# combine into a single object
if (length(list(...)) == 0) {
# if single explainer
df <- x$residuals
} else {
# if multiple explainers
args <- lapply(list(...),
function(tmp) tmp$residuals)
args[["x"]] <- x$residuals
df <- do.call(combine_explainers, rev(args))
}
df$label <- reorder(df$label, df$diff, loss_function)
if (ptlabel == "name") {
df$name <- NULL
df$name <- rownames(df)
}
nlabels <- length(unique(df$label))
switch(geom,
ecdf = plot.model_performance_ecdf(df, nlabels),
boxplot = plot.model_performance_boxplot(df, show_outliers, loss_function, nlabels),
histogram = plot.model_performance_histogram(df, nlabels),
prc = plot.model_performance_prc(df, nlabels),
roc = plot.model_performance_roc(df, nlabels),
gain = plot.model_performance_gain(df, nlabels),
lift = plot.model_performance_lift(df, nlabels)
)
}
plot.model_performance_ecdf <- function(df, nlabels) {
label <- name <- NULL
ggplot(df, aes(abs(diff), color = label)) +
stat_ecdf(geom = "step") +
theme_default_dalex() +
scale_color_manual(name = "Model", values = colors_discrete_drwhy(nlabels)) +
xlab(expression(group("|", residual, "|"))) +
scale_y_continuous(breaks = seq(0,1,0.1),
labels = paste(seq(100,0,-10),"%"),
trans = "reverse",
name = "") +
ggtitle(expression(paste("Reverse cumulative distribution of ", group("|", residual, "|"))))
}
plot.model_performance_boxplot <- function(df, show_outliers, loss_function, nlabels) {
label <- name <- NULL
pl <- ggplot(df, aes(x = label, y = abs(diff), fill = label)) +
stat_boxplot(alpha = 0.4, coef = 1000) +
stat_summary(fun = loss_function, geom = "point", shape = 20, size=10, color="red", fill="red") +
theme_vertical_default_dalex() +
scale_fill_manual(name = "Model", values = colors_discrete_drwhy(nlabels)) +
ylab("") +
scale_x_discrete("", limits = rev(levels(df$label))) + # added to fix https://github.com/ModelOriented/DALEX/issues/400
ggtitle(
expression(paste("Boxplots of ", group("|", residual, "|"))),
"Red dot stands for root mean square of residuals"
) +
coord_flip()
if (show_outliers > 0) {
df$rank <- unlist(tapply(-abs(df$diff), df$label, rank, ties.method = "min"))
df_small <- df[df$rank <= show_outliers,]
pl <- pl +
geom_point(data = df_small) +
geom_text(data = df_small,
aes(label = name), srt = 90,
hjust = -0.2, vjust = 1)
}
pl
}
plot.model_performance_histogram <- function(df, nlabels) {
diff <- label <- NULL
# commented to keep it consistent with other plots
# see: https://github.com/ModelOriented/DALEX/issues/400
# if (length(levels(df$label)) > 1) levels(df$label) <- rev(levels(df$label))
ggplot(df, aes(diff, fill = label)) +
geom_histogram(bins = 100) +
facet_wrap(~label, ncol = 1) +
theme_default_dalex() + xlab("residuals") + theme(legend.position = "none") +
scale_fill_manual(name = "Model", values = colors_discrete_drwhy(nlabels)) +
ggtitle("Histogram for residuals")
}
# precision-recall curve
plot.model_performance_prc <- function(df, nlabels) {
dfl <- split(df, factor(df$label))
prcdfl <- lapply(dfl, function(df) {
pred_sorted <- df[order(df$predicted, decreasing = TRUE), ]
# assuming that y = 0/1 where 1 is the positive
recall <- cumsum(pred_sorted$observed)/sum(pred_sorted$observed)
precis <- cumsum(pred_sorted$observed)/seq_along(pred_sorted$observed)
data.frame(precis = precis, recall = recall, label = df$label[1])
})
prcdf <- do.call(rbind, prcdfl)
precis <- recall <- label <- NULL
ggplot(prcdf, aes(x = recall, y = precis, color = label)) +
geom_line() +
theme_default_dalex() +
scale_color_manual(name = "Model", values = colors_discrete_drwhy(nlabels)) +
scale_x_continuous("Recall", limits = c(0, 1), expand = c(0, 0)) +
scale_y_continuous("Precision", limits = c(0, 1), expand = c(0, 0)) +
coord_fixed() +
ggtitle("Precision Recall Curve")
}
# receiver operating characteristic
plot.model_performance_roc <- function(df, nlabels) {
dfl <- split(df, factor(df$label))
rocdfl <- lapply(dfl, function(df) {
# assuming that y = 0/1 where 1 is the positive
tpr_tmp <- tapply(df$observed, df$predicted, sum)
tpr <- c(0,cumsum(rev(tpr_tmp)))/sum(df$observed)
fpr_tmp <- tapply(1 - df$observed, df$predicted, sum)
fpr <- c(0,cumsum(rev(fpr_tmp)))/sum(1 - df$observed)
data.frame(tpr = tpr, fpr = fpr, label = df$label[1])
})
rocdf <- do.call(rbind, rocdfl)
fpr <- tpr <- label <- NULL
ggplot(rocdf, aes(x = fpr, y = tpr, color = label)) +
geom_abline(slope = 1, intercept = 0, color = "grey", lty = 2) +
geom_line() +
theme_default_dalex() +
scale_color_manual(name = "Model", values = colors_discrete_drwhy(nlabels)) +
scale_x_continuous("False positive rate", limits = c(0, 1), expand = c(0, 0)) +
scale_y_continuous("True positive rate", limits = c(0, 1), expand = c(0, 0)) +
coord_fixed() +
ggtitle("Receiver Operator Characteristic")
}
plot.model_performance_gain <- function(df, nlabels) {
dfl <- split(df, factor(df$label))
rocdfl <- lapply(dfl, function(df) {
pred_sorted <- df[order(df$predicted, decreasing = TRUE), ]
# assuming that y = 0/1 where 1 is the positive
lift <- cumsum(pred_sorted$observed)/length(pred_sorted$observed)
pr <- seq_along(pred_sorted$observed)/length(pred_sorted$observed)
data.frame(lift = lift, pr = pr, label = df$label[1])
})
rocdf <- do.call(rbind, rocdfl)
max_lift <- sum(df$observed)/nrow(df)
pr <- lift <- label <- NULL
ggplot(rocdf, aes(x = pr, y = lift, color = label)) +
geom_abline(slope = max_lift, intercept = 0, color = "grey", lty = 2) +
geom_line() +
theme_default_dalex() +
scale_color_manual(name = "Model", values = colors_discrete_drwhy(nlabels)) +
scale_x_continuous("Positive rate", limits = c(0, 1), expand = c(0, 0)) +
scale_y_continuous("True positive rate", expand = c(0, 0)) +
ggtitle("Cumulative Gains chart")
}
plot.model_performance_lift <- function(df, nlabels) {
dfl <- split(df, factor(df$label))
rocdfl <- lapply(dfl, function(df) {
pred_sorted <- df[order(df$predicted, decreasing = TRUE), ]
# assuming that y = 0/1 where 1 is the positive
lift <- cumsum(pred_sorted$observed)/length(pred_sorted$observed)
pr <- seq_along(pred_sorted$observed)/length(pred_sorted$observed)
data.frame(lift = lift/pr, pr = pr, label = df$label[1])
})
rocdf <- do.call(rbind, rocdfl)
max_lift <- sum(df$observed)/nrow(df)
pr <- lift <- label <- NULL
ggplot(rocdf, aes(x = pr, y = lift/max_lift, color = label)) +
geom_abline(slope = 0, intercept = 1, color = "grey", lty = 2) +
geom_line() +
theme_default_dalex() +
scale_color_manual(name = "Model", values = colors_discrete_drwhy(nlabels)) +
scale_x_continuous("Positive rate", limits = c(0, 1), expand = c(0, 0)) +
scale_y_continuous("Lift", expand = c(0, 0)) +
ggtitle("Lift chart")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.