Nothing
#' Natural language description of feature importance explainer
#'
#' @details Function \code{describe.ceteris_paribus()} generates a natural language description of
#' ceteris paribus profile. The description summarizes variable values, that would change
#' model's prediction at most. If a ceteris paribus profile for multiple variables is passed,
#' \code{variables} must specify a single variable to be described. Works only for a ceteris paribus profile
#' for one observation. In current version only categorical values are discribed. For \code{display_numbers = TRUE}
#' three most important variable values are displayed, while \code{display_numbers = FALSE} displays
#' all the important variables, however without further details.
#'
#' @importFrom graphics plot
#' @importFrom stats quantile
#'
#' @examples
#' library("DALEX")
#' library("ingredients")
#' library("ranger")
#'
#' \donttest{
#' model_titanic_rf <- ranger(survived ~., data = titanic_imputed, probability = TRUE)
#'
#' explain_titanic_rf <- explain(model_titanic_rf,
#' data = titanic_imputed[,-8],
#' y = titanic_imputed[,8],
#' label = "ranger forest",
#' verbose = FALSE)
#'
#' selected_passangers <- select_sample(titanic_imputed, n = 10)
#' cp_rf <- ceteris_paribus(explain_titanic_rf, selected_passangers)
#' pdp <- aggregate_profiles(cp_rf, type = "partial", variable_type = "categorical")
#' describe(pdp, variables = "gender")
#' }
#'
#' @export
#' @rdname describe
describe.partial_dependence_explainer <- function(x,
nonsignificance_treshold = 0.15,
...,
display_values = FALSE,
display_numbers = FALSE,
variables = NULL,
label = "prediction") {
# ERROR HANDLING
if (length(unique(x[ ,'_vname_'])) == 1) variables <- as.character(x[1,'_vname_'])
if (is.null(variables)) stop("Choose a single variable to be described.")
if (!is(variables, "character")) stop("Enter the single variables name as character.")
# Assigning model's name
model_name <- as.character(x[1,'_label_'])
model_name <- paste(toupper(substr(model_name, 1, 1)), substr(model_name, 2, nchar(model_name)), sep="")
# Generating description
if (is(x[ ,'_x_'], "numeric")) {
description <- describe_aggregated_profiles_continuous(x = x,
nonsignificance_treshold = nonsignificance_treshold,
display_values = display_values,
display_numbers = display_numbers,
variables = variables,
label = label,
model_name = model_name)
} else {
x[ ,'_x_'] <- as.character(x[ ,'_x_'])
description <- describe_aggregated_profiles_factor(x = x,
nonsignificance_treshold = nonsignificance_treshold,
display_values = display_values,
display_numbers = display_numbers,
variables = variables,
label = label,
model_name = model_name)
}
class(description) <- c("aggregated_profiles_description", "description", "character")
description
}
describe_aggregated_profiles_factor <- function(x,
nonsignificance_treshold,
display_values,
display_numbers,
variables,
label,
model_name) {
# Specifying a df for easier variable description
df_list <- specify_df_aggregated(x = x,
variables = variables,
nonsignificance_treshold = nonsignificance_treshold)
df <- df_list$df
treshold <- df_list$treshold
#Selecting model's prediction for the observation being explained
baseline_prediction <- attr(x, "mean_prediction")
# Choosing the mode of the explanation
if (display_numbers) {
argument1 <- NULL
argument2 <- NULL
argument3 <- NULL
if (nrow(df) > 0) {
sign1 <- if (df[1,'_yhat_'] > baseline_prediction) "increases" else "decreases"
argument1 <- paste0("The most important change in ", model_name,
"'s prediction would occur for ", variables, " = ",
df[1,'variable_name'], ". It ",sign1,
" the prediction by ", df$importance[1], ".")
}
if (nrow(df) > 1) {
sign2 <- if (df[2,'_yhat_'] > baseline_prediction) "increases" else "decreases"
argument2 <- paste0("The second most important change in ", "the",
" prediction would occur for ", variables, " = ",
df[2,'variable_name'], ". It ",sign2,
" the prediction by ", df$importance[2], ".")
}
if (nrow(df) > 2) {
sign3 <- if (df[3,'_yhat_'] > baseline_prediction) "increases" else "decreases"
argument3 <- paste0("The third most important change in ", "the",
" prediction would occur for ", variables, " = ",
df[3,'variable_name'], ". It ",sign3,
" the prediction by ", df$importance[3], ".")
}
introduction <- paste0(model_name,"'s mean ", label, " is equal to ", round(baseline_prediction, 3))
argumentation <- paste(argument1, argument2, argument3, sep = " \n")
summary <- ifelse((nrow(df) > 3),
paste0("Other variable values are with less importance.",
" They do not change the ", label, " by more than ",
df$importance[4],"."),
"All variables are being displayed.")
description <- paste0(introduction, "\n\n",
argumentation, "\n\n",
summary)
} else {
df_important <- df[which(df[ ,'important'] == TRUE ), ]
df_positive <- df[which(df_important[ ,'_yhat_'] >= baseline_prediction), ]
df_negative <- df[which(df_important[ ,'_yhat_'] < baseline_prediction), ]
if (nrow(df_positive) == 0) {
arguments_increasing <- NULL
} else {
increasing <- paste(df_positive[ ,'variable_name'], collapse = ", ")
arguments_increasing <- paste0("increase substantially if the value of ",
variables, " variable would change to ", increasing)
}
if (nrow(df_negative) == 0) {
arguments_decreasing <- NULL
} else {
decreasing <- paste(df_negative[ ,'variable_name'], collapse = ", ")
arguments_decreasing <- paste0("decrease substantially if the value of ",
variables, " variable would change to ", decreasing)
}
introduction <- paste0(model_name, "'s mean " ,
label, " is equal to ",
round(baseline_prediction,3), ".")
argumentation <- ifelse((is.null(arguments_increasing) | is.null(arguments_decreasing)),
paste0("Model's prediction would ", arguments_increasing,
arguments_decreasing, ".\n",
"The largest change would be marked if ",
variables, " variable would change to ",
df_important[1,"variable_name"], "."),
paste0("Model's prediction would ", arguments_increasing,
". On the other hand, ",
model_name,"'s ", label, " would ", arguments_decreasing,
". The largest change would be marked if ",
variables, " variable would change to ",
df_important[1,"variable_name"], "."))
described_all <- (nrow(df_important) == nrow(df))
summary <- ifelse(described_all,
"All the variables were displayed.",
paste0("Other variables are with less importance and they do not change ",
label, " by more than ", round(treshold,2), "%."))
description <- paste0(introduction, "\n\n",
argumentation, "\n\n",
summary)
}
description
}
describe_aggregated_profiles_continuous <- function(x,
nonsignificance_treshold,
display_values,
display_numbers,
variables,
label,
model_name) {
# Specifying a df for variable description
df <- specify_df_aggregated(x = x, variables = variables)$df
baseline_prediction <- attr(x, "mean_prediction")
introduction <- paste0(model_name, "'s mean ", label,
" is equal to ", round(baseline_prediction, 3), ".")
# prefix
max_name <- df[which.max(df$`_yhat_`), variables]
min_name <- df[which.min(df$`_yhat_`), variables]
cutpoint <- find_optimal_cutpoint(smooth(df$`_yhat_`))
# do not round if it's below minimum #76
cut_name <- max(
round(df[cutpoint, variables], 3),
min(c(df[, variables])) # never get smaller than min
)
# Test if the break point is between max_name and min_name
multiple_breakpoints <- ifelse((cut_name < min(min_name, max_name) | cut_name > max(min_name, max_name)),
TRUE,
FALSE)
if (multiple_breakpoints) {
df_additional <- df[which(df[ ,variables] == min(min_name, max_name)):which(df[ ,variables] == max(min_name, max_name)), ]
cutpoint_additional <- find_optimal_cutpoint(smooth(na.omit(df_additional$`_yhat_`)))
}
breakpoint_description <- ifelse(multiple_breakpoints,
paste0("Breakpoints are identified at (",
variables, " = ", cut_name, " and ",
variables, " = ",
round(df[cutpoint_additional, variables], 3), ")."),
paste0("Breakpoint is identified at (",
variables, " = ", cut_name, ")."))
prefix <- paste0("The highest prediction occurs for (", variables, " = ", max_name, "),",
" while the lowest for (", variables, " = ", min_name, ").\n",
breakpoint_description)
cutpoint <- ifelse(multiple_breakpoints,
cutpoint_additional,
cutpoint)
sufix <- describe_numeric_variable(original_x = attr(x, "observations"),
df = df,
cutpoint = cutpoint,
variables = variables)
description <- paste(introduction, prefix, sufix, sep = "\n\n")
description
}
#:# could use some comments
specify_df_aggregated <- function(x, variables, nonsignificance_treshold) {
baseline_prediction <- attr(x, "mean_prediction")
df <- x[which(x[ ,'_vname_'] == variables), ]
if (nrow(df) == 0) stop("There is no such variable.")
df <- df[ ,c("_x_","_yhat_")]
colnames(df)[1] <- variables
df['variable_name'] <- paste0('"', df[ ,variables],'"')
treshold <- NULL
if (is(df[ ,variables], "factor") | is(df[ ,variables], "character")) {
df['importance'] <- sapply(df[ ,'_yhat_'], function(x) abs(x-baseline_prediction))
df['importance'] <- round(df['importance'],3)
df <- df[order(df[ ,'importance'], decreasing = TRUE), ]
# Seting the treshold
most_important_prediction <- max(df[ ,'importance'])
treshold <- most_important_prediction * nonsignificance_treshold
# Modifying names for better description display
df['important'] <- sapply(df[ ,'importance'], function(x) ifelse(x < treshold, TRUE, FALSE))
}
list("df" = df, "treshold" = treshold)
}
#' @export
describe.partial_dependency_explainer <- describe.partial_dependence_explainer
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.