#' Create a plot representing a Hosmer-Lemeshow goodness-of-fit test.
#'
#' @param models list or numeric. If a numeric vector, this will be
#' interpreted as a vector of risk scores to compare to a
#' dependent variable. If a \code{tundraContainer} (see the
#' tundra package), the second argument should be a validation
#' data set to be scored ad-hoc with the model object.
#' If a \emph{named} list is passed, multiple validation graphs
#' will be overlayed onto one plot; this list can heterogeneously
#' consist of numeric vectors or scores or \code{tundraContainer}
#' model objects.
#' @param validation_data data.frame or integer. If a \code{data.frame},
#' the column given by \code{dep_var_name} (by default "dep_var")
#' will be extracted and used as the empirical signal (0 or 1).
#' For each bucket (see the \code{buckets} parameter), a comparison
#' of the mean score in that quantile to the empirical mean
#' of the dependent variable will be graphed. An ideal classifier
#' will fully separate the positive and negative cases and look
#' like the Heaviside step function jumping from 0 to 1 after
#' some cutoff.
#'
#' If an integer is passed, this will be assumed to be the dependent
#' variable in the same order as the scores given by the \code{models}
#' parameter (if it is also a numeric vector). Note that one cannot
#' pass model objects, that is \code{tundraContainers}, in the
#' first parameter \code{models} if \code{validation_data} is a vector.
#' @param buckets integer. The number of cuts (by default, 10).
#' @param dep_var_name character. The name of the dependent variable.
#' This will be used to extract a column out of the \code{validation_data}
#' for comparison against the fitted risk scores. By default,
#' \code{"dep_var"}.
#' @param id_var_name character. The name of the ID variable. By default,
#' simply \code{"id"}.
#' @param plot logical. Whether or not to plot to an output device
#' straight away, by default \code{TRUE}.
#' @examples \dontrun{
#' set.seed(100) # to make it deterministic
#' validation_dat <- data.frame(dep_var = sample(c(1, 0), 1000, replace = TRUE))
#' good_preds <- validation_dat[['dep_var']] + rnorm(NROW(validation_dat))
#' bad_preds <- rnorm(NROW(validation_dat))
#' make_validation_plot(models = list(good_mod = good_preds, bad_mod = bad_preds),
#' validation_data = validation_dat)
#' }
#' @export
make_validation_plot <- function(models, validation_data,
                                 buckets = 10, dep_var_name = "dep_var",
                                 id_var_name = "id", plot = TRUE, ...) {
  # Validate the simple arguments up front.
  stopifnot(is.data.frame(validation_data) || is.numeric(validation_data),
            is.numeric(buckets), is.simple_string(dep_var_name),
            is.simple_string(id_var_name), is.simple_logical(plot))
  # Wrap a single model / score vector in a list.  `is(models, "list")` is used
  # instead of `is.list` because tundraContainers are list-like and would be
  # misclassified by the latter.
  if (!is(models, "list")) {
    models <- list(models)
  }
  models <- lapply(models, validate_model)
  # Build one validation summary per model and tag the collection with an S3
  # class so the plot() generic dispatches correctly.
  validation_object <- structure(
    make_validation_object(models, validation_data, buckets,
                           dep_var_name, id_var_name),
    class = "validation_object_list"
  )
  if (isTRUE(plot)) {
    output_validation_plot(validation_object, ...)
  }
  # Return the summaries invisibly so the function pipes/assigns cleanly.
  invisible(validation_object)
}
#' @param scale numeric. Adjust plot size, by default \code{1}.
#' @param output_type function. The output type for the plot, by default \code{\link[grDevices]{png}}.
#' @param filename character. Path to save output png file (for only a single plot_type).
#' @param ... additional arguments to \code{\link[graphics]{plot}}.
#' @param validation_object_list validation_object_list. Internal parameter.
#' @rdname make_validation_plot
output_validation_plot <- function(validation_object_list, scale = 1, output_type = grDevices::png,
                                   filename = NULL, ...) {
  # Render a validation_object_list, optionally to a file via `output_type`
  # (a grDevices device function such as png or pdf).
  stopifnot(is(validation_object_list, "validation_object_list"))
  stopifnot(is.function(output_type))
  if (!is.null(filename)) {
    # Open the requested device; guarantee it is closed even if plotting fails.
    output_type(filename, width = scale * 600, height = scale * 500)
    # Fix: dev.off() was un-namespaced; the file never attaches grDevices, so
    # qualify the call like every other grDevices/graphics call in this file.
    on.exit(grDevices::dev.off(), add = TRUE)
  }
  plot(validation_object_list, scale = scale, ...)
}
#' @rdname plot_validation_object_list
#' @param x object.
#' @param ... Internal.
#' @export
plot.validation_object_list <- function(x, ...) {
  # S3 plot method: forward everything to the workhorse plotting routine.
  call_args <- list(validation_object_list = x, ...)
  do.call(plot_validation_object_list, call_args)
}
#' Plot a \code{validation_plot} object.
#'
#' @param validation_object_list list. The validation objects to plot.
#' @param ylab character. (For plot_type = "standard" only. Default is "Default (\%)")
#' @param title character. The title of the plot, by default \code{NULL} for
#' no title.
#' @param linecolors character. For plot_type = "standard" only.
#' Vector of colors for each model. Default is rainbow.
#' @param line_types character. "Solid", "dashed" or "dotted", by default the former.
#' @param plotroc logical. Whether to print ROC in legend.
#' @param numbers logical. Whether to print number of defaults in
#' each bucket above the plotted point, but only if you are
#' plotting a single model. Default is \code{TRUE}.
#' @param showpoints logical. If \code{TRUE}, print only lines
#' on validation graph, not points.
#' @param increment integer. Increments of y-axis scale, in percent.
#' Default = 10.
#' @param n_per_row integer.
#' @param scale integer. If different than 1, used to scale output image.
#' @param use_normalizer logical.
#' @param ymax double. Controls for the maximum of y-axis scale if provided, may need to
#' be used together with increment to be effective.
#' @export
plot_validation_object_list <- function(validation_object_list, ylab = "Default (%)",
                                        title = NULL, linecolors = c(), line_types = NULL,
                                        plotroc = TRUE, numbers = TRUE, showpoints = FALSE,
                                        increment = 10, n_per_row = 1, scale = 1,
                                        use_normalizer = FALSE, ymax = NULL) {
  # Draw one bucketed response curve per validation object, plus axes, an
  # optional title banner, a legend (with AUC or Gini/NRMSD when available),
  # and lift annotations when exactly two models are compared.
  #
  # Choose which per-bucket statistic to display.
  if (isTRUE(use_normalizer)) {
    normalizer_key <- "normalized_mean_response"
  } else {
    normalizer_key <- "mean_response"
  }
  # Pool responses across every model so the y-axis covers all curves.
  all_responses <- unlist(lapply(validation_object_list, `[[`, normalizer_key))
  buckets <- length(validation_object_list[[1]][[normalizer_key]])
  number_of_obs <- sum(validation_object_list[[1]]$bucket_size)
  # Set up plot parameters.
  if (missing(increment)) { # automatically set plot increment if user doesn't specify
    range <- diff(range(all_responses, na.rm = TRUE))
    # Pick a "nice" tick increment (1/2/5 times a power of ten) targeting
    # roughly five y-axis gridlines.
    n <- ceiling(log10(range / 5))
    x <- c(1L, 2L, 5L) * 10 ^ n
    m <- range / c(x / 10, x)
    inc <- x[m == m[which.min(abs(range / x - 5))]]
  } else {
    inc <- increment / 100
  }
  ymin <- floor(min(all_responses, na.rm = TRUE) / inc) * inc
  ymax <- ymax %||% (ceiling(max(all_responses, na.rm = TRUE) / inc) * inc)
  # Set up the canvas.  Fix: `op` was captured but never restored, leaking the
  # par() changes into any subsequent plot on the same device.
  op <- graphics::par(oma = c(5L, 2L, 0L, 0L), mgp = c(3L, 1L, 0L),
                      bty = "l", yaxs = "i", xpd = NA)
  on.exit(graphics::par(op), add = TRUE)
  graphics::plot(c(1L, buckets), c(ymin, ymax),
                 xaxt = "n", yaxt = "n", xlab = "", ylab = "", type = "n")
  graphics::axis(side = 1L, at = seq_len(buckets), cex = scale, cex.axis = scale)
  if (is.element("gini", names(validation_object_list[[1]]))) {
    # Regression-style output: y-axis is unbounded, label with raw values.
    graphics::axis(side = 2L, las = 2L, at = seq(ymin + inc, ymax, by = inc),
                   labels = seq(ymin + inc, ymax, by = inc), cex.axis = scale)
  } else { # y-axis is a percentage
    graphics::axis(side = 2L, las = 2L, at = seq(ymin + inc, ymax, by = inc),
                   labels = sapply(seq(ymin + inc, ymax, by = inc),
                                   function(x) paste0(100 * x, "%")),
                   cex.axis = scale)
  }
  label <- switch(as.character(buckets),
                  "10" = "Decile", "5" = "Quintile", "Quantile")
  graphics::mtext(label, 1L, 3L, font = 2L, cex = 1.2 * scale)
  graphics::mtext(ylab, 2L, 4L, font = 2L, cex = 1.2 * scale)
  # Get the user-space extent of the plotting region.
  xmin <- graphics::par("usr")[1]
  xmax <- graphics::par("usr")[2]
  x_delta <- xmax - xmin
  ymin <- graphics::par("usr")[3]
  ymax <- graphics::par("usr")[4]
  y_delta <- ymax - ymin
  # Draw a title banner above the plot region.
  if (!is.null(title)) {
    graphics::rect(xmin, ymax + 0.05 * y_delta, xmax,
                   ymax + 0.15 * y_delta, border = NA, col = "slateblue4")
    graphics::text(0.5 * (xmin + xmax), ymax + 0.1 * y_delta,
                   labels = title, col = "#FFFFFF", font = 2, cex = scale)
  }
  # Write the size of the data set in the lower-left corner of the device.
  x <- graphics::grconvertX(0.02, from = "ndc", to = "user")
  y <- graphics::grconvertY(0.02, from = "ndc", to = "user")
  label <- paste(number_of_obs, "data points used in analysis")
  graphics::text(x, y, labels = label, adj = 0, cex = scale)
  # Add validation curves for all models.
  index <- 0
  for (vobj in validation_object_list) {
    index <- index + 1
    with(vobj, {
      # Plot formatting arguments:
      # - line width (thin when points are drawn, thick for line-only curves;
      #   was a scalar ifelse())
      lwd <- if (showpoints) 1 else 6
      # - line type: a single value applies to all models, a vector is indexed
      #   per model, otherwise solid
      lty <- 1
      if (length(line_types) == 1) {
        lty <- line_types
      } else if (index <= length(line_types)) {
        lty <- line_types[index]
      }
      # - color: user-supplied when available, otherwise the default palette
      if (index <= length(linecolors)) {
        ccc <- linecolors[[index]]
      } else {
        ccc <- default_colors(length(validation_object_list))[index]
      }
      # Add points and lines to the plot for this model.
      if (use_normalizer) {
        show_value <- normalized_mean_response
      } else {
        show_value <- mean_response
      }
      if (showpoints) {
        graphics::points(seq_len(buckets), show_value, pch = 19, col = ccc)
      }
      graphics::lines(seq_len(buckets), show_value, col = ccc, lwd = lwd, lty = lty)
      # Build the legend label, annotated with AUC (with asymmetric error
      # bounds) or Gini/NRMSD when those metrics are present.
      if (isTRUE(plotroc) && is.element("roc", names(vobj))) {
        label <- substitute(
          paste(m, " (AUC = ", r[-le] ^ ue, ")"),
          list(m = model_name, r = round(roc, digits = 3),
               le = round(roc_le, digits = 3), ue = round(roc_ue, digits = 3))
        )
      } else if (plotroc && is.element("gini", names(vobj))) {
        label <- substitute(
          paste(m, " (Gini = ", g, "), (NRMSD = ", RMSD, ")"),
          list(m = model_name, g = round(gini, digits = 3), RMSD = round(NRMSD, digits = 3))
        )
      } else {
        label <- model_name
      }
      # Legend layout: entries wrap into `n_per_row` columns below the plot.
      # (Removed dead locals nrows_in_legend, xhi, xm from the original.)
      row <- floor((index - 1) / n_per_row)
      column <- (index - 1) %% n_per_row
      xd <- x_delta / n_per_row
      xlo <- xmin + column * xd
      x1 <- xlo + 1 * x_delta / 20 # Line start.
      x2 <- xlo + 3 * x_delta / 20 # Line end.
      x3 <- xlo + 4 * x_delta / 20 # Text start.
      y <- ymin - 0.25 * y_delta - row * y_delta * 0.06
      graphics::lines(c(x1, x2), c(y, y), col = ccc, lwd = lwd, lty = lty)
      graphics::text(x = x3, y = y, labels = label, adj = 0, col = ccc, cex = scale)
      # Print the response value above each plotted point, but only when a
      # single model is plotted (labels would collide otherwise).
      if (numbers && length(validation_object_list) == 1) {
        if (use_normalizer) {
          labels <- paste(formatC(100 * normalized_mean_response, digits = 3,
                                  format = "fg"), "%", sep = "")
          graphics::text(seq_len(buckets), normalized_mean_response + 0.05 * y_delta,
                         labels, cex = scale)
        } else if (is.element("gini", names(validation_object_list[[1]]))) {
          labels <- paste(formatC(mean_response, digits = 3, format = "fg"), sep = "")
          graphics::text(seq_len(buckets), mean_response + 0.05 * y_delta, labels, cex = scale)
        } else {
          labels <- paste(formatC(100 * mean_response, digits = 3, format = "fg"), "%", sep = "")
          graphics::text(seq_len(buckets), mean_response + 0.05 * y_delta, labels, cex = scale)
        }
      }
    })
  }
  # Annotate lift between the first and last buckets when exactly two models
  # are being compared.
  if (length(validation_object_list) == 2) {
    if (use_normalizer) {
      response <- "normalized_mean_response"
    } else {
      response <- "mean_response"
    }
    a0 <- validation_object_list[[1]][[response]][1]
    a1 <- validation_object_list[[2]][[response]][1]
    b0 <- validation_object_list[[1]][[response]][buckets]
    b1 <- validation_object_list[[2]][[response]][buckets]
    lift_a <- paste0(round(100 * (a0 - a1) / a1), "%")
    lift_b <- paste0(round(100 * (b0 - b1) / b1), "%")
    dd <- 0.02 * x_delta
    graphics::lines(c(1L, 1L), c(a0, a1), lwd = 2L)
    graphics::text(c(1 - dd, 1 - dd), c(0.5 * (a0 + a1), 0.5 * (a0 + a1)),
                   labels = lift_a, adj = 1L)
    graphics::lines(c(buckets, buckets), c(b0, b1), lwd = 2)
    graphics::text(c(buckets + dd, buckets + dd), c(0.5 * (b0 + b1), 0.5 * (b0 + b1)),
                   labels = lift_b, adj = 0, cex = 0.9)
  }
}
validate_model <- function(model) {
  # Accept raw numeric score vectors or tundraContainer model objects; reject
  # anything else with an informative error.  Returns the model unchanged.
  acceptable <- is.numeric(model) || is(model, "tundraContainer")
  if (acceptable) {
    return(model)
  }
  stop("Currently, only numeric and tundraContainer inputs are ",
       "accepted in the first argument to ",
       sQuote("make_validation_plot"), "; instead, I received a ",
       sQuote(crayon::red(class(model)[1L])))
}
# The following Gini definition is adapted from https://www.kaggle.com/wiki/RCodeForGini
# It uses y_pred to rank y_actual, and generate Lorenz Curve by calculating the cumulative sum of y_act vs y_pred
# And Gini is the sum of diff between Lorenz Curve and the Line of Equality
# Note that the area between the two curves is Gini / length(y_actual)
# And the ratio of this area to the area of perfect inequality is Gini * 2/length(y_actual)
# This is precisely the Gini coefficient (http://en.wikipedia.org/wiki/Gini_coefficient)
SumModelGini <- function(y_actual, y_pred) {
  # Unnormalized Gini: sum over observations of (Lorenz curve - line of
  # equality), where y_pred contributes only the ordering of y_actual.
  # See NormalizedGini() for the scale-free version.
  #
  # @param y_actual numeric. Observed outcomes.
  # @param y_pred numeric. Predictions; only their rank order matters.
  # @return numeric scalar.
  df <- data.frame(y_act = y_actual, y_pred = y_pred)
  df <- df[order(df$y_pred), ]
  df$cumPosFound <- cumsum(df$y_act)      # cumulative positives found so far
  totalPos <- sum(df$y_act)
  df$Lorentz <- df$cumPosFound / totalPos # Lorenz curve
  # This 45 degree diagonal reference line is the "Line of Equality".
  # (Idiom fixes: `<-` instead of `=`, seq_len() instead of 1:nrow().)
  df$random <- seq_len(nrow(df)) / nrow(df)
  df$Gini <- df$Lorentz - df$random
  sum(df$Gini)
}
# Normalization by the Gini of a perfect prediction.
# As long as y_pred preserves the order of y_act, we should expect NormalizedGini (nGini) to be 1
# Thus, this metric is only sensitive to order of the prediction rather than the actual magnitude
# @examples
# any monotonic transformation (e.g y_pred = y_act^2) will still yield nGini = 1
# a total reverse of order (e.g. y_act = 1:1000, y_pred = 1000:1) will yield nGini = -1
# nGini is bounded from [-1, 1]
# any perturbation of order will shift nGini downwards from 1
# a random order (e.g. y_act = 1:1000, y_pred = rnorm(1000) )will yield nGini ~ 0
NormalizedGini <- function(y_actual, y_pred) {
  # Gini of the prediction ordering divided by the Gini of a perfect ordering,
  # so the result lands in [-1, 1] and depends only on rank, not magnitude.
  achieved <- SumModelGini(y_actual, y_pred)
  best_possible <- SumModelGini(y_actual, y_actual)
  achieved / best_possible
}
# Root-Mean-Square Deviation
RMSD <- function(y_actual, y_pred) {
  # Root-mean-square deviation between observations and predictions.
  squared_errors <- (y_actual - y_pred) ^ 2
  sqrt(mean(squared_errors))
}
# S3 generic for building per-model validation summaries.  Dispatch is on the
# class of `scores`: a list of models, a tundraContainer, or a numeric vector
# of risk scores (see the methods below).
make_validation_object <- function(scores, ...) {
UseMethod("make_validation_object")
}
make_validation_object.list <- function(scores, ...) {
  # Build one validation object per model, labeling unnamed entries "Model i".
  structure(lapply(seq_along(scores), function(i, ...) {
    model_name <- names(scores)[i]
    # Fix: check is.null() FIRST.  For an unnamed list, names(scores) is NULL,
    # so `model_name == ""` is logical(0) and the original order crashed `||`
    # ("invalid length" / zero-length condition).
    if (is.null(model_name) || model_name == "") {
      model_name <- paste0("Model ", i)
    }
    make_validation_object(scores[[i]], ..., model_name = model_name)
  }, ...), names = names(scores))
}
make_validation_object.tundraContainer <- function(scores, validation_data, ...) {
  # Score the validation set ad hoc with the model container, then delegate to
  # the numeric method on the resulting risk scores.
  predicted_scores <- scores$predict(validation_data)
  make_validation_object(predicted_scores, validation_data = validation_data, ...)
}
# Workhorse method: compute per-bucket validation statistics for a numeric
# vector of risk scores against a validation data set.
#
# @param scores numeric. Risk scores, one per row of `validation_data`.
# @param validation_data data.frame. Must contain the `dep_var_name` column.
# @param buckets integer. Number of (roughly equal-count) quantile buckets.
# @param dep_var_name character. Column holding the dependent variable.
# @param id_var_name character. Column holding record IDs; only used on the
#   normalizer branch below, which is currently unreachable (see TODO).
# @param model_name character. Label used in plot legends.
# @return A "validation_object" list with per-bucket summaries (bin_lo,
#   bin_hi, bucket_size, mean_response, total_response, mean_prediction) plus
#   an error metric: ROC CI for binary outcomes, normalized Gini/NRMSD when
#   the dependent variable takes more than two values.
make_validation_object.numeric <- function(scores, validation_data, buckets,
dep_var_name, id_var_name,
model_name = "Model") {
# If the data frame has more than 500K rows, ROC will not be calculated
# without interactive confirmation (pROC::ci can be very slow on large data).
TOO_MANY_ROWS_FOR_ROC <- 500000
# Create S3 class to store validation information.
validation_object <- structure(list(), class = "validation_object")
validation_object$model_name <- model_name
# Sanity check: exactly one score per validation row.
if (length(scores) != NROW(validation_data)) {
stop("Prediction vector argument of unequal length to ",
"validation dataframe.")
}
# Warn user that something is wrong with the predictions.
if (all(is.na(scores))) {
stop("All predictions are NA.")
} else if (any(is.na(scores))) {
# Drop NA predictions and their corresponding rows, with a warning.
warning("Some predictions are NA.")
keep <- !is.na(scores)
scores <- scores[keep]
validation_data <- validation_data[keep, ]
} else if (length(unique(scores)) == 1) {
stop("Predictions are all identical!")
}
if (length(unique(validation_data[[dep_var_name]])) > 2) {
# Continuous outcome: use normalized Gini and normalized RMSD as error metrics.
# "nGini" reflects the relative order between y_act and y_pred; it is bounded
# in [-1, 1]: perfect preservation of order yields nGini = 1 and a total
# reversal of order yields nGini = -1.
# "NRMSD" is RMSD scaled by the observed range; a perfect prediction yields 0.
validation_object$NRMSD <- NormalizedRMSD(validation_data[[dep_var_name]], scores)
validation_object$gini <- NormalizedGini(validation_data[[dep_var_name]], scores)
} else {
# Classification problem: use ROC (AUC with confidence interval) as the metric.
# NOTE(review): for large inputs this prompts interactively via readline(),
# which will block in non-interactive sessions -- consider a parameter instead.
roc <- if (nrow(validation_data) > TOO_MANY_ROWS_FOR_ROC) {
message("The data frame has too many rows to calculate ROC.")
message("If you insist to calculate ROC, please type in in ",
crayon::green("Y"), ". If not, please type in ", crayon::red("N"))
still_cal_ROC <- readline()
if (still_cal_ROC == "Y") {
message("Calculating ROC for big data frame, this might take very long time...")
pROC::ci(factor(validation_data[[dep_var_name]]), scores, of = "auc")
} else {
NULL
}
} else {
pROC::ci(factor(validation_data[[dep_var_name]]), scores, of = "auc")
}
# roc is indexed as (lower, estimate, upper); store the point estimate and
# the asymmetric error bars.  When roc is NULL (user declined above), these
# NULL assignments are no-ops and no "roc" element is created.
validation_object$roc <- roc[2L]
validation_object$roc_le <- roc[2L] - roc[1L] # lower error
validation_object$roc_ue <- roc[3L] - roc[2L] # upper error
}
# Order actual outcomes (and predictions) by ascending score before bucketing.
ordered_response <- validation_data[[dep_var_name]][order(scores)]
normalizer_name <- "blah" # TODO: (RK) Implement a real normalizer column name.
if (is.element(normalizer_name, names(validation_data))) {
ordered_normalizer <- validation_data[[normalizer_name]][order(scores)]
}
ordered_preds <- scores[order(scores)]
# Assign each ordered observation to one of `buckets` equal-count bins (1-based).
bins <- floor(buckets*(seq_along(ordered_preds)-1)/length(ordered_preds))+1
# Per-bucket summaries: score range, bucket size, mean/total response, mean score.
validation_object$bin_lo <- tapply(ordered_preds, bins, min, na.rm = TRUE)
validation_object$bin_hi <- tapply(ordered_preds, bins, max, na.rm = TRUE)
validation_object$bucket_size <- table(bins)
validation_object$mean_response <- tapply(ordered_response, bins, mean, na.rm=T)
validation_object$total_response <- tapply(ordered_response, bins, sum, na.rm=T)
validation_object$mean_prediction <- tapply(ordered_preds, bins, mean, na.rm=T)
if (is.element(normalizer_name, names(validation_data))) {
# Weighted mean response per bucket, weighting each ID by the normalizer
# column.  Unreachable until normalizer_name is implemented (see TODO above).
ordered_ids <- validation_data[order(scores), id_var_name]
groups <- split(ordered_ids, bins)
validation_object$normalized_mean_response <-
vapply(groups, function(ids) unlist(Map(weighted.mean,
list(validation_data[[dep_var_name]][validation_data[[id_var_name]] %in% ids]),
list(validation_data[[normalizer_name]][validation_data[[id_var_name]] %in% ids])))
, numeric(1))
validation_object$normalizer <- tapply(ordered_normalizer, bins, sum, na.rm = TRUE)
}
validation_object
}
# Exported alias: `validation_plot` is an alternate name for
# `make_validation_plot`, kept so either spelling works for callers.
#' @rdname make_validation_plot
#' @export
validation_plot <- make_validation_plot
# Normalized RMSD to facilitate comparison between datasets with different scales
NormalizedRMSD <- function(y_actual, y_pred) {
  # RMSD divided by the range of the observed values, so results are
  # comparable across datasets measured on different scales.
  hi <- max(y_actual)
  lo <- min(y_actual)
  if (hi == lo) {
    stop("the indicator is homogenous and uninformative")
  }
  RMSD(y_actual, y_pred) / (hi - lo)
}
default_colors <- function(num) {
  # Return `num` line colors: a fixed six-color palette, padded with rainbow()
  # hues when more are requested.
  #
  # @param num integer. Number of colors needed.
  # @return character vector of length `num`.
  colors <- c("#BE5050FF", "#9BBA5FFF", "#8065A1FF",
              "#F5954FFF", "#1E2A36FF", "#43AC47FF")
  if (num > length(colors)) {
    # Bug fix: this was rainbow(length(colors) - num) -- a negative count,
    # for which rainbow() returns character(0), silently yielding fewer
    # colors than requested (NA colors for models beyond the sixth).
    c(colors, grDevices::rainbow(num - length(colors)))
  } else {
    utils::head(colors, num)
  }
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.