knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
This vingette reproduces Figure 2 in the article: a calibration plot. It uses the data object test_set_predictions
.
rm(list = ls()) library(recidivismsl) library(dplyr)
We only want to plot calibration of the main analyses (not the logistic regression models with only a single RITA-factors as a predictor.)
test_set_predictions_mains <- select(test_set_predictions, 1:3, model_names_main)
Calculating expected and observed outcomes by quintile will be facilitated by having a separate data frame for each model including the estimated probabilities, corresponding quintile, and pertinent outcome.
We start by making a list vectors of estimated probabilities that we then turn to a list of data frames - at this point with a single column.
gen_models <- stringr::str_subset(names(test_set_predictions_mains), "^gen_") vio_models <- stringr::str_subset(names(test_set_predictions_mains), "^vio_") model_df_list <- as.list(test_set_predictions_mains[c(gen_models, vio_models)]) model_df_list <- purrr::map(model_df_list, .f = ~ data_frame(prediction = .x))
For each data frame we add a column with the corresponding quintile
# Function to use make_quintiles <- function(x) { quint_breaks <- (quantile(x, probs = seq(0, 1, 0.2))) quintiles <- cut(x, breaks = quint_breaks, include.lowest = TRUE, labels = FALSE) return(quintiles) } model_df_list <- purrr::map(.x = model_df_list, .f = ~ mutate(.x, quintile = make_quintiles(prediction)))
We the add the pertinent outcome variable
# Split into seperate lists for general and violent recidivism model_df_list_gen <- purrr::map(.x = model_df_list[gen_models], .f = ~cbind(.x, outcome = test_set_predictions_mains$reoffenceThisTerm)) model_df_list_vio <- purrr::map(.x = model_df_list[vio_models], .f = ~cbind(.x, outcome = test_set_predictions_mains$newO_violent)) # Combine back model_df_list <- c(model_df_list_gen, model_df_list_vio)
Assertions
# All data frames have 3 colmns stopifnot(all(purrr::map(model_df_list, length) == 3)) # All models are included (and nothing else) stopifnot(all(sort(c(vio_models, gen_models)) == sort(names(model_df_list)))) # Columns in data frame have the expected names list_of_column_names <- purrr::map(model_df_list, names) stopifnot(all( purrr::map_lgl(.x = list_of_column_names, .f = ~ identical(.x, c("prediction", "quintile", "outcome"))) ))
For each data frame we calculate the expected and observed recidivism rates. We get a list of data frames with 5 rows (one per quintile) and three columns (quintile, observed rate and expected rate.)
# Function to use. get_rates_per_bin <- function(model_data_frame) { model_data_frame %>% group_by(quintile) %>% summarise(n = n(), # n in quintile obs = sum(as.numeric(outcome) == 2), # number of reoffences obs_rate = obs/n * 100, # observed rate exp_rate = mean(prediction)*100) %>% # expected rate select(-n, -obs) # remove intermediate steps } calibration_list <- purrr::map(model_df_list, get_rates_per_bin)
Now we bind together all the data frames and create an identifying label with the model name. This results in a data frame with four columns and 24 models x 5 quintiles = 120 rows.
calibration_dframe <- bind_rows(calibration_list, .id = "model_name") stopifnot(ncol(calibration_dframe) == 4) stopifnot(nrow(calibration_dframe) == 120)
We also want the descriptions of the models that we can take from model_grid
model_desc <- model_grid[c("model_name", "outcome", "predictors", "model_type")] calibration_dframe <- left_join(calibration_dframe, model_desc, by = "model_name")
With this we can make a plot.
library(ggplot2) calibration_plot <- calibration_dframe %>% filter(predictors != "All including term") %>% ggplot(aes(x = exp_rate, y = obs_rate, group = predictors)) + geom_line(aes(lty = predictors), alpha = 0.5) + geom_point(aes(shape = predictors)) + geom_abline() + scale_linetype(name = "Predictor set") + scale_shape_manual(values = c(16, 4, 1, 17), name = "Predictor set") + scale_x_continuous(limits = c(0, 100)) + facet_grid(outcome ~ model_type) + ylab("Observed recidivism rate in quintile") + xlab("Expected probability of recidivism in quintile") + ggthemes::theme_tufte(base_family = "sans") + theme(legend.position = "bottom") calibration_plot
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.