#
# Runs a bootstrap estimation procedure to estimate uncertainty
# bounds on a given utility curve.
#
# Libraries used: sigr, boot, cdata, rquery/rqdatatable, wrapr
#
# Inputs:
# d: data frame of predictions, outcomes, and utilities
# 'true_positive_value': reward for a true positive (column name)
# 'false_positive_value': penalty for a false positive (column name)
# 'true_negative_value': reward for a true negative (column name)
# 'false_negative_value': penalty for a false negative (column name)
# prediction_column_name: name of predictions column
# outcome_column_name: name of outcome column
#
# Returns a list:
# plot_thin: data frame in thin form of total value (utility) curve, and
# mean curve of bootstrap estimates;
# all as functions of threshold
# boot_stats: frame of mean and median curves of bootstrap estimates, along with
# the boundary curves of the 95% and 50% quantiles, in wide form
#
estimate_utility_graph <- function(
d,
...,
prediction_column_name,
outcome_column_name) {
wrapr::stop_if_dot_args(substitute(list(...)), "estimate_utility_graph")
# calculate utility curve (actual values)
values <- model_utility(d, prediction_column_name, outcome_column_name)
# get the thresholds
threshold_list <- values$threshold[(!is.na(values$threshold))]
# calculates the utility curve from a bootstrap sample (described by indices)
f <- function(d, indices, ...) {
vi <- model_utility(d[indices, ], prediction_column_name, outcome_column_name)
fn <- approxfun(vi$threshold, vi$total_value,
yleft = min(vi$total_value),
yright = max(vi$total_value))
fn(threshold_list)
}
# run the bootstrap
boot_stats <- boot(data = d, statistic = f, R = 1000,
parallel = 'multicore', ncpus = parallel::detectCores())
boot_data <- as.data.frame(boot_stats$t)
# each column corresponds to a threshold
colnames(boot_data) <- threshold_list
# put it into long form
boot_data <- pivot_to_blocks(boot_data,
nameForNewKeyColumn = 'threshold',
nameForNewValueColumn = 'total_value',
columnsToTakeFrom = colnames(boot_data))
# turn thresholds back into numbers
boot_data$threshold <- as.numeric(boot_data$threshold)
# functions to calculate quantiles
q_0.025 <- function(x) { quantile(x, probs = 0.025) }
q_0.25 <- function(x) { quantile(x, probs = 0.25) }
q_0.50 <- function(x) { quantile(x, probs = 0.50) }
q_0.75 <- function(x) { quantile(x, probs = 0.75) }
q_0.975 <- function(x) { quantile(x, probs = 0.975) }
# create summary frame
boot_summary <- project(boot_data,
mean_total_value = mean(total_value),
q_0.025 = q_0.025(total_value),
q_0.25 = q_0.25(total_value),
q_0.50 = q_0.50(total_value),
q_0.75 = q_0.75(total_value),
q_0.975 = q_0.975(total_value),
groupby = 'threshold') %.>%
orderby(., 'threshold')
value_range <- values[
(values$threshold >= min(threshold_list)) &
(values$threshold <= max(threshold_list)), ]
# get the actual utility curve and the bootstrap mean curves
# (2 frames)
boot_thin <- boot_summary %.>%
select_columns(., qc(threshold, mean_total_value)) %.>%
rename_columns(., 'total_value' := 'mean_total_value') %.>%
extend(., estimate = 'bootstrapped value')
value_thin <- value_range %.>%
select_columns(., qc(threshold, total_value)) %.>%
extend(., estimate = 'estimated value')
value_thin <- value_thin[complete.cases(value_thin), , drop = FALSE]
plot_thin <- rbind(boot_thin, value_thin)
plot_thin <- plot_thin[
(complete.cases(plot_thin)) &
(plot_thin$threshold >= min(threshold_list)) &
(plot_thin$threshold <= max(threshold_list)), ]
list(plot_thin = plot_thin, boot_summary = boot_summary)
}
#
# Estimate parametric ideal utility.
#
# Fit ideal beta curves and compute utility with respect to those curves
#
# Libraries used: sigr, cdata, rquery/rqdatatable, wrapr
#
# Inputs:
# d: data frame of predictions, outcomes, and utilities
# prediction_column_name: name of predictions column
# outcome_column_name: name of outcome column
# true_positive_value: reward for a true positive (scalar)
# false_positive_value: penalty for a false positive (scalar)
# true_negative_value: reward for a true negative (scalar)
# false_negative_value: penalty for a false negative (scalar)
#
# Returns a list:
# plot_thin: data frame in thin form of total value (utility) curve, and
# mean curve of bootstrap estimates;
# all as functions of threshold
# boot_stats: frame of mean and median curves of bootstrap estimates, along with
# the boundary curves of the 95% and 50% quantiles, in wide form
#
parametric_utility_graph <- function(
d,
...,
prediction_column_name,
outcome_column_name,
true_positive_value,
false_positive_value,
true_negative_value,
false_negative_value) {
wrapr::stop_if_dot_args(substitute(list(...)), "estimate_utility_graph")
# estimate a parametric value curve
# try to recover per-class beta distribution parameters
unpack[shape1_pos, shape2_pos, shape1_neg, shape2_neg] <-
sigr::find_ROC_matching_ab(modelPredictions = d[[prediction_column_name]],
yValues = d[[outcome_column_name]])
total_pos <- sum(d[[outcome_column_name]])
total_neg <- sum(!d[[outcome_column_name]])
# generate the required tails
theoretical_values <- data.frame(
estimate = 'parametric fit',
threshold = values$threshold)
pos_take_count <- pbeta(theoretical_values$threshold, shape1 = shape1_pos, shape2 = shape2_pos, lower.tail = FALSE) * total_pos
pos_take_count[is.na(pos_take_count)] <- 0
neg_take_count <- pbeta(theoretical_values$threshold, shape1 = shape1_neg, shape2 = shape2_neg, lower.tail = FALSE) * total_neg
neg_take_count[is.na(neg_take_count)] <- 0
theoretical_values$true_negative_count <- total_neg - neg_take_count
theoretical_values$false_negative_count <- total_pos - pos_take_count
theoretical_values$true_positive_count <- pos_take_count
theoretical_values$false_positive_count <- neg_take_count
theoretical_values$total_value <-
theoretical_values$true_negative_count * true_negative_value +
theoretical_values$false_negative_count * false_negative_value +
theoretical_values$true_positive_count * true_positive_value +
theoretical_values$false_positive_count * false_positive_value
theoretical_values
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.