knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.height = 6, fig.width = 9, message = FALSE ) options(rmarkdown.html_vignette.check_title = FALSE)
library(aihuman) library(tidyr) library(dplyr) library(ggplot2) ## set default ggplot theme theme_set(theme_bw(base_size = 15) + theme(plot.title = element_text(hjust = 0.5)))
Ben-Michael, Eli, D. James Greiner, Melody Huang, Kosuke Imai, Zhichao Jiang, and Sooahn Shin. "Does AI help humans make better decisions?: A statistical evaluation framework for experimental and observational studies", 2024.
The data used for this paper, and made available here, are interim, based on only half of the observations in the study and (for those observations) only half of the study follow-up period. We use them only to illustrate methods, not to draw substantive conclusions.
In this vignette, we will use the data originally from Imai, Jiang, Greiner, Halen, and Shin (2023). The essential part of the data is:
Z
: A binary treatment $Z_i \in {0,1}$ (e.g. PSA provision)D
: A binary decision $D_i \in {0,1}$ (e.g. judge's release decision)A
: A binary AI recommendation $A_i \in {0,1}$ (e.g. dichotomized pretrial public safety assessment scores)Y
: A binary outcome $Y_i \in {0,1}$ (e.g. new criminal activity)The analysis can be conducted based on the following workflow:
We will be using the following data:
# randomized PSA provision (0: none, 1: provided) Z <- aihuman::NCAdata$Z # judge's release decision (0: signature, 1: cash) D <- if_else(aihuman::NCAdata$D == 0, 0, 1) # dichotomized pretrial public safety assessment scores (0: signature, 1: cash) A <- aihuman::PSAdata$DMF # new criminal activity (0: no, 1: yes) Y <- aihuman::NCAdata$Y
For subgroup analysis, we will use the following covariates:
# race race_vec <- aihuman::NCAdata |> mutate(race = if_else(White == 1, "White", "Non-white")) |> pull(race) # gender gender_vec <- aihuman::NCAdata |> mutate(gender = if_else(Sex == 1, "Male", "Female")) |> pull(gender)
In the paper, we use doubly robust estimators throughout the analysis. For this purpose, we will be fitting the nuisance functions using the following covariates ($X_i$):
cov_mat <- aihuman::NCAdata |> select(-c(Y, D, Z)) |> bind_cols(FTAScore = aihuman::PSAdata$FTAScore, NCAScore = aihuman::PSAdata$NCAScore, NVCAFlag = aihuman::PSAdata$NVCAFlag) |> as.matrix() colnames(cov_mat)
Then, the first set of the nuisance functions can be computed as follows, based on the Generalized Boosted Regression Modeling (see gbm::gbm
function):
nuis_func <- compute_nuisance_functions(Y = Y, D = D, Z = Z, V = cov_mat, shrinkage = 0.01, n.trees = 1000)
This gives us the following nuisance functions:
nuis_func$z_models$d_pred
gives estimated $\hat{m}^{D}(z, X_i)$.nuis_func$z_models$y_pred
gives estimated $\hat{m}^{Y}(z, X_i)$.nuis_func$pscore
gives estimated $\hat{e}(1, X_i)$.The second set of the nuisance functions conditioning on AI recommendation can be computed similarly:
nuis_func_ai <- compute_nuisance_functions_ai(Y = Y, D = D, Z = Z, A = A, V = cov_mat, shrinkage = 0.01, n.trees = 1000)
This gives us the following nuisance functions:
nuis_func_ai$z_models$d_pred
gives estimated $\hat{m}^{D}(z, a, X_i)$.nuis_func_ai$z_models$y_pred
gives estimated $\hat{m}^{Y}(z, a, X_i)$.nuis_func_ai$pscore
gives estimated $\hat{e}(1, X_i)$.For the reproducibility, we will be using the pre-computed nuisance functions, generated from the same codes above.
Comparison between human decisions and PSA-generated recommendations are summarized in the following contingency table (Table 1 in the paper).
counts <- table(D[Z == 0], A[Z == 0]) proportions <- prop.table(counts) * 100 combined_table_human <- matrix(sprintf("%.1f%% (%d)", proportions, counts), nrow = nrow(counts), ncol = ncol(counts)) colnames(combined_table_human) <- c("Signature", "Cash") rownames(combined_table_human) <- c("Signature", "Cash") knitr::kable(combined_table_human, caption = "Table 1 (Human v. PSA)")
counts <- table(D[Z == 1], A[Z == 1]) proportions <- prop.table(counts) * 100 combined_table_human_ai <- matrix(sprintf("%.1f%% (%d)", proportions, counts), nrow = nrow(counts), ncol = ncol(counts)) colnames(combined_table_human_ai) <- c("Signature", "Cash") rownames(combined_table_human_ai) <- c("Signature", "Cash") knitr::kable(combined_table_human_ai, caption = "Table 1 (Human+PSA v. PSA)")
We can analyze the impact of AI recommendations on agreement between human decisions and AI recommendations using the difference in means estimates of an indicator $1{D_i = A_i}$. The results are as follows (Figure S1 in the appendix).
table_agreement( Y = Y, D = D, Z = Z, A = A, subgroup1 = race_vec, subgroup2 = gender_vec, label.subgroup1 = "Race", label.subgroup2 = "Gender" ) |> mutate_at(vars(agree_diff, agree_diff_se, ci_ub, ci_lb), ~round(., 3)) |> knitr::kable(caption = "Agreement of Human and PSA Decision Makers") plot_agreement( Y = Y, D = D, Z = Z, A = A, subgroup1 = race_vec, subgroup2 = gender_vec, label.subgroup1 = "Race", label.subgroup2 = "Gender", x.order = c("Overall", "Non-white", "White", "Female", "Male"), p.title = "Agreement between Human Decisions and PSA Recommendations", p.lb = -0.25, p.ub = 0.25 )
We now compare difference in risk between human+AI and human decision makers using AIPW estimators.
You may choose between (1) AIPW estimator and (2) difference-in-means (Neyman) estimator.
You can also specify the loss ratio between false positives and false negatives using l01
parameter.
For the subgroup analysis, you can specify the subgroup variable using X
.
# AIPW estimator compute_stats_aipw( Y = Y, D = D, Z = Z, nuis_funcs = nuis_func, true.pscore = rep(0.5, length(D)), X = NULL, l01 = 1 ) # Difference-in-means (Neyman) estimator compute_stats( Y = Y, D = D, Z = Z, X = NULL, l01 = 1 )
The results can be visualized as follows (Figure 1 in the paper). You may use plot_diff_human()
function for the Neyman estimator.
plot_diff_human_aipw( Y = Y, D = D, Z = Z, nuis_funcs = nuis_func, l01 = 1, true.pscore = rep(0.5, length(D)), subgroup1 = race_vec, subgroup2 = gender_vec, label.subgroup1 = "Race", label.subgroup2 = "Gender", x.order = c("Overall", "Non-white", "White", "Female", "Male"), p.title = NULL, p.lb = -0.3, p.ub = 0.3 )
We can answer the question using the following subgroup analysis defined by A
.
First, subgroup analysis for the cases where AI recommends harsher decision ($A = 1$). The figure shows how the human overrides the AI recommendation in terms of true negative proportion (TNP), false negative proportion (FNP), and their differences (Figure S2 in the appendix).
plot_diff_subgroup( Y = Y, D = D, Z = Z, A = A, a = 1, l01 = 1, nuis_funcs = nuis_func, true.pscore = rep(0.5, length(D)), subgroup1 = race_vec, subgroup2 = gender_vec, label.subgroup1 = "Race", label.subgroup2 = "Gender", x.order = c("Overall", "Non-white", "White", "Female", "Male"), p.title = NULL, p.lb = -0.5, p.ub = 0.5, label = "TNP - FNP", metrics = c("True Negative Proportion (TNP)", "False Negative Proportion (FNP)", "TNP - FNP") )
Second, subgroup analysis for the cases where AI recommends lenient decision ($A = 0$) is also available. The figure shows how human overrides the AI recommendation in terms of true positive proportion (TPP), false positive proportion (FPP), and their differences (Figure S3 in the appendix).
plot_diff_subgroup( Y = Y, D = D, Z = Z, A = A, a = 0, l01 = 1, nuis_funcs = nuis_func, true.pscore = rep(0.5, length(D)), subgroup1 = race_vec, subgroup2 = gender_vec, label.subgroup1 = "Race", label.subgroup2 = "Gender", x.order = c("Overall", "Non-white", "White", "Female", "Male"), p.title = NULL, p.lb = -0.5, p.ub = 0.5, label = "TPP - FPP", metrics = c("True Positive Proportion (TPP)", "False Positive Proportion (FPP)", "TPP - FPP") )
We now compare difference in risk between AI and human decision makers using AIPW estimators. Note that these quantities are partially identified given the assumptions made in the paper.
# AIPW estimator compute_bounds_aipw( Y = Y, D = D, Z = Z, A = A, nuis_funcs = nuis_func, nuis_funcs_ai = nuis_func_ai, true.pscore = rep(0.5, length(D)), X = NULL, l01 = 1 )
The results can be visualized as follows (Figure 2 and Figure S4 in the paper).
# AI versus Human plot_diff_ai_aipw( Y = Y, D = D, Z = Z, A = A, z_compare = 0, nuis_funcs = nuis_func, nuis_funcs_ai = nuis_func_ai, l01 = 1, true.pscore = rep(0.5, length(D)), subgroup1 = race_vec, subgroup2 = gender_vec, label.subgroup1 = "Race", label.subgroup2 = "Gender", x.order = c("Overall", "Non-white", "White", "Female", "Male"), p.title = NULL, p.lb = -0.3, p.ub = 0.3 ) # AI versus Human+AI plot_diff_ai_aipw( Y = Y, D = D, Z = Z, A = A, z_compare = 1, nuis_funcs = nuis_func, nuis_funcs_ai = nuis_func_ai, l01 = 1, true.pscore = rep(0.5, length(D)), subgroup1 = race_vec, subgroup2 = gender_vec, label.subgroup1 = "Race", label.subgroup2 = "Gender", x.order = c("Overall", "Non-white", "White", "Female", "Male"), p.title = NULL, p.lb = -0.3, p.ub = 0.3 )
Note that researcher can specify different AI recommendations by changing the A
variable.
For example, in the paper, we have used Llama3 recommendations as an alternative comparison.
# Llama3 versus Human plot_diff_ai_aipw( Y = Y, D = D, Z = Z, A = A_llama, z_compare = 0, nuis_funcs = nuis_func, nuis_funcs_ai = nuis_func_ai, l01 = 1, true.pscore = rep(0.5, length(D)), subgroup1 = race_vec, subgroup2 = gender_vec, label.subgroup1 = "Race", label.subgroup2 = "Gender", x.order = c("Overall", "Non-white", "White", "Female", "Male"), y.lab = "Llama3 versus Human", p.label = c("Llama3 worse", "Llama3 better"), p.title = NULL, p.lb = -0.5, p.ub = 0.5 )
As noted earlier, one may specify different loss functions by changing the l01
parameter.
Here, we provide an example of when human decision maker is preferred over AI recommendation (Figure 3 and Figure S5 in the paper).
# When to prefer human decision maker over AI recommendation plot_preference( Y = Y, D = D, Z = Z, A = A, z_compare = 0, nuis_funcs = nuis_func, nuis_funcs_ai = nuis_func_ai, l01_seq = 10^seq(-2, 2, length.out = 200), alpha = 0.05, true.pscore = rep(0.5, length(D)), subgroup1 = race_vec, subgroup2 = gender_vec, label.subgroup1 = "Race", label.subgroup2 = "Gender", x.order = c("Overall", "Non-white", "White", "Female", "Male"), p.title = NULL, legend.position = "bottom", p.label = c("AI-alone preferred", "Human-alone preferred", "Ambiguous") ) + scale_color_manual("", values = c("", "#4d9221", "grey30"), labels = c("", "Human-alone preferred", "Ambiguous"), drop = FALSE ) + scale_linetype_manual("", values = c(1, 1, 3), labels = c("", "Human-alone preferred", "Ambiguous"), drop = FALSE) # When to prefer human+AI decision maker over AI recommendation plot_preference( Y = Y, D = D, Z = Z, A = A, z_compare = 1, nuis_funcs = nuis_func, nuis_funcs_ai = nuis_func_ai, l01_seq = 10^seq(-2, 2, length.out = 200), alpha = 0.05, true.pscore = rep(0.5, length(D)), subgroup1 = race_vec, subgroup2 = gender_vec, label.subgroup1 = "Race", label.subgroup2 = "Gender", x.order = c("Overall", "Non-white", "White", "Female", "Male"), p.title = NULL, legend.position = "bottom", p.label = c("AI-alone preferred", "Human-alone preferred", "Ambiguous") ) + scale_color_manual("", values = c("", "#4d9221", "grey30"), labels = c("", "Human-with-algorithm preferred", "Ambiguous"), drop = FALSE ) + scale_linetype_manual("", values = c(1, 1, 3), labels = c("", "Human-with-algorithm preferred", "Ambiguous"), drop = FALSE)
Finally, we can conduct policy learning using the following codes.
The optimization requires gurobi
software, and you may need to install it separately (documentation).
In this vignette, we provide the codes for replicating the results in the paper (Figure 4 and Table 3).
library(gurobi) ## Optimization functions make_edge_mat <- function(X) { n <- nrow(X) lapply(1:n, function(i) { vapply(1:n, function(j) { if(all(X[i,] <= X[j,]) & i != j) { e <- numeric(2 * n) e[j] <- 1 e[n + i] <- -1 } else { e <- numeric(2 * n) } return(e) }, numeric(2 * n)) %>% t() }) %>% do.call(rbind, .) %>% unique() %>% Matrix::Matrix() } make_action_mat <- function(n, k) { do.call(cbind, lapply(1:k, function(a) a * Matrix::Diagonal(n))) } create_monotone_constraints <- function(X, rev = FALSE) { n <- nrow(X) d <- ncol(X) # policy vector is (pi(1,.),...,pi(A,.)) # sum to one constraints pi_sum_to_one <- do.call(cbind, lapply(1:2, function(x) Matrix::Diagonal(n))) rhs_sum_to_one <- rep(1, n) sense_sum_to_one <- rep("=", n) # monotonicy constraints edge_mat <- make_edge_mat(X) action_mat <- make_action_mat(n, 2) # the following codes insure that (1) edge_mat: select pairs that monotonicity should hold in a correct order (i.e. if X[i,]<X[j,] then assign 1 to j and -1 to i) (2) rbind(action_mat, action_mat): if multiplied by a vector of binary actions (length = 2*n since there exist two actions), it selects the action that has been chosen (e.g. if action 1 = 0 and action 2 = 1 for a given observation, it gives the value of 2) monotone_mat <- edge_mat %*% rbind(action_mat, action_mat) rhs_monotone <- rep(0, nrow(monotone_mat)) if(isTRUE(rev)) { sense_monotone <- rep("<=", nrow(monotone_mat)) } else { sense_monotone <- rep(">=", nrow(monotone_mat)) } A <- rbind(pi_sum_to_one, monotone_mat) rhs <- c(rhs_sum_to_one, rhs_monotone) sense <- c(sense_sum_to_one, sense_monotone) # restrict variables vtype <- rep("B", n * 2) # binary policy decisions return(list(A = A, rhs = rhs, sense = sense, vtype = vtype)) } get_monotone_policy <- function(X, wts, rev = FALSE) { n <- nrow(X) # get the constraints model <- create_monotone_constraints(as.matrix(X), rev = rev) # solve model$obj <- c(numeric(n), wts) model$modelsense <- "min" sol <- gurobi(model) # extract values policy <- apply(matrix(sol$x, ncol = 2), 1, which.max) - 1 return(policy) } get_monotone_policy_ai <- function(X, wts, rev = FALSE) { X_df <- data.frame(X) wts_df <- data.frame(wts) colnames(wts_df) <- "wts" comb_df <- bind_cols(X_df, wts_df) # get distinct X values and the sum of the weights on them grouped_df <- comb_df %>% group_by(across(-wts)) %>% summarise(across(everything(), sum), n = n(), .groups = "drop" ) X_distinct <- grouped_df %>% select(-wts, -n) %>% as.matrix() wts_distinct <- grouped_df %>% pull(wts) policy <- get_monotone_policy(X_distinct, wts_distinct, rev = rev) grouped_df <- grouped_df %>% mutate(policy = policy) return(grouped_df) } ## Weights functions compute_wts_dr <- function(Y, D, Z, nuis_funcs, pscore = NULL, l01 = 1) { ## update the propensity score if provided if (!is.null(pscore)) { nuis_funcs$pscore <- pscore } ## check whether Z is 0/1 binary variable if (any(sort(unique(Z)) != c(0, 1))) { stop("Z must be a binary variable") } dat <- data.frame(Y = Y, D = D, Z = Z, pscore = nuis_funcs$pscore) preds <- nuis_funcs$z_models |> pivot_wider(names_from = c(Z), values_from = c(-Z, -idx), names_sep = "") |> select(-idx) dat <- dat %>% bind_cols(preds) wts <- dat %>% mutate( p.1i = (1 - d_pred1) * ((1 + l01) * y_pred1 - l01) + (1 + l01) * Z * (1 - D) * (Y - y_pred1) / pscore - ((1 + l01) * y_pred1 - l01) * Z * (D - d_pred1) / pscore, p.0i = (1 - d_pred0) * ((1 + l01) * y_pred0 - l01) + (1 + l01) * (1 - Z) * (1 - D) * (Y - y_pred0) / (1 - pscore) - ((1 + l01) * y_pred0 - l01) * (1 - Z) * (D - d_pred0) / (1 - pscore), p.i = p.1i - p.0i ) %>% pull(p.i) return(wts) } compute_bound_wts_dr <- function(Y, A, D, Z, nuis_funcs1, nuis_funcs2, pscore = NULL, l01 = 1) { ## update the propensity score if provided if (!is.null(pscore)) { nuis_funcs1$pscore <- pscore nuis_funcs2$pscore <- pscore } ## check whether Z is 0/1 binary variable if (any(sort(unique(Z)) != c(0, 1))) { stop("Z must be a binary variable") } data <- data.frame(Y = Y, D = D, Z = Z, A = A, pscore = nuis_funcs1$pscore) preds1 <- nuis_funcs1$z_models |> pivot_wider(names_from = c(Z), values_from = c(-Z, -idx), names_sep = "") |> select(-idx) preds2 <- nuis_funcs2$z_models |> pivot_wider(names_from = c(Z, A), values_from = c(-Z, -A, -idx), names_sep = "") |> select(-idx) data <- data %>% bind_cols(preds1, preds2) wts <- data %>% mutate( # gU0.i = 1 if z'=1 i.e. Pr(Y=0,D=0|A=0,Z=1,X=x) > Pr(Y=0,D=0|A=0,Z=0,X=x) gU0.i = 1 * ((1 - d_pred10) * (1 - y_pred10) >= (1 - d_pred00) * (1 - y_pred00)), # phi_z0(x): Pr(Y=0,D=0,A=0|Z=z,X=x) p10.i = (1 - A) * (1 - d_pred10) * (1 - y_pred10) - Z * (1 - A) * (1 - D) * (Y - y_pred10) / pscore - (1 - y_pred10) * Z * (1 - A) * (D - d_pred10) / pscore, p00.i = (1 - A) * (1 - d_pred00) * (1 - y_pred00) - (1 - Z) * (1 - A) * (1 - D) * (Y - y_pred00) / (1 - pscore) - (1 - y_pred00) * (1 - Z) * (1 - A) * (D - d_pred00) / (1 - pscore) ) %>% mutate( p10_gU0.i = p10.i * gU0.i, p00_gU0.i = p00.i * gU0.i ) %>% mutate( fn_diff_ub.0i = p01.i - p0.i + p00.D.i - p10_gU0.i + p00_gU0.i, fp_diff_ub.0i = p01.i - p0.i + p01.D.i - p10_gU0.i + p00_gU0.i, loss_diff_ub.0i = fn_diff_ub.0i + l01 * fp_diff_ub.0i ) %>% pull(loss_diff_ub.0i) return(wts) }
get_policy_value <- function(policy, n_obs = 1891) { v <- policy |> mutate(wts_policy = wts * policy) |> pull(wts_policy) |> sum() return(v/n_obs) }
Let's consider policy class with an increasing monotonicity constraint.
psaX <- cov_mat[,c("FTAScore", "NCAScore", "NVCAFlag")] nca_wts <- compute_wts_dr(Y = Y, D = D, Z = Z, nuis_funcs = nuis_func, pscore = 0.5, l01 = 1) nca_provide_policy <- get_monotone_policy_ai(X = psaX, wts = nca_wts)
## The estimated values of the empirical risk minimization problem get_policy_value(nca_provide_policy, n_obs = length(Y)) ## Visualization nca_provide_policy |> filter(NVCAFlag == 1) |> mutate( policy = factor(policy, levels = c(0, 1)), NVCAFlag = paste0("NVCA Flag: ", NVCAFlag) ) |> ggplot(aes(x = NCAScore, y = FTAScore, fill = policy)) + geom_tile(color = "grey50") + geom_segment(aes(x = 2.5, xend = 4.5, y = 4.5, yend = 4.5), lty = 3) + geom_segment(aes(x = 4.5, xend = 4.5, y = 4.5, yend = 1.5), lty = 3) + facet_grid(. ~ NVCAFlag) + scale_fill_brewer("PSA Recommendation Provision", type = "seq", palette = "Blues", direction = 1 ) + scale_y_reverse(breaks = 1:6) + scale_x_continuous(breaks = 1:6) + expand_limits(x = 0.5:6, y =0.5:6) + labs(x = "NCA Score", y = "FTA Score", title = "Whether to provide PSA recommendations") + coord_fixed(expand = F) + theme_classic(base_size = 15) + scale_fill_brewer("", breaks = 0:1, labels = c("Not Provide", "Provide")) + theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5)) + geom_text(aes(label = n), color = "grey10")
We can also consider policy class with a decreasing monotonicity constraint.
nca_provide_policy_dec <- get_monotone_policy_ai(X = psaX, wts = nca_wts, rev = TRUE)
get_policy_value(nca_provide_policy_dec, n_obs = length(Y))
Let's consider policy class with an increasing monotonicity constraint.
nca_wts <- compute_bound_wts_dr(Y = Y, A = A, D = D, Z = Z, nuis_funcs1 = nuis_func, nuis_funcs2 = nuis_func_ai, pscore = 0.5, l01 = 1) nca_follow_policy <- get_monotone_policy_ai(X = psaX, wts = nca_wts)
## The estimated values of the empirical risk minimization problem get_policy_value(nca_follow_policy, n_obs = length(Y)) ## Visualization nca_follow_policy |> filter(NVCAFlag == 1) |> mutate( policy = factor(policy, levels = c(0, 1)), NVCAFlag = paste0("NVCA Flag: ", NVCAFlag) ) |> ggplot(aes(x = NCAScore, y = FTAScore, fill = policy)) + geom_tile(color = "grey50") + geom_segment(aes(x = 2.5, xend = 4.5, y = 4.5, yend = 4.5), lty = 3) + geom_segment(aes(x = 4.5, xend = 4.5, y = 4.5, yend = 1.5), lty = 3) + facet_grid(. ~ NVCAFlag) + scale_fill_brewer("PSA Recommendation Provision", type = "seq", palette = "Blues", direction = 1 ) + scale_y_reverse(breaks = 1:6) + scale_x_continuous(breaks = 1:6) + expand_limits(x = 0.5:6, y =0.5:6) + labs(x = "NCA Score", y = "FTA Score", title = "Whether to follow PSA recommendations") + coord_fixed(expand = F) + theme_classic(base_size = 15) + scale_fill_brewer("", type = "seq", palette = "Reds", direction = 1, breaks = 0:1, label = c("Do not follow", "Follow")) + theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5)) + geom_text(aes(label = n), color = "grey10")
We can also consider policy class with a decreasing monotonicity constraint.
nca_follow_policy_dec <- get_monotone_policy_ai(X = psaX, wts = nca_wts, rev = TRUE)
get_policy_value(nca_follow_policy_dec, n_obs = length(Y))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.