pooling.R
In rstanarm: Bayesian Applied Regression Modeling via Stan

## ----knitr-settings, include=FALSE--------------------------------------------
stopifnot(require(knitr))
opts_chunk$set(
  comment=NA, 
  message = FALSE, 
  warning = FALSE, 
  eval = identical(Sys.getenv("NOT_CRAN"), "true"),
  dev = "png",
  dpi = 150,
  fig.asp = 0.618,
  fig.width = 7,
  out.width = "70%",
  fig.align = "center"
)

## ----SETTINGS-gg, include=TRUE------------------------------------------------
library(ggplot2)
library(bayesplot)
theme_set(bayesplot::theme_default())

## ----load-data----------------------------------------------------------------
library(rstanarm)
data(bball1970)
bball <- bball1970
print(bball)

## ----N-K-y--------------------------------------------------------------------
# A few quantities we'll use throughout
N <- nrow(bball)
K <- bball$AB
y <- bball$Hits
K_new <- bball$RemainingAB
y_new <- bball$RemainingHits

## ----create-objects, results="hold"-------------------------------------------
batting_avg <- function(x) print(format(round(x, digits = 3), nsmall = 3), quote = FALSE)
player_avgs <- y / K # player avgs through 45 AB
tot_avg <- sum(y) / sum(K) # overall avg through 45 AB

cat("Player averages through 45 at-bats:\n")
batting_avg(player_avgs)
cat("Overall average through 45 at-bats:\n")
batting_avg(tot_avg)

## ----echo=FALSE---------------------------------------------------------------
par(mfrow = c(1,3), las = 1)
p_alpha <- function(alpha) {
  dnorm(alpha, -1, 1)
}
p_theta <- function(theta) {
  dnorm(log(theta) - log1p(-theta), -1, 1) / (theta - theta^2)
}
curve2 <- function(expr, limits, xlab, ...) {
  curve(expr, from = limits[1], to = limits[2], xlab = xlab,
    lwd = 3, bty = "l", ylab = "", cex.lab = 1.5, ...)
}
curve2(p_alpha, c(-3, 1), expression(alpha))
text(x = 0.25, y = 0.35, labels = expression(p(alpha)), cex = 1.5)
curve2(p_theta, c(0, 1), expression(theta), col = "red", ylim = c(0, 2.5))
text(x = 0.575, y = 1.5, labels = expression(p(theta)), cex = 1.5, col = "red")
curve2(p_alpha, c(-3, 1), expression(paste(alpha,", ", theta)), ylim = c(0, 2.5))
curve2(p_theta, c(0,1), col = "red", add = TRUE)
text(x = -1, y = 0.65, labels = expression(p(alpha)), cex = 1.5)
text(x = -0.5, y = 1.5, labels = expression(p(theta)), cex = 1.5, col = "red")

## ----full-pooling, results="hide"---------------------------------------------
SEED <- 101
wi_prior <- normal(-1, 1)  # weakly informative prior on log-odds
fit_pool <- stan_glm(cbind(Hits, AB - Hits) ~ 1, data = bball, family = binomial("logit"),
                     prior_intercept = wi_prior, seed = SEED)

## ----summary-stats-function---------------------------------------------------
invlogit <- plogis  # function(x) 1/(1 + exp(-x))
summary_stats <- function(posterior) {
  x <- invlogit(posterior)  # log-odds -> probabilities
  t(apply(x, 2, quantile, probs = c(0.1, 0.5, 0.9))) 
}

pool <- summary_stats(as.matrix(fit_pool))  # as.matrix extracts the posterior draws
pool <- matrix(pool,  # replicate to give each player the same estimates
               nrow(bball), ncol(pool), byrow = TRUE, 
               dimnames = list(bball$Player, c("10%", "50%", "90%")))
batting_avg(pool)

## ----no-pooling, results="hide"-----------------------------------------------
fit_nopool <- update(fit_pool, formula = . ~ 0 + Player, prior = wi_prior)
nopool <- summary_stats(as.matrix(fit_nopool))
rownames(nopool) <- as.character(bball$Player)
batting_avg(nopool)

## ----no-pooling-print, echo=FALSE---------------------------------------------
batting_avg(nopool)

## ----partial-pooling, results="hide"------------------------------------------
fit_partialpool <- 
  stan_glmer(cbind(Hits, AB - Hits) ~ (1 | Player), data = bball, 
             family = binomial("logit"),
             prior_intercept = wi_prior, seed = SEED)

## ----partial-pooling-shift-draws----------------------------------------------
# shift each player's estimate by intercept (and then drop intercept)
shift_draws <- function(draws) {
  sweep(draws[, -1], MARGIN = 1, STATS = draws[, 1], FUN = "+")
}
alphas <- shift_draws(as.matrix(fit_partialpool))
partialpool <- summary_stats(alphas)
partialpool <- partialpool[-nrow(partialpool),]
rownames(partialpool) <- as.character(bball$Player)
batting_avg(partialpool)

## ----plot-observed-vs-estimated-----------------------------------------------
library(ggplot2)
models <- c("complete pooling", "no pooling", "partial pooling")
estimates <- rbind(pool, nopool, partialpool)
colnames(estimates) <- c("lb", "median", "ub")
plotdata <- data.frame(estimates, 
                       observed = rep(player_avgs, times = length(models)), 
                       model = rep(models, each = N), 
                       row.names = NULL)

ggplot(plotdata, aes(x = observed, y = median, ymin = lb, ymax = ub)) +
  geom_hline(yintercept = tot_avg, color = "lightpink", size = 0.75) +
  geom_abline(intercept = 0, slope = 1, color = "skyblue") + 
  geom_linerange(color = "gray60", size = 0.75) + 
  geom_point(size = 2.5, shape = 21, fill = "gray30", color = "white", stroke = 0.2) + 
  facet_grid(. ~ model) +
  coord_fixed() +
  scale_x_continuous(breaks = c(0.2, 0.3, 0.4)) +
  labs(x = "Observed Hits / AB", y = "Predicted chance of hit") +
  ggtitle("Posterior Medians and 80% Intervals")

## ----log_p_new----------------------------------------------------------------
newdata <- data.frame(Hits = y_new, AB = K_new, Player = bball$Player)
fits <- list(Pooling = fit_pool, 
             NoPooling = fit_nopool, 
             PartialPooling = fit_partialpool)

# compute log_p_new matrix with each of the models in 'fits'
log_p_new_mats <- lapply(fits, log_lik, newdata = newdata)

# for each matrix in the list take the row sums
log_p_new <- sapply(log_p_new_mats, rowSums)
M <- nrow(log_p_new)
head(log_p_new)

## ----log_p_new-mean-----------------------------------------------------------
mean_log_p_new <- colMeans(log_p_new)
round(sort(mean_log_p_new, decreasing = TRUE), digits = 1)

## ----log_sum_exp--------------------------------------------------------------
log_sum_exp <- function(u) {
  max_u <- max(u)
  a <- 0
  for (n in 1:length(u)) {
    a <- a + exp(u[n] - max_u)
  }
  max_u + log(a)
}

# Or equivalently using vectorization
log_sum_exp <- function(u) {
  max_u <- max(u)
  max_u + log(sum(exp(u - max_u)))
}

## ----log_mean_exp-------------------------------------------------------------
log_mean_exp <- function(u) {
  M <- length(u)
  -log(M) + log_sum_exp(u)
}

## ----comment=NA---------------------------------------------------------------
new_lps <- lapply(log_p_new_mats, function(x) apply(x, 2, log_mean_exp))

# sum over the data points
new_lps_sums <- sapply(new_lps, sum)

round(sort(new_lps_sums, decreasing = TRUE), digits = 1)

## ----loo----------------------------------------------------------------------
loo_compare(loo(fit_partialpool), loo(fit_pool), loo(fit_nopool))

## ----ppd----------------------------------------------------------------------
newdata <- data.frame(Hits = y_new, AB = K_new, Player = bball$Player)
ppd_pool <- posterior_predict(fit_pool, newdata)
ppd_nopool <- posterior_predict(fit_nopool, newdata)
ppd_partialpool <- posterior_predict(fit_partialpool, newdata)
colnames(ppd_pool) <- colnames(ppd_nopool) <- colnames(ppd_partialpool) <- as.character(bball$Player)
colMeans(ppd_partialpool)

## ----clemente-----------------------------------------------------------------
z_1 <- ppd_partialpool[, 1]
clemente_80pct <- (y[1] + quantile(z_1, prob = c(0.1, 0.9))) / (K[1] + K_new[1])
batting_avg(clemente_80pct)

## ----ppd-stats----------------------------------------------------------------
ppd_intervals <- function(x) t(apply(x, 2, quantile, probs = c(0.25, 0.75)))
ppd_summaries <- (1 / K_new) * rbind(ppd_intervals(ppd_pool),
                                     ppd_intervals(ppd_nopool),
                                     ppd_intervals(ppd_partialpool))
df_ppd <- data.frame(player = rep(1:length(y_new), 3),
                     y = rep(y_new / K_new, 3),
                     lb = ppd_summaries[, "25%"],
                     ub = ppd_summaries[, "75%"],
                     model = rep(models, each = length(y_new)))

## ----plot-ppd-----------------------------------------------------------------
ggplot(df_ppd, aes(x=player, y=y, ymin=lb, ymax=ub)) + 
  geom_linerange(color = "gray60", size = 2) + 
  geom_point(size = 2.5, color = "skyblue4") +
  facet_grid(. ~ model) +
  labs(x = NULL, y = "batting average") + 
  scale_x_continuous(breaks = NULL) +
  ggtitle(expression(
    atop("Posterior Predictions for Batting Average in Remainder of Season",
         atop("50% posterior predictive intervals (gray bars); observed (blue dots)", ""))))

## ----event-probabilities, results="hold"--------------------------------------
draws_partialpool <- shift_draws(as.matrix(fit_partialpool))
thetas_partialpool <- plogis(draws_partialpool)
thetas_partialpool <- thetas_partialpool[,-ncol(thetas_partialpool)]
colnames(thetas_partialpool) <- as.character(bball$Player)
ability_gt_400 <- thetas_partialpool > 0.4
cat("Pr(theta_n >= 0.400 | y)\n")
colMeans(ability_gt_400)[c(1, 5, 10)]

some_gt_350 <- apply(thetas_partialpool, 1, function(x) max(x) > 0.35)
cat("Pr(at least one theta_n >= 0.350 | y)\n")
mean(some_gt_350)

## ----echo=FALSE---------------------------------------------------------------
thetas_pool <- plogis(as.matrix(fit_pool))
thetas_nopool <- plogis(as.matrix(fit_nopool))
some_gt_350_all <- sapply(list(thetas_pool, thetas_nopool, thetas_partialpool), 
                          function(x) apply(x, 1, max) > 0.35)
chance_gt_350 <- round(100 * colMeans(some_gt_350_all))

## ----ranking------------------------------------------------------------------
reverse_rank <- function(x) 1 + length(x) - rank(x) # so lower rank is better
rank <- apply(thetas_partialpool, 1, reverse_rank)
t(apply(rank, 1, quantile, prob = c(0.1, 0.5, 0.9)))

## ----plot-ranks---------------------------------------------------------------
df_rank <- data.frame(name = rep(bball$Player, each = M), 
                      rank = c(t(rank)))

ggplot(df_rank, aes(rank)) +
  stat_count(width = 0.8) +
  facet_wrap(~ name) +
  scale_x_discrete("Rank", limits = c(1, 5, 10, 15)) +
  scale_y_discrete("Probability", limits = c(0, 0.1 * M, 0.2 * M),
                   labels = c("0.0", "0.1", "0.2")) + 
  ggtitle("Rankings for Partial Pooling Model")

## ----plot-best-player---------------------------------------------------------
thetas_nopool <- plogis(as.matrix(fit_nopool))
colnames(thetas_nopool) <- as.character(bball$Player)
rank_nopool <- apply(thetas_nopool, 1, reverse_rank)
is_best_nopool <- rowMeans(rank_nopool == 1)
is_best_partialpool <- rowMeans(rank == 1)

df_is_best <- data.frame(unit = rep(bball$Player, 2), 
                         is_best = c(is_best_partialpool, is_best_nopool), 
                         model = rep(c("partial pooling", "no pooling"), each = N))

ggplot(df_is_best, aes(x=unit, y=is_best)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ model) +
  scale_y_continuous(name = "Pr[player is best]") +
  ggtitle("Who is the Best Player?") + 
  theme(axis.text.x = element_text(angle = -45, vjust = 1, hjust = 0))

## ----plot-ppc-stats-mean------------------------------------------------------
pp_check(fit_nopool, plotfun = "stat", stat = "mean")

## ----plot-ppc-stats-----------------------------------------------------------
tstat_plots <- function(model, stats) {
  lapply(stats, function(stat) {
    graph <- pp_check(model, plotfun = "stat", stat = stat, 
                      seed = SEED) # optional arguments
    graph + xlab(stat) + theme(legend.position = "none")
  })
}
Tstats <- c("mean", "sd", "min", "max")
ppcs_pool <- tstat_plots(fit_pool, Tstats)
ppcs_nopool <- tstat_plots(fit_nopool, Tstats)
ppcs_partialpool <- tstat_plots(fit_partialpool, Tstats)

if (require(gridExtra)) {
  grid.arrange(
    arrangeGrob(grobs = ppcs_pool, nrow = 1, left = "Pooling"), 
    arrangeGrob(grobs = ppcs_nopool, nrow = 1, left = "No Pooling"),
    arrangeGrob(grobs = ppcs_partialpool, nrow = 1, left = "Partial Pooling")
  )
}

## ----p-value------------------------------------------------------------------
yrep <- posterior_predict(fit_nopool, seed = SEED) # seed is optional
Ty <- sd(y)
Tyrep <- apply(yrep, 1, sd)

# tail-area probability
p <- 1 - mean(Tyrep > Ty)
print(p)

## ----plot-ppc-y-vs-yrep-------------------------------------------------------
pp_check(fit_partialpool, plotfun = "hist", nreps = 15, binwidth = 0.025) +
  ggtitle("Model: Partial Pooling")