Nothing
## ---- knitr-settings, include=FALSE-------------------------------------------
stopifnot(require(knitr))
opts_chunk$set(
comment=NA,
message = FALSE,
warning = FALSE,
eval = identical(Sys.getenv("NOT_CRAN"), "true"),
dev = "png",
dpi = 150,
fig.asp = 0.618,
fig.width = 7,
out.width = "70%",
fig.align = "center"
)
## ---- SETTINGS-gg, include=TRUE-----------------------------------------------
library(ggplot2)
library(bayesplot)
theme_set(bayesplot::theme_default())
## ---- load-data---------------------------------------------------------------
library(rstanarm)
data(bball1970)
bball <- bball1970
print(bball)
## ---- N-K-y-------------------------------------------------------------------
# A few quantities we'll use throughout
N <- nrow(bball)
K <- bball$AB
y <- bball$Hits
K_new <- bball$RemainingAB
y_new <- bball$RemainingHits
## ---- create-objects, results="hold"------------------------------------------
batting_avg <- function(x) print(format(round(x, digits = 3), nsmall = 3), quote = FALSE)
player_avgs <- y / K # player avgs through 45 AB
tot_avg <- sum(y) / sum(K) # overall avg through 45 AB
cat("Player averages through 45 at-bats:\n")
batting_avg(player_avgs)
cat("Overall average through 45 at-bats:\n")
batting_avg(tot_avg)
## ---- echo=FALSE--------------------------------------------------------------
par(mfrow = c(1,3), las = 1)
p_alpha <- function(alpha) {
dnorm(alpha, -1, 1)
}
p_theta <- function(theta) {
dnorm(log(theta) - log1p(-theta), -1, 1) / (theta - theta^2)
}
curve2 <- function(expr, limits, xlab, ...) {
curve(expr, from = limits[1], to = limits[2], xlab = xlab,
lwd = 3, bty = "l", ylab = "", cex.lab = 1.5, ...)
}
curve2(p_alpha, c(-3, 1), expression(alpha))
text(x = 0.25, y = 0.35, labels = expression(p(alpha)), cex = 1.5)
curve2(p_theta, c(0, 1), expression(theta), col = "red", ylim = c(0, 2.5))
text(x = 0.575, y = 1.5, labels = expression(p(theta)), cex = 1.5, col = "red")
curve2(p_alpha, c(-3, 1), expression(paste(alpha,", ", theta)), ylim = c(0, 2.5))
curve2(p_theta, c(0,1), col = "red", add = TRUE)
text(x = -1, y = 0.65, labels = expression(p(alpha)), cex = 1.5)
text(x = -0.5, y = 1.5, labels = expression(p(theta)), cex = 1.5, col = "red")
## ---- full-pooling, results="hide"--------------------------------------------
SEED <- 101
wi_prior <- normal(-1, 1) # weakly informative prior on log-odds
fit_pool <- stan_glm(cbind(Hits, AB - Hits) ~ 1, data = bball, family = binomial("logit"),
prior_intercept = wi_prior, seed = SEED)
## ---- summary-stats-function--------------------------------------------------
invlogit <- plogis # function(x) 1/(1 + exp(-x))
summary_stats <- function(posterior) {
x <- invlogit(posterior) # log-odds -> probabilities
t(apply(x, 2, quantile, probs = c(0.1, 0.5, 0.9)))
}
pool <- summary_stats(as.matrix(fit_pool)) # as.matrix extracts the posterior draws
pool <- matrix(pool, # replicate to give each player the same estimates
nrow(bball), ncol(pool), byrow = TRUE,
dimnames = list(bball$Player, c("10%", "50%", "90%")))
batting_avg(pool)
## ---- no-pooling, results="hide"----------------------------------------------
fit_nopool <- update(fit_pool, formula = . ~ 0 + Player, prior = wi_prior)
nopool <- summary_stats(as.matrix(fit_nopool))
rownames(nopool) <- as.character(bball$Player)
batting_avg(nopool)
## ---- no-pooling-print, echo=FALSE--------------------------------------------
batting_avg(nopool)
## ---- partial-pooling, results="hide"-----------------------------------------
fit_partialpool <-
stan_glmer(cbind(Hits, AB - Hits) ~ (1 | Player), data = bball,
family = binomial("logit"),
prior_intercept = wi_prior, seed = SEED)
## ---- partial-pooling-shift-draws---------------------------------------------
# shift each player's estimate by intercept (and then drop intercept)
shift_draws <- function(draws) {
sweep(draws[, -1], MARGIN = 1, STATS = draws[, 1], FUN = "+")
}
alphas <- shift_draws(as.matrix(fit_partialpool))
partialpool <- summary_stats(alphas)
partialpool <- partialpool[-nrow(partialpool),]
rownames(partialpool) <- as.character(bball$Player)
batting_avg(partialpool)
## ---- plot-observed-vs-estimated----------------------------------------------
library(ggplot2)
models <- c("complete pooling", "no pooling", "partial pooling")
estimates <- rbind(pool, nopool, partialpool)
colnames(estimates) <- c("lb", "median", "ub")
plotdata <- data.frame(estimates,
observed = rep(player_avgs, times = length(models)),
model = rep(models, each = N),
row.names = NULL)
ggplot(plotdata, aes(x = observed, y = median, ymin = lb, ymax = ub)) +
geom_hline(yintercept = tot_avg, color = "lightpink", size = 0.75) +
geom_abline(intercept = 0, slope = 1, color = "skyblue") +
geom_linerange(color = "gray60", size = 0.75) +
geom_point(size = 2.5, shape = 21, fill = "gray30", color = "white", stroke = 0.2) +
facet_grid(. ~ model) +
coord_fixed() +
scale_x_continuous(breaks = c(0.2, 0.3, 0.4)) +
labs(x = "Observed Hits / AB", y = "Predicted chance of hit") +
ggtitle("Posterior Medians and 80% Intervals")
## ---- log_p_new---------------------------------------------------------------
newdata <- data.frame(Hits = y_new, AB = K_new, Player = bball$Player)
fits <- list(Pooling = fit_pool,
NoPooling = fit_nopool,
PartialPooling = fit_partialpool)
# compute log_p_new matrix with each of the models in 'fits'
log_p_new_mats <- lapply(fits, log_lik, newdata = newdata)
# for each matrix in the list take the row sums
log_p_new <- sapply(log_p_new_mats, rowSums)
M <- nrow(log_p_new)
head(log_p_new)
## ---- log_p_new-mean----------------------------------------------------------
mean_log_p_new <- colMeans(log_p_new)
round(sort(mean_log_p_new, decreasing = TRUE), digits = 1)
## ---- log_sum_exp-------------------------------------------------------------
log_sum_exp <- function(u) {
max_u <- max(u)
a <- 0
for (n in 1:length(u)) {
a <- a + exp(u[n] - max_u)
}
max_u + log(a)
}
# Or equivalently using vectorization
log_sum_exp <- function(u) {
max_u <- max(u)
max_u + log(sum(exp(u - max_u)))
}
## ---- log_mean_exp------------------------------------------------------------
log_mean_exp <- function(u) {
M <- length(u)
-log(M) + log_sum_exp(u)
}
## ----comment=NA---------------------------------------------------------------
new_lps <- lapply(log_p_new_mats, function(x) apply(x, 2, log_mean_exp))
# sum over the data points
new_lps_sums <- sapply(new_lps, sum)
round(sort(new_lps_sums, decreasing = TRUE), digits = 1)
## ---- loo---------------------------------------------------------------------
loo_compare(loo(fit_partialpool), loo(fit_pool), loo(fit_nopool))
## ---- ppd---------------------------------------------------------------------
newdata <- data.frame(Hits = y_new, AB = K_new, Player = bball$Player)
ppd_pool <- posterior_predict(fit_pool, newdata)
ppd_nopool <- posterior_predict(fit_nopool, newdata)
ppd_partialpool <- posterior_predict(fit_partialpool, newdata)
colnames(ppd_pool) <- colnames(ppd_nopool) <- colnames(ppd_partialpool) <- as.character(bball$Player)
colMeans(ppd_partialpool)
## ---- clemente----------------------------------------------------------------
z_1 <- ppd_partialpool[, 1]
clemente_80pct <- (y[1] + quantile(z_1, prob = c(0.1, 0.9))) / (K[1] + K_new[1])
batting_avg(clemente_80pct)
## ---- ppd-stats---------------------------------------------------------------
ppd_intervals <- function(x) t(apply(x, 2, quantile, probs = c(0.25, 0.75)))
ppd_summaries <- (1 / K_new) * rbind(ppd_intervals(ppd_pool),
ppd_intervals(ppd_nopool),
ppd_intervals(ppd_partialpool))
df_ppd <- data.frame(player = rep(1:length(y_new), 3),
y = rep(y_new / K_new, 3),
lb = ppd_summaries[, "25%"],
ub = ppd_summaries[, "75%"],
model = rep(models, each = length(y_new)))
## ---- plot-ppd----------------------------------------------------------------
ggplot(df_ppd, aes(x=player, y=y, ymin=lb, ymax=ub)) +
geom_linerange(color = "gray60", size = 2) +
geom_point(size = 2.5, color = "skyblue4") +
facet_grid(. ~ model) +
labs(x = NULL, y = "batting average") +
scale_x_continuous(breaks = NULL) +
ggtitle(expression(
atop("Posterior Predictions for Batting Average in Remainder of Season",
atop("50% posterior predictive intervals (gray bars); observed (blue dots)", ""))))
## ---- event-probabilities, results="hold"-------------------------------------
draws_partialpool <- shift_draws(as.matrix(fit_partialpool))
thetas_partialpool <- plogis(draws_partialpool)
thetas_partialpool <- thetas_partialpool[,-ncol(thetas_partialpool)]
colnames(thetas_partialpool) <- as.character(bball$Player)
ability_gt_400 <- thetas_partialpool > 0.4
cat("Pr(theta_n >= 0.400 | y)\n")
colMeans(ability_gt_400)[c(1, 5, 10)]
some_gt_350 <- apply(thetas_partialpool, 1, function(x) max(x) > 0.35)
cat("Pr(at least one theta_n >= 0.350 | y)\n")
mean(some_gt_350)
## ---- echo=FALSE--------------------------------------------------------------
thetas_pool <- plogis(as.matrix(fit_pool))
thetas_nopool <- plogis(as.matrix(fit_nopool))
some_gt_350_all <- sapply(list(thetas_pool, thetas_nopool, thetas_partialpool),
function(x) apply(x, 1, max) > 0.35)
chance_gt_350 <- round(100 * colMeans(some_gt_350_all))
## ---- ranking-----------------------------------------------------------------
reverse_rank <- function(x) 1 + length(x) - rank(x) # so lower rank is better
rank <- apply(thetas_partialpool, 1, reverse_rank)
t(apply(rank, 1, quantile, prob = c(0.1, 0.5, 0.9)))
## ---- plot-ranks--------------------------------------------------------------
df_rank <- data.frame(name = rep(bball$Player, each = M),
rank = c(t(rank)))
ggplot(df_rank, aes(rank)) +
stat_count(width = 0.8) +
facet_wrap(~ name) +
scale_x_discrete("Rank", limits = c(1, 5, 10, 15)) +
scale_y_discrete("Probability", limits = c(0, 0.1 * M, 0.2 * M),
labels = c("0.0", "0.1", "0.2")) +
ggtitle("Rankings for Partial Pooling Model")
## ---- plot-best-player--------------------------------------------------------
thetas_nopool <- plogis(as.matrix(fit_nopool))
colnames(thetas_nopool) <- as.character(bball$Player)
rank_nopool <- apply(thetas_nopool, 1, reverse_rank)
is_best_nopool <- rowMeans(rank_nopool == 1)
is_best_partialpool <- rowMeans(rank == 1)
df_is_best <- data.frame(unit = rep(bball$Player, 2),
is_best = c(is_best_partialpool, is_best_nopool),
model = rep(c("partial pooling", "no pooling"), each = N))
ggplot(df_is_best, aes(x=unit, y=is_best)) +
geom_bar(stat = "identity") +
facet_wrap(~ model) +
scale_y_continuous(name = "Pr[player is best]") +
ggtitle("Who is the Best Player?") +
theme(axis.text.x = element_text(angle = -45, vjust = 1, hjust = 0))
## ---- plot-ppc-stats-mean-----------------------------------------------------
pp_check(fit_nopool, plotfun = "stat", stat = "mean")
## ---- plot-ppc-stats----------------------------------------------------------
tstat_plots <- function(model, stats) {
lapply(stats, function(stat) {
graph <- pp_check(model, plotfun = "stat", stat = stat,
seed = SEED) # optional arguments
graph + xlab(stat) + theme(legend.position = "none")
})
}
Tstats <- c("mean", "sd", "min", "max")
ppcs_pool <- tstat_plots(fit_pool, Tstats)
ppcs_nopool <- tstat_plots(fit_nopool, Tstats)
ppcs_partialpool <- tstat_plots(fit_partialpool, Tstats)
if (require(gridExtra)) {
grid.arrange(
arrangeGrob(grobs = ppcs_pool, nrow = 1, left = "Pooling"),
arrangeGrob(grobs = ppcs_nopool, nrow = 1, left = "No Pooling"),
arrangeGrob(grobs = ppcs_partialpool, nrow = 1, left = "Partial Pooling")
)
}
## ---- p-value-----------------------------------------------------------------
yrep <- posterior_predict(fit_nopool, seed = SEED) # seed is optional
Ty <- sd(y)
Tyrep <- apply(yrep, 1, sd)
# tail-area probability
p <- 1 - mean(Tyrep > Ty)
print(p)
## ---- plot-ppc-y-vs-yrep------------------------------------------------------
pp_check(fit_partialpool, plotfun = "hist", nreps = 15, binwidth = 0.025) +
ggtitle("Model: Partial Pooling")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.