# DOCUMENTATION -----------------------------------------------------------
# Author: Jesse Bruhn
# Contact: jbruhn@princeton.edu
# PREAMBLE ----------------------------------------------------------------
#Clear workspace
rm(list=ls())
#Load Packages
pckgs <- c("tidyverse")
lapply(pckgs, library, character.only=TRUE)
#Set Options
options(scipen=100)
#Clean Workspace
rm(pckgs)
#Session Info for Reproducibility
Sys.Date()
sessionInfo()
#Set random number seed for reproducibility
set.seed(343408)
# NOTES -------------------------------------------------------------------
# This is inspired by another classic application of empirical Bayes. In
# baseball, a pitcher throws a ball. The batter has to hit the ball with a
# bat and run to a white bag called a base. If the batter misses the ball,
# or gets tagged with the ball after hitting it, he is out. If not, he is
# said to get a "hit".
# Thus every time a batter steps up to the plate, he either gets out (0) or
# gets a hit (1). The number of times a batter gets a hit, divided by the
# total number of times he attempts to hit, is known as his "batting
# average". Players with a high batting average are more valuable than
# players with a low batting average.
# We want to use data on players' historical batting averages to predict
# who will be the most valuable players in a subsequent year.
# EXAMPLE -----------------------------------------------------------------
#First we extract data from the "Lahman" package for the years 2000-2018.
require("Lahman")
baseball_data <- Lahman::Batting %>%
  as_tibble() %>%
  filter(yearID >= 2000)
#only keep the variables we need
baseball_data <- baseball_data %>%
  select(playerID, yearID, AB, H, X2B, X3B, HR)
#remove player-years with fewer than 2 at bats (the batting average is
#undefined or degenerate for them)
baseball_data <- baseball_data %>%
  filter(AB > 1)
#and calculate batting averages
baseball_data <- baseball_data %>%
  mutate(batting_average = H/AB)
#Create a player ID factor variable that will be useful later
baseball_data <- baseball_data %>%
  mutate(player_factor = as_factor(playerID))
#Now we split the data into training and testing data. We will use the
#earlier data to train the model and the later data (2018) to test our
#model fit
train <- baseball_data %>%
  filter(yearID < 2018)
test <- baseball_data %>%
  filter(yearID == 2018)
#Now we restrict the data to players who had at least 250 at bats in 2018.
#This ensures we only try to predict batting averages for players whose
#realized 2018 batting average can itself be measured reasonably precisely,
#which gives the out-of-sample exercise a fair benchmark.
test <- test %>%
  filter(AB >= 250)
train <- train %>%
  filter(playerID %in% unique(test$playerID))
#now filter out players in the test data for whom we have no training
#data
test <- test %>%
  filter(playerID %in% unique(train$playerID))
#finally, suppose there are only 50 players of interest that we are scouting,
#and hence we are only interested in predicting the 2018 performance of those
#50 players. Restricting the data in this way is not essential, but it makes
#the final results more dramatic because it shows that the method works well
#even when the pooled estimate is still noisy.
test <- sample_n(test, size = 50)
train <- train %>%
  filter(playerID %in% unique(test$playerID))
#observe that we have a very unequal number of at-bats and years of data
#across players in the training data
unequal <- train %>%
  group_by(playerID) %>%
  summarise(years_of_data = n(),
            total_at_bats = sum(AB),
            average_batting_average = mean(batting_average)) %>%
  arrange(total_at_bats)
head(unequal)
tail(unequal)
hist(unequal$total_at_bats,
     breaks = seq(from = 0, to = max(unequal$total_at_bats) + 100, by = 100))
hist(unequal$average_batting_average,
     breaks = seq(from = 0, to = max(unequal$average_batting_average) + .025, by = .025))
#The estimated batting averages for the players with 1000-5000 at bats may be
#reliable, but an average over only 100 at bats is probably not a good way to
#judge a player's value. What if we instead used a pooled estimate across all
#the players as our prediction for their future performance?
pooled_est <- mean(train$batting_average)
print(pooled_est)
#This seems like a better bet for the players without much experience,
#but is probably not a good way to evaluate the players who have a long
#history for us to judge.
#Now let's use empirical bayes to synthesize both estimates.
#First, we will get our unpooled estimates from a linear model that assumes
#homoskedastic errors. Note that we weight by the number of at bats to
#recover the estimates we would get by computing each player's career
#batting average (total hits divided by total at bats).
unpooled.model <- lm(batting_average ~ player_factor, train, weights = AB)
test <- test %>%
  mutate(unpooled_estimate = predict(unpooled.model, test),
         unpooled_std_error = predict(unpooled.model, test, se.fit = TRUE)$se.fit)
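#Sanity check (a sketch): because we weighted by AB, each player's fitted
#value should equal his career batting average, total hits over total at bats
career_avg <- train %>%
  group_by(playerID) %>%
  summarise(career_ba = sum(H) / sum(AB))
check <- test %>%
  select(playerID, unpooled_estimate) %>%
  left_join(career_avg, by = "playerID")
all.equal(as.numeric(check$unpooled_estimate), check$career_ba)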
#Equivalent alternative, kept for reference: fit the model without an
#intercept and join the per-player coefficients onto the test data.
# unpooled.model <- lm(batting_average ~ -1 + player_factor, train, weights = AB)
#
# unpooled.model.coefs <- broom::tidy(unpooled.model) %>%
#   mutate(playerID = str_replace(term, "player_factor", "")) %>%
#   select(playerID, estimate, std.error) %>%
#   rename(unpooled_estimate = estimate,
#          unpooled_std_error = std.error)
#
# test <- test %>%
#   left_join(unpooled.model.coefs, by = "playerID")
# plot(test$unpooled_estimate, test$batting_average)
#now let's get the pooled estimate using a regression as well
pooled.model <- lm(batting_average ~ 1, train, weights = AB)
test <- test %>%
  mutate(pooled_estimate = predict(pooled.model, test))
#which estimate should we use? Why not both? Denote player i's true batting
#average by theta_i. Our unpooled estimate is distributed
#theta_i_hat ~ N(theta_i, tau_i_hat^2), where tau_i_hat is the standard
#error of the sampling distribution. Now suppose that the individual
#batting averages are drawn from a normally distributed population of
#heterogeneous batting averages, such that theta_i ~ N(theta, sigma^2).
#Note that we have already calculated theta_i_hat and tau_i_hat, so
#let's collect those here in matrix form:
theta_i_hat <- matrix(test$unpooled_estimate)
tau_i_hat <- diag(test$unpooled_std_error^2)
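#Under this normal-normal model, the posterior mean of theta_i is the
#precision-weighted average of the two estimates (a standard result, written
#here for reference):
#  theta_i_eb = (theta_hat/sigma^2 + theta_i_hat/tau_i^2) / (1/sigma^2 + 1/tau_i^2)
#In the matrix form used below, with Gamma = (Tau^-1 + Sigma^-1)^-1:
#  theta_eb = Gamma Sigma^-1 theta_hat + Gamma Tau^-1 theta_i_hat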
#We don't know theta or sigma. But we can estimate them.
#Theta we can estimate via the pooled regression we already fit above:
theta_hat <- matrix(test$pooled_estimate)
#To estimate sigma, note that the difference between the residuals of the
#pooled model and the unpooled model is theta_i_hat - theta_hat, player i's
#estimated deviation from the pooled mean. Thus
#var(u_pooled - u_unpooled) ~ var(theta_i - theta) = sigma^2
sigma_hat <- diag(rep(var(pooled.model$residuals - unpooled.model$residuals),
                      length.out = nrow(test)))
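#Quick diagnostic (a sketch): players whose sampling sd tau_i is large
#relative to the population sd sigma will be shrunk hardest toward the pool
summary(sqrt(diag(tau_i_hat)) / sqrt(sigma_hat[1, 1]))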
#now let's calculate the empirical bayes estimates
tau_i_hat_inv <- solve(tau_i_hat)
sigma_hat_inv <- solve(sigma_hat)
#Gamma is the posterior variance; W0 and W1 are the precision weights on the
#pooled and unpooled estimates, and they sum to the identity matrix
Gamma <- solve(tau_i_hat_inv + sigma_hat_inv)
W0 <- Gamma %*% sigma_hat_inv
W1 <- Gamma %*% tau_i_hat_inv
theta_eb <- W0 %*% theta_hat + W1 %*% theta_i_hat
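#Sanity check (a sketch): since tau_i_hat and sigma_hat are both diagonal,
#the matrix formula should reduce to an element-wise precision-weighted average
eb_scalar <- (test$pooled_estimate / diag(sigma_hat) +
                test$unpooled_estimate / diag(tau_i_hat)) /
  (1 / diag(sigma_hat) + 1 / diag(tau_i_hat))
all.equal(as.numeric(theta_eb), as.numeric(eb_scalar))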
test <- test %>%
  mutate(eb_estimate = as.numeric(theta_eb))
#Now we will compare the predicted batting averages we estimated using the
#earlier data to the actual realizations in 2018
accuracy <- test %>%
  summarise(mse_unpooled = mean((batting_average - unpooled_estimate)^2),
            mse_pooled = mean((batting_average - pooled_estimate)^2),
            mse_eb = mean((batting_average - eb_estimate)^2))
print(accuracy)
#And we see that the empirical Bayes estimates are closer to the true
#parameters on average than either the unpooled estimator or the pooled
#estimator. In fact the improvements are substantial:
accuracy$mse_unpooled/accuracy$mse_eb
accuracy$mse_pooled/accuracy$mse_eb
#EB crushes both the unpooled and the pooled estimators on out-of-sample MSE.
#Let's see why by plotting all three estimates for each player, with the
#accompanying error bars for the unpooled estimate
test %>%
  arrange(unpooled_estimate) %>%
  ggplot(aes(x = as_factor(playerID), y = unpooled_estimate)) +
  geom_pointrange(aes(ymin = unpooled_estimate - 1.96*unpooled_std_error,
                      ymax = unpooled_estimate + 1.96*unpooled_std_error),
                  alpha = .5) +
  geom_point(color = "blue") +
  geom_point(aes(y = eb_estimate), color = "red") +
  geom_hline(yintercept = test$pooled_estimate[1], color = "blue")
#empirical Bayes takes the noisy estimates and shrinks them towards the
#pooled mean, while leaving the more precisely estimated batting averages
#largely alone
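#We can see the shrinkage weights directly: the diagonal of W1 is each
#player's weight on his own unpooled estimate, so values near 1 mean little
#shrinkage and values near 0 mean heavy shrinkage toward the pooled mean
summary(diag(W1))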
# SYNTAX FOR R PACKAGE ----------------------------------------------------
#This example differs from 8 schools because we are estimating the parameters
#ourselves in linear models using microdata, rather than just taking
#point estimates + standard errors from a set of studies.
#I think we want to make it easy for the user to pass the models they have
#estimated directly to our package and let the work happen in the back end.
#This will also require the user to specify a formula designating which
#parameters in the unpooled model should be shrunk, as well as which
#parameters in the pooled model describe the pooled mean.
#For the baseball example, the function call would look like:
# simpleShrink(
#   pooled_model,
#   unpooled_model,
#   shrink_formula = player_factor ~ 1
# )
#and return the weights, the unpooled estimates, the empirical bayes estimates,
#posterior variance, etc.
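#A minimal sketch of what the back end might do, assuming both inputs are lm
#fits; the function name, arguments, and return structure are illustrative,
#not a final API, and the shrink_formula plumbing is omitted:
simpleShrink_sketch <- function(pooled_model, unpooled_model, newdata) {
  up <- predict(unpooled_model, newdata, se.fit = TRUE)
  theta_i_hat <- matrix(up$fit)
  tau_i_hat <- diag(up$se.fit^2)
  theta_hat <- matrix(predict(pooled_model, newdata))
  #sigma^2 from the variance of the residual difference, as above
  sigma2 <- var(pooled_model$residuals - unpooled_model$residuals)
  sigma_hat <- diag(rep(sigma2, length.out = nrow(newdata)))
  Gamma <- solve(solve(tau_i_hat) + solve(sigma_hat))
  W0 <- Gamma %*% solve(sigma_hat)
  W1 <- Gamma %*% solve(tau_i_hat)
  list(eb_estimates = as.numeric(W0 %*% theta_hat + W1 %*% theta_i_hat),
       unpooled_estimates = as.numeric(theta_i_hat),
       posterior_variance = diag(Gamma),
       shrinkage_weights = diag(W1))
}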
# MORE COMPLICATED BASEBALL EXAMPLE ---------------------------------------
#we now consider the same example as before, but we will allow the
#pooled mean to vary with additional observables, i.e. we will assume
#that theta_i ~ N(X_i'beta, sigma^2), where X_i is a vector of other
#observable characteristics (in this case weight, height, and age). First,
#we need to bring in some additional info about the players
player_info <- Lahman::People %>%
  select(playerID, weight, height, birthYear)
train <- train %>%
  left_join(player_info, by = "playerID") %>%
  mutate(age = yearID - birthYear)
test <- test %>%
  left_join(player_info, by = "playerID") %>%
  mutate(age = yearID - birthYear)
#fit the richer pooled model and get its predictions for the test players
alt_pooled_model <- lm(batting_average ~ weight + height + age, train, weights = AB)
test <- test %>%
  mutate(alt_pooled_estimate = predict(alt_pooled_model, test))
alt_theta_hat <- matrix(test$alt_pooled_estimate)
#note that tau_i_hat is unaffected by the more complicated pooled model. We
#also reuse sigma_hat, and hence the same weights as before, as an
#approximation; strictly, the variance of theta_i around the richer pooled
#mean could differ (we check this below).
alt_theta_eb <- W0 %*% alt_theta_hat + W1 %*% theta_i_hat
test <- test %>%
  mutate(alt_eb_estimate = as.numeric(alt_theta_eb))
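#As a check on that approximation (a sketch, assuming no rows were dropped
#from the alt model for missing covariates), re-estimate sigma^2 with the
#richer pooled model's residuals and compare it to the value we reused
alt_sigma2 <- var(alt_pooled_model$residuals - unpooled.model$residuals)
c(reused = sigma_hat[1, 1], refit = alt_sigma2)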
test %>%
  select(unpooled_estimate, pooled_estimate, eb_estimate,
         alt_pooled_estimate, alt_eb_estimate)
#and we will compare model fit out of sample once again.
accuracy <- test %>%
  summarise(mse_unpooled = mean((batting_average - unpooled_estimate)^2),
            mse_pooled = mean((batting_average - pooled_estimate)^2),
            mse_eb = mean((batting_average - eb_estimate)^2),
            mse_alt_eb = mean((batting_average - alt_eb_estimate)^2))
print(accuracy)
accuracy$mse_unpooled/accuracy$mse_alt_eb
accuracy$mse_pooled/accuracy$mse_alt_eb
accuracy$mse_eb/accuracy$mse_alt_eb
#and we see that the richer model for the pooled mean yields some small
#improvement.
# SYNTAX FOR R PACKAGE ----------------------------------------------------
#For the more complicated baseball example, the function call would look like:
# simpleShrink(
#   pooled_model,
#   unpooled_model,
#   shrink_formula = player_factor ~ height + weight + age
# )
#and return the weights, the unpooled estimates, the empirical bayes estimates,
#posterior variance, etc.