library(shiny)
library(ggplot2)
library(dplyr)
library(readr)
library(LearnBayes)
library(tidyr)
retro_data <- read_csv("https://raw.githubusercontent.com/bayesball/Predicting_Outcomes/main/retro2019data.csv")
prediction_exercise <- function(d,
date_break,
minAB = 100,
type = "H",
logn = 3,
ab = c(1, 1),
exclude_pitchers = "Yes"){
# define Outcome variable
if(type == "H"){
d %>% mutate(Outcome = ifelse(H == 1, 1, 0)) ->
d
}
if(type == "SO"){
d %>% mutate(Outcome =
ifelse(EVENT_CD == 3, 1, 0)) ->
d
}
if(type == "HR"){
d %>% mutate(Outcome =
ifelse(EVENT_CD == 23, 1, 0)) ->
d
}
# include pitcher batting?
if(exclude_pitchers == "Yes"){
d <- filter(d, BAT_FLD_CD != 1)
}
# divide data into two parts using date_break
d1 <- filter(d, Date <= date_break,
AB_FL == TRUE)
d2 <- filter(d, Date > date_break,
AB_FL == TRUE)
# fit bb model to data from 1st part
d1 %>%
group_by(BAT_ID) %>%
summarize(y = sum(Outcome),
n = n(),
AVG = y / n) %>%
filter(n >= minAB) -> S1
fit <- fit_bb_model2(S1,
prior = list(ab = ab, logn = logn))
S1$MLM_Predicted <- fit$d$est
# compute AVG in 2nd part and merge with 1st part
d2 %>%
group_by(BAT_ID) %>%
summarize(y2 = sum(Outcome),
n2 = n()) %>%
filter(n2 > 0) %>%
mutate(AVG2 = y2 / n2) %>%
select(BAT_ID, y2, n2, AVG2) -> S2
# merge two datasets
S12 <- inner_join(S1, S2, by = "BAT_ID")
# compute sum of squared prediction errors for
# raw and multilevel estimates
S12 %>%
summarize(Observed = sum((AVG - AVG2) ^ 2),
Multilevel = sum((MLM_Predicted - AVG2) ^ 2)) -> crit
list(S12 = S12,
K = fit$K, eta = fit$eta,
crit = crit)
}
fit_bb_model2 <- function(data, prior){
bblogpost <- function(theta, datapar){
y <- datapar$df$y
n <- datapar$df$n
ab <- datapar$ab
logn <- datapar$logn
eta <- exp(theta[1])/(1 + exp(theta[1]))
K <- exp(theta[2])
logf <- function(y, n, K, eta){
lbeta(K * eta + y, K * (1 - eta) + n - y) -
lbeta(K * eta, K * (1 - eta))
}
loglike <- sum(logf(y, n, K, eta))
loglike + dbeta(eta, ab[1], ab[2], log = TRUE) +
+ dlogis(theta[2], logn, 1, log = TRUE) +
log(eta * (1 - eta))
}
datapar <- list(df = data,
ab = prior$ab,
logn = prior$logn)
mode <- laplace(bblogpost, c(0, prior$logn), datapar)$mode
eta <- exp(mode[1])/(1 + exp(mode[1]))
K <- exp(mode[2])
list(eta = eta, K = K,
d = data.frame(data,
est = (data$y + K * eta) / (data$n + K)))
}
# shiny app
ui <- fluidPage(
theme = bslib::bs_theme(version = 4,
bootswatch = "darkly"),
fluidRow(
column(4, wellPanel(
h4(id="big-heading",
"Choose Inputs"),
radioButtons(
inputId = "type",
label = h6("Outcome:"),
choices = c("H", "SO", "HR"),
selected = c("H"),
inline = TRUE),
dateInput("date_break",
label = h6("Date Breakpoint:"),
value = "2019-07-01"),
sliderInput("minAB",
h6("Minimum At-Bats:"),
min = 1,
max = 300,
value = 1),
radioButtons(
inputId = "exclude_pitcher",
label = h6("Exclude pitcher batting:"),
choices = c("Yes", "No"),
selected = c("Yes"),
inline = TRUE),
h5("Parameter Estimates:"),
tableOutput("table1"),
downloadButton("downloadData", "Download Rates")
)),
column(8, wellPanel(
h4(id="big-heading",
"Predicting 2019 Batting Rates", align = "center"),
tabsetPanel(type = "tabs",
tabPanel("Intro",
hr(),
h5("Prediction Problem:"),
p("Have Retrosheet data
for all at-bats during 2019
season. Divide data into
'Train' and 'Test' groups.
The problem is to accurately
predict the batting rates in the
Test group from the observed rates in
the Train group."),
img(src="Rplot.png",
height = 200, width = 300),
hr(),
tags$ul(
tags$li("Choose Outcome -- either
H (hit), SO (strikeout) or
HR (home run)."),
tags$li("Choose the Date Breakpoint that divides
the Test and Train datasets."),
tags$li("Choose the Minimum number of
At-Bats for the Train dataset."),
tags$li("Do you wish to exclude pitcher batting?
(Yes or No)")
),
p("This app illustrates the use of a
multilevel model to predict
the rates in the Test dataset.")
),
tabPanel("Rates",
plotOutput("plot1",
height = "405px"),
h5("Sum of Squared Errors in Predicting Future Rates:"),
tableOutput("table2")
),
tabPanel("Talents",
plotOutput("plot2",
height = "405px"),
hr(),
p("This graph displays the estimated Beta density for the true rates
using the Beta/Binomial multilevel model where the Beta shape parameters
are given by a = K eta and b = K (1 - eta). The mean and standard
deviation are eta and SD = sqrt(eta (1 - eta) / (K + 1)).")
),
tabPanel("Description",
p('This app illustrates prediction of batting rates using the following
Beta/Binomial multilevel model'),
p('We observe y_1, ..., y_N,
where y_i, the count of either H, SO, or HR
for player i in AB_i at-bats,
is Binomial(AB_i, p_i) where p_i is the
true rate. Assume p_1, ..., p_N
are drawn from a Beta distribution with
shape parameters K eta and K (1 - eta). At the
last stage, eta is assumed Beta(1, 1) and log K
is Logistic with location 3 and scale 1.'),
p('Given values of K and eta, the posterior estimate of
the true rate p_i is (AB_i / (AB_i + K) (y_i / AB_i) +
(K / (AB_i + K) eta.'),
h5('Using the App'),
p("One selects a date during the 2019 season. One trains
the model using hitting data up to that date, and predicts
rates for hitting data after that date. One
decides on the type of rate (H, SO or HR), the
minimum number of AB for batters in the training
dataset, and whether or not you wish to exclude pitchers
batting from the dataset."),
p("Point estimates for K and eta and
the corresponding talent standard deviation
are shown in the left panel.
The `Rates` tab displays dotplots for
the observed first-period rates and the predicted
second-period rates. The sum of squared
prediction errors are displayed both for the observed
first-period rates and the
multilevel model predictions. The
`Talents`
tab shows the estimated Beta density curve for the
true rates.")))
))
))
server <- function(input, output, session) {
output$plot1 <- renderPlot({
out <- prediction_exercise(retro_data,
input$date_break,
input$minAB,
input$type,
exclude_pitchers = input$exclude_pitcher)
out$S12 %>%
mutate(Observed = y / n) %>%
select(BAT_ID, Observed, MLM_Predicted) %>%
pivot_longer(cols = Observed:MLM_Predicted,
names_to = "Type",
values_to = "Estimate") -> s1
the_title <- paste("Observed ", input$type,
" Rates & MLM Predictions",
sep = "")
the_subtitle <- paste("Date Breakpoint: ",
input$date_break,
", minAB = ",
input$minAB, sep = "")
if(input$exclude_pitcher == "Yes"){
the_subtitle <- paste(the_subtitle, ", Pitchers Excluded",
sep = "")
} else {
the_subtitle <- paste(the_subtitle, ", All Batters",
sep = "")
}
s1$Type <- factor(s1$Type,
levels = c("Observed", "MLM_Predicted"))
ggplot(s1, aes(Type, Estimate)) +
geom_jitter(width = 0.2, color = "red") +
ylab("Rate") + xlab("Method") +
labs(title = the_title,
subtitle = the_subtitle) +
theme(text = element_text(size=14),
plot.title =
element_text(colour = "white", size = 14,
hjust = 0.5),
plot.subtitle =
element_text(colour = "white", size = 10,
hjust = 0.5)) +
theme(plot.background = element_rect(fill = "grey32"),
axis.text = element_text(color = "white"),
axis.title = element_text(color = "white")) +
theme(
panel.background = element_rect(fill = "bisque",
colour = "grey"))
}, res = 96)
output$plot2 <- renderPlot({
out <- prediction_exercise(retro_data,
input$date_break,
input$minAB,
input$type,
exclude_pitchers = input$exclude_pitcher)
a <- round(out$K * out$eta, 1)
b <- round(out$K * (1 - out$eta), 1)
s <- sqrt(out$eta * (1 - out$eta) / (out$K + 1))
x_lo <- max(out$eta - 4 * s, 0)
x_hi <- min(out$eta + 4 * s, 1)
TH <- theme(plot.title = element_text(colour = "white",
size = 18, hjust = 0.5),
plot.subtitle = element_text(colour = "white",
size = 18, hjust = 0.5),
text = element_text(size = 18),
panel.background = element_rect(fill = "bisque",
colour = "grey")) +
theme(plot.background = element_rect(fill = "grey32"),
axis.text = element_text(color = "white"),
axis.title = element_text(color = "white"))
Title <- paste("Beta(", a, ",", b, ")")
Subtitle <- paste("Talent Curve for", input$type, "Rates")
if(input$exclude_pitcher == "Yes"){
Subtitle <- paste(Subtitle, ", Pitchers Excluded",
sep = "")
} else {
Subtitle <- paste(Subtitle, ", All Batters",
sep = "")
}
x <- NULL
ggplot(data.frame(x = c(x_lo, x_hi)), aes(x)) +
stat_function(fun = dbeta,
geom = "line",
color = "red",
size = 1.5,
args = list(shape1 = a, shape2 = b)) +
labs(title = Title, subtitle = Subtitle) +
TH +
xlab(paste(input$type, "Rate")) +
ylab("Density")
})
output$table1 <- renderTable({
out <- prediction_exercise(retro_data,
input$date_break,
input$minAB,
input$type,
exclude_pitchers = input$exclude_pitcher)
data.frame(eta = out$eta,
K = out$K,
SD = sqrt(out$eta * (1 - out$eta) /
(out$K + 1)))
}, digits = 3)
output$table2 <- renderTable({
out <- prediction_exercise(retro_data,
input$date_break,
input$minAB,
input$type,
exclude_pitchers = input$exclude_pitcher)
d <- out$crit
names(d) <- paste(names(d), input$type, "Rate")
d
}, digits = 3)
output$downloadData <- downloadHandler(
filename = "hitting_rates_output.csv",
content = function(file) {
out <- prediction_exercise(retro_data,
input$date_break,
input$minAB,
input$type,
exclude_pitchers = input$exclude_pitcher)$S12
out$Date_Break <- input$date_break
out$Min_AB <- input$minAB
out$Type <- input$type
out$Pitchers_Excluded <- input$exclude_pitcher
write.csv(out, file, row.names = FALSE)
}
)
}
shinyApp(ui = ui, server = server)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.