#' Plot football matches probabilities for out-of-sample football matches.
#'
#' The function provides a table containing the home win, draw and away win probabilities for a bunch of
#' out-of-sample matches as specified by \code{stan_foot} or \code{mle_foot}.
#'
#' @param object An object either of class \code{stanFoot}, \code{CmdStanFit}, \code{\link[rstan]{stanfit}}, or class
#' \code{\link{list}} containing the Maximum Likelihood Estimates (MLE) for the model parameters fitted
#' with \code{mle_foot}.
#' @param data A data frame containing match data with columns:
#' \itemize{
#' \item \code{periods}: Time point of each observation (integer >= 1).
#' \item \code{home_team}: Home team's name (character string).
#' \item \code{away_team}: Away team's name (character string).
#' \item \code{home_goals}: Goals scored by the home team (integer >= 0).
#' \item \code{away_goals}: Goals scored by the away team (integer >= 0).
#' }
#' @param home_team The home team(s) for the predicted matches.
#' @param away_team The away team(s) for the predicted matches.
#'
#' @return
#' A list with components:
#' \itemize{
#' \item \code{prob_table}: A data frame containing the results probabilities of the out-of-sample matches.
#' \item \code{prob_plot}: A \code{ggplot} object for Bayesian models only showing the posterior predictive heatmap
#' of exact score probabilities, with the true result highlighted.
#' }
#'
#' @details
#'
#' For Bayesian models the results probabilities are computed according to the
#' simulation from the posterior predictive distribution of future (out-of-sample) matches.
#' Specifically, matches are ordered from those in which the favorite team has the highest posterior probability
#' of winning to those where the underdog is more likely to win. For MLE models
#' fitted via the \code{mle_foot} the probabilities are computed by simulating from the MLE estimates.
#'
#'
#' @author Leonardo Egidi \email{legidi@units.it} and Roberto Macrì Demartino \email{roberto.macridemartino@deams.units.it}.
#'
#' @examples
#' \dontrun{
#' if (instantiate::stan_cmdstan_exists()) {
#' library(dplyr)
#'
#' data("italy")
#' italy_2000 <- italy %>%
#' dplyr::select(Season, home, visitor, hgoal, vgoal) %>%
#' dplyr::filter(Season == "2000")
#'
#' colnames(italy_2000) <- c("periods", "home_team", "away_team", "home_goals", "away_goals")
#'
#'
#'
#' fit <- stan_foot(
#' data = italy_2000,
#' model = "double_pois",
#' predict = 18
#' ) # double pois
#'
#' foot_prob(
#' fit, italy_2000, "Inter",
#' "Bologna FC"
#' )
#'
#' foot_prob(fit, italy_2000) # all the out-of-sample matches
#' }
#' }
#'
#' @importFrom posterior as_draws_rvars draws_of
#' @importFrom rstan extract
#' @importFrom ggplot2 ggplot geom_tile theme_bw scale_fill_gradient geom_rect labs
#' scale_x_continuous scale_y_continuous element_text scale_color_manual
#' guides facet_wrap unit ylab xlab element_blank
#' @importFrom dplyr group_by mutate arrange distinct
#' @importFrom extraDistr rbvpois rskellam
#' @importFrom metRology rt.scaled
#'
#' @export
foot_prob <- function(object, data, home_team, away_team) {
# ____________________________________________________________________________
# Data and argument checks ####
required_cols <- c("periods", "home_team", "away_team", "home_goals", "away_goals")
missing_cols <- setdiff(required_cols, names(data))
if (length(missing_cols) > 0) {
stop(paste("data is missing required columns:", paste(missing_cols, collapse = ", ")))
}
teams <- unique(c(data$home_team, data$away_team))
if (inherits(object, "stanFoot") || inherits(object, "CmdStanFit")) {
if (inherits(object, "stanFoot")) {
# For stanFoot objects, the draws are available in object$fit
draws <- object$fit$draws()
} else {
# For CmdStanFit objects, draw directly from object
draws <- object$draws()
}
draws <- posterior::as_draws_rvars(draws)
# Check that at least one of 'y_prev' or 'diff_y_prev' exists in draws.
if (!("y_prev" %in% names(draws) || "diff_y_prev" %in% names(draws))) {
stop("Model '%s' does not contain 'y_prev' or 'diff_y_prev' in its samples. Ensure the model was fitted with 'predict' set appropriately.")
}
sims <- list()
# Use 'y_prev' if available; otherwise, use 'diff_y_prev'
if ("y_prev" %in% names(draws)) {
sims$y_prev <- posterior::draws_of(draws[["y_prev"]])
predict <- dim(sims$y_prev)[2]
} else {
sims$diff_y_prev <- posterior::draws_of(draws[["diff_y_prev"]])
predict <- dim(sims$diff_y_prev)[2]
}
} else if (inherits(object, "stanfit")) {
sims <- rstan::extract(object)
if (!("y_prev" %in% names(sims) || "diff_y_prev" %in% names(sims))) {
stop("Model '%s' does not contain 'y_prev' or 'diff_y_prev' in its samples. Ensure the model was fitted with 'predict' set appropriately.")
}
if ("y_prev" %in% names(sims)) {
predict <- dim(sims$y_prev)[2]
} else {
predict <- dim(sims$diff_y_prev)[2]
}
} else if (inherits(object, "list")) {
predict <- object$predict
} else {
stop("Provide one among these four model fit classes: 'stanfit', 'CmdStanFit', 'stanFoot' or 'list'.")
}
if (is.null(predict) || predict == 0) {
stop("foot_prob cannot be used if the 'predict' argument is set to zero.")
}
# ____________________________________________________________________________
# Dataframes and plots ####
data_prev <- data[(dim(data)[1] - predict + 1):(dim(data)[1]), ]
# Checks on home_team/away_team
if (missing(home_team) & missing(away_team)) {
home_team <- data_prev$home_team
away_team <- data_prev$away_team
}
if (length(home_team) != length(away_team)) {
stop("Please, include the same number for home and away teams.")
}
find_match <- c()
for (i in 1:length(home_team)) {
find_match[i] <- which(data_prev$home_team %in% home_team[i] & data_prev$away_team %in% away_team[i])
}
true_goal_home <- data_prev$home_goals[find_match]
true_goal_away <- data_prev$away_goals[find_match]
if (length(find_match) == 0) {
stop(paste("There is not any out-of-sample match:",
home_team, "-", away_team,
sep = ""
))
}
# Compute probabilities with stan/mle
if (inherits(object, c("stanFoot", "stanfit", "CmdStanFit"))) {
if (is.null(sims$y_prev)) { # student_t model
M <- dim(sims$diff_y_prev)[1]
prob_h <- prob_d <- prob_a <- c()
x <- round(sims$diff_y_prev, 0)
prob_h <- round(apply(x, 2, function(x) sum(x > 0)) / M, 3)
prob_d <- round(apply(x, 2, function(x) sum(x == 0)) / M, 3)
prob_a <- round(apply(x, 2, function(x) sum(x < 0)) / M, 3)
# only table
tbl <- data.frame(
home_team = home_team,
away_team = away_team,
prob_h = prob_h[find_match],
prob_d = prob_d[find_match],
prob_a = prob_a[find_match]
)
return(list(prob_table = tbl))
} else { # poisson models
M <- dim(sims$y_prev)[1]
prediction1 <- sims$y_prev[, find_match, 1]
prediction2 <- sims$y_prev[, find_match, 2]
prob_h <- prob_d <- prob_a <- c()
# For MLO computation in multiple matches branch:
row_pos <- rep(NA, length(find_match))
col_pos <- rep(NA, length(find_match))
mlo <- rep(NA, length(find_match))
data_exp_tot <- data.frame(
Home = numeric(0),
Away = numeric(0),
Prob = numeric(0),
matches = character(0),
true_goal_home = numeric(0),
true_goal_away = numeric(0)
)
if (length(find_match) == 1) {
posterior_prop1 <- table(subset(prediction1, prediction1 < 15))
posterior_prop2 <- table(subset(prediction2, prediction2 < 15))
team_a <- home_team
team_b <- away_team
x_min <- y_min <- min(
length(posterior_prop1),
length(posterior_prop2)
)
counts_mix <- matrix(0, x_min, y_min)
for (j in 1:x_min) {
for (t in 1:y_min) {
counts_mix[j, t] <- posterior_prop1[j] * posterior_prop2[t]
}
}
dim1 <- dim(counts_mix)[1]
dim2 <- dim(counts_mix)[2]
x_seq <- seq(0, dim1 - 1, length.out = dim1)
y_seq <- seq(0, dim2 - 1, length.out = dim2)
data_exp <- expand.grid(Home = x_seq, Away = y_seq)
data_exp$Prob <- as.double(counts_mix / (M * M))
data_exp$matches <- paste(team_a, "-", team_b)
data_exp$true_goal_home <- true_goal_home
data_exp$true_goal_away <- true_goal_away
# Overall "adjusted" probabilities
prob_h_val <- sum(counts_mix[lower.tri(counts_mix)] / (M * M)) / sum(data_exp$Prob)
prob_d_val <- sum(diag(counts_mix / (M * M))) / sum(data_exp$Prob)
prob_a_val <- sum(counts_mix[upper.tri(counts_mix)] / (M * M)) / sum(data_exp$Prob)
# MLO (most likely outcome) using which.max and arrayInd to get a single index
ind_max <- which.max(counts_mix)
pos <- arrayInd(ind_max, dim(counts_mix))
row_pos_val <- pos[1]
col_pos_val <- pos[2]
mlo_val <- paste(row_pos_val - 1, "-", col_pos_val - 1, " (",
round(max(counts_mix / (M * M)), 3), ")",
sep = ""
)
data_exp_tot <- rbind(data_exp_tot, data_exp)
tbl <- data.frame(
home_team = home_team,
away_team = away_team,
prob_h = round(prob_h_val, 3),
prob_d = round(prob_d_val, 3),
prob_a = round(prob_a_val, 3),
mlo = mlo_val
)
p <- ggplot(data_exp_tot, aes(Home, Away, z = Prob)) +
geom_tile(aes(fill = Prob)) +
theme_bw() +
scale_fill_gradient(low = "white", high = "black") +
geom_rect(
aes(
xmin = as.numeric(as.vector(true_goal_home)) - 0.5,
xmax = as.numeric(as.vector(true_goal_home)) + 0.5,
ymin = as.numeric(as.vector(true_goal_away)) - 0.5,
ymax = as.numeric(as.vector(true_goal_away)) + 0.5,
color = "True Result"
),
fill = "transparent", linewidth = 1.5
) +
labs(title = "Posterior match probabilities") +
scale_x_continuous(breaks = unique(data_exp_tot$Home)) +
scale_y_continuous(breaks = unique(data_exp_tot$Away)) +
theme_bw() +
theme(
plot.title = element_text(size = 18),
strip.text = element_text(size = 12),
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16),
plot.subtitle = element_text(size = 13),
axis.title = element_text(size = 18),
legend.text = element_text(size = 14)
) +
scale_color_manual(name = "", values = c("True Result" = "red")) +
guides(color = guide_legend(override.aes = list(fill = NA)))
} else {
team_a <- team_b <- c()
for (i in 1:length(find_match)) {
posterior_prop1 <- table(prediction1[, i])
posterior_prop2 <- table(prediction2[, i])
team_a[i] <- home_team[i]
team_b[i] <- away_team[i]
x_min <- y_min <- 5
counts_mix <- matrix(0, x_min, y_min)
for (j in 1:x_min) {
for (t in 1:y_min) {
counts_mix[j, t] <- posterior_prop1[j] * posterior_prop2[t]
}
}
dim1 <- dim(counts_mix)[1]
dim2 <- dim(counts_mix)[2]
x_seq <- seq(0, dim1 - 1, length.out = dim1)
y_seq <- seq(0, dim2 - 1, length.out = dim2)
data_exp <- expand.grid(Home = x_seq, Away = y_seq)
data_exp$Prob <- as.double(counts_mix / (M * M))
data_exp$matches <- paste(team_a[i], "-", team_b[i])
data_exp$true_goal_home <- true_goal_home[i]
data_exp$true_goal_away <- true_goal_away[i]
# overall "adjusted" probabilities
prob_h[i] <- sum(counts_mix[lower.tri(counts_mix)] / (M * M)) / sum(data_exp$Prob)
prob_d[i] <- sum(diag(counts_mix / (M * M))) / sum(data_exp$Prob)
prob_a[i] <- sum(counts_mix[upper.tri(counts_mix)] / (M * M)) / sum(data_exp$Prob)
# MLO (most likely outcome) using which.max and arrayInd
ind_max <- which.max(counts_mix)
pos <- arrayInd(ind_max, dim(counts_mix))
row_pos[i] <- pos[1]
col_pos[i] <- pos[2]
mlo[i] <- paste(row_pos[i] - 1, "-", col_pos[i] - 1, " (",
round(max(counts_mix / (M * M)), 3), ")",
sep = ""
)
data_exp_tot <- rbind(data_exp_tot, data_exp)
}
tbl <- data.frame(
home_team = home_team,
away_team = away_team,
prob_h = round(prob_h, 3),
prob_d = round(prob_d, 3),
prob_a = round(prob_a, 3),
mlo = mlo
)
data_exp_tot <- data_exp_tot %>%
dplyr::group_by(matches) %>%
dplyr::mutate(
prob_h = sum(Prob[Home > Away]),
prob_d = sum(Prob[Home == Away]),
prob_a = sum(Prob[Home < Away])
)
data_exp_tot$favorite <- rep(team_a, each = x_min * y_min)
data_exp_tot$underdog <- rep(team_b, each = x_min * y_min)
# Use which() to avoid NA values in indexes
indexes <- which(data_exp_tot$prob_h < data_exp_tot$prob_a)
temp1 <- data_exp_tot$prob_h[indexes]
temp2 <- data_exp_tot$prob_a[indexes]
data_exp_tot$prob_h[indexes] <- temp2
data_exp_tot$prob_a[indexes] <- temp1
temp_name1 <- data_exp_tot$favorite[indexes]
temp_name2 <- data_exp_tot$underdog[indexes]
data_exp_tot$favorite[indexes] <- temp_name2
data_exp_tot$underdog[indexes] <- temp_name1
temp_coord1 <- data_exp_tot$Home[indexes]
temp_coord2 <- data_exp_tot$Away[indexes]
data_exp_tot$Home[indexes] <- temp_coord2
data_exp_tot$Away[indexes] <- temp_coord1
temp_tg1 <- data_exp_tot$true_goal_home[indexes]
temp_tg2 <- data_exp_tot$true_goal_away[indexes]
data_exp_tot$true_goal_home[indexes] <- temp_tg2
data_exp_tot$true_goal_away[indexes] <- temp_tg1
data_exp_tot <- dplyr::arrange(data_exp_tot, prob_h)
fav_teams <- data_exp_tot %>% distinct(favorite)
und_teams <- data_exp_tot %>% distinct(underdog)
axes_titles <- data.frame(
matches = unique(data_exp_tot$matches),
axis_title_x = fav_teams[, 2],
axis_title_y = und_teams[, 2]
)
data_exp_tot$new_matches <- paste(data_exp_tot$favorite, "-", data_exp_tot$underdog)
num_matches <- length(unique(data_exp_tot$new_matches))
nrow_plot <- ceiling(sqrt(num_matches))
ncol_plot <- ceiling(num_matches / nrow_plot)
p <- ggplot(data_exp_tot, aes(Home, Away, z = Prob)) +
geom_tile(aes(fill = Prob)) +
theme_bw() +
scale_fill_gradient(low = "white", high = "black") +
facet_wrap(
facets = ~ reorder(new_matches, prob_h),
scales = "fixed",
ncol = ncol_plot, nrow = nrow_plot
) +
geom_rect(
aes(
xmin = as.numeric(as.vector(true_goal_home)) - 0.5,
xmax = as.numeric(as.vector(true_goal_home)) + 0.5,
ymin = as.numeric(as.vector(true_goal_away)) - 0.5,
ymax = as.numeric(as.vector(true_goal_away)) + 0.5,
color = "True Result"
),
fill = "transparent", linewidth = 1.5
) +
labs(title = "Posterior match probabilities") +
ylab("Underdog") +
xlab("Favorite") +
theme_bw() +
theme(
plot.title = element_text(size = 18),
strip.text = element_text(size = 11),
strip.background = element_blank(),
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16),
plot.subtitle = element_text(size = 8.5),
axis.title = element_text(size = 18),
legend.text = element_text(size = 14),
panel.spacing = unit(0.2, "lines")
) +
scale_color_manual(name = "", values = c("True Result" = "red")) +
guides(color = guide_legend(override.aes = list(fill = NA)))
}
return(list(prob_table = tbl, prob_plot = p))
}
} else if (inherits(object, "list")) {
model <- object$model
predict <- object$predict
n.iter <- 200
team1_prev <- object$team1_prev
team2_prev <- object$team2_prev
N_prev <- predict
prediction_routine <- function(team1_prev, team2_prev, att, def, home,
corr, ability, model, predict, n.iter) {
mean_home <- exp(home[1, 2] + att[team1_prev, 2] + def[team2_prev, 2])
mean_away <- exp(att[team2_prev, 2] + def[team1_prev, 2])
if (model == "double_pois") {
x <- y <- matrix(NA, n.iter, predict)
for (n in 1:N_prev) {
x[, n] <- rpois(n.iter, mean_home[n])
y[, n] <- rpois(n.iter, mean_away[n])
}
} else if (model == "biv_pois") {
couple <- array(NA, c(n.iter, predict, 2))
for (n in 1:N_prev) {
couple[, n, ] <- rbvpois(n.iter,
a = mean_home[n],
b = mean_away[n],
c = corr[1, 2]
)
}
x <- couple[, , 1]
y <- couple[, , 2]
} else if (model == "skellam") {
diff_y <- matrix(NA, n.iter, predict)
for (n in 1:N_prev) {
diff_y[, n] <- rskellam(n.iter,
mu1 = mean_home[n],
mu2 = mean_away[n]
)
}
x <- diff_y
y <- matrix(0, n.iter, predict)
} else if (model == "student_t") {
sigma_y <- object$sigma_y
diff_y <- matrix(NA, n.iter, predict)
for (n in 1:N_prev) {
diff_y[, n] <- rt.scaled(n.iter,
df = 7,
mean = home[1, 2] + ability[team1_prev[n], 2] - ability[team2_prev[n], 2],
sd = sigma_y
)
}
x <- round(diff_y)
y <- matrix(0, n.iter, predict)
}
prob_func <- function(mat_x, mat_y) {
res <- mat_x - mat_y
prob_h <- apply(res, 2, function(x) sum(x > 0)) / n.iter
prob_d <- apply(res, 2, function(x) sum(x == 0)) / n.iter
prob_a <- apply(res, 2, function(x) sum(x < 0)) / n.iter
return(list(
prob_h = prob_h,
prob_d = prob_d,
prob_a = prob_a
))
}
conf <- prob_func(x, y)
tbl <- data.frame(
home_team = teams[team1_prev[find_match]],
away_team = teams[team2_prev[find_match]],
prob_h = conf$prob_h[find_match],
prob_d = conf$prob_d[find_match],
prob_a = conf$prob_a[find_match]
)
return(tbl)
}
if (predict != 0) {
prob_matrix <- prediction_routine(
team1_prev, team2_prev, object$att,
object$def, object$home,
object$corr, object$abilities, model, predict,
n.iter
)
}
return(list(prob_table = prob_matrix))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.