Model Sports Injuries as Counts
In injurytools: A Toolkit for Sports Injury Data Analysis

library(knitr)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
options(rmarkdown.html_vignette.check_title = FALSE) # to supress R-CMD check

## to fold/hook the code
hook_output <- knit_hooks$get("output")
knit_hooks$set(output = function(x, options) {
  lines <- options$output.lines
  if (is.null(lines)) {
    return(hook_output(x, options))  # pass to default hook
  }
  x <- unlist(strsplit(x, "\n"))
  more <- "..."
  if (length(lines) == 1) {
    if (length(x) > lines) {
      # truncate the output, but add ....
      x <- c(head(x, lines), more)
    }
  } else {
    x <- c(if (abs(lines[1]) > 1) more else NULL,
           x[lines],
           if (length(x) > lines[abs(length(lines))]) more else NULL
    )
  }
  # paste these lines together
  x <- paste(c(x, ""), collapse = "\n")
  hook_output(x, options)
})

modern_r <- getRversion() >= "4.1.0"

library(injurytools)
library(dplyr)
library(stringr)
library(tidyr)
library(lme4)
library(pscl)
# library(glmmTMB)
library(MASS)
library(ggplot2)
library(gridExtra)
library(knitr)

# set the global theme of all plots
theme_set(theme_bw())

Example data: we continue exploring the cohort of Liverpool Football Club male's first team players over two consecutive seasons, 2017-2018 and 2018-2019, scrapped from https://www.transfermarkt.com/ website[^visualize-note-1].

[^visualize-note-1]: These data sets are provided for illustrative purposes. We warn that they might not be accurate and could potentially include discrepancies or incomplete information compared to what actually occurred.

This article provides some modelling approaches when injuries are seen as count data.

First, the distribution of the count/rate variables is explored, e.g. number of injuries and number of days lost per exposure time. Afterwards, four different models are presented, that depending on the distribution of the data, will provide a better fit to them due to the distributional assumptions made by each of them.

Exploring the distribution of injuries

We plot the histograms of the injury incidence and injury burden variables for each season.

See the code to prepare the data

## 17/18
df_exposures1718 <- prepare_exp(df_exposures0 = 
                                  raw_df_exposures |> filter(season == "17/18"),
                                player        = "player_name",
                                date          = "year",
                                time_expo     = "minutes_played") |> 
  mutate(seasonb = date2season(date))
df_injuries1718 <- prepare_inj(df_injuries0   =
                                 raw_df_injuries |> filter(season == "17/18"),
                               player         = "player_name",
                               date_injured   = "from",
                               date_recovered = "until")
injd1718 <- prepare_all(data_exposures = df_exposures1718,
                        data_injuries  = df_injuries1718,
                        exp_unit = "matches_minutes")

## 18/19
df_exposures1819 <- prepare_exp(df_exposures0 = 
                                  raw_df_exposures |> filter(season == "18/19"),
                                player        = "player_name",
                                date          = "year",
                                time_expo     = "minutes_played") |> 
  mutate(seasonb = date2season(date))
df_injuries1819 <- prepare_inj(df_injuries0   = 
                                 raw_df_injuries |> filter(season == "18/19"),
                               player         = "player_name",
                               date_injured   = "from",
                               date_recovered = "until")
injd1819 <- prepare_all(data_exposures = df_exposures1819,
                        data_injuries  = df_injuries1819,
                        exp_unit = "matches_minutes")

## calculate injury summary statistics
injds1718 <- injsummary(injd1718, quiet = T)
injds1819 <- injsummary(injd1819, quiet = T)

injds1718p <- injds1718$playerwise
injds1819p <- injds1819$playerwise

injdsp <- bind_rows("Season 17-18" = injds1718p,
                    "Season 18-19" = injds1819p,
                    .id = "season")

## plot
p1 <- ggplot(data = injdsp) + 
  geom_histogram(aes(x = injincidence, fill = season)) + 
  facet_wrap(~season) +
  scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
  ylab("Number of players") + 
  xlab("Incidence (number of injuries per 100 player-match)") +
  ggtitle("Histogram of injury incidence in each season") + 
  theme(legend.position = "none")

p2 <- ggplot(data = injdsp) + 
  geom_histogram(aes(x = injburden, fill = season)) + 
  facet_wrap(~season) +
  scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
  ylab("Number of players") + 
  xlab("Burden (number of days lost due to injury per 100 player-match)") +
  ggtitle("Histogram of injury burden in each season") + 
  theme(legend.position = "none")

grid.arrange(p1, p2, ncol = 1)

p1 <- ggplot(data = injdsp) + 
  geom_histogram(aes(x = injincidence, fill = season)) + 
  facet_wrap(~season) +
  scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
  ylab("Number of players") + 
  xlab("Incidence (number of injuries per 100 player-match)") +
  ggtitle(bquote("Histogram of injury" ~ bold("incidence") ~ "in each season")) + 
  theme(legend.position = "none")

p2 <- ggplot(data = injdsp) + 
  geom_histogram(aes(x = injburden, fill = season)) + 
  facet_wrap(~season) +
  scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
  ylab("Number of players") + 
  xlab("Burden (number of days lost due to injury per 100 player-match)") +
  ggtitle(bquote("Histogram of injury" ~ bold("burden") ~ "in each season")) + 
  theme(legend.position = "none")

Code for further plot specifications

theme_counts <- theme(axis.text = element_text(size = rel(1.2)),
                      axis.title = element_text(size = rel(1.3)),
                      strip.text = element_text(size = rel(1.4)),
                      plot.title = element_text(size = rel(1.4)),
                      legend.text = element_text(size =  rel(1.3)),
                      legend.title = element_text(size = rel(1.3)))
p1 <- p1 + theme_counts
p2 <- p2 + theme_counts

theme_counts <- theme(axis.text = element_text(size = rel(1.2)),
                      axis.title = element_text(size = rel(1.3)),
                      strip.text = element_text(size = rel(1.4)),
                      plot.title = element_text(size = rel(1.4)),
                      legend.text = element_text(size =  rel(1.3)),
                      legend.title = element_text(size = rel(1.3)))
p1 <- p1 + theme_counts
p2 <- p2 + theme_counts

grid.arrange(p1, p2, ncol = 1)

In the following, we use injds1718p data.

- Poisson (mixed) model

We merge into the injds1718p data frame the player-related information (i.e. positionb, age, assists, goals, yellows etc.) available in the raw_df_exposures data frame.

## 17/18
df_exposures1718 <- prepare_exp(df_exposures0 = 
                                  raw_df_exposures |> filter(season == "17/18"),
                                player        = "player_name",
                                date          = "year",
                                time_expo     = "minutes_played") |> 
  mutate(seasonb = date2season(date))

injds1718p <- injds1718p |> 
  mutate(seasonb = "2017/2018") |> 
  ## join to have info, such as position, age, citizenship etc.
  left_join(df_exposures1718, by = c("player" = "player", 
                                     "seasonb" = "seasonb")) |> 
  ## create positionb column 
  ## (so that the categories are: Attack, Defender, Goalkeeper and Midfield)
  mutate(positionb = factor(str_split_i(position, "_", 1)))

## quit Goalkeepers
injds1718p <- dplyr::filter(injds1718p, positionb != "Goalkeeper") |> 
  droplevels()

To fit a Poisson regression model for injury incidence (considering positionb as a covariate), we do:

incidence_glm_pois <- glm(ninjuries ~ positionb, # + offset(log(totalexpo))
                          offset = log(totalexpo),
                          data = injds1718p,
                          family = poisson)

Or alternatively, we can use the glmmTMB::glmmTMB function:

# incidence_glm_pois2 <- glmmTMB(formula = ninjuries ~ foot, 
#                                offset = log(totalexpo), 
#                                family = poisson(), data = injds1718p)
# summary(incidence_glm_pois2)

Besides, if we have repeated measurements as in injdsp, we can fit a Mixed Model via:

Add more covariates to injdsp data frame

df_exposures <- prepare_exp(df_exposures0 = raw_df_exposures,
                                player        = "player_name",
                                date          = "year",
                                time_expo     = "minutes_played") |> 
  mutate(seasonb = date2season(date))

injdsp <- injdsp |> 
  mutate(seasonb = if_else(season == "Season 17-18", "2017/2018", "2018/2019")) |> 
  ## join to have info, such as position, age, citizenship etc.
  left_join(df_exposures, by = c("player" = "player",
                                 "seasonb" = "seasonb")) |> 
  ## create positionb column 
  ## (so that the categories are: Attack, Defender, Goalkeeper and Midfield)
  mutate(positionb = factor(str_split_i(position, "_", 1))) |> 
  droplevels()

incidence_glmm_pois <- glmer(formula = ninjuries ~ positionb + (1 | player),
                             offset = log(totalexpo),
                             data = injdsp,
                             family = poisson)
# incidence_glmm_pois2 <- glmmTMB::glmmTMB(formula = ninjuries ~ positionb + (1 | player), 
#                                          offset = log(totalexpo),
#                                          data = injdsp,
#                                          family = poisson)

We can do the analogue for the burden, e.g.:

burden_glm_pois <- glm(ndayslost ~ positionb, offset = log(totalexpo), ## or ~ foot
                       data = injds1718p,
                       family = poisson)

As of now, lets stick with the model burden_glm_pois and lets interpret the model output.

summary(burden_glm_pois)

The estimated coefficients are:

cbind(estimate = exp(coef(burden_glm_pois)) * c(90*100, 1, 1), 
      exp(confint(burden_glm_pois)) * c(90*100, 1, 1)) |> # (to report per 100 player-matches)
  kable()

- Negative Binomial model

burden_glm_nb <- glm.nb(ndayslost ~ positionb + offset(log(totalexpo)),
                           data = injds1718p)

summary(burden_glm_nb)

- Zero-inflated Poisson model

burden_zinfpois <- zeroinfl(ndayslost ~ positionb | positionb,
                             offset = log(totalexpo),
                             data = injds1718p,
                             link = "logit",
                             dist = "poisson",
                             trace = FALSE, EM = FALSE)
## Or:
# burden_zinfpois <- glmmTMB::glmmTMB(formula = ndayslost ~ 1 +  positionb,
#                                     offset = log(totalexpo),
#                                     ziformula = ~ 1 + positionb,
#                                     data = injds1718p,
#                                     family = poisson)

summary(burden_zinfpois)

- Zero-inflated Negative Binomial model

burden_zinfnb <- zeroinfl(ndayslost ~ positionb | positionb,
                             offset = log(totalexpo),
                             data = injds1718p,
                             link = "logit",
                             dist = "negbin",
                             trace = FALSE, EM = FALSE)
## Or:
# burden_zinfnb <- glmmTMB::glmmTMB(ndayslost ~ 1 + positionb, offset = log(totalexpo),
#                                      ziformula = ~ 1 + positionb,
#                                      data = injds1718p,
#                                      family = nbinom2)

summary(burden_zinfnb)

Model comparison

Finally, lets compare the four models.

We compute the conditional predicted mean probabilities of each model:

## pois
## predprob: for each subj, prob of observing each value
phat_pois <- predprob(burden_glm_pois) 
phat_pois_mn <- apply(phat_pois, 2, mean) 
## nb
phat_nb <- predprob(burden_glm_nb)           
phat_nb_mn <- apply(phat_nb, 2, mean) 
## zinfpois
phat_zinfpois <- predprob(burden_zinfpois)            
phat_zinfpois_mn <- apply(phat_zinfpois, 2, mean) 
## zinfnb
phat_zinfnb <- predprob(burden_zinfnb)           
phat_zinfnb_mn <- apply(phat_zinfnb, 2, mean) 

## put in a data frame
idx <- seq(1, 62, length.out = 30)
df_probs <- data.frame(phat_pois_mn     = phat_pois_mn[idx],
                       phat_nb_mn       = phat_nb_mn[idx],
                       phat_zinfpois_mn = phat_zinfpois_mn[idx],
                       phat_zinfnb_mn   = phat_zinfnb_mn[idx], 
                       x= idx) |> 
  tidyr::gather(key = "prob_type", value = "value", -x) |> 
  mutate(prob_type = factor(prob_type))

And we display them over the histogram of the data to examine the fits:

ggplot(data = injds1718p) + 
  geom_histogram(aes(x = injburden/100, after_stat(density)), 
                 breaks = seq(-0.5, 62, length.out = 30),
                 col = "black", alpha = 0.5) +
  geom_point(data = df_probs, aes(x = x, y = value, 
                                  group = prob_type, col = prob_type)) + 
  geom_line(data = df_probs, aes(x = x, y = value, 
                                 group = prob_type, col = prob_type)) + 
  ylim(c(0, 0.3)) + xlab("Injury burden") + ylab("Density") +
  scale_color_manual(name = "Model:",
                     labels = c("Negative Binomial", "Poisson",
                                "Zero-Inflated Negative Binomial",
                                "Zero-Inflated Poisson"),
                     values = c("darkblue", "chocolate", "purple", "red")) +
  ggtitle("Histogram of injury burden in 2017/2018\nwith conditional Poisson, NB, ZIP and ZINB Densities") +
  theme_counts +
  theme(legend.position = c(0.7, 0.7))

## using base R
with(injds1718p, {
  hist(ndayslost, prob = TRUE, breaks = seq(-0.5, 316.5, length.out = 30), 
       xlab = "Injury burden (number of injuries per player-season)",
       main = "Histogram of overall injury burden\nwith conditional Poisson, NB, ZIP and ZINB Densities")
  lines(x = idx, y = phat_pois_mn[idx], type = "b", lwd = 2, col = "black")
  lines(x = idx, y = phat_nb_mn[idx], type = "b", lwd = 2, col = "purple")
  lines(x = idx, y = phat_zinfpois_mn[idx], type = "b", lwd = 2, col = "red")
  lines(x = idx, y = phat_zinfnb_mn[idx], type = "b", lwd = 2, col = "blue")
})
legend(250, 0.026, c("Poisson", "NB", "ZIP","ZINB"), lty = 1,
       col = c("black", "purple", "red","blue"), lwd = 2)

Besides, we compute goodness of fit measures such as AIC, BIC and deviance explained:

models <- list("Poisson model" = burden_glm_pois,
               "Negative binomial model" = burden_glm_nb,
               "Zero-inflated Poisson model" = burden_zinfpois,
               "Zero-inflated Negative Binomial model" = burden_zinfnb)

res_gof <- lapply(models, function(model) {
  aic      <- AIC(model)
  bic      <- BIC(model)
  if (class(model)[[1]] == "zeroinfl") {
    deviance <- -2*logLik(model)[[1]]
    null_model <- update(model, .~ -positionb)
    null_deviance <- -2*logLik(null_model)[[1]]
  } else {
    deviance <- model$deviance
    null_deviance <- model$null.deviance
  }
  dev_expl <- (null_deviance - deviance)/null_deviance * 100
  return(res = data.frame(aic = aic, bic = bic, dev_expl = dev_expl))
})

res_gof |>   
  bind_rows(.id = "model") |>  
  ## arrange them according to dev_expl.
  arrange(desc(dev_expl)) |> 
  knitr::kable(digits = 2,
               col.names = c("Model", "AIC", "BIC", "Deviance Explained"))