R/pitelo.R
In f1pits: F1 Pit Stop Datasets

Documented in pitelo

#' @title ELO calculation using pit stop data
#' @description Function to calculate ELO ratings from a pit stop dataset
#' @details
#' The function calculates ELO ratings, using the position of each team's pit stop in each round.
#'
#' By default, the \strong{median} stat is used because it better reflects global pit stop performance across teams.
#' It is less sensitive to extreme real data values and outliers, but provides more stability than the mean. For example, a single poor pit stop does not significantly affect efficiency.
#'
#' Meanwhile, the \strong{mean} is more affected by small sample sizes or outliers, such as penalties served by a driver during pit stops, but it is more sensitive to pit stop mistakes.
#'
#' The last statistic uses the best pit stop result by team, ignoring consistency and reflecting maximum peak of performance potential.
#'
#' On the other hand, two methodologies can be applied for ELO calculation: \strong{sequential} or \strong{batch}.
#'
#' In the \strong{sequential approach}, the ELO is updated after each match intra-event, so depends on the order in which matchups are processed.
#' This changes future expectations within the same race, reinforcing early leadership while reducing later gains for the leader and smoothing final differences.
#' Consequently, this approach is more suitable for analyzing individual races.
#'
#' In contrast, the \strong{batch approach} updates the ELO only after the entire event has finished. Race matchups are not considered immediately; instead they are aggregated afterward, making long-term comparisons more fair, neutral and independent of temporal order.
#'
#' \code{k} parameter controls the magnitude of change in each ELO cycle calculation.
#'
#' \code{c} establishes the win probability ratio between two teams based on a specific difference in rating points.
#'
#' \code{d} is the expected probability to win (sensitivity).
#'
#' Default values \code{k}, \code{c} and \code{d} used are based on Hvattum and Arntzen (2010). With these default values, a 400 point ELO difference implies that the stronger team has 10 times more chances of winning than the weaker team.
#' @param pits_data Tibble data generated by the pits() function
#' @param stat_fun Type of stat used in ELO calculations: median (1, by default), mean (2) and min value (3) position
#' @param calc Type of calculation: batch (1, by default) or sequential (2)
#' @param k Weight factor magnitude (velocity) to change ELO ratings (by default, 20)
#' @param c Base factor that establishes the win probability ratio between two teams for a given rating difference in the score ELO calculation (by default, 10)
#' @param d Scaling factor amplitude that determines the sensitivity of the expected win probability to rating differences in the score ELO calculation (by default, 400)
#' @param fml Team family mode. Collapse the different names of the same team structure (by default TRUE, enabled)
#' @param elo ELO provided (if is omitted, 1000 will be used for all by default)
#' @return A tibble containing the ELO calculations
#' @examples
#' \donttest{
#' pitstop_data_elo_example <- tibble::tibble(
#'   Pos. = 1:11,
#'   Team = c("Team_3","Team_1","Team_1","Team_1","Team_1",
#'            "Team_2","Team_1","Team_3","Team_3","Team_2","Team_3"),
#'   Driver = rep("Driver", 11),
#'   "Time (sec)" = c(2.18, 2.21, 2.24, 3, 4, 4.04, 4.07, 4.08, 7.88, 8.88, 14.54),
#'   Lap = 1:11,
#'   Points = rep(0, 11),
#'   Round = rep(0, 11),
#'   Year = rep(2026, 11))
#' pitelo(pitstop_data_elo_example)
#' pitelo(pitstop_data_elo_example, stat_fun = 2)
#' pitelo(pitstop_data_elo_example, stat_fun = 3)
#' }
#' @references Elo, A. E. (1978). \emph{The rating of chessplayers, past and present}. Arco Publishing. ISBN-10: 0668047216.
#'
#' Glickman, M. E., & Jones, A. C. (1999). Rating the chess rating system. \emph{CHANCE-BERLIN THEN NEW YORK-}, 12, 21-28.
#'
#' Hvattum, L. M., & Arntzen, H. (2010). Using ELO ratings for match result prediction in association football. \emph{International Journal of Forecasting}, 26(3), 460--470. \doi{10.1016/j.ijforecast.2009.10.002}.
#' @importFrom dplyr filter count slice pull mutate recode group_by summarise distinct arrange inner_join desc
#' @importFrom tibble tibble
#' @importFrom stats median setNames
#' @export
pitelo <- function(pits_data, stat_fun = 1, calc = 1, k=20, c=10, d=400, fml = TRUE,elo=NULL) {

  ### 0.a Checking arguments

  if (is.null(stat_fun)) {
    stat_fun <- 1  # default
  }

  if (stat_fun == 1) {
    message("Using default median stats")

  } else if (stat_fun == 2) {
    message("Using mean stats")

  } else if (stat_fun == 3) {
    message("Using min stats")

  } else {
    stop("stat_fun argument must be 1, 2 or 3")
  }

  param1 <- switch(as.character(stat_fun),
                   "1" = median, # median, default (similar to mean, but you lost sensibility to real extreme data, but more stable than mean for outliers -for one bad pitstop not down your efficiency-
                   "2" = mean,   # mean: global pitstop pace. This penalty teams without data
                   "3" = min)    # min (IGNORES consistency, it reflects max-potential, peak-potential)

  if (is.null(calc)) {
    calc <- 1  # default
  }

  if (calc == 1) {
    message("Using default batch calc")

  } else if (calc == 2) {
    message("Using sequential calc")

  } else {
    stop("calc argument must be 1 or 2")
  }

  ### 0.b PREPROCESSING RAW DATA (CONVERSION TEAM NAMES FROM SAME FAMILY TEAM)

  if (fml) {

    message("Team family mode enabled")

    if (!"Year" %in% names(pits_data)) {
      stop("Column 'Year' is required in pits_data")
    }

    team_family <- list(
      c("Toro Rosso","AlphaTauri", "RB", "Racing Bulls"),
      c("Renault", "Alpine"),
      c("Sauber", "Alfa Romeo", "Audi"),
      c("Force India", "Racing Point", "Aston Martin"))

    latest_year <- max(pits_data$Year, na.rm = TRUE)

    team_map <- unlist(lapply(team_family, function(group) {

      tmp <- pits_data %>%
        filter(Team %in% group, Year == latest_year) %>%
        count(Team)

      if (nrow(tmp) == 0) {
        return(setNames(group, group))
      }

      last_team <- tmp %>%
        slice(1) %>%
        pull(Team)

      setNames(rep(last_team, length(group)), group)
    }))

    pits_data2 <- pits_data %>%
      mutate(
        Team = recode(Team, !!!team_map)
      )

  } else {
    message("Team family mode disabled")
    pits_data2 <- pits_data
  }

  ### 0.c ELO base value

  if (is.null(elo) || !is.data.frame(elo)) {

    elo <- data.frame(
      Team = unique(pits_data2$Team),
      Rating = 1000
    )

    message("elo_data is missing or is not a data.frame \nUsing ELO default value (1000)")
  }

  ### 1. Data observations race by race

  team_race <-  pits_data2 %>%
    group_by(Year, Round, Team) %>%
    summarise(
      team_score = param1(Pos.),    # mean: global pitstop pace. This penalty teams without data or teams with drivers penalties in pits to serve
      .groups = "drop"              # min (IGNORES consistency, it refects max-potential, peak-potential)
    )                               # median, default (similar to mean, but you lost sensibility to real extreme data, but more stable than mean for outliers -for one bad pitstop or driver penalty not down a lot your efficiency-

  ### 3. ELO function

  elo_update <- function(RA, RB, S, K = k) { # k: Velocity update. Small time differences, must be low

    EA <- 1 / (1 + c ^ ((RB - RA) / d)) # expected probability, sensibility model, how much i points i must win?

    RA + K * (S - EA)                     # Standard values based from: https://doi.org/10.1016/j.ijforecast.2009.10.002
  }

  ### 4. Order

  schedule <- team_race %>%
    distinct(Year, Round) %>%
    arrange(Year, Round)

  ### 5. race by race ELO calc (argument)

  if (calc == 1) {

    ### loop carreras

    for (ki in 1:nrow(schedule)) {

      y <- schedule$Year[ki]
      r <- schedule$Round[ki]

      race <- team_race %>%
        filter(Year == y, Round == r)

      pairwise <- race %>%
        inner_join(
          race,
          by = c("Year", "Round"),
          relationship = "many-to-many"
        ) %>%
        filter(Team.x < Team.y) %>%
        mutate(
          S = ifelse(team_score.x < team_score.y, 1, 0)
        )

      ### ratings PRE-carrera
      old_ratings <- elo$Rating
      names(old_ratings) <- elo$Team

      ### acumulador de cambios
      delta <- setNames(rep(0, nrow(elo)), elo$Team)

      ### recorrer duelos
      for (i in 1:nrow(pairwise)) {

        A <- pairwise$Team.x[i]
        B <- pairwise$Team.y[i]
        S <- pairwise$S[i]

        RA <- old_ratings[A]
        RB <- old_ratings[B]

        ### calcular nuevos ratings hipotéticos
        new_A <- elo_update(RA, RB, S, K = k)
        new_B <- elo_update(RB, RA, 1 - S, K = k)

        ### convertir a cambios (IMPORTANTÍSIMO)
        change_A <- new_A - RA
        change_B <- new_B - RB

        ### acumular
        delta[A] <- delta[A] + change_A
        delta[B] <- delta[B] + change_B
      }

      ### aplicar TODO al final
      elo$Rating <- elo$Rating + delta
    }

  } else if (calc == 2) {

    for (ki in 1:nrow(schedule)) {

      y <- schedule$Year[ki]
      r <- schedule$Round[ki]

      race <- team_race %>%
        filter(Year == y, Round == r)

      pairwise <- race %>%
        inner_join(
          race,
          by = c("Year", "Round"),
          relationship = "many-to-many"
        ) %>%
        filter(Team.x < Team.y) %>%
        mutate(
          S = ifelse(team_score.x < team_score.y, 1, 0)
        )

      for (i in 1:nrow(pairwise)) {

        A <- pairwise$Team.x[i]
        B <- pairwise$Team.y[i]
        S <- pairwise$S[i]

        RA <- elo$Rating[elo$Team == A]
        RB <- elo$Rating[elo$Team == B]

        elo$Rating[elo$Team == A] <- elo_update(RA, RB, S, K = k)

        elo$Rating[elo$Team == B] <- elo_update(RB, RA, 1 - S, K = k)
      }
    }

  }

  elo_final <- elo %>%
    arrange(desc(Rating))

  return(elo_final)
}