# R/data.R

# Defines functions compile.dataset make.features

#' @importFrom magrittr "%>%"
#' @importFrom rlang .data

#' @title Compile the StatsBomb matches and events tables
#' @description Fetches the free competitions, matches and events through
#'   StatsBombR, unnests the manager columns of `matches`, cleans `events`
#'   with `StatsBombR::allclean()`, splits the goalkeeper and carry end
#'   locations into numeric `.x`/`.y` columns, drops the raw list columns and
#'   turns remaining `"NULL"` cells into `NA`.
#' @return A named list with two data frames: `matches` and `events`.
#' @export
compile.dataset <- function() {
  competitions <- StatsBombR::FreeCompetitions()
  matches <- StatsBombR::FreeMatches(competitions)
  events <- StatsBombR::StatsBombFreeEvents(matches)
  # TODO unpack lists in lineups$lineup
  # lineups <- StatsBombR::StatsBombFreeLineups(matches)

  # Columns are selected by name: the `.data` pronoun is deprecated inside
  # tidyselect contexts such as `unnest(cols = )` and `select()`.
  matches <- tidyr::unnest(
    matches,
    dplyr::all_of(c("home_team.managers", "away_team.managers")),
    names_sep = ".",
    keep_empty = TRUE
  )

  events <- StatsBombR::allclean(events)

  # Pull one coordinate out of every entry of a location list column.
  # Empty entries (NULL) become NA, so the result is always a numeric vector
  # with one value per event.
  location.coordinate <- function(locations, pick) {
    vapply(
      locations,
      function(point) {
        if (length(point) == 0L) NA_real_ else as.numeric(pick(point))
      },
      numeric(1),
      USE.NAMES = FALSE
    )
  }

  events$goalkeeper.end_location.x <-
    location.coordinate(events$goalkeeper.end_location, dplyr::first)
  events$goalkeeper.end_location.y <-
    location.coordinate(events$goalkeeper.end_location, dplyr::last)
  events$carry.end_location.x <-
    location.coordinate(events$carry.end_location, dplyr::first)
  events$carry.end_location.y <-
    location.coordinate(events$carry.end_location, dplyr::last)

  # Raw list columns that are not kept in the compiled dataset.
  dropped.columns <- c(
    "related_events", "location", "tactics.lineup", "pass.end_location",
    "shot.end_location", "shot.freeze_frame", "goalkeeper.end_location",
    "carry.end_location"
  )
  events <- dplyr::select(events, -dplyr::all_of(dropped.columns))

  # Remaining cells that compare equal to the string "NULL" are set to NA.
  events[events == "NULL"] <- NA

  list(matches = matches, events = events)
}

#' @title Derive possession features for every event
#' @description Splits the event stream into offensive phases (consecutive
#'   events sharing `match_id`, `period` and `possession_team.id`) and adds
#'   the columns `OffensivePhase`, `TimeInOffensivePhase`,
#'   `TimeInLastDefensivePhase`, `TotalPossessionTime`,
#'   `TimeInPreviousOffensivePhases`, `PossessionRate`, `PlayerPossession`
#'   and `PlayerPossessionTime`.
#' @param events Data frame of events with the columns `id`, `match_id`,
#'   `period`, `team.id`, `player.id`, `possession_team.id`,
#'   `possession_team.name`, `ElapsedTime` and `TimeInPoss`. Rows are
#'   presumably expected in chronological order, since phases are detected
#'   from runs of consecutive rows.
#' @return The events with the feature columns added. `PlayerPossession`,
#'   `id` and `PlayerPossessionTime` come first, followed by the remaining
#'   columns; the two player columns are `NA` for events whose `team.id`
#'   differs from `possession_team.id`.
#' @export
make.features <- function(events) {
  required.columns <- c(
    "id", "match_id", "period", "team.id", "player.id", "possession_team.id",
    "possession_team.name", "ElapsedTime", "TimeInPoss"
  )
  missing.columns <- setdiff(required.columns, names(events))
  if (length(missing.columns) > 0) {
    stop(
      "`events` is missing required columns: ",
      paste(missing.columns, collapse = ", "),
      call. = FALSE
    )
  }

  # A new phase starts whenever the match, period or possessing team changes.
  events.phased <- events %>%
    dplyr::group_by(
      OffensivePhase = data.table::rleid(
        .data$match_id, .data$period, .data$possession_team.id
      )
    ) %>%
    dplyr::mutate(
      TimeInOffensivePhase = max(.data$ElapsedTime) - min(.data$ElapsedTime)
    ) %>%
    dplyr::ungroup()

  # Exactly one row per phase, in phase order.
  offensive.phases <- events.phased[
    !duplicated(events.phased$OffensivePhase),
    c("match_id", "possession_team.name", "OffensivePhase", "TimeInOffensivePhase")
  ]

  # Duration of the phase right before the event's own phase. The lookup is
  # keyed on the phase id; the very first phase has no predecessor and gets 0.
  previous.phase <- match(
    events.phased$OffensivePhase - 1L,
    offensive.phases$OffensivePhase
  )
  time.in.last.phase <- offensive.phases$TimeInOffensivePhase[previous.phase]
  time.in.last.phase[is.na(previous.phase)] <- 0

  # The phase that opens a (match, period) pair also gets 0, so the last
  # phase of the previous period or match is not carried over.
  opens.period <- !duplicated(events.phased[, c("match_id", "period")])
  first.period.phases <- events.phased$OffensivePhase[opens.period]
  time.in.last.phase[events.phased$OffensivePhase %in% first.period.phases] <- 0
  events.phased$TimeInLastDefensivePhase <- time.in.last.phase

  # Time accumulated in earlier phases: over the whole match, and for the
  # possessing team only. Regrouping on both columns explicitly avoids the
  # deprecated `add` argument of group_by().
  offensive.phases <- offensive.phases %>%
    dplyr::group_by(.data$match_id) %>%
    dplyr::mutate(
      TotalPossessionTime =
        cumsum(.data$TimeInOffensivePhase) - .data$TimeInOffensivePhase
    ) %>%
    dplyr::ungroup() %>%
    dplyr::group_by(.data$match_id, .data$possession_team.name) %>%
    dplyr::mutate(
      TimeInPreviousOffensivePhases =
        cumsum(.data$TimeInOffensivePhase) - .data$TimeInOffensivePhase
    ) %>%
    dplyr::ungroup()

  # Join on the phase id only; the other shared columns (including the
  # floating-point duration) are already present in events.phased.
  phase.totals <- offensive.phases[
    , c("OffensivePhase", "TotalPossessionTime", "TimeInPreviousOffensivePhases")
  ]
  # NOTE(review): the ratio is 0 / 0 (NaN) while both accumulated times and
  # TimeInPoss are zero -- confirm that is acceptable downstream.
  events.phased <- events.phased %>%
    dplyr::left_join(phase.totals, by = "OffensivePhase") %>%
    dplyr::mutate(
      PossessionRate =
        (.data$TimeInPreviousOffensivePhases + .data$TimeInPoss) /
          (.data$TotalPossessionTime + .data$TimeInPoss)
    )

  # Runs of consecutive events by the same player of the possessing team
  # within a phase, and the time spanned by each run.
  player.possessions <- events.phased %>%
    dplyr::filter(.data$team.id == .data$possession_team.id) %>%
    dplyr::group_by(
      PlayerPossession = data.table::rleid(.data$OffensivePhase, .data$player.id)
    ) %>%
    dplyr::transmute(
      id = .data$id,
      PlayerPossessionTime = max(.data$ElapsedTime) - min(.data$ElapsedTime)
    ) %>%
    dplyr::ungroup()

  # right_join keeps every event, including those of the team out of
  # possession, and places the player-possession columns first.
  dplyr::right_join(player.possessions, events.phased, by = "id")
}
# ElSaico/ettie documentation built on Nov. 8, 2019, 1:32 p.m.