R/helpers-afltables-playerstats.R

Defines functions scrape_afltables_match

#' Return afltables player match stats
#'
#' \code{scrape_afltables_match} returns a character vector containing match URLs for the specified date range
#'
#' This function returns the full afltables.com match stats for each player and each game specified in `match_urls`.
#' It is useful to use the helper function `get_afltables_urls` to return these or simply navigate to afltables.com
#' and find the match of interest.
#'
#' @param match_urls A list of URL's for matches to scrape data from
#' @keywords internal
#' @noRd
scrape_afltables_match <- function(match_urls) {
  
  # For each game url, download data, extract the stats
  # tables #3 and #5 and bind together
  cli::cli_process_start("Downloading data")

  # nolint start
  # pb <- dplyr::progress_estimated(length(match_urls))

  match_xmls <- match_urls %>%
    purrr::map(~ {
      # pb$tick()$print()
      xml2::read_html(.)
    })
  # nolint end

  cli::cli_process_done()
  cli::cli_process_start("Processing XMLS")


  replace_names <- function(x) {
    names(x) <- as.character(x[1, ])
    x[-1, ]
  }

  details <- match_xmls %>%
    purrr::map(rvest::html_nodes, "br+ table td") %>%
    purrr::map(rvest::html_text)

  home_scores <- match_xmls %>%
    purrr::map(rvest::html_nodes, "br+ table tr:nth-child(2) td") %>%
    purrr::map(rvest::html_text)

  away_scores <- match_xmls %>%
    purrr::map(rvest::html_nodes, "br+ table tr:nth-child(3) td") %>%
    purrr::map(rvest::html_text)

  # Check if notes table exists
  notes_tbl <- match_xmls %>%
    purrr::map(rvest::html_nodes, "table:nth-child(10)") %>%
    purrr::map(rvest::html_text) %>%
    purrr::map(purrr::is_empty)

  notes_fn <- function(x) {
    if (x) {
      c(3, 5)
    } else {
      c(4, 6)
    }
  }

  notes_ind <- notes_tbl %>%
    purrr::map(notes_fn)

  games <- match_xmls %>%
    purrr::map(rvest::html_table, fill = TRUE) %>%
    purrr::map2(.y = notes_ind, ~ .x[.y]) %>%
    purrr::modify_depth(1, ~ purrr::map(., replace_names))

  home_games <- games %>%
    purrr::map(1) %>%
    purrr::map2(.y = home_scores, ~ dplyr::mutate(.x, Playing.for = .y[1]))

  away_games <- games %>%
    purrr::map(2) %>%
    purrr::map2(.y = away_scores, ~ dplyr::mutate(.x, Playing.for = .y[1]))

  games <- home_games %>%
    purrr::map2(.y = away_games, ~ dplyr::bind_rows(.x, .y))

  att_lgl <- details %>%
    purrr::map(~ stringr::str_detect(.x[2], "Attendance"))

  att_fn <- function(x) {
    if (x) {
      "(?<=Date:\\s)(.*)(?=\\sAtt)"
    } else {
      "(?<=Date:\\s)(.*)(?=\\s)"
    }
  }

  date_str <- att_lgl %>%
    purrr::map(att_fn)

  args <- list(games, details, date_str)

  games_df <- args %>%
    purrr::pmap(~ dplyr::mutate(..1, Date = stringr::str_extract(..2[2], ..3)))

  games_df <- games_df %>%
    purrr::map2(.y = details, ~ dplyr::mutate(
      .x,
      Round = stringr::str_extract(.y[2], "(?<=Round:\\s)(.*)(?=\\sVenue)"),
      Venue = stringr::str_extract(.y[2], "(?<=Venue:\\s)(.*)(?=\\Date)"),
      Attendance = stringr::str_extract(.y[2], "(?<=Attendance:\\s)(.*)"),
      Umpires = .y[length(.y)]
    ))

  games_df <- games_df %>%
    purrr::map2(.y = home_scores, ~ dplyr::mutate(
      .x,
      Home.team = .y[1],
      HQ1 = .y[2],
      HQ2 = .y[3],
      HQ3 = .y[4],
      HQ4 = .y[5]
    )) %>%
    purrr::map2(.y = away_scores, ~ dplyr::mutate(
      .x,
      Away.team = .y[1],
      AQ1 = .y[2],
      AQ2 = .y[3],
      AQ3 = .y[4],
      AQ4 = .y[5]
    )) %>%
    purrr::reduce(dplyr::bind_rows)

  games_df <- games_df %>%
    dplyr::mutate(Date = gsub("\\([^]]*", "", .data$Date))

  # Remove columns with NA and abbreviations
  games_df <- games_df[, !(names(games_df) %in% "NA")]
  games_df <- games_df[, !(stringr::str_detect(
    names(games_df),
    "Abbreviations"
  ))]

  # Fix names
  names(games_df) <- make.names(names(games_df))

  # nolint start
  if ("X." %in% names(games_df)) games_df <- dplyr::rename(games_df, Jumper.No. = "X.")
  if ("X1." %in% names(games_df)) {
    games_df <- dplyr::rename(games_df,
      One.Percenters = "X1."
    )
  }
  if ("X.P" %in% names(games_df)) games_df <- dplyr::rename(games_df, TOG = "X.P")
  # nolint end

  # change column types
  games_df <- games_df %>%
    dplyr::filter(!.data$Player %in% c("Rushed", "Totals", "Opposition")) %>%
    utils::type.convert(na.strings = "NA", as.is = TRUE)

  games_cleaned <- games_df %>%
    dplyr::mutate(
      Date = lubridate::dmy_hm(.data$Date),
      Local.start.time = as.integer(format(.data$Date, "%H%M")),
      Date = lubridate::ymd(format(.data$Date, "%Y-%m-%d")),
      Season = as.integer(lubridate::year(.data$Date))
    ) %>%
    tidyr::separate("Player",
      into = c("Surname", "First.name"), sep = ","
    ) %>%
    dplyr::mutate_at(c("Surname", "First.name"), stringr::str_squish) %>%
    tidyr::separate("Umpires",
      into = c(
        "Umpire.1", "Umpire.2",
        "Umpire.3", "Umpire.4"
      ),
      sep = ",", fill = "right"
    ) %>%
    dplyr::mutate_at(
      dplyr::vars(dplyr::starts_with("Umpire")),
      stringr::str_replace, " \\(.*\\)", ""
    )

  sep <- function(...) {
    dots <- list(...)
    tidyr::separate(..., into = sprintf(
      "%s%s", dots[[2]],
      c("G", "B", "P")
    ), sep = "\\.")
  }

  score_cols <- c("HQ1", "HQ2", "HQ3", "HQ4", "AQ1", "AQ2", "AQ3", "AQ4")
  games_cleaned <- games_cleaned %>%
    Reduce(f = sep, x = score_cols) %>%
    dplyr::mutate_at(dplyr::vars(dplyr::contains("HQ")), as.integer) %>%
    dplyr::mutate_at(dplyr::vars(dplyr::contains("AQ")), as.integer) %>%
    dplyr::rename(
      Home.score = "HQ4P",
      Away.score = "AQ4P"
    )

  ids <- get_afltables_player_ids(
    min(games_cleaned$Season):max(games_cleaned$Season)
  )

  games_joined <- games_cleaned %>%
    dplyr::mutate(Player = paste(.data$First.name, .data$Surname)) %>%
    dplyr::left_join(ids,
      by = c("Season", "Player", "Playing.for" = "Team")
    ) %>%
    dplyr::select(-"Player")

  df <- games_joined %>%
    dplyr::rename(!!!rlang::syms(with(stat_abbr, setNames(stat.abb, stat))))

  if (!rlang::has_name(games_joined, "Substitute")) {
    afldata_cols <- afldata_cols[afldata_cols != "Substitute"]
  }

  df <- df %>%
    dplyr::select(dplyr::one_of(afldata_cols))

  df <- df %>%
    dplyr::mutate_if(is.numeric, ~ ifelse(is.na(.), 0, .)) %>%
    dplyr::mutate(Round = as.character(.data$Round))

  cli::cli_process_done()
  return(df)
}

#' Return match URLs for specified dates
#'
#' \code{get_afltables_urls} returns a character vector containing match URLs for the specified date range
#'
#' This function returns match URLs for the specified date range. This will typically be used to pass to
#' to `scrape_afltables_match` to return player match results.
#'
#'
#' Returns the ID for the players in the relevent seasons
#'
#' @param start_date character string for start date return to URLs from, in "dmy" or "ymd" format
#' @param end_date optional, character string for end date to return URLS, in "dmy" or "ymd" format
#'
#' @keywords internal
#' @noRd
get_afltables_urls <- function(start_date,
                               end_date = Sys.Date()) {
  start_date <- lubridate::parse_date_time(start_date, c("dmy", "ymd"))

  if (is.na(start_date)) {
    stop(paste(
      "Date format not recognised",
      "Check that start_date is in dmy or ymd format"
    ))
  }
  end_date <- lubridate::parse_date_time(end_date, c("dmy", "ymd"))

  if (is.na(end_date)) {
    stop(paste(
      "Date format not recognised.",
      "Check that end_date is in dmy or ymd format"
    ))
  }

  Seasons <- format(start_date, "%Y"):format(end_date, "%Y")

  url_works <- function(url) {
    tryCatch(
      xml2::read_html(url),
      error = function(e) {
        NULL
      }
    )
  }

  html_games <- Seasons %>%
    purrr::map(~ paste0("https://afltables.com/afl/seas/", ., ".html")) %>%
    purrr::map(url_works)

  html_games <- Filter(Negate(is.null), html_games)

  dates <- html_games %>%
    purrr::map(
      rvest::html_nodes,
      # "table+ table tr:nth-child(1) > td:nth-child(4)"
      "tr:nth-child(1) > td:nth-child(4)"
    ) %>%
    purrr::map(rvest::html_text) %>%
    purrr::map(stringr::str_extract, "\\d{1,2}-[A-z]{3}-\\d{4}") %>%
    purrr::map(lubridate::dmy) %>%
    purrr::map(~ .x[!is.na(.x)]) %>%
    purrr::map(~ .x > start_date & .x < end_date)

  match_ids <- html_games %>%
    # purrr::map(rvest::html_nodes, "tr+ tr b+ a") %>%
    purrr::map(rvest::html_nodes, "tr:nth-child(2) td:nth-child(4) a") %>%
    purrr::map(rvest::html_attr, "href") %>%
    purrr::map(~ stringr::str_replace(., "..", "https://afltables.com/afl"))

  # Return only id's that match
  match_ids <- match_ids %>%
    purrr::map2(.y = dates, ~ magrittr::extract(.x, .y)) %>%
    purrr::reduce(c)

  match_ids[!is.na(match_ids)]
}

#' Get afltables player ids
#'
#' Returns the ID for the players in the relevent seasons
#'
#' @param seasons Seasons to return ids for
#'
#'
#' @keywords internal
#' @noRd
get_afltables_player_ids <- function(seasons) {
  base_url <- function(x) {
    paste0("https://afltables.com/afl/stats/", x, "_stats.txt")
  }

  # nolint start
  git_url <- "https://raw.githubusercontent.com/jimmyday12/fitzroy_data/main/data-raw/afl_tables_playerstats/player_ids.csv"
  # nolint end

  col_vars <- c("Season", "Player", "ID", "Team")

  ids <- git_url %>%
    readr::read_csv(col_types = c("dcdc")) %>%
    dplyr::mutate(ID = as.integer(.data$ID)) %>%
    dplyr::select(!!col_vars) %>%
    dplyr::distinct() %>%
    dplyr::filter(.data$Season %in% seasons)

  # check for new ids
  readUrl <- function(url) {
    out <- tryCatch(readr::read_csv(url,
      col_types = readr::cols(),
      guess_max = 10000
    ),
    error = function(cond) {
      return(data.frame())
    }
    )
    return(out)
  }

  start <- 2017
  
  # need to check if the current years season has started
  current_year <- Sys.Date() %>% format("%Y") %>% as.numeric()
  end <- max(max(seasons), current_year)

  urls <- purrr::map_chr(start:end, base_url)

  ids_new <- urls %>%
    purrr::set_names() %>%
    purrr::map(readUrl) %>%
    purrr::discard(~ nrow(.x) == 0) 

  # Some DFs have numeric columns as 'chr' and some have them as 'dbl',
  # so we need to make them consistent before joining to avoid type errors
  mixed_cols <- c("Round", "Jumper.No.")
  cols_to_convert <- intersect(mixed_cols, colnames(ids_new[[1]]))

  ids_new <- ids_new %>%
    purrr::map_dfr(~ dplyr::mutate_at(., cols_to_convert, as.character),
                   .id = "Season") %>%
    dplyr::mutate(Season = stringr::str_remove(.data$Season, "https://afltables.com/afl/stats/"),
                  Season = stringr::str_remove(.data$Season, "_stats.txt"),
                  Season = as.numeric(.data$Season))
  
  if (nrow(ids_new) < 1) {
    return(ids)
  }

  ids_new <- ids_new %>%
    dplyr::select(!!col_vars) %>%
    dplyr::distinct() %>%
    dplyr::rename(Team.abb = "Team")  %>%
    dplyr::left_join(team_abbr, by = c("Team.abb" = "Team.abb")) %>%
    dplyr::select(!!col_vars)
  
  ids_new_min <- min(ids_new$Season)
  ids_new_max <- max(ids_new$Season)
  
  ids_old <- ids %>%
    dplyr::filter(.data$Season < ids_new_min | .data$Season > ids_new_max)
  
  if (nrow(ids_old) < 1) {
    return(dplyr::distinct(ids_new))
  } else {
    ids <- dplyr::bind_rows(ids_old, ids_new) %>%
      dplyr::distinct()
    
    return(ids)
  }

  
}
jimmyday12/fitzRoy documentation built on March 26, 2024, 8:56 p.m.