R/scrape_nhl_schedule.r

Defines functions get_espn_schedule_dates get_espn_schedule get_espn_game_id get_nhl_schedule

Documented in get_nhl_schedule

#' This function extracts a list of all game ids for a given date range from the NHL.com API
#'
#' @param start_date the start date of the desired interval
#' @param end_date the end date of the desired interval
#' @export
get_nhl_schedule <- function(start_date = lubridate::today(), end_date = start_date, add_espn_ids = FALSE, verbose = NULL){

  json_schedule <- NULL
  iattempts <- 5

  ## Attempt to grab the schedule from the NHL API
  while (is.null(json_schedule) & iattempts > 0){
    schedule_url <-
      paste0("https://statsapi.web.nhl.com/api/v1/schedule?startDate=",
             start_date,
             "&endDate=",
             end_date)

    json_schedule <-
      tryCatch(
        jsonlite::fromJSON(schedule_url),
        error = function(e) {NULL}
      )

    iattempts <- iattempts - 1
  }

  # Check if there is data available
  if (length(json_schedule$dates$games) == 0) {
    warning("No NHL Schedule Data Returned")

    return(NULL)
  }
  else {

    final_schedule <-
      dplyr::bind_rows(json_schedule$dates) %>%
      tibble::as_tibble(.name_repair = unique) %>%
      tidyr::unnest(games) %>%
      jsonlite::flatten() %>%
      dplyr::select(
        game_id = gamePk,
        season,
        feed_api = link,
        game_type = gameType,
        game_date = date,
        game_start = gameDate,
        game_status = status.abstractGameState,
        coded_game_state = status.codedGameState,
        game_status_detail = status.detailedState,
        status_code = status.statusCode,
        start_is_tbd = status.startTimeTBD,
        home_team_id = dplyr::any_of("teams.home.team.id"),
        home_team = dplyr::any_of("teams.home.team.name"),
        home_team_api = dplyr::any_of("teams.home.team.link"),
        home_team_score = dplyr::any_of("teams.home.score"),
        home_team_wins = dplyr::any_of("teams.home.leagueRecord.wins"),
        home_team_losses = dplyr::any_of("teams.home.leagueRecord.losses"),
        home_team_ot_losses = dplyr::any_of("teams.home.leagueRecord.ot"),
        home_team_record_type = dplyr::any_of("teams.home.leagueRecord.type"),
        away_team_id = dplyr::any_of("teams.away.team.id"),
        away_team = dplyr::any_of("teams.away.team.name"),
        away_team_api = dplyr::any_of("teams.away.team.link"),
        away_team_score = dplyr::any_of("teams.away.score"),
        away_team_wins = dplyr::any_of("teams.away.leagueRecord.wins"),
        away_team_losses = dplyr::any_of("teams.away.leagueRecord.losses"),
        away_team_ot_losses = dplyr::any_of("teams.away.leagueRecord.ot"),
        away_team_record_type = dplyr::any_of("teams.away.leagueRecord.type"),
        venue_id = dplyr::any_of("venue.id"),
        venue_name = dplyr::any_of("venue.name"),
        venue_api = dplyr::any_of("venue.link"),
        content_api = dplyr::any_of("content.link")) %>%
      dplyr::mutate(game_date = as.Date(game_date))

    if(add_espn_ids == TRUE){

      raw_espn_games <- get_espn_schedule_dates(unique(final_schedule$game_date), verbose = verbose)
      team_ids <- get_nhl_api_teams()[c("team_id", "team_name")]

      espn_games <-
        raw_espn_games %>%
        dplyr::left_join(team_ids, by = c("home" = "team_name")) %>%
        dplyr::rename(home_team_id = team_id) %>%
        dplyr::left_join(team_ids, by = c("away" = "team_name")) %>%
        dplyr::rename(away_team_id = team_id) %>%
        dplyr::select(
          game_date,
          espn_game_id = game_id,
          home_team_id,
          away_team_id
        ) %>%
        dplyr::mutate(game_date = as.Date(x = game_date, format = "%Y%m%d"))

      espn_home_games <-
        espn_games %>%
        dplyr::select(
          game_date,
          espn_game_id,
          home_team_id
        )

      espn_away_games <-
        espn_games %>%
        dplyr::select(
          game_date,
          espn_game_id,
          away_team_id
        )

      final_schedule <-
        final_schedule %>%
        dplyr::left_join(espn_games, by = c("game_date", "home_team_id", "away_team_id")) %>%
        dplyr::rename(combined_espn_id = espn_game_id) %>%
        dplyr::left_join(espn_home_games, by = c("game_date", "home_team_id")) %>%
        dplyr::rename(home_espn_id = espn_game_id) %>%
        dplyr::left_join(espn_away_games, by = c("game_date", "away_team_id")) %>%
        dplyr::rename(away_espn_id = espn_game_id) %>%
        dplyr::mutate(espn_game_id = dplyr::coalesce(combined_espn_id, home_espn_id, away_espn_id), .after = game_id) %>%
        dplyr::select(-one_of("combined_espn_id", "home_espn_id", "away_espn_id"))
    }

    return(final_schedule)
  }


}

get_espn_game_id <- function(game_id) {

  all_schedules <- NULL

  game_list <-
    game_id %>%
    tibble::as_tibble_col(column_name = "game_id")

  year_list <-
    stringr::str_extract_all(game_id, "^[0-9]{4}", simplify = TRUE) %>%
    unique() %>%
    tibble::as_tibble_col(column_name = "year") %>%
    dplyr::mutate(schedule_year = paste0(year, as.numeric(year) + 1))

  for(iSeason in year_list$schedule_year){

    raw_json <- NULL
    iattempts <- 5

    while (is.null(raw_json) & iattempts > 0){
      schedule_url <-
        paste0("https://statsapi.web.nhl.com/api/v1/schedule?season=",
               iSeason
               )

      raw_json <-
        tryCatch(
          jsonlite::fromJSON(schedule_url),
          error = function(e) {NULL}
        )

      iattempts <- iattempts - 1
    }

    if (! is.null(raw_json) ) {

      parsed_json <-
        dplyr::bind_rows(raw_json$dates) %>%
        tibble::as_tibble(.name_repair = unique) %>%
        tidyr::unnest(games) %>%
        jsonlite::flatten() %>%
        dplyr::select(
          game_id = gamePk,
          game_date = date,
          home_team_id = teams.home.team.id,
          away_team_id = teams.away.team.id) %>%
        dplyr::mutate(game_date = as.Date(x = game_date, format = "%Y-%m-%d"))
    } else {
      parsed_json <- NULL
    }

    all_schedules <-
      dplyr::bind_rows(
        all_schedules,
        parsed_json
      )

  }

  appended_games <-
    dplyr::left_join(x = game_list, y = all_schedules, by = "game_id")

  date_list <-
    appended_games$game_date %>%
    unique()

  raw_espn_games <- get_espn_schedule_dates(date_list)
  team_ids <- get_nhl_api_teams()[c("team_id", "team_name")]

  espn_games <-
    raw_espn_games %>%
    dplyr::left_join(team_ids, by = c("home" = "team_name")) %>%
    dplyr::rename(home_team_id = team_id) %>%
    dplyr::left_join(team_ids, by = c("away" = "team_name")) %>%
    dplyr::rename(away_team_id = team_id) %>%
    dplyr::select(
      game_date,
      espn_game_id = game_id,
      home_team_id,
      away_team_id
    ) %>%
    dplyr::mutate(game_date = as.Date(x = game_date, format = "%Y%m%d"))

  espn_home_games <-
    espn_games %>%
    dplyr::select(
      game_date,
      espn_game_id,
      home_team_id
    )

  espn_away_games <-
    espn_games %>%
    dplyr::select(
      game_date,
      espn_game_id,
      away_team_id
    )

  appended_games <-
    appended_games %>%
    dplyr::left_join(espn_games, by = c("game_date", "home_team_id", "away_team_id")) %>%
    dplyr::rename(combined_espn_id = espn_game_id) %>%
    dplyr::left_join(espn_home_games, by = c("game_date", "home_team_id")) %>%
    dplyr::rename(home_espn_id = espn_game_id) %>%
    dplyr::left_join(espn_away_games, by = c("game_date", "away_team_id")) %>%
    dplyr::rename(away_espn_id = espn_game_id) %>%
    dplyr::mutate(espn_game_id = dplyr::coalesce(combined_espn_id, home_espn_id, away_espn_id), .after = game_id) %>%
    dplyr::select(game_id, espn_game_id)

  return(appended_games)
}

get_espn_schedule <- function(start_date = lubridate::today(), end_date = start_date, verbose = NULL) {

  if(is.null(verbose)){
    verbose_lower <- "na"
  } else {
    verbose_lower <- tolower(verbose)
  }

  date_list <-
    seq(from = as.Date(start_date), to = as.Date(end_date), by = 1) %>%
    stringr::str_remove_all("-|\\s|/")

  raw_schedule <- NULL
  espn_schedule <- NULL
  iattempts <- 5

  for (iDate in date_list){
    if(verbose_lower %in% c("detail")){
      cat("Scraping ESPN Schedule for ", iDate, "\n", sep = "")
    }

    base_url <-
      paste0("https://www.espn.com/nhl/scoreboard?date=", iDate)

    while (is.null(raw_schedule) & iattempts > 0){

      raw_schedule <-
        tryCatch(
          xml2::read_html(base_url),
          error = function(e) {NULL}
        )

      iattempts <- iattempts - 1
    }

    if (is.null(raw_schedule)) {
      warning("No NHL Schedule Data Returned")

    } else {

      espn_game_ids <-
        raw_schedule %>%
        rvest::html_nodes(".Scoreboard") %>%
        rvest::html_attr("id")

      espn_home_teams <-
        raw_schedule %>%
        rvest::html_nodes(".ScoreboardScoreCell__Item--home") %>%
        rvest::html_nodes(".ScoreCell__TeamName") %>%
        rvest::html_text2()

      espn_away_teams <-
        raw_schedule %>%
        rvest::html_nodes(".ScoreboardScoreCell__Item--away") %>%
        rvest::html_nodes(".ScoreCell__TeamName") %>%
        rvest::html_text2()

      espn_date <-
        tibble::tibble(
          game_id = espn_game_ids,
          home = espn_home_teams,
          away = espn_away_teams) %>%
        dplyr::mutate(game_date = iDate, .before = game_id)

      espn_schedule <-
        dplyr::bind_rows(
          espn_schedule,
          espn_date
        )

      base_url <- NULL
      raw_schedule <- NULL
      iattempts <- 5

    }

  }

  return(espn_schedule)

}

get_espn_schedule_dates <- function(dates, verbose = NULL) {

  if(is.null(verbose)){
    verbose_lower <- "na"
  } else {
    verbose_lower <- tolower(verbose)
  }

  date_list <-
    dates %>%
    stringr::str_remove_all("-|\\s|/")

  raw_schedule <- NULL
  espn_schedule <- NULL
  iattempts <- 5

  for (iDate in date_list){
    if(verbose_lower %in% c("detail")){
      cat("Scraping ESPN Schedule for ", iDate, "\n", sep = "")
    }

    base_url <-
      paste0("https://www.espn.com/nhl/scoreboard?date=", iDate)

    while (is.null(raw_schedule) & iattempts > 0){

      raw_schedule <-
        tryCatch(
          xml2::read_html(base_url),
          error = function(e) {NULL}
        )

      iattempts <- iattempts - 1
    }

    if (is.null(raw_schedule)) {
      warning("No NHL Schedule Data Returned")

    } else {

      espn_game_ids <-
        raw_schedule %>%
        rvest::html_nodes(".Scoreboard") %>%
        rvest::html_attr("id")

      espn_home_teams <-
        raw_schedule %>%
        rvest::html_nodes(".ScoreboardScoreCell__Item--home") %>%
        rvest::html_nodes(".ScoreCell__TeamName") %>%
        rvest::html_text2()

      espn_away_teams <-
        raw_schedule %>%
        rvest::html_nodes(".ScoreboardScoreCell__Item--away") %>%
        rvest::html_nodes(".ScoreCell__TeamName") %>%
        rvest::html_text2()

      espn_date <-
        tibble::tibble(
          game_id = espn_game_ids,
          home = espn_home_teams,
          away = espn_away_teams) %>%
        dplyr::mutate(game_date = iDate, .before = game_id)

      espn_schedule <-
        dplyr::bind_rows(
          espn_schedule,
          espn_date
        )

      base_url <- NULL
      raw_schedule <- NULL
      iattempts <- 5

    }

  }

  return(espn_schedule)

}
EFastner/icescrapR documentation built on Jan. 15, 2022, 1:11 p.m.