R/scrape_nhl_pbp.r

Defines functions parse_nhl_espn_events get_nhl_espn_data parse_nhl_api_events get_nhl_api_events parse_nhl_html_events get_nhl_html_events

#################### NHL HTML ####################

get_nhl_html_events <- function(game_id){

  season <-
    paste0(
      stringr::str_sub(game_id, start = 1, end = 4),
      as.numeric(stringr::str_sub(game_id, start = 1, end = 4)) + 1
    )

  season_game <-
    stringr::str_sub(game_id, start = -6, end = -1)

  raw_html <- NULL

  iattempts <- 5

  base_url <- paste0("http://www.nhl.com/scores/htmlreports/", season, "/PL", season_game, ".HTM")

  while (! is.character(raw_html) & iattempts > 0){
    raw_html <-
      tryCatch(
        xml2::read_html(base_url),
        error = function(e) {NULL}
      )

    iattempts <- iattempts -1
  }

  if(length(raw_html) == 0) {
    warning("HTML Game Data Not Available")

    return(NULL)
  } else {
    return(raw_html)
  }
}

parse_nhl_html_events <- function(raw_html_data, game_id = NULL){

  event_col_names <-
    c(
      "event_id",
      "period",
      "strength",
      "time_elapsed",
      "event_type",
      "description",
      "away_on_ice",
      "home_on_ice"
    )

  raw_html_odds <-
    raw_html_data %>%
    rvest::html_elements(".oddColor") %>%
    rvest::html_children() %>%
    rvest::html_text() %>%
    matrix(ncol = 8, byrow = TRUE, dimnames = list(NULL, event_col_names)) %>%
    tibble::as_tibble(.name_repair = "unique")

  raw_html_evens <-
    raw_html_data %>%
    rvest::html_elements(".evenColor") %>%
    rvest::html_children() %>%
    rvest::html_text() %>%
    matrix(ncol = 8, byrow = TRUE, dimnames = list(NULL, event_col_names)) %>%
    tibble::as_tibble(.name_repair = "unique")

  pbp_table <-
    dplyr::bind_rows(raw_html_odds, raw_html_evens) %>%
    dplyr::mutate(
      time_elapsed_period =
        ifelse(
          nchar(stringr::str_extract(time_elapsed, "^[0-9]{1,2}:[0-9]{2}")) == 4,
          paste0("0", stringr::str_extract(time_elapsed, "^[0-9]{1,2}:[0-9]{2}")),
          stringr::str_extract(time_elapsed, "^[0-9]{1,2}:[0-9]{2}")
          ),
      time_remaining_period =
        ifelse(
          nchar(stringr::str_extract(time_elapsed, "(?<=:[0-9]{2})[0-9]{1,2}:[0-9]{2}$")) == 4,
          paste0("0", stringr::str_extract(time_elapsed, "(?<=:[0-9]{2})[0-9]{1,2}:[0-9]{2}$")),
          stringr::str_extract(time_elapsed, "(?<=:[0-9]{2})[0-9]{1,2}:[0-9]{2}$")
          ),
      home_on_ice = stringr::str_replace_all(stringr::str_trim(home_on_ice), "\\s+", "|"),
      away_on_ice = stringr::str_replace_all(stringr::str_trim(away_on_ice), "\\s+", "|")
    ) %>%
    dplyr::select(!time_elapsed) %>%
    dplyr::arrange(as.numeric(event_id))

  if (! is.null(game_id)){

    season <-
      paste0(
        stringr::str_sub(game_id, start = 1, end = 4),
        as.numeric(stringr::str_sub(game_id, start = 1, end = 4)) + 1
      )

    season_game <-
      stringr::str_sub(game_id, start = -6, end = -1)

    pbp_table <-
      pbp_table %>%
      dplyr::mutate(
        game_id = game_id,
        season = season,
        season_game = season_game,
        .before = event_id
      )
  }

  if(nrow(pbp_table) == 0){
    warning("HTML Game Data Not Available")

    return(NULL)
  }

  return(pbp_table)
}

#################### NHL API ####################

get_nhl_api_events <- function(game_id){

  raw_feed <- NULL
  iattempts <- 5

  feed_url <-
    paste0("https://statsapi.web.nhl.com/api/v1/game/",
           game_id,
           "/feed/live")

  while(! is.character(raw_feed) & iattempts > 0){
    raw_feed <-
      tryCatch(
        jsonlite::fromJSON(feed_url),
        error = function(e) {NULL}
      )

    iattempts <- iattempts - 1
  }

  if(length(raw_feed$liveData$plays$allPlays) == 0){
    warning("No NHL Game Data Returned")

    return(NULL)
  } else {
    return(raw_feed)
  }
}

parse_nhl_api_events <- function(api_events_feed){

  event_conversion <-
    tibble::tribble(
      ~event_name,                 ~event_type,
      "GAME_SCHEDULED",            "SCHED",
      "PERIOD_READY",              "PRDY",
      "PERIOD_START",              "PSTR",
      "FACEOFF",                   "FAC",
      "MISSED_SHOT",               "MISS",
      "SHOT",                      "SHOT",
      "STOP",                      "STOP",
      "GIVEAWAY",                  "GIVE",
      "HIT",                       "HIT",
      "PENALTY",                   "PENL",
      "BLOCKED_SHOT",              "BLOCK",
      "TAKEAWAY",                  "TAKE",
      "GOAL",                      "GOAL",
      "PERIOD_END",                "PEND",
      "PERIOD_OFFICIAL",           "POFF",
      "GAME_END",                  "GEND",
      "GAME_OFFICIAL",             "GOFF",
      "SHOOTOUT_COMPLETE",         "SOC"
    )

  all_api_plays <- api_events_feed$liveData$plays$allPlays

  flat_events <-
    all_api_plays %>%
    jsonlite::flatten() %>%
    dplyr::as_tibble() %>%
    tidyr::unnest_wider(players) %>%
    tidyr::unnest_wider(player) %>%
    tidyr::unnest_wider(id, names_sep = "_", names_repair = "unique") %>%
    tidyr::unnest_wider(playerType, names_sep = "_", names_repair = "unique") %>%
    tidyr::unnest_wider(fullName, names_sep = "_", names_repair = "unique") %>%
    tidyr::unnest_wider(link, names_sep = "_", names_repair = "unique")

  if(flat_events$seasonTotal %>% length() > 0){
    flat_events <-
      flat_events %>%
      tidyr::unnest_wider(seasonTotal, names_sep = "_", names_repair = "unique")
  }

  flat_events <-
    flat_events %>%
    dplyr::mutate(event_id = dplyr::row_number(),
                  game_id = api_events_feed$gamePk,
                  season =
                    paste0(
                      stringr::str_sub(api_events_feed$gamePk, start = 1, end = 4),
                      as.numeric(stringr::str_sub(api_events_feed$gamePk, start = 1, end = 4)) + 1
                    ),
                  season_game =
                    stringr::str_sub(api_events_feed$gamePk, start = -6, end = -1)
    ) %>%
    dplyr::select(
      dplyr::any_of("game_id"),
      dplyr::any_of("season"),
      dplyr::any_of("season_game"),
      dplyr::any_of("event_id"),
      event_index_id = dplyr::any_of("about.eventIdx"),
      nhl_event_id = dplyr::any_of("about.eventId"),
      event_at = dplyr::any_of("about.dateTime"),
      period_num = dplyr::any_of("about.period"),
      time_elapsed_period = dplyr::any_of("about.periodTime"),
      time_remaining_period = dplyr::any_of("about.periodTimeRemaining"),
      event_name = dplyr::any_of("result.eventTypeId"),
      event_description = dplyr::any_of("result.description"),
      event_detail = dplyr::any_of("result.secondaryType"),
      goal_strength = dplyr::any_of("result.strength.code"),
      is_game_winning_goal = dplyr::any_of("result.gameWinningGoal"),
      is_empty_net_goal = dplyr::any_of("result.emptyNet"),
      penalty_severity = dplyr::any_of("result.penaltySeverity"),
      penalty_minutes = dplyr::any_of("result.penaltyMinutes"),
      home_score = dplyr::any_of("about.goals.home"),
      away_score = dplyr::any_of("about.goals.away"),
      coordinates_x = dplyr::any_of("coordinates.x"),
      coordinates_y = dplyr::any_of("coordinates.y"),
      dplyr::starts_with("id_"),
      dplyr::starts_with("fullName_"),
      dplyr::starts_with("playerType_"),
      team_id = team.id,
      team_name = team.name,
      team_code = team.triCode
    ) %>%
    dplyr::left_join(x = ., y = event_conversion, by = c("event_name" = "event_name")) %>%
    dplyr::relocate(event_type, .after = event_name) %>%
    dplyr::rename_with(.fn = stringr::str_replace, .cols = everything(), "id_", "player_id_") %>%
    dplyr::rename_with(.fn = stringr::str_replace, .cols = everything(), "fullName", "player_name") %>%
    dplyr::rename_with(.fn = stringr::str_replace, .cols = everything(), "playerType", "player_type") %>%
    dplyr::arrange(event_at)

  flat_events$period_num <- as.character(flat_events$period_num)

  if(nrow(flat_events) == 0){
    warning("No ESPN data available")

    return(NULL)
  }

  return(flat_events)
}

#################### ESPN ####################

get_nhl_espn_data <- function(espn_game_id){

  base_URL <-
    paste0("https://www.espn.com/nhl/gamecast/data/masterFeed?lang=en&isAll=true&rand=0&gameId=",
           espn_game_id)

  raw_page <- NULL
  iattempts <- 5

  while(! is.character(raw_page) & iattempts > 0){

    raw_page <-
      tryCatch(
        RCurl::getURL(
          url = base_URL
        ),
        error = function(e) {NULL}
      )

    iattempts = iattempts - 1

  }

  if(length(raw_page) == 0){
    warning("No ESPN Game Data Returned")

    return(NULL)
  } else {

    converted_page <-
      tryCatch(
        xml2::read_xml(raw_page),
        error = function(e) {NULL}
      )

    if (is.null(converted_page)){
      warning("ESPN XML is Malformed. Exporting entire page as character")

      converted_page <-
        raw_page

    }

  }

  return(converted_page)
}

parse_nhl_espn_events <- function(raw_espn_data, nhl_game_id = NULL){

  espn_plays_columns <-
    c(
      "coordinates_x",            # X1
      "coordinates_y",            # X2
      "espn_type",                # X3
      "time",                     # X4
      "period_num",               # X5
      "espn_plays_6",             # X6
      "espn_plays_7",             # X7
      "espn_plays_8",             # X8
      "event_description",        # X9
      "espn_plays_10",            # X10
      "espn_plays_11",            # X11
      "espn_plays_12",            # X12
      "espn_plays_13",            # X13
      "espn_plays_14",            # X14
      "espn_team_id",             # X15
      "espn_plays_16",            # X16
      "espn_plays_17",            # X17
      "espn_plays_18",            # X18
      "espn_plays_19",            # X19
      "espn_plays_20"             # X20
    )

  if (is.character(raw_espn_data)){

    play_ids <-
      raw_espn_data %>%
      stringr::str_extract_all("(?<=Play id=\")[0-9]+(?=\">)") %>%
      unlist() %>%
      tibble::tibble(espn_game_id = .)

    play_data <-
      raw_espn_data %>%
      as.character() %>%
      stringr::str_extract_all("<!\\[CDATA\\[.+?>") %>%
      sapply(stringr::str_remove,"<!\\[CDATA\\[", simplify = TRUE) %>%
      sapply(stringr::str_remove, "\\]{2}>", simplify = TRUE) %>%
      dplyr::as_tibble(.name_repair = unique)

    parsed_plays <-
      play_data %>%
      dplyr::filter(!stringr::str_detect(play_data$value, "([0-9]{1,2}:[0-9]{2} PM|[0-9]{1,2}:[0-9]{2} AM|http://)")) %>%
      tidyr::separate(col = value, into = espn_plays_columns, sep = "~") %>%
      suppressMessages()

    final_events <- dplyr::bind_cols(play_ids, parsed_plays) %>% suppressMessages()

  } else {
    espn_plays <-
      raw_espn_data %>%
      xml2::xml_child(search = "Plays") %>%
      xml2::xml_text()

    if (espn_plays == "") {

      warning("No Play Data Included in Raw Data")
      return(NULL)

    } else {

      play_ids <-
        raw_espn_data %>%
        xml2::xml_child(search = "Plays") %>%
        xml2::xml_children() %>%
        xml2::xml_attr(attr = "id") %>%
        tibble::as_tibble_col(column_name = "espn_play_id") %>%
        dplyr::mutate(
          espn_game_id = stringr::str_sub(espn_play_id, start = 1, end = 9),
          .before = espn_play_id
        )

      parsed_plays <-
        raw_espn_data %>%
        xml2::xml_child(search = "Plays") %>%
        XML::xmlParse() %>%
        XML::xpathSApply("//text()", XML::xmlValue) %>%
        tibble::as_tibble(.name_repair = "unique") %>%
        tidyr::separate(col = value, into = espn_plays_columns, sep = "~", extra = "warn") %>%
        suppressMessages()

      final_events <- dplyr::bind_cols(play_ids, parsed_plays) %>% suppressMessages()
    }
  }

  if (! is.null(nhl_game_id)){
    final_events <-
      final_events %>%
      dplyr::mutate(
        game_id = nhl_game_id,
        season =
          paste0(
            stringr::str_sub(game_id, start = 1, end = 4),
            as.numeric(stringr::str_sub(game_id, start = 1, end = 4)) + 1
          ),
        season_game = stringr::str_sub(game_id, start = -6, end = -1),
        .before = coordinates_x
      )
  }

  if(nrow(final_events) == 0){
    warning("ESPN play data not available")

    return(NULL)
  }

  return(final_events)
}

parse_nhl_espn_rosters <- function(raw_espn_data){

  espn_skater_columns <-
    c(
      "espn_player_id",            # X1
      "espn_roster_2",             # X2
      "player_name",               # X3
      "player_number",             # X4
      "espn_team_id",              # X5
      "player_position",           # X6
      "player_picture_url",        # X7
      "espn_roster_8",             # X8
      "espn_roster_9",             # X9
      "espn_roster_10",            # X10
      "espn_roster_11",            # X11
      "espn_roster_12",            # X12
      "espn_roster_13",            # X13
      "espn_roster_14",            # X14
      "espn_roster_15",            # X15
      "espn_roster_16",            # X16
      "espn_roster_17",            # X17
      "espn_roster_18",            # X18
      "espn_roster_19",            # X19
      "espn_roster_20",            # X20
      "espn_roster_21",            # X21
      "espn_roster_22",            # X22
      "espn_roster_23",            # X23
      "espn_roster_24",            # X24
      "espn_roster_25",            # X25
      "espn_roster_26",            # X26
      "espn_roster_27"             # X27
    )


  espn_goalie_columns <-
    c(
      "espn_player_id",            # X1
      "espn_roster_2",             # X2
      "player_name",               # X3
      "player_number",             # X4
      "espn_team_id",              # X5,
      "player_position",           # X6
      "player_picture_url",        # X7
      "espn_roster_8",             # X8
      "espn_roster_9",             # X9
      "espn_roster_10",            # X10
      "espn_roster_11",            # X11
      "espn_roster_12",            # X12
      "espn_roster_13",            # X13
      "espn_roster_14",            # X14
      "espn_roster_15",            # X15
      "espn_roster_16",            # X16
      "espn_roster_17"            # X17
    )

  if (is.character(raw_espn_data)){

    parsed_data <-
      raw_espn_data %>%
      as.character() %>%
      stringr::str_extract_all("<!\\[CDATA\\[.+?>") %>%
      sapply(stringr::str_remove,"<!\\[CDATA\\[", simplify = TRUE) %>%
      sapply(stringr::str_remove, "\\]{2}>", simplify = TRUE) %>%
      dplyr::as_tibble(.name_repair = unique)

    parsed_roster <-
      parsed_data %>%
      dplyr::filter(stringr::str_detect(parsed_data$value, "http://")) %>%
      tidyr::separate(col = value, into = espn_skater_columns, sep = "~", extra = "warn", fill = "right") %>%
      dplyr::mutate(player_position = stringr::str_trim(.$player_position))

  } else {

    parsed_roster <-
      raw_espn_data %>%
      xml2::xml_child(search = "Players") %>%
      XML::xmlParse() %>%
      XML::xpathSApply("//text()", XML::xmlValue) %>%
      tibble::as_tibble(.name_repair = "unique") %>%
      tidyr::separate(col = value, into = espn_skater_columns, sep = "~", extra = "warn", fill = "right") %>%
      dplyr::mutate(player_position = stringr::str_trim(.$player_position))

  }

  skaters <-
    parsed_roster %>%
    dplyr::filter(player_position != "G")

  goalies <-
    parsed_roster %>%
    dplyr::filter(player_position == "G") %>%
    dplyr::select(1:17)

  colnames(goalies) <- espn_goalie_columns

  return(list(skaters, goalies))
}
EFastner/icescrapR documentation built on Jan. 15, 2022, 1:11 p.m.