knitr::opts_chunk$set(echo = TRUE)
library(magrittr)
library(tidyjson)
library(purrr)
library(dplyr)

Motivation

I want to document my process of exploring a data source and writing a package to scrape it.

Data process

At a high level, my process is to examine the data, write naive parsing systems, then go back and fix the infrastructure once I fully understand the problem. In this case, I have a general sense of what the data should look like (e.g. player stats and so on) but not a great sense of how the NBA.com (or some other data source) data is structured.

A Quora answer took me to the NBA today file.

# Fetch the NBA "today" file and return its `links` element.
#
# Takes no arguments. Returns whatever `jsonlite::fromJSON()` parses under
# the `links` key of today.json (the endpoint links used in the rest of
# this document).
nba_api_data <- function() {
  # TODO: cache this call
  link_data <- jsonlite::fromJSON("http://data.nba.net/10s/prod/v1/today.json")
  link_data$links
}
# Fetch the endpoint links once and preview the first few entries.
api_info <- nba_api_data()
head(api_info)

This produces the up-to-date links to various endpoints in the NBA universe. Variable names are templated here, e.g. {{gameDate}}.

Next, we look at the calendar json. This gives us the number of games per day, but more importantly, it gives a canonical listing of game days (and the date format, never obvious!)

# Download the NBA calendar file and reshape it into a data frame with one
# row per calendar entry: `date` holds the entry name and `games` the entry
# value.
current_calendar <- function() {
  calendar <- jsonlite::fromJSON("http://data.nba.net/10s/prod/v1/calendar.json")
  # The first four elements are skipped -- presumably metadata rather than
  # dates (TODO confirm against the raw JSON).
  day_counts <- calendar[-seq_len(4)]
  # i think this is the number of games per day?
  data.frame(
    date = names(day_counts),
    games = unlist(day_counts),
    row.names = NULL
  )
}
# Build the calendar table once and preview its first rows.
cal_df <- current_calendar()
head(cal_df)

Parsing scoreboards

To get a sense of what scoreboards look like for a given day, let's pick the first day of the season and examine the games. Note that the format of the url is consistent, where we need to swap the date of the game (but now we have a canonical list of all gamedays).

A lot of the data are treated as nested data frames, because apparently it is not enough to say "this game was played in AmericanAirlines Arena". In a database setting, typically we would have a separate arena table and include only an arena ID here. But this is the wild wild west!

# Fetch the scoreboard for 2017-10-01, keep its `games` element and flatten
# the nested data frames into dotted column names (e.g. `vTeam.teamId`).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
game <- jsonlite::fromJSON("http://data.nba.net/10s/prod/v1/20171001/scoreboard.json") %>%
  extract2("games") %>%
  jsonlite::flatten(recursive = TRUE)

This gives us a very wide data frame (many columns), because the API has several nested lists of unimportant data (e.g. broadcast info). This info could be valuable but might be best included in a separate table. We then isolate a number of variables that we know we need.

# Keep only the columns we need, renaming the flattened vTeam.* / hTeam.*
# fields to visitor_* / home_* along the way.
game2 <- dplyr::select(
  game,
  # game-level fields
  seasonStageId,
  seasonYear,
  startDateEastern,
  gameId,
  statusNum,
  startTimeUTC,
  arena.name,
  gameDuration.hours,
  gameDuration.minutes,
  # visiting team
  visitor_id = vTeam.teamId,
  visitor_code = vTeam.triCode,
  visitor_win = vTeam.win,
  visitor_loss = vTeam.loss,
  visitor_series_win = vTeam.seriesWin,
  visitor_series_loss = vTeam.seriesLoss,
  visitor_score = vTeam.score,
  visitor_linescore = vTeam.linescore,
  # home team
  home_id = hTeam.teamId,
  home_code = hTeam.triCode,
  home_win = hTeam.win,
  home_loss = hTeam.loss,
  home_series_win = hTeam.seriesWin,
  home_series_loss = hTeam.seriesLoss,
  home_score = hTeam.score,
  home_linescore = hTeam.linescore
)

We can also break out the linescore, i.e. points scored by quarter.

# Break the linescore out into one column per quarter.
# The linescore columns hold one object per game, so the scores are read
# row by row; `ungroup()` then drops the row-wise grouping so that later
# verbs on `game2` operate on the whole table again.
game2 <- game2 %>%
  dplyr::rowwise() %>%
  dplyr::mutate(
    visitor_score_q1 = visitor_linescore$score[1],
    visitor_score_q2 = visitor_linescore$score[2],
    visitor_score_q3 = visitor_linescore$score[3],
    visitor_score_q4 = visitor_linescore$score[4],
    home_score_q1 = home_linescore$score[1],
    home_score_q2 = home_linescore$score[2],
    home_score_q3 = home_linescore$score[3],
    home_score_q4 = home_linescore$score[4]
  ) %>%
  dplyr::ungroup() %>%
  dplyr::select(-home_linescore, -visitor_linescore)

We can combine all of this to get us a function that parses the games from a given date.

# Download and parse the scoreboard for one game day.
#
# gameday: date as it appears in the scoreboard URL (e.g. "20171001").
#
# Returns a data frame with one row per game: game-level fields, the
# visitor_* / home_* team fields and the per-quarter scores
# (`*_score_q1` .. `*_score_q4`, "-1" when a quarter has no score).
# If any step fails (download, missing columns, ...) a warning naming the
# game day is raised and NULL is returned, so a caller mapping over many
# days can skip the bad day instead of aborting.
parse_games_from_date <- function(gameday) {
  # Score of one quarter, or "-1" when the linescore is missing, empty or
  # has no entry for that quarter.
  quarter_score <- function(linescore, quarter) {
    score <- linescore$score[quarter]
    if (length(score) == 0 || is.na(score)) "-1" else score
  }

  tryCatch({
    game_data_url <- sprintf("http://data.nba.net/10s/prod/v1/%s/scoreboard.json", gameday)
    game <- jsonlite::fromJSON(game_data_url) %>%
      extract2("games") %>%
      jsonlite::flatten(recursive = TRUE)

    # Some scoreboards come without an arena name; default it so the
    # column selection below does not fail on that column.
    if (!("arena.name" %in% names(game))) {
      game$arena.name <- ""
    }

    game %>%
      dplyr::select(
        seasonStageId, seasonYear, startDateEastern, gameId, statusNum,
        startTimeUTC, arena.name, gameDuration.hours, gameDuration.minutes,
        visitor_id = vTeam.teamId, visitor_code = vTeam.triCode,
        visitor_win = vTeam.win, visitor_loss = vTeam.loss,
        visitor_series_win = vTeam.seriesWin,
        visitor_series_loss = vTeam.seriesLoss,
        visitor_score = vTeam.score,
        visitor_linescore = vTeam.linescore,
        home_id = hTeam.teamId, home_code = hTeam.triCode,
        home_win = hTeam.win, home_loss = hTeam.loss,
        home_series_win = hTeam.seriesWin,
        home_series_loss = hTeam.seriesLoss,
        home_score = hTeam.score,
        home_linescore = hTeam.linescore
      ) %>%
      # The linescore columns hold one object per game, so read them row
      # by row and drop the row-wise grouping again afterwards.
      dplyr::rowwise() %>%
      dplyr::mutate(
        visitor_score_q1 = quarter_score(visitor_linescore, 1),
        visitor_score_q2 = quarter_score(visitor_linescore, 2),
        visitor_score_q3 = quarter_score(visitor_linescore, 3),
        visitor_score_q4 = quarter_score(visitor_linescore, 4),
        home_score_q1 = quarter_score(home_linescore, 1),
        home_score_q2 = quarter_score(home_linescore, 2),
        home_score_q3 = quarter_score(home_linescore, 3),
        home_score_q4 = quarter_score(home_linescore, 4)
      ) %>%
      dplyr::ungroup() %>%
      dplyr::select(-home_linescore, -visitor_linescore)
  }, error = function(e) {
    warning(
      sprintf("Could not parse games for %s: %s", gameday, conditionMessage(e)),
      call. = FALSE
    )
    NULL
  })
}

Here, we get the games from 2016-10-02 (dates are passed in the URL's `YYYYMMDD` format):

# Render the parsed games for that day as a table.
knitr::kable(parse_games_from_date("20161002"))

Because we got all the game dates from the calendar, we can then collect data for all the games.

# Game days are the calendar entries with a positive `games` value.
game_dates <- cal_df %>%
  dplyr::filter(games > 0) %>%
  dplyr::pull(date)
# paste() turns the dates into character before the numeric comparison;
# assuming YYYYMMDD dates, this keeps the days before 2017-10-14.
game_dates <- game_dates[as.numeric(paste(game_dates)) < 20171014]
# Parse every remaining game day and row-bind the per-day results.
game_df <- purrr::map_df(game_dates, parse_games_from_date)

Interestingly, our parsing fails on an all-star game, which reminds us that we should be safer with how we parse data. This is typically done by having a default value when a field is missing.

# Fetch and flatten the raw scoreboard for 2017-02-18 so the shape of the
# games table can be inspected. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
fail_game <- jsonlite::fromJSON("http://data.nba.net/10s/prod/v1/20170218/scoreboard.json") %>%
  extract2("games") %>%
  jsonlite::flatten(recursive = TRUE)

knitr::kable(fail_game)

Parsing game stats

Getting play by play info is pretty easy.

# Download the play-by-play data for one game and stack all periods.
#
# date:    game day as it appears in the URL (e.g. "20171013").
# game:    game id as it appears in the URL (e.g. "0011700069").
# periods: period numbers to fetch; defaults to the four quarters. Other
#          numbers can be passed if the endpoint serves them (presumably
#          overtime periods -- not verified here).
#
# Returns the `plays` tables of the requested periods, row-bound in the
# order given by `periods`.
parse_pbp_game <- function(date, game, periods = 1:4) {
  plays_by_period <- lapply(periods, function(period) {
    pbp_url <- sprintf(
      "http://data.nba.net/10s/prod/v1/%s/%s_pbp_%s.json",
      date, game, period
    )
    jsonlite::fromJSON(pbp_url, flatten = TRUE)[["plays"]]
  })
  dplyr::bind_rows(plays_by_period)
}

# Fetch one game's play-by-play and show the plays whose eventMsgType is 1.
pbp_df <- parse_pbp_game("20171013", "0011700069")
dplyr::filter(pbp_df, eventMsgType == 1)


stillmatic/hoopr documentation built on May 29, 2019, 9:51 a.m.