knitr::opts_chunk$set(echo = TRUE)
old <- options(rmarkdown.html_vignette.check_title = FALSE)

We will be acquiring data from gamezone.stats.com, using the gamezoneR package, created by Jack Lichtenstein.

R & RStudio

This tutorial will require the use of R and RStudio. You can follow the instructions at R Studio on how to get started.

Import libraries

library(dplyr)
library(ggplot2)

# set a ggplot2 theme
theme_set(
  theme_bw() + 
    theme(plot.title = element_text(face = "bold", size = 32/.pt, hjust = 0), 
          plot.subtitle = element_text(face = "italic", size = 26/.pt), 
          plot.caption = element_text(face = "italic", size = 20/.pt), 
          strip.background = element_rect(color = "black", size = 3, linetype = "blank"),
          strip.text = element_text(face = "bold", size = 24/.pt, hjust = 0), 
          panel.grid.minor = element_blank(), 
          panel.border = element_blank(), 
          axis.ticks = element_blank(), 
          axis.text = element_text(size = 24/.pt), 
          axis.title = element_text(face = "bold", size = 26/.pt), 
          legend.title = element_text(face = "bold", size = 26/.pt), 
          legend.text = element_text(size = 24/.pt)))

Scraping schedules

There are two functions to scrape schedules:

The gamezoneR::gamezone_mbb_master_schedule() function has a parameter for ranked_games, which is a logical argument specifying whether or not to scrape only the games on that date where at least one team is ranked in the AP top 25 poll. Turning this off will scrape all games from the inputted date, regardless of whether a team is ranked or not.

Please note that specifying ranked_games = T increases the speed of the function because it utilizes the underlying API, while ranked_games = F does not.

# first, the ranked games for a given date
schedule <- gamezoneR::gamezone_mbb_master_schedule(date = "2021-03-10", ranked_games = T)

head(schedule)

# then, all the games for a given date
schedule <- gamezoneR::gamezone_mbb_master_schedule(date = "2021-03-10", ranked_games = F)

head(schedule)

We are able to scrape games by date, but also by team using the gamezoneR::get_team_schedule() function.

duke_schedule <- gamezoneR::gamezone_mbb_team_schedule(team = "Duke", season = "2018-19")

head(duke_schedule)

Scraping the play-by-play

Now that we have a schedule to work with, we can use the gamezoneR::gamezone_mbb_pbp() function to scrape play-by-play data for individual gameIds. The argument sub_parse is a logical parameter specifying whether or not to attempt to parse substitution and lineup data. If TRUE, the play-by-play will be appended to include columns home_1 through away_5, specifying the names of the 10 players on the floor.

Please note that GameZone does not track all substitutions. As a result, a new column, sub_error will reside in the appended play-by-play data. I have not come across many games with no such errors.

pbp <- gamezoneR::gamezone_mbb_pbp(duke_schedule$game_id[1],
                                   sub_parse = T)

pbp %>% 
  dplyr::count(sub_error)

head(pbp)

Now, let's scrape all of Duke's games from the Zion Williamson 2018-19 season using purrr::map_df().

duke_pbp <- purrr::map_df(duke_schedule$game_id, 
                          gamezoneR::gamezone_mbb_pbp, sub_parse = F)

Let's plot some shot charts. We will use the built in gamezoneR::base_court ggplot court.

duke_shots <- duke_pbp %>% 
  dplyr::filter(!is.na(loc_x),
                shooter == "Zion Williamson")

gamezoneR::base_court +
  geom_point(data = duke_shots,
             aes(loc_x, loc_y, color = shot_outcome),
             alpha = 0.8) +
  theme(axis.line = element_blank(),
        axis.text= element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        panel.background = element_blank(),
        panel.grid = element_blank(),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 30/.pt, margin = margin(0, 0, 5, 0)),
        plot.subtitle = element_text(face = "italic", hjust = 0.5, size = 24/.pt),
        plot.caption = element_text(face = "italic", hjust = 1, size = 20/.pt, margin = margin(0, 0, 0, 0)),
        legend.spacing.x = grid::unit(0, 'cm'),
        legend.title = element_text(size = 20/.pt, face = "bold"),
        legend.text = element_text(size = 16/.pt),
        legend.margin = margin(0, 0, 0, 0),
        legend.position = 'bottom',
        legend.box.margin = margin(-35, 0, 0, 0),
        plot.margin = margin(5, 0, 5, 0)) +
  labs(title = "All of Zion Williamson's shots at Duke",
       subtitle = "2018-19 college basketball season",
       color = "Outcome",
       caption = "Chart: @jacklich10 | Data: @gamezoneR")

You might notice that Zion seems to have very few shots at the rim compared to what you might expect. This is true! STATS LLC's API codes all layups and dunks at either (25, 5.25) or (26, 6.25), so many shots are plotted on top of one another. I have no idea why this is the case, but you might want to experiment with randomizing the locations of these shots when plotting shot charts.

Since gamezoneR version 0.1.1 now attempts to parse possession information from the play-by-play, we can also look at a team's offensive and defensive efficiency (by points per possession). Below is Duke's offensive and defensive efficiency by game from the 2018-19 college basketball season.

duke_pbp %>%
  dplyr::filter(!is.na(poss_before)) %>%
  dplyr::mutate(poss_number = as.numeric(poss_number),
                shot_made_numeric = dplyr::case_when(
                  is.na(shot_outcome) ~ NA_real_,
                  shot_outcome == "made" ~ 1,
                  shot_outcome == "missed" ~ 0),
                shot_value = dplyr::case_when(
                  is.na(shot_outcome) ~ NA_real_,
                  free_throw == 1 ~ 1,
                  three_pt == 1 ~ 3,
                  T ~ 2),
                points = dplyr::case_when(
                  shot_made_numeric == 0 ~ 0,
                  shot_made_numeric == 1 & free_throw == 1 ~ 1,
                  shot_made_numeric == 1 & three_pt == 1 ~ 3,
                  shot_made_numeric == 1 & three_pt == 0 & free_throw == 0 ~ 2)) %>%
  dplyr::group_by(date, game_id, poss_before, poss_number) %>%
  dplyr::summarise(fgm = sum(shot_outcome == "made" & free_throw == F, na.rm = T),
                   fga = sum(!is.na(shot_outcome) & free_throw == F),
                   ftm = sum(shot_outcome == "made" & free_throw == T),
                   fta = sum(!is.na(shot_outcome) & free_throw == T),
                   points = sum(points, na.rm = T),
                   .groups = "drop") %>%
  dplyr::group_by(date, game_id, poss_before) %>%
  dplyr::summarise(poss = dplyr::n(),
                   across(fgm:points, sum),
                   .groups = "drop") %>%
  dplyr::mutate(ppp = points/poss,
                type = ifelse(poss_before == "Duke", "Offense", "Defense"),
                color = ifelse(poss_before == "Duke", "black", "#003366"),
                fill = ifelse(poss_before == "Duke", "#001A57", "white")) %>%
  ggplot(aes(date, ppp, fill = fill, color = color)) +
  geom_line() +
  geom_point(aes(size = poss),
             pch = 21, stroke = 0.9) +
  scale_color_identity() +
  scale_fill_identity() +
  scale_size_continuous(range = c(0.8, 3.5)) +
  labs(title = "Duke's offensive and defensive efficiency by game",
       subtitle = "2018-19 college basketball season",
       x = "Date",
       y = "Points per possession",
       size = "# of possessions",
       caption = "Chart: @jacklich10 | Data: @gamezoneR")

Loading in all play-by-play data from the repository

Alternatively, you may want to load in all games at once from the remote repository. To do so, run the following lines of code:

future::plan("multisession")
progressr::with_progress({
  pbp <- gamezoneR::load_gamezone_pbp(c("2016-17", "2017-18", "2018-19", "2019-20", "2020-21"))
})

Below, we aggregate each team's offensive and defensive efficiency's (again by points per possession) by game.

pbp_summarized <- pbp %>%
  dplyr::distinct() %>% 
  dplyr::filter(!is.na(poss_before)) %>%
  dplyr::mutate(poss_number = as.numeric(poss_number),
                shot_made_numeric = dplyr::case_when(
                  is.na(shot_outcome) ~ NA_real_,
                  shot_outcome == "made" ~ 1,
                  shot_outcome == "missed" ~ 0),
                shot_value = dplyr::case_when(
                  is.na(shot_outcome) ~ NA_real_,
                  free_throw == 1 ~ 1,
                  three_pt == 1 ~ 3,
                  T ~ 2),
                points = dplyr::case_when(
                  shot_made_numeric == 0 ~ 0,
                  shot_made_numeric == 1 & free_throw == 1 ~ 1,
                  shot_made_numeric == 1 & three_pt == 1 ~ 3,
                  shot_made_numeric == 1 & three_pt == 0 & free_throw == 0 ~ 2)) %>%
  dplyr::group_by(season, date, game_id, poss_before, poss_number) %>%
  dplyr::summarise(fgm = sum(shot_outcome == "made" & free_throw == F, na.rm = T),
                   fga = sum(!is.na(shot_outcome) & free_throw == F),
                   ftm = sum(shot_outcome == "made" & free_throw == T),
                   fta = sum(!is.na(shot_outcome) & free_throw == T),
                   points = sum(points, na.rm = T),
                   .groups = "drop") %>%
  dplyr::group_by(season, date, game_id, poss_before) %>%
  dplyr::summarise(poss = dplyr::n(),
                   across(fgm:points, sum),
                   .groups = "drop") %>%
  dplyr::mutate(ppp = points/poss)

Let's take a look at the most proficient offenses in college basketball since 2016-17:

if (!requireNamespace('gt', quietly = TRUE)){
  install.packages('gt')
}
if (!requireNamespace('remotes', quietly = TRUE)){
  install.packages('remotes')
}
remotes::install_github("jthomasmock/espnscrapeR")
library(gt)
library(espnscrapeR)

pbp_summarized %>%
  dplyr::group_by(season, poss_before) %>%
  dplyr::summarise(games = dplyr::n(),
                   ppp = sum(ppp*poss)/sum(poss),
                   .groups = "drop") %>%
  dplyr::filter(games >= 10) %>%
  dplyr::arrange(dplyr::desc(ppp)) %>%
  head(15) %>%
  dplyr::left_join(gamezoneR::mbb_team_info %>%
                     dplyr::select(game_zone, team_logo_espn),
                   by = c("poss_before" = "game_zone")) %>%
  dplyr::select(season, team_logo_espn, dplyr::everything()) %>%
  gt::gt() %>%
  gt::tab_header(title = gt::md("**Best team offensive efficiencies**"),
                 subtitle = gt::md("*2016-17 through 2020-21 college basketball seasons*")) %>%
  gt::cols_label(team_logo_espn = "",
                 poss_before = "Team",
                 games = "Games",
                 ppp = "PPP") %>%
  gt::fmt_number(columns = dplyr::contains("ppp"),
                 decimals = 2) %>%
  gt::tab_style(style = list(
    gt::cell_borders(
      sides = "left",
      color = "black",
      weight = gt::px(3))),
    locations = list(
      gt::cells_body(
        columns = c(ppp)))) %>%
  gt::tab_style(style = list(
    gt::cell_borders(
      sides = "bottom",
      color = "black",
      weight = gt::px(3))),
    locations = list(
      gt::cells_column_labels(
        columns = gt::everything()))) %>%
  gt::data_color(columns = dplyr::ends_with("ppp"),
                 colors = scales::col_numeric(
                   palette = c("#ffffff", "#f2fbd2", "#c9ecb4", "#93d3ab", "#35b0ab"),
                   domain = NULL)) %>%
  gt::text_transform(locations = gt::cells_body(columns = c(team_logo_espn)),
                     fn = function(x) gt::web_image(url = x, height = 30)) %>%
  gt::tab_source_note("Table: @jacklich10 | Data: @gamezoneR") %>%
  espnscrapeR::gt_theme_538() %>%
  gt::cols_align(columns = c(poss_before),
                 align = "left") %>%
  gt::tab_options(data_row.padding = gt::px(2))


JackLich10/gamezoneR documentation built on April 24, 2023, 4:15 p.m.