knitr::opts_chunk$set(echo = TRUE) old <- options(rmarkdown.html_vignette.check_title = FALSE)
We will be acquiring data from gamezone.stats.com, using the gamezoneR
package, created by Jack Lichtenstein.
This tutorial will require the use of R and RStudio. You can follow the instructions at R Studio on how to get started.
library(dplyr) library(ggplot2) # set a ggplot2 theme theme_set( theme_bw() + theme(plot.title = element_text(face = "bold", size = 32/.pt, hjust = 0), plot.subtitle = element_text(face = "italic", size = 26/.pt), plot.caption = element_text(face = "italic", size = 20/.pt), strip.background = element_rect(color = "black", size = 3, linetype = "blank"), strip.text = element_text(face = "bold", size = 24/.pt, hjust = 0), panel.grid.minor = element_blank(), panel.border = element_blank(), axis.ticks = element_blank(), axis.text = element_text(size = 24/.pt), axis.title = element_text(face = "bold", size = 26/.pt), legend.title = element_text(face = "bold", size = 26/.pt), legend.text = element_text(size = 24/.pt)))
There are two functions to scrape schedules:
gamezoneR::gamezone_mbb_master_schedule()
gamezoneR::gamezone_mbb_team_schedule()
The gamezoneR::gamezone_mbb_master_schedule()
function has a parameter for ranked_games
, which is a logical argument specifying whether or not to scrape only the games on that date where at least one team is ranked in the AP top 25 poll. Turning this off will scrape all games from the inputted date, regardless of whether a team is ranked or not.
Please note that specifying ranked_games = T
increases the speed of the function because it utilizes the underlying API, while ranked_games = F
does not.
# first, the ranked games for a given date schedule <- gamezoneR::gamezone_mbb_master_schedule(date = "2021-03-10", ranked_games = T) head(schedule) # then, all the games for a given date schedule <- gamezoneR::gamezone_mbb_master_schedule(date = "2021-03-10", ranked_games = F) head(schedule)
We are able to scrape games by date, but also by team using the gamezoneR::get_team_schedule()
function.
duke_schedule <- gamezoneR::gamezone_mbb_team_schedule(team = "Duke", season = "2018-19") head(duke_schedule)
Now that we have a schedule to work with, we can use the gamezoneR::gamezone_mbb_pbp()
function to scrape play-by-play data for individual gameIds. The argument sub_parse
is a logical parameter specifying whether or not to attempt to parse substitution and lineup data. If TRUE, the play-by-play will be appended to include columns home_1
through away_5
, specifying the names of the 10 players on the floor.
Please note that GameZone does not track all substitutions. As a result, a new column, sub_error
will reside in the appended play-by-play data. I have not come across many games with no such errors.
pbp <- gamezoneR::gamezone_mbb_pbp(duke_schedule$game_id[1], sub_parse = T) pbp %>% dplyr::count(sub_error) head(pbp)
Now, let's scrape all of Duke's games from the Zion Williamson 2018-19 season using purrr::map_df()
.
duke_pbp <- purrr::map_df(duke_schedule$game_id, gamezoneR::gamezone_mbb_pbp, sub_parse = F)
Let's plot some shot charts. We will use the built in gamezoneR::base_court
ggplot court.
duke_shots <- duke_pbp %>% dplyr::filter(!is.na(loc_x), shooter == "Zion Williamson") gamezoneR::base_court + geom_point(data = duke_shots, aes(loc_x, loc_y, color = shot_outcome), alpha = 0.8) + theme(axis.line = element_blank(), axis.text= element_blank(), axis.ticks = element_blank(), axis.title = element_blank(), panel.background = element_blank(), panel.grid = element_blank(), plot.title = element_text(face = "bold", hjust = 0.5, size = 30/.pt, margin = margin(0, 0, 5, 0)), plot.subtitle = element_text(face = "italic", hjust = 0.5, size = 24/.pt), plot.caption = element_text(face = "italic", hjust = 1, size = 20/.pt, margin = margin(0, 0, 0, 0)), legend.spacing.x = grid::unit(0, 'cm'), legend.title = element_text(size = 20/.pt, face = "bold"), legend.text = element_text(size = 16/.pt), legend.margin = margin(0, 0, 0, 0), legend.position = 'bottom', legend.box.margin = margin(-35, 0, 0, 0), plot.margin = margin(5, 0, 5, 0)) + labs(title = "All of Zion Williamson's shots at Duke", subtitle = "2018-19 college basketball season", color = "Outcome", caption = "Chart: @jacklich10 | Data: @gamezoneR")
You might notice that Zion seems to have very few shots at the rim compared to what you might expect. This is true! STATS LLC's API codes all layups and dunks at either (25, 5.25) or (26, 6.25), so many shots are plotted on top of one another. I have no idea why this is the case, but you might want to experiment with randomizing the locations of these shots when plotting shot charts.
Since gamezoneR
version 0.1.1 now attempts to parse possession information from the play-by-play, we can also look at a team's offensive and defensive efficiency (by points per possession). Below is Duke's offensive and defensive efficiency by game from the 2018-19 college basketball season.
duke_pbp %>% dplyr::filter(!is.na(poss_before)) %>% dplyr::mutate(poss_number = as.numeric(poss_number), shot_made_numeric = dplyr::case_when( is.na(shot_outcome) ~ NA_real_, shot_outcome == "made" ~ 1, shot_outcome == "missed" ~ 0), shot_value = dplyr::case_when( is.na(shot_outcome) ~ NA_real_, free_throw == 1 ~ 1, three_pt == 1 ~ 3, T ~ 2), points = dplyr::case_when( shot_made_numeric == 0 ~ 0, shot_made_numeric == 1 & free_throw == 1 ~ 1, shot_made_numeric == 1 & three_pt == 1 ~ 3, shot_made_numeric == 1 & three_pt == 0 & free_throw == 0 ~ 2)) %>% dplyr::group_by(date, game_id, poss_before, poss_number) %>% dplyr::summarise(fgm = sum(shot_outcome == "made" & free_throw == F, na.rm = T), fga = sum(!is.na(shot_outcome) & free_throw == F), ftm = sum(shot_outcome == "made" & free_throw == T), fta = sum(!is.na(shot_outcome) & free_throw == T), points = sum(points, na.rm = T), .groups = "drop") %>% dplyr::group_by(date, game_id, poss_before) %>% dplyr::summarise(poss = dplyr::n(), across(fgm:points, sum), .groups = "drop") %>% dplyr::mutate(ppp = points/poss, type = ifelse(poss_before == "Duke", "Offense", "Defense"), color = ifelse(poss_before == "Duke", "black", "#003366"), fill = ifelse(poss_before == "Duke", "#001A57", "white")) %>% ggplot(aes(date, ppp, fill = fill, color = color)) + geom_line() + geom_point(aes(size = poss), pch = 21, stroke = 0.9) + scale_color_identity() + scale_fill_identity() + scale_size_continuous(range = c(0.8, 3.5)) + labs(title = "Duke's offensive and defensive efficiency by game", subtitle = "2018-19 college basketball season", x = "Date", y = "Points per possession", size = "# of possessions", caption = "Chart: @jacklich10 | Data: @gamezoneR")
Alternatively, you may want to load in all games at once from the remote repository. To do so, run the following lines of code:
future::plan("multisession") progressr::with_progress({ pbp <- gamezoneR::load_gamezone_pbp(c("2016-17", "2017-18", "2018-19", "2019-20", "2020-21")) })
Below, we aggregate each team's offensive and defensive efficiency's (again by points per possession) by game.
pbp_summarized <- pbp %>% dplyr::distinct() %>% dplyr::filter(!is.na(poss_before)) %>% dplyr::mutate(poss_number = as.numeric(poss_number), shot_made_numeric = dplyr::case_when( is.na(shot_outcome) ~ NA_real_, shot_outcome == "made" ~ 1, shot_outcome == "missed" ~ 0), shot_value = dplyr::case_when( is.na(shot_outcome) ~ NA_real_, free_throw == 1 ~ 1, three_pt == 1 ~ 3, T ~ 2), points = dplyr::case_when( shot_made_numeric == 0 ~ 0, shot_made_numeric == 1 & free_throw == 1 ~ 1, shot_made_numeric == 1 & three_pt == 1 ~ 3, shot_made_numeric == 1 & three_pt == 0 & free_throw == 0 ~ 2)) %>% dplyr::group_by(season, date, game_id, poss_before, poss_number) %>% dplyr::summarise(fgm = sum(shot_outcome == "made" & free_throw == F, na.rm = T), fga = sum(!is.na(shot_outcome) & free_throw == F), ftm = sum(shot_outcome == "made" & free_throw == T), fta = sum(!is.na(shot_outcome) & free_throw == T), points = sum(points, na.rm = T), .groups = "drop") %>% dplyr::group_by(season, date, game_id, poss_before) %>% dplyr::summarise(poss = dplyr::n(), across(fgm:points, sum), .groups = "drop") %>% dplyr::mutate(ppp = points/poss)
Let's take a look at the most proficient offenses in college basketball since 2016-17:
if (!requireNamespace('gt', quietly = TRUE)){ install.packages('gt') } if (!requireNamespace('remotes', quietly = TRUE)){ install.packages('remotes') } remotes::install_github("jthomasmock/espnscrapeR")
library(gt) library(espnscrapeR) pbp_summarized %>% dplyr::group_by(season, poss_before) %>% dplyr::summarise(games = dplyr::n(), ppp = sum(ppp*poss)/sum(poss), .groups = "drop") %>% dplyr::filter(games >= 10) %>% dplyr::arrange(dplyr::desc(ppp)) %>% head(15) %>% dplyr::left_join(gamezoneR::mbb_team_info %>% dplyr::select(game_zone, team_logo_espn), by = c("poss_before" = "game_zone")) %>% dplyr::select(season, team_logo_espn, dplyr::everything()) %>% gt::gt() %>% gt::tab_header(title = gt::md("**Best team offensive efficiencies**"), subtitle = gt::md("*2016-17 through 2020-21 college basketball seasons*")) %>% gt::cols_label(team_logo_espn = "", poss_before = "Team", games = "Games", ppp = "PPP") %>% gt::fmt_number(columns = dplyr::contains("ppp"), decimals = 2) %>% gt::tab_style(style = list( gt::cell_borders( sides = "left", color = "black", weight = gt::px(3))), locations = list( gt::cells_body( columns = c(ppp)))) %>% gt::tab_style(style = list( gt::cell_borders( sides = "bottom", color = "black", weight = gt::px(3))), locations = list( gt::cells_column_labels( columns = gt::everything()))) %>% gt::data_color(columns = dplyr::ends_with("ppp"), colors = scales::col_numeric( palette = c("#ffffff", "#f2fbd2", "#c9ecb4", "#93d3ab", "#35b0ab"), domain = NULL)) %>% gt::text_transform(locations = gt::cells_body(columns = c(team_logo_espn)), fn = function(x) gt::web_image(url = x, height = 30)) %>% gt::tab_source_note("Table: @jacklich10 | Data: @gamezoneR") %>% espnscrapeR::gt_theme_538() %>% gt::cols_align(columns = c(poss_before), align = "left") %>% gt::tab_options(data_row.padding = gt::px(2))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.