#' Individual Game Play-By-Play Scraper
#'
#' This function retrieves and cleans play-by-play information for an individual game.
#' Warns users of potential errors and mistakes made by the game trackers. The number of player discrepancies warning
#' counts displays the number of events players committed when it is found they were not on the court at the time of the
#' event. The substitution mistake warning indicates an unclean substitution was entered. (ex. 2 players enter and 1 leaves)
#' @param game_id string made up of digits given to each unique game. This can be found in the play-by-play url for each game.
#' @param use_file Boolean. If true, retrieve from local storage rather than url. File path constructed from `base_path` directory
#' @param save_file Boolean. If true, save html for game to local file. File path constructed from `base_path`
#' @param base_path String. Specify base directory of html file save, ex. "/Users/jake/html_files/"
#' @param overwrite Boolean. If true, save file will overwrite an existing file at same path. Otherwise will read from existing file (if use_file=T)
#' @import dplyr
#' @importFrom XML readHTMLTable
#' @export
#' @return data frame containing play-by-play data for a game, where each row represents an individual event from the game.
#' \itemize{
#' \item{ID} - Numeric game id that is given for each unique game
#' \item{Date} - Game date
#' \item{Home} - Home team name
#' \item{Away} - Away team name
#' \item{Time} - String for game time in format reported originally by NCAA
#' \item{Game_Time} - String reporting game time elapsed. For example, a normal game starts at 00:00 and ends at 40:00
#' \item{Game_Seconds} - Number of seconds elapsed in the game
#' \item{Half_Status} - Number indicating which half: 1,2 for regulation, 3+ for OTs
#' \item{Home_Score} - Score for home team after event occurred as reported by NCAA
#' \item{Away_Score} - Score for away team after event occurred as reported by NCAA
#' \item{Event_Team} - Which team was repsonsibile for the play-by-play entry
#' \item{Event_Description} - The text description of the event on the NCAA site
#' \item{Player_1} - The primary player responsible for the event
#' \item{Player_2} - The secondary player within an event. As of now this is only the assister on a made shot.
#' \item{Event_Type} - String representing the event that occurred as reported by the description
#' \item{Event_Result} - Takes values of made/missed for shot attempts, otherwise NA
#' \item{Shot_Value} - Numeric value of points awarded by shot type. Ranges from 1-3 for shot attempts, otherwise NA
#' \item{Event_Length} - Estimate of time before events calculated from event time - last previous event time
#' \item{Home.1} - One of the players on the court for the home team
#' \item{Home.2} - One of the players on the court for the home team
#' \item{Home.3} - One of the players on the court for the home team
#' \item{Home.4} - One of the players on the court for the home team
#' \item{Home.5} - One of the players on the court for the home team
#' \item{Away.1} - One of the players on the court for the away team
#' \item{Away.2} - One of the players on the court for the away team
#' \item{Away.3} - One of the players on the court for the away team
#' \item{Away.4} - One of the players on the court for the away team
#' \item{Away.5} - One of the players on the court for the away team
#' \item{Status} - This reports the cleanliness of the data frame. Will be CLEAN if no errors are found. Otherwise will say
#' the number of errors that occurred or a potential substitution mistake occurred.
#' }
#' @examples
#' scrape_game(4674164)
scrape_game <- function(game_id, save_file=F, use_file=F, base_path = NA, overwrite=F) {
#track status of cleanliness of data for game
status <- "CLEAN"
base_url <- "https://stats.ncaa.org/game/play_by_play/"
url_text <- paste0(base_url, game_id)
# new
base_url <- "https://stats.ncaa.org/contests/game_id/play_by_play/"
url_text <- glue::glue("https://stats.ncaa.org/contests/{game_id}/play_by_play/")
file_dir <- paste0(base_path, "play_by_play/")
file_path <- paste0(file_dir, game_id, ".html")
isUrlRead <- F
# Give user option to save raw html file (to make future processing more efficient)
if (save_file & !is.na(base_path) & (!file.exists(file_path) | overwrite)) {
isUrlRead <- T
file_url <- url(url_text, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
dir.create(file_dir, recursive = T, showWarnings = F)
writeLines(html, file_path)
} else if (file.exists(file_path) & use_file) {
html <- readLines(file_path, warn=F)
} else {
isUrlRead <- T
file_url <- url(url_text, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
}
table <- XML::readHTMLTable(html)
if (length(table) == 0) {
message("Game Not Found")
return(data.frame())
}
# Pull scores for each half
half_scores <- table[[2]][1:2,]
# Get the data frames for the regulation portion
first_half <- table[[4]] %>%
dplyr::mutate_if(is.factor, as.character) %>%
dplyr::mutate(Half_Status = 1)
second_half <- table[[5]] %>%
dplyr::mutate_if(is.factor, as.character) %>%
dplyr::mutate(Half_Status = 2)
game <- dplyr::bind_rows(first_half, second_half)
# Check if overtime period(s) exist
if (ncol(half_scores) == 4) {
numbOTs <- 0
} else{
# Iterate through overtimes and add to game data frame
numbOTs <- length(half_scores) - 4
for (i in 1:numbOTs) {
ot_data <- table[[5 + i]] %>%
dplyr::mutate_if(is.factor, as.character) %>%
dplyr::mutate(Half_Status = 2 + i)
game <- dplyr::bind_rows(game, ot_data)
}
}
# Essentially, there are two different codes/systems used by the NCAA to track games
# This makes the content completely different and needs to be adjusted for
# 'V2' is dubbed version 2, as it has more detail and began usage more recently
# Can find the format by looking at the first entries as they are constant and unique to each version
# As V1 is older and uses less detail, we will format the data in accordance with V1
format <-
if (((first_half[1, 1] == "20:00:00") &
(first_half[1, 2] == "game start" |
first_half[1, 2] == "period start"|
first_half[1,2] == "jumpball startperiod")) |
any(grepl("commercial",game[,2]))|
any(grepl("Technical",first_half[1,]))) {
"V2"
} else{
"V1"
}
# Removes unneccesary extraneous rows that may arise
game <- dplyr::filter(game, !is.na(Score))
# Get the game metadata
meta <- table[[2]]
# Get Game Date- removing start time because I don't really see a use in play by play
datetime <- meta[3,1]
date <- substr(datetime, 1, 10)
date <- ifelse(is.null(date),"",date)
# Get the teams playing
away_team <- colnames(game)[2]
home_team <- colnames(game)[4]
# Get game time- formatted as 20:00 counting down to 0 for each half
time <- substr(game[, 1], 1, 5)
# Convert game time to seconds- goes from 0 at start to 2400+ at end of game
time_in_seconds <-
unlist(lapply(strsplit(time, ":"), function(x) {
if (nchar(x[1]) == 2) {
(as.numeric(x[1]) * 60 + as.numeric(x[2]))
} else{
NA
}
}))
game_time <- ifelse(game[, 5] < 2,
1200 + 1200 * (game[, 5] - 1),
2400 + 300 * (game[, 5] - 2)) - time_in_seconds
# Differently formatted time that goes from 0 at game start to 40:00+ at end of game
mins <- game_time %/% 60
mins <- ifelse(mins < 10, paste0("0", mins), as.character(mins))
secs <- game_time %% 60
secs <- ifelse(secs < 10, paste0("0", secs), as.character(secs))
game_display <- paste0(mins, ":", secs)
# Separates score column into a separated home and away score
scores <- strsplit(game[, 3], "-")
home_score <- unlist(lapply(scores, function(x) {
x[2]
}))
away_score <- unlist(lapply(scores, function(x) {
x[1]
}))
# Data begins formatted with away team events in one column and home team events in another
# With the formatting in use, there can never be a home and away event at the same time
# This allows the home and away events to be merged into an events column
player1_h <- game[, 4]
player1_a <- game[, 2]
player1 <- player1_h
player1[which(player1 == "" | is.na(player1))] <- player1_a[which(player1_a != "")]
events <- player1
#Pulls the event team by looking at which entry side it comes from
event_team <- ifelse(player1_h == player1, home_team, away_team)
# Handle PBP Version ####
# Here is where the data is converted to a standard code
if (format == "V2") {
# Version 2 Cleaning
# Uses: player name, event
event_split <- strsplit(events, ",")
event_split <- lapply(event_split, function(x){
if(length(x) == 3){
temp <- x
temp[1] <- paste0(temp[1], temp[2])
temp[2] <- temp[3]
temp <- temp[1:2]
return(temp)
} else{
return(x)
}
})
# This pulls player names from left of comma
players <-
unlist(lapply(event_split, function(x) {
x[1]
}))
# Converts to formatting of names used in V1 which is FIRST.LAST
players <- gsub("[^[:alnum:] ]", "", players)
player_name <- gsub("\\s+", ".", toupper(players))
# Remove any notation of JR/SR/II/III from player name
player_name <- gsub("(\\.JR\\.|\\.SR\\.|\\.J\\.R\\.|\\.JR\\.|JR\\.|SR\\.|\\.SR|\\.JR|\\.SR|\\.III|\\.II|\\.IV)$","", player_name)
player_name <- trimws(player_name)
# Now getting events from left of comma
events <-
unlist(lapply(event_split, function(x) {
x[2]
}))
# Call function that converts the formatting of events from V2 to V1
events <- convert_events(events)
} else {
# Version 1 Cleaning
# Follows format LAST,FIRST event_description
event_split <- strsplit(events, " ")
# Find the all uppercase words to get player names
players <-
unlist(lapply(event_split, function(x) {
paste(x[which(x == toupper(x) | x == "Team,")], collapse = " ")
}))
# Take all other words as a component of the event
events <-
unlist(lapply(event_split, function(x) {
paste(x[which(x != toupper(x) & x != "Team,")], collapse = " ")
}))
# Clean the player names into a proper format
clean_players <- strsplit(players, ",")
first <- unlist(lapply(clean_players, function(x) {
x[2]
}))
last <- unlist(lapply(clean_players, function(x) {
x[1]
}))
first <- ifelse(is.na(first), "", first)
first <- gsub("[^[:alnum:] ]", "", first)
last <- gsub("[^[:alnum:] ]", "", last)
player_name <- paste0(first, ".", last)
player_name <- ifelse(substr(player_name, 1, 2) == ".T", "TEAM", player_name)
player_name <- gsub("\\s+", ".", player_name)
player_name <- gsub("(\\.JR\\.|\\.SR\\.|\\.J\\.R\\.|\\.JR\\.|JR\\.|SR\\.|\\.SR|\\.JR|\\.SR|\\.III|\\.II|\\.IV)$","", player_name)
}
# Now format controls for version
# Separate event into first word and rest of word
first_word <-
unlist(lapply(strsplit(events, " "), function(x) {
x[1]
}))
remaining <- unlist(lapply(strsplit(events, " "),
function(x) {
paste(x[2:length(x)], collapse = " ")
}))
# Shots are preceded by "made" or "missed", this strips that from the event type
event_type <-
ifelse(first_word %in% c("made", "missed"), remaining, events)
# This pulls the event result only in the case of shots
event_result <-
ifelse(first_word %in% c("made", "missed"),
first_word,
NA_character_)
# Now put together created variables into first data frame
dirty_game <- data.frame(
ID = game_id,
Date = date,
Home = home_team,
Away = away_team,
Time = time,
Game_Time = game_display,
Game_Seconds = game_time,
Half_Status = game$Half_Status,
Home_Score = home_score,
Away_Score = away_score,
Event_Team = event_team,
Player_1 = player_name,
Event_Type = event_type,
Event_Result = event_result,
Event_Description = player1,
stringsAsFactors = F
) %>%
dplyr::mutate(
Event_Priority = case_when(
Event_Type == "won Jumpball" ~ 1,
Event_Type == "lost Jumpball" ~ 2,
Event_Type == "Offensive Rebound" ~ 3,
Event_Type %in% c(
"Three Point Jumper",
"Two Point Jumper",
"Layup",
"Hook",
"Dunk",
"Tip In"
) ~ 4,
Event_Type == "Assist" ~ 5, # Order assist directly after shot so rows can be merged
Event_Type == "Turnover" ~ 6,
Event_Type == "Steal" ~ 7,
!Event_Type %in% c("Enters Game", "Leaves Game") ~ 6,
T ~ 7
),
Home_Score = as.numeric(Home_Score),
Away_Score = as.numeric(Away_Score)
) %>%
# Function is in use as scorekeepers often don't follow same ordering
# This formats as Row 1: Shot, Row 2: Assist, Next: Substitutions, Final: Everything Else
# dplyr::group_by(Game_Seconds, Home_Score, Away_Score) %>%
dplyr::arrange(Half_Status, Game_Seconds, Home_Score, Away_Score, Event_Priority) %>%
# dplyr::do(order_seconds(.)) %>%
# dplyr::ungroup() %>%
# Data formats assists independently of shots in following row
# This makes it easier to use data but combining shot+assist into one row with a player_2 column for the assist player
dplyr::mutate(
Player_2 = ifelse(lead(Event_Type) == "Assist", lead(Player_1), NA_character_),
Event_Description = ifelse(
lead(Event_Type) == "Assist" &
!is.na(lead(Event_Type)),
paste(Event_Description, "-", lead(Event_Description)),
Event_Description
)
) %>%
dplyr::filter(Event_Type != "Assist") %>%
dplyr::select(-Event_Priority) %>%
dplyr::mutate(
Poss_Num = NA,
Poss_Team = NA,
Event_Length = Game_Seconds - dplyr::lag(Game_Seconds),
Event_Length = ifelse(is.na(Event_Length), Game_Seconds, Event_Length),
Shot_Value = dplyr::case_when(
Event_Type == "Two Point Jumper" ~ 2,
Event_Type == "Layup" ~ 2,
Event_Type == "Three Point Jumper" ~ 3,
Event_Type == "Dunk" ~ 2,
Event_Type == "Free Throw" ~ 1,
Event_Type == "Tip In" ~ 2,
Event_Type == "Hook" ~ 2
)
)
# Calculating Possessions ####
poss_num <- 0
nrows <- 0
for(i in 1:max(dirty_game$Half_Status)) {
half_data <- dplyr::filter(dirty_game, Half_Status == i)
poss_team <- dplyr::first(half_data$Event_Team[!half_data$Event_Type %in% c("Leaves Game", "Enters Game")])
other_team <- ifelse(poss_team == home_team, away_team, home_team)
poss_switch = F
poss_num <- poss_num + 1
for(j in 1:nrow(half_data)) {
# If row has event that ends possession, increment num and swap team
team <- half_data$Event_Team[j]
type <- half_data$Event_Type[j]
result <- half_data$Event_Result[j]
seconds <- half_data$Game_Seconds[j]
# Using max(j-1, 1) to handle first entry
swap <- poss_switch & seconds != half_data$Game_Seconds[max(j-1, 1)]
if(swap) {
poss_num <- poss_num + 1
tmp <- poss_team
poss_team <- other_team
other_team <- tmp
poss_switch <- F
}
# force switch possession if event is attributed to wrong possession team, if parse is false this puts it back to correct
# revisit - might be a more efficient way
if (!is.na(result) & team != poss_team | type == "Turnover" & poss_team != team) {
poss_num <- poss_num + 1 - swap*1
tmp <- poss_team
poss_team <- other_team
other_team <- tmp
poss_switch <- F
}
and_one <- any(half_data$Event_Type[half_data$Game_Seconds == seconds] == "Free Throw") # Detect and-one to not switch possession on the made shot
next_reb <-half_data$Event_Type[j+1] %in% c("Defensive Rebound", "Free Throw") # Catch final free throw misses -- note free throw sequences are occassionally out of order in pbp
if(
type %in% c("Defensive Rebound", "Turnover") |
(type %in% c("Two Point Jumper", "Three Point Jumper", "Layup", "Dunk", "Tip In", "Hook") & result == "made" & !and_one) |
(type == "Free Throw" & result == "made" & !next_reb)
) {
# Free throw made + current or next time does not include a defensive rebound
poss_switch = T
}
dirty_game$Poss_Num[j+nrows] <- poss_num
dirty_game$Poss_Team[j+nrows] <- poss_team
}
nrows <- nrows + nrow(half_data)
}
pos_start <- dirty_game %>%
dplyr::group_by(Poss_Num) %>%
dplyr::mutate(
Terminal = last(Event_Type)
) %>%
dplyr::summarise(End = any(Terminal %in% c("Two Point Jumper", "Three Point Jumper", "Free Throw",
"Dunk", "Layup", "Hook", "Tip In", "Steal", "Defensive Rebound"
))*1, .groups = "keep") %>%
dplyr::ungroup() %>%
dplyr::mutate(Valid = dplyr::lag(End, default =0)) %>%
dplyr::select(Poss_Num, Valid)
dirty_game <- dirty_game %>%
dplyr::left_join(pos_start, by = "Poss_Num") %>%
dplyr::group_by(Poss_Num) %>%
dplyr::mutate(
Poss_Length = cumsum(Event_Length),
isTransition = (first(Poss_Length) <= 10 & (first(Event_Type) %in% c(
"Steal", "Dunk", "Layup", "Hook", "Tip In",
"Two Point Jumper", "Three Point Jumper") |
first(Event_Type) == "Draws Foul" & Event_Team == Poss_Team |
first(Event_Type) == "Commits Foul" & Event_Team != Poss_Team) & Valid)
) %>%
dplyr::select(-Valid) %>%
dplyr::ungroup()
# Test case for if shots are attributed correctly
test_poss <- dirty_game %>% group_by(Event_Team, Poss_Team, Half_Status) %>% summarise(Pts = sum(!is.na(Event_Result), na.rm=T), .groups = "keep") %>% arrange(Half_Status)
if(sum(test_poss$Pts>0) != (2+numbOTs)*2) {
message("Warning: Possession Parsing Has Errors")
}
# Detect games with invalid substitutions - rather than parsing clearly flawed data, return with no sub format
player_subs <- dirty_game %>%
filter(Event_Type %in% c("Enters Game", "Exits Game")) %>%
.$Player_1
# 1. If a number is used for a substitution
# 2. If TEAM is used
# 3. If a team name is used
invalid_sub <- any(grepl(paste0("\\.[0-9]+|\\.TEAM|",toupper(home_team),"|", toupper(away_team)), player_subs))
# Now Check to See if Players Were Recorded in the Game
if (length(unique(dirty_game$Player_1)) == 1 | invalid_sub | length(player_subs) == 0) {
# No Player Cleaning
# Found no player names in data
# Does final cleaning of data without finding on off
# Gets the length of each event and assigns a shot value
# Creates the final dataframe to be used without players
clean_game <- dirty_game %>%
dplyr::mutate(
Status = "NO_PLAYER", #set status variable mentioned earlier
Sub_Deviate = nrow(.)
) %>%
bind_cols(as.data.frame(matrix(rep(NA, nrow(dirty_game)*10),
ncol = 10,
nrow = nrow(dirty_game))) %>%
rename(Home.1 = V1, Home.2 = V2, Home.3 = V3, Home.4 = V4, Home.5 = V5,
Away.1 = V6, Away.2 = V7, Away.3 = V8, Away.4 = V9, Away.5 = V10
)) %>%
dplyr::select(
ID:Away,
Half_Status,
Time:Event_Team,
Event_Description,
Player_1,
Player_2,
Event_Type,
Event_Result,
Shot_Value,
Event_Length,
Poss_Num,
Poss_Team,
Poss_Length,
Home.1:Away.5,
Status,
Sub_Deviate
)
# Report results
# Include game info, that pbp was missing players, and which format was used
message(paste(
date,
home_team,
"v",
away_team,
"| Status: No Substitution Data",
format,
"|",
game_id
))
return(clean_game)
} else {
# Search for extraneous error substitutions
# All substitutions events must have two concurring rows, an entry and an exit.
# Find errors when an odd number of substitutions occur together
mins_errors <- dirty_game %>%
dplyr::filter(Event_Type %in% c("Leaves Game", "Enters Game"),
Game_Seconds != 1200) %>%
dplyr::group_by(Game_Seconds) %>%
dplyr::summarise(count = dplyr::n(), .groups = "keep") %>%
dplyr::ungroup() %>%
dplyr::filter(count %% 2 != 0)
# Report substitition mistake messages to users
# Possibly find a way to deal with this but for now just reporting
if (nrow(mins_errors) > 0) {
# message(
# paste(
# "Potential Substitution Entry Mistakes within: Half",
# # FIX THIS TO JUST SHOW HALF NOT SPECIFIC TIME
# paste(unique(mins_errors$Game_Seconds %/% 1200)+1, collapse = ", ")
# )
# )
# Changes the status variable to note a sub mistake was made
status <- "SUB_MISTAKE"
}
# On Court ####
# Create empty matrix variables to store on court
home_player_matrix <- NA
away_player_matrix <- NA
# Since starters aren't identified for each half, need to go through process to find them
for (i in 1:(2 + numbOTs)) {
#Iterates through the number of game session
half_data <- dplyr::filter(dirty_game, Half_Status == i)
#Get vectors for players leaving and entering game for both teams
home_leaving <-
dplyr::filter(half_data,
Event_Team == Home,
Event_Type == "Leaves Game",
# Player_1 != "TEAM",
Time != "00:00",
(Time != "20:00" & Half_Status %in% 1:2) | (Time != "05:00" & Half_Status >2)
)$Player_1
home_entering <-
dplyr::filter(half_data,
Event_Team == Home,
Event_Type == "Enters Game",
# Player_1 != "TEAM",
Time != "00:00",
(Time != "20:00" & Half_Status %in% 1:2) | (Time != "05:00" & Half_Status >2)
)$Player_1
away_leaving <-
dplyr::filter(half_data,
Event_Team == Away,
Event_Type == "Leaves Game",
# Player_1 != "TEAM",
Time != "00:00",
(Time != "20:00" & Half_Status %in% 1:2) | (Time != "05:00" & Half_Status >2)
)$Player_1
away_entering <-
dplyr::filter(half_data,
Event_Team == Away,
Event_Type == "Enters Game",
# Player_1 != "TEAM",
(Time != "20:00" & Half_Status %in% 1:2) | (Time != "05:00" & Half_Status >2)
)$Player_1
# Find players explicitly defined as starting
true_home_starters <- (half_data %>%
dplyr::filter(Home == Event_Team,
(Time == "20:00" & Half_Status %in% 1:2) | (Time == "05:00" & Half_Status >2),
Time != "00:00",
Event_Type == "Enters Game"
))$Player_1
# Figure out starters by finding players that have a "Leaves Game" entry before an "Enters Game" entry
if (length(home_leaving) > 0) {
home_starters <- c()
for (j in 1:length(home_leaving)) {
if (!home_leaving[j] %in% home_entering[1:(j - 1)] &
!home_leaving[j] %in% home_leaving[1:j - 1]) {
home_starters <- c(home_starters, home_leaving[j])
}
}
}
# Find players explicitly defined as beginning from bench
true_home_nonstarters <- (half_data %>%
dplyr::filter(Home == Event_Team,
(Time == "20:00" & Half_Status %in% 1:2) | (Time == "05:00" & Half_Status >2),
Event_Type == "Leaves Game"
))$Player_1
# Ignore if a player subs on for themselves at the beginning of a half
temp_swap <- true_home_starters
true_home_starters <- true_home_starters[which(!true_home_starters %in% true_home_nonstarters)]
true_home_nonstarters <- true_home_nonstarters[which(!true_home_nonstarters %in% temp_swap)]
# Remove defined non-starters from starters
home_starters <- home_starters[which(!home_starters %in% true_home_nonstarters)]
true_home_starters <- true_home_starters[which(!true_home_starters %in% home_starters)]
home_starters <- c(home_starters, true_home_starters)
# To handle data entry errors, often 5 starters cannot be found using proper method above
home_starters <- if (length(home_starters) < 5) {
# First remove any extraneous Player.1 entries dealing with team events
# I think I've narrowed it down so only "TEAM" can show up in certain cases, but couldn't see a need to remove the others just in case
home_split <-
dplyr::filter(
half_data,
Event_Team == Home,!Player_1 %in% c(
"TEAM.TEAM",
"TEAM.TEAM 30",
" TEAM.TEAM",
"TEAM",
"TEAM.TEAM ",
"TEAM.TEAM 20"
)
)
# Handle case if a player was subbed out for themselves and started the game
# Has actually happened on several occassions
error_catch <- c()
for (time in unique(home_split$Game_Seconds)) {
# Idk just ignoring late subs when game gets weird
temp <- dplyr::filter(home_split, Game_Seconds == time, Game_Seconds < max(Game_Seconds)-60)
ons <-
temp$Player_1[which(temp$Event_Type == "Enters Game")]
offs <-
temp$Player_1[which(temp$Event_Type == "Leaves Game")]
error_catch <- c(error_catch, ons[ons %in% offs])
}
error_catch <- error_catch[!error_catch %in% home_starters]
# Looks for players that registered events but never subbed in/out, this implies they are a starter
non_subs <-
unique(home_split[which(!home_split$Player_1 %in% c(home_leaving, home_entering,home_starters,true_home_nonstarters)),]$Player_1)
# See if player recorded an event before ever subbing out of the game
play_before_sub <- home_split %>%
group_by(Player_1) %>%
filter(first(Game_Seconds) < first(.$Game_Seconds[which(.$Event_Type == "Leaves Game")])) %>%
distinct(Player_1) %>%
filter(!Player_1 %in% home_starters) %>%
unlist(., use.names = F)
all_starters <- unique(c(home_starters, non_subs, play_before_sub, error_catch))
# all_starters <- all_starters[which(!all_starters %in% home_bench)]
# If these methods find more than five starters, just chooses the first five found until a better way is suggested
# Warn user that this is being used
if (length(all_starters) > 5) {
# message(
# paste(
# "Using approximate starter finder, choosing:\n",
# paste(all_starters[1:5], collapse = ", "),
# "\nfrom: ",
# paste(all_starters, collapse = ", ")
# )
# )
all_starters[1:5]
# If 5, checks have successfully found 5 starters
} else if(length(all_starters) == 5){
all_starters[1:5]
# Handle case when less than 5 starters are found even after error checks
} else {
# Just takes first n players that have recorded an event in the half
# all_found <- unique(c(home_starters, play_before_sub, non_subs, error_catch))
all_half_players <- half_data %>%
filter(Event_Team == Home,
!Event_Type %in% c("Enters Game", "Leaves Game"),
Player_1 != "TEAM",
!Player_1 %in% all_starters) %>%
.$Player_1 %>%
unlist() %>%
unique() %>%
.[1:(5-length(all_starters))]
# If able to find 5 players, return them as starter and warn user
if(length(c(all_starters, all_half_players)) == 5) {
message("Warning In Substitution Data - Not Enough Starters Found. Using Estimate")
c(all_starters, all_half_players)
} else {
all_starters[1:5]
}
}
} else {
home_starters[1:5]
}
# Attempting to guess on who is on the court when a player plays the entire half and doesn't register a stat
# Best guess I could think of was look at the last player to record a stat in prior halfs
if(any(is.na(home_starters))){
numb.players <- sum(is.na(home_starters))
half_using <- if(i ==1){2:(numbOTs+1)} else {1:i}
prior_half <- filter(dirty_game,
Half_Status %in% half_using,
Event_Team == Home,
!Player_1 %in% c(home_starters,home_enter_players, home_leaving,"TEAM"),
!Event_Type %in% c("Enters Game"))
players <- if(i == 1){
unique(prior_half$Player_1, fromLast = T)[1:numb.players]
} else {
rev(unique(prior_half$Player_1, fromLast = T))[1:numb.players]
}
home_starters[is.na(home_starters)] <- players
message(paste("5 starters not found for half",i, "choosing",players, collapse = "/"))
}
# Repeated process is done for the away team, refer to comments above
true_away_starters <- (half_data %>%
dplyr::filter(Away == Event_Team,
(Time == "20:00" & Half_Status %in% 1:2) | (Time == "05:00" & Half_Status >2),
Time != "00:00",
Event_Type == "Enters Game"
))$Player_1
if (length(away_leaving) > 0) {
away_starters <- c()
# away_bench <- c()
for (j in 1:length(away_leaving)) {
if (!away_leaving[j] %in% away_entering[1:(j - 1)]) {
away_starters <- c(away_starters, away_leaving[j])
# away_bench <- c(away_bench, away_entering[j])
}
}
}
true_away_nonstarters <- (half_data %>%
dplyr::filter(Away == Event_Team,
(Time == "20:00" & Half_Status %in% 1:2) | (Time == "05:00" & Half_Status >2),
Event_Type == "Leaves Game"
))$Player_1
temp_swap <- true_away_starters
true_away_starters <- true_away_starters[which(!true_away_starters %in% true_away_nonstarters)]
true_away_nonstarters <- true_away_nonstarters[which(!true_away_nonstarters %in% temp_swap)]
away_starters <- away_starters[which(!away_starters %in% true_away_nonstarters)]
true_away_starters <- true_away_starters[which(!true_away_starters %in% away_starters)]
away_starters <- c(away_starters, true_away_starters)
away_starters <- if (length(away_starters) < 5) {
away_split <-
dplyr::filter(
half_data,
Event_Team == Away,!Player_1 %in% c(
"TEAM.TEAM",
"TEAM.TEAM 30",
" TEAM.TEAM",
"TEAM",
"TEAM.TEAM ",
"TEAM.TEAM 20"
)
)
error_catch <- c()
for (time in unique(away_split$Game_Seconds)) {
temp <- dplyr::filter(away_split, Game_Seconds == time, Game_Seconds < max(Game_Seconds)-60)
ons <-
temp$Player_1[which(temp$Event_Type == "Enters Game")]
offs <-
temp$Player_1[which(temp$Event_Type == "Leaves Game")]
error_catch <- c(error_catch, ons[ons %in% offs])
}
error_catch <- error_catch[!error_catch %in% away_starters]
non_subs <-
unique(away_split[which(!away_split$Player_1 %in% c(away_leaving, away_entering,true_away_nonstarters,away_starters)),]$Player_1)
# See if player recorded an event before subbing out at start
play_before_sub <- away_split %>%
group_by(Player_1) %>%
filter(first(Game_Seconds) < first(.$Game_Seconds[which(.$Event_Type == "Leaves Game")])) %>%
distinct(Player_1) %>%
filter(!Player_1 %in% away_starters) %>%
unlist(., use.names = F)
all_starters <- unique(c(away_starters, non_subs, play_before_sub, error_catch))
# all_starters <- all_starters[which(!all_starters %in% away_bench)]
if (length(all_starters) > 5) {
# message(
# paste(
# "Using approximate starter finder, choosing:\n",
# paste(all_starters[1:5], collapse = ", "),
# "\nfrom: ",
# paste(all_starters, collapse = ", ")
# )
# )
all_starters[1:5]
} else if(length(all_starters) == 5){
all_starters
} else {
# all_found <- unique(c(away_starters, play_before_sub, non_subs, error_catch))
# Just takes first n players that have recorded an event in the half
all_half_players <- half_data %>%
filter(Event_Team == Away,
!Event_Type %in% c("Enters Game", "Leaves Game"),
Player_1 != "TEAM",
!Player_1 %in% all_starters) %>%
.$Player_1 %>%
unlist() %>%
unique() %>%
.[1:(5-length(all_starters))]
# If able to find 5 players, return them as starter and warn user
if(length(c(all_starters, all_half_players)) == 5) {
message("Warning In Substitution Data - Not Enough Starters Found. Using Estimate")
c(all_starters, all_half_players)
} else {
all_starters[1:5]
}
}
} else {
away_starters[1:5]
}
if(any(is.na(away_starters))){
numb.players <- sum(is.na(away_starters))
half_using <- if(i ==1){2:(numbOTs+1)} else {1:i}
prior_half <- filter(dirty_game,
Half_Status %in% half_using,
Event_Team == Away,
!Player_1 %in% c(away_starters,away_entering,away_leaving,"TEAM"),
!Event_Type %in% c("Enters Game"))
players <- if(i == 1){
unique(prior_half$Player_1, fromLast = T)[1:numb.players]
} else {
rev(unique(prior_half$Player_1, fromLast = T))[1:numb.players]
}
away_starters[1:5][is.na(away_starters[1:5])] <- players
message(paste("5 starters not found for half", i, "choosing", players, collapse = "/"))
}
# Now an empty matrix is built that will be iterated through to store the lineups
# Repeat process is done for home and away team
home_mat <- matrix(
c(home_starters,home_starters,rep(NA_character_, nrow(half_data) * 5 - 5)),
nrow = nrow(half_data) + 1, ncol = 5, byrow = T)
away_mat <- matrix(
c(away_starters,away_starters,rep(NA_character_, nrow(half_data) * 5 - 5)),
nrow = nrow(half_data) + 1, ncol = 5, byrow = T)
# Vectors of substitutes are used and diminished as events happen to track who is subbed
home_exit_players <- if(length(home_leaving)>0){home_leaving}else{"HOPEFULLY THIS IS NOBODY'S NAME"}
home_enter_players <- if(length(home_entering)>0){home_entering}else{"HOPEFULLY THIS IS NOBODY'S NAME"}
away_exit_players <- if(length(away_leaving)>0){away_leaving}else{"HOPEFULLY THIS IS NOBODY'S NAME"}
away_enter_players <- if(length(away_entering)>0){away_entering}else{"HOPEFULLY THIS IS NOBODY'S NAME"}
# Go through each row of the data frame of events for each half
for (k in 1:(nrow(half_data))) {
#First case looks at if it is a home substitution and the player matches expectations (next in vector and already on court)
if (half_data$Event_Type[k] == "Leaves Game" &
half_data$Event_Team[k] == home_team &
half_data$Player_1[k] == home_exit_players[1] &
!home_enter_players[1] %in% home_mat[k,] &
half_data$Time[k] != "00:00") {
#Then find row index of player leaving
ind <- match(home_mat[k,], home_exit_players[1])
ind <- which(!is.na(ind))
#Create new row replacing the leaving player with the entering one
new_players <- home_mat[k,]
new_players[ind] <- home_enter_players[1]
#Add the new lineup as the next row in the matrix
home_mat[k + 1, ] <- new_players
#Go to the next subbed players for entering and leaving
home_enter_players <-
home_enter_players[2:length(home_enter_players)]
home_exit_players <-
home_exit_players[2:length(home_exit_players)]
#If a home sub occurs, the away lineup will stay the same
away_mat[k + 1, ] <- away_mat[k,]
# Now repeat the same process but for the away team
# Since only one event can occur per row this can be handled with if/else if
} else if (half_data$Event_Type[k] == "Leaves Game" &
half_data$Event_Team[k] == away_team &
half_data$Player_1[k] == away_exit_players[1] &
!away_enter_players[1] %in% away_mat[k,] &
half_data$Time[k] != "00:00") {
ind <- match(away_mat[k,], away_exit_players[1])
ind <- which(!is.na(ind))
new_players <- away_mat[k,]
new_players[ind] <- away_enter_players[1]
away_mat[k + 1, ] <- new_players
away_enter_players <-
away_enter_players[2:length(away_enter_players)]
away_exit_players <-
away_exit_players[2:length(away_exit_players)]
home_mat[k + 1, ] <- home_mat[k,]
# Now handling some error situations with subs
# Often if an entry error occurs, a player is eventually subbed in that is already on the court
# There is no good way to deal with this, I've elected to just ignore the sub. and skip over the change
} else if (half_data$Event_Type[k] == "Leaves Game" &
half_data$Event_Team[k] == away_team &
half_data$Player_1[k] == away_exit_players[1] &
away_enter_players[1] %in% away_mat[k,] &
half_data$Time[k] != "00:00") {
away_enter_players <-
away_enter_players[2:length(away_enter_players)]
away_exit_players <-
away_exit_players[2:length(away_exit_players)]
home_mat[k + 1, ] <- home_mat[k,]
away_mat[k + 1, ] <- away_mat[k,]
} else if (half_data$Event_Type[k] == "Leaves Game" &
half_data$Event_Team[k] == home_team &
half_data$Player_1[k] == home_exit_players[1] &
home_enter_players[1] %in% home_mat[k,] &
half_data$Time[k] != "00:00") {
home_enter_players <-
home_enter_players[2:length(home_enter_players)]
home_exit_players <-
home_exit_players[2:length(home_exit_players)]
home_mat[k + 1, ] <- home_mat[k,]
away_mat[k + 1, ] <- away_mat[k,]
# Finally, if there was no sub-type event we can just fill the new row with the prior
} else{
home_mat[k + 1, ] <- home_mat[k,]
away_mat[k + 1, ] <- away_mat[k,]
}
home_enter_players <- home_enter_players[!is.na(home_enter_players)]
home_exit_players <- home_exit_players[!is.na(home_exit_players)]
away_enter_players <- away_enter_players[!is.na(away_enter_players)]
away_exit_players <- away_exit_players[!is.na(away_exit_players)]
}
#This adds the matrix for each half to the game matrix
home_player_matrix <-
rbind(home_player_matrix, home_mat[-1,])
away_player_matrix <-
rbind(away_player_matrix, away_mat[-1,])
###END OF LOOP
}
# Player Cleaning ####
#Remove the first row as it is made up of NAs from nature of how it's structured
home_player_matrix <- home_player_matrix[-1,]
away_player_matrix <- away_player_matrix[-1,]
#Rename columns to denote players
colnames(home_player_matrix) <-
c("Home.1", "Home.2", "Home.3", "Home.4", "Home.5")
colnames(away_player_matrix) <-
c("Away.1", "Away.2", "Away.3", "Away.4", "Away.5")
#Adds these columns to a new play by play data frame
mild_game <-
dplyr::bind_cols(list(dirty_game, as.data.frame(home_player_matrix, row.names = F), as.data.frame(away_player_matrix, row.names = F))
)
#Add the event length variable which can often be helpful
home_starters <- unlist(mild_game[1,23:27])
away_starters <- unlist(mild_game[1,28:32])
#Can now put together final data frame
clean_game <- mild_game %>%
dplyr::mutate_if(is.factor, as.character) %>%
dplyr::select(
ID:Away,
Half_Status,
Time:Event_Team,
Event_Description,
Player_1,
Player_2,
Event_Type,
Event_Result,
Shot_Value,
Event_Length,
Poss_Num,
Poss_Team,
Poss_Length,
isTransition,
Home.1:Away.5
) %>%
dplyr::mutate(
Status = status,
# Greater than 25 and 10-5 minutes left
# Greater than 20 and 5-2 minutes left
# Greater than 15 and <2 minutes left
Garbage_Thresh = dplyr::case_when(
abs(Home_Score - Away_Score) >= 25 & Game_Seconds >= 1800 ~ T,
abs(Home_Score - Away_Score) >= 20 & Game_Seconds >= 2100 ~ T,
abs(Home_Score - Away_Score) >= 15 & Game_Seconds >= 2280 ~ T,
TRUE ~ F
),
# Only call garbage time if <= 3 starters are in... note: Ben Falk / CTG uses 2
Starter_Thresh = ((Home.1 %in% home_starters) + (Home.2 %in% home_starters) +
(Home.3 %in% home_starters) + (Home.4 %in% home_starters) + (Home.5 %in% home_starters) +
(Away.1 %in% away_starters) + (Away.2 %in% away_starters) + (Away.3 %in% away_starters) +
(Away.4 %in% away_starters) + (Away.5 %in% away_starters)) <= 3,
# If both thresholds are met we hit garbage time and stay in it
isGarbageTime = cumsum(Garbage_Thresh*Starter_Thresh) >= 1
) %>%
dplyr::select(-Garbage_Thresh, -Starter_Thresh) %>%
# Making fix so that the players on the court to start a possession are credited for the entire possession
dplyr::group_by(Poss_Num) %>%
dplyr::mutate(
Home.1 = first(Home.1),
Home.2 = first(Home.2),
Home.3 = first(Home.3),
Home.4 = first(Home.4),
Home.5 = first(Home.5),
Away.1 = first(Away.1),
Away.2 = first(Away.2),
Away.3 = first(Away.3),
Away.4 = first(Away.4),
Away.5 = first(Away.5)
) %>%
dplyr::ungroup()
# Final round of checking for data entry mistakes by scorekeeper
# Look for if a player is said to do an event and they aren't on the court as determined above
player_errors <- apply(clean_game, 1, function(x) {
if (sum(x[which(colnames(clean_game) == "Player_1"):which(colnames(clean_game) == "Player_2")] %in% c(x[which(colnames(clean_game) == "Home.1"):which(colnames(clean_game) == "Away.5")]), na.rm = T) == 0) {
return(T)
} else{
return(F)
}
})
# Oftentimes these errors simply occur when event ordering gets mixed up by scorekeeper
# I don't really think this should be framed as a true error, as they are deadball events
# This isn't necessarily an error
entry_mistakes <-
clean_game[which(
player_errors &
!clean_game$Event_Type %in% c(
"Leaves Game",
"Enters Game",
"Free Throw"
) & clean_game$Player_1 != "TEAM"
),]
# Provide a column for deviations, allowing user to filter pbp with too many errors
clean_game$Sub_Deviate <- nrow(entry_mistakes)
# Warns user of number of entry mistakes found - only report if significant
warn <- ifelse(nrow(entry_mistakes)>15, paste(nrow(entry_mistakes), "deviations"), "")
source <- ifelse(isUrlRead, "web", "local")
# Give user final message about the status of the game they've scraped
message(paste(date, home_team, "v", away_team, "| ", format, "|", game_id, "|", source, "|", warn))
# Sys.sleep so the ncaa server isn't overworked
if(isUrlRead) {
Sys.sleep(2)
}
return(clean_game)
}
}
#' Date Schedule Scrape
#'
#' This function returns a schedule for the given date and specified conference.
#' Results are included if applicable, as well as the play-by-play game id
#' @param date a character object containing a date in the format mm/dd/yyyy. Defaults is previous day (yesterday)
#' @param conference the common name used for a conference, not sensitive to case, spacing, punctuation, etc.
#' @param conference.ID alternatively, if the conference ID is known it replace the conference name variable
#' @importFrom XML readHTMLTable
#' @import dplyr
#' @import stringr
#' @return data frame with each row representing an inidividual game
#' \itemize{
#' \item{Date} - Game date
#' \item{Start_Time} - Start time reported in eastern time zone (I believe)
#' \item{Home} - Home team
#' \item{Away} - Away team
#' \item{GameID} - If the game is finished and has play-by-play data available, the game ID used to scrape game data
#' \item{Home_Score} - If the game is finished, the final score for the home team
#' \item{Away_Score} - If the game is finished, the final score for the away team
#' \item{Attendance} - The attendance count reported by the NCAA
#' \item{Neutral_Site} - A logical variable that is true when the game was played at a designated neutral destination
#' }
#' @export
#' @examples
#' get_date_games(date = "12/11/2018", conference = "Ivy")
#' get_date_games(date = "12/11/2018", conference.ID = 865)
get_date_games <-
function(date = as.character(format(Sys.Date() - 1, "%m/%d/%Y")),
conference = "All",
conference.ID = NA,
use_file = F,
save_file = F,
base_path = NA) {
#First convert the character date given by user into a date object
dateform <- as.Date(as.character(date), format = "%m/%d/%Y")
# Find the season id needed by the url given the date of the game
# The pbp only goes back to 2011 in most cases, so no need to pull deeper
seasonid <- dplyr::case_when(
# 24-25
dateform > as.Date("2024-05-01") &
dateform <= as.Date("2025-05-01") ~ 18403,
# 23-24
dateform > as.Date("2023-05-01") &
dateform <= as.Date("2024-05-01") ~ 18221,
# 22-23
dateform > as.Date("2022-05-01") &
dateform <= as.Date("2023-05-01") ~ 17940,
# 21-22
dateform > as.Date("2021-05-01") &
dateform <= as.Date("2022-05-01") ~ 17783,
# 20-21
dateform > as.Date("2020-05-01") &
dateform <= as.Date("2021-05-01") ~ 17420,
# 19-20
dateform > as.Date("2019-05-01") &
dateform <= as.Date("2020-05-01") ~ 17060,
#18-19
dateform > as.Date("2018-05-01") &
dateform <= as.Date("2019-05-01") ~ 16700,
#17-18
dateform > as.Date("2017-05-01") &
dateform <= as.Date("2018-05-01") ~ 13533,
#16-17
dateform > as.Date("2016-05-01") &
dateform <= as.Date("2017-05-01") ~ 13100,
#15-16
dateform > as.Date("2015-05-01") &
dateform <= as.Date("2016-05-01") ~ 12700,
#14-15
dateform > as.Date("2014-05-01") &
dateform <= as.Date("2015-05-01") ~ 12320,
#13-14
dateform > as.Date("2013-05-01") &
dateform <= as.Date("2014-05-01") ~ 11700,
#12-13
dateform > as.Date("2012-05-01") &
dateform <= as.Date("2013-05-01") ~ 10883,
#11-12
dateform > as.Date("2011-05-01") &
dateform <= as.Date("2012-05-01") ~ 10480,
#10-11
dateform > as.Date("2010-05-01") &
dateform <= as.Date("2011-05-01") ~ 10220,
T ~ 0
)
if (seasonid == 0) {
return("Season Not Available")
}
# If user doesn't know the conference id, they can enter a conference name
# The naming conventions are handled below and stripped of any case, spaces, punctuation, etc.
conferenceform <-
tolower(sub("[^[:alnum:]=\\.]", "", conference))
conferenceid <- dplyr::case_when(
conferenceform == "aac" ~ 823,
conferenceform == "acc" ~ 821,
conferenceform == "asun" ~ 920,
conferenceform == "americaneast" ~ 845,
conferenceform == "atlantic10" ~ 820,
conferenceform == "big12" ~ 25354,
conferenceform == "bigeast" ~ 30184,
conferenceform == "bigsky" ~ 825,
conferenceform == "bigsouth" ~ 826,
conferenceform %in% c("bigten", "big10") ~ 827,
conferenceform == "bigwest" ~ 904,
conferenceform == "cusa" ~ 24312,
conferenceform == "caa" ~ 837,
conferenceform == "horizon" ~ 881,
conferenceform == "ivy" ~ 865,
conferenceform == "maac" ~ 871,
conferenceform == "mac" ~ 875,
conferenceform == "meac" ~ 876,
conferenceform == "mvc" ~ 884,
conferenceform == "mwc" ~ 5486,
conferenceform == "nec" ~ 846,
conferenceform == "ovc" ~ 902,
conferenceform == "pac12" ~ 905,
conferenceform == "patriot" ~ 838,
conferenceform == "sec" ~ 911,
conferenceform == "swac" ~ 916,
conferenceform == "socon" ~ 912,
conferenceform == "southland" ~ 914,
conferenceform == "summit" ~ 819,
conferenceform == "sunbelt" ~ 818,
conferenceform == "wac" ~ 923,
conferenceform == "wcc" ~ 922,
conferenceform == "all" ~ 0,
T ~ 99999999
)
# When a bad entry is found return error
if (conferenceid == 99999999) {
message("Conference ID not found, using all")
# return(NA)
conferenceid = 0
}
# When the user gives their own conference ID, this replaces the text option
if (!is.na(conference.ID)) {
conferenceid = conference.ID
}
#formats date as found in the url
date2 <- gsub("[/]", "%2F", date)
#pulls the necessary url
url_text <-
paste0(
"https://stats.ncaa.org/season_divisions/",
seasonid,
"/scoreboards?game_date=",
date2,
"&conference_id=",
conferenceid,
"&commit=Submit"
)
file_dir <- paste0(base_path, "date_games/")
file_path <- paste0(file_dir, date2, "_", conferenceid, ".html")
# Give user option to save raw html file (to make future processing more efficient)
if (save_file & !is.na(base_path)) {
file_url <- url(url_text, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
dir.create(file_dir, recursive = T, showWarnings = F)
writeLines(html, file_path)
}
# Reads the html and pulls the table holding the scores
if (use_file & !is.na(base_path)) {
html <- readLines(file_path, warn=F)
} else {
file_url <- url(url_text, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
}
table <- XML::readHTMLTable(html)
if(length(table) == 0) {
stop("No Games Table Found")
} else {
table <- table[[1]]
}
#The table is always read in the same messy way
#Each game in the schedule starts on a row following pattern 1,6,11,etc. this gets all of those indices
starting_rows <- (1:(nrow(table) / 5)) * 5 - 4
# Pull game meta data from relevant part of the table
game_date <- as.character(table$V1[starting_rows])
attendance <- as.character(table$V7[starting_rows])
away_team <- as.character(table$V3[starting_rows])
home_team <- as.character(table$V2[starting_rows + 3])
home_score <- as.character(table$V3[starting_rows + 3])
away_score <- as.character(table$V5[starting_rows])
# sees if a box score is available for each game
box_score_present <- as.character(table$V1[starting_rows + 4]) == "Box Score"
#This searches for all game IDs on the schedule page, using links found in the html
game_ids <-
unlist(stringr::str_extract_all(html, "(?<=/contests/)\\d+(?=/box_score)"))
game_ids <- game_ids[which(!game_ids %in% seasonid)]
# Handle cancelled games with missing game ids, or if game is missing a box score
id_found <- rep(NA, length(away_score))
id_found[!away_score %in% c("Canceled", "Ppd") & box_score_present] <- game_ids
#Also creates variable used to find if a game was held at a neutral side
isNeutral <- table$V6[starting_rows] != ""
#Informs user of how many games and games with a relevant ID were found
message(paste(date, "|", length(game_ids), "games found"))
# Unfortunately, the game ID for boxscore isn't the same as the game ID for pbp
# As a result, this function needs to convert from boxscore to pbp but as a result be slower
# Need to read each box score page and find link to pbp page
url2 <-
paste0("https://stats.ncaa.org/contests/", id_found, "/box_score")
# Clean team names (remove records, like "Rutgers (1-0)")
# home_name = gsub(" [(][0-9]{1-2}\\-[0-9]{1-2}[)]","", home_team)
home_name = gsub(" \\([0-9].+","", home_team) |>
gsub(pattern = '\\#[0-9]{1,2} ', replacement = "")
home_wins = as.vector(stringr::str_extract_all(home_team, "(?<=[(])\\d+(?=-)", T))
home_losses = as.vector(stringr::str_extract_all(home_team, "(?<=-)\\d+(?=[)])", T))
# away_name = gsub(" [(][0-9]{1-2}\\-[0-9]{1-2}[)]","", away_team)
away_name = gsub(" \\([0-9].+","", away_team) |>
gsub(pattern = '\\#[0-9]{1,2} ', replacement = "")
away_wins = as.vector(stringr::str_extract_all(away_team, "(?<=[(])\\d+(?=-)", T))
away_losses = as.vector(stringr::str_extract_all(away_team, "(?<=-)\\d+(?=[)])", T))
if(length(home_wins) == 0){home_wins = NA}
if(length(home_losses) == 0){home_losses = NA}
if(length(away_wins) == 0){away_wins = NA}
if(length(away_losses) == 0){away_losses = NA}
#Create dataframe
game_data <- data.frame(
Date = substr(game_date, 1, 10),
Start_Time = substr(game_date, 12, 19),
Home = home_name,
Away = away_name,
BoxID = id_found,
GameID = id_found,
Home_Score = home_score,
Away_Score = away_score,
Attendance = attendance,
Neutral_Site = isNeutral,
Home_Wins = as.numeric(home_wins),
Home_Losses = as.numeric(home_losses),
Away_Wins = as.numeric(away_wins),
Away_Losses = as.numeric(away_losses),
stringsAsFactors = F,
row.names = NULL
)
if(length(id_found)==0){
message("No Game IDs Found")
}
return(game_data)
}
#' Team Schedule Scrape
#'
#' This function returns a data frame of the schedule for the specified team. This will include game ids used
#' for play-by-play scraping if the game has ended, along with the team scores and attendance.
#' @param team.id The unique id given to each college/team for each season. This can be found in the url of the team page.
#' @param season Season following format yyy1-y2, ex "2018-19"
#' @param team.name Alternative to using the id, you can get a team from data(teamids) with a season and team name specification.
#' This inputs a team name, to be used along with season. This needs the school name not the complete team name, so "Duke" not "Duke Blue Devils".
#' @importFrom XML readHTMLTable
#' @import dplyr
#' @import stringr
#' @return data frame with each row representing an individual game
#' \itemize{
#' \item{Date} - Game date
#' \item{Home} - Home team
#' \item{Home_Score} - If the game is finished, the final score for the home team
#' \item{Away} - Away team
#' \item{Away_Score} - If the game is finished, the final score for the away team
#' \item{Game_ID} - If the game is finished and has play-by-play data available, the game ID used to scrape game data
#' \item{isNeutral} - A logical variable that is true when the game was played at a designated neutral destination
#' \item{Detail} - Additional detail such as if the game went into OT and # of OTs
#' }
#' @export
#' @examples
#' get_team_schedule(team.id = 450680)
#' get_team_schedule(season = "2018-19", team.name = "Penn")
get_team_schedule <-
function(team.id = NA,
season = NA,
team.name = NA,
use_file = F,
save_file = F,
base_path = NA,
overwrite = F) {
# If the user doesn't know id and instead gives a team name and season searches team DB for ID
# This can only be done since 16-17 at the moment
if (is.na(team.id) & !is.na(team.name) & !is.na(season)) {
team.id <-
bigballR::teamids$ID[which(bigballR::teamids$Team == team.name & bigballR::teamids$Season == season)]
} else if(is.na(team.id) & is.na(team.name) & is.na(season)){
message("Improper Request")
return(NULL)
}
# Pull the relevant table from the team webpage
url_text <- paste0("https://stats.ncaa.org/teams/", team.id)
file_dir <- paste0(base_path, "team_schedule/")
file_path <- paste0(file_dir, team.id, ".html")
if (save_file & !is.na(base_path) & (!file.exists(file_path) | overwrite)) {
isUrlRead <- T
file_url <- url(url_text, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
dir.create(file_dir, recursive = T, showWarnings = F)
writeLines(html, file_path)
} else if (file.exists(file_path) & use_file) {
html <- readLines(file_path, warn=F)
} else {
isUrlRead <- T
file_url <- url(url_text, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
}
tables <- XML::readHTMLTable(html)
if(!is.null(tables[[1]])) {
df <- data.frame(as.matrix(tables[[1]]), stringsAsFactors = F)
} else {
message(paste(team.id, "has no schedule"))
return(data.frame())
}
df <- df[seq(1,nrow(df), by = 2),]
df <- df[!is.na(df$Opponent),]
# fix strings like "Campbell 2022-23 MBB App State MTE" and "UC Santa Barbara @Phoenix, AZ (2022-23 MBB Jerry Colangelo Classic)"
df$Opponent_with_detail <- df$Opponent |>
str_remove(" 202.*$")
# Keep two different versions: df$Opponent_with_detail includes neutral site descriptions,
# df$Opponent does not.
df$Opponent <- df$Opponent_with_detail |>
str_remove(" \\@[A-Z].*$")
game_ids <-
unlist(stringr::str_extract_all(html, "(?<=contests/)\\d+(?=[/])"))
message("\nParsing Schedule")
# Handle opponent and neutral games as both are broken up using an '@' character
parsed <- lapply(df$Opponent, strsplit, "@")
parsed <- lapply(parsed, function(x) {
x <- unlist(x)
t <- stringr::str_extract(x, "(?<=[\\#[0-9]+] ).*")
t[is.na(t)] <- x[is.na(t)]
if(!any(trimws(t) %in% bigballR::teamids$Team)) {
for(j in 1:length(t)) {
i <- 1
while (!substr(t[j], 1, i) %in% bigballR::teamids$Team && i <= nchar(t[j])) {
i <- i + 1
}
t[j] = substr(t[j], 1, i)
}
}
return(t)
})
# Use df$Opponent_with_detail to get neutral site information
parsed_detail <- lapply(df$Opponent_with_detail, strsplit, "@")
parsed_detail <- lapply(parsed_detail, function(x) {
x <- unlist(x)
t <- stringr::str_extract(x, "(?<=[\\#[0-9]+] ).*")
t[is.na(t)] <- x[is.na(t)]
if(!any(trimws(t) %in% bigballR::teamids$Team)) {
for(j in 1:length(t)) {
i <- 1
while (!substr(t[j], 1, i) %in% bigballR::teamids$Team && i <= nchar(t[j])) {
i <- i + 1
}
t[j] = substr(t[j], 1, i)
}
}
return(t)
})
# Pulls the opponent if they are the home team
Home <-
lapply(parsed, function(x) {
if (x[1] == "" & !is.na(x[2]))
return(x[2])
})
# Searches for neutral game and gets location
Neutral <-
lapply(parsed_detail, function(x) {
if (length(x) == 2 & x[1] != "")
return(x[2])
})
#Iterate through games and finds the home and away team
home_team <- rep(NA, length(parsed))
away_team <- rep(NA, length(parsed))
is_neutral <- rep(F, length(parsed_detail))
for (i in 1:length(parsed)) {
if (!is.null(Home[[i]])) {
home_team[i] <- trimws(Home[[i]])
} else {
away_team[i] <- trimws(parsed[[i]][[1]][1])
}
# Also updates variable to specify if there is a home game
if (!is.null(Neutral[[i]])) {
is_neutral[i] <- T
}
}
team_name <- bigballR::teamids$Team[which(bigballR::teamids$ID == team.id)]
#This cleans the score information
score <- strsplit(df$Result, "-")
selected_score <-
trimws(gsub("W", "", gsub("L", "", sapply(score, function(x) {
x[1]
}))))
opponent_score <- trimws(sapply(score, function(x) {
x[2]
}))
# Separate out the details, which is # of OTs
detail <- unname(sapply(opponent_score, function(x) {
a <- gsub("\\)", "", strsplit(x, "\\(")[[1]][2])
}))
opponent_score <-
unname(sapply(opponent_score, function(x)
strsplit(x, " \\(")[[1]][1]))
detail <- ifelse(selected_score %in% c("Canceled", "Ppd"), selected_score, detail)
selected_score <- ifelse(selected_score %in% c("Canceled", "Ppd"), NA, selected_score)
# add NA game_ids for cancelled games
new_game_ids <- rep(NA, nrow(df))
if (length(game_ids) > 0) {
new_game_ids[is.na(detail) | detail != 'Canceled'][1:length(game_ids)] <- game_ids
}
#Put everything together into tidy data frame
team_data <- data.frame(
Date = df$Date,
Home = ifelse(!is.na(home_team), home_team, team_name),
Home_Score = ifelse(!is.na(home_team), opponent_score, selected_score),
Away = ifelse(!is.na(away_team), away_team, team_name),
Away_Score = ifelse(!is.na(away_team), opponent_score, selected_score),
Box_ID = new_game_ids,
Game_ID = new_game_ids,
isNeutral = is_neutral,
Detail = detail,
Attendance = as.numeric(gsub(",", "", df$Attendance)),
stringsAsFactors = F
)
#Replace blank portions of schedule with dashes, as that is used on NCAA site but NA is better for this purpose
team_data[team_data == "-"] <- NA
#Give user final status update and returns the df
message(paste0(
team_name[1],
" complete -- ",
nrow(team_data),
"/",
length(game_ids),
" games/ids found"
))
return(team_data)
}
#' Team Roster Scrape
#'
#' This function returns a data frame of the roster for the specified team. This will include player names and positions
#' as well as jersey number, height and school year.
#' @param team.id The unique id given to each college/team for each season. This can be found in the url of the team page.
#' @param season Alternative to using the id, you can get a team from data(teamids) with a season and team name specification.
#' String for the season stored as yyy1-y2 (2018-19 is current)
#' @param team.name Alternative to using the id, you can get a team from data(teamids) with a season and team name specification.
#' This inputs a team name, to be used along with season. This needs the school name not the complete team name, so "Duke" not "Duke Blue Devils".
#' @importFrom XML readHTMLTable
#' @import stringr
#' @import dplyr
#' @return data frame with each row representing a player on the roster
#' \itemize{
#' \item{Jersey} - Player jersey number
#' \item{Player} - Name of Player
#' \item{Pos} - Position (one of G,F,C) as designated by the NCAA
#' \item{Ht} - Height as reported by the NCAA
#' \item{Yr} - School year, as Fr, So, Jr, Sr
#' }
#' @export
#' @examples
#' get_team_roster(team.id = 450680)
#' get_team_roster(season = "2018-19", team.name = "Penn")
get_team_roster <-
function(team.id = NA,
season = NA,
team.name = NA,
use_file = F,
save_file = F,
base_path = NA,
overwrite = F) {
# If the user doesn't know id and instead gives a team name and season searches team DB for ID
# This can only be done since 16-17 at the moment
if (is.na(team.id) & !is.na(team.name) & !is.na(season)) {
team.id <-
bigballR::teamids$ID[which(bigballR::teamids$Team == team.name & bigballR::teamids$Season == season)]
} else if(is.na(team.id) & is.na(team.name) & is.na(season)){
message("Improper Request")
return(NULL)
}
#Pull html for the team page
url_text <- paste0("https://stats.ncaa.org/teams/", team.id)
file_dir <- paste0(base_path, "team_schedule/")
file_path <- paste0(file_dir, team.id, ".html")
isUrlRead <- F
# Give user option to save raw html file (to make future processing more efficient)
if (save_file & !is.na(base_path) & (!file.exists(file_path) | overwrite)) {
isUrlRead <- T
file_url <- url(url_text, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
dir.create(file_dir, recursive = T, showWarnings = F)
writeLines(html, file_path)
} else if (file.exists(file_path) & use_file) {
html <- readLines(file_path, warn=F)
} else {
isUrlRead <- T
file_url <- url(url_text, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
}
#Find link to the team roster page
roster_link <- html[which(grepl("Roster", html))]
roster_link <-
stringr::str_extract_all(roster_link, "(?<=\\\")(.*)(?=[\\\"])")
roster_url <- paste0("https://stats.ncaa.org", roster_link)
roster_url <- paste0("https://stats.ncaa.org/teams/", team.id, "/roster")
#Read html for the roster page and format it so it can be usabl
file_dir <- paste0(base_path, "team_roster/")
file_path <- paste0(file_dir, team.id, ".html")
if (save_file & !is.na(base_path) & (!file.exists(file_path) | overwrite)) {
isUrlRead <- T
file_url <- url(roster_url, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
dir.create(file_dir, recursive = T, showWarnings = F)
writeLines(html, file_path)
} else if (file.exists(file_path) & use_file) {
html <- readLines(file_path, warn=F)
} else {
isUrlRead <- T
file_url <- url(roster_url, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
}
table <- XML::readHTMLTable(html)[[1]][, 1:9] %>%
mutate(across(everything(), as.character))
# Return the more usable roster page
player <- table$Name
clean_name <- player
format <- gsub("[^[:alnum:] ]", "", clean_name)
format <- toupper(gsub("\\s+",".", format))
player_name <- gsub("(\\.JR\\.|\\.SR\\.|\\.J\\.R\\.|\\.JR\\.|JR\\.|SR\\.|\\.SR|\\.JR|\\.SR|\\.III|\\.II|\\.IV)$","", format)
player_name <- trimws(player_name)
table$Player <- player_name
table$CleanName <- clean_name
table$HtInches <- unname(sapply(table$Height, function(x){
a = as.numeric(strsplit(x,"-")[[1]])
12*a[1] + a[2]
}))
if (isUrlRead) {
Sys.sleep(0.5)
}
return(table)
}
#' Multiple Game Play-By-Play Scraper
#'
#' This is the suggested function for scraping play-by-play, as it handles and binds multiple games.
#' Please see the scrape_game() function documentation for a more detailed description.
#' @param game_ids A string/numeric object or vector containing game ids
#' @export
#' @examples
#' get_play_by_play(c(4671170, 4674164))
#' team_sched <- get_team_schedule(team.id = 450680)
#' get_play_by_play(team_sched$Game_ID)
get_play_by_play <- function(game_ids, use_file = F, save_file = F, base_path = NA, overwrite=F) {
#Cleans list of game ids to remove nas
game_ids <- game_ids[!is.na(game_ids)]
#Scrape all game ids into list
game_list <- lapply(game_ids, function(x) {
# Add error handling so if one game throws an error it will report and continue iterating
tryCatch(scrape_game(x, use_file = use_file, save_file = save_file, base_path = base_path, overwrite=overwrite), error = function(e){
print(paste0("Error with game id: ", x, " // ", e))
return(NA)
})
})
dirty_ind <- which(is.na(game_list))
#Remove any incorrect games found
if(length(dirty_ind) > 0) game_list <- game_list[-dirty_ind]
#Bind rows together and return combined dataframe
game_data <- do.call("binder", game_list)
if(length(dirty_ind) != 0) {
message(paste(paste(game_ids[dirty_ind], collapse = ","), "removed"))
}
return(game_data)
}
#' Lineup Compiler
#'
#' This function takes in a play-by-play dataframe, and generates all possible lineups for both teams.
#' It then calculates a variety of statistics/metrics at a lineup level.
#' @param play_by_play_data data frame consisting of play-by-play data from the functions scrape_game() or get_play_by_play()
#' @import dplyr
#' @export
#' @return data frame with each row representing a unique lineup. All stats for entire lineup or opponent (o- prefix)
#' \itemize{
#' \item{P1} - Player in lineup
#' \item{P2} - Player in lineup
#' \item{P3} - Player in lineup
#' \item{P4} - Player in lineup
#' \item{P5} - Player in lineup
#' \item{Team} - Team for the lineup
#' \item{Mins} - Minutes the lineup was on the court
#' \item{PTS} - Points scored
#' \item{FGA} - Field goal attempts
#' \item{TO} - Turnovers
#' \item{TPA} - Three point attempts
#' \item{FGM} - Field goals made
#' \item{TPM} - Three points made
#' \item{FTA} - Free throw attempts
#' \item{FTM} - Free throws made
#' \item{ORB} - Offensive rebounds
#' \item{DRB} - Defensive rebounds
#' \item{RIMA} - Rim attempts: defined as layups, dunks, tip-ins, hook attempts
#' \item{BLK} - Blocked shots
#' \item{AST} - Assists
#' \item{POSS} - (Offensive) Possessions: (FGA + .475 x FTA - ORB + TO + oFGA + .475 x oFTA - oORB + oTO) /2
#' \item{TS.} - True shooting percentage: (PTS / 2) / (FGA + .475 x FTA),
#' \item{eFG.} - Effective field goal percentage: (FGM + 0.5 x TPM) / FGA,
#' \item{TPP} - Three point percentage: TPA/TPM
#' \item{FTP} - Free throw percentage: FTA/FTM
#' \item{TPrate} - Three point attempt rate: TPA/FGA
#' \item{ASTrate} - Assist rate: AST/FGM
#' \item{TOrate} - Turnover rate: TO/POSS
#' \item{FTrate} - Free throw rate: FTA/FGA
#' \item{BLKrate} - Block rate: BLK/FGA
#' \item{ORB.} - Offensive rebound percentage: ORB / (ORB + oDRB)
#' \item{DRB.} - Defensive rebound percentage: DRB / (DRB + oORB)
#' \item{ORTG} - Offensive efficiency: 100 * (PTS/POSS)
#' \item{DRTG} - Defensive efficiency: 100 * (oPTS/POSS)
#' \item{NETRTG} - Net efficiency: OEFF - DEFF
#' \item{TimePerPoss} - Average time per possession (Seconds): (Possessions / Mins) * 60
#' }
get_lineups <-
function(play_by_play_data = NA, include_transition = F) {
missing_rows <- apply(play_by_play_data[,which(colnames(play_by_play_data)=="Home.1"):which(colnames(play_by_play_data)=="Away.5")], 1, function(x){sum(is.na(x))})
message(paste("Forced to remove", length(which(missing_rows!=0)), "rows due to missing players in on/off"))
lineup_stuff <- play_by_play_data %>%
dplyr::filter(missing_rows==0) %>%
dplyr::filter(!Event_Type %in% c("Enters Game", "Leaves Game"))
# Now sorts the home and away player alphabetically so players are always in the same column for a given lineup
lineup_stuff2 <- apply(lineup_stuff, 1, function(x)
{
home_players <- sort(x[which(colnames(lineup_stuff)=="Home.1"):which(colnames(lineup_stuff)=="Home.5")])
away_players <- sort(x[which(colnames(lineup_stuff)=="Away.1"):which(colnames(lineup_stuff)=="Away.5")])
return(c(x[1:(which(colnames(lineup_stuff)=="Home.1")-1)], home_players, away_players, x[(which(colnames(lineup_stuff)=="Away.5")+1):ncol(lineup_stuff)]))
})
#Converts the sorted back into a data frame
lineup_stuff2 <-
data.frame(matrix(unlist(lineup_stuff2), ncol = ncol(lineup_stuff), byrow = T), stringsAsFactors = F)
colnames(lineup_stuff2) <- colnames(play_by_play_data)
#Get all home lineups and calculate a variety of stats for each lineup
#o is used to denote opponents
suppressMessages(
home_lineups <- lineup_stuff2 %>%
dplyr::group_by(Home.1, Home.2, Home.3, Home.4, Home.5, Home) %>%
dplyr::mutate(Shot_Value = as.numeric(Shot_Value),
Event_Length = as.numeric(Event_Length),
Poss_Num = as.numeric(Poss_Num),
oPOSS = ifelse(Poss_Team == Home, as.numeric(paste0(ID,Poss_Num)), NA),
dPOSS = ifelse(Poss_Team == Away, as.numeric(paste0(ID,Poss_Num)), NA)
) %>%
dplyr::summarise(
#can sum event lengths to get total amount of time across entries
Mins = sum(Event_Length, na.rm = T)/60,
oMins = sum(Event_Length * !is.na(oPOSS), na.rm = T)/60,
dMins = sum(Event_Length * !is.na(dPOSS), na.rm = T)/60,
# Get total possessions by the count of distinct possession numbers
oPOSS = dplyr::n_distinct(oPOSS, na.rm = T),
dPOSS = dplyr::n_distinct(dPOSS, na.rm = T),
#points
PTS = sum(
(Event_Team == Home) * (Event_Result == "made") * Shot_Value, na.rm = T),
dPTS = sum(
(Event_Team == Away) * (Event_Result == "made") * Shot_Value, na.rm = T),
#field goal attempts
FGA = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * 1, na.rm = T),
dFGA = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * 1, na.rm = T),
#field goal makes
FGM = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * (Event_Result == "made") * 1, na.rm = T),
dFGM = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * (Event_Result == "made") * 1, na.rm = T),
#three point attempts
TPA = sum((Shot_Value == 3) * (Event_Team == Home) * 1, na.rm = T),
dTPA = sum((Shot_Value == 3) * (Event_Team == Away) * 1, na.rm = T),
#three point makes
TPM = sum((Shot_Value == 3) * (Event_Team == Home) * (Event_Result == "made") * 1, na.rm = T),
dTPM = sum((Shot_Value == 3) * (Event_Team == Away) * (Event_Result == "made") * 1, na.rm = T),
#free throw attempts
FTA = sum((Shot_Value == 1) * (Event_Team == Home) * 1, na.rm = T),
dFTA = sum((Shot_Value == 1) * (Event_Team == Away) * 1, na.rm = T),
#free throw makes
FTM = sum((Shot_Value == 1) * (Event_Team == Home) * (Event_Result == "made") * 1, na.rm = T),
dFTM = sum((Shot_Value == 1) * (Event_Team == Away) * (Event_Result == "made") * 1, na.rm = T),
#rough estimate of rim attempts using terminology of ncaa
RIMA = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Home) * 1, na.rm = T),
dRIMA = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Away) * 1, na.rm = T),
RIMM = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Home) * 1, na.rm = T),
dRIMM = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Away) * 1, na.rm = T),
#offensive rebounds
ORB = sum((Event_Type == "Offensive Rebound") * (Event_Team == Home) * 1, na.rm = T),
dORB = sum((Event_Type == "Offensive Rebound") * (Event_Team == Away) * 1, na.rm = T),
#defensive rebounds
DRB = sum((Event_Type == "Defensive Rebound") * (Event_Team == Home) * 1, na.rm = T),
dDRB = sum((Event_Type == "Defensive Rebound") * (Event_Team == Away) * 1, na.rm = T),
#blocked shots
BLK = sum((Event_Type == "Blocked Shot") * (Event_Team == Home) * 1, na.rm = T),
dBLK = sum((Event_Type == "Blocked Shot") * (Event_Team == Away) * 1, na.rm = T),
#turnovers
TO = sum((Event_Type == "Turnover") * (Event_Team == Home) * 1, na.rm = T),
dTO = sum((Event_Type == "Turnover") * (Event_Team == Away) * 1, na.rm = T),
#assists
AST = sum((!is.na(Player_2)) * (Event_Team == Home) * 1, na.rm = T),
dAST = sum((!is.na(Player_2)) * (Event_Team == Away) * 1, na.rm = T)
) %>%
dplyr::rename(
P1 = Home.1,
P2 = Home.2,
P3 = Home.3,
P4 = Home.4,
P5 = Home.5,
Team = Home
))
# Now calculate optional half-court + transition
if(include_transition) {
suppressMessages(
home_lineups_trans <- lineup_stuff2 %>%
dplyr::group_by(Home.1, Home.2, Home.3, Home.4, Home.5, Home) %>%
dplyr::mutate(Shot_Value = as.numeric(Shot_Value),
Event_Length = as.numeric(Event_Length),
isTransition = as.logical(isTransition)*1,
isHalfCourt = 1-isTransition,
Poss_Num = as.numeric(Poss_Num),
oPOSS_num = ifelse(Poss_Team == Home, as.numeric(paste0(ID,Poss_Num)), NA),
dPOSS_num = ifelse(Poss_Team == Away, as.numeric(paste0(ID,Poss_Num)), NA)
) %>%
dplyr::summarise(
#can sum event lengths to get total amount of time across entries
Mins_trans = sum(Event_Length * isTransition, na.rm = T)/60,
oMins_trans = sum(Event_Length * (!is.na(oPOSS_num)*1) * isTransition, na.rm = T)/60,
dMins_trans = sum(Event_Length * (!is.na(dPOSS_num)*1) * isTransition, na.rm = T)/60,
# Get total possessions by the count of distinct possession numbers
oPOSS_trans = dplyr::n_distinct(oPOSS_num *ifelse(isTransition == 1, 1, NA), na.rm = T),
dPOSS_trans = dplyr::n_distinct(dPOSS_num*ifelse(isTransition == 1, 1, NA), na.rm = T),
#points
PTS_trans = sum(
(Event_Team == Home) * (Event_Result == "made") * Shot_Value * isTransition, na.rm = T),
dPTS_trans = sum(
(Event_Team == Away) * (Event_Result == "made") * Shot_Value * isTransition, na.rm = T),
#field goal attempts
FGA_trans = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * isTransition, na.rm = T),
dFGA_trans = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * isTransition, na.rm = T),
#field goal makes
FGM_trans = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * (Event_Result == "made") * isTransition, na.rm = T),
dFGM_trans = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * (Event_Result == "made") * isTransition, na.rm = T),
#three point attempts
TPA_trans = sum((Shot_Value == 3) * (Event_Team == Home) * isTransition, na.rm = T),
dTPA_trans = sum((Shot_Value == 3) * (Event_Team == Away) * isTransition, na.rm = T),
#three point makes
TPM_trans = sum((Shot_Value == 3) * (Event_Team == Home) * (Event_Result == "made") * isTransition, na.rm = T),
dTPM_trans = sum((Shot_Value == 3) * (Event_Team == Away) * (Event_Result == "made") * isTransition, na.rm = T),
#free throw attempts
FTA_trans = sum((Shot_Value == 1) * (Event_Team == Home) * isTransition, na.rm = T),
dFTA_trans = sum((Shot_Value == 1) * (Event_Team == Away) * isTransition, na.rm = T),
#free throw makes
FTM_trans = sum((Shot_Value == 1) * (Event_Team == Home) * (Event_Result == "made") * isTransition, na.rm = T),
dFTM_trans = sum((Shot_Value == 1) * (Event_Team == Away) * (Event_Result == "made") * isTransition, na.rm = T),
#rough estimate of rim attempts using terminology of ncaa
RIMA_trans = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Home) * isTransition, na.rm = T),
dRIMA_trans = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Away) * isTransition, na.rm = T),
RIMM_trans = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Home) * isTransition, na.rm = T),
dRIMM_trans = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Away) * isTransition, na.rm = T),
#offensive rebounds
ORB_trans = sum((Event_Type == "Offensive Rebound") * (Event_Team == Home) * isTransition, na.rm = T),
dORB_trans = sum((Event_Type == "Offensive Rebound") * (Event_Team == Away) * isTransition, na.rm = T),
#defensive rebounds
DRB_trans = sum((Event_Type == "Defensive Rebound") * (Event_Team == Home) * isTransition, na.rm = T),
dDRB_trans = sum((Event_Type == "Defensive Rebound") * (Event_Team == Away) * isTransition, na.rm = T),
#blocked shots
BLK_trans = sum((Event_Type == "Blocked Shot") * (Event_Team == Home) * isTransition, na.rm = T),
dBLK_trans = sum((Event_Type == "Blocked Shot") * (Event_Team == Away) * isTransition, na.rm = T),
#turnovers
TO_trans = sum((Event_Type == "Turnover") * (Event_Team == Home) * isTransition, na.rm = T),
dTO_trans = sum((Event_Type == "Turnover") * (Event_Team == Away) * isTransition, na.rm = T),
#assists
AST_trans = sum((!is.na(Player_2)) * (Event_Team == Home) * isTransition, na.rm = T),
dAST_trans = sum((!is.na(Player_2)) * (Event_Team == Away) * isTransition, na.rm = T),
#HALF COURT
Mins_half = sum(Event_Length * isHalfCourt, na.rm = T)/60,
oMins_half = sum(Event_Length * (!is.na(oPOSS_num)*1) * isHalfCourt, na.rm = T)/60,
dMins_half = sum(Event_Length * (!is.na(dPOSS_num)*1) * isHalfCourt, na.rm = T)/60,
oPOSS_half = dplyr::n_distinct(oPOSS_num *ifelse(isHalfCourt == 1, 1, NA), na.rm = T),
dPOSS_half = dplyr::n_distinct(dPOSS_num*ifelse(isHalfCourt == 1, 1, NA), na.rm = T),
PTS_half = sum(
(Event_Team == Home) * (Event_Result == "made") * Shot_Value * isHalfCourt, na.rm = T),
dPTS_half = sum(
(Event_Team == Away) * (Event_Result == "made") * Shot_Value * isHalfCourt, na.rm = T),
FGA_half = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * isHalfCourt, na.rm = T),
dFGA_half = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * isHalfCourt, na.rm = T),
FGM_half = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * (Event_Result == "made") * isHalfCourt, na.rm = T),
dFGM_half = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * (Event_Result == "made") * isHalfCourt, na.rm = T),
TPA_half = sum((Shot_Value == 3) * (Event_Team == Home) * isHalfCourt, na.rm = T),
dTPA_half = sum((Shot_Value == 3) * (Event_Team == Away) * isHalfCourt, na.rm = T),
TPM_half = sum((Shot_Value == 3) * (Event_Team == Home) * (Event_Result == "made") * isHalfCourt, na.rm = T),
dTPM_half = sum((Shot_Value == 3) * (Event_Team == Away) * (Event_Result == "made") * isHalfCourt, na.rm = T),
FTA_half = sum((Shot_Value == 1) * (Event_Team == Home) * isHalfCourt, na.rm = T),
dFTA_half = sum((Shot_Value == 1) * (Event_Team == Away) * isHalfCourt, na.rm = T),
FTM_half = sum((Shot_Value == 1) * (Event_Team == Home) * (Event_Result == "made") * isHalfCourt, na.rm = T),
dFTM_half = sum((Shot_Value == 1) * (Event_Team == Away) * (Event_Result == "made") * isHalfCourt, na.rm = T),
RIMA_half = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Home) * isHalfCourt, na.rm = T),
dRIMA_half = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Away) * isHalfCourt, na.rm = T),
RIMM_half = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Home) * isHalfCourt, na.rm = T),
dRIMM_half = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Away) * isHalfCourt, na.rm = T),
ORB_half = sum((Event_Type == "Offensive Rebound") * (Event_Team == Home) * isHalfCourt, na.rm = T),
dORB_half = sum((Event_Type == "Offensive Rebound") * (Event_Team == Away) * isHalfCourt, na.rm = T),
DRB_half = sum((Event_Type == "Defensive Rebound") * (Event_Team == Home) * isHalfCourt, na.rm = T),
dDRB_half = sum((Event_Type == "Defensive Rebound") * (Event_Team == Away) * isHalfCourt, na.rm = T),
BLK_half = sum((Event_Type == "Blocked Shot") * (Event_Team == Home) * isHalfCourt, na.rm = T),
dBLK_half = sum((Event_Type == "Blocked Shot") * (Event_Team == Away) * isHalfCourt, na.rm = T),
TO_half = sum((Event_Type == "Turnover") * (Event_Team == Home) * isHalfCourt, na.rm = T),
dTO_half = sum((Event_Type == "Turnover") * (Event_Team == Away) * isHalfCourt, na.rm = T),
AST_half = sum((!is.na(Player_2)) * (Event_Team == Home) * isHalfCourt, na.rm = T),
dAST_half = sum((!is.na(Player_2)) * (Event_Team == Away) * isHalfCourt, na.rm = T)
) %>%
dplyr::rename(
P1 = Home.1,
P2 = Home.2,
P3 = Home.3,
P4 = Home.4,
P5 = Home.5,
Team = Home
)
)
}
#same done for away team
suppressMessages(
away_lineups <- lineup_stuff2 %>%
dplyr::group_by(Away.1, Away.2, Away.3, Away.4, Away.5, Away) %>%
dplyr::mutate(Shot_Value = as.numeric(Shot_Value),
Event_Length = as.numeric(Event_Length),
Poss_Num = as.numeric(Poss_Num),
oPOSS_num = ifelse(Poss_Team == Away, as.numeric(paste0(ID,Poss_Num)), NA),
dPOSS_num = ifelse(Poss_Team == Home, as.numeric(paste0(ID,Poss_Num)), NA)
) %>%
dplyr::summarise(
Mins = sum(Event_Length / 60, na.rm = T),
oMins = sum(Event_Length * !is.na(oPOSS_num), na.rm = T)/60,
dMins = sum(Event_Length * !is.na(dPOSS_num), na.rm = T)/60,
oPOSS = dplyr::n_distinct(oPOSS_num, na.rm = T),
dPOSS = dplyr::n_distinct(dPOSS_num, na.rm = T),
PTS = sum((Event_Team == Away) * (Event_Result == "made") * Shot_Value, na.rm = T),
dPTS = sum((Event_Team == Home) * (Event_Result == "made") * Shot_Value,na.rm = T),
FGA = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * 1, na.rm = T),
dFGA = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * 1, na.rm = T),
FGM = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * (Event_Result == "made") * 1, na.rm = T),
dFGM = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * (Event_Result == "made") * 1, na.rm = T),
TPA = sum((Shot_Value == 3) * (Event_Team == Away) * 1, na.rm = T),
dTPA = sum((Shot_Value == 3) * (Event_Team == Home) * 1, na.rm = T),
TPM = sum((Shot_Value == 3) * (Event_Team == Away) * (Event_Result == "made") * 1, na.rm = T),
dTPM = sum((Shot_Value == 3) * (Event_Team == Home) * (Event_Result == "made") * 1, na.rm = T),
FTA = sum((Shot_Value == 1) * (Event_Team == Away) * 1, na.rm = T),
dFTA = sum((Shot_Value == 1) * (Event_Team == Home) * 1, na.rm = T),
FTM = sum((Shot_Value == 1) * (Event_Team == Away) * (Event_Result == "made") * 1, na.rm = T),
dFTM = sum((Shot_Value == 1) * (Event_Team == Home) * (Event_Result == "made") * 1, na.rm = T),
RIMA = sum((
Event_Type %in% c("Dunk", "Layup", "Hook","Tip-In")
) * (Event_Team == Away) * 1, na.rm = T),
dRIMA = sum((
Event_Type %in% c("Dunk", "Layup", "Hook","Tip-In")
) * (Event_Team == Home) * 1, na.rm = T),
RIMM = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Away) * 1, na.rm = T),
dRIMM = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Home) * 1, na.rm = T),
ORB = sum((Event_Type == "Offensive Rebound") * (Event_Team == Away) * 1, na.rm = T),
dORB = sum((Event_Type == "Offensive Rebound") * (Event_Team == Home) * 1, na.rm = T),
DRB = sum((Event_Type == "Defensive Rebound") * (Event_Team == Away) * 1, na.rm = T),
dDRB = sum((Event_Type == "Defensive Rebound") * (Event_Team == Home) * 1, na.rm = T),
BLK = sum((Event_Type == "Blocked Shot") * (Event_Team == Away) * 1, na.rm = T),
dBLK = sum((Event_Type == "Blocked Shot") * (Event_Team == Home) * 1, na.rm = T),
TO = sum((Event_Type == "Turnover") * (Event_Team == Away) * 1, na.rm = T),
dTO = sum((Event_Type == "Turnover") * (Event_Team == Home) * 1, na.rm = T),
AST = sum((!is.na(Player_2)) * (Event_Team == Away) * 1, na.rm = T),
dAST = sum((!is.na(Player_2)) * (Event_Team == Home) * 1, na.rm = T)
) %>%
dplyr::rename(
P1 = Away.1,
P2 = Away.2,
P3 = Away.3,
P4 = Away.4,
P5 = Away.5,
Team = Away
))
if(include_transition) {
suppressMessages(
away_lineups_trans <- lineup_stuff2 %>%
dplyr::group_by(Away.1, Away.2, Away.3, Away.4, Away.5, Away) %>%
dplyr::mutate(Shot_Value = as.numeric(Shot_Value),
Event_Length = as.numeric(Event_Length),
isTransition = as.logical(isTransition)*1,
isHalfCourt = 1-isTransition,
Poss_Num = as.numeric(Poss_Num),
oPOSS_num = ifelse(Poss_Team == Away, as.numeric(paste0(ID,Poss_Num)), NA),
dPOSS_num = ifelse(Poss_Team == Home, as.numeric(paste0(ID,Poss_Num)), NA)
) %>%
dplyr::summarise(
Mins_trans = sum(Event_Length * isTransition, na.rm = T)/60,
oMins_trans = sum(Event_Length * (!is.na(oPOSS_num)*1) * isTransition, na.rm = T)/60,
dMins_trans = sum(Event_Length * (!is.na(dPOSS_num)*1) * isTransition, na.rm = T)/60,
# Get total possessions by the count of distinct possession numbers
oPOSS_trans = dplyr::n_distinct(oPOSS_num *ifelse(isTransition == 1, 1, NA), na.rm = T),
dPOSS_trans = dplyr::n_distinct(dPOSS_num*ifelse(isTransition == 1, 1, NA), na.rm = T),
PTS_trans = sum((Event_Team == Away) * (Event_Result == "made") * Shot_Value * isTransition, na.rm = T),
dPTS_trans = sum((Event_Team == Home) * (Event_Result == "made") * Shot_Value * isTransition,na.rm = T),
FGA_trans = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * isTransition, na.rm = T),
dFGA_trans = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * isTransition, na.rm = T),
FGM_trans = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * (Event_Result == "made") * isTransition, na.rm = T),
dFGM_trans= sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * (Event_Result == "made") * isTransition, na.rm = T),
TPA_trans = sum((Shot_Value == 3) * (Event_Team == Away) * isTransition, na.rm = T),
dTPA_trans = sum((Shot_Value == 3) * (Event_Team == Home) * isTransition, na.rm = T),
TPM_trans = sum((Shot_Value == 3) * (Event_Team == Away) * (Event_Result == "made") * isTransition, na.rm = T),
dTPM_trans = sum((Shot_Value == 3) * (Event_Team == Home) * (Event_Result == "made") * isTransition, na.rm = T),
FTA_trans = sum((Shot_Value == 1) * (Event_Team == Away) * isTransition, na.rm = T),
dFTA_trans = sum((Shot_Value == 1) * (Event_Team == Home) * isTransition, na.rm = T),
FTM_trans = sum((Shot_Value == 1) * (Event_Team == Away) * (Event_Result == "made") * isTransition, na.rm = T),
dFTM_trans = sum((Shot_Value == 1) * (Event_Team == Home) * (Event_Result == "made") * isTransition, na.rm = T),
RIMA_trans = sum((
Event_Type %in% c("Dunk", "Layup", "Hook","Tip-In")
) * (Event_Team == Away) * isTransition, na.rm = T),
dRIMA_trans = sum((
Event_Type %in% c("Dunk", "Layup", "Hook","Tip-In")
) * (Event_Team == Home) * isTransition, na.rm = T),
RIMM_trans = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Away) * isTransition, na.rm = T),
dRIMM_trans = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Home) * isTransition, na.rm = T),
ORB_trans = sum((Event_Type == "Offensive Rebound") * (Event_Team == Away) * isTransition, na.rm = T),
dORB_trans = sum((Event_Type == "Offensive Rebound") * (Event_Team == Home) * isTransition, na.rm = T),
DRB_trans = sum((Event_Type == "Defensive Rebound") * (Event_Team == Away) * isTransition, na.rm = T),
dDRB_trans = sum((Event_Type == "Defensive Rebound") * (Event_Team == Home) * isTransition, na.rm = T),
BLK_trans = sum((Event_Type == "Blocked Shot") * (Event_Team == Away) * isTransition, na.rm = T),
dBLK_trans = sum((Event_Type == "Blocked Shot") * (Event_Team == Home) * isTransition, na.rm = T),
TO_trans = sum((Event_Type == "Turnover") * (Event_Team == Away) * isTransition, na.rm = T),
dTO_trans = sum((Event_Type == "Turnover") * (Event_Team == Home) * isTransition, na.rm = T),
AST_trans = sum((!is.na(Player_2)) * (Event_Team == Away) * isTransition, na.rm = T),
dAST_trans = sum((!is.na(Player_2)) * (Event_Team == Home) * isTransition, na.rm = T),
# HALF COURT
Mins_half = sum(Event_Length * isHalfCourt, na.rm = T)/60,
oMins_half = sum(Event_Length * (!is.na(oPOSS_num)*1) * isHalfCourt, na.rm = T)/60,
dMins_half = sum(Event_Length * (!is.na(dPOSS_num)*1) * isHalfCourt, na.rm = T)/60,
# Get total possessions by the count of distinct possession numbers
oPOSS_half = dplyr::n_distinct(oPOSS_num *ifelse(isHalfCourt == 1, 1, NA), na.rm = T),
dPOSS_half = dplyr::n_distinct(dPOSS_num*ifelse(isHalfCourt == 1, 1, NA), na.rm = T),
PTS_half = sum((Event_Team == Away) * (Event_Result == "made") * Shot_Value * isHalfCourt, na.rm = T),
dPTS_half = sum((Event_Team == Home) * (Event_Result == "made") * Shot_Value * isHalfCourt,na.rm = T),
FGA_half = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * isHalfCourt, na.rm = T),
dFGA_half = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * isHalfCourt, na.rm = T),
FGM_half = sum((Shot_Value %in% c(2, 3)) * (Event_Team == Away) * (Event_Result == "made") * isHalfCourt, na.rm = T),
dFGM_half= sum((Shot_Value %in% c(2, 3)) * (Event_Team == Home) * (Event_Result == "made") * isHalfCourt, na.rm = T),
TPA_half = sum((Shot_Value == 3) * (Event_Team == Away) * isHalfCourt, na.rm = T),
dTPA_half = sum((Shot_Value == 3) * (Event_Team == Home) * isHalfCourt, na.rm = T),
TPM_half = sum((Shot_Value == 3) * (Event_Team == Away) * (Event_Result == "made") * isHalfCourt, na.rm = T),
dTPM_half = sum((Shot_Value == 3) * (Event_Team == Home) * (Event_Result == "made") * isHalfCourt, na.rm = T),
FTA_half = sum((Shot_Value == 1) * (Event_Team == Away) * isHalfCourt, na.rm = T),
dFTA_half = sum((Shot_Value == 1) * (Event_Team == Home) * isHalfCourt, na.rm = T),
FTM_half = sum((Shot_Value == 1) * (Event_Team == Away) * (Event_Result == "made") * isHalfCourt, na.rm = T),
dFTM_half = sum((Shot_Value == 1) * (Event_Team == Home) * (Event_Result == "made") * isHalfCourt, na.rm = T),
RIMA_half = sum((
Event_Type %in% c("Dunk", "Layup", "Hook","Tip-In")
) * (Event_Team == Away) * isHalfCourt, na.rm = T),
dRIMA_half = sum((
Event_Type %in% c("Dunk", "Layup", "Hook","Tip-In")
) * (Event_Team == Home) * isHalfCourt, na.rm = T),
RIMM_half = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Away) * isHalfCourt, na.rm = T),
dRIMM_half = sum((Event_Result == "made") * (
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (Event_Team == Home) * isHalfCourt, na.rm = T),
ORB_half = sum((Event_Type == "Offensive Rebound") * (Event_Team == Away) * isHalfCourt, na.rm = T),
dORB_half = sum((Event_Type == "Offensive Rebound") * (Event_Team == Home) * isHalfCourt, na.rm = T),
DRB_half = sum((Event_Type == "Defensive Rebound") * (Event_Team == Away) * isHalfCourt, na.rm = T),
dDRB_half = sum((Event_Type == "Defensive Rebound") * (Event_Team == Home) * isHalfCourt, na.rm = T),
BLK_half = sum((Event_Type == "Blocked Shot") * (Event_Team == Away) * isHalfCourt, na.rm = T),
dBLK_half = sum((Event_Type == "Blocked Shot") * (Event_Team == Home) * isHalfCourt, na.rm = T),
TO_half = sum((Event_Type == "Turnover") * (Event_Team == Away) * isHalfCourt, na.rm = T),
dTO_half = sum((Event_Type == "Turnover") * (Event_Team == Home) * isHalfCourt, na.rm = T),
AST_half = sum((!is.na(Player_2)) * (Event_Team == Away) * isHalfCourt, na.rm = T),
dAST_half = sum((!is.na(Player_2)) * (Event_Team == Home) * isHalfCourt, na.rm = T)
) %>%
dplyr::rename(
P1 = Away.1,
P2 = Away.2,
P3 = Away.3,
P4 = Away.4,
P5 = Away.5,
Team = Away
))
}
#combine lineups from home and away and calculate a variety of stats
suppressMessages(
lineups <- dplyr::bind_rows(home_lineups, away_lineups) %>%
dplyr::group_by(P1, P2, P3, P4, P5, Team) %>%
dplyr::summarise_if(is.numeric, sum) %>%
dplyr::mutate(
#estimate of possesions using common formula
ePOSS = (round(FGA + .475*FTA - ORB + TO) + round(dFGA + .475*dFTA - dORB + dTO)) / 2,
#efficiency scaled to points per 100 possessions
ORTG = PTS / oPOSS * 100,
DRTG = dPTS / dPOSS * 100,
NETRTG = ORTG - DRTG,
# field goal percentage
FG. = FGM / FGA,
dFG. = dFGM / dFGA,
#three point percentage
TPP = TPM / TPA,
dTPP = dTPM / dTPA,
#free throw percentage
FTP = FTM / FTA,
dFTP = dFTM / dFTA,
#effective shooting percentage
eFG. = (FGM + 0.5 * TPM) / FGA,
deFG. = (dFGM + 0.5 * dTPM) / dFGA,
#true shooting percentage
TS. = (PTS / 2) / (FGA + .475 * FTA),
dTS. = (dPTS / 2) / (dFGA + .475 * dFTA),
# rim field goal percentage
RIM. = RIMM / RIMA,
dRIM. = dRIMM / dRIMA,
# midrange field goal percentage
MID. = (FGM - RIMM - TPM) / (FGA - RIMA - TPA),
dMID. = (dFGM - dRIMM - dTPM) / (dFGA - dRIMA - dTPA),
#% of fga that are threes
TPrate = TPA / FGA,
dTPrate = dTPA / dFGA,
#% of fga at the rim
RIMrate = RIMA / FGA,
dRIMrate = dRIMA / dFGA,
#%midrange fga
MIDrate = (FGA - TPA - RIMA) / FGA,
dMIDrate = (dFGA - dTPA - dRIMA) / dFGA,
#rate of free throw attempts per field goal attempt
FTrate = FTA / FGA,
dFTrate = dFTA / dFGA,
#percentage of makes that are assisted
ASTrate = AST / FGM,
dASTrate = dAST / dFGM,
#percentage of possessions ending with turnovers
TOrate = TO / oPOSS,
dTOrate = dTO / dPOSS,
#rate that team blocks shots (so defensively) per opponent attempt
BLKrate = BLK / dFGA,
oBLKrate = dBLK / FGA,
#rebounding percentages
ORB. = ORB / (ORB + dDRB),
DRB. = DRB / (DRB + dORB),
# time per possession in seconds = Mins per possession
TimePerPoss = (oMins / oPOSS) * 60,
dTimePerPoss = (dMins / dPOSS) * 60
) %>%
#no need to have long decimals so round everything
dplyr::mutate(across(where(is.numeric), ~ round(., 3))) %>%
dplyr::ungroup() %>%
dplyr::select(P1:Team, Mins:dPOSS, ORTG:NETRTG, dplyr::everything()))
#change any NA/infinite/etc. that comes up in calculations to 0
lineups[is.na(lineups)] <- 0
lineups[,7:ncol(lineups)] <- apply(lineups[,7:ncol(lineups)], 2, function(x){ifelse(is.infinite(x),0,x)})
if(include_transition) {
#combine lineups from home and away and calculate a variety of stats
suppressMessages(
lineups_trans <- dplyr::bind_rows(home_lineups_trans, away_lineups_trans) %>%
dplyr::group_by(P1, P2, P3, P4, P5, Team) %>%
dplyr::summarise_if(is.numeric, sum) %>%
dplyr::ungroup() %>%
dplyr::mutate(
ORTG_trans = PTS_trans / oPOSS_trans * 100,
DRTG_trans = dPTS_trans / dPOSS_trans * 100,
NETRTG_trans = ORTG_trans - DRTG_trans,
FG._trans = FGM_trans / FGA_trans,
dFG._trans = dFGM_trans / dFGA_trans,
TPP_trans = TPM_trans / TPA_trans,
dTPP_trans = dTPM_trans / dTPA_trans,
FTP_trans = FTM_trans / FTA_trans,
dFTP_trans = dFTM_trans / dFTA_trans,
eFG._trans = (FGM_trans + 0.5 * TPM_trans) / FGA_trans,
deFG._trans = (dFGM_trans + 0.5 * dTPM_trans) / dFGA_trans,
TS._trans = (PTS_trans / 2) / (FGA_trans + .475 * FTA_trans),
dTS._trans = (dPTS_trans / 2) / (dFGA_trans + .475 * dFTA_trans),
RIM._trans = RIMM_trans / RIMA_trans,
dRIM._trans = dRIMM_trans / dRIMA_trans,
MID._trans = (FGM_trans - RIMM_trans - TPM_trans) / (FGA_trans - RIMA_trans - TPA_trans),
dMID._trans = (dFGM_trans - dRIMM_trans - dTPM_trans) / (dFGA_trans - dRIMA_trans - dTPA_trans),
TPrate_trans = TPA_trans / FGA_trans,
dTPrate_trans = dTPA_trans / dFGA_trans,
RIMrate_trans = RIMA_trans / FGA_trans,
dRIMrate_trans = dRIMA_trans / dFGA_trans,
MIDrate_trans = (FGA_trans - TPA_trans - RIMA_trans) / FGA_trans,
dMIDrate_trans = (dFGA_trans - dTPA_trans - dRIMA_trans) / dFGA_trans,
FTrate_trans = FTA_trans / FGA_trans,
dFTrate_trans = dFTA_trans / dFGA_trans,
ASTrate_trans = AST_trans / FGM_trans,
dASTrate_trans = dAST_trans / dFGM_trans,
TOrate_trans = TO_trans / oPOSS_trans,
dTOrate_trans = dTO_trans / dPOSS_trans,
BLKrate_trans = BLK_trans / dFGA_trans,
oBLKrate_trans = dBLK_trans / FGA_trans,
ORB._trans = ORB_trans / (ORB_trans + dDRB_trans),
DRB._trans = DRB_trans / (DRB_trans + dORB_trans),
TimePerPoss_trans = (oMins_trans / oPOSS_trans) * 60,
dTimePerPoss_trans = (dMins_trans / dPOSS_trans) * 60,
ORTG_half = PTS_half / oPOSS_half * 100,
DRTG_half = dPTS_half / dPOSS_half * 100,
NETRTG_half = ORTG_half - DRTG_half,
FG._half = FGM_half / FGA_half,
dFG._half = dFGM_half / dFGA_half,
TPP_half = TPM_half / TPA_half,
dTPP_half = dTPM_half / dTPA_half,
FTP_half = FTM_half / FTA_half,
dFTP_half = dFTM_half / dFTA_half,
eFG._half = (FGM_half + 0.5 * TPM_half) / FGA_half,
deFG._half = (dFGM_half + 0.5 * dTPM_half) / dFGA_half,
TS._half = (PTS_half / 2) / (FGA_half + .475 * FTA_half),
dTS._half = (dPTS_half / 2) / (dFGA_half + .475 * dFTA_half),
RIM._half = RIMM_half / RIMA_half,
dRIM._half = dRIMM_half / dRIMA_half,
MID._half = (FGM_half - RIMM_half - TPM_half) / (FGA_half - RIMA_half - TPA_half),
dMID._half = (dFGM_half - dRIMM_half - dTPM_half) / (dFGA_half - dRIMA_half - dTPA_half),
TPrate_half = TPA_half / FGA_half,
dTPrate_half = dTPA_half / dFGA_half,
RIMrate_half = RIMA_half / FGA_half,
dRIMrate_half = dRIMA_half / dFGA_half,
MIDrate_half = (FGA_half - TPA_half - RIMA_half) / FGA_half,
dMIDrate_half = (dFGA_half - dTPA_half - dRIMA_half) / dFGA_half,
FTrate_half = FTA_half / FGA_half,
dFTrate_half = dFTA_half / dFGA_half,
ASTrate_half = AST_half / FGM_half,
dASTrate_half = dAST_half / dFGM_half,
TOrate_half = TO_half / oPOSS_half,
dTOrate_half = dTO_half / dPOSS_half,
BLKrate_half = BLK_half / dFGA_half,
oBLKrate_half = dBLK_half / FGA_half,
ORB._half = ORB_half / (ORB_half + dDRB_half),
DRB._half = DRB_half / (DRB_half + dORB_half),
TimePerPoss_half = (oMins_half / oPOSS_half) * 60,
dTimePerPoss_half = (dMins_half / dPOSS_half) * 60
# HALF COURT
) %>%
#no need to have long decimals so round everything
dplyr::mutate(across(where(is.numeric), ~ round(., 3))) %>%
dplyr::ungroup()
)
#change any NA/infinite/etc. that comes up in calculations to 0
lineups_trans[is.na(lineups_trans)] <- 0
lineups_trans[,7:ncol(lineups_trans)] <- apply(lineups_trans[,7:ncol(lineups_trans)], 2, function(x){ifelse(is.infinite(x),0,x)})
lineups_combo <- lineups %>%
dplyr::left_join(
lineups_trans, by = c("P1", "P2", "P3", "P4", "P5", "Team")
) %>%
dplyr:: mutate(
oTransPCT = oPOSS_trans / oPOSS,
dTransPCT = dPOSS_trans / dPOSS
) %>%
select(P1:dTimePerPoss, oTransPCT, dTransPCT, Mins_trans:dTimePerPoss_half)
lineups_combo[is.na(lineups_combo)] <- 0
lineups_combo[,7:ncol(lineups_combo)] <- apply(lineups_combo[,7:ncol(lineups_combo)], 2, function(x){ifelse(is.infinite(x),0,x)})
return(lineups_combo)
}
return(lineups)
}
#' Team Stats Function
#'
#' This function takes in pbp data and calculates team level stats
#' @param play_by_play_data data frame consisting of play-by-play data from the functions scrape_game() or get_play_by_play()
#' @import dplyr
#' @export
#' @return data frame with each row representing a team
get_team_stats <-
function(play_by_play_data = NA, include_transition = F) {
team_data <- play_by_play_data %>%
dplyr::mutate(across(Home.1:Away.5, ~""))
team_stats <- team_data %>%
dplyr::group_by(ID, Home, Away) %>%
do(get_lineups(., include_transition)) %>%
dplyr::select(-P1:-P5)
}
#' On-Off Comparison Function
#'
#' This function passes in lineup data and calculates the on/off lineup statistics for all lineup combinations of players specified.
#' This allows users to view on/off statistics for individual players, as well as combinations of multiple players. Users can also
#' specify if they'd like specific players to be included or excluded from all lineups in use
#' @param Players character vector of players desired to be compared with on/off
#' @param Lineup_Data data frame made up of lineups collected from the get_lineups() function
#' @param Included character vector of players. These players will be on the court for every lineup considered.
#' @param Excluded character vector of players. These players will be off the court for every lineup considered.
#' @import dplyr
#' @export
#' @return data frame with each row representing an on/off combination. Explanations of statistics found in get_lineups()
#' @examples Duke_Lineups = get_lineups(get_team_schedule(season="2018-19", team.name = "Duke))
#' Get all on and off combinations for the first 3 players when Tre Jones is on the court
#' on_off_generator(Players = c("CAM.REDDISH","RJ.BARRETT","ZION.WILLIAMSON"), Lineup_Data = Duke_Lineups, Included = "TRE.JONES")
on_off_generator <-
function(Players,
Lineup_Data,
Included = NA,
Excluded = NA,
include_transition = F
) {
if(!include_transition) {
Lineup_Data <- dplyr::select(Lineup_Data, -matches("_trans|_half"))
}
#first find which team is being looked for from the players mentioned
find_team <- unique(
dplyr::filter(
Lineup_Data,
P1 %in% Players |
P2 %in% Players |
P3 %in% Players |
P4 %in% Players |
P5 %in% Players
)$Team
)
#if the wrong number of teams are identified
if (length(find_team) != 1) {
stop("ERROR- Player team not found")
# return(NULL)
}
# Generates all relevant lineups using the included and excluded variables
data <- if (!is.na(Included[1]) | !is.na(Excluded[1])) {
get_player_lineups(Lineup_Data, Included, Excluded)
} else{
Lineup_Data %>%
dplyr::filter(Team == find_team)
}
# Create matrix for on/off of each player identified
players <-
matrix(rep(NA, nrow(data) * length(Players)), ncol = length(Players))
colnames(players) <- Players
# Find if the player is in each lineup
for (i in 1:length(Players)) {
players[, i] <- apply(data, 1, function(x) {
playerIn <- Players[i] %in% x[1:5]
})
}
# get all possible combinations of identified players
poss_ind <- rep(list(c(T, F)), length(Players))
combos <- expand.grid(poss_ind)
#calculate stats for each combination generated above
final <- data.frame()
for (i in 1:nrow(combos)) {
relRow <- apply(players, 1,
function(x) {
if (sum(x == combos[i,]) == length(Players)) {
return(T)
} else{
return(F)
}
})
on <- data[relRow == T,] %>%
dplyr::summarise(across(where(is.numeric), sum)) %>%
dplyr::mutate(Status = paste(Players, ifelse(combos[i,], "On", "Off"), collapse = " | "))
final <- dplyr::bind_rows(final, on)
}
#convert to get metrics defined previously
final <- final %>%
dplyr::mutate(
ePOSS = (round(FGA + .475*FTA - ORB + TO) + round(dFGA + .475*dFTA - dORB + dTO)) / 2,
ORTG = PTS / oPOSS * 100,
DRTG = dPTS / dPOSS * 100,
NETRTG = ORTG - DRTG,
FG. = FGM / FGA,
dFG. = dFGM / dFGA,
TPP = TPM / TPA,
dTPP = dTPM / dTPA,
FTP = FTM / FTA,
dFTP = dFTM / dFTA,
eFG. = (FGM + 0.5 * TPM) / FGA,
deFG. = (dFGM + 0.5 * dTPM) / dFGA,
TS. = (PTS / 2) / (FGA + .475 * FTA),
dTS. = (dPTS / 2) / (dFGA + .475 * dFTA),
RIM. = RIMM / RIMA,
dRIM. = dRIMM / dRIMA,
MID. = (FGM - RIMM - TPM) / (FGA - RIMA - TPA),
dMID. = (dFGM - dRIMM - dTPM) / (dFGA - dRIMA - dTPA),
TPrate = TPA / FGA,
dTPrate = dTPA / dFGA,
RIMrate = RIMA / FGA,
dRIMrate = dRIMA / dFGA,
MIDrate = (FGA - TPA - RIMA) / FGA,
dMIDrate = (dFGA - dTPA - dRIMA) / dFGA,
FTrate = FTA / FGA,
dFTrate = dFTA / dFGA,
ASTrate = AST / FGM,
dASTrate = dAST / dFGM,
TOrate = TO / oPOSS,
dTOrate = dTO / dPOSS,
BLKrate = BLK / dFGA,
oBLKrate = dBLK / FGA,
ORB. = ORB / (ORB + dDRB),
DRB. = DRB / (DRB + dORB),
TimePerPoss = (oMins / oPOSS) * 60,
dTimePerPoss = (dMins / dPOSS) * 60
) %>%
dplyr::mutate(across(where(is.numeric), ~ round(., 3))) %>%
dplyr::select(Status, Mins:PTS, oPOSS:NETRTG, everything())
final[is.na(final)] <- 0
if(include_transition) {
final <- final %>%
dplyr::mutate(
ORTG_trans = PTS_trans / oPOSS_trans * 100,
DRTG_trans = dPTS_trans / dPOSS_trans * 100,
NETRTG_trans = ORTG_trans - DRTG_trans,
FG._trans = FGM_trans / FGA_trans,
dFG._trans = dFGM_trans / dFGA_trans,
TPP_trans = TPM_trans / TPA_trans,
dTPP_trans = dTPM_trans / dTPA_trans,
FTP_trans = FTM_trans / FTA_trans,
dFTP_trans = dFTM_trans / dFTA_trans,
eFG._trans = (FGM_trans + 0.5 * TPM_trans) / FGA_trans,
deFG._trans = (dFGM_trans + 0.5 * dTPM_trans) / dFGA_trans,
TS._trans = (PTS_trans / 2) / (FGA_trans + .475 * FTA_trans),
dTS._trans = (dPTS_trans / 2) / (dFGA_trans + .475 * dFTA_trans),
RIM._trans = RIMM_trans / RIMA_trans,
dRIM._trans = dRIMM_trans / dRIMA_trans,
MID._trans = (FGM_trans - RIMM_trans - TPM_trans) / (FGA_trans - RIMA_trans - TPA_trans),
dMID._trans = (dFGM_trans - dRIMM_trans - dTPM_trans) / (dFGA_trans - dRIMA_trans - dTPA_trans),
TPrate_trans = TPA_trans / FGA_trans,
dTPrate_trans = dTPA_trans / dFGA_trans,
RIMrate_trans = RIMA_trans / FGA_trans,
dRIMrate_trans = dRIMA_trans / dFGA_trans,
MIDrate_trans = (FGA_trans - TPA_trans - RIMA_trans) / FGA_trans,
dMIDrate_trans = (dFGA_trans - dTPA_trans - dRIMA_trans) / dFGA_trans,
FTrate_trans = FTA_trans / FGA_trans,
dFTrate_trans = dFTA_trans / dFGA_trans,
ASTrate_trans = AST_trans / FGM_trans,
dASTrate_trans = dAST_trans / dFGM_trans,
TOrate_trans = TO_trans / oPOSS_trans,
dTOrate_trans = dTO_trans / dPOSS_trans,
BLKrate_trans = BLK_trans / dFGA_trans,
oBLKrate_trans = dBLK_trans / FGA_trans,
ORB._trans = ORB_trans / (ORB_trans + dDRB_trans),
DRB._trans = DRB_trans / (DRB_trans + dORB_trans),
TimePerPoss_trans = (oMins_trans / oPOSS_trans) * 60,
dTimePerPoss_trans = (dMins_trans / dPOSS_trans) * 60,
ORTG_half = PTS_half / oPOSS_half * 100,
DRTG_half = dPTS_half / dPOSS_half * 100,
NETRTG_half = ORTG_half - DRTG_half,
FG._half = FGM_half / FGA_half,
dFG._half = dFGM_half / dFGA_half,
TPP_half = TPM_half / TPA_half,
dTPP_half = dTPM_half / dTPA_half,
FTP_half = FTM_half / FTA_half,
dFTP_half = dFTM_half / dFTA_half,
eFG._half = (FGM_half + 0.5 * TPM_half) / FGA_half,
deFG._half = (dFGM_half + 0.5 * dTPM_half) / dFGA_half,
TS._half = (PTS_half / 2) / (FGA_half + .475 * FTA_half),
dTS._half = (dPTS_half / 2) / (dFGA_half + .475 * dFTA_half),
RIM._half = RIMM_half / RIMA_half,
dRIM._half = dRIMM_half / dRIMA_half,
MID._half = (FGM_half - RIMM_half - TPM_half) / (FGA_half - RIMA_half - TPA_half),
dMID._half = (dFGM_half - dRIMM_half - dTPM_half) / (dFGA_half - dRIMA_half - dTPA_half),
TPrate_half = TPA_half / FGA_half,
dTPrate_half = dTPA_half / dFGA_half,
RIMrate_half = RIMA_half / FGA_half,
dRIMrate_half = dRIMA_half / dFGA_half,
MIDrate_half = (FGA_half - TPA_half - RIMA_half) / FGA_half,
dMIDrate_half = (dFGA_half - dTPA_half - dRIMA_half) / dFGA_half,
FTrate_half = FTA_half / FGA_half,
dFTrate_half = dFTA_half / dFGA_half,
ASTrate_half = AST_half / FGM_half,
dASTrate_half = dAST_half / dFGM_half,
TOrate_half = TO_half / oPOSS_half,
dTOrate_half = dTO_half / dPOSS_half,
BLKrate_half = BLK_half / dFGA_half,
oBLKrate_half = dBLK_half / FGA_half,
ORB._half = ORB_half / (ORB_half + dDRB_half),
DRB._half = DRB_half / (DRB_half + dORB_half),
TimePerPoss_half = (oMins_half / oPOSS_half) * 60,
dTimePerPoss_half = (dMins_half / dPOSS_half) * 60,
oTransPCT = oPOSS_trans / oPOSS,
dTransPCT = dPOSS_trans / dPOSS
) %>%
#no need to have long decimals so round everything
dplyr::mutate(across(where(is.numeric), ~ round(., 3)))
}
final[is.na(final)] <- 0
final[,2:ncol(final)] <- apply(final[,2:ncol(final)], 2, function(x){ifelse(is.infinite(x),0,x)})
return(final)
}
#' Player Lineup Finder
#'
#' This function finds all lineups from a given lineup data source that include/exclude certain players. It acts
#' as a quick way to filter lineups for players
#' @param Lineup_Data a data frame of lineups created from get_lineups()
#' @param Included a character vector of players to be included in all lineups
#' @param Excluded a character vector of players to be excluded from all lineups
#' @import dplyr
#' @export
#' @return data frame of lineups with statistics documented in get_lineups()
get_player_lineups <-
function(Lineup_Data = NA,
Included = NA,
Excluded = NA) {
if(any(is.na(Included)) & any(is.na(Excluded))) {
return(Lineup_Data)
}
#create variable storing whether the lineup includes/excludes correct players
relRow <- rep(T , nrow(Lineup_Data))
relRow2 <- rep(T , nrow(Lineup_Data))
#iterates through included and finds rows that are needed
if (!any(is.na(Included))) {
for (i in 1:length(Included)) {
relRow <-
relRow * apply(Lineup_Data, 1, function(x)
(Included[i] %in% x[1:5]))
}
}
#iterates through rows for excluded and finds needed
if (!any(is.na(Excluded))) {
for (i in 1:length(Excluded)) {
relRow2 <-
relRow2 * apply(Lineup_Data, 1, function(x)
(!Excluded[i] %in% x[1:5]))
}
}
#take all rows where both cases are true
new_df <-Lineup_Data[which(relRow == 1 & relRow2 == 1),]
return(new_df)
}
#' Player Stats Calculator
#'
#' This function calculates many player stats for either individual games or aggregate to get multi-game stats.
#' @param play_by_play_data a data frame of play-by-play data from either get_play_by_play() or scrape_game()
#' @param keep.dirty logical object to remove entries from potentially corrupted games. Explained in get_lineups()
#' @param garbage.filter logical object to remove garbage time minutes from the sample. Explained in get_lineups()
#' @param error.thresh determine how many discrepancies can still occur for a game to be included. Explained in get_lineups()
#' @param multi.games Logical object. When false stats will be calculated on a game level. When true all games will be aggregated
#' this can be used to get season or multi-game player stats. Defaults to FALSE.
#' @return dataframe with each row as a player game or multi-game, with accompanying stats and details. Stats explained in get_lineups
#' but now at the individual player level rather than lineup level. New variables in use:
#' \itemize{
#' \item{GS} - Game Score: PTS + 0.4 x FGM - 0.7 x FGA - 0.4 x (FTA - FTM) + 0.7 x ORB + 0.3 x DRB + STL + 0.7 x AST + 0.7 x BLK - 0.4 x PF - TOV
#' \item{PBACK} - Successful putback: A rim attempt made preceded by an offensive rebound from the same player
#' @export
#' @import dplyr
get_player_stats <- function(play_by_play_data = NA, multi.games = F, simple = F) {
if (length(play_by_play_data) == 0) {
message("INPUT NOT FOUND")
return(NULL)
}
### Individual results
# First calculates main counting stats at a game level for each player
if(simple) {
player_stats <- play_by_play_data %>%
dplyr::group_by(ID, Date, Home, Away, Event_Team, Player_1) %>%
dplyr::summarise(
PTS = sum((Event_Result == "made") * Shot_Value, na.rm = T),
FGA = sum((Shot_Value %in% c(2, 3)), na.rm = T),
FGM = sum((Shot_Value %in% c(2, 3)) * (Event_Result == "made"), na.rm = T),
TPA = sum((Shot_Value == 3), na.rm = T),
TPM = sum((Shot_Value == 3) * (Event_Result == "made"), na.rm = T),
RIMA = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
), na.rm = T),
RIMM = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In") * (Event_Result == "made")
), na.rm = T),
FTA = sum((Shot_Value == 1), na.rm = T),
FTM = sum((Shot_Value == 1) * (Event_Result == "made"), na.rm = T),
ORB = sum((Event_Type == "Offensive Rebound"), na.rm = T),
DRB = sum((Event_Type == "Defensive Rebound"), na.rm = T),
TOV = sum((Event_Type == "Turnover"), na.rm = T),
STL = sum((Event_Type == "Steal"), na.rm = T),
BLK = sum((Event_Type == "Blocked Shot"), na.rm = T),
PF = sum((Event_Type == "Commits Foul"), na.rm = T),
FGA_trans = sum((Shot_Value %in% c(2, 3)) * isTransition, na.rm = T),
FGM_ast = sum((Shot_Value %in% c(2, 3)) * (Event_Result == "made") * (!is.na(Player_2)), na.rm = T),
.groups = 'drop'
) %>%
dplyr::filter(Player_1 != "TEAM") %>%
dplyr::rename(Player = Player_1,
Team = Event_Team) %>%
dplyr::ungroup()
} else {
player_stats <- play_by_play_data %>%
dplyr::mutate(
BLK_rim = (Event_Type == "Blocked Shot") * (lag(Event_Type) %in% c("Dunk", "Layup", "Hook", "Tip-In")),
BLK_mid = (Event_Type == "Blocked Shot") * (lag(Event_Type) == "Two Point Jumper"),
BLK_three = (Event_Type == "Blocked Shot") * (lag(Event_Type) == "Three Point Jumper"),
) %>%
dplyr::group_by(ID, Date, Home, Away, Event_Team, Player_1) %>%
dplyr::summarise(
PTS = sum((Event_Result == "made") * Shot_Value, na.rm = T),
FGA = sum((Shot_Value %in% c(2, 3)), na.rm = T),
FGM = sum((Shot_Value %in% c(2, 3)) * (Event_Result == "made"), na.rm = T),
TPA = sum((Shot_Value == 3), na.rm = T),
TPM = sum((Shot_Value == 3) * (Event_Result == "made"), na.rm = T),
RIMA = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
), na.rm = T),
RIMM = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In") * (Event_Result == "made")
), na.rm = T),
PBACKA = sum((Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")) * (lag(Event_Type) == "Offensive Rebound"),
na.rm = T
),
PBACKM = sum((Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")) * (Event_Result == "made") * (lag(Event_Type) == "Offensive Rebound"),
na.rm = T
),
FTA = sum((Shot_Value == 1), na.rm = T),
FTM = sum((Shot_Value == 1) * (Event_Result == "made"), na.rm = T),
ORB = sum((Event_Type == "Offensive Rebound"), na.rm = T),
DRB = sum((Event_Type == "Defensive Rebound"), na.rm = T),
TOV = sum((Event_Type == "Turnover"), na.rm = T),
STL = sum((Event_Type == "Steal"), na.rm = T),
BLK = sum((Event_Type == "Blocked Shot"), na.rm = T),
PF = sum((Event_Type == "Commits Foul"), na.rm = T),
# Assisted / Unassisted
PTS_ast = sum((Event_Result == "made") * Shot_Value * (!is.na(Player_2)), na.rm = T),
FGM_ast = sum((Shot_Value %in% c(2, 3)) * (Event_Result == "made") * (!is.na(Player_2)), na.rm = T),
TPM_ast = sum((Shot_Value == 3) * (Event_Result == "made") * (!is.na(Player_2)), na.rm = T),
RIMM_ast = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In") * (Event_Result == "made")
* (!is.na(Player_2))), na.rm = T),
PTS_unast = sum((Event_Result == "made") * Shot_Value * (is.na(Player_2)), na.rm = T),
FGA_unast = sum((Shot_Value %in% c(2, 3)) * (is.na(Player_2)), na.rm = T),
FGM_unast = sum((Shot_Value %in% c(2, 3)) * (Event_Result == "made") * (is.na(Player_2)), na.rm = T),
TPA_unast = sum((Shot_Value == 3) * (is.na(Player_2)), na.rm = T),
TPM_unast = sum((Shot_Value == 3) * (Event_Result == "made") * (is.na(Player_2)), na.rm = T),
RIMA_unast = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (is.na(Player_2)), na.rm = T),
RIMM_unast = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In") * (Event_Result == "made")
* (is.na(Player_2))), na.rm = T),
# Transition / Halfcourt
PTS_trans = sum((Event_Result == "made") * Shot_Value * isTransition, na.rm = T),
FGA_trans = sum((Shot_Value %in% c(2, 3)) * isTransition, na.rm = T),
FGM_trans = sum((Shot_Value %in% c(2, 3)) * (Event_Result == "made") * isTransition, na.rm = T),
FTA_trans = sum((Shot_Value == 1) * isTransition, na.rm = T),
FTM_trans = sum((Shot_Value == 1) * (Event_Result == "made") * isTransition, na.rm = T),
TPA_trans = sum((Shot_Value == 3) * isTransition, na.rm = T),
TPM_trans = sum((Shot_Value == 3) * (Event_Result == "made") * isTransition, na.rm = T),
RIMA_trans = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * isTransition, na.rm = T),
RIMM_trans = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In") * (Event_Result == "made")
* isTransition), na.rm = T),
ORB_trans = sum((Event_Type == "Offensive Rebound") * isTransition, na.rm = T),
DRB_trans = sum((Event_Type == "Defensive Rebound") * isTransition, na.rm = T),
TOV_trans = sum((Event_Type == "Turnover") * isTransition, na.rm = T),
STL_trans = sum((Event_Type == "Steal") * isTransition, na.rm = T),
BLK_trans = sum((Event_Type == "Blocked Shot") * isTransition, na.rm = T),
PTS_half = sum((Event_Result == "made") * Shot_Value * (1-isTransition), na.rm = T),
FGA_half = sum((Shot_Value %in% c(2, 3)) * (1-isTransition), na.rm = T),
FGM_half = sum((Shot_Value %in% c(2, 3)) * (Event_Result == "made") * (1-isTransition), na.rm = T),
FTA_half = sum((Shot_Value == 1) * (1-isTransition), na.rm = T),
FTM_half = sum((Shot_Value == 1) * (Event_Result == "made") * (1-isTransition), na.rm = T),
TPA_half = sum((Shot_Value == 3) * (1-isTransition), na.rm = T),
TPM_half = sum((Shot_Value == 3) * (Event_Result == "made") * (1-isTransition), na.rm = T),
RIMA_half = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In")
) * (1-isTransition), na.rm = T),
RIMM_half = sum((
Event_Type %in% c("Dunk", "Layup", "Hook", "Tip-In") * (Event_Result == "made")
* (1-isTransition)), na.rm = T),
ORB_half = sum((Event_Type == "Offensive Rebound") * (1-isTransition), na.rm = T),
DRB_half = sum((Event_Type == "Defensive Rebound") * (1-isTransition), na.rm = T),
TOV_half = sum((Event_Type == "Turnover") * (1-isTransition), na.rm = T),
STL_half = sum((Event_Type == "Steal") * (1-isTransition), na.rm = T),
BLK_half = sum((Event_Type == "Blocked Shot") * (1-isTransition), na.rm = T),
BLK_rim = sum(BLK_rim, na.rm = T),
BLK_mid = sum(BLK_mid, na.rm = T),
BLK_three = sum(BLK_three, na.rm = T),
.groups = "drop"
) %>%
dplyr::filter(Player_1 != "TEAM") %>%
dplyr::rename(Player = Player_1,
Team = Event_Team) %>%
dplyr::ungroup()
}
# Can then count assists form player 2 column
if(simple) {
assist_stats <- play_by_play_data %>%
dplyr::group_by(ID, Player_2) %>%
dplyr::summarise(AST = n()) %>%
dplyr::rename(Player = Player_2) %>%
dplyr::ungroup() %>%
dplyr::filter(!is.na(Player))
} else {
assist_stats <- play_by_play_data %>%
dplyr::group_by(ID, Player_2) %>%
dplyr::summarise(
AST = n(),
AST_trans = sum((!is.na(Player_2))*isTransition),
AST_half = sum((!is.na(Player_2))*(1-isTransition)), .groups = "drop") %>%
dplyr::rename(Player = Player_2) %>%
dplyr::ungroup() %>%
dplyr::filter(!is.na(Player))
}
# passes in pbp to calculate minutes for each player using extraneous function
minutes <- get_mins(play_by_play_data)
# merges all dataframes together by game and player
# calculates some game level stats of use
final_stats <-
dplyr::left_join(player_stats, assist_stats, by = c("Player", "ID")) %>%
dplyr::left_join(minutes, by = c("Player", "ID"))
final_stats$AST[is.na(final_stats$AST)] <- 0
if(simple) {
final_stats <- final_stats %>%
dplyr::mutate(
FG. = FGM / FGA,
TP. = TPM / TPA,
FT. = FTM / FTA,
TS. = (PTS / 2) / (FGA + .475 * FTA),
eFG. = (FGM + 0.5 * TPM) / FGA,
RIM. = RIMM / RIMA,
MIDA = FGA - TPA - RIMA,
MIDM = FGM - TPM - RIMM,
MID. = (FGM - RIMM - TPM) / (FGA - RIMA - TPA)) %>%
dplyr::mutate_if(is.numeric, round, 3) %>%
dplyr::select(
ID:Player, MINS, oPOSS, PTS, ORB, DRB, AST, STL, BLK, TOV, PF, TS., eFG., FGM, FGA, FG.,
TPM, TPA, TP., FTM, FTA, FT., RIMM, RIMA, RIM., MIDM, MIDA, MID., FGA_trans, FGM_ast)
final_stats[,7:ncol(final_stats)][is.na(final_stats[,7:ncol(final_stats)])] <- 0
} else {
final_stats <- final_stats %>%
dplyr::mutate(
FG. = FGM / FGA,
TP. = TPM / TPA,
FT. = FTM / FTA,
TS. = (PTS / 2) / (FGA + .475 * FTA),
eFG. = (FGM + 0.5 * TPM) / FGA,
RIM. = RIMM / RIMA,
MIDA = FGA - TPA - RIMA,
MIDM = FGM - TPM - RIMM,
MID. = (FGM - RIMM - TPM) / (FGA - RIMA - TPA),
PBACK. = PBACKM / PBACKA,
# Assisted / Unassisted
MIDM_ast = FGM_ast - TPM_ast - RIMM_ast,
FG._unast = FGM_unast / FGA_unast,
TP._unast = TPM_unast / TPA_unast,
eFG._unast = (FGM_unast + 0.5 * TPM_unast) / FGA_unast,
RIM._unast = RIMM_unast / RIMA_unast,
MIDA_unast = FGA_unast - TPA_unast - RIMA_unast,
MIDM_unast = FGM_unast - TPM_unast - RIMM_unast,
MID._unast = (FGM_unast - RIMM_unast - TPM_unast) / (FGA_unast - RIMA_unast - TPA_unast),
# Transition / Halfcourt
FG._trans = FGM_trans / FGA_trans,
TP._trans = TPM_trans / TPA_trans,
FT._trans = FTM_trans / FTA_trans,
TS._trans = (PTS_trans / 2) / (FGA_trans + .475 * FTA_trans),
eFG._trans = (FGM_trans + 0.5 * TPM_trans) / FGA_trans,
RIM._trans = RIMM_trans / RIMA_trans,
MIDA_trans = FGA_trans - TPA_trans - RIMA_trans,
MIDM_trans = FGM_trans - TPM_trans - RIMM_trans,
MID._trans = (FGM_trans - RIMM_trans - TPM_trans) / (FGA_trans - RIMA_trans - TPA_trans),
FG._half = FGM_half / FGA_half,
TP._half = TPM_half / TPA_half,
FT._half = FTM_half / FTA_half,
TS._half = (PTS_half / 2) / (FGA_half + .475 * FTA_half),
eFG._half = (FGM_half + 0.5 * TPM_half) / FGA_half,
RIM._half = RIMM_half / RIMA_half,
MIDA_half = FGA_half - TPA_half - RIMA_half,
MIDM_half = FGM_half - TPM_half - RIMM_half,
MID._half = (FGM_half - RIMM_half - TPM_half) / (FGA_half - RIMA_half - TPA_half),
pct_FGA_trans = FGA_trans / FGA,
pct_TPA_trans = TPA_trans / FGA,
pct_RIMA_trans = RIMA_trans / FGA,
pct_FGM_trans = FGM_trans / FGM,
pct_TPM_trans = TPM_trans / FGM,
pct_RIMM_trans = RIMM_trans / FGM,
pct_FGM_ast = FGM_ast / FGM,
pct_TPM_ast = TPM_ast / TPM,
pct_RIMM_ast = RIMM_ast / RIMM
) %>%
dplyr::mutate_if(is.numeric, round, 3) %>%
dplyr::select(
ID:Player, MINS, oPOSS, PTS, ORB, DRB, AST, STL, BLK, TOV, PF, TS., eFG., FGM, FGA, FG.,
TPM, TPA, TP., FTM, FTA, FT., RIMM, RIMA, RIM., MIDM, MIDA, MID., PBACKM, PBACKA, PBACK.,
BLK_rim, BLK_mid, BLK_three,
pct_FGA_trans:pct_RIMM_ast,
# Transition
PTS_trans, ORB_trans, DRB_trans, AST_trans, STL_trans, BLK_trans, TOV_trans, TS._trans, eFG._trans, FGM_trans, FGA_trans, FG._trans, TPM_trans, TPA_trans, TP._trans, FTM_trans, FTA_trans, FT._trans, RIMM_trans, RIMA_trans, RIM._trans, MIDM_trans, MIDA_trans, MID._trans,
# Half Court
PTS_half, ORB_half, DRB_half, AST_half, STL_half, BLK_half, TOV_half, TS._half, eFG._half, FGM_half, FGA_half, FG._half, TPM_half, TPA_half, TP._half, FTM_half, FTA_half, FT._half, RIMM_half, RIMA_half, RIM._half, MIDM_half, MIDA_half, MID._half,
# Assisted
PTS_ast, FGM_ast, TPM_ast, RIMM_ast, MIDM_ast,
# Unassisted
PTS_unast, eFG._unast, FGM_unast, FGA_unast, FG._unast, TPM_unast, TPA_unast, TP._unast, RIMM_unast, RIMA_unast, RIM._unast, MIDM_unast, MIDA_unast, MID._unast
)
final_stats[,7:ncol(final_stats)][is.na(final_stats[,7:ncol(final_stats)])] <- 0
}
# User has option to aggregate game stats into player stats over all games in play by play
# This essentially does the same processes as above but changes the grouping to exclude game specific ids
if (multi.games == T) {
starters <- play_by_play_data %>%
dplyr::group_by(ID) %>%
dplyr::slice(1) %>%
dplyr::ungroup() %>%
dplyr::select(Home.1:Away.5) %>%
unlist() %>%
table() %>%
as.data.frame() %>%
dplyr::rename("Player" = ".", "GS" = "Freq")
if(simple) {
multi_game <- final_stats %>%
dplyr::group_by(Player, Team) %>%
dplyr::mutate(GP = n()) %>%
dplyr::group_by(Player, Team, GP) %>%
dplyr::summarise_if(is.numeric, sum) %>%
dplyr::mutate(
FG. = FGM / FGA,
TP. = TPM / TPA,
FT. = FTM / FTA,
TS. = (PTS / 2) / (FGA + .475 * FTA),
eFG. = (FGM + 0.5 * TPM) / FGA,
RIM. = RIMM / RIMA,
MIDA = FGA - TPA - RIMA,
MIDM = FGM - TPM - RIMM,
MID. = (FGM - RIMM - TPM) / (FGA - RIMA - TPA),
PCT_FGA_trans = FGA_trans / FGA,
PCT_FGM_ast = FGM_ast / FGM
) %>%
dplyr::ungroup() %>%
dplyr::mutate_if(is.numeric, round, 3) %>%
dplyr::left_join(starters, by = "Player") %>%
dplyr::select(
Player, Team, GP, GS, MINS, oPOSS, PTS, ORB, DRB, AST, STL, BLK, TOV, PF, PCT_FGA_trans, PCT_FGM_ast, TS., eFG., FGM, FGA, FG.,
TPM, TPA, TP., FTM, FTA, FT., RIMM, RIMA, RIM., MIDM, MIDA, MID.)
} else {
multi_game <- final_stats %>%
dplyr::group_by(Player, Team) %>%
dplyr::mutate(GP = n()) %>%
dplyr::group_by(Player, Team, GP) %>%
dplyr::summarise_if(is.numeric, sum) %>%
dplyr::mutate(
FG. = FGM / FGA,
TP. = TPM / TPA,
FT. = FTM / FTA,
TS. = (PTS / 2) / (FGA + .475 * FTA),
eFG. = (FGM + 0.5 * TPM) / FGA,
RIM. = RIMM / RIMA,
MIDA = FGA - TPA - RIMA,
MIDM = FGM - TPM - RIMM,
MID. = (FGM - RIMM - TPM) / (FGA - RIMA - TPA),
PBACK. = PBACKM / PBACKA,
# Assisted / Unassisted
FG._unast = FGM_unast / FGA_unast,
TP._unast = TPM_unast / TPA_unast,
eFG._unast = (FGM_unast + 0.5 * TPM_unast) / FGA_unast,
RIM._unast = RIMM_unast / RIMA_unast,
MIDA_unast = FGA_unast - TPA_unast - RIMA_unast,
MIDM_unast = FGM_unast - TPM_unast - RIMM_unast,
MID._unast = (FGM_unast - RIMM_unast - TPM_unast) / (FGA_unast - RIMA_unast - TPA_unast),
# Transition / Halfcourt
FG._trans = FGM_trans / FGA_trans,
TP._trans = TPM_trans / TPA_trans,
FT._trans = FTM_trans / FTA_trans,
TS._trans = (PTS_trans / 2) / (FGA_trans + .475 * FTA_trans),
eFG._trans = (FGM_trans + 0.5 * TPM_trans) / FGA_trans,
RIM._trans = RIMM_trans / RIMA_trans,
MIDA_trans = FGA_trans - TPA_trans - RIMA_trans,
MIDM_trans = FGM_trans - TPM_trans - RIMM_trans,
MID._trans = (FGM_trans - RIMM_trans - TPM_trans) / (FGA_trans - RIMA_trans - TPA_trans),
FG._half = FGM_half / FGA_half,
TP._half = TPM_half / TPA_half,
FT._half = FTM_half / FTA_half,
TS._half = (PTS_half / 2) / (FGA_half + .475 * FTA_half),
eFG._half = (FGM_half + 0.5 * TPM_half) / FGA_half,
RIM._half = RIMM_half / RIMA_half,
MIDA_half = FGA_half - TPA_half - RIMA_half,
MIDM_half = FGM_half - TPM_half - RIMM_half,
MID._half = (FGM_half - RIMM_half - TPM_half) / (FGA_half - RIMA_half - TPA_half),
pct_FGA_trans = FGA_trans / FGA,
pct_TPA_trans = TPA_trans / FGA,
pct_RIMA_trans = RIMA_trans / FGA,
pct_FGM_trans = FGM_trans / FGM,
pct_TPM_trans = TPM_trans / FGM,
pct_RIMM_trans = RIMM_trans / FGM,
pct_FGM_ast = FGM_ast / FGM,
pct_TPM_ast = TPM_ast / TPM,
pct_RIMM_ast = RIMM_ast / RIMM
) %>%
dplyr::ungroup() %>%
dplyr::mutate_if(is.numeric, round, 3) %>%
dplyr::left_join(starters, by = "Player") %>%
dplyr::select(
Player, Team, GP, GS, MINS, oPOSS, PTS, ORB, DRB, AST, STL, BLK, TOV, PF, TS., eFG., FGM, FGA, FG.,
TPM, TPA, TP., FTM, FTA, FT., RIMM, RIMA, RIM., MIDM, MIDA, MID., PBACKM, PBACKA, PBACK., BLK_rim, BLK_mid, BLK_three,
pct_FGA_trans, pct_TPA_trans, pct_RIMA_trans, pct_FGM_trans, pct_TPM_trans, pct_RIMM_trans,
pct_FGM_ast, pct_TPM_ast, pct_RIMM_ast,
# Transition
PTS_trans, ORB_trans, DRB_trans, AST_trans, STL_trans, BLK_trans, TOV_trans, TS._trans, eFG._trans, FGM_trans, FGA_trans, FG._trans, TPM_trans, TPA_trans, TP._trans, FTM_trans, FTA_trans, FT._trans, RIMM_trans, RIMA_trans, RIM._trans, MIDM_trans, MIDA_trans, MID._trans,
# Half Court
PTS_half, ORB_half, DRB_half, AST_half, STL_half, BLK_half, TOV_half, TS._half, eFG._half, FGM_half, FGA_half, FG._half, TPM_half, TPA_half, TP._half, FTM_half, FTA_half, FT._half, RIMM_half, RIMA_half, RIM._half, MIDM_half, MIDA_half, MID._half,
# Assisted
PTS_ast, FGM_ast, TPM_ast, RIMM_ast, MIDM_ast,
# Unassisted
PTS_unast, eFG._unast, FGM_unast, FGA_unast, FG._unast, TPM_unast, TPA_unast, TP._unast, RIMM_unast, RIMA_unast, RIM._unast, MIDM_unast, MIDA_unast, MID._unast
)
}
multi_game[is.na(multi_game)] <- 0
return(multi_game)
} else {
return(final_stats)
}
}
convert_events <- function(events) {
#passes in all idenitified types of events in V2 pbp type and converts it to grammar used in V1 and adopted by this scraper
events2 <- case_when(
grepl("2pt", events) &
grepl("jumpshot", events) &
grepl("missed", events) ~ "missed Two Point Jumper",
grepl("3pt", events) &
grepl("jumpshot", events) &
grepl("missed", events) ~ "missed Three Point Jumper",
grepl("2pt", events) &
grepl("jumpshot", events) &
grepl("made", events) ~ "made Two Point Jumper",
grepl("3pt", events) &
grepl("jumpshot", events) &
grepl("made", events) ~ "made Three Point Jumper",
grepl("rebound defensivedeadball", events) ~ "Deadball Rebound",
grepl("rebound offensivedeadball", events) ~ "Deadball Rebound",
grepl("rebound defensive", events) ~ "Defensive Rebound",
grepl("layup", events) &
grepl("missed", events) ~ "missed Layup",
grepl("layup", events) &
grepl("made", events) ~ "made Layup",
grepl("steal", events) ~ "Steal",
grepl("foulon", events) ~ "Draw Foul",
grepl("assist", events) ~ "Assist",
grepl("foul ", events) ~ "Commits Foul",
grepl("substitution out", events) ~ "Leaves Game",
grepl("substitution in", events) ~ "Enters Game",
grepl("hookshot", events) & grepl("made", events) ~ "made Hook",
grepl("hookshot", events) &
grepl("missed", events) ~ "missed Hook",
grepl("freethrow", events) &
grepl("made", events) ~ "made Free Throw",
grepl("freethrow", events) &
grepl("missed", events) ~ "missed Free Throw",
grepl("timeout", events) ~ "Timeout",
grepl("dunk", events) & grepl("made", events) ~ "made Dunk",
grepl("dunk", events) & grepl("missed", events) ~ "missed Dunk",
grepl("alleyoop", events) & grepl("made", events) ~ "made Dunk",
grepl("alleyoop", events) &
grepl("missed", events) ~ "missed Dunk",
grepl("block", events) ~ "Blocked Shot",
grepl("rebound offensive", events) ~ "Offensive Rebound",
grepl("rebound ", events) ~ "Deadball Rebound",
grepl(" jumpball won", events) ~ "won Jumpball",
grepl(" jumpball lost", events) ~ "lost Jumpball",
grepl(" jumpball heldball", events) ~ "Jumpball (held ball)",
grepl(" jumpball outofbounds", events) ~ "Jumpball (out of bounds)",
grepl(" jumpball lodgedball", events) ~ "Jumpball (lodged ball)",
grepl("Team, foul", events) ~ "Team Foul",
grepl("turnover", events) ~ "Turnover",
grepl("wrongbasket", events) ~ "Wrong Basket (2pt Opp. Team)",
is.na(events) ~ NA_character_,
#if this comes up, I have not discovered the event and need to classify and convert it
TRUE ~ "ERROR CHECK THE EVENT"
)
return(events2)
}
get_mins <- function(play_by_play_data) {
final_df <- play_by_play_data %>%
dplyr::mutate(
home_poss = (Home == Poss_Team)
) %>%
dplyr::select(ID, home_poss, Event_Length, Poss_Num, tidyselect::matches('(Home|Away)\\.[1-5]')) %>%
tidyr::pivot_longer(cols = tidyselect::matches('(Home|Away)\\.[1-5]')) %>%
dplyr::mutate(
isOffense = dplyr::case_when(
home_poss & grepl('Home', name) ~ T,
!home_poss & grepl('Away', name) ~ T,
T ~ F
)
) %>%
dplyr::group_by(ID, value) %>%
dplyr::summarise(
MINS = sum(Event_Length) / 60,
oPOSS = dplyr::n_distinct(Poss_Num[which(isOffense)]),
.groups = 'drop'
) %>%
dplyr::rename(Player = value) %>%
dplyr::mutate_if(is.numeric, round, 3) %>%
dplyr::ungroup()
}
binder <- dplyr::bind_rows
#' Minutes Distribution Plot
#'
#' This function takes in play by play data and a team name and returns a plot
#' showing the distributon of when each player was on the court
#' @param pbp_data a data frame of pbp created from get_play_by_play() / scrape_game()
#' @param team a single team name
#' @param threshold minimum minutes played to include in plot
#' @import dplyr
#' @import ggplot2
#' @export
#' @return ggplot object containing minutes distribution
plot_mins_dist <- function(play_by_play_data = NA, team = NA, threshold = NA, split_position = F) {
if(all(is.na(play_by_play_data)) | is.na(team)) {
stop("Missing parameters")
}
if(is.na(threshold)) {
threshold <- 0
}
home_team <- play_by_play_data %>%
dplyr::filter(Home == team) %>%
dplyr::select(ID,Home,Away,Game_Seconds, Home.1:Home.5) %>%
dplyr::mutate(Game_Mins = Game_Seconds %/% 60)
away_team <- play_by_play_data %>%
dplyr::filter(Away == team) %>%
dplyr::select(ID,Home,Away,Game_Seconds,Away.1:Away.5) %>%
dplyr::mutate(Game_Mins = Game_Seconds %/% 60)
all_players <- data.frame(Game_Mins = NA, Player = NA, Home = NA, Away = NA, ID = NA)
for(i in 1:5) {
player_row <- home_team[,c(i+4,10,1:3)]
colnames(player_row)[1] <- "Player"
all_players <- rbind(all_players, player_row)
player_row <- away_team[,c(i+4,10,1:3)]
colnames(player_row)[1] <- "Player"
all_players <- rbind(all_players, player_row)
}
player_mins <- all_players %>%
dplyr::group_by(Player, Game_Mins, ID) %>%
dplyr::summarise(count = dplyr::n(), .groups = "drop") %>%
dplyr::group_by(Player, Game_Mins) %>%
dplyr::summarise(count = dplyr::n(), .groups = "drop") %>%
dplyr::filter(!is.na(Player), !is.na(Game_Mins))
totals <- expand.grid(unique(player_mins$Player), 0:40, stringsAsFactors = F)
colnames(totals) <- c("Player", "Game_Mins")
totals <- left_join(totals, player_mins, by = c("Player", "Game_Mins"))
totals$count[is.na(totals$count)] <- 0
labels <- sapply(totals$Player, function(x){
spl <- strsplit(x, split = "[.]")[[1]]
first <- paste0(substr(spl[1],1,1), tolower(substr(spl[1],2,nchar(spl[1]))))
last <- paste0(substr(spl[2],1,1), tolower(substr(spl[2],2,nchar(spl[2]))))
if(length(spl) > 2) {
paste(first, last, spl[3:length(spl)])
} else {
paste(first,last)
}
}, USE.NAMES = F)
if(split_position) {
year <- substr(first(play_by_play_data$Date),7,10)
month <- substr(first(play_by_play_data$Date),1,2)
year <- ifelse(as.numeric(month)<=5, as.numeric(year)-1, year)
season <- paste0(as.numeric(year), "-", as.numeric(year)-1999)
roster <- get_team_roster(team.name = team, season = season)
totals <- left_join(totals, roster, by = "Player")
totals$CleanName <- paste(totals$Jersey,"-", totals$CleanName)
}
totals$CleanName <- if(is.null(totals$CleanName)) labels else totals$CleanName
totals$CleanName <- ifelse(totals$CleanName == "NA - NA", labels, totals$CleanName)
p <- totals %>%
dplyr::group_by(CleanName) %>%
dplyr::mutate(Total_Mins = sum(count)) %>%
dplyr::ungroup() %>%
dplyr::filter(Total_Mins > threshold) %>%
ggplot2::ggplot() +
ggplot2::geom_tile(ggplot2::aes(reorder(CleanName, Total_Mins), Game_Mins, fill = count)) +
ggplot2::coord_flip() +
ggplot2::geom_hline(yintercept = seq(0,40,by=10), linetype = "dashed", color = "darkgrey") +
ggplot2::scale_fill_gradient2(low = "white", high = "steelblue") +
ggplot2::labs(x = "", y = "Minute", fill = "GP", caption = "Jake Flancer (@JakeFlancer) | Data: NCAA.com") +
ggplot2::theme_classic() +
ggplot2::theme(
plot.background = ggplot2::element_rect(fill = "gray75"),
panel.background = ggplot2::element_rect(fill = "gray75"),
legend.background = ggplot2::element_rect(fill = "gray75"),
axis.line = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank(),
text = ggplot2::element_text(family = "Helvetica", color = "gray25", face = "bold")) +
ggplot2::ggtitle(paste(team,"minutes distribution"))
if(split_position) {
p +
ggplot2::facet_wrap(.~Pos, ncol = 1, scales = "free_y") +
ggplot2::theme(strip.background = ggplot2::element_blank(),
strip.text = ggplot2::element_text(family = "Helvetica", color = "gray25", face = "bold"))
} else {
p
}
}
#' Duo Plot
#'
#' Creates network plot of team ratings with players together on the court
#' Uses empirical bayesian formula to estimate ratings relative to team average:
#' regressed ORTG = (n_bar*r_bar_o + PTS) / (n_bar + POSS)
#' n_bar = average possessions per lineup + regressed possessions
#' r_bar_o = team offensive rating (PTS/POSS)*100
#' @param lineup_data a data frame of lineups created from get_lineups()
#' @param team a single team name
#' @param min_mins minimum number of minutes required for duo to play together
#' @param regressed_poss effectively number of team average possessions used to "shrink" ratings
#' @import dplyr
#' @import ggplot2
#' @importFrom gtools combinations
#' @importFrom igraph graph_from_data_frame
#' @import ggraph
#' @importFrom ggraph guide_edge_colourbar
#' @export
#' @return ggplot object of network plot
plot_duos <- function(Lineup_Data = NA, team = NA, min_mins = 0, regressed_poss = 50) {
if(all(is.na(Lineup_Data)) | is.na(team)) {
stop("Missing Function Parameters")
}
lineup_data <- dplyr::filter(Lineup_Data, Team == team)
players <- unique(unlist(lineup_data[,1:5]))
players <- players[!is.na(players)]
labels <- sapply(players, function(x){
spl <- strsplit(x, split = "[.]")[[1]]
first <- paste0(substr(spl[1],1,1), tolower(substr(spl[1],2,nchar(spl[1]))))
last <- paste0(substr(spl[2],1,1), tolower(substr(spl[2],2,nchar(spl[2]))))
if(length(spl) > 2) {
paste(first, last, spl[3:length(spl)])
} else {
paste(first,last)
}
})
r_bar_o <- (sum(lineup_data$PTS) / sum(lineup_data$oPOSS))
r_bar_d <- (sum(lineup_data$dPTS) / sum(lineup_data$dPOSS))
n_bar <- (mean(lineup_data$dPOSS) + mean(lineup_data$oPOSS))/2 + regressed_poss
duos <- data.frame(gtools::combinations(n=length(players), r=2, v=players, repeats.allowed = F))
colnames(duos) <- c("from","to")
duos$mins <- NA
duos$ortg <- NA
duos$drtg <- NA
for(i in 1:nrow(duos)){
tmp <- get_player_lineups(lineup_data, Included = unlist(unname(duos[i,1:2])))
duos$mins[i] <- sum(tmp$Mins)
duos$ortg[i] <- sum(tmp$PTS) / sum(tmp$oPOSS)
duos$drtg[i] <- sum(tmp$dPTS) / sum(tmp$dPOSS)
duos$adjortg[i] <- (n_bar*r_bar_o + sum(tmp$PTS)) / (n_bar + sum(tmp$oPOSS))
duos$adjdrtg[i] <- (n_bar*r_bar_d + sum(tmp$dPTS)) / (n_bar + sum(tmp$dPOSS))
}
duos[,1:2] <- apply(duos[,1:2],2,as.character)
duos$netrtg <- duos$ortg - duos$drtg
duos$adjrtg <- (duos$adjortg - duos$adjdrtg)
scale <- round(c(min(duos$adjrtg)-.02,mean(duos$adjrtg),max(duos$adjrtg)+.02),2)
dataset <- dplyr::filter(duos, mins >= min_mins)
if(nrow(dataset) <= 0) {
print("No Duos Found- Try Lowering Minutes")
return(data.frame())
}
final_players <- unique(unlist(dataset[,1:2]))
grph <- igraph::graph_from_data_frame(dataset,
directed = F,
vertices = data.frame(name = sort(final_players),
lab = sort(labels[which(names(labels) %in% sort(final_players))]))
)
ggraph::ggraph(grph, layout = "linear", circular = T) +
ggraph::geom_edge_arc(ggplot2::aes(edge_width = mins,
color = adjrtg*100)) +
ggraph::scale_edge_colour_gradientn(breaks = scale*100,
colors = c("steelblue","white","indianred"),
labels = scale*100,
name = "Adj. Net Efficiency per 100",
limits = c(min(scale)*100,max(scale)*100),
na.value = "transparent",
guide = ggraph::guide_edge_colorbar()) +
ggraph::scale_edge_width(name = "Minutes Together") +
ggraph::geom_node_text(ggplot2::aes(label = lab), color = "gray25", size = 4.5, fontface = "bold") +
ggraph::geom_node_point(size = 15, alpha = 0.1, color = "gray50") +
ggraph::theme_graph() +
ggplot2::labs(title = paste(team, "Duos Performance"),
subtitle = paste("team performance when pairs of players are on the court together, min.", min_mins, "minutes"),
caption = "Jake Flancer (@JakeFlancer) | Data: NCAA.com") +
ggplot2::theme(
plot.title = ggplot2::element_text(hjust = 0, size = 18, family = "Helvetica", color = "gray25", face = "bold"),
plot.subtitle = ggplot2::element_text(size = 12, family = "Helvetica", color = "gray25", face = "bold"),
plot.background = ggplot2::element_rect(fill = "gray75"),
plot.caption = ggplot2::element_text(family = "Helvetica", color = "gray25", face = "bold"),
legend.title = ggplot2::element_text(family = "Helvetica", color = "gray25", face = "bold")
) +
ggplot2::scale_x_continuous(expand = c(.15, .15), limits = c(-1,1)) +
ggplot2::scale_y_continuous(expand = c(.15, .15), limits = c(-1,1))
}
#' Box Score Scrape
#'
#' This function returns a box score for the given game
#' @param game_id box id for the game
#' @importFrom XML readHTMLTable
#' @import dplyr
#' @return data frame with each row representing a player in the game
#' @export
#' @examples
#' scrape_box(1982642)
scrape_box <-
function(game_id,
use_file = F,
save_file = F,
base_path = NA,
overwrite = F) {
status <- "CLEAN"
url_text <- paste0("https://stats.ncaa.org/contests/", game_id,"/individual_stats")
file_dir <- paste0(base_path, "box_score/")
file_path <- paste0(file_dir, game_id, ".html")
isUrlRead <- F
# Give user option to save raw html file (to make future processing more efficient)
if (save_file & !is.na(base_path) & (!file.exists(file_path) | overwrite)) {
isUrlRead <- T
file_url <- url(url_text, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
dir.create(file_dir, recursive = T, showWarnings = F)
writeLines(html, file_path)
} else if (file.exists(file_path) & use_file) {
html <- readLines(file_path, warn=F)
} else {
isUrlRead <- T
file_url <- url(url_text, headers = c("User-Agent" = "My Custom User Agent"))
html <- readLines(con = file_url, warn=F)
close(file_url)
}
table <- XML::readHTMLTable(html)
if (length(table) == 0) {
message("Game Not Found")
return(data.frame())
}
background <- table[[1]]
away <- table[[4]]
home <- table[[5]]
away_end <- which(away[['Name']] == 'TEAM') - 1
if (length(away_end) != 1) {
away_end <- nrow(away) - 2
}
away <- away[1:away_end,]
away$Team <- gsub(" \\((.*)\\)", "", background[2,1])
away <- away[,names(away) != 'Avg']
home_end <- which(home[['Name']] == 'TEAM') - 1
if (length(home_end) != 1) {
home_end <- nrow(home) - 2
}
home <- home[1:home_end,]
home$Team <- gsub(" \\((.*)\\)", "", background[3,1])
home <- home[,names(home) != 'Avg']
box <- bind_rows(home, away)
clean_name <- box$Name
format <- gsub("[^[:alnum:] ]", "", clean_name)
format <- toupper(gsub("\\s+",".", format))
player_name <- gsub("(\\.JR\\.|\\.SR\\.|\\.J\\.R\\.|\\.JR\\.|JR\\.|SR\\.|\\.SR|\\.JR|\\.SR|\\.III|\\.II|\\.IV)$","", format)
player_name <- trimws(player_name)
clean_name <- gsub("(\\.JR\\.|\\.SR\\.|\\.J\\.R\\.|\\.JR\\.|JR\\.|SR\\.|\\.SR|\\.JR|\\.SR|\\.III|\\.II|\\.IV)$","", clean_name, ignore.case = T)
clean_name <- trimws(clean_name)
box$CleanName <- clean_name
box$Player <- player_name
box$Game_ID <- game_id
box$Box_ID <- game_id
final <- box %>%
rename("TPM" = "3FG", "TPA" = "3FGA", "FTM" = "FT", "ORB" = "ORebs", "DRB" = "DRebs", "TRB" = "TotReb", "Tech" = "TechFouls") %>%
select(Box_ID, Team, Player, everything()) %>%
filter(Player != "TEAM.TEAM")
if(isUrlRead) {
Sys.sleep(2)
}
message(paste(background[3,1], "v", background[2,1], "|", game_id))
return(final)
}
#' Box Scores Scrape
#'
#' This function returns a single data frame for a vector of box ids. A wrapping of scrape_box for multiple games
#' @param game_id vector of box ids for the game
#' @param multi.games whether to aggregate over games
#' @return data frame with each row representing a player in the game
#' @export
#' @examples
#' get_box_scores(c(1982642, 1982641))
get_box_scores <- function(game_ids, multi.games = F, use_file = F, save_file = F, base_path = NA, overwrite=F) {
#Cleans list of game ids to remove nas
game_ids <- game_ids[!is.na(game_ids)]
#Scrape all game ids into list
game_list <- lapply(game_ids, function(x) {
# Add error handling so if one game throws an error it will report and continue iterating
tryCatch(scrape_box(x, use_file = use_file, save_file = save_file, base_path = base_path, overwrite=overwrite), error = function(e){
print(paste0("Error with game id: ", x, " // ", e))
return(NA)
})
})
dirty_ind <- which(is.na(game_list))
#Remove any incorrect games found
if(length(dirty_ind) > 0) game_list <- game_list[-dirty_ind]
#Bind rows together and return combined dataframe
game_data <- do.call("binder", game_list)
if(length(dirty_ind) != 0) {
message(paste(paste(game_ids[dirty_ind], collapse = ","), "removed"))
}
game_data <- game_data %>%
dplyr::filter(MP != '') %>%
dplyr::mutate(
dplyr::across(any_of(c("G", "FGM", "FGA", "TPM", "TPA", "FTM", "FTA", "PTS", "ORB", "DRB", "TRB", "AST", "TO", "STL", "BLK", "Fouls", "DQ", "Tech")), function(x){x[x==''] <- 0; return(x)}),
dplyr::across(any_of(c("MP", "G", "FGM", "FGA", "TPM", "TPA", "FTM", "FTA", "PTS", "ORB", "DRB", "TRB", "AST", "TO", "STL", "BLK", "Fouls", "DQ", "Tech")), ~gsub("\\/", "", .x)),
dplyr::across(any_of(c("G", "FGM", "FGA", "TPM", "TPA", "FTM", "FTA", "PTS", "ORB", "DRB", "TRB", "AST", "TO", "STL", "BLK", "Fouls", "DQ", "Tech")), as.numeric),
MP = round(as.numeric(gsub(":(.*)", "", MP)) + as.numeric(gsub("(.*):", "", MP))/60, 1),
FG. = FGM / FGA,
TP. = TPM / TPA,
FT. = FTM / FTA,
TS. = (PTS / 2) / (FGA + .475 * FTA),
eFG. = (FGM + 0.5 * TPM) / FGA,
dplyr::across(where(is.numeric), function(x){x[is.nan(x)] <- 0; return(x)})
) %>%
dplyr::select(
Game_ID:MP,
any_of(c("PTS", "ORB", "DRB", "TRB", "AST", "TO", "STL", "BLK", "FGA", "FGM", "FG.", "TPA", "TPM", "TP.", "FTA", "FTM", "FT.", "TS.", "eFG.", "Fouls", "DQ", "Tech", "CleanName"))
)
if (multi.games == T) {
multi_game <- game_data %>%
dplyr::select(-Game_ID) %>%
dplyr::group_by(Player, CleanName, Team, Pos) %>%
dplyr::summarise(
across(where(is.numeric), sum),
G = n(),
.groups = "drop"
) %>%
dplyr::mutate(
FG. = FGM / FGA,
TP. = TPM / TPA,
FT. = FTM / FTA,
TS. = (PTS / 2) / (FGA + .475 * FTA),
eFG. = (FGM + 0.5 * TPM) / FGA) %>%
dplyr::mutate(dplyr::across(where(is.numeric), function(x){x[is.nan(x)] <- 0; return(x)})) %>%
dplyr::select(
Player, CleanName, Team, Pos, MP, G, PTS, ORB, DRB, TRB, AST, TO, STL, BLK, FGA, FGM, FG., TPA, TPM, TP., FTA, FTM, FT., TS., eFG., Fouls, DQ, Tech, CleanName
)
return(multi_game)
}
return(game_data)
}
#' Possession Parsing
#'
#' This function aggregates play by play data into individual possession results
#' @param play_by_play_data dataframe of play by play data from get_play_by_play
#' @return data frame with each row representing a possession
#' @export
get_possessions <- function(play_by_play_data = NA, simple = F) {
if(simple) {
possession_df <- play_by_play_data %>%
dplyr::group_by(ID, Date, Home, Away, Poss_Num, Poss_Team,
Home.1, Home.2, Home.3, Home.4, Home.5, Away.1, Away.2, Away.3, Away.4, Away.5) %>%
dplyr::summarise(
PTS = sum(Shot_Value * (Event_Result == "made"), na.rm = T),
.groups = "drop"
)
} else {
possession_df <- play_by_play_data %>%
dplyr::mutate(
End = dplyr::lag(Event_Type)
) %>%
dplyr::group_by(ID, Date, Home, Away, Half_Status, Poss_Num, Poss_Team,
Home.1, Home.2, Home.3, Home.4, Home.5, Away.1, Away.2, Away.3, Away.4, Away.5) %>%
dplyr::summarise(
Home_Score = dplyr::first(Home_Score),
Away_Score = dplyr::first(Away_Score),
PTS = sum(Shot_Value * (Event_Result == "made"), na.rm = T),
isAssisted = sum(!is.na(Player_2) > 0),
isTransition = max(isTransition),
isGarbageTime = max(isGarbageTime),
startEventType = dplyr::first(End),
firstShotTime = dplyr::first(Poss_Length[Event_Type %in% c("Layup", "Dunk", "Tip In", "Hook", "Two Point Jumper", "Three Point Jumper")]),
firstShotType = dplyr::first(Event_Type[Event_Type %in% c("Layup", "Dunk", "Tip In", "Hook", "Two Point Jumper", "Three Point Jumper")]),
lastEventTime = dplyr::last(Poss_Length),
lastEventType = dplyr::last(Event_Type),
.groups = "drop"
)
}
missing_rows <- apply(possession_df[,which(colnames(possession_df)=="Home.1"):which(colnames(possession_df)=="Away.5")], 1, function(x){sum(is.na(x))})
message(paste("Forced to remove", length(which(missing_rows!=0)), "rows due to missing players in on/off"))
possession_df <- possession_df %>%
dplyr::filter(missing_rows==0)
# Now sorts the home and away player alphabetically so players are always in the same column for a given lineup
if(!simple) {
sorted_df <- apply(possession_df, 1, function(x)
{
home_players <- sort(x[which(colnames(possession_df)=="Home.1"):which(colnames(possession_df)=="Home.5")])
away_players <- sort(x[which(colnames(possession_df)=="Away.1"):which(colnames(possession_df)=="Away.5")])
return(c(x[1:(which(colnames(possession_df)=="Home.1")-1)], home_players, away_players, x[(which(colnames(possession_df)=="Away.5")+1):ncol(possession_df)]))
})
#Converts the sorted back into a data frame
sorted_df <-
data.frame(matrix(unlist(sorted_df), ncol = ncol(possession_df), byrow=T), stringsAsFactors = F)
colnames(sorted_df) <- colnames(possession_df)
sorted_df <- sorted_df %>%
mutate(across(c("ID", "Poss_Num", "Home_Score", "Away_Score", "PTS",
"isAssisted", "isTransition", "isGarbageTime", "firstShotTime", "lastEventTime"), as.numeric))
return(sorted_df)
} else {
return(possession_df)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.