R/data_prep.R

#' Load data containing results of each game and return a DataFrame.
#' 
#' @param game_data_fname Filename of Armchair Analysis GAME table
#' @param remove_ties Optional
#' @return data_frame
#' @importFrom magrittr %>%
#' @importFrom magrittr %<>%
load_games <- function(game_data_fname, remove_ties = FALSE){
  games <- readr::read_csv(game_data_fname)
  
  # Data from 2000 import is less reliable, omit this season
  # and use regular season games only.
  games %<>%
    dplyr::filter(seas >= 2001 & wk <= 17)
  
  # Calculate winner
  games <- dplyr::mutate(games, winner = ifelse(ptsv > ptsh, v,
                           ifelse(ptsh > ptsv, h, "TIE")))
  
  if(remove_ties){
    games <- dplyr::filter(games, winner != "TIE")
  }
  
  return(games)
}

#' Load the play by play data and return a data frame.
#' 
#' @param pbp_data_fname Location of play by play data
#' @param games Data frame, game-level data frame created by \code{load_games}
#' @param remove_knees Optional
#' @return data_frame
load_pbp <- function(pbp_data_fname, games, remove_knees = FALSE){
  pbp <- readr::read_csv(pbp_data_fname) %>%
    dplyr::select(gid, pid, off, def, type, qtr, min, sec,
           kne, ptso, ptsd, timo, timd, dwn, ytg, yfog,
           yds, fd, fgxp, good, pnet, pts, detail)
  
  # Remove overtime
  pbp <- dplyr::filter(pbp, qtr <= 4)
  
  # Restrict to regular season games after 2000
  pbp <- dplyr::filter(pbp, gid %in% games$gid)
  
  if(remove_knees){
    pbp <- dplyr::filter(pbp, is.na(kne))
  }
  
  return(pbp)
}

#' Swap game state columns for offense & defense dependent variables.
#' The play by play data has some statistics on punts and kickoffs in terms
#' of the receiving team. Switch these to reflect the game state for
#' the kicking team.
#' 
#' @param df Play-by-play data frame
#' @return data_frame
switch_offense <- function(df){
  # used solution from http://stackoverflow.com/questions/7746567/how-to-swap-values-between-2-columns
  df[df$type == "PUNT" | df$type == "KOFF",
     c("off", "def", "ptso", "ptsd", "timo", "timd")] <- df[df$type == "PUNT" | df$type == "KOFF",
                                                            c("def", "off", "ptsd",
                                                              "ptso", "timd", "timo")]

  # if any points are scored on a PUNT/KOFF, they are given in terms
  # of the receiving team - switch this
  dplyr::mutate(df, pts = ifelse(type == "PUNT" | type == "KOFF", -1 * pts, pts))
  
  return(df)
}

#' Parse all fourth downs and determine if teams intended to go for it,
#' punt, or attempt a field goal. If intent is not clear, do not include
#' the play.
#' 
#' @param df Play-by-play data frame
#' @return data_frame
code_fourth_downs <- function(df){
  fourths <- dplyr::filter(df, dwn == 4) %>%
    dplyr::mutate(goforit = 0,
                  punt = 0,
                  kick = 0)
  
  # Omit false start, delay of game, encroachment, neutral zone infraction
  # We cannot infer from these plays if the offense was going to
  # go for it or not.
  
  omitstring <- "encroachment|false start|delay of game|neutral zone infraction"
  fourths <- dplyr::filter(fourths, !grepl(pattern = omitstring, x = detail, ignore.case = TRUE))
  
  # Ran a play
  fourths %<>%
    dplyr::mutate(goforit = ifelse(type == "RUSH" | type == "PASS", 1, goforit),
           punt = ifelse(type == "RUSH" | type == "PASS", 0, punt),
           kick = ifelse(type == "RUSH" | type == "PASS", 0, kick))
  
  # Field goal attempts and punts
  fourths %<>%
    dplyr::mutate(goforit = ifelse(type == "FGXP" | type == "PUNT", 0, goforit),
           kick = ifelse(type == "FGXP", 1, kick),
           punt = ifelse(type == "PUNT", 1, punt))
  
  # Punted, but penalty on play
  puntstring <- "punts|out of bounds"
  fourths %<>%
    dplyr::mutate(punt = ifelse(type == "NOPL" &
                           grepl(pattern = puntstring, x = detail, ignore.case = TRUE), 1, punt))
  
  # Kicked, but penalty on play
  kickstring <- "field goal is|field goal attempt"
  fourths %<>%
    dplyr::mutate(kick = ifelse(type == "NOPL" &
                           grepl(pattern = kickstring, x = detail, ignore.case = TRUE), 1, kick))
  
  # Went for it, but penalty on play
  gostring <- paste0("pass to|incomplete|sacked|left end|up the middle|pass interference|",
                     "right tackle|right guard|right end|pass intended|left tackle|left guard|",
                     "pass deep|pass short|up the middle")
  fourths %<>%
    dplyr::mutate(goforit = ifelse(type == "NOPL" & 
                                     grepl(pattern = gostring, x = detail, ignore.case = TRUE) &
                                     -grepl(pattern = puntstring, x = detail, ignore.case = TRUE) &
                                     -grepl(pattern = kickstring, x = detail, ignore.case = TRUE),
                                   1, goforit))
  
  fourths %<>%
    dplyr::mutate(sum = goforit + punt + kick) %>%
    dplyr::filter(sum == 1) %>%
    dplyr::select(-sum)
  
  return(fourths)
}

#' Historical field goal success rates by field position.
#' 
#' By default, uses only attempts from >= 2011 season to reflect
#' more improved kicker performance.
#' 
#' Returns and writes results to a CSV.
#' 
#' NOTE: These are somewhat sparse and irregular at longer FG ranges.
#' This is because kickers who attempt long FGs are not selected at
#' random -- they are either in situations which require a long FG
#' attempt or are kickers with a known long range. The NYT model
#' uses a logistic regression kicking model developed by Josh Katz
#' to smooth out these rates.
#' 
#' 
#' @param fg_data_fname Filename of Armchair Analysis GAME table
#' @param out_fname Filename of where to write output to
#' @param min_pid Starting point in play-by-play data of which kicks to include.
#' Defaults to start of 2011 season
#' @return data_frame
fg_success_rate <- function(fg_data_fname, out_fname, min_pid = 473957){
  fgs <- readr::read_csv(fg_data_fname)
  
  fgs <- dplyr::filter(fgs, fgxp == "FG" & pid > min_pid)
  
  fgs_grouped <- fgs %>%
    dplyr::group_by(dist) %>%
    dplyr::summarize(N = n(),
              average = mean(good))
  
  fgs_grouped <- dplyr::mutate(fgs_grouped, yfog = 100 - (dist - 17))
  fgs_grouped %>%
    dplyr::select(yfog, average) %>%
    readr::write_csv(out_fname)
  
  return(fgs_grouped)
}

#' Sub in simple logit for field goal success rates.
#' 
#' @param fname Filename of NYT model output
#' @param outname Filename of where to write output to
#' @return data_frame
nyt_fg_model <- function(fname, outname){
  fgs <- readr::read_csv(fname) %>%
    dplyr::mutate(yfog = 100 - (fg_distance - 17))
  
  write_csv(fgs, outname)
  
  return(fgs)
}

#' Group punts by kicking field position to get average return distance.
#' Currently does not incorporate the possibility of a muffed punt
#' or punt returned for a TD.
#' 
#' @param punt_data_fname Filename of where PUNT table is stored
#' @param out_fname Filename of where to write output to
#' @param joined Joined play-by-play and game data
#' @return data_frame
punt_averages <- function(punt_data_fname, out_fname, joined){
  punts <- readr::read_csv(punt_data_fname) %>%
    dplyr::left_join(dplyr::select(joined, pid, yfog))
  
  punts_dist <- punts %>%
    dplyr::group_by(yfog) %>%
    dplyr::summarize(pnet = mean(pnet))
  
  readr::write_csv(punts_dist, out_fname)
  
  return(punts_dist)
}

#' Group 4th down decisions by score difference and field
#' position to get coarse historical comparisons.
#' 
#' Writes these to a CSV and returns them.
#' 
#' @param fourths Joined play-by-play and game data
#' @return data_frame
#' @importFrom dplyr ungroup
group_coaches_decisions <- function(fourths, out_fname){
  df <- fourths %>%
    dplyr::mutate(down_by_td = score_diff <= -4,
           up_by_td = score_diff >= 4,
           yfog_bin = floor(yfog / 20),
           short = ytg <= 3,
           med = ytg >= 4 & ytg <= 7,
           long = ytg > 7) %>%
    dplyr::mutate_each(dplyr::funs(as.numeric), down_by_td, up_by_td, short:long)
  
  grouped <- df %>%
    dplyr::group_by(down_by_td, up_by_td, yfog_bin,
                    short, med, long)
  
  goforit <- grouped %>%
    dplyr::summarize(proportion_went = mean(goforit),
              sample_size = n())
  punt <- grouped %>%
    dplyr::summarize(proportion_punted = mean(punt),
                     sample_size = n())
  kick <- grouped %>%
    dplyr::summarize(proportion_kicked = mean(kick),
                     sample_size = n())
  
  decisions <- grouped %>%
    dplyr::summarize(proportion_went = mean(goforit),
              sample_size = n(),
              proportion_punted = mean(punt),
              sample_size_punt = n(),
              proportion_kicked = mean(kick),
              sample_size_kick = n())
  
  readr::write_csv(decisions, out_fname)
  return(decisions)
}

#' Find the mean 1st down success rate at a given point in the field.
#' 
#' @param df_plays Play by play data frame
#' @param yfog String, must be 'yfog' or 'yfog_bin'. If yfog, use the actual yards
#' from own goal. If yfog_bin, use the decile of the field instead.
#' @return data_frame
first_down_rates <- function(df_plays, yfog){
  downs <- df_plays
  if(yfog == "yfog_bin") {
    # Break down the field into deciles
    downs[[yfog]] <- with(downs, floor(yfog / 10))
    downs <- dplyr::filter(downs, yfog < 90)
  } else {
    downs <- dplyr::filter(downs, yfog >= 90)
  }

  # For each segment, find the average first down rate by dwn and ytg
  if(yfog == "yfog_bin") {
    grouped <- downs %>%
      dplyr::group_by(yfog_bin, dwn, ytg) %>%
      dplyr::summarize(fdr = mean(first_down),
                       N = n())
  } else {
    grouped <- downs %>%
      dplyr::group_by(yfog, dwn, ytg) %>%
      dplyr::summarize(fdr = mean(first_down),
                       N = n())
  }

  # Just keep 3rd and 4th downs
  grouped <- dplyr::filter(grouped, dwn >= 3)
  if(yfog == "yfog_bin") {
    merged <- grouped %>%
      dplyr::left_join(grouped, by = c("yfog_bin", "ytg")) %>%
      ungroup
  } else {
    merged <- grouped %>%
      dplyr::left_join(grouped, by = c("yfog", "ytg")) %>%
      ungroup
  }

  # Note this will lose scenarios that have *only* ever seen a 4th down
  # This matches to one play since 2001.
  merged <- dplyr::filter(merged, dwn.x == 4 & dwn.y == 3)
  
  # Compute a weighted mean of FDR on 3rd & 4th down to deal with sparsity
  merged %<>%
    dplyr::mutate(weighted_N.x = fdr.x * N.x,
                  weighted_N.y = fdr.y * N.y,
                  weighted_total = weighted_N.x + weighted_N.y,
                  total_N = N.x + N.y,
                  weighted_fdr = weighted_total / total_N) %>%
    dplyr::select(-weighted_N.x, -weighted_N.y, -weighted_total, -total_N) %>%
    dplyr::rename(dwn = dwn.x)
  
  # Need to fill in any missing combinations where possible
  if(yfog == "yfog_bin"){
    merged <- tidyr::complete(merged, yfog_bin, dwn, ytg)
  } else {
    merged <- tidyr::complete(merged, yfog, dwn, ytg)
  }
  merged %<>%
    dplyr::rename(fdr = weighted_fdr)
  
  # Eliminate impossible combinations
  if(yfog == "yfog_bin"){
    # Sparse situations, just set to p(success) = .1
    merged %<>%
      dplyr::mutate(fdr = ifelse(ytg > 13, 0.10, fdr))
    
    # Missing values inside -10 because no one goes for it here
    merged %<>%
      dplyr::mutate(fdr = ifelse(is.na(fdr.x) & ytg <= 3, .2, fdr),
                    fdr = ifelse(is.na(fdr.x) & ytg > 3, .1, fdr))
    
    # Fill in missing values
#     merged %<>%
#       dplyr::mutate(fdr = approxfun(fdr))
    readr::write_csv(merged, "data/fd_open_field.csv")
  } else {
    merged %<>%
      dplyr::filter(yfog + ytg <= 100) %>%
      dplyr::mutate(fdr = ifelse(yfog == 99 & ytg == 1, fdr.x, fdr))
    #     merged %<>%
    #       dplyr::mutate(fdr = approxfun(fdr))
    readr::write_csv(merged, "data/fd_inside_10.csv")
  }
  return(merged)
}

#' Join the computed first down rates with the play by play data.
#' 
#' @param df Joined play-by-play data.
#' @param fd_open_field Results of \code{first_down_rates} with yfog set to 'yfog_bin'.
#' @param fd_inside_10 Results of \code{first_down_rates} with yfog set to 'yfog'.
#' @return data_frame
join_df_first_down_rates <- function(df, fd_open_field, fd_inside_10){
  open_field <- df %>%
    dplyr::filter(yfog < 90) %>%
    dplyr::mutate(yfog_bin = floor(yfog / 10)) %>%
    dplyr::left_join(fd_open_field) %>%
    dplyr::select(-yfog_bin)
  
  inside_10 <- df %>%
    dplyr::filter(yfog > 90) %>%
    dplyr::left_join(fd_inside_10)
  
  new_df <- dplyr::bind_rows(open_field, inside_10)
  return(new_df)
}

#' Code a situation a 1 if the offense can kneel to end the game based on time
#' remaining, defensive timeouts remaining, down, and score difference.
#' 
#' @param df Joined play-by-play data.
#' @return data_frame
dp_kneel_down <- function(df){
  df %<>%
    dplyr::mutate(kneel_down = 0,
                  kneel_down = ifelse(timd == 0 & secs_left <= 120 & dwn == 1 &
                                        score_diff > 0, 1, kneel_down),
                  kneel_down = ifelse(timd == 1 & secs_left <= 84 & dwn == 1 &
                                        score_diff > 0, 1, kneel_down),
                  kneel_down = ifelse(timd == 2 & secs_left <= 48 & dwn == 1 &
                                        score_diff > 0, 1, kneel_down),
                  
                  kneel_down = ifelse(timd == 0 & secs_left <= 84 & dwn == 2 &
                                        score_diff > 0, 1, kneel_down),
                  kneel_down = ifelse(timd == 1 & secs_left <= 45 & dwn == 2 &
                                        score_diff > 0, 1, kneel_down),
                  
                  kneel_down = ifelse(timd == 0 & secs_left <= 42 & dwn == 3 &
                                        score_diff > 0, 1, kneel_down),
                  
                  kneel_down = ifelse(score_diff <= 0 | dwn == 4, 0, kneel_down))
  return(df)
}

#' Determine the starting point for the final possession in each
#' non-overtime, regular season game to use as a proxy for the probability
#' that the team will have another possession in the game during the 4th quarter.
#' 
#' Used to weight the win probabilities in the 4th quarter.
#' 
#' @param drive_fname Filename of where DRIVE table is stored
#' @param out_name Filename of where to write output to
#' @param games Data frame generated by \code{load_games}
#' @return data_frame
calculate_prob_poss <- function(drive_fname, out_name, games){
  drives <- readr::read_csv(drive_fname) %>%
    dplyr::left_join(dplyr::select(games, gid, seas, wk))
  
  # Restrict to non-overtime games
  final_qtr <- drives %>%
    dplyr::group_by(gid) %>%
    dplyr::filter(qtr == max(qtr))
  overtime_games <- dplyr::filter(final_qtr, qtr > 4)
  drives_reduced <- drives %>%
    dplyr::filter(!(gid %in% overtime_games$gid))
  
  # Starting time of final drive of game
  drives_reduced %<>%
    dplyr::filter(qtr == 4)
  final_drive <- drives_reduced %>%
    dplyr::group_by(gid) %>%
    dplyr::filter(fpid == max(fpid)) %>%
    dplyr::mutate(secs = min * 60 + sec)
  
  # Group and get summary statistics
  final_drives <- final_drive %>%
    dplyr::group_by(secs) %>%
    dplyr::summarize(n = n()) %>%
    ungroup %>%
    dplyr::mutate(pct = n / sum(n),
                  cum_pct = cumsum(pct))
  
  readr::write_csv(final_drives, "data/final_drives.csv")
  return(final_drives)
}

#' Prepare Armchair Analysis data for analysis.
#' 
#' @param pbp_data_location Directory where Armchair Analysis csv
#' files are stored.
#' @return None
#' @export
data_prep <- function(pbp_data_location){
  if(!dir.exists(file.path(getwd(), "data"))){
    cat("Making data directory.", fill = TRUE)
    dir.create(file.path(getwd(), "data"), showWarnings = FALSE)
  }
  
  cat("Loading game data.", fill = TRUE)
  games <- load_games(file.path(pbp_data_location, "GAME.csv"))
  cat("Loading play by play data", fill = TRUE)
  pbp <- load_pbp(file.path(pbp_data_location, "PBP.csv"), games, remove_knees = FALSE)
  
  cat("Joining game and play by play data.", fill = TRUE)
  joined <- dplyr::left_join(pbp, games)
  
  # Switch offensive and defensive stats on PUNT/KOFF
  cat("Munging data...", fill = TRUE)
  joined <- switch_offense(joined)
  
  # Modify the spread so that the sign is negative when the offense
  # is favored and positive otherwise
  joined %<>%
    dplyr::mutate(spread = sprv,
                  spread = ifelse(off != v, spread,
                                  -1 * spread))
  
  # For model purposes, touchdowns are "first downs" (successful conversion)
  joined %<>%
    dplyr::mutate(first_down = ifelse((fd == "Y" & !is.na(fd)) |
                                        (pts >= 6 & !is.na(pts)), 1, 0))
  
  # Add winners for classification task
  joined %<>%
    dplyr::mutate(win = as.numeric(off == winner))
  
  # Features needed for the win probability model
  joined %<>%
    dplyr::mutate(score_diff = ptso - ptsd,
                  secs_left = (((4 - qtr) * 15.0) * 60 +
                                 (min * 60) + sec))
  
  # Group all fourth downs that indicate if the team went for it or not
  # by down, yards to go, and yards from own goal
  cat("Processing fourth downs.", fill = TRUE)
  fourths <- code_fourth_downs(joined)
  
  # Merge the goforit column back into all plays, not just fourth downs
  joined %<>%
    dplyr::left_join(dplyr::select(fourths, gid, pid, goforit))
  
  cat("Grouping and saving historical 4th down decisions.", fill = TRUE)
  decisions <- group_coaches_decisions(fourths, "data/coaches_decisions.csv")
  fourths_grouped <- fourths %>%
    dplyr::group_by(dwn, ytg, yfog) %>%
    dplyr::mutate(mean = mean(goforit),
                  N = n())
  readr::write_csv(fourths_grouped, "data/fourths_grouped.csv")
  
  # Remove kickoffs and extra points, retain FGs
  joined %<>%
    dplyr::filter(type != "KOFF") %>%
    dplyr::filter(fgxp != "XP" | is.na(fgxp))
  
  cat("Grouping and saving field goal attempts and punts.", fill = TRUE)
  # Adding data/FGXP.csv "good" and "fkicker" columns
  joined %<>%
    dplyr::select(-good) %>%
    dplyr::left_join(readr::read_csv(file.path(pbp_data_location, "FGXP.csv")) %>%
                       dplyr::select(pid, fkicker, good))
  fgs_grouped <- fg_success_rate(file.path(pbp_data_location, "FGXP.csv"),
                                 "data/fgs_grouped.csv")
  punt_dist <- punt_averages(file.path(pbp_data_location, "PUNT.csv"),
                             "data/punts_grouped.csv", joined)
  
  # Code situations where the offense can take a knee(s) to win
  cat("Coding kneel downs.", fill = TRUE)
  joined <- dp_kneel_down(joined)
  
  cat("Computing first down rates.", fill = TRUE)
  
  # Only rush and pass plays that were actually executed are eligible
  # for computing first down success rates
  df_plays <- dplyr::filter(joined, type == "PASS" | type == "RUSH")
  fd_open_field <- first_down_rates(df_plays, "yfog_bin")
  fd_inside_10 <- first_down_rates(df_plays, "yfog")
  joined <- join_df_first_down_rates(joined, fd_open_field, fd_inside_10)
  
  cat("Calculating final drive statistics.", fill = TRUE)
  final_drives <- calculate_prob_poss(file.path(pbp_data_location, "DRIVE.csv"),
                                      "data/final_drives.csv", games)
  
  cat("Writing cleaned play-by-play data.", fill = TRUE)
  readr::write_csv(joined, "data/pbp_cleaned.csv")
}
bensoltoff/rnflstats documentation built on May 12, 2019, 2:08 p.m.