# R/scrape_games.R

# Defines functions: scrape_games

# Documented in: scrape_games

#' scrape_games
#'
#' Scrapes weekly NFL scores from pro-football-reference.com. For every
#' requested season and week the page `years/<season>/week_<week>.htm` is
#' downloaded, the text of its `#content` node is extracted, and the first
#' occurrence of each known team name is located. The number found a fixed
#' offset after the team name is taken as that team's score.
#'
#' @param ssn The starting year of the season from where we are scraping games.
#'   May be a vector, in which case every listed season is scraped.
#' @param wk_start The first week number of games being scraped
#' @param wk_stop The final week number of games being scraped
#'
#' @return A data frame with columns Team, Score, Year, Week. Team is
#'   character, the other columns are numeric. Rows whose score could not be
#'   parsed are dropped. If nothing was found, an empty data frame with the
#'   same four columns is returned.
#' @export
#' @importFrom magrittr %>%
#'
#' @examples
#' G <- scrape_games(ssn = 2017, wk_stop = 17)
#' tail(G)
scrape_games <- function(ssn = 2021, wk_start = 1, wk_stop) {
  # Where we scrape from
  nfl_base <- "https://www.pro-football-reference.com/"

  # Teams involved from 2010 onward (old and new names of relocated or
  # renamed franchises are both listed); we look for these specific strings.
  nfl_teams <- c("Green Bay Packers",
                 "Chicago Bears",
                 "Tennessee Titans",
                 "Cleveland Browns",
                 "Los Angeles Rams",
                 "Carolina Panthers",
                 "Washington Redskins",
                 "Washington Football Team",
                 "Philadelphia Eagles",
                 "Buffalo Bills",
                 "New York Jets",
                 "Atlanta Falcons",
                 "Minnesota Vikings",
                 "Baltimore Ravens",
                 "Miami Dolphins",
                 "Kansas City Chiefs",
                 "Jacksonville Jaguars",
                 "Cincinnati Bengals",
                 "Seattle Seahawks",
                 "Indianapolis Colts",
                 "Los Angeles Chargers",
                 "Las Vegas Raiders",
                 "San Francisco 49ers",
                 "Tampa Bay Buccaneers",
                 "New York Giants",
                 "Dallas Cowboys",
                 "Detroit Lions",
                 "Arizona Cardinals",
                 "Pittsburgh Steelers",
                 "New England Patriots",
                 "Houston Texans",
                 "New Orleans Saints",
                 "Denver Broncos",
                 "Oakland Raiders",
                 "St. Louis Rams",
                 "San Diego Chargers")

  # Number of characters in each team name; used to step past the name
  name_len <- nchar(nfl_teams)

  # Characters skipped between the end of a team name and its score, and the
  # width of the text window handed to the number parser (score... plus
  # garbage).
  score_step <- 3
  score_width <- 3

  # Empty result with the documented columns, returned when nothing is found
  empty_scores <- data.frame(
    Team = character(0),
    Score = numeric(0),
    Year = numeric(0),
    Week = numeric(0),
    stringsAsFactors = FALSE
  )

  weeks <- wk_start:wk_stop

  # One data frame per scraped week; combined once at the end instead of
  # growing a matrix with rbind() on every team.
  weekly <- list()
  for (year in ssn) {
    for (week in weeks) {
      # Build the URL of a specific week in a specific season
      url <- paste0(nfl_base, "years/", year, "/week_", week, ".htm")

      # Download the page and keep only the text of the content node
      game <- rvest::read_html(url) %>%
        rvest::html_node("#content") %>%
        rvest::html_text()

      # Position of the first occurrence of each team name (-1 when absent,
      # NA when the page text itself is missing). fixed = TRUE so names are
      # matched literally, e.g. the "." in "St. Louis Rams" is not a wildcard.
      hold <- vapply(
        nfl_teams,
        function(team) as.integer(regexpr(team, game, fixed = TRUE)),
        integer(1)
      )

      # Teams that played this week, in the order they appear on the page
      found <- which(hold > 0)
      if (length(found) == 0) {
        next
      }
      found <- found[order(hold[found])]

      # Step past each team name to where its score is expected
      score_loc <- hold[found] + name_len[found] + score_step
      score_text <- substring(game, score_loc, score_loc + score_width)

      # as.numeric() drops the parsing-problems attribute readr attaches
      score <- as.numeric(readr::parse_number(score_text))

      weekly[[length(weekly) + 1L]] <- data.frame(
        Team = nfl_teams[found],
        Score = score,
        Year = as.numeric(year),
        Week = as.numeric(week),
        stringsAsFactors = FALSE
      )
    }
  }

  if (length(weekly) == 0) {
    return(empty_scores)
  }

  scores <- do.call(rbind, weekly)

  # Drop teams whose score could not be read (e.g. a name matched outside a
  # game summary) and renumber the rows.
  scores <- scores[!is.na(scores$Score), , drop = FALSE]
  rownames(scores) <- NULL
  scores
}
# przybylee/NFLpredictions documentation built on Feb. 9, 2025, 9:22 p.m.