chl/R/get_schedule.R

#' Gets schedule data and box score URLs for specified league and season
#' 
#' Returns a data frame of league schedule and game data for user supplied leagues & seasons.
#' 
#' @param league Leagues from which the user wants to scrape data ("OHL" and/or "WHL" and/or "QMJHL").
#' @param season Seasons for which the user wants to scrape data. Must be of the form \code{2017-18}, \code{1995-96}, etc.
#' @param progress Sets a Progress Bar. Defaults to \code{TRUE}.
#' @param ... Allows the user to supply other information to the function. If you don't know what this means, then don't worry about it.
#' @examples 
#' get_schedule("OHL", "2018-19")
#' 
#' get_schedule("QMJHL", "1999-00", progress = FALSE)
#' 
#' get_schedule(c("OHL", "WHL", "QMJHL"), c("2018-19", "2017-18"))
#' 
#' @export
#' @import dplyr
#' 
get_schedule <- function(league, season, ..., progress = TRUE) {
  
  leagues <- league %>%
    as_tibble() %>%
    purrr::set_names("league") %>%
    mutate_all(toupper) %>%
    distinct()
  
  seasons <- season %>%
    as_tibble() %>%
    purrr::set_names("season") %>%
    distinct()
  
  mydata <- tidyr::crossing(leagues, seasons)
  
  if (progress) {
    
    pb <- progress::progress_bar$new(format = "get_schedule() [:bar] :percent eta: :eta", clear = FALSE, total = nrow(mydata), show_after = 0) 
    
    pb$tick(0)
    
  }
  
  driver <- RSelenium::rsDriver(verbose = FALSE)
  
  on.exit(driver$client$close())
  on.exit(driver$server$stop())
  
  .get_schedule <- function(league, season, ...) {
    
    if (league == "NOJHL") {  
      
      if (season == "2019-20") {url = "https://nojhl.com/stats/schedule/all-teams/24/all-months?league=1&gametype=-1"}
      
      else if (season == "2017-18") {url = "http://ontariohockeyleague.com/schedule/60"}
      else if (season == "2016-17") {url = "http://ontariohockeyleague.com/schedule/56"}
      else if (season == "2015-16") {url = "http://ontariohockeyleague.com/schedule/54"}
      else if (season == "2014-15") {url = "http://ontariohockeyleague.com/schedule/51"}
      else if (season == "2013-14") {url = "http://ontariohockeyleague.com/schedule/49"}
      else if (season == "2012-13") {url = "http://ontariohockeyleague.com/schedule/46"}
      else if (season == "2011-12") {url = "http://ontariohockeyleague.com/schedule/44"}
      else if (season == "2010-11") {url = "http://ontariohockeyleague.com/schedule/42"}
      else if (season == "2009-10") {url = "http://ontariohockeyleague.com/schedule/38"}
      else if (season == "2008-09") {url = "http://ontariohockeyleague.com/schedule/35"}
      else if (season == "2007-08") {url = "http://ontariohockeyleague.com/schedule/32"}
      else if (season == "2006-07") {url = "http://ontariohockeyleague.com/schedule/29"}
      else if (season == "2005-06") {url = "http://ontariohockeyleague.com/schedule/26"}
      else if (season == "2004-05") {url = "http://ontariohockeyleague.com/schedule/24"}
      else if (season == "2003-04") {url = "http://ontariohockeyleague.com/schedule/21"}
      else if (season == "2002-03") {url = "http://ontariohockeyleague.com/schedule/17"}
      else if (season == "2001-02") {url = "http://ontariohockeyleague.com/schedule/14"}
      else if (season == "2000-01") {url = "http://ontariohockeyleague.com/schedule/11"}
      else if (season == "1999-00") {url = "http://ontariohockeyleague.com/schedule/9"}
      else if (season == "1998-99") {url = "http://ontariohockeyleague.com/schedule/6"}
      else if (season == "1997-98") {url = "http://ontariohockeyleague.com/schedule/4"}
    
    else {stop("League not available. Sorry!")}
    
    seq(2, 5, by = 0.001) %>%
      sample(1) %>%
      Sys.sleep()
    
    driver$client$navigate(url)
    
    Sys.sleep(3)
    
    page <- driver$client$getPageSource() %>%
      purrr::pluck(1) %>%
      xml2::read_html()
    
    url_prefix <- case_when(league == "OHL" ~ "http://nojhl.com")
    
    season_short <- season %>%
      stringr::str_split("\\-", simplify = TRUE, n = 2) %>%
      purrr::pluck(1) %>%
      as.numeric()
    
    game_urls <- page %>%
      rvest::html_nodes('[title="Game Centre"]') %>%
      rvest::html_attr("href") %>%
      stringr::str_c(url_prefix, .) %>%
      stringr::str_replace_all(c("play_by_play" = "boxscore")) %>%
      as_tibble() %>%
      purrr::set_names("url")
    
    if (league == "QMJHL") {
      
      date <- page %>%
        rvest::html_nodes(".table__td--schedule-date") %>%
        rvest::html_text() %>%
        as_tibble() %>%
        purrr::set_names("date")
      
    }
    
    else if (league %in% c("OHL", "WHL")) {
      
      date <- page %>%
        rvest::html_nodes(".table__td--schedule-date") %>%
        rvest::html_text() %>%
        as_tibble() %>%
        purrr::set_names("month_day") %>%
        mutate(month_day = stringr::str_split(month_day, ", ", simplify = TRUE, n = 2)[,2]) %>%
        mutate(month = stringr::str_split(month_day, " ", simplify = TRUE, n = 2)[,1]) %>%
        mutate(day = stringr::str_split(month_day, " ", simplify = TRUE, n = 2)[,2]) %>%
        mutate(year = ifelse(month %in% c("Aug", "Sep", "Oct", "Nov", "Dec"), season_short, season_short + 1)) %>%
        mutate(date = stringr::str_c(year, month, day, sep = "-")) %>%
        mutate(date = lubridate::ymd(date)) %>%
        mutate(date = as.character(date)) %>%
        select(date)
      
    }
    
    if (league == "OHL") {
      
      attendance <- as.character(NA)
      
    }
    
    else if (league == "WHL") {
      
      attendance <- page %>%
        rvest::html_nodes(".table__td--+ .table__td") %>%
        rvest::html_text() %>%
        as.character()
      
    }
    
    else if (league == "QMJHL") {
      
      attendance <- page %>%
        rvest::html_nodes(".table__td--schedule-time+ .table__td") %>%
        rvest::html_text() %>%
        as.character()
      
    }
    
    game_notes <- page %>%
      rvest::html_nodes(".table__td--schedule-time") %>%
      rvest::html_text() %>%
      as_tibble() %>%
      purrr::set_names("game_notes") %>%
      mutate(game_notes = stringr::str_replace_all(game_notes, c("Final" = "")))
    
    venue <- page %>%
      rvest::html_nodes(".table__td--") %>%
      rvest::html_text() %>%
      as_tibble() %>%
      purrr::set_names("venue")
    
    goals <- page %>%
      rvest::html_nodes(".table__td--schedule-score") %>%
      rvest::html_text() %>%
      as_tibble() %>%
      purrr::set_names("goals")
    
    away_goals <- goals %>%
      filter(row_number() %% 2 == 1) %>%
      purrr::set_names("away_goals")
    
    home_goals <- goals %>%
      filter(row_number() %% 2 == 0) %>%
      purrr::set_names("home_goals")
    
    home_team <- page %>%
      rvest::html_nodes(".table__td--schedule-home") %>%
      rvest::html_text() %>%
      as_tibble() %>%
      purrr::set_names("home_team") %>%
      mutate_all(toupper)
    
    away_team <- page %>%
      rvest::html_nodes(".table__td--schedule-away") %>%
      rvest::html_text() %>%
      as_tibble() %>%
      purrr::set_names("away_team") %>%
      mutate_all(toupper)
    
    schedule <- date %>%
      bind_cols(away_team) %>%
      bind_cols(away_goals) %>%
      bind_cols(home_team) %>%
      bind_cols(home_goals) %>%
      bind_cols(game_notes) %>%
      bind_cols(venue) %>%
      bind_cols(game_urls) %>%
      mutate(attendance = attendance) %>%
      mutate(league = league) %>%
      mutate(season = season) %>%
      mutate(date = stringr::str_extract(date, "[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}")) %>%
      filter(!stringr::str_detect(url, "preview")) %>%
      mutate(venue = ifelse(league == "WHL", stringr::str_split(venue, "\\-[^\\-]*$", simplify = TRUE, n = 2)[,1], venue)) %>%
      mutate_all(stringr::str_squish) %>%
      mutate_all(~na_if(., "")) %>%
      mutate_at(vars(away_goals, home_goals, attendance), as.numeric) %>%
      distinct() %>%
      select(league, season, date, away_team, away_goals, home_team, home_goals, game_notes, venue, attendance, url)
    
    if (progress) {pb$tick()}
    
    return(schedule)
    
  }
  
  schedule_data <- purrr::map2_dfr(mydata[["league"]], mydata[["season"]], .get_schedule)
  
  return(schedule_data)
  
}
Dooms31/nojhl documentation built on Oct. 9, 2020, 2:27 a.m.