#' Gets schedule data and box score URLs for specified league and season
#'
#' Returns a data frame of league schedule and game data for user supplied leagues & seasons.
#'
#' @param league Leagues from which the user wants to scrape data ("OHL" and/or "WHL" and/or "QMJHL").
#' @param season Seasons for which the user wants to scrape data. Must be of the form \code{2017-18}, \code{1995-96}, etc.
#' @param progress Sets a Progress Bar. Defaults to \code{TRUE}.
#' @param ... Allows the user to supply other information to the function. If you don't know what this means, then don't worry about it.
#' @examples
#' get_schedule("OHL", "2018-19")
#'
#' get_schedule("QMJHL", "1999-00", progress = FALSE)
#'
#' get_schedule(c("OHL", "WHL", "QMJHL"), c("2018-19", "2017-18"))
#'
#' @export
#' @import dplyr
#'
get_schedule <- function(league, season, ..., progress = TRUE) {
leagues <- league %>%
as_tibble() %>%
purrr::set_names("league") %>%
mutate_all(toupper) %>%
distinct()
seasons <- season %>%
as_tibble() %>%
purrr::set_names("season") %>%
distinct()
mydata <- tidyr::crossing(leagues, seasons)
if (progress) {
pb <- progress::progress_bar$new(format = "get_schedule() [:bar] :percent eta: :eta", clear = FALSE, total = nrow(mydata), show_after = 0)
pb$tick(0)
}
driver <- RSelenium::rsDriver(verbose = FALSE)
on.exit(driver$client$close())
on.exit(driver$server$stop())
.get_schedule <- function(league, season, ...) {
if (league == "NOJHL") {
if (season == "2019-20") {url = "https://nojhl.com/stats/schedule/all-teams/24/all-months?league=1&gametype=-1"}
else if (season == "2017-18") {url = "http://ontariohockeyleague.com/schedule/60"}
else if (season == "2016-17") {url = "http://ontariohockeyleague.com/schedule/56"}
else if (season == "2015-16") {url = "http://ontariohockeyleague.com/schedule/54"}
else if (season == "2014-15") {url = "http://ontariohockeyleague.com/schedule/51"}
else if (season == "2013-14") {url = "http://ontariohockeyleague.com/schedule/49"}
else if (season == "2012-13") {url = "http://ontariohockeyleague.com/schedule/46"}
else if (season == "2011-12") {url = "http://ontariohockeyleague.com/schedule/44"}
else if (season == "2010-11") {url = "http://ontariohockeyleague.com/schedule/42"}
else if (season == "2009-10") {url = "http://ontariohockeyleague.com/schedule/38"}
else if (season == "2008-09") {url = "http://ontariohockeyleague.com/schedule/35"}
else if (season == "2007-08") {url = "http://ontariohockeyleague.com/schedule/32"}
else if (season == "2006-07") {url = "http://ontariohockeyleague.com/schedule/29"}
else if (season == "2005-06") {url = "http://ontariohockeyleague.com/schedule/26"}
else if (season == "2004-05") {url = "http://ontariohockeyleague.com/schedule/24"}
else if (season == "2003-04") {url = "http://ontariohockeyleague.com/schedule/21"}
else if (season == "2002-03") {url = "http://ontariohockeyleague.com/schedule/17"}
else if (season == "2001-02") {url = "http://ontariohockeyleague.com/schedule/14"}
else if (season == "2000-01") {url = "http://ontariohockeyleague.com/schedule/11"}
else if (season == "1999-00") {url = "http://ontariohockeyleague.com/schedule/9"}
else if (season == "1998-99") {url = "http://ontariohockeyleague.com/schedule/6"}
else if (season == "1997-98") {url = "http://ontariohockeyleague.com/schedule/4"}
else {stop("League not available. Sorry!")}
seq(2, 5, by = 0.001) %>%
sample(1) %>%
Sys.sleep()
driver$client$navigate(url)
Sys.sleep(3)
page <- driver$client$getPageSource() %>%
purrr::pluck(1) %>%
xml2::read_html()
url_prefix <- case_when(league == "OHL" ~ "http://nojhl.com")
season_short <- season %>%
stringr::str_split("\\-", simplify = TRUE, n = 2) %>%
purrr::pluck(1) %>%
as.numeric()
game_urls <- page %>%
rvest::html_nodes('[title="Game Centre"]') %>%
rvest::html_attr("href") %>%
stringr::str_c(url_prefix, .) %>%
stringr::str_replace_all(c("play_by_play" = "boxscore")) %>%
as_tibble() %>%
purrr::set_names("url")
if (league == "QMJHL") {
date <- page %>%
rvest::html_nodes(".table__td--schedule-date") %>%
rvest::html_text() %>%
as_tibble() %>%
purrr::set_names("date")
}
else if (league %in% c("OHL", "WHL")) {
date <- page %>%
rvest::html_nodes(".table__td--schedule-date") %>%
rvest::html_text() %>%
as_tibble() %>%
purrr::set_names("month_day") %>%
mutate(month_day = stringr::str_split(month_day, ", ", simplify = TRUE, n = 2)[,2]) %>%
mutate(month = stringr::str_split(month_day, " ", simplify = TRUE, n = 2)[,1]) %>%
mutate(day = stringr::str_split(month_day, " ", simplify = TRUE, n = 2)[,2]) %>%
mutate(year = ifelse(month %in% c("Aug", "Sep", "Oct", "Nov", "Dec"), season_short, season_short + 1)) %>%
mutate(date = stringr::str_c(year, month, day, sep = "-")) %>%
mutate(date = lubridate::ymd(date)) %>%
mutate(date = as.character(date)) %>%
select(date)
}
if (league == "OHL") {
attendance <- as.character(NA)
}
else if (league == "WHL") {
attendance <- page %>%
rvest::html_nodes(".table__td--+ .table__td") %>%
rvest::html_text() %>%
as.character()
}
else if (league == "QMJHL") {
attendance <- page %>%
rvest::html_nodes(".table__td--schedule-time+ .table__td") %>%
rvest::html_text() %>%
as.character()
}
game_notes <- page %>%
rvest::html_nodes(".table__td--schedule-time") %>%
rvest::html_text() %>%
as_tibble() %>%
purrr::set_names("game_notes") %>%
mutate(game_notes = stringr::str_replace_all(game_notes, c("Final" = "")))
venue <- page %>%
rvest::html_nodes(".table__td--") %>%
rvest::html_text() %>%
as_tibble() %>%
purrr::set_names("venue")
goals <- page %>%
rvest::html_nodes(".table__td--schedule-score") %>%
rvest::html_text() %>%
as_tibble() %>%
purrr::set_names("goals")
away_goals <- goals %>%
filter(row_number() %% 2 == 1) %>%
purrr::set_names("away_goals")
home_goals <- goals %>%
filter(row_number() %% 2 == 0) %>%
purrr::set_names("home_goals")
home_team <- page %>%
rvest::html_nodes(".table__td--schedule-home") %>%
rvest::html_text() %>%
as_tibble() %>%
purrr::set_names("home_team") %>%
mutate_all(toupper)
away_team <- page %>%
rvest::html_nodes(".table__td--schedule-away") %>%
rvest::html_text() %>%
as_tibble() %>%
purrr::set_names("away_team") %>%
mutate_all(toupper)
schedule <- date %>%
bind_cols(away_team) %>%
bind_cols(away_goals) %>%
bind_cols(home_team) %>%
bind_cols(home_goals) %>%
bind_cols(game_notes) %>%
bind_cols(venue) %>%
bind_cols(game_urls) %>%
mutate(attendance = attendance) %>%
mutate(league = league) %>%
mutate(season = season) %>%
mutate(date = stringr::str_extract(date, "[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}")) %>%
filter(!stringr::str_detect(url, "preview")) %>%
mutate(venue = ifelse(league == "WHL", stringr::str_split(venue, "\\-[^\\-]*$", simplify = TRUE, n = 2)[,1], venue)) %>%
mutate_all(stringr::str_squish) %>%
mutate_all(~na_if(., "")) %>%
mutate_at(vars(away_goals, home_goals, attendance), as.numeric) %>%
distinct() %>%
select(league, season, date, away_team, away_goals, home_team, home_goals, game_notes, venue, attendance, url)
if (progress) {pb$tick()}
return(schedule)
}
schedule_data <- purrr::map2_dfr(mydata[["league"]], mydata[["season"]], .get_schedule)
return(schedule_data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.