# load_data/ncaa.R

library(dplyr)
library(httr)
library(lubridate)
library(purrr)
library(readr)

# @ESPN - I'm just gonna use this until you say stop
# Don't worry, I'm not making money off of it
NCAA_SCOREBOARD <- 'http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard'
SEASON_TYPES <- c(
  regular = 2,  # 15-16 weeks
  bowls = 3  # 1 week
)

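# Tiny disk cache: each response is saved as an .rds file named after the
# URL and query parameters, and re-read on later runs instead of re-fetching.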
.lazy_get <- function(url, query) {
  params <- paste0(names(query), query, collapse = '')
  file_name <- sprintf('load_data/the_internet/%s%s.rds',
                       gsub('[^[:alnum:] ]', '', url),
                       params)
  if (file.exists(file_name)) {
    readRDS(file_name)
  } else {
    content <- GET(url, query = query) %>% content()
    saveRDS(content, file_name)
    content
  }
}

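# One week of the college football scoreboard, cached via .lazy_get.
# `dates` is the season year; `groups = 80` appears to restrict results to FBS,
# and `limit = 300` is comfortably larger than a week's worth of games.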
.get_ncaa_scoreboard <- function(season, week, season_type) {
  .lazy_get(NCAA_SCOREBOARD, query = list(
    lang = 'en',
    region = 'us',
    calendartype = 'blacklist',
    limit = 300,
    dates = season,
    seasontype = season_type,
    week = week,
    groups = 80
  ))
}

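# Convert one scoreboard event into a one-row tibble of winner/loser names,
# scores, and game date; incomplete games return an empty tibble.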
.parse_game <- function(event) {
  # event from the ESPN api
  if (!event$status$type$completed) {
    return(tibble())
  }

  teams <- event$competitions[[1]]$competitors
  team_names <- teams %>% map_chr(~.x$team$displayName)
  scores <- teams %>% map_chr(~.x$score) %>% as.integer()
  is_winner <- teams %>% map_lgl(~.x$winner)

  if (!any(is_winner)) {
    # If it's a tie, arbitrarily call the first team the "winner";
    # it doesn't matter because only the scores get used downstream
    is_winner <- c(TRUE, FALSE)
  }

  tibble(
    winner = team_names[is_winner],
    loser = team_names[!is_winner],
    winner_score = scores[is_winner],
    loser_score = scores[!is_winner],
    date = as.Date(event$date)
  )
}

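# Parse every completed game in one week into a single tibble.
# Example (not run): .get_week_results(2015, 1) pulls week 1 of the 2015 regular season.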
.get_week_results <- function(season, week, season_type = SEASON_TYPES['regular']) {
  raw_week <- .get_ncaa_scoreboard(season, week, season_type)
  week_results <- raw_week$events %>%
    map(~.parse_game(.x)) %>%
    bind_rows()
  if (nrow(week_results) == 0) {
    # Empty weeks happen sometimes
    return(tibble())
  }
  # Set all the dates to the most common date in the week, most likely Saturday
  common_date <- week_results$date %>%
    table() %>%
    sort() %>%
    names() %>%
    last()
  week_results$date <- as.Date(common_date)
  week_results
}

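# Pull all regular-season weeks plus the bowl "week" for one season
# and tag every row with the season year.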
.get_season_results <- function(season) {
  # Might only need to go to 16 for NCAA,
  # but weeks after the final week return empty lists anyway
  regular_season <- seq_len(16) %>%
    map(~.get_week_results(season, .x, SEASON_TYPES['regular'])) %>%
    bind_rows()
  post_season <- seq_len(1) %>%
    map(~.get_week_results(season, .x, SEASON_TYPES['bowls'])) %>%
    bind_rows()
  bind_rows(regular_season, post_season) %>%
    mutate(season = season)
}

.remove_spaces <- function(team_name) {
  # Some names look like Minnesota            Vikings
  gsub('\\s+', ' ', team_name)
}

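# Seasons to pull; the most recent one is treated as still in progress,
# so its cached responses get cleared below.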
seasons <- seq(1999, 2017)

# Lame: removing "cache" for current season
.clear_season <- function(season) {
  files <- dir('load_data/the_internet/', full.names = TRUE)
  season_files <- files[grepl(sprintf('dates%d', season), files)]
  file.remove(season_files)
}
.clear_season(seasons[length(seasons)])


# Yep, I just remove them for now...lame...
# Maybe when I get smarter, I can set them to a different "date"
.remove_duplicates <- function(day_games) {
  # Flag a game if either its winner or its loser already appeared
  # earlier on the same date (column 1 = winners, column 2 = losers)
  all_players <- c(day_games$winner, day_games$loser)
  is_duplicate <- duplicated(all_players) %>%
    matrix(byrow = FALSE, ncol = 2) %>%
    apply(MARGIN = 1, FUN = any)
  day_games[!is_duplicate, ]
}


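# Full pipeline: pull every season, clean up team names, drop duplicate
# games within each date, and write the results for the app to read.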
ncaa_data <- seasons %>%
  map_df(~.get_season_results(.x)) %>%
  mutate(
    group = 'ncaa',
    # Collapse runs of whitespace in team names (see .remove_spaces above)
    winner = .remove_spaces(winner),
    loser = .remove_spaces(loser)
  ) %>%
  split(.$date) %>%
  map_df(.remove_duplicates) %>%
  write_csv('load_data/ncaa.csv') %>%
  write_csv('app/data/default.csv')