# R/utils.R

# Defines functions get_fdata extract_name possibly_read_csv make_url make_season current_season

# TODO: move utility functions into ./R/utils.R? Move blocks of code into functions?
# Start year of the football season in progress, expressed as years since 2000
# (e.g. 20 for the 2020/21 season).
#
# A season is taken to start in July: dates from July to December belong to
# the season starting in that calendar year, dates from January to June belong
# to the season that started the previous calendar year.
#
# date: reference date (anything as.Date() accepts); defaults to today's date
#       in the system time zone. Exposed so the function can be tested and
#       used for historical dates.
# Returns a numeric value (one per element of `date`).
current_season <- function(date = Sys.Date()) {
  date_parts <- as.POSIXlt(as.Date(date))
  calendar_year <- date_parts$year + 1900 # POSIXlt years count from 1900
  calendar_month <- date_parts$mon + 1 # POSIXlt months are 0-based
  # Subtract one year while still in the January-June half of the season.
  calendar_year - (calendar_month <= 6) - 2000
}
# Build the four-character season code used in football-data URLs from the
# season's start year, e.g. 14 -> "1415".
#
# Each half is zero-padded to two digits and wraps at the century, so that
# 9 -> "0910" and 99 -> "9900" (plain paste0(yr, yr + 1) gave "910" and
# "99100", which are not valid four-digit season codes).
#
# yr: season start year(s); vectorised. Only the last two digits are used.
# Returns a character vector the same length as `yr`.
make_season <- function(yr) {
  start_yy <- as.integer(yr) %% 100L
  end_yy <- (start_yy + 1L) %% 100L
  sprintf("%02d%02d", start_yy, end_yy)
}
# Build the URL of the zip archive holding one season's data.
#
# yr:       season start year, turned into a season code by make_season().
# url_base: prefix of the download URL (must end with "/").
# Returns paste0(url_base, <season code>, "/data.zip").
make_url <- function(yr,
                     url_base = 'https://www.football-data.co.uk/mmz4281/') {
  season_code <- make_season(yr)
  paste0(url_base, season_code, '/data.zip')
}
possibly_read_csv <- function(url_csv) { # ----
  # Read one csv with every column kept as character.
  #
  # url_csv: path or URL handed straight to readr::read_csv().
  # Returns whatever readr::read_csv() returns for that input.
  #
  # NOTE: despite the name, errors are NOT caught here. The
  # purrr::possibly(..., otherwise = tibble()) wrapper is currently disabled,
  # so a failed read propagates to the caller.
  #
  # cols() and locale() are namespace-qualified like read_csv() itself, so the
  # function does not depend on readr being attached.
  readr::read_csv(
    url_csv,
    col_types = readr::cols(.default = "c"), # keep cols as character
    locale = readr::locale(encoding = "windows-1252") # essential
  )
}
# read_csv <- # memoise limit_rate safely ----
#   memoise::memoise(
#   ratelimitr::limit_rate(
#     possibly( function(...) {
#       pb$tick(tokens = list(what = "urll: ")) ;
#       readr::read_csv(...) # function (file, col_names = TRUE
#       # TODO: fread to speed up read?
#       # data.table::fread # function (input = "", file = NULL,
#     }, otherwise = tibble() ) ,
#     ratelimitr::rate(2, 1)))
# # memoise::memoise / ratelimitr::limit_rate / purrr::safely
# # read_csv <- readr::read_csv %>% safely() %>% limit_rate(rate(1, 1)) %>% memoise()

extract_name <- function(urll) {
  # Pull the season code (four digits between slashes, e.g. "1415") out of a
  # football-data zip URL. Only the first element of `urll` is used.
  season_match <- str_match(urll, '/([0-9]{4})/')
  # Row 1 = first input string, column 2 = first capture group.
  season_match[1, 2]
}

get_fdata <- function(season_starts, board_name)  # ----
# Fetch the raw csv data for each requested season.
#
# season_starts: vector of season start years; each is passed to make_season()
#                and make_url().
# board_name:    passed as the `board` argument of pin().
#
# Returns a single tibble with one row per element of season_starts and the
# columns season_start, season, urll and csv, where csv is a list column
# holding one row-bound tibble per season.
  tibble(season_start = season_starts) %>%
    mutate(
      season = season_start %>% make_season, # season name
      urll = season_start %>% make_url, # url to the season's zip file
      csv = urll %>% # for each season url
        map( ~ map_dfr( # read every csv of the season and row-bind them into one tibble
          # Inside this outer lambda `.` is the season's zip url.
          # WARNING: the name argument is essential, otherwise data would be
          # overwritten each season.
          # NOTE(review): presumably pin() downloads/caches the zip and returns
          # the paths of the extracted csv files, which map_dfr() then iterates
          # over - confirm against the pins documentation.
          # Original author's note: store the board in a sub-folder tracked by
          # git so the raw data need not be downloaded each time.
          pin(., name = extract_name(.), board = board_name,
            description = glue("Source: football-data.co.uk. ",
              "All leagues for one season ({extract_name(.)})") ),
          ~ possibly_read_csv(.)) ) # inner lambda: `.` is one element of pin()'s result
    ) # mutate.

# list(urll) %>% # for each season
#   map(.x = urll, .f = ~ map_dfr( # read each league from a csv then row bind them into one tibble per season.
#     # pin each zip file - WARNING: name argument is essential else data overwritten each season
#     # store in _sub_folder for git to store in _raw_ data else u have to download it each time.
#     .x = .f = function(.){
#       pin(urll, name = extract_name(urll), board = board_name,
#         description = glue("Source: football-data.co.uk. ",
#           "Football-Data is a free football betting portal ",
#           "providing historical results & odds for many years of data")) %>%
#       pin_info(name = extract_name(urll), board = board_name)
#         pin_get(name = extract_name(urll), board = board_name, extract=TRUE) %>%
#         str()
#       }) )

# Functional sequence (apply as transform_fdata(x)) that tidies the tibble
# produced by get_fdata():
#   1. unnest the per-season `csv` list column into rows;
#   2. convert the all-character columns to their natural types;
#   3. parse `Date` (dd/mm/yy or dd/mm/yyyy) into a date column `datee` and
#      drop the original `Date`;
#   4. sort newest first and move datee, Time, Div in front of season_start;
#   5. drop rows with a missing Div.
transform_fdata <- # transform raw data ----
. %>%
  unnest(csv) %>%
  # as.is = TRUE keeps strings as character; leaving it unspecified warns on
  # R >= 4.0 and creates factors on older versions.
  type.convert(as.is = TRUE) %>%
  mutate(
    Date = as.character(Date),
    # Dates are written with either 2-digit (8 chars) or 4-digit years.
    datee = as_date(
      Date,
      format = paste0("%d/%m/", ifelse(nchar(Date) == 8, "%y", "%Y"))
    )
  ) %>%
  select(-Date) %>%
  arrange(desc(datee), desc(Time), Div) %>%
  relocate(datee, Time, Div, .before = season_start) %>%
  filter(!is.na(Div)) # TODO: check which season/Div generate blanks
# JohnGavin/fdata documentation built on Jan. 29, 2021, 1:38 p.m.