# _targets.R ----

# https://wlandau.github.io/targets/reference/index.html  ----
# tar_edit() - edit _this_ (_targets.R) file
# tar_renv()	Write the pipeline's package dependencies to a script for renv compatibility.
# tar_render()	Render a dependency-aware R Markdown report.
# tar_knit()	Run a dependency-aware knitr report.
# tar_change()	Always run a target when a custom object changes.
# tar_force()	Always run a target when a custom condition is true.
# tar_suppress()	Never run a target when a custom condition is true.
# tar_plan()
#
# tar_script() - write a _targets.R script to the current working directory,
#   e.g. tar_script({ tar_pipeline(...) })
# Every targets project requires a target script in the project root.
# The target script must always be named _targets.R.
# Every _targets.R file should run the following steps in the order below:
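#
# A minimal sketch of that canonical structure (illustrative only; the package
# names, helper file, and target names below are placeholders, and this
# project returns a tar_plan() rather than tar_pipeline()):
#   library(targets)                                  # 1. package
#   tar_option_set(packages = c("dplyr", "readr"))    # 2. options
#   source("R/functions.R")                           # 3. globals
#   tar_pipeline(                                     # 4. targets
#     tar_target(raw_data, readr::read_csv("data/raw.csv", col_types = readr::cols())),
#     tar_target(n_rows, nrow(raw_data))
#   )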

# 1. Package: load the targets package. ----
# This step is automatically inserted at the top of  _targets.R files
# produced by tar_script() if library_targets is TRUE,
# so you do not need to explicitly include it in code.
options(warnPartialMatchArgs = FALSE)
# options(tidyverse.quiet = TRUE) # not needed cos suppressPackageStartupMessages
suppressPackageStartupMessages({
  # library(renv) ; # ----
  # renv::init(bare = TRUE)
  # # disable automatic snapshots
  # auto.snapshot <- getOption("renv.config.auto.snapshot")
  # options(renv.config.auto.snapshot = FALSE)
  # renv::install(pkgs[-c(1:2)]) ; .libPaths()
  # renv::install(c('pins', 'conflicted','visdat'))
  # renv::settings$ignored.packages() ;  renv::settings$snapshot.type("all")
  # renv::install("devtools") # renv::install("digest@0.6.19")
  # devtools::install_github("ropensci/tarchetypes")
  # renv::restore() # revert to the previous state as encoded in the lockfile
  # renv::snapshot() #
  # renv::use_python() # https://blog.rstudio.com/2019/11/06/renv-project-environments-for-r/#Integration_with_Python
  # renv::deactivate() ; renv::activate() # removes the renv auto-loader from the project .Rprofile
  # renv::install("rstudio/renv") # renv::install(type = "binary")
  # TODO: install shiny to see if displays missing tables below.
  pkgs <- c('targets', 'tarchetypes',  # ----
    'renv', 'dplyr', 'readr', 'tibble', 'rmarkdown', 'stringr', # stringr::str_subset # 'tidyverse',
    'purrr', 'tidyr', 'ggplot2', # purrr::walk and purrr::map tidyr::unnest tidyr::drop_na
    # 'conflicted', # 'logger', "here", 'assertthat',
    'fs', # 'tidymodels', # tidymodels pulls in 12 more packages!
    # jbg_utils 'ratelimitr', 'memoise',
    # polite sessions (httr and rvest),  policies (robotstxt),
    # rate-limiting and response caching (ratelimitr and memoise).
     # 'reticulate::py_save_object',
    'visdat', # 'visNetwork/htmlwidgets', 'naniar', # vis_miss
    'details', 'reticulate', # readme tabs
    'lubridate', 'gt', 'pins', 'glue'
    # 'backports', 'checkmate', 'commonmark', 'rappdirs', 'sass', 'Rcpp', 'filelock', 'zip'
  )
    # 'polite', # https://dmi3kno.github.io/polite/
    # 'future', 'reticulate',
    # 'data.table', 'recipes', 'caret', 'glmnet'
    # 'parsnip', # fit
  # renv::snapshot(packages = pkgs, prompt = TRUE) # WARNING: 'packages' is critical
  # WARNING: purrr::walk()/base::lapply(pkgs, library, ...) fails under targets
  # https://github.com/rstudio/renv/issues/143
  # renv::status()
  base::sapply(pkgs, library, character.only = TRUE, quietly = TRUE,
    logical.return = TRUE, warn.conflicts = FALSE) # %>% print()
  # purrr::walk(pkgs, library, character.only = TRUE)
  # renv::install('pacman') ;library(pacman) ; p_load(pkgs)
})
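
# If renv is in use, the pipeline's package dependencies can also be captured
# for the lockfile; a possible sketch (assuming renv is already initialised
# for this project):
#   targets::tar_renv()              # writes a script listing the pipeline's packages for renv to detect
#   renv::snapshot(packages = pkgs)  # or snapshot only the packages declared above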

# 2. Options: tar_option_set() defaults for targets-specific settings ----
# #   such as the names of required packages.
# # Uncomment below to deploy targets to parallel jobs when running tar_make_clustermq().
tar_option_set(
  tidy_eval = TRUE,
  packages = pkgs,
  error = "workspace" # on error, save a workspace image file in `_targets/workspaces/`.
  # Or tar_option_set(workspace = c("target1", "target2")) - always save workspaces for specific targets.
  # tar_meta(fields = error) %>% na.omit() ; list.files("_targets/workspaces")
  # tar_workspace(<<failed target name>>) # load the environment of the failed target command to reproduce the error
  # tar_undebug() # remove (large) workspace files
  # debug() debugonce() undebug() browser() # base debugging utilities
  # https://rstats.wtf/debugging-r-code.html # interactive debugging
  # debugonce(<<function>>): Shift+F9 sets a breakpoint (== browser()); 'where' = stack trace, 'n' = next line, 's' = step into, 'c' = continue to next breakpoint, 'Q' = quit
  # Or debug interactively while `tar_make()`:
  # tar_option_set(debug = "<<target name>>", cue = tar_cue(mode = "never")) then tar_make(callr_function = NULL) in the R console
)
#
# suppressMessages({
#   conflict_prefer("pluck", "rvest")
#   conflict_prefer("select", "dplyr")
#   conflict_prefer("col_factor", "readr", "scales")
#   conflict_prefer("spec", "readr", "yardstick")
#   conflict_prefer("discard", "purrr", "scales")
#   conflict_prefer("collapse", "dplyr", "glue")
#   conflict_prefer("filter", "dplyr", "stats")
#   conflict_prefer("lag", "dplyr", "stats")
#   conflict_prefer("fixed", "recipes", "stringr")
# })

# tar_option_set ----
# How to pass this setting?
# TODO: globalenv() not needed?
# tar_option_set(envir = globalenv() )
# tar_option_get("envir")
#
# #   Even if you have no specific options to set,
# #   call tar_option_set() to register the proper environment.
# # ending with a call to tar_pipeline().
# # tar_option_set() # set packages globally for all subsequent targets you define.
# # tar_option_set(envir = environment())
# # data <- tar_target(target_name, get_data(!!n_rows), tidy_eval = FALSE)
# # print(data)
# # tar_option_reset() # Reset all target options you previously chose
#
# # use local multicore computing when running tar_make_clustermq().
# options(clustermq.scheduler = "multisession")
# future::plan(future::multicore) # or multisession, for tar_make_future()
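#
# # A possible invocation sketch for local parallel execution (worker counts
# # are illustrative; requires the clustermq or future packages respectively):
# options(clustermq.scheduler = "multisession")
# targets::tar_make_clustermq(workers = 2)
# future::plan(future::multisession)
# targets::tar_make_future(workers = 2)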
#
# 3. Globals: load custom functions and global objects into memory. ----
# #   calls to source() defining user-defined functions
# list.files("R", pattern = "\\.R$", recursive = FALSE, ignore.case = TRUE,
# full.names = TRUE) %>% walk(source)
fs::dir_ls('R', regexp = "\\.R$") %>%
  str_subset("plan[.]R|_targets_[.]+.R", negate = TRUE) %>%
  # lapply(source)
  walk(source)
# global functions from jbg_utils
# fs::dir_ls(
#   file.path(here::here(), '..', 'jbg_utils', 'R'),
#   regexp = "\\.R$") %>%
#   str_subset("plan[.]R", negate = TRUE) %>%
#   walk(source)

# rel_fp <- file.path('.')
# (fp <- file.path(rel_fp, 'data_raw'))
# dir.exists(fp) %>% stopifnot


plan_fdata = tar_plan( # TODO: move blocks into functions ----

  season_starts  = {
    first_season_yr <-  14
    first_season_yr:current_season() },

  # create a new local board
  #
  # TODO: https://www.kaggle.com/paololol/league-of-legends-ranked-matches?select=matches.csv
  # https://www.kaggle.com/fortyeth/analyzing-win-attributes-in-league-of-legends
  # https://www.kaggle.com/gulsahdemiryurek/let-s-predict-league-of-legends-match-score
  # https://www.kaggle.com/search?q=league+of+legends
  # https://datasetsearch.research.google.com/search?query=league%20of%20legends&docid=HwcmHeE3TNPOdMwAAAAAAA%3D%3D
  #
  board_name = c('football_data', 'local')[2],
  fp_cache  = file.path(".", "data", "pins"), # options(pins.path = "./.pins_cache")
  create_board = { # ----
    # cache the data locally, in a subfolder, for git to store it.
    if (!dir.exists(fp_cache)) dir.create(fp_cache, recursive = TRUE, showWarnings = FALSE)
    # TODO: get 'local' cache, copy 'data.txt' to 'football-data'
    #if (!dir.exists("./data/pins/football-data/1415"))
    #  dir.create("./data/pins/football-data/1415", recursive=TRUE, showWarnings = FALSE)
    # TODO: add branch?
    board_register_local(board_name, cache = fp_cache)
    # configure default board ----
    # options(pins.board = )
    # board_get(pins_cache)
    # board_list() # board_default() ;
    # pin_find(board = board_name)
    # pin_info(name = '1415' , board = board_name)
    # pin_log(board = board_name)
    #
    # urll <- 'https://www.football-data.co.uk/mmz4281/1415/data.zip'
    # pin_get(name = urll %>% extract_name(), board = board_name)
    # pin_get(name = '1415' , board = board_name) %>% str()
    #
    # pin_remove(name = urll %>% extract_name(), board = board_name)
    # pin_remove(name = "15-16", board = c(board_name, "local")[2])
    #
    # board_deregister(name = c(board_name, "local")[1])
    # options(pins.board = NULL) # revert default board
    #
    # pins:::pin_find_empty()
    # pin_default_name(x = urll %>% extract_name(), board = board_name)
    # pins:::board_pin_create.local(board = board_name,
    #   name = extract_name(urll))
    #   urll = url_base)
    #
    #    # read_csv( pin(urll, name = extract_name(urll) , board = board_name) )
    #     urll <- "https://raw.githubusercontent.com/facebook/prophet/master/examples/example_retail_sales.csv"
    #    read_csv(pin(urll)) # NB: does it read into local by default?
    #    read_csv(pin(urll, name = "1415" ) )

  },

  # download football-data as a tibble ----
  # NB: force the target to call get_fdata(); pins then decides whether the data needs to be re-downloaded,
  #   i.e. the target cannot know if the url content has changed - only pins can, by checking.
  tar_force(name = fdata, command = {
    raw <- season_starts %>% get_fdata(board_name)
    raw %>% transform_fdata() } , force = TRUE),
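  # A rough sketch of the pins caching assumed inside get_fdata() (the helper's
  # internals are project code, so this is illustrative only):
  #   urll <- 'https://www.football-data.co.uk/mmz4281/1415/E0.csv'
  #   pth  <- pin(urll, name = extract_name(urll), board = board_name) # re-downloads only when the remote file changes
  #   raw  <- read_csv(pth, col_types = cols(.default = "c"))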
  # fdata = {
  #   raw <- season_starts %>% get_fdata(board_name)
  #   raw %>% transform_fdata() } ,
  # fdata %>% str(max.level = 1)
  # fdata %>% head(c(4, 3))
  #
  # usethis::use_data(fdata, overwrite = TRUE) # use that data!
  pins_path = "data/pins",
  pin_fdata = board_register_github(name = "football_data_github",
    repo = "JohnGavin/fdata", branch = 'master', # token = NULL, # use GITHUB_PAT
    path = pins_path, host = "https://api.github.com", cache = fp_cache, # board_cache_path(),
    description = glue("Source: football-data.co.uk. ",
      "Football-Data is a free football betting portal ",
      "providing historical results & odds for many years of data.")
    ),
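  # A hedged sketch of publishing the tidied table to / reading it back from
  # that GitHub board (the pin name "fdata" is illustrative, not an existing pin):
  #   pin(fdata, name = "fdata", board = "football_data_github",
  #     description = "Tidied football-data.co.uk results")
  #   pin_get(name = "fdata", board = "football_data_github") %>% glimpse()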


  tar_target(fo_fdata, { # write a file then return a path
    fo = './data/football_data_co_uk.csv.gz'
    fdataa <- fdata %>% arrange(desc(datee)) # cannot assign to fdata because it is a target
    fdataa %>% write_csv(fo)
    # fdataa %>% write_rds(fo %>% str_replace('csv\\.gz$', 'rds'))
    # fdataa %>% reticulate::py_save_object(fo %>% str_replace('csv\\.gz$', 'pkl'))
    fo}, format = "file") ,# , deployment = "main"

  chk_data_recent = { # check data is recent ----
    (most_recent <- fdata %>%
      summarise(datee = datee %>% max(na.rm = TRUE), .groups = 'keep') %>%
      pull(datee))
    (today() - most_recent < period(day = 5)) %>% stopifnot() },
    # assert_that(today() - most_recent < period(day = 5)) %>% stopifnot() },
  chk_datee_na =
    (fdata %>% filter(is.na(datee)) %>% nrow %>% `==`(0)) %>% stopifnot,
    # assert_that(fdata %>% filter(is.na(datee)) %>% nrow %>% `==`(0)) %>% stopifnot,

  divs_dates_recent = # most recent match dates by Div - older first
    fdata %>% group_by(Div) %>%
    summarize(across(where(is.Date), max, na.rm = TRUE), .groups = 'keep') %>%
    ungroup() %>%
    arrange(datee) ,
  # fdata %>% # A named list of functions ----
  #   group_by(Div) %>%
  #   summarise(across( class(everything()) %>% contains('date|time') ,
  #     list(min = min, max = max), .names = "{.col}.fn{.fn}"), .groups = 'keep')
  top_divs_dates_recent =
    fdata %>% # pull(Div) %>% table
    dplyr::filter(Div %in% c('E0', paste0(c('D', 'F', 'I', 'SP'), 1) ) ) %>%
    group_by(Div) %>%
    summarize(most_recent_match = max(datee)) %>%
    arrange(desc(most_recent_match)),

  raw_csv_list = fp_cache %>% fs::dir_ls(regexp = "\\.csv$", recurse = 2) ,

  gg_dat_miss = fdata %>%
    sample_n(size = 2e3) %>%
    arrange(desc(datee)) %>%
    visdat::vis_miss(warn_large_data=FALSE) +
    theme(axis.text.x = list(element_text(size=6, angle=60), element_blank())[[1]]),

  # see `tar_render()` function in https://wlandau.github.io/tarchetypes
  # and https://wlandau.github.io/targets-manual/files.html#literate-programming
    # WARNING: do NOT assign the result of tar_render(), i.e. abc = tar_render(...)
    #   The target README then depends on any tar_read()/tar_load() objects referenced in the Rmd file at `path`.
  tar_render(README, path = "./README.Rmd", quiet = FALSE,
    params = list(pins_path = pins_path) )
  # which runs rmarkdown::render("report.Rmd", params = list(your_param = your_target))
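  # Inside README.Rmd the report can then pull pipeline outputs, roughly as below
  # (chunk contents sketched here; the real chunks live in README.Rmd itself):
  #   params$pins_path                 # passed via the params list above
  #   targets::tar_load(fdata)         # registers README's dependency on fdata
  #   targets::tar_read(gg_dat_miss)   # or read a single target directly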

) # plan_fdata

### Scrape a webpage ----
# library(polite)
# library(rvest)
# library(progress)
#
#   urll <- c("https://www.football-data.co.uk/data.php",
#     "https://www.football-data.co.uk/englandm.php")[2]
#   session <- bow(urll)
#   current_page <- nod(session, urll) %>%
#     scrape(verbose=TRUE)
#
#   # First find all the urls
#   table_urls <- current_page %>%
#     html_nodes("a") %>%
#     html_attr("href") %>%
#     xml2::url_absolute(urll) %>%
#     tibble(urll = .)
#   table_urls %>% bind_cols(
#     table_urls %>% select(urll) %>%
#       map(str_match, '/([0-9]{4})/([a-zA-Z0-9]+\\.csv)') %>%
#       `[[`(1) %>% `[`(, -1) %>%
#       structure(., dimnames = list(NULL, c('season', 'Div'))) %>%
#       as_tibble() ) %>%
#     mutate(Divv = Div %>% str_replace('.csv$', '')) %>%
#     select(-Div) %>%
#     filter(!is.na(season), season > "1000", season < "2030") %>%
#     distinct() %>%
#     head(20) ->
#     table_urls
#
#   pb <- progress_bar$new(
#     format = paste0("Downloading [:bar] :current/:total (:percent) ",
#     "eta: :eta elapsed: :elapsed ",
#     "rate: :rate bytes: :bytes  what: :what") ,
#     total = nrow(table_urls), clear = FALSE, width= 60)
#   # pb$tick(0) # initialise progress.
#
#
#   system.time(
#     table_urls %>%
#       mutate(csv = urll %>%
#         map( read_csv,
#           col_types = cols(.default = "c"), locale = locale(encoding = "latin1"))
#       ) ->
#       table_csvs
#   )
#   table_csvs$csv[[1]]
#   table_csvs %>%
#     unnest(csv) -> # %>%
#     table_csv
#   glimpse(table_csv)
#   # confirm Divv == Div then remove Divv
#   assert_that( table_csvs %>% summarize(chk = all(Divv == Div)) %>%
#       pull(chk) ) %>% stopifnot()
#   table_csv %>% select(-Divv) %>%
#     type.convert(as.is = TRUE)
#
#
#
#   system.time( #
#     table_csvs <- table_urls %>%
#       filter(urll %>% str_ends('csv')) %>%
#       mutate(csv = urll %>% map_dfr(read_csv, col_types = cols()) %>% list())
#   )
#   # https://www.football-data.co.uk/mmz4281/9394/E1.csv
#   table_csvs %>% select(csv) %>%
#
#   while(!is.na(urll)){
#     # make it verbose
#     message("Scraping ", urll)
#     # nod and scrape
#     current_page <- nod(session, urll) %>%
#       scrape(verbose=TRUE)
#     # extract post titles
#     hrbrmstr_posts <- current_page %>%
#       html_nodes( c("nth-child", ".entry-title a")[1]) %>%
#       polite::html_attrs_dfr() %>%
#       rbind(hrbrmstr_posts)
#     # see if there's "Older posts" button
#     urll <- current_page %>%
#       html_node(".nav-previous a") %>%
#       html_attr("href")
#   } # end while loop
#
#   tibble::as_tibble(hrbrmstr_posts)


# plan_04 = tar_plan( # ----
#
#   # plan_04 inputs
#   # fdata - b_data_past_filters__pinn_feat_results_stk.csv.gz
#   # odds - b_data_past_filters__ps_op_2_cl_stk.csv
#   # us_match_stats_mapped_fd - us_match_stats_mapped_fd.rds
#   # fd_fb - merged_football-data_fbref_probs_xg.csv.g
#   #
#   # plan map_lge_teams inputs
#   # fb_match_odds_mapped_us - fb_match_odds_mapped_us.rds (better than 'odds')
#
#   # pivot to one row per match
#   # Align team by dates
#   # Algo to map league/team names
#   # Outline of team mapping steps
#   # Restrict to common dates between the two sources
#   # Dates where only one match is played
#
#   # merge fb odds (fb_match_odds_mapped_us) to fb match stats (fd_fb)
#   #   fb_match_odds_mapped_us (34k) x fd_fb (24k, 1 rpm)
#   #   fb_odds_wide_map_us (5k)
#   # merge fb odds & match stats to understat us_match_stats
#
#   # fb_match_odds_mapped_us.rds (long format)
#   #   ALL odds (incl tg and AH)
#   #   (34k rows but in long format so really just 2k rows cos 2 seasons
#   #   for 19/20 and 20/21 only
#   #     fb_match_odds_mapped_us %>% pull(Date) %>% range()
#   #   and act (Pinnacle) v exp (Poisson) goals
#   #     NOT 1x2 odds only (not tg and AH) and act (Pinnacle) v exp (Poisson) goals
#   fb_match_odds_mapped_us =
#     read_rds('../../data/understatr/fb_match_odds_mapped_us.rds') ,
#   # (fb_match_odds_mapped_us) %>% pull(date) %>% range()
#   # fd has 10 leagues. us_match_stats_mapped_fd had 5.
#   # So us_home, us_away and league_names has nans.
#   # plot_fb_match_odds_mapped_us = vis_miss(fb_match_odds_mapped_us) ,
#   # fb_match_odds_mapped_us %>% select(date) %>% drop_na() %>% pull() %>% range()
#
#   # fb_match_odds_mapped_us %>% select(contains('date'))
#   fb_odds_wide_map_us = fb_match_odds_mapped_us %>%
#     select(-mm) %>% # filter(mm %in% c('PS', 'P'))
#     pivot_wider(id_cols = -c('bet', 'type', 'C', 'exp', 'act'),
#       names_from = c('bet', 'type'),
#       values_from = c('C', 'exp', 'act')) ,
#   # Amelia::missmap
#   plot_fb_odds_wide_map_us = vis_miss(fb_odds_wide_map_us) ,
#   # FIXME: why is expected AH == expected TG? But for actual?
#   # fb_odds_wide_map_us %>% mutate(
#   #     fixme_A = (exp_A_ah - exp_lt2.5_tg) / exp_lt2.5_tg,
#   #     fixme_H = (exp_H_ah - exp_gt2.5_tg) / exp_gt2.5_tg,
#   #     .after = exp_gt2.5_tg) %>%
#   #   summarise(A_max = max(fixme_A), A_min = min(fixme_A),
#   #     H_max = max(fixme_H), H_min = min(fixme_H), .groups = 'keep')
#
#
#   # # football-data odds contains all team names in football-data / fbref.com
#   # c(fb_odds_wide_map_us$home, fb_odds_wide_map_us$away) %>%
#   #   setdiff(c(fd_fb$HomeTeam, fd_fb$AwayTeam))
#   # # FIXME: so why is the converse missing 51 team names (minor team names)?
#   # c(fd_fb$HomeTeam, fd_fb$HomeTeam) %>%
#   #   setdiff(c(fb_odds_wide_map_us$home, fb_odds_wide_map_us$home) )
#   # # fb_odds_wide_map_us$home %>% setdiff(fd_fb$HomeTeam)
#
#   # names(fb_odds_wide_map_us) %>% intersect(names(fd_fb))
#   # fb_odds_wide_map_us %>% select(contains('date') ); fd_fb %>% select(contains('date'))
#   fd_fb_us = fb_odds_wide_map_us %>% left_join(fd_fb,
#     by = c('Date' = 'datetime', 'home' = 'HomeTeam', # team_h',
#       'away' = 'AwayTeam', 'season', 'Div', 'future',
#       "xg_H", "xg_A" # why is "xg_H", "xg_A" in both?
#     ) ) ,
#   plot_fd_fb_us = vis_miss(fd_fb_us, warn_large_data=FALSE) ,
#   # 2nd div leagues are all nan in understat columns
#   tmp <- fd_fb_us %>% filter(!((Div %>% str_detect('2$')) | (Div == 'E1'))) %>%
#     select( -c(names(fb_odds_wide_map_us))) %>%
#     filter( rowSums(is.na(.)) != ncol(.)) ,
#   vis_miss(tmp, warn_large_data=FALSE) , # skimr::skim(tmp)
#   tmp <- tmp %>% summarize(across(everything(), ~ all(is.na(.)))) %>% unlist
#   fd_fb_us_2 = fd_fb_us %>%
#     filter(!((Div %>% str_detect('2$')) | (Div == 'E1'))) %>%
#     select(-c(names(fb_odds_wide_map_us), names(tmp[tmp])))
#   vis_miss(fd_fb_us_2, warn_large_data=FALSE) , # skimr::skim(tmp)
#
#   # all date/date_time columns should be upto date
#   conflict_prefer("first", "dplyr") ; conflict_prefer("last", "dplyr")
#   fd_fb_us %>% summarise( across(
#     where(lubridate::is.POSIXt) | where(lubridate::is.Date),
#     max, na.rm = TRUE) ) %>% t # OR
#   fd_fb_us %>% select(contains('date') | contains('time')) %>%
#     summarise(across(everything(), max, na.rm = TRUE), .groups = 'keep') %>%
#     # summarise_all( max, na.rm = TRUE) %>% min() # summarise_all deprecated
#     # .fns = list(first = first, last = last) ) )
#
#   # NB: fd_fb assumes that ./notebook/01_merged_data/drake.R has updated
#   #   all local csvs in ./data/01_merged_data/*.csvs from _private_ github.
#   fd_fb %>% select(date_only) %>% drop_na() %>% pull() %>% max()
#   fd_fb %>% select(ldn_time) %>% drop_na() %>% pull() %>% max()
#
#   # FIXME: us_match_stats_mapped_fd has two rows per match and many columns are not clear.
#   #   e.g. forcasts are for 'home' team but not labelled.
#   tar_load(us_match_stats_mapped_fd)
#   us_1_rpm = # one row per match
#     us_match_stats_mapped_fd %>% arrange(Div, datetime, h.title)
#   fd_fb %>% select(contains('date'))
#   us_match_stats_mapped_fd %>% select(contains('date'))
#
#
#   #   equivalent to fb_match_odds ( == match_ids_probs_team_stats.rds )
#   #   but with map to understat
#   # match_ids_probs_team_stats =
#   #   read_rds('../../data/understatr/match_ids_probs_team_stats.rds') %>%
#   #   arrange(desc(isResult), desc(date), league_name, match_id, side),
#   # two sides for every match.
#   # match_ids_probs_team_stats %>% group_by(match_id) %>% filter(n() != 2)
#   # match_ids_probs_team_stats %>% filter(isResult) %>% pull(datetime) %>% range
#
#   # See also event dataset - shots_by_match_xg_probs_team_stats.csv.gz
#   #   650k rows - all shots for each side per match
#   #   a superset of shots_by_match_xg.csv.gz
#
# )

# plan_04 = tar_plan( # ----
#
#   # plan_04 inputs
#   # fdata - b_data_past_filters__pinn_feat_results_stk.csv.gz
#   # odds - b_data_past_filters__ps_op_2_cl_stk.csv
#   # us_match_stats_mapped_fd - us_match_stats_mapped_fd.rds
#   # fd_fb - merged_football-data_fbref_probs_xg.csv.g
#   #
#   # plan map_lge_teams inputs
#   # fb_match_odds_mapped_us - fb_match_odds_mapped_us.rds (better than 'odds')
#
#   # fdata - b_data_past_filters__pinn_feat_results_stk.csv.gz
#   tar_target(fi_fdata, file.path(fp, # pins::pin
#     'b_data_past_filters__pinn_feat_results_stk.csv.gz'),
#     format = "file"), # , deployment = "main"
#   fdata_raw = read_data(fi_fdata),
#   fdata = tidy_fdata(fdata_raw),
#   # TODO: is pins just for input data?
#   tar_target(fo_fdata, # write a file then return a path
#     { fo = fi_fdata %>% str_replace('csv\\.gz$', 'rds')
#     fdata %>% write_rds(fo)
#     fo}, format = "file") ,# , deployment = "main"
#
#   # fd_fb - merged_football-data_fbref_probs_xg.csv.g
#   tar_target(fi_fd_fb, file.path(fp, '01_merged_data',
#     'merged_football-data_fbref_probs_xg.csv.gz'),
#     format = "file"), # , deployment = "main"
#   # fi_fd_fb <- file.path(fp, '01_merged_data', 'merged_football-data_fbref_probs_xg.csv.gz')
#   # football.co.uk (match stats but not odds) and fbref.com (summary table)
#   # FIXME: get_local_csv mutates some cols so is not a general function
#   #   strip out mutation step as seperate function.
#   # NB: fd_fb assumes that ./notebook/01_merged_data/drake.R has updated
#   # local csvs in ./data/01_merged_data/*.csvs from _private_ github.
#   fd_fb = get_local_csv(fn = basename(fi_fd_fb), fp =  dirname(fi_fd_fb)) %>% rename(datetime = date) ,
#
#   # odds - b_data_past_filters__ps_op_2_cl_stk.csv
#   tar_target(fi_odds, file.path(fp, # pins::pin
#     'b_data_past_filters__ps_op_2_cl_stk.csv'),
#     format = "file"), # , deployment = "main"
#   odds = read_odds(fi_odds),
#
#   # us_match_stats_mapped_fd - us_match_stats_mapped_fd.rds
#   tar_target(fi_us, file.path(fp, 'understatr', 'us_match_stats_mapped_fd.rds'), format = "file"), # , deployment = "main"
#   # fi_us <- file.path(fp, 'understatr', 'us_match_stats_mapped_fd.rds')
#   us_match_stats_mapped_fd = read_rds(fi_us) %>% filter(isResult) %>%
#     arrange(desc(isResult), desc(date), datetime, league_name, match_id, h.title) %>%
#     filter(isResult) %>%
#     select(!contains('id$')) %>%
#     select(dplyr::matches('name|xg|xp|forecast|date|title|season|goal'),
#       deep.h:ppda_allowed.def.h) %>%
#     # TODO: add week of season.
#     # TODO: xG v xG.h & xGA v xG.a
#     # TODO: Pinn tg <> 1.5 v actual result calibration fd_fb
#     mutate(hr = datetime %>% lubridate::hour(),
#       day = datetime %>% lubridate::day(),
#       mth = datetime %>% lubridate::month(), .after = datetime) %>%
#     rename(date_only = date) %>%
#     # select(!dplyr::matches('date')) %>%
#     mutate(tg = goals.h + goals.a, gd = goals.h - goals.a, .after = goals.a) ,
#   # us_match_stats_mapped_fd.rds - a superset of us_match_stats.csv.gz/rds
#   plot_us_match_stats_mapped_fd = vis_miss(us_match_stats_mapped_fd ) ,
#
#   # merge in FTR
#   #   train_data = !Season %in% last_2_seasons
#   train_test = merge_FTR_odds(odds, fdata),
#
#   # Apply steps to the training data. via the `recipes` package:
#   train_prep = recipe_prep(train_test$train_data),
#   # bake these test/train steps using this recipe
#   train_bake = train_prep %>% bake(new_data = train_test$train_data),
#   test_bake  = train_prep %>% bake(new_data = train_test$test_data),
#   # Pinnacle closing highest probabilty v actual outcome
#   # TODO: repeat for understat forecast probabilities.
#   pred_lst = predictions(train_test$test_data), # pred_lst %>% names
#   plot_all_calib = get_plot_all_calib_frame(pred_lst$all_calib_frame),
#   train_control = trainControl(
#     method = c("timeslice", "adaptive_cv")[2],
#     summaryFunction = multiClassSummary, classProbs = T),
#   #### SLOOOOW - method="adaptive_cv"
#   tuneLength = 30,
#   fit_glmnet = train( FTR ~ . ,
#     family="multinomial", method="glmnet",
#     data = train_bake, #  %>% head(1e1),
#     # tuneLength = 30 => seconds  697.009 => plot _much_ better!
#     # tuneLength = 40 => seconds 1339.54 == 22 mins => plot _much_ better!
#     tuneLength = tuneLength, trControl=train_control), # 50
#   tar_target(fo_fit_glmnet, # fo - write the file then return a path
#     {fo <- paste0('fit_glmnet_', tuneLength, '.rds')
#     fit_glmnet %>% write_rds(fo)
#     fo},
#     format = "file"), # , deployment = "main"
#   pred_glmnet = predict(fit_glmnet, newdata = test_bake),
#   # str(pred_glmnet, max.level = 1)
#   pr_pred_glmnet = predict(fit_glmnet, newdata = test_bake, type = "prob"),
#   # str(pr_pred_glmnet, max.level = 1)
#   # expect match odds to be the main variable driver else model is bad
#   plot_varImp = get_plot_varImp(fit_glmnet),
#   # increase to tuneLength = 50 ? 20 -> 30 is a long wait!
#   plot_pred_calib = get_plot_pred_calibration(
#     pr_pred_glmnet, pred_lst$all_calib_frame, test_bake),
#   table_pred = get_table_pred(pred_lst, pred_glmnet, test_bake),
#   strategy = get_strategy(train_test$test_data, pr_pred_glmnet),
#
#   # see `tar_render()` function in https://wlandau.github.io/tarchetypes
#   # and https://wlandau.github.io/targets-manual/files.html#literate-programming
#   # fo_04_recipes_calibrate_PS_odds.Rmd =
#   tar_render(name = report,
#     path = "./04_recipes_calibrate_PS_odds.Rmd",
#     params = list(your_param = "test var"), quiet = TRUE )
#   # which runs rmarkdown::render("report.Rmd", params = list(your_param = your_target))
#
#   # mod <- logistic_reg(penalty = tune(),
#   #   mixture = tune()) %>%
#   #   set_engine("glmnet")
#   # df = train_bake %>%
#   #   pivot_longer(cols = PSCH:PSCA, names_to = 'hda',
#   #     values_to = 'std_pred') %>%
#   #   relocate(hda, std_pred, .after = FTR),
#   # glmn_fit_h =
#   #   linear_reg(penalty = 0.001, mixture = 0.5) %>%
#   #   set_engine("glmnet") %>%
#   #   parsnip::fit(unclass(FTR) ~ ., data = df),
#   # glmn_fit_d =
#   #   linear_reg(penalty = 0.001, mixture = 0.5) %>%
#   #   set_engine("glmnet") %>%
#   #   parsnip::fit(PSCD ~ ., data = train_bake),
#   # glmn_fit_a =
#   #   linear_reg(penalty = 0.001, mixture = 0.5) %>%
#   #   set_engine("glmnet") %>%
#   #   parsnip::fit(PSCA ~ ., data = train_bake)
#
#
# )
#
# parse_trending_package_readms <- tar_plan( # ----
#   # https://blog.r-hub.io/2019/12/03/readmes/#tools-for-writing-and-re-using-content
#   # Simplified drake-like syntax for targets pipelines.
#   # https://blog.r-hub.io/2019/12/03/readmes/#tools-for-writing-and-re-using-content
#   # merging the lists of top downloaded and trending CRAN packages
#   pkglist = get_pkglist() ,
#   # again with pkgsearch, we’ll extract their metadata,
#   # before keeping only those that have a GitHub README
#   meta = get_meta(pkglist) ,
#   # extract their preferred README from the GitHub V3 API.
#   # Some of them won’t even have one so we’ll lose them from the sample
#   # NB: memoise::memoise NB: use pins?
#   readme_lines = get_readme_lines(meta) ,
#   # extract is the number of lines of the README
#   count_lines = get_count_lines(readmes = readme_lines) ,
#   plot_count_lines = get_plot_readme_lines( count_lines) ,
#   # look into other indicators of size: the number of lines of R code, the number of words outside of code and output.
#   # without using too many regular expressions, we shall convert the Markdown content to XML via commonmark and use XPath to parse it.
#   xml_readme_lines = get_xml_readme_lines(count_lines) ,
#   # count lines of code
#   count_lines_xlm = get_count_lines_xlm(xml_readme_lines) ,
#   # words in text?
#   words_in_text = get_words_in_text(xml_readme_lines) ,
#   # extract headers
#   structuree = get_headers(xml_readme_lines) ,
#   print_readme_structuree =
#     capture.output( get_print_readme_structure(
#       structure = structuree$structuree[[4]][[1]] ) ) ,
#   # most common headers
#   most_common_headers = get_most_common_headers(structuree)
# )
#
# recipe_glmnet <- tar_plan( # ----
#   tar_target(
#     glmn_rec,
#     recipe(mpg ~ ., data = mtcars) %>%
#       step_normalize(all_predictors())
#   ),
#   tar_target(
#     mod,
#     linear_reg(penalty = 0.1, mixture = 1) %>% set_engine("glmnet")
#   ),
#   tar_target(glmn_wflow,
#     workflow() %>%
#       add_model(mod) %>%
#       add_recipe(glmn_rec)
#   ),
#   tar_target(
#     glmn_fit,
#     glmn_wflow %>%
#       fit(data = mtcars)
#   ),
#   tar_target(
#     coeff,
#     glmn_fit %>%
#       pull_workflow_fit() %>%
#       pluck("fit") %>%
#       coef(s = 0.1)
#   )
# )
#
# hist_fit <- tar_plan( # ----
#   # individual tar_target() objects or _nested lists_ of tar_target() objects.
#   tar_target(
#     name = raw_data_file, command = "data/raw_data.csv",  format = "file",
#     deployment = "main"
#   ),
#   tar_target(raw_data, read_csv(raw_data_file, col_types = cols()),
#     # relevant to tar_make_clustermq() and tar_make_future().
#     # "worker": target builds on a parallel worker.
#     # "main":   target builds on the host-machine managing the pipeline
#     deployment = "main"
#   ),
#   tar_target(data3, tibble(y = letters, x = seq_len(length(y)) ) ),
#   tar_target(data2, I(24) ), # { data.frame(x = seq_len(26), y = letters) } ),
#   # tar_target(sdfdas, I(xx()) ), # { data.frame(x = seq_len(26), y = letters) } ),
#   tar_target(data, raw_data %>%
#       mutate(Ozone = replace_na(Ozone, mean(Ozone, na.rm = TRUE)))
#   ),
#   tar_target(hist, create_plot(data)),
#   tar_target(fit, biglm(Ozone ~ Wind + Temp, data)),
#   # see `tar_render()` function in https://wlandau.github.io/tarchetypes
#   # and https://wlandau.github.io/targets-manual/files.html#literate-programming
#   fo_report =
#     tar_render(name = report, path = "./report.Rmd",
#       params = list(your_param = data), quiet = FALSE )
#   # which runs rmarkdown::render("report.Rmd", params = list(your_param = your_target))
#
# )
#
# plan_dt = tar_plan( # ----
#   # https://github.com/rasmusab/tidyverse-in-a-table/blob/master/tidyverse_in_a_table.R
#   tidyverse_in_a_table = get_tidyverse_in_a_table() ,
#   widget_tidyverse_in_a_table = DT::saveWidget(tidyverse_in_a_table,
#     "tidyverse_in_a_table_standalone.html")
# )
#
#
#
# return plan ----
# parse_trending_package_readms
# plan_dt
# hist_fit
# recipe_glmnet
# tar_pipeline( list(recipe_glmnet = recipe_glmnet , hist_fit = hist_fit ) )
# tar_plan( recipe_glmnet = recipe_glmnet , hist_fit = hist_fit )
plan_fdata