# https://wlandau.github.io/targets/reference/index.html ----
# tar_edit() - edit _this_ (_targets.R) file
# tar_renv()
# tar_render() Render a dependency-aware R Markdown report.
# tar_knit() Run a dependency-aware knitr report.
# tar_change() Always run a target when a custom object changes.
# tar_force() Always run a target when a custom condition is true.
# tar_suppress() Never run a target when a custom condition is true.
# tar_plan()
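# A minimal hedged example of tar_plan()'s drake-like syntax (names and commands are placeholders):
# tar_plan(
#   x = get_data(),            # `name = command` shorthand
#   tar_target(y, summary(x))  # plain tar_target() calls are also allowed
# )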
#
# tar_script() # Write a _targets.R script to the current working directory.
# e.g. tar_script({ tar_pipeline(...) })
# Every targets project requires a target script in the project root.
# The target script must always be named _targets.R.
# Every _targets.R file should run the following steps in the order below:
# 1. Package: load the targets package. ----
# This step is automatically inserted at the top of _targets.R files
# produced by tar_script() if library_targets is TRUE,
# so you do not need to explicitly include it in code.
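# A minimal sketch of that canonical layout (illustration only - the real steps follow below;
# file paths and target names here are placeholders):
# library(targets)                                # 1. packages
# tar_option_set(packages = c("dplyr", "readr"))  # 2. options
# source("R/functions.R")                         # 3. globals (hypothetical helper file)
# list(                                           # 4. targets: the script's last value is the pipeline
#   tar_target(raw, read_csv("data/raw.csv")),    #    placeholder targets
#   tar_target(clean, mutate(raw, ok = TRUE))
# )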
options(warnPartialMatchArgs = FALSE)
# options(tidyverse.quiet = TRUE) # not needed because suppressPackageStartupMessages() is used below
suppressPackageStartupMessages({
# library(renv) ; # ----
# renv::init(bare = TRUE)
# # disable automatic snapshots
# auto.snapshot <- getOption("renv.config.auto.snapshot")
# options(renv.config.auto.snapshot = FALSE)
# renv::install(pkgs[-c(1:2)]) ; .libPaths()
# renv::install(c('pins', 'conflicted','visdat'))
# renv::settings$ignored.packages() ; renv::settings$snapshot.type("all")
# renv::install("devtools") # renv::install("digest@0.6.19")
# devtools::install_github("ropensci/tarchetypes")
# renv::restore() # revert to the previous state as encoded in the lockfile
# renv::snapshot() #
# renv::use_python() # https://blog.rstudio.com/2019/11/06/renv-project-environments-for-r/#Integration_with_Python
# renv::deactivate() ; renv::activate() # removes the renv auto-loader from the project .Rprofile
# renv::install("rstudio/renv") # renv::install(type = "binary")
# TODO: install shiny to see whether it displays the missing tables below.
pkgs <- c('targets', 'tarchetypes', # ----
'renv', 'dplyr', 'readr', 'tibble', 'rmarkdown', 'stringr', # stringr::str_subset # 'tidyverse',
'purrr', 'tidyr', 'ggplot2', # purrr::walk and purrr::map tidyr::unnest tidyr::drop_na
# 'conflicted', # 'logger', "here", 'assertthat',
'fs', # 'tidymodels', # omitted: tidymodels would pull in 12 more packages
# jbg_utils 'ratelimitr', 'memoise',
# polite sessions (httr and rvest), policies (robotstxt),
# rate-limiting and response caching (ratelimitr and memoise).
# 'reticulate::py_save_object',
'visdat', # 'visNetwork/htmlwidgets', 'naniar', # vis_miss
'details', 'reticulate', # readme tabs
'lubridate', 'gt', 'pins', 'glue'
# 'backports', 'checkmate', 'commonmark', 'rappdirs', 'sass', 'Rcpp', 'filelock', 'zip'
)
# 'polite', # https://dmi3kno.github.io/polite/
# 'future', 'reticulate',
# 'data.table', 'recipes', 'caret', 'glmnet'
# 'parsnip', # fit
# renv::snapshot(packages = pkgs, prompt = TRUE) # WARNING: 'packages' is critical
# WARNING: purrr::walk()/base::lapply(pkgs, library, ...) fails with targets
# https://github.com/rstudio/renv/issues/143
# renv::status()
base::sapply(pkgs, library, character.only = TRUE, quietly = TRUE,
logical.return = TRUE, warn.conflicts = FALSE) # %>% print()
# purrr::walk(pkgs, library, character.only = TRUE)
# renv::install('pacman') ;library(pacman) ; p_load(pkgs)
})
# 2. Options: tar_option_set() defaults for targets-specific settings ----
# # such as the names of required packages.
# # Uncomment below to deploy targets to parallel jobs when running tar_make_clustermq().
tar_option_set(tidy_eval = TRUE,
packages = pkgs,
error = "workspace" # on error, find the workspace image file in `_targets/workspaces/`.
# (Or tar_option_set(workspace = c("target1", "target2")) - always save workspaces for specific targets.)
# tar_meta(fields = error) %>% na.omit() ; list.files("_targets/workspaces")
# tar_workspace(<<failed target name>>) # load the environment of the failed target command to reproduce the error
# tar_undebug() # remove (large) workspace files
# debug() debugonce() undebug() browser() # debugging utilities
# https://rstats.wtf/debugging-r-code.html # interactive debugging
# debugonce(<<function>>) or Shift+F9 to set a breakpoint (== browser()); 'where' prints the call stack, 'n' runs the next line, 's' steps into a call, 'c' continues to the next breakpoint, 'Q' exits
# Or debug interactively while `tar_make()` runs:
# tar_option_set(debug = "<<target name>>", cue = tar_cue(mode = "never")) then tar_make(callr_function = NULL) in the R console
)
#
# suppressMessages({
# conflict_prefer("pluck", "rvest")
# conflict_prefer("select", "dplyr")
# conflict_prefer("col_factor", "readr", "scales")
# conflict_prefer("spec", "readr", "yardstick")
# conflict_prefer("discard", "purrr", "scales")
# conflict_prefer("collapse", "dplyr", "glue")
# conflict_prefer("filter", "dplyr", "stats")
# conflict_prefer("lag", "dplyr", "stats")
# conflict_prefer("fixed", "recipes", "stringr")
# })
# tar_option_set ----
# How to pass this setting?
# TODO: globalenv() not needed?
# tar_option_set(envir = globalenv() )
# tar_option_get("envir")
#
# # Even if you have no specific options to set,
# # call tar_option_set() to register the proper environment.
# # ending with a call to tar_pipeline().
# # tar_option_set() # set packages globally for all subsequent targets you define.
# # tar_option_set(envir = environment())
# # data <- tar_target(target_name, get_data(!!n_rows), tidy_eval = FALSE)
# # print(data)
# # tar_option_reset() # Reset all target options you previously chose
#
# # use local multicore computing when running tar_make_clustermq().
# options(clustermq.scheduler = "multisession")
# future::plan(future::multicore) # or multisession
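# A hedged sketch of then launching the pipeline on those local workers (worker counts are illustrative):
# tar_make_clustermq(workers = 2) # uses options(clustermq.scheduler = "multisession") above
# tar_make_future(workers = 2)    # uses the future::plan() above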
#
# 3. Globals: load custom functions and global objects into memory. ----
# # calls to source() defining user-defined functions
# list.files("R", pattern = "\\.R$", recursive = FALSE, ignore.case = TRUE,
# full.names = TRUE) %>% walk(source)
fs::dir_ls('R', regexp = "\\.R$") %>%
str_subset("plan[.]R|_targets_[.]+.R", negate = TRUE) %>%
# lapply(source)
walk(source)
# global functions from jbg_utils
# fs::dir_ls(
# file.path(here::here(), '..', 'jbg_utils', 'R'),
# regexp = "\\.R$") %>%
# str_subset("plan[.]R", negate = TRUE) %>%
# walk(source)
# rel_fp <- file.path('.')
# (fp <- file.path(rel_fp, 'data_raw'))
# dir.exists(fp) %>% stopifnot
plan_fdata = tar_plan( # TODO: move blocks into functions ----
season_starts = {
first_season_yr <- 14
first_season_yr:current_season() },
# create a new local board
#
# TODO: https://www.kaggle.com/paololol/league-of-legends-ranked-matches?select=matches.csv
# https://www.kaggle.com/fortyeth/analyzing-win-attributes-in-league-of-legends
# https://www.kaggle.com/gulsahdemiryurek/let-s-predict-league-of-legends-match-score
# https://www.kaggle.com/search?q=league+of+legends
# https://datasetsearch.research.google.com/search?query=league%20of%20legends&docid=HwcmHeE3TNPOdMwAAAAAAA%3D%3D
#
board_name = c('football_data', 'local')[2],
fp_cache = file.path(".", "data", "pins"), # options(pins.path = "./.pins_cache")
create_board = { # ----
# cache the data locally, in a subfolder, for git to store it.
if (!dir.exists(fp_cache)) dir.create(fp_cache, recursive = TRUE, showWarnings = FALSE)
# TODO: get 'local' cache, copy 'data.txt' to 'football-data'
#if (!dir.exists("./data/pins/football-data/1415"))
# dir.create("./data/pins/football-data/1415", recursive=TRUE, showWarnings = FALSE)
# TODO: add branch?
board_register_local(board_name, cache = fp_cache)
# configure default board ----
# options(pins.board = )
# board_get(pins_cache)
# board_list() # board_default() ;
# pin_find(board = board_name)
# pin_info(name = '1415' , board = board_name)
# pin_log(board = board_name)
#
# urll <- 'https://www.football-data.co.uk/mmz4281/1415/data.zip'
# pin_get(name = urll %>% extract_name(), board = board_name)
# pin_get(name = '1415' , board = board_name) %>% str()
#
# pin_remove(name = urll %>% extract_name(), board = board_name)
# pin_remove(name = "15-16", board = c(board_name, "local")[2])
#
# board_deregister(name = c(board_name, "local")[1])
# options(pins.board = NULL) # revert default board
#
# pins:::pin_find_empty()
# pin_default_name(x = urll %>% extract_name(), board = board_name)
# pins:::board_pin_create.local(board = board_name,
# name = extract_name(urll))
# urll = url_base)
#
# # read_csv( pin(urll, name = extract_name(urll) , board = board_name) )
# urll <- "https://raw.githubusercontent.com/facebook/prophet/master/examples/example_retail_sales.csv"
# read_csv(pin(urll)) # NB: does it read into local by default?
# read_csv(pin(urll, name = "1415" ) )
},
# download football-data as a tibble ----
# NB: force the target to call get_fdata(); pins then decides whether the data needs to be re-downloaded.
# i.e. the target cannot know whether the url has changed - only pins can know, by checking.
tar_force(name = fdata, command = {
raw <- season_starts %>% get_fdata(board_name)
raw %>% transform_fdata() } , force = TRUE),
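# An equivalent formulation (hedged sketch, not used here): a plain tar_target() with an always-run cue.
# tar_target(fdata,
#   season_starts %>% get_fdata(board_name) %>% transform_fdata(),
#   cue = tar_cue(mode = "always")),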
# fdata = {
# raw <- season_starts %>% get_fdata(board_name)
# raw %>% transform_fdata() } ,
# fdata %>% str(max.level = 1)
# fdata %>% head(c(4, 3))
#
# usethis::use_data(fdata, overwrite = TRUE) # use that data!
pins_path = "data/pins",
pin_fdata = board_register_github(name = "football_data_github",
repo = "JohnGavin/fdata", branch = 'master', # token = NULL, # use GITHUB_PAT
path = pins_path, host = "https://api.github.com", cache = fp_cache, # board_cache_path(),
description = glue("Source: football-data.co.uk. ",
"Football-Data is a free football betting portal ",
"providing historical results & odds for many years of data.")
),
tar_target(fo_fdata, { # write a file then return a path
fo = './data/football_data_co_uk.csv.gz'
fdataa <- fdata %>% arrange(desc(datee)) # cannot assign to fdata: it is a target
fdataa %>% write_csv(fo)
# fdataa %>% write_rds(fo %>% str_replace('csv\\.gz$', 'rds'))
# fdataa %>% reticulate::py_save_object(fo %>% str_replace('csv\\.gz$', 'pkl'))
fo}, format = "file") ,# , deployment = "main"
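# A hedged sketch of how a downstream target could consume the file target above:
# because fo_fdata has format = "file", targets hashes the file at the returned path, so any
# target that uses fo_fdata reruns when that file changes, e.g.
# tar_target(fdata_reread, read_csv(fo_fdata)), # hypothetical downstream target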
chk_data_recent = { # check data is recent ----
(most_recent <- fdata %>%
summarise(datee = datee %>% max(na.rm = TRUE), .groups = 'keep') %>%
pull(datee))
(today() - most_recent < period(day = 5)) %>% stopifnot() },
# assert_that(today() - most_recent < period(day = 5)) %>% stopifnot() },
chk_datee_na =
(fdata %>% filter(is.na(datee)) %>% nrow %>% `==`(0)) %>% stopifnot,
# assert_that(fdata %>% filter(is.na(datee)) %>% nrow %>% `==`(0)) %>% stopifnot,
divs_dates_recent = # most recent match dates by Div - older first
fdata %>% group_by(Div) %>%
summarize(across(where(is.Date), max, na.rm = TRUE), .groups = 'keep') %>%
ungroup() %>%
arrange(datee) ,
# fdata %>% # A named list of functions ----
# group_by(Div) %>%
# summarise(across( class(everything()) %>% contains('date|time') ,
# list(min = min, max = max), .names = "{.col}.fn{.fn}"), .groups = 'keep')
top_divs_dates_recent =
fdata %>% # pull(Div) %>% table
dplyr::filter(Div %in% c('E0', paste0(c('D', 'F', 'I', 'SP'), 1) ) ) %>%
group_by(Div) %>%
summarize(most_recent_match = max(datee)) %>%
arrange(desc(most_recent_match)),
raw_csv_list = fp_cache %>% fs::dir_ls(regexp = "\\.csv$", recurse = 2) ,
gg_dat_miss = fdata %>%
sample_n(size = 2e3) %>%
arrange(desc(datee)) %>%
visdat::vis_miss(warn_large_data=FALSE) +
theme(axis.text.x = list(element_text(size=6, angle=60), element_blank())[[1]]),
# see `tar_render()` function in https://wlandau.github.io/tarchetypes
# and https://wlandau.github.io/targets-manual/files.html#literate-programming
# WARNING: do NOT assign the result of tar_render(), i.e. abc = tar_render(...)
# The target _object_ README then depends on whatever the Rmd file at `path` reads via tar_read()/tar_load()
# (see the sketch after plan_fdata below).
tar_render(README, path = "./README.Rmd", quiet=FALSE,
params = list(pins_path = pins_path) )
# which runs rmarkdown::render("./README.Rmd", params = list(pins_path = pins_path))
) # plan_fdata
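# A sketch of how README.Rmd declares its dependencies (assumed chunk contents - the real Rmd may differ):
# tar_render() scans the Rmd for tar_read()/tar_load() calls and makes README depend on those targets, e.g.
# ```{r}
# targets::tar_read(gg_dat_miss)            # plot target embedded in the README
# targets::tar_load(top_divs_dates_recent)  # table target loaded into the chunk environment
# ```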
### Scrape a webpage ----
# library(polite)
# library(rvest)
# library(progress)
#
# urll <- c("https://www.football-data.co.uk/data.php",
# "https://www.football-data.co.uk/englandm.php")[2]
# session <- bow(urll)
# current_page <- nod(session, urll) %>%
# scrape(verbose=TRUE)
#
# # First find all the urls
# table_urls <- current_page %>%
# html_nodes("a") %>%
# html_attr("href") %>%
# xml2::url_absolute(urll) %>%
# tibble(urll = .)
# table_urls %>% bind_cols(
# table_urls %>% select(urll) %>%
# map(str_match, '/([0-9]{4})/([a-zA-Z0-9]+\\.csv)') %>%
# `[[`(1) %>% `[`(, -1) %>%
# structure(., dimnames = list(NULL, c('season', 'Div'))) %>%
# as_tibble() ) %>%
# mutate(Divv = Div %>% str_replace('.csv$', '')) %>%
# select(-Div) %>%
# filter(!is.na(season), season > "1000", season < "2030") %>%
# distinct() %>%
# head(20) ->
# table_urls
#
# pb <- progress_bar$new(
# format = paste0("Downloading [:bar] :current/:total (:percent) ",
# "eta: :eta elapsed: :elapsed ",
# "rate: :rate bytes: :bytes what: :what") ,
# total = nrow(table_urls), clear = FALSE, width= 60)
# # pb$tick(0) # initialise progress.
#
#
# system.time(
# table_urls %>%
# mutate(csv = urll %>%
# map( read_csv,
# col_types = cols(.default = "c"), locale = locale(encoding = "latin1"))
# ) ->
# table_csvs
# )
# table_csvs$csv[[1]]
# table_csvs %>%
# unnest(csv) -> # %>%
# table_csv
# glimpse(table_csv)
# # confirm Divv == Div then remove Divv
# assert_that( table_csvs %>% summarize(chk = all(Divv == Div)) %>%
# pull(chk) ) %>% stopifnot()
# table_csv %>% select(-Divv) %>%
# type.convert(as.is = TRUE)
#
#
#
# system.time( #
# table_csvs <- table_urls %>%
# filter(urll %>% str_ends('csv')) %>%
# mutate(csv = urll %>% map_dfr(read_csv, col_types = cols()) %>% list())
# )
# # https://www.football-data.co.uk/mmz4281/9394/E1.csv
# table_csvs %>% select(csv) %>%
#
# while(!is.na(urll)){
# # make it verbose
# message("Scraping ", urll)
# # nod and scrape
# current_page <- nod(session, urll) %>%
# scrape(verbose=TRUE)
# # extract post titles
# hrbrmstr_posts <- current_page %>%
# html_nodes( c("nth-child", ".entry-title a")[1]) %>%
# polite::html_attrs_dfr() %>%
# rbind(hrbrmstr_posts)
# # see if there's "Older posts" button
# urll <- current_page %>%
# html_node(".nav-previous a") %>%
# html_attr("href")
# } # end while loop
#
# tibble::as_tibble(hrbrmstr_posts)
# plan_04 = tar_plan( # ----
#
# # plan_04 inputs
# # fdata - b_data_past_filters__pinn_feat_results_stk.csv.gz
# # odds - b_data_past_filters__ps_op_2_cl_stk.csv
# # us_match_stats_mapped_fd - us_match_stats_mapped_fd.rds
# # fd_fb - merged_football-data_fbref_probs_xg.csv.gz
# #
# # plan map_lge_teams inputs
# # fb_match_odds_mapped_us - fb_match_odds_mapped_us.rds (better than 'odds')
#
# # pivot to one row per match
# # Align team by dates
# # Algo to map league/team names
# # Outline of team mapping steps
# # Restrict to common dates between the two sources
# # Dates where only one match is played
#
# # merge fb odds (fb_match_odds_mapped_us) to fb match stats (fd_fb)
# # fb_match_odds_mapped_us (34k) x fd_fb (24k, 1 rpm)
# # fb_odds_wide_map_us (5k)
# # merge fb odds & match stats to understat us_match_stats
#
# # fb_match_odds_mapped_us.rds (long format)
# # ALL odds (incl tg and AH)
# # (34k rows but in long format so really just 2k rows cos 2 seasons
# # for 19/20 and 20/21 only
# # fb_match_odds_mapped_us %>% pull(Date) %>% range()
# # and act (Pinnacle) v exp (Poisson) goals
# # NOT 1x2 odds only (not tg and AH) and act (Pinnacle) v exp (Poisson) goals
# fb_match_odds_mapped_us =
# read_rds('../../data/understatr/fb_match_odds_mapped_us.rds') ,
# # (fb_match_odds_mapped_us) %>% pull(date) %>% range()
# # fd has 10 leagues. us_match_stats_mapped_fd had 5.
# # So us_home, us_away and league_names has nans.
# # plot_fb_match_odds_mapped_us = vis_miss(fb_match_odds_mapped_us) ,
# # fb_match_odds_mapped_us %>% select(date) %>% drop_na() %>% pull() %>% range()
#
# # fb_match_odds_mapped_us %>% select(contains('date'))
# fb_odds_wide_map_us = fb_match_odds_mapped_us %>%
# select(-mm) %>% # filter(mm %in% c('PS', 'P'))
# pivot_wider(id_cols = -c('bet', 'type', 'C', 'exp', 'act'),
# names_from = c('bet', 'type'),
# values_from = c('C', 'exp', 'act')) ,
# # Amelia::missmap
# plot_fb_odds_wide_map_us = vis_miss(fb_odds_wide_map_us) ,
# # FIXME: why is expected AH == expected TG? But for actual?
# # fb_odds_wide_map_us %>% mutate(
# # fixme_A = (exp_A_ah - exp_lt2.5_tg) / exp_lt2.5_tg,
# # fixme_H = (exp_H_ah - exp_gt2.5_tg) / exp_gt2.5_tg,
# # .after = exp_gt2.5_tg) %>%
# # summarise(A_max = max(fixme_A), A_min = min(fixme_A),
# # H_max = max(fixme_H), H_min = min(fixme_H), .groups = 'keep')
#
#
# # # football-data odds contains all team names in football-data / fbref.com
# # c(fb_odds_wide_map_us$home, fb_odds_wide_map_us$away) %>%
# # setdiff(c(fd_fb$HomeTeam, fd_fb$AwayTeam))
# # # FIXME: so why is the converse missing 51 team names (minor team names)?
# # c(fd_fb$HomeTeam, fd_fb$HomeTeam) %>%
# # setdiff(c(fb_odds_wide_map_us$home, fb_odds_wide_map_us$home) )
# # # fb_odds_wide_map_us$home %>% setdiff(fd_fb$HomeTeam)
#
# # names(fb_odds_wide_map_us) %>% intersect(names(fd_fb))
# # fb_odds_wide_map_us %>% select(contains('date') ); fd_fb %>% select(contains('date'))
# fd_fb_us = fb_odds_wide_map_us %>% left_join(fd_fb,
# by = c('Date' = 'datetime', 'home' = 'HomeTeam', # team_h',
# 'away' = 'AwayTeam', 'season', 'Div', 'future',
# "xg_H", "xg_A" # why is "xg_H", "xg_A" in both?
# ) ) ,
# plot_fd_fb_us = vis_miss(fd_fb_us, warn_large_data=FALSE) ,
# # 2nd div leagues are all nan in understat columns
# tmp <- fd_fb_us %>% filter(!((Div %>% str_detect('2$')) | (Div == 'E1'))) %>%
# select( -c(names(fb_odds_wide_map_us))) %>%
# filter( rowSums(is.na(.)) != ncol(.)) ,
# vis_miss(tmp, warn_large_data=FALSE) , # skimr::skim(tmp)
# tmp <- tmp %>% summarize(across(everything(), ~ all(is.na(.)))) %>% unlist
# fd_fb_us_2 = fd_fb_us %>%
# filter(!((Div %>% str_detect('2$')) | (Div == 'E1'))) %>%
# select(-c(names(fb_odds_wide_map_us), names(tmp[tmp])))
# vis_miss(fd_fb_us_2, warn_large_data=FALSE) , # skimr::skim(tmp)
#
# # all date/date_time columns should be upto date
# conflict_prefer("first", "dplyr") ; conflict_prefer("last", "dplyr")
# fd_fb_us %>% summarise( across(
# where(lubridate::is.POSIXt) | where(lubridate::is.Date),
# max, na.rm = TRUE) ) %>% t # OR
# fd_fb_us %>% select(contains('date') | contains('time')) %>%
# summarise(across(everything(), max, na.rm = TRUE), .groups = 'keep') %>%
# # summarise_all( max, na.rm = TRUE) %>% min() # summarise_all deprecated
# # .fns = list(first = first, last = last) ) )
#
# # NB: fd_fb assumes that ./notebook/01_merged_data/drake.R has updated
# # all local csvs in ./data/01_merged_data/*.csvs from _private_ github.
# fd_fb %>% select(date_only) %>% drop_na() %>% pull() %>% max()
# fd_fb %>% select(ldn_time) %>% drop_na() %>% pull() %>% max()
#
# # FIXME: us_match_stats_mapped_fd has two rows per match and many columns are not clear.
# # e.g. forecasts are for the 'home' team but not labelled.
# tar_load(us_match_stats_mapped_fd)
# us_1_rpm = # one row per match
# us_match_stats_mapped_fd %>% arrange(Div, datetime, h.title)
# fd_fb %>% select(contains('date'))
# us_match_stats_mapped_fd %>% select(contains('date'))
#
#
# # equivalent to fb_match_odds ( == match_ids_probs_team_stats.rds )
# # but with map to understat
# # match_ids_probs_team_stats =
# # read_rds('../../data/understatr/match_ids_probs_team_stats.rds') %>%
# # arrange(desc(isResult), desc(date), league_name, match_id, side),
# # two sides for every match.
# # match_ids_probs_team_stats %>% group_by(match_id) %>% filter(n() != 2)
# # match_ids_probs_team_stats %>% filter(isResult) %>% pull(datetime) %>% range
#
# # See also event dataset - shots_by_match_xg_probs_team_stats.csv.gz
# # 650k rows - all shots for each side per match
# # a superset of shots_by_match_xg.csv.gz
#
# )
# plan_04 = tar_plan( # ----
#
# # plan_04 inputs
# # fdata - b_data_past_filters__pinn_feat_results_stk.csv.gz
# # odds - b_data_past_filters__ps_op_2_cl_stk.csv
# # us_match_stats_mapped_fd - us_match_stats_mapped_fd.rds
# # fd_fb - merged_football-data_fbref_probs_xg.csv.gz
# #
# # plan map_lge_teams inputs
# # fb_match_odds_mapped_us - fb_match_odds_mapped_us.rds (better than 'odds')
#
# # fdata - b_data_past_filters__pinn_feat_results_stk.csv.gz
# tar_target(fi_fdata, file.path(fp, # pins::pin
# 'b_data_past_filters__pinn_feat_results_stk.csv.gz'),
# format = "file"), # , deployment = "main"
# fdata_raw = read_data(fi_fdata),
# fdata = tidy_fdata(fdata_raw),
# # TODO: is pins just for input data?
# tar_target(fo_fdata, # write a file then return a path
# { fo = fi_fdata %>% str_replace('csv\\.gz$', 'rds')
# fdata %>% write_rds(fo)
# fo}, format = "file") ,# , deployment = "main"
#
# # fd_fb - merged_football-data_fbref_probs_xg.csv.g
# tar_target(fi_fd_fb, file.path(fp, '01_merged_data',
# 'merged_football-data_fbref_probs_xg.csv.gz'),
# format = "file"), # , deployment = "main"
# # fi_fd_fb <- file.path(fp, '01_merged_data', 'merged_football-data_fbref_probs_xg.csv.gz')
# # football.co.uk (match stats but not odds) and fbref.com (summary table)
# # FIXME: get_local_csv mutates some cols so is not a general function
# # strip out the mutation step into a separate function.
# # NB: fd_fb assumes that ./notebook/01_merged_data/drake.R has updated
# # local csvs in ./data/01_merged_data/*.csvs from _private_ github.
# fd_fb = get_local_csv(fn = basename(fi_fd_fb), fp = dirname(fi_fd_fb)) %>% rename(datetime = date) ,
#
# # odds - b_data_past_filters__ps_op_2_cl_stk.csv
# tar_target(fi_odds, file.path(fp, # pins::pin
# 'b_data_past_filters__ps_op_2_cl_stk.csv'),
# format = "file"), # , deployment = "main"
# odds = read_odds(fi_odds),
#
# # us_match_stats_mapped_fd - us_match_stats_mapped_fd.rds
# tar_target(fi_us, file.path(fp, 'understatr', 'us_match_stats_mapped_fd.rds'), format = "file"), # , deployment = "main"
# # fi_us <- file.path(fp, 'understatr', 'us_match_stats_mapped_fd.rds')
# us_match_stats_mapped_fd = read_rds(fi_us) %>% filter(isResult) %>%
# arrange(desc(isResult), desc(date), datetime, league_name, match_id, h.title) %>%
# filter(isResult) %>%
# select(!contains('id$')) %>%
# select(dplyr::matches('name|xg|xp|forecast|date|title|season|goal'),
# deep.h:ppda_allowed.def.h) %>%
# # TODO: add week of season.
# # TODO: xG v xG.h & xGA v xG.a
# # TODO: Pinn tg <> 1.5 v actual result calibration fd_fb
# mutate(hr = datetime %>% lubridate::hour(),
# day = datetime %>% lubridate::day(),
# mth = datetime %>% lubridate::month(), .after = datetime) %>%
# rename(date_only = date) %>%
# # select(!dplyr::matches('date')) %>%
# mutate(tg = goals.h + goals.a, gd = goals.h - goals.a, .after = goals.a) ,
# # us_match_stats_mapped_fd.rds - a superset of us_match_stats.csv.gz/rds
# plot_us_match_stats_mapped_fd = vis_miss(us_match_stats_mapped_fd ) ,
#
# # merge in FTR
# # train_data = !Season %in% last_2_seasons
# train_test = merge_FTR_odds(odds, fdata),
#
# # Apply steps to the training data. via the `recipes` package:
# train_prep = recipe_prep(train_test$train_data),
# # bake these test/train steps using this recipe
# train_bake = train_prep %>% bake(new_data = train_test$train_data),
# test_bake = train_prep %>% bake(new_data = train_test$test_data),
# # Pinnacle closing highest probability v actual outcome
# # TODO: repeat for understat forecast probabilities.
# pred_lst = predictions(train_test$test_data), # pred_lst %>% names
# plot_all_calib = get_plot_all_calib_frame(pred_lst$all_calib_frame),
# train_control = trainControl(
# method = c("timeslice", "adaptive_cv")[2],
# summaryFunction = multiClassSummary, classProbs = T),
# #### SLOOOOW - method="adaptive_cv"
# tuneLength = 30,
# fit_glmnet = train( FTR ~ . ,
# family="multinomial", method="glmnet",
# data = train_bake, # %>% head(1e1),
# # tuneLength = 30 => seconds 697.009 => plot _much_ better!
# # tuneLength = 40 => seconds 1339.54 == 22 mins => plot _much_ better!
# tuneLength = tuneLength, trControl=train_control), # 50
# tar_target(fo_fit_glmnet, # fo - write the file then return a path
# {fo <- paste0('fit_glmnet_', tuneLength, '.rds')
# fit_glmnet %>% write_rds(fo)
# fo},
# format = "file"), # , deployment = "main"
# pred_glmnet = predict(fit_glmnet, newdata = test_bake),
# # str(pred_glmnet, max.level = 1)
# pr_pred_glmnet = predict(fit_glmnet, newdata = test_bake, type = "prob"),
# # str(pr_pred_glmnet, max.level = 1)
# # expect match odds to be the main variable driver else model is bad
# plot_varImp = get_plot_varImp(fit_glmnet),
# # increase to tuneLength = 50 ? 20 -> 30 is a long wait!
# plot_pred_calib = get_plot_pred_calibration(
# pr_pred_glmnet, pred_lst$all_calib_frame, test_bake),
# table_pred = get_table_pred(pred_lst, pred_glmnet, test_bake),
# strategy = get_strategy(train_test$test_data, pr_pred_glmnet),
#
# # see `tar_render()` function in https://wlandau.github.io/tarchetypes
# # and https://wlandau.github.io/targets-manual/files.html#literate-programming
# # fo_04_recipes_calibrate_PS_odds.Rmd =
# tar_render(name = report,
# path = "./04_recipes_calibrate_PS_odds.Rmd",
# params = list(your_param = "test var"), quiet = TRUE )
# # which runs rmarkdown::render("report.Rmd", params = list(your_param = your_target))
#
# # mod <- logistic_reg(penalty = tune(),
# # mixture = tune()) %>%
# # set_engine("glmnet")
# # df = train_bake %>%
# # pivot_longer(cols = PSCH:PSCA, names_to = 'hda',
# # values_to = 'std_pred') %>%
# # relocate(hda, std_pred, .after = FTR),
# # glmn_fit_h =
# # linear_reg(penalty = 0.001, mixture = 0.5) %>%
# # set_engine("glmnet") %>%
# # parsnip::fit(unclass(FTR) ~ ., data = df),
# # glmn_fit_d =
# # linear_reg(penalty = 0.001, mixture = 0.5) %>%
# # set_engine("glmnet") %>%
# # parsnip::fit(PSCD ~ ., data = train_bake),
# # glmn_fit_a =
# # linear_reg(penalty = 0.001, mixture = 0.5) %>%
# # set_engine("glmnet") %>%
# # parsnip::fit(PSCA ~ ., data = train_bake)
#
#
# )
#
# parse_trending_package_readms <- tar_plan( # ----
# # Simplified drake-like syntax for targets pipelines.
# # https://blog.r-hub.io/2019/12/03/readmes/#tools-for-writing-and-re-using-content
# # merging the lists of top downloaded and trending CRAN packages
# pkglist = get_pkglist() ,
# # again with pkgsearch, we’ll extract their metadata,
# # before keeping only those that have a GitHub README
# meta = get_meta(pkglist) ,
# # extract their preferred README from the GitHub V3 API.
# # Some of them won’t even have one so we’ll lose them from the sample
# # NB: memoise::memoise NB: use pins?
# readme_lines = get_readme_lines(meta) ,
# # the first thing we extract is the number of lines of the README
# count_lines = get_count_lines(readmes = readme_lines) ,
# plot_count_lines = get_plot_readme_lines( count_lines) ,
# # look into other indicators of size: the number of lines of R code, the number of words outside of code and output.
# # without using too many regular expressions, we shall convert the Markdown content to XML via commonmark and use XPath to parse it.
# xml_readme_lines = get_xml_readme_lines(count_lines) ,
# # count lines of code
# count_lines_xlm = get_count_lines_xlm(xml_readme_lines) ,
# # words in text?
# words_in_text = get_words_in_text(xml_readme_lines) ,
# # extract headers
# structuree = get_headers(xml_readme_lines) ,
# print_readme_structuree =
# capture.output( get_print_readme_structure(
# structure = structuree$structuree[[4]][[1]] ) ) ,
# # most common headers
# most_common_headers = get_most_common_headers(structuree)
# )
#
# recipe_glmnet <- tar_plan( # ----
# tar_target(
# glmn_rec,
# recipe(mpg ~ ., data = mtcars) %>%
# step_normalize(all_predictors())
# ),
# tar_target(
# mod,
# linear_reg(penalty = 0.1, mixture = 1) %>% set_engine("glmnet")
# ),
# tar_target(glmn_wflow,
# workflow() %>%
# add_model(mod) %>%
# add_recipe(glmn_rec)
# ),
# tar_target(
# glmn_fit,
# glmn_wflow %>%
# fit(data = mtcars)
# ),
# tar_target(
# coeff,
# glmn_fit %>%
# pull_workflow_fit() %>%
# pluck("fit") %>%
# coef(s = 0.1)
# )
# )
#
# hist_fit <- tar_plan( # ----
# # individual tar_target() objects or _nested lists_ of tar_target() objects.
# tar_target(
# name = raw_data_file, command = "data/raw_data.csv", format = "file",
# deployment = "main"
# ),
# tar_target(raw_data, read_csv(raw_data_file, col_types = cols()),
# # relevant to tar_make_clustermq() and tar_make_future().
# # "worker": target builds on a parallel worker.
# # "main": target builds on the host-machine managing the pipeline
# deployment = "main"
# ),
# tar_target(data3, tibble(y = letters, x = seq_len(length(y)) ) ),
# tar_target(data2, I(24) ), # { data.frame(x = seq_len(26), y = letters) } ),
# # tar_target(sdfdas, I(xx()) ), # { data.frame(x = seq_len(26), y = letters) } ),
# tar_target(data, raw_data %>%
# mutate(Ozone = replace_na(Ozone, mean(Ozone, na.rm = TRUE)))
# ),
# tar_target(hist, create_plot(data)),
# tar_target(fit, biglm(Ozone ~ Wind + Temp, data)),
# # see `tar_render()` function in https://wlandau.github.io/tarchetypes
# # and https://wlandau.github.io/targets-manual/files.html#literate-programming
# fo_report =
# tar_render(name = report, path = "./report.Rmd",
# params = list(your_param = data), quiet = FALSE )
# # which runs rmarkdown::render("report.Rmd", params = list(your_param = your_target))
#
# )
#
# plan_dt = tar_plan( # ----
# # https://github.com/rasmusab/tidyverse-in-a-table/blob/master/tidyverse_in_a_table.R
# tidyverse_in_a_table = get_tidyverse_in_a_table() ,
# widget_tidyverse_in_a_table = DT::saveWidget(tidyverse_in_a_table,
# "tidyverse_in_a_table_standalone.html")
# )
#
#
#
# return plan ----
# parse_trending_package_readms
# plan_dt
# hist_fit
# recipe_glmnet
# tar_pipeline( list(recipe_glmnet = recipe_glmnet , hist_fit = hist_fit ) )
# tar_plan( recipe_glmnet = recipe_glmnet , hist_fit = hist_fit )
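# A hedged sketch of returning several plans at once (each tar_plan() returns a list of targets,
# and nested lists are allowed); hist_fit / recipe_glmnet would first need to be uncommented above:
# list(plan_fdata, hist_fit, recipe_glmnet)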
plan_fdata