#' engineer new data for forecasts
#' @param use_case chr
#' @param schedule data.frame
#' @param odds data.frame
#' @param data data.frame
#' @return list
#' @export
engineer_new_data <- function(use_case = 'team_games',
schedule, odds, data) {
if (use_case == 'team_games') {
# gross - copy/paste the code from engineer_data to do all the necessary engineering (another reason to modularize)
eng_data <- data %>%
dplyr::mutate(home_away = dplyr::if_else(grepl('@', matchup), 'away', 'home')) %>%
dplyr::filter(season_year == max(season_year))
# team_games1 -------------------------------------------------------------
# this chain results in df at the team-game level with minimal engineering
team_games1 <- eng_data %>%
dplyr::group_by(season_year, season_type, team_id, game_id, game_date, wl, home_away) %>%
dplyr::summarise(
# first do all the numeric columns
dplyr::across(
tidyselect:::where(is.numeric),
sum,
.names = '{.col}_game'
),
# see how many players in rotation exceeded minutes thresholds
# TODO: FUNCTIONALIZE CORRECTLY
players_gt_0min_game = dplyr::n_distinct(player_id[min > 0]),
players_gt_10min_game = dplyr::n_distinct(player_id[min > 10]),
players_gt_20min_game = dplyr::n_distinct(player_id[min > 20]),
.groups = 'keep'
) %>%
# calculate team-game ratios and percentages
dplyr::mutate(ft_perc_game = ftm_game / fta_game,
fg_perc_game = fgm_game / fga_game,
fg3_perc_game = fg3m_game / fg3a_game,
ast_tov_ratio_game = ast_game / tov_game,
fg3a_fga_ratio_game = fg3a_game / fga_game,
fg3m_fgm_ratio_game = fg3m_game / fgm_game,
oreb_perc_game = oreb_game / reb_game,
ft_points_ratio_game = ftm_game / pts_game) %>%
dplyr::mutate(overtime = dplyr::if_else(min_game > 240, 1, 0),
count_overtime = dplyr::case_when(
overtime == 0 ~ 0,
overtime > 0 ~ (min_game - 240) / 25)) %>%
dplyr::select(-min_game) %>%
dplyr::ungroup()
# team_games2 -------------------------------------------------------------
# this chain applies a bunch of order-dependent and rolling window functions within team-seasons
## make some function lists
rwl <- make_rolling_window_list()
rolling_funs_list <- purrr::flatten(rwl)
team_games2 <- team_games1 %>%
dplyr::arrange(season_year, team_id, game_date) %>%
dplyr::group_by(season_year, season_type, team_id) %>%
dplyr::mutate(season_type_game_index = dplyr::row_number()) %>%
dplyr::mutate(
# TODO: ensure we can actually look into the future before making this feature available
# b2b_g1 = dplyr::case_when(dplyr::lead(game_date) == game_date + 1 ~ 1,
# dplyr::lead(game_date) != game_date + 1 ~ 0,
# row_number() == max(row_number()) ~ 0),
b2b_g2 = dplyr::case_when(dplyr::lag(game_date) == game_date - 1 ~ 1,
dplyr::lag(game_date) != game_date - 1 ~ 0,
dplyr::row_number() == 1 ~ 0)) %>%
## make some temporary features for easier incrementing and counting events
dplyr::mutate(
win = as.numeric(wl == 'W'),
lose = as.numeric(wl == 'L'),
home = as.numeric(home_away == 'home'),
away = as.numeric(home_away == 'away'),
dplyr::across(
c(win, lose, home, away),
rwl$rollsum_list)) %>%
# streaks
dplyr::mutate(
dplyr::across(
c(win, lose, home, away),
list(streak = ~purrr::accumulate(.x, incrementor)))) %>%
# now everything blows up with the boxscore stats
dplyr::mutate(
dplyr::across(
dplyr::ends_with('_game'),
rolling_funs_list)) %>%
# and we can also calc things like games in last period of days
# but this is a 2-arg function and i'm not sure how to do that in an across() call
dplyr::mutate(
dplyr::across(
game_id,
list(
last5days = ~rollcount_days_p(.x, game_date, 4, .complete = TRUE),
last11days = ~rollcount_days_p(.x, game_date, 10, .complete = TRUE),
last23days = ~rollcount_days_p(.x, game_date, 22, .complete = TRUE)
))) %>%
dplyr::select(-home, -away, -win, -lose) %>%
dplyr::ungroup()
# team_games3 -------------------------------------------------------------
# this chain joins the team_games data back onto itself but with the opponent's team_id, to get defensive (i.e. 'allowed') metrics
team_games3 <- team_games2 %>%
dplyr::mutate(join_var = dplyr::if_else(home_away == 'home', 'away', 'home')) %>%
dplyr::inner_join(
team_games2 %>%
dplyr::select(-season_year, -season_type, -wl,
-dplyr::matches('overtime'),
-dplyr::matches('game_index')),
by = c('game_id' = 'game_id',
'game_date' = 'game_date',
'join_var' = 'home_away')) %>%
dplyr::select(-join_var) %>%
dplyr::ungroup()
## fix the names to reflect what they actually mean
fixed_names3 <- gsub('.x', '', names(team_games3), fixed = TRUE)
fixed_names3 <- gsub('.y', '_allowed', fixed_names3, fixed = TRUE)
names(team_games3) <- fixed_names3
# team_games4 -------------------------------------------------------------
# this chain engineers game-level data (i.e. team vs. allowed stuff)
team_games4 <- team_games3 %>%
dplyr::mutate(score_margin_game = pts_game - pts_game_allowed) %>%
dplyr::group_by(season_year, season_type, team_id) %>%
dplyr::mutate(
dplyr::across(score_margin_game, rolling_funs_list)) %>%
dplyr::ungroup()
# team_games4_2 -----------------------------------------------------------
# this chain filters the historical data to the most recent observation
team_games4_2 <- team_games4 %>%
dplyr::mutate(ha_placeholder = home_away) %>%
dplyr::group_by(team_id) %>%
dplyr::filter(game_date == max(game_date)) %>% # this only gets the latest offense/defense
dplyr::select(-game_date, -game_id, -home_away) %>%
dplyr::ungroup()
# team_games4_3 -----------------------------------------------------------
# this chain adds the schedule for joining current game opponents
long_schedule <- make_long_schedule(schedule)
team_games4_3 <- team_games4_2 %>%
dplyr::inner_join(long_schedule)
# team_games5 -------------------------------------------------------------
# this chain joins opponent's offensive and defensive historical performance
team_games5 <- team_games4_3 %>%
dplyr::inner_join(
team_games4_3 %>%
dplyr::select(-season_year, -season_type, -game_date, -home_away, -wl),
by = c('team_id' = 'opp_team_id')) %>%
dplyr::select(-opp_team_id)
# fix the names again
fixed_names5 <- gsub('\\.x$', '', names(team_games5))
fixed_names5 <- gsub('^(.*)\\.y$', 'opp_\\1', fixed_names5)
names_regex <- grepl('_game$|_allowed$|_roll|_streak|players_gt|overtime|ratio|perc|wl', fixed_names5)
fixed_names5[names_regex] <- paste0(fixed_names5[names_regex], '_lagged')
names(team_games5) <- fixed_names5
# team_games6 -----------------------------------------------------------
# this chain calculates rolling team vs. current opponent stuff
team_games6 <- team_games5 %>%
dplyr::mutate(
delta_pts_game_lagged = pts_game_lagged - opp_pts_game_lagged,
delta_pts_game_rollmean_5_lagged = pts_game_rollmean_5_lagged - opp_pts_game_rollmean_5_lagged,
delta_pts_game_rollmean_11_lagged = pts_game_rollmean_11_lagged - opp_pts_game_rollmean_11_lagged,
delta_pts_game_rollmean_23_lagged = pts_game_rollmean_23_lagged - opp_pts_game_rollmean_23_lagged,
delta_fg_pct_game_lagged = fg_pct_game_lagged - opp_fg_pct_game_lagged,
delta_fg_pct_game_rollmean_5_lagged = fg_pct_game_rollmean_5_lagged - opp_fg_pct_game_rollmean_5_lagged,
delta_fg_pct_game_rollmean_11_lagged = fg_pct_game_rollmean_11_lagged - opp_fg_pct_game_rollmean_11_lagged,
delta_fg_pct_game_rollmean_23_lagged = fg_pct_game_rollmean_23_lagged - opp_fg_pct_game_rollmean_23_lagged,
delta_fg3_pct_game_lagged = fg3_pct_game_lagged - opp_fg3_pct_game_lagged,
delta_fg3_pct_game_rollmean_5_lagged = fg3_pct_game_rollmean_5_lagged - opp_fg3_pct_game_rollmean_5_lagged,
delta_fg3_pct_game_rollmean_11_lagged = fg3_pct_game_rollmean_11_lagged - opp_fg3_pct_game_rollmean_11_lagged,
delta_fg3_pct_game_rollmean_23_lagged = fg3_pct_game_rollmean_23_lagged - opp_fg3_pct_game_rollmean_23_lagged,
delta_reb_game_lagged = reb_game_lagged - opp_reb_game_lagged,
delta_reb_game_rollmean_5_lagged = reb_game_rollmean_5_lagged - opp_reb_game_rollmean_5_lagged,
delta_reb_game_rollmean_11_lagged = reb_game_rollmean_11_lagged - opp_reb_game_rollmean_11_lagged,
delta_reb_game_rollmean_23_lagged = reb_game_rollmean_23_lagged - opp_reb_game_rollmean_23_lagged,
delta_ast_game_lagged = ast_game_lagged - opp_ast_game_lagged,
delta_ast_game_rollmean_5_lagged = ast_game_rollmean_5_lagged - opp_ast_game_rollmean_5_lagged,
delta_ast_game_rollmean_11_lagged = ast_game_rollmean_11_lagged - opp_ast_game_rollmean_11_lagged,
delta_ast_game_rollmean_23_lagged = ast_game_rollmean_23_lagged - opp_ast_game_rollmean_23_lagged,
delta_stl_game_lagged = stl_game_lagged - opp_stl_game_lagged,
delta_stl_game_rollmean_5_lagged = stl_game_rollmean_5_lagged - opp_stl_game_rollmean_5_lagged,
delta_stl_game_rollmean_11_lagged = stl_game_rollmean_11_lagged - opp_stl_game_rollmean_11_lagged,
delta_stl_game_rollmean_23_lagged = stl_game_rollmean_23_lagged - opp_stl_game_rollmean_23_lagged,
delta_tov_game_lagged = tov_game_lagged - opp_tov_game_lagged,
delta_tov_game_rollmean_5_lagged = tov_game_rollmean_5_lagged - opp_tov_game_rollmean_5_lagged,
delta_tov_game_rollmean_11_lagged = tov_game_rollmean_11_lagged - opp_tov_game_rollmean_11_lagged,
delta_tov_game_rollmean_23_lagged = tov_game_rollmean_23_lagged - opp_tov_game_rollmean_23_lagged,
delta_pf_game_lagged = pf_game_lagged - opp_pf_game_lagged,
delta_pf_game_rollmean_5_lagged = pf_game_rollmean_5_lagged - opp_pf_game_rollmean_5_lagged,
delta_pf_game_rollmean_11_lagged = pf_game_rollmean_11_lagged - opp_pf_game_rollmean_11_lagged,
delta_pf_game_rollmean_23_lagged = pf_game_rollmean_23_lagged - opp_pf_game_rollmean_23_lagged,
delta_pfd_game_lagged = pfd_game_lagged - opp_pfd_game_lagged,
delta_pfd_game_rollmean_5_lagged = pfd_game_rollmean_5_lagged - opp_pfd_game_rollmean_5_lagged,
delta_pfd_game_rollmean_11_lagged = pfd_game_rollmean_11_lagged - opp_pfd_game_rollmean_11_lagged,
delta_pfd_game_rollmean_23_lagged = pfd_game_rollmean_23_lagged - opp_pfd_game_rollmean_23_lagged,
delta_pts_game_lagged = pts_game_lagged - opp_pts_game_lagged,
delta_pts_game_rollmean_5_lagged = pts_game_rollmean_5_lagged - opp_pts_game_rollmean_5_lagged,
delta_pts_game_rollmean_11_lagged = pts_game_rollmean_11_lagged - opp_pts_game_rollmean_11_lagged,
delta_pts_game_rollmean_23_lagged = pts_game_rollmean_23_lagged - opp_pts_game_rollmean_23_lagged,
delta_pts_game_vs_opp_pts_game_allowed_lagged = pts_game_lagged - opp_pts_game_allowed_lagged,
delta_pts_game_vs_opp_pts_game_allowed_rollmean_5_lagged = pts_game_rollmean_5_lagged - opp_pts_game_rollmean_5_allowed_lagged,
delta_pts_game_vs_opp_pts_game_allowed_rollmean_11_lagged = pts_game_rollmean_11_lagged - opp_pts_game_rollmean_11_allowed_lagged,
delta_pts_game_vs_opp_pts_game_allowed_rollmean_23_lagged = pts_game_rollmean_23_lagged - opp_pts_game_rollmean_23_allowed_lagged,
delta_pts_game_allowed_vs_opp_pts_game_lagged = pts_game_allowed_lagged - opp_pts_game_lagged,
delta_pts_game_allowed_vs_opp_pts_game_rollmean_5_lagged = pts_game_rollmean_5_allowed_lagged - opp_pts_game_rollmean_5_lagged,
delta_pts_game_allowed_vs_opp_pts_game_rollmean_11_lagged = pts_game_rollmean_11_allowed_lagged - opp_pts_game_rollmean_11_lagged,
delta_pts_game_allowed_vs_opp_pts_game_rollmean_23_lagged = pts_game_rollmean_23_allowed_lagged - opp_pts_game_rollmean_23_lagged
)
# team_games7 -------------------------------------------------------------
# this chain adds the new odds data
team_games7 <- team_games6 %>%
dplyr::inner_join(odds)
# team_games8 -------------------------------------------------------------
# this chain increments game indices
team_games8 <- team_games7 %>%
dplyr::mutate(
season_type_game_index = season_type_game_index + 1,
opp_season_type_game_index = opp_season_type_game_index + 1
)
return(team_games7)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.