knitr::opts_chunk$set(echo = TRUE)
qs::qread("accuracies_1_2021-11-04_1.qs") -> accuracies1 qs::qread("accuracies_2_2021-11-05_1.qs") -> accuracies2 qs::qread("../data/M_all_features.qs") -> feature_df
library(magrittr)

accuracies <- dplyr::bind_rows(accuracies1, accuracies2) %>%
  as.data.frame() %>%
  dplyr::select_if(~ !all(is.na(.x))) %>%
  dplyr::select_if(~ !all(.x == .x[1])) %>%
  tidyr::drop_na() %>%
  dplyr::filter(.model != "<environment>")
accuracy_ranks <- accuracies %>%
  dplyr::mutate(dplyr::across(where(is.numeric), abs)) %>%
  dplyr::group_by(key) %>%
  dplyr::mutate(dplyr::across(where(is.numeric), dplyr::min_rank)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(avg_rank = rowMeans(dplyr::across(where(is.numeric))))
Winner takes all: the best-ranked method is taken as the label for each series, and no probabilistic voodoo is done.
winner_takes_all <- accuracy_ranks %>%
  dplyr::group_by(key) %>%
  dplyr::filter(avg_rank == min(avg_rank)) %>%
  dplyr::select(.model, key) %>%
  dplyr::ungroup()
Fit a model on this data:
soothsayer_data <- winner_takes_all %>%
  dplyr::full_join(feature_df, by = c("key" = "keys")) %>%
  dplyr::select_if(~ !all(is.na(.x))) %>%
  dplyr::select_if(~ !all(.x == .x[1])) %>%
  tidyr::drop_na() %>%
  dplyr::mutate(
    sample_type = sample(c("train", "test"), length(key),
                         replace = TRUE, prob = c(0.7, 0.3))
  )

soothsayer_train <- soothsayer_data %>%
  dplyr::filter(sample_type == "train") %>%
  dplyr::select(-sample_type) %>%
  dplyr::select(-key)

soothsayer_test <- soothsayer_data %>%
  dplyr::filter(sample_type == "test") %>%
  dplyr::select(-sample_type)

ranger_x <- soothsayer_train %>%
  dplyr::select(-.model) %>%
  as.matrix()

ranger_y <- soothsayer_train %>%
  dplyr::select(.model) %>%
  unlist() %>%
  as.factor()

soothsayer_model <- ranger::ranger(
  x = ranger_x,
  y = ranger_y,
  num.trees = 2000,
  probability = TRUE
)

soothsayer_model_importance <- ranger::ranger(
  x = ranger_x,
  y = ranger_y,
  num.trees = 2000,
  importance = "impurity_corrected",
  probability = TRUE
)
Before we go on, let's look at which variables the model actually uses (pruning could prove useful):
variable_importances <- soothsayer_model_importance[["variable.importance"]]
# rescale so they sum to 100
variable_importances <- (variable_importances / sum(variable_importances)) * 100

df <- data.frame(
  var_imp = variable_importances,
  var = names(variable_importances)
)

ggplot2::ggplot(df, ggplot2::aes(x = reorder(var, var_imp), y = var_imp, fill = var_imp)) +
  ggplot2::geom_bar(stat = "identity", position = "dodge") +
  ggplot2::coord_flip() +
  ggplot2::ylab("Variable Importance") +
  ggplot2::xlab("") +
  ggplot2::scale_fill_gradient(low = "red", high = "blue")
Pretty, but not super readable - what about just the top 30?
df %>%
  dplyr::arrange(dplyr::desc(var_imp)) %>%
  .[1:30, ] %>%
  ggplot2::ggplot(ggplot2::aes(x = reorder(var, var_imp), y = var_imp, fill = var_imp)) +
  ggplot2::geom_bar(stat = "identity", position = "dodge") +
  ggplot2::coord_flip() +
  ggplot2::ylab("Variable Importance") +
  ggplot2::xlab("") +
  ggplot2::scale_fill_gradient(low = "red", high = "blue")
Interesting. Most of these are features from either feasts or catch22.
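To act on the pruning idea mentioned above, here is a minimal, hedged sketch (not used in the rest of the analysis) that keeps only the top-ranked features by importance and refits the probability forest; n_keep = 30 is an arbitrary illustrative cutoff.

# Hedged sketch: prune to the top n_keep features and refit (illustrative only)
n_keep <- 30
top_features <- df %>%
  dplyr::arrange(dplyr::desc(var_imp)) %>%
  dplyr::slice_head(n = n_keep) %>%
  dplyr::pull(var)

soothsayer_model_pruned <- ranger::ranger(
  x = ranger_x[, top_features, drop = FALSE],
  y = ranger_y,
  num.trees = 2000,
  probability = TRUE
)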
preds <- predict(
  soothsayer_model,
  soothsayer_test %>% dplyr::select(-c(".model", "key")) %>% as.matrix(),
  type = "response"
)

selected_models <- preds[["predictions"]] %>%
  as.data.frame() %>%
  dplyr::rowwise() %>%
  # index of the class with the highest predicted probability
  dplyr::mutate(best = which.max(dplyr::c_across(dplyr::everything()))) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(selected_model = colnames(preds[["predictions"]])[best]) %>%
  dplyr::select(selected_model) %>%
  dplyr::bind_cols(soothsayer_test) %>%
  dplyr::select(c("selected_model", "key"))

matches <- accuracies %>%
  dplyr::right_join(selected_models, by = "key")
Are we better than just fitting any individual model to all series?
check_against_model <- function(acc_data, model, metric = "RMSE") {
  model_acc <- acc_data %>%
    dplyr::filter(.model == model) %>%
    dplyr::select(tidyselect::all_of(c("key", metric)))

  selected_acc <- acc_data %>%
    dplyr::filter(.model == selected_model) %>%
    dplyr::select(tidyselect::all_of(c("key", metric)))
  colnames(selected_acc)[colnames(selected_acc) == metric] <- paste0("selected_", metric)

  dplyr::full_join(model_acc, selected_acc, by = "key")
}

performance_vs_single_model <- purrr::map(
  c("ar", "arima", "croston", "ets", "nnetar", "theta"),
  function(model_name) {
    check_against_model(matches, model_name) %>%
      dplyr::mutate(ratio = RMSE / selected_RMSE) %>%
      # deliberately remove high ratios
      dplyr::filter(ratio < quantile(ratio, probs = 0.99, na.rm = TRUE) & ratio < 10) %>%
      dplyr::summarise(mean_ratio = mean(ratio, na.rm = TRUE)) %>%
      dplyr::mutate(model_name = model_name)
  }
) %>%
  dplyr::bind_rows()
This gives us the performance against each individual model. The ratio is the RMSE of the given model divided by the RMSE of the model we selected, averaged over the test series: a ratio above 1 means our selection beats that model (e.g. 1.2 means the single model's RMSE is 20% higher than ours), and a ratio below 1 means it does not.
knitr::kable(performance_vs_single_model)
So we pretty much beat any single model individually (cool!). Maybe that changes with different metrics, or we can do even better.
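As a quick check of the "different metrics" thought, the same comparison can be rerun with MAE instead of RMSE. This is a sketch only, assuming an MAE column survived the earlier NA/constant-column filtering of the accuracy data:

# Hedged sketch: repeat the single-model comparison with MAE instead of RMSE
performance_vs_single_model_mae <- purrr::map(
  c("ar", "arima", "croston", "ets", "nnetar", "theta"),
  function(model_name) {
    check_against_model(matches, model_name, metric = "MAE") %>%
      dplyr::mutate(ratio = MAE / selected_MAE) %>%
      dplyr::filter(ratio < quantile(ratio, probs = 0.99, na.rm = TRUE) & ratio < 10) %>%
      dplyr::summarise(mean_ratio = mean(ratio, na.rm = TRUE)) %>%
      dplyr::mutate(model_name = model_name)
  }
) %>%
  dplyr::bind_rows()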
How often do we do better, and how often do we do worse, though?
better_or_worse_than_single_model <- purrr::map(
  c("ar", "arima", "croston", "ets", "nnetar", "theta"),
  function(model_name) {
    check_against_model(matches, model_name) %>%
      dplyr::mutate(ratio = RMSE / selected_RMSE) %>%
      dplyr::filter(!is.na(ratio)) %>%
      dplyr::summarise(
        worse = sum(ratio < 1) / length(ratio),
        better_or_equal = sum(ratio >= 1) / length(ratio)
      ) %>%
      dplyr::mutate(model_name = model_name)
  }
) %>%
  dplyr::bind_rows()

knitr::kable(better_or_worse_than_single_model, digits = 2)
So even against fable::ETS we are still better about `r round(better_or_worse_than_single_model$better_or_equal[better_or_worse_than_single_model$model_name == "ets"] * 100)`% of the time.
But how do we measure up against the best model for each particular time series, again taking the ratio of RMSEs?
check_against_best_model <- function(acc_data, metric = "RMSE") {
  model_acc <- acc_data %>%
    dplyr::select(tidyselect::all_of(c(metric, "key"))) %>%
    dplyr::group_by(key) %>%
    # keep the best (lowest) value of the metric per series
    dplyr::mutate(ranking = dplyr::min_rank(.data[[metric]])) %>%
    dplyr::filter(ranking == 1) %>%
    dplyr::ungroup() %>%
    dplyr::select(tidyselect::all_of(c(metric, "key")))

  selected_acc <- acc_data %>%
    dplyr::filter(.model == selected_model) %>%
    dplyr::select(tidyselect::all_of(c("key", metric)))
  colnames(selected_acc)[colnames(selected_acc) == metric] <- paste0("selected_", metric)

  dplyr::full_join(model_acc, selected_acc, by = "key")
}

performance_vs_best_model <- check_against_best_model(matches) %>%
  dplyr::mutate(ratio = RMSE / selected_RMSE)

knitr::kable(
  dplyr::summarise(performance_vs_best_model, mean_ratio = mean(ratio, na.rm = TRUE))
)
Here we see that, on average, the best model's RMSE is some `r round((1 - mean(performance_vs_best_model$ratio, na.rm = TRUE)) * 100)`% lower than that of our pick. What is the rough distribution here?
(Note that if we always picked the best model, we would get a ratio of 1 - the lower the ratio, the worse our pick.)
performance_vs_best_model %>%
  ggplot2::ggplot(ggplot2::aes(x = ratio)) +
  ggplot2::geom_histogram()
That looks like we pick the best model (or close to it) quite often - let's check:
pick_rates <- performance_vs_best_model %>%
  dplyr::summarise(
    picked_almost_best = sum(ratio > 0.95, na.rm = TRUE) / length(na.omit(ratio)),
    picked_best = sum(ratio == 1, na.rm = TRUE) / length(na.omit(ratio))
  )

knitr::kable(pick_rates)
So we pick close to the best model about `r round(pick_rates$picked_almost_best * 100)`% of the time, and the absolute best model about 38% of the time.
How often do we fail, i.e. how often do we pick a model whose RMSE ratio against the best falls below 0.8 (the best model beating our pick by more than 20%)?
bad <- performance_vs_best_model %>%
  dplyr::summarise(picked_bad = sum(ratio < 0.8, na.rm = TRUE) / length(na.omit(ratio)))

knitr::kable(bad)
What are the scales for the series where we fail?
ordered <- performance_vs_best_model %>%
  dplyr::filter(ratio < 0.8) %>%
  dplyr::select(key, RMSE, selected_RMSE, ratio) %>%
  dplyr::arrange(dplyr::desc(selected_RMSE))

knitr::kable(ordered)
How many series are we looking at? `r nrow(dplyr::filter(performance_vs_best_model, ratio < 0.8))`
What the heck happened with series `r ordered[1, "key"]`? Perhaps it makes sense to try to explain the discrepancies.
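One way to start explaining the discrepancies is to simply pull every candidate model's accuracy for that worst offender and eyeball it - a small exploratory sketch, not used further below:

# Hedged sketch: inspect all candidate models for the worst offender in the table above
worst_key <- ordered[[1, "key"]]
accuracies %>%
  dplyr::filter(key == worst_key) %>%
  dplyr::arrange(RMSE) %>%
  knitr::kable()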
So we pick a truly bad model about `r bad$picked_bad*100`% of the time - unfortunate. Just how bad are these picks, and where do they tend to cluster?
performance_vs_best_model %>%
  dplyr::filter(ratio < 0.8) %>%
  ggplot2::ggplot(ggplot2::aes(x = ratio)) +
  ggplot2::geom_histogram()
Can we possibly model the ratios based on the features?
ratio_model <- performance_vs_best_model %>%
  dplyr::select(key, ratio) %>%
  dplyr::left_join(feature_df, by = c("key" = "keys")) %>%
  dplyr::select(-c("key")) %>%
  dplyr::select_if(~ !all(is.na(.x))) %>%
  dplyr::select_if(~ !all(.x == .x[1])) %>%
  na.omit()

ratio_importances <- ranger::ranger(
  y = dplyr::select(ratio_model, ratio) %>% unlist(),
  x = dplyr::select(ratio_model, -ratio) %>% as.matrix(),
  num.trees = 2000,
  importance = "impurity_corrected"
)

variable_importances <- ratio_importances[["variable.importance"]]
# rescale so they sum to 100
variable_importances <- (variable_importances / sum(variable_importances)) * 100

df <- data.frame(
  var_imp = variable_importances,
  var = names(variable_importances)
)

ggplot2::ggplot(df, ggplot2::aes(x = reorder(var, var_imp), y = var_imp, fill = var_imp)) +
  ggplot2::geom_bar(stat = "identity", position = "dodge") +
  ggplot2::coord_flip() +
  ggplot2::ylab("Variable Importance") +
  ggplot2::xlab("") +
  ggplot2::scale_fill_gradient(low = "red", high = "blue")
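Before reading too much into the importance plot, a quick hedged sanity check: ranger reports out-of-bag error statistics for the regression above, which give a rough sense of how much of the variation in the ratio the features explain at all.

# OOB R^2 of the ratio regression (how much of the ratio the features explain)
ratio_importances[["r.squared"]]
# ...and the corresponding OOB mean squared error
ratio_importances[["prediction.error"]]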
Huh, so perhaps we can learn more from the data (seeing that the variable importances do not seem to be random). Are there any anomalies in our data?
isolation <- isotree::isolation.forest(
  data = feature_df %>% dplyr::select(-keys),
  sample_size = 256,
  ntrees = 1000
)

# predict on the same columns the forest was trained on
iso_preds <- predict(isolation, feature_df %>% dplyr::select(-keys))
Can we just gradient boost this problem away, using all of the methods and all of the data?
soothsayer_data <- accuracies %>%
  dplyr::select(key, .model, RMSE) %>%
  dplyr::full_join(feature_df, by = c("key" = "keys")) %>%
  dplyr::select_if(~ !all(is.na(.x))) %>%
  dplyr::select_if(~ !all(.x == .x[1])) %>%
  tidyr::drop_na() %>%
  # sample entire groups to avoid bias
  dplyr::group_by(key) %>%
  dplyr::mutate(
    sample_type = sample(c("train", "test"), 1, replace = TRUE, prob = c(0.7, 0.3))
  ) %>%
  # turn RMSE into a within-series preference score (lower RMSE = higher score)
  dplyr::mutate(rank = 1 / (RMSE / sum(RMSE))) %>%
  dplyr::ungroup() %>%
  dplyr::select(-RMSE) %>%
  dplyr::relocate(rank, .before = .model) %>%
  dplyr::mutate(key = factor(key, levels = unique(key))) %>%
  dplyr::mutate(key = as.integer(key))

soothsayer_train <- soothsayer_data %>%
  dplyr::filter(sample_type == "train") %>%
  dplyr::select(-sample_type)

soothsayer_test <- soothsayer_data %>%
  dplyr::filter(sample_type == "test") %>%
  dplyr::select(-sample_type)

soothsayer_model_gbm <- gbm::gbm(
  rank ~ . - .model,
  data = soothsayer_train,
  distribution = list(
    name = "pairwise",
    group = "key",
    metric = "conc"
  ),
  n.trees = 50,
  shrinkage = 0.005,    # learning rate
  bag.fraction = 0.5,   # subsampling fraction
  train.fraction = 1,   # fraction of data for training
  n.minobsinnode = 10,
  interaction.depth = 2
)

preds <- predict(soothsayer_model_gbm, soothsayer_test)
gbm::gbm.perf(soothsayer_model_gbm)
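To actually use the boosted model for selection, the predicted scores still have to be turned into a per-series pick. A hedged sketch of that step, assuming a higher pairwise-"conc" score means a more preferred model (mirroring the winner-takes-all step above):

# Hedged sketch: per-series model choice from the gbm predictions
gbm_picks <- soothsayer_test %>%
  dplyr::mutate(pred_score = preds) %>%
  dplyr::group_by(key) %>%
  dplyr::slice_max(pred_score, n = 1, with_ties = FALSE) %>%
  dplyr::ungroup() %>%
  dplyr::select(key, selected_model = .model)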
TODO: Can we also use some anomaly detection algorithm, and does being an anomaly correlate with this error?
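A possible starting point for that TODO, assuming iso_preds is row-aligned with feature_df (one anomaly score per row): join the isolation-forest scores onto the per-series ratios and look at their rank correlation.

# Hedged sketch: does the anomaly score track how badly we pick?
anomaly_vs_ratio <- data.frame(
  key = feature_df$keys,
  anomaly_score = iso_preds
) %>%
  dplyr::inner_join(performance_vs_best_model, by = "key")

cor(
  anomaly_vs_ratio$anomaly_score,
  anomaly_vs_ratio$ratio,
  use = "complete.obs",
  method = "spearman"
)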