In mrfoliveira/Evaluation-procedures-for-forecasting-with-spatio-temporal-data: Evaluation procedures for forecasting with spatio-temporal data

This is a report generating the figures and tables in the paper:

Oliveira, Mariana, Luís Torgo, and Vítor Santos Costa. "Evaluation Procedures for Forecasting with Spatio-Temporal Data." In Proceedings of the European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases, ECML-PKDD (pp. 703–718). Springer, Cham, 2018.

library(knitr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(scmamp)

knitr::opts_chunk$set(echo = FALSE, eval = TRUE,
                     warning = FALSE, message = FALSE,
                     fig.width = 8, fig.path = "images/")

source("../../R/analyse_utils.R")
cbPal <- c("#5e3c99", "#e66101")

#' Abbreviate default experiment method names for use in plots
#'
#' @param x a vector of method names
#'
#' @return A vector of abbreviated names
abbr_est_names <- function(x){
  x <- gsub("^x", "CV", x)
  x <- gsub("^p", "P", x)
  x <- gsub("^h", "H", x)
  x <- gsub("^mc", "MC", x)

  x <- gsub("T", "t", x)
  x <- gsub("S", "s", x)
  x <- gsub("sP", "SP", x)
  x <- gsub("\\_t$", "\\_T", x)
  x <- gsub("\\_s$", "\\_S", x)
  x <- gsub("\\_st$", "\\_ST", x)
  x <- gsub("\\_stM$", "\\_STM", x)
  x <- gsub("sP", "SP", x)

  x <- gsub("all", "A", x)
  x <- gsub("block", "B", x)
  x <- gsub("rand", "R", x)
  x <- gsub("contig", "C", x)
  x <- gsub("checker", "S", x)
  x <- gsub("sliding", "slW", x)
  x <- gsub("growing", "grW", x)

  x <- gsub("CVtbsr_STM", "CVtBsR_STM", x)

  x
}

load("art_sumRes_med.Rdata")

sumRes <- sumRes %>% 
  mutate(
    estimator = abbr_est_names(as.character(estimator)),
    Err = estimated - real, 
    RelAbsErr = abs(estimated - real)/real,
    RelErr = (estimated - real)/real,
    AbsErr = abs(estimated - real)) %>%
  filter(metric=="nmae")

resMats <- sumRes %>% 
  select(-c(real, estimated:RelErr)) %>%
        tidyr::spread(estimator, AbsErr)

Artificial results

Median Errors

x <- sumRes %>% 
  filter(!grepl("slW", estimator),
         !grepl("rmSP", estimator)) %>%
  group_by(metric, estimator) %>%
  mutate(med = median(Err, na.rm=T),
         sd = sd(Err, na.rm=T),
         medErr = ifelse(med>0, "over", ifelse(med<0, "under", "acc"))) %>%
  separate(estimator, into = c("estimator", "buffer"), sep="_", fill="right") %>%
    mutate(buffer=ifelse(is.na(buffer) & !grepl("(H|MC)", estimator), "CV", ifelse(!grepl("(S|T)", buffer), "OOS", paste0("CV-", buffer)))) %>%
  mutate(buffer=gsub("buff", "", buffer))


gg1 <- ggplot(x, aes(x=estimator, y=Err, color = medErr)) + 
  geom_boxplot() + 
  scale_color_manual(values=cbPal) +
  facet_grid(.~buffer, scales = "free_x", space = "free_x") +
  theme(text = element_text(size=20),
        strip.text = element_text(angle=90),
        axis.text.x = element_text(angle = 90, hjust = 1 )) + 
  geom_hline(yintercept=0, linetype = "dashed")

print(gg1)

Fig. 1: Box plots of estimation errors incurred by cross-validation and out-of-sample procedures on 96 artificial data sets using 2 learning algorithms

Relative errors

x <- sumRes %>% 
  filter(!grepl("rmSP", estimator),
         !grepl("slW", estimator)) %>%
  mutate(Type=cut(RelAbsErr, 
                  breaks = c(0,0.005,0.01,Inf),
                  labels = c("[0,0.5]%","]0.5,1]%", ">1%"),
                  include.lowest=TRUE)) %>%
  group_by(metric, estimator, Type) %>%
  summarize(nType = n()) %>%
  mutate(frac = nType/sum(nType)) %>%
  separate(estimator, into = c("estimator", "buffer"), sep="_", fill="right") %>%
    mutate(buffer=ifelse(is.na(buffer) & !grepl("(H|MC)", estimator), "CV", ifelse(!grepl("(S|T)", buffer), "OOS", paste0("CV-", buffer)))) %>%
  mutate(buffer=gsub("buff", "", buffer))

gg4 <- ggplot(x, 
       aes(x=estimator, y=frac, fill=Type)) + 
  geom_bar(stat="identity") + 
  facet_grid(.~buffer, scales = "free", space = "free") + 
    theme(text = element_text(size=20),
          axis.text.x = 
            element_text(angle = 90, hjust = 1),
          strip.text.x = element_text(angle=90)) +
  scale_fill_brewer(palette="RdPu")

print(gg4)

Fig. 3(a) Bar plots of relative estimation errors incurred by cross-validation and out-of-sample procedures on 96 artificial ~~and 17 real-world~~ data sets using 2 learning algorithms. Note the different legends

Absolute errors

Best overall

MODELS <- c("lm", "ranger")
for(model in MODELS){
  g_size <- "GRID_SZ_64"
  t_size <- "TIME_SZ_150"

  cat(paste("\n\n####", g_size, t_size, model))
  resMat <- resMats %>%
    filter(
      grepl("nmae", metric),
      grepl(!!quo(UQ(model)), model),
      grepl(!!quo(UQ(g_size)), g_size),
      grepl(!!quo(UQ(t_size)), t_size))

  r <- resMat[, which(colnames(resMat) %in% c(c("CVtBsA", "CVtBsA_T", "MC56.4", "PtBsR_grW", "CVtRsR", "H80.20")))]
  colnames(r) <- gsub("\\_grW", "", colnames(r))
  colnames(r) <- gsub("buff", "", colnames(r))
  plotCD(r, decreasing=FALSE, cex=1.25)
}

Fig. 4: Critical difference diagram according to Friedman-Nemenyi test (at 5\% confidence level) for a subset of estimation procedures using 24 artificial data sets ($64\times64$ grid; 150 time-points each)

Table 3: Average ranks of absolute errors incurred by cross-validation and out-of-sample procedures when estimating performance on 96 artificial data sets. ~~Best results in bold~~

oos_rs <- list()
x_rs <- list()

G_SIZES <- unique(resMats$g_size)
T_SIZES <- unique(resMats$t_size)

for(model in MODELS){
  oos_rs[[model]] <- list()
  x_rs[[model]] <- list()
  for(g_size in G_SIZES){
    oos_rs[[model]][[g_size]] <- list()
    x_rs[[model]][[g_size]] <- list()
    for(t_size in T_SIZES){
      r <- resMats %>%
          filter(
            grepl("nmae", metric),
            grepl(!!quo(UQ(model)), model))
      r_oos <- r[,which(grepl("(H|MC|^P)", colnames(r)) & !grepl("rmSP", colnames(r)) &
            !grepl("slW", colnames(r)))]
      r_x <- r[,grep("CV", colnames(r))]

      oos_rs[[model]][[g_size]][[t_size]] <- apply(apply(r_oos,1,rank),1,mean)
      oos_rs[[model]][[g_size]][[t_size]] <- data.frame(estimator=names(oos_rs[[model]][[g_size]][[t_size]]),avgR=oos_rs[[model]][[g_size]][[t_size]])

      x_rs[[model]][[g_size]][[t_size]] <- apply(apply(r_x,1,rank),1,mean)
      x_rs[[model]][[g_size]][[t_size]] <- data.frame(estimator=names(x_rs[[model]][[g_size]][[t_size]]),avgR=x_rs[[model]][[g_size]][[t_size]])
    }
  }
}
x_rs <- bind_rows(lapply(x_rs, function(x) bind_rows(lapply(x, function(y) bind_rows(y, .id="t_size")), .id="g_size")), .id="model") %>% 
  spread(estimator, avgR)

oos_rs <- bind_rows(lapply(oos_rs, function(x) bind_rows(lapply(x, function(y) bind_rows(y, .id="t_size")), .id="g_size")), .id="model") %>% 
  spread(estimator, avgR)

x_rs %>% group_by(model) %>% summarize_if(is.numeric, mean) %>% knitr::kable(digits=2)
oos_rs %>% group_by(model) %>% summarize_if(is.numeric, mean) %>% knitr::kable(digits=2)

Real results

Median Errors

load("real_sumRes_med_NArm.Rdata")

realSumResTab <- realSumRes %>% 
  mutate(estimator=as.character(estimator),
         estimator = abbr_est_names(estimator)) %>%
  mutate(
    Err = estimated - real, 
    RelAbsErr = abs(estimated - real)/real,
    RelErr = (estimated - real)/real,
    AbsErr = abs(estimated - real)) %>%
  filter(metric=="nmae") %>%
  filter(!(estimator %in% c("CVtRsR_ST", "CVtRsR_T")))

realResMats <- realSumResTab %>% 
  select(-c(real, estimated:RelErr)) %>%
        tidyr::spread(estimator, AbsErr)

x <- realSumResTab %>%
            filter(!grepl("rmSP", estimator),
                   !grepl("slW", estimator)) %>%
  group_by(metric, estimator) %>%
  mutate(med = median(Err, na.rm=T),
         sd = sd(Err, na.rm=T),
         medErr = ifelse(med>0, "over", ifelse(med<0, "under", "acc"))) %>%
  separate(estimator, into = c("estimator", "buffer"), sep="_", fill="right") %>%
    mutate(buffer=ifelse(is.na(buffer) & !grepl("(H|MC)", estimator), "CV", ifelse(!grepl("(S|T)", buffer), "OOS", paste0("CV-", buffer)))) %>%
  mutate(buffer=gsub("buff", "", buffer))

ggplot(x %>% filter(!is.na(Err)), 
              aes(x=estimator, y=Err, color = medErr)) + 
  geom_boxplot() + 
  scale_color_manual(values=cbPal) +
  facet_grid(.~buffer, scales = "free_x", space = "free_x") +
  theme(text = element_text(size=20),
        strip.text = element_text(angle = 90),
          axis.text.x = element_text(angle = 90, hjust = 1 )) + 
  geom_hline(yintercept=0, linetype = "dashed")

Fig. 2: Box plots of estimation errors incurred by cross-validation and out-of-sample procedures on 17 real world data sets using 2 learning algorithms

Relative errors

x <- realSumResTab %>% 
  filter(!grepl("slW", estimator),
         !grepl("rmSP", estimator)) %>%
  mutate(RelErr = abs(estimated - real)/real) %>%
  mutate(Type=cut(RelErr, 
                  breaks = c(-Inf,-0.3,-0.05,0,0.05,0.3,Inf),
                  labels = c("<-30","[-30,-5]","]-5,0]","]0,5]","]5,30]", ">30"),
                  include.lowest=TRUE)) %>%
  group_by(metric, estimator, Type) %>%
  summarize(nType = n()) %>%
  mutate(frac = nType/sum(nType)) %>%
  ungroup() %>%
  separate(estimator, into = c("estimator", "buffer"), sep="_", fill="right") %>%
  mutate(buffer=ifelse(is.na(buffer) & !grepl("(H|MC)", estimator), 
                       "CV", 
                       ifelse(!grepl("(S|T)", buffer), "OOS", paste0("CV-", buffer)))) %>%
  mutate(buffer=gsub("buff", "", buffer))

ggplot(x, 
       aes(x=estimator, y=frac, fill=Type)) + 
  facet_grid(.~buffer, space = "free_x", scales="free_x") +
  geom_bar(stat="identity") + 
    theme(text = element_text(size=20),
          axis.text.x = 
            element_text(angle = 90, hjust = 1),
          strip.text.x = element_text(angle=90)) +
  scale_fill_brewer(palette="RdPu")

Fig. 3(b): Bar plots of relative estimation errors incurred by cross-validation and out-of-sample procedures on ~~96 artificial and~~ 17 real-world data sets using 2 learning algorithms. Note the different legends

Absolute errors

for(model in c("lm", "rf")){
  cat(paste0("\n\n####", model,"\n"))
  resMat <- realResMats %>%
    filter(
      grepl("nmae", metric),
           grepl(!!quo(UQ(model)), model))

    r <- resMat[, c("CVtBsA", "CVtBsR", "CVtRsR_S", "PtBsR_grW", "H80.20", "PtBsA_grW", "CVtRsR")]
    colnames(r)<-gsub("buff", "", colnames(r))
    colnames(r)<-gsub("\\_grW", "", colnames(r))
    r <- r[complete.cases(r),]
    plotCD(r, decreasing=FALSE, cex=1.25)
}

Fig. 5 Critical difference diagram according to Friedman-Nemenyi test (at 5\% confidence level) for a subset of estimation procedures using real data sets

Table 4: Average ranks of absolute errors incurred by cross-validation and out-of-sample procedures when estimating performance on 17 real-world data sets. ~~Best results in bold~~

MODELS <- c("lm", "rf")
oos_rs <- list()
x_rs <- list()
for(model in MODELS){
  r <- realResMats %>%
      filter(
        grepl("nmae", metric),
        grepl(!!quo(UQ(model)), model))
  r_oos <- r[,which(grepl("(H|MC|^P)", colnames(r)) & !grepl("rmSP", colnames(r)) &
        !grepl("slW", colnames(r)))]
  r_x <- r[,grep("CV", colnames(r))]

  oos_rs[[model]] <- apply(apply(r_oos,1,rank),1,mean)
  oos_rs[[model]] <- data.frame(estimator=names(oos_rs[[model]]),avgR=oos_rs[[model]])

  x_rs[[model]] <- apply(apply(r_x,1,rank),1,mean)
  x_rs[[model]] <- data.frame(estimator=names(x_rs[[model]]),avgR=x_rs[[model]])
}
x_rs <- bind_rows(x_rs, .id="model") %>% spread(estimator, avgR)
oos_rs <- bind_rows(oos_rs, .id="model") %>% spread(estimator, avgR)

#print(xtable::xtable(x_rs), include.rownames = FALSE, booktabs = TRUE)
#print(xtable::xtable(oos_rs), include.rownames = FALSE, booktabs = TRUE )

x_rs %>% knitr::kable(digits=2)
oos_rs %>% knitr::kable(digits=2)