In krzjoa/torchts: Time series Models with torch

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(torch)
#library(torchts)
library(rsample)
library(dplyr, warn.conflicts = FALSE)
library(ggplot2)
library(parsnip)
library(timetk)

devtools::load_all()

Preparing dataset

parsnip API provides quick and convenient way to train time series models based on torch library.

First, let's prepare input data set using excelent timetk library. weather_pl dataset from torchts package contains 19-year history of weather data from two Polish "poles of extreme temperature", i.e. Suwałki and Tarnów. In this excercise, we'll use a slice of data registered in the "pole of warmth", i.e in the city of Tarnów.

tarnow_temp <- 
  weather_pl %>% 
  filter(station == "TRN") %>% 
  select(date, tmax_daily, tmin_daily, press_mean_daily)

tk_summary_diagnostics(tarnow_temp, date)

tarnow_temp <- 
  tarnow_temp %>% 
  mutate(year_day = as.numeric(lubridate::yday(date)))

As we can see, this is a time series between 1th of Janury 2001 till the end of 2020 - twenty years. In the next step, we'll split our data into two training and testing datasets using a handy time_series_split function.

Univariate time series

EPOCHS    <- 10
HORIZON   <- 3
TIMESTEPS <- 28

data_split <- 
  time_series_split(
    tarnow_temp, date, 
    initial = "18 years",
    assess  = "2 years", 
    lag     = TIMESTEPS
  )

# Training 
# TODO: błąd z integerem (categorical?)
# TODO: bind_result() function to bind forecast and ground truth
# TODO: select loss function
# TODO: add smoothed series (like a sine)

devtools::load_all()
# debugonce(torchts_rnn)
# debugonce(as_ts_dataset.data.frame)

rnn_model <- 
  rnn(
    timesteps    = TIMESTEPS,
    horizon      = HORIZON,
    epochs       = EPOCHS,
    learn_rate   = 0.001,
    hidden_units = 20,
    batch_size   = 32
  ) %>% 
  set_device('cuda') %>% 
  fit(tmax_daily ~ date, 
      data = training(data_split))

rnn_model

We want to generate forecast for a full year. To do so, we'll remove the outcome column values we'd like to forecast. Bear in mind that input data differs from the inputs for classical tabular-data algorithms. In this specific case, we have to keep some "history" (in this case: r TIMESTEPS steps, i.e: r TIMESTEPS days).

# Clear outcome variable
# Ostrzeżenie, jeśli wyczyszczono za dużo danych
# Są bugi w datasecie 

cleared_new_data <- 
  testing(data_split) %>% 
  clear_outcome(date, tmax_daily, TIMESTEPS)

# Forecast
fcast <-
  rnn_model %>%
  predict(new_data = testing(data_split))

fcast <-
  rnn_model %>%
  predict(new_data = cleared_new_data)

plot_forecast(
  testing(data_split),
  fcast,
  tmax_daily
)

Multivariate time series

In the next next, we're adding new predictor: press_mean_daily. It means that we extend the autoregressive model we obtained in the previous exercise by providing a variable that could be recognised as external regressor (what is a term commonly used in the context of methods like ARIMA etc.).

TIMESTEPS <- 20
HORIZON   <- 1

# Training 
rnn_model <- 
  rnn(
    timesteps    = TIMESTEPS,
    horizon      = HORIZON,
    epochs       = 2,
    hidden_units = 5,
    batch_size   = 32,
    scale        = TRUE 
  ) %>% 
  fit(tmax_daily ~ press_mean_daily + tmax_daily + index(date), 
      data = training(data_split))

# Clear outcome variable
cleared_new_data <- 
  testing(data_split) %>% 
  clear_outcome(date, tmax_daily, 20)

# Forecast
fcast <-
  rnn_model %>%
  predict(new_data = testing(data_split))

fcast_vs_true <- 
  bind_cols(
    n = 1:nrow(testing(data_split)),
    actual = testing(data_split)$tmax_daily, 
    fcast
  ) %>% 
  tidyr::pivot_longer(c(actual, .pred))


ggplot(fcast_vs_true) + 
  geom_line(aes(n, value, col = name)) +
  theme_minimal() + 
  ggtitle("Forecast vs actual values")

Multivariate time series with multiple outcomes

Finally, the most complex example. Here we'd like to forecast future values of two mutually related variables, i.e.: tmin_daily and tmax_daily. In this model we're dealing with a "double" autoregressive model, similar in some respects to the vector autoregession model (VAR).

TIMESTEPS <- 20
HORIZON   <- 1

# Training 
rnn_model <- 
  rnn(
    timesteps    = TIMESTEPS,
    horizon      = HORIZON,
    epochs       = 25,
    hidden_units = 5,
    batch_size   = 32,
    scale        = TRUE 
  ) %>% 
  fit(tmin_daily + tmax_daily ~ tmin_daily + tmax_daily + index(date), 
      data = training(data_split))

# Clear outcome variable
cleared_new_data <- 
  testing(data_split) %>% 
  clear_outcome(date, c(tmax_daily, tmin_daily), 20)

# Forecast
fcast <-
  rnn_model %>%
  predict(new_data = cleared_new_data)

tmax_daily <- 
  bind_cols(
    n        = 1:nrow(testing(data_split)),
    actual   = testing(data_split)$tmax_daily, 
    forecast = fcast$.pred_tmax_daily,
    variable = "tmax_daily"
  ) %>% 
  tidyr::pivot_longer(c(actual, forecast))

tmin_daily <- 
  bind_cols(
    n        = 1:nrow(testing(data_split)),
    actual   = testing(data_split)$tmin_daily, 
    forecast = fcast$.pred_tmin_daily,
    variable = "tmin_daily"
  ) %>% 
  tidyr::pivot_longer(c(actual, forecast))

fcast_vs_true <- 
  bind_rows(tmax_daily, tmin_daily)

ggplot(fcast_vs_true) + 
  geom_line(aes(n, value, col = name)) +
  theme_minimal() + 
  ggtitle("Forecast vs actual values") + 
  facet_wrap(~variable)