In krzjoa/torchts: Time series Models with torch

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(torch)
#library(torchts)
library(rsample)
library(dplyr, warn.conflicts = FALSE)
library(ggplot2)
library(parsnip)
library(timetk)

# write.csv(weather_pl, file = "../weather_pl.csv")

devtools::load_all()

Preparing dataset

parsnip API provides quick and convenient way to train time series models based on torch library.

First, let's prepare input data set using excelent timetk library. weather_pl dataset from torchts package contains 19-year history of weather data from two Polish "poles of extreme temperature", i.e. Suwałki and Tarnów. In this excercise, we'll use a slice of data registered in the "pole of warmth", i.e in the city of Tarnów.

tarnow_temp <- 
  weather_pl %>% 
  filter(station == "TRN") %>% 
  select(date, tmax_daily, tmin_daily, press_mean_daily)

tk_summary_diagnostics(tarnow_temp, date)

tarnow_temp <- 
  tarnow_temp %>% 
  mutate(year_day =  as.numeric(lubridate::yday(date)))

As we can see, this is a time series between 1th of Janury 2001 till the end of 2020 - twenty years. In the next step, we'll split our data into two training and testing datasets using a handy time_series_split function.

Univariate time series

EPOCHS    <- 10
HORIZON   <- 7
TIMESTEPS <- 28

data_split <- 
  time_series_split(
    tarnow_temp, date, 
    initial = "18 years",
    assess  = "2 years", 
    lag     = TIMESTEPS
  )

mlp_model <-
  torchts_mlp(
    tmax_daily ~ date + year_day,
    data         = training(data_split),
    timesteps    = TIMESTEPS,
    horizon      = HORIZON,
    jump         = 1, 
    epochs       = EPOCHS,
    learn_rate   = 0.001,
    hidden_units = c(50, 30), 
    batch_size   = 32,
    scale        = TRUE,
    device       = 'cuda' 
  )

We want to generate forecast for a full year. To do so, we'll remove the outcome column values we'd like to forecast. Bear in mind that input data differs from the inputs for classical tabular-data algorithms. In this specific case, we have to keep some "history" (in this case: r TIMESTEPS steps, i.e: r TIMESTEPS days).

# Clear outcome variable
# Ostrzeżenie, jeśli wyczyszczono za dużo danych
cleared_new_data <- 
  testing(data_split) %>% 
  clear_outcome(date, c(tmax_daily), TIMESTEPS)

fcast <-
  mlp_model %>%
  predict(new_data = testing(data_split))

fcast <-
  mlp_model %>%
  predict(new_data = cleared_new_data)

fcast <- tibble(.pred = fcast)

# plot(ts(fcast))

plot_forecast(
  testing(data_split),
  fcast,
  tmax_daily
)

# fcast_vs_true <- 
#   bind_cols(
#     n = 1:nrow(testing(data_split)),
#     actual = testing(data_split)$tmax_daily, 
#     fcast
#   ) %>% 
#   tidyr::pivot_longer(c(actual, .pred))
#   
# ggplot(fcast_vs_true) + 
#   geom_line(aes(n, value, col = name)) +
#   theme_minimal() + 
#   ggtitle("Forecast vs actual values")

# test_input <- c(
#   -0.9743, -1.0750, -1.3369, -1.2966, -1.2866, -0.9440, -1.2866, -1.3168,
#          -0.8634, -0.9138, -1.0548, -0.9440, -0.8231, -1.2160, -1.1757, -0.8634,
#          -0.6519, -0.5612, -0.8131, -1.2765, -1.1153, -0.6922, -0.3799, -0.5914,
#          -0.9843
# )
# 
# inp <- torch_tensor(array(test_input, dim = c(1, 28)))$cuda()
# 
# 
# mlp_model$net(x_num = inp)

Multivariate time series

In the next next, we're adding new predictor: press_mean_daily. It means that we extend the autoregressive model we obtained in the previous exercise by providing a variable that could be recognised as external regressor (what is a term commonly used in the context of methods like ARIMA etc.).

EPOCHS    <- 10
HORIZON   <- 7
TIMESTEPS <- 28

data_split <- 
  time_series_split(
    tarnow_temp, date, 
    initial = "18 years",
    assess  = "2 years", 
    lag     = TIMESTEPS
  )

rnn_model <-
  torchts_rnn(
    tmax_daily ~  date + tmax_daily + year_day,
    data = training(data_split),
    timesteps    = TIMESTEPS,
    horizon      = HORIZON,
    jump         = 1, 
    epochs       = EPOCHS,
    learn_rate   = 0.001,
    hidden_units = 30,
    batch_size   = 32,
    scale        = TRUE,
    device       = 'cuda' 
  )

# Clear outcome variable
cleared_new_data <- 
  testing(data_split) %>% 
  clear_outcome(date, tmax_daily, 20)

# Forecast
fcast <-
  rnn_model %>%
  predict(new_data = testing(data_split))

plot_forecast(
  head(testing(data_split), nrow(fcast)),
  fcast,
  tmax_daily
)