knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(torch) #library(torchts) library(rsample) library(dplyr, warn.conflicts = FALSE) library(ggplot2) library(parsnip) library(timetk) devtools::load_all()
parsnip
API provides quick and convenient way to train time series models based on torch
library.
First, let's prepare input data set using excelent timetk
library. weather_pl
dataset from torchts
package contains 19-year history of weather data from two Polish "poles of extreme temperature", i.e. Suwałki and Tarnów. In this excercise, we'll use a slice of data registered in the "pole of warmth", i.e in the city of Tarnów.
tarnow_temp <- weather_pl %>% filter(station == "TRN") %>% select(date, tmax_daily, tmin_daily, press_mean_daily) tk_summary_diagnostics(tarnow_temp, date) tarnow_temp <- tarnow_temp %>% mutate(year_day = as.numeric(lubridate::yday(date)))
As we can see, this is a time series between 1th of Janury 2001 till the end of 2020 - twenty years.
In the next step, we'll split our data into two training and testing datasets using a handy time_series_split
function.
EPOCHS <- 10 HORIZON <- 3 TIMESTEPS <- 28 data_split <- time_series_split( tarnow_temp, date, initial = "18 years", assess = "2 years", lag = TIMESTEPS ) # Training # TODO: błąd z integerem (categorical?) # TODO: bind_result() function to bind forecast and ground truth # TODO: select loss function # TODO: add smoothed series (like a sine) devtools::load_all() # debugonce(torchts_rnn) # debugonce(as_ts_dataset.data.frame) rnn_model <- rnn( timesteps = TIMESTEPS, horizon = HORIZON, epochs = EPOCHS, learn_rate = 0.001, hidden_units = 20, batch_size = 32 ) %>% set_device('cuda') %>% fit(tmax_daily ~ date, data = training(data_split)) rnn_model
We want to generate forecast for a full year. To do so, we'll remove the outcome column values
we'd like to forecast. Bear in mind that input data differs from the inputs for classical tabular-data algorithms.
In this specific case, we have to keep some "history" (in this case: r TIMESTEPS
steps, i.e: r TIMESTEPS
days).
# Clear outcome variable # Ostrzeżenie, jeśli wyczyszczono za dużo danych # Są bugi w datasecie cleared_new_data <- testing(data_split) %>% clear_outcome(date, tmax_daily, TIMESTEPS) # Forecast fcast <- rnn_model %>% predict(new_data = testing(data_split)) fcast <- rnn_model %>% predict(new_data = cleared_new_data) plot_forecast( testing(data_split), fcast, tmax_daily )
In the next next, we're adding new predictor: press_mean_daily
. It means that we extend the autoregressive model we obtained in the previous exercise by providing a variable that could be recognised as external regressor (what is a term commonly used in the context of methods like ARIMA etc.).
TIMESTEPS <- 20 HORIZON <- 1 # Training rnn_model <- rnn( timesteps = TIMESTEPS, horizon = HORIZON, epochs = 2, hidden_units = 5, batch_size = 32, scale = TRUE ) %>% fit(tmax_daily ~ press_mean_daily + tmax_daily + index(date), data = training(data_split))
# Clear outcome variable cleared_new_data <- testing(data_split) %>% clear_outcome(date, tmax_daily, 20) # Forecast fcast <- rnn_model %>% predict(new_data = testing(data_split)) fcast_vs_true <- bind_cols( n = 1:nrow(testing(data_split)), actual = testing(data_split)$tmax_daily, fcast ) %>% tidyr::pivot_longer(c(actual, .pred)) ggplot(fcast_vs_true) + geom_line(aes(n, value, col = name)) + theme_minimal() + ggtitle("Forecast vs actual values")
Finally, the most complex example. Here we'd like to forecast future values of
two mutually related variables, i.e.: tmin_daily
and tmax_daily
.
In this model we're dealing with a "double" autoregressive model, similar in some respects to the vector autoregession model (VAR).
TIMESTEPS <- 20 HORIZON <- 1 # Training rnn_model <- rnn( timesteps = TIMESTEPS, horizon = HORIZON, epochs = 25, hidden_units = 5, batch_size = 32, scale = TRUE ) %>% fit(tmin_daily + tmax_daily ~ tmin_daily + tmax_daily + index(date), data = training(data_split))
# Clear outcome variable cleared_new_data <- testing(data_split) %>% clear_outcome(date, c(tmax_daily, tmin_daily), 20) # Forecast fcast <- rnn_model %>% predict(new_data = cleared_new_data) tmax_daily <- bind_cols( n = 1:nrow(testing(data_split)), actual = testing(data_split)$tmax_daily, forecast = fcast$.pred_tmax_daily, variable = "tmax_daily" ) %>% tidyr::pivot_longer(c(actual, forecast)) tmin_daily <- bind_cols( n = 1:nrow(testing(data_split)), actual = testing(data_split)$tmin_daily, forecast = fcast$.pred_tmin_daily, variable = "tmin_daily" ) %>% tidyr::pivot_longer(c(actual, forecast)) fcast_vs_true <- bind_rows(tmax_daily, tmin_daily) ggplot(fcast_vs_true) + geom_line(aes(n, value, col = name)) + theme_minimal() + ggtitle("Forecast vs actual values") + facet_wrap(~variable)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.