Nothing
Run multiple stations and models for the 9-Euro-Ticket analysis
# Configuration ----

# Directory where the measurement data is stored
data_dir <- "../../Daten/user_sample_data/"
sample_name <- "NeunEuroTicket"

# Target pollutant and the monitoring stations to analyse
target <- "NO2"
stations <- list(
  Luenen = "DENW006",
  AachenBurtscheid = "DENW094"
)

# Meteorological covariates used as model features
meteo_variables <- c("TMP", "RFE", "WIG", "WIR", "LDR")

# Analysis time windows: the reference period ends at application_start;
# the effect is evaluated from date_effect_start until application_end
application_start <- lubridate::ymd("20220301") # = start reference time
date_effect_start <- lubridate::ymd_hm("20220601 00:00")
application_end <- lubridate::ymd("20220831") # = end effect time
buffer <- 0 # number of data points to be ignored before effect
trend <- "linear"

# Hyperparameters can be set in params/params.yaml
model_types <- c("lightgbm", "rf", "dynamic_regression", "fnn")
window_size <- 14 # days of data to calculate the mean in prediction results
library(ubair)

# Load the raw UBA measurement data.
# This might take a few seconds for large files.
data <- load_uba_data_from_dir(data_dir = data_dir)

# Start from the package defaults, then override with the settings above
params <- load_params()
params$target <- target
params$meteo_variables <- meteo_variables
# Main loop ----
# For every station, fit each model type on the reference period,
# predict the business-as-usual counterfactual for the application
# period, and collect hourly predictions plus summary metrics.
for (station_name in names(stations)) {
  station <- stations[[station_name]]
  predictions_all <- data.table::data.table()
  metrics_all <- data.table::data.table()

  # Clean the station data, build model features, drop incomplete rows,
  # and split into reference (training) vs. application (effect) sets
  env_data <- clean_data(data, station = station)
  dt_prepared <- prepare_data_for_modelling(env_data, params)
  dt_prepared <- dt_prepared[complete.cases(dt_prepared)]
  split_data <- split_data_counterfactual(
    dt_prepared,
    application_start = application_start,
    application_end = application_end
  )

  for (model_type in model_types) {
    message(paste("start training:", station_name, station, model_type))
    res <- run_counterfactual(split_data, params,
      detrending_function = trend,
      model_type = model_type,
      alpha = 0.9,
      log_transform = FALSE
    )
    # copy() so the := assignments below don't modify res$prediction
    predictions <- data.table::copy(res$prediction)

    # plot
    bau_plot <- plot_counterfactual(predictions, params,
      window_size = window_size,
      date_effect_start,
      buffer = buffer
    )

    # evaluation
    metrics <- round(calc_performance_metrics(predictions, date_effect_start,
      buffer = buffer
    ), 2)
    effect <- estimate_effect_size(predictions, date_effect_start,
      buffer = buffer, verbose = FALSE
    )
    metrics["effect_size"] <- effect["absolute_effect"]
    metrics["relative_effect"] <- effect["relative_effect"]

    # add information for export
    # NOTE(review): assigning character values coerces the whole named
    # vector to character; it is transposed into a one-row data.table below
    metrics["model"] <- model_type
    metrics["trend"] <- trend
    metrics["station_name"] <- station_name
    metrics["station"] <- station
    metrics["buffer_start"] <- format(
      date_effect_start - as.difftime(buffer, units = "hours"),
      "%Y-%m-%d"
    )
    metrics["effect_start"] <- format(date_effect_start, "%Y-%m-%d")
    metrics_dt <- data.table::as.data.table(t(metrics))
    metrics_all <- rbind(metrics_all, metrics_dt)

    # tag the hourly predictions with their provenance and accumulate
    predictions[, station := station]
    predictions[, model := model_type]
    predictions[, trend := trend]
    predictions_all <- rbind(predictions_all, predictions)
  }

  # save predictions (hourly data) and metrics
  predictions_save <- dplyr::select(
    predictions_all,
    c(
      date, value, prediction, prediction_lower, prediction_upper,
      station, model, trend
    )
  )
  predictions_save$date <-
    format(predictions_save$date, "%Y-%m-%d %H:%M")
}
#> start training: Luenen DENW006 lightgbm
#> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000319 seconds.
#> You can set `force_row_wise=true` to remove the overhead.
#> And if memory is not enough, you can set `force_col_wise=true`.
#> [LightGBM] [Info] Total Bins 1549
#> [LightGBM] [Info] Number of data points in the train set: 60472, number of used features: 8
#> [LightGBM] [Info] Start training from score 0.000000
#> start training: Luenen DENW006 rf
#> start training: Luenen DENW006 dynamic_regression
#> Using data for dynamic regression training from 2021-01-22 01:00:00 to 2022-02-28 23:00:00. Too long training series can lead to worse performance. Adjust this via the dynamic_regression$ntrain hyperparameter.
#> start training: Luenen DENW006 fnn
#> start training: AachenBurtscheid DENW094 lightgbm
#> [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031247 seconds.
#> You can set `force_col_wise=true` to remove the overhead.
#> [LightGBM] [Info] Total Bins 1550
#> [LightGBM] [Info] Number of data points in the train set: 60039, number of used features: 8
#> [LightGBM] [Info] Start training from score -0.000000
#> start training: AachenBurtscheid DENW094 rf
#> start training: AachenBurtscheid DENW094 dynamic_regression
#> Using data for dynamic regression training from 2021-01-10 04:00:00 to 2022-02-28 23:00:00. Too long training series can lead to worse performance. Adjust this via the dynamic_regression$ntrain hyperparameter.
#> start training: AachenBurtscheid DENW094 fnn
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.