Nothing
#' Full counterfactual simulation run
#'
#' Chains detrending, training of a selected model, prediction and retrending together
#' for ease of use. See documentation of individual functions for details.
#' @param split_data List of two named dataframes called train and apply
#' @param params A list of parameters that define the following:
#' \describe{
#' \item{meteo_variables}{A character vector specifying the names of the
#' meteorological variables used as inputs.}
#' \item{model}{A list of hyperparameters for training the chosen model. Name of this list
#' and its parameters depend on the chosen models. See [ubair::run_dynamic_regression()],
#' [ubair::run_lightgbm()], [ubair::run_rf()] and [ubair::run_fnn()] functions for details}
#' }
#' @param detrending_function String which defines type of trend to remove.
#' Options are "linear","quadratic", "exponential", "spline", "none". See [ubair::detrend()]
#' and [ubair::retrend_predictions()] for details.
#' @param model_type String to decide which model to use. Current options random
#' forest "rf", gradient boosted decision trees "lightgbm", "dynamic_regression" and feedforward neural network "fnn"
#' @param alpha Confidence level of the prediction interval between 0 and 1.
#' @param log_transform If TRUE, uses log transformation during detrending and
#' retrending. For details see [ubair::detrend()] documentation
#' @param calc_shaps Boolean value. If TRUE, calculate SHAP values for the
#' method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz:sv_importance()}} and
#' \code{\link[shapviz:sv_dependence]{shapviz:sv_dependence()}}.
#' The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
#' test data.
#' @return Data frame of predictions, model and importance
#' @examples
#' data(mock_env_data)
#' split_data <- list(
#' train = mock_env_data[1:80, ],
#' apply = mock_env_data[81:100, ]
#' )
#' params <- load_params()
#' res <- run_counterfactual(split_data, params, detrending_function = "linear")
#' prediction <- res$retrended_predictions
#' random_forest_model <- res$model
#' @export
run_counterfactual <- function(split_data,
params,
detrending_function = "none",
model_type = "rf",
alpha = 0.9,
log_transform = FALSE,
calc_shaps = FALSE) {
variables <- c("day_julian", "weekday", "hour", params$meteo_variables)
detrended_list <- detrend(
split_data,
mode = detrending_function,
log_transform = log_transform
)
trend <- detrended_list$model
detrended_train <- detrended_list$train
detrended_apply <- detrended_list$apply
if (model_type == "rf") {
detrended_train <- detrended_train %>% select(value, !!variables)
res <- run_rf(
train = detrended_train,
test = detrended_apply,
model_params = params$random_forest,
alpha = alpha,
calc_shaps = calc_shaps
)
} else if (model_type == "lightgbm") {
detrended_train <- detrended_train %>%
select(value, dplyr::any_of(variables))
res <- run_lightgbm(
train = detrended_train,
test = detrended_apply,
model_params = params$lightgbm,
alpha = alpha,
calc_shaps = calc_shaps
)
} else if (model_type == "dynamic_regression") {
res <- run_dynamic_regression(
train = detrended_train,
test = detrended_apply,
params = params,
alpha = alpha,
calc_shaps = calc_shaps
)
} else if (model_type == "fnn") {
res <- suppressMessages(run_fnn(
train = detrended_train,
test = detrended_apply,
params = params,
calc_shaps = calc_shaps
))
} else {
stop("Wrong model_type. Select one of 'rf', 'lightgbm', 'dynamic_regression', 'fnn'.")
}
retrended_predictions <- retrend_predictions(res$dt_predictions,
trend,
log_transform = log_transform
)
list(prediction = retrended_predictions, model = res$model, importance = res$importance)
}
#' Run the dynamic regression model
#'
#' This function trains a dynamic regression model with fourier transformed temporal features
#' and meteorological variables as external regressors on the
#' specified training dataset and makes predictions on the test dataset in a
#' counterfactual scenario. This is referred to as a dynamic regression model in
#' [Forecasting: Principles and Practise, Chapter 10 - Dynamic regression models](https://otexts.com/fpp3/dynamic.html)
#'
#' Note: Runs the dynamic regression model for individualised use with own data pipeline.
#' Otherwise use [ubair::run_counterfactual()] to call this function.
#' @param train Dataframe of train data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param test Dataframe of test data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param params list of hyperparameters to use in dynamic_regression call. Only uses ntrain to specify
#' the number of data points to use for training. Default is 8760 which results in
#' 1 year of hourly data
#' @param alpha Confidence level of the prediction interval between 0 and 1.
#' @param calc_shaps Boolean value. If TRUE, calculate SHAP values for the
#' method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz:sv_importance()}} and
#' \code{\link[shapviz:sv_dependence]{shapviz:sv_dependence()}}.
#' The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
#' test data.
#' @return Data frame of predictions and model
#' @importFrom forecast BoxCox.lambda auto.arima forecast
#' @export
run_dynamic_regression <- function(train,
test,
params,
alpha,
calc_shaps) {
train <- train %>% dplyr::filter(date < test$date[1])
# 24 * 365 = 8760 (1 year of training data)
ntrain <- ifelse(is.null(params$dynamic_regression$ntrain), 8760, params$dynamic_regression$ntrain)
train <- utils::tail(train, ntrain)
message(
paste("Using data for dynamic regression training from ", min(train$date), "to ", max(train$date)),
". Too long training series can lead to worse performance. Adjust this via the dynamic_regression$ntrain hyperparameter."
)
train_transformed <- .transform_input(train, params)
test_transformed <- .transform_input(test, params)
scale_result <- scale_data(
train_data = train_transformed,
apply_data = test_transformed
)
xreg <- scale_result$train %>%
dplyr::select(-value) %>%
as.matrix()
xreg_pred <- scale_result$apply %>%
dplyr::select(-value) %>%
as.matrix()
y <- scale_result$train$value %>% stats::ts()
model <- forecast::auto.arima(y,
d = 0, xreg = xreg,
seasonal = FALSE,
trace = FALSE,
allowdrift = TRUE,
allowmean = TRUE,
lambda = NULL,
biasadj = TRUE
)
pred <- forecast::forecast(
object = model,
xreg = xreg_pred,
level = alpha,
lambda = NULL,
biasadj = TRUE
)
test$prediction <- pred$mean %>% as.numeric()
test$prediction_lower <- pred$lower[, 1] %>% as.numeric()
test$prediction_upper <- pred$upper[, 1] %>% as.numeric()
dt_predictions <- rescale_predictions(
scale_result = scale_result,
dt_predictions = test
)
if (calc_shaps) {
rlang::check_installed(c("fastshap", "shapviz"),
reason = "calculate shap values for dynamic regression"
)
shap_vals <- fastshap::explain(
model,
X = xreg,
nsim = 100,
newdata = xreg_pred,
pred_wrapper = function(object, newdata) {
fc_prediction <- forecast::forecast(
object = object,
xreg = newdata,
lambda = NULL,
biasadj = TRUE
)
return(fc_prediction$mean %>% as.numeric())
},
shap_only = FALSE
)
shp <- shapviz::shapviz(shap_vals)
} else {
shp <- NULL
}
list(dt_predictions = dt_predictions, model = model, importance = shp)
}
#' Run random forest model with ranger
#'
#' This function trains a random forest model (ranger) on the
#' specified training dataset and makes predictions on the test dataset in a
#' counterfactual scenario. The model uses meteorological variables and temporal features.
#'
#' Note: Runs the random forest model for individualised use with own data pipeline.
#' Otherwise use [ubair::run_counterfactual()] to call this function.
#' @param train Dataframe of train data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param test Dataframe of test data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param model_params list of hyperparameters to use in ranger call. See \code{\link[ranger:ranger]{ranger:ranger()}} for options.
#' @param alpha Confidence level of the prediction interval between 0 and 1.
#' @param calc_shaps Boolean value. If TRUE, calculate SHAP values for the
#' method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz:sv_importance()}} and
#' \code{\link[shapviz:sv_dependence]{shapviz:sv_dependence()}}.
#' The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
#' test data.
#' @return List with data frame of predictions and model
#' @import ranger
#' @export
run_rf <- function(train, test, model_params, alpha, calc_shaps) {
function_parameters <- list(
y = train$value,
x = train %>% select(-value),
importance = "none",
splitrule = "variance",
keep.inbag = TRUE,
quantreg = TRUE
)
function_parameters <- append(function_parameters, model_params)
model <- do.call(ranger::ranger, args = function_parameters)
quantiles <- data.table::as.data.table(predict(model,
test,
type = "quantiles",
quantiles = c((1 - alpha) / 2, (1 + alpha) / 2)
)$predictions)
if (calc_shaps) {
rlang::check_installed(c("treeshap", "shapviz"),
reason = "calculate shap values for random forest"
)
shap_subset <- test[sample(nrow(test), min(nrow(test), 1000)), ] %>% select(-value)
unified <- treeshap::ranger.unify(model, data.matrix(train %>% select(-value)))
treeshap_vals <- treeshap::treeshap(unified, data.matrix(shap_subset), verbose = FALSE)
shp <- shapviz::shapviz(treeshap_vals, X_pred = data.matrix(shap_subset), X = shap_subset)
} else {
shp <- NULL
}
test[, "prediction" := predict(model, test)$predictions]
test[, c("prediction_lower", "prediction_upper") := quantiles]
list(dt_predictions = test, model = model, importance = shp)
}
#' Run gradient boosting model with lightgbm
#'
#' This function trains a gradient boosting model (lightgbm) on the
#' specified training dataset and makes predictions on the test dataset in a
#' counterfactual scenario. The model uses meteorological variables and temporal features.
#'
#' Note: Runs the gradient boosting model for individualised use with own data pipeline.
#' Otherwise use [ubair::run_counterfactual()] to call this function.
#' @param train Dataframe of train data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param test Dataframe of test data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param model_params list of hyperparameters to use in lgb.train call.
#' See \code{\link[lightgbm:lgb.train]{lightgbm:lgb.train()}} params argument for details.
#' @param alpha Confidence level of the prediction interval between 0 and 1.
#' @param calc_shaps Boolean value. If TRUE, calculate SHAP values for the
#' method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz:sv_importance()}} and
#' \code{\link[shapviz:sv_dependence]{shapviz:sv_dependence()}}.
#' The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
#' test data.
#' @return List with data frame of predictions and model
#' @import lightgbm
#' @examples
#' \donttest{
#' data(mock_env_data)
#' split_data <- list(
#' train = mock_env_data[1:80, ],
#' apply = mock_env_data[81:100, ]
#' )
#' params <- load_params()
#' variables <- c("day_julian", "weekday", "hour", params$meteo_variables)
#' res <- run_lightgbm(
#' train = mock_env_data[1:80, ],
#' test = mock_env_data[81:100, ], params$lightgbm, alpha = 0.9,
#' calc_shaps = FALSE
#' )
#' prediction <- res$dt_predictions
#' model <- res$model
#' }
#' @export
run_lightgbm <- function(train, test, model_params, alpha, calc_shaps) {
dtrain <- lgb.Dataset(
data = data.matrix(train %>% select(-value)),
label = train$value
)
model_mean <- lgb.train(
params = model_params[names(model_params) != "nrounds"],
data = dtrain,
nrounds = model_params$nrounds
)
params_lower <- append(
model_params[names(model_params) != "nrounds"],
list(
"objective" = "quantile",
"alpha" = (1 - alpha) / 2,
"verbosity" = 0
)
)
model_lower <- lgb.train(
params = params_lower,
data = dtrain,
nrounds = model_params$nrounds
)
params_upper <- append(
model_params[names(model_params) != "nrounds"],
list(
"objective" = "quantile",
"alpha" = (1 + alpha) / 2,
"verbosity" = 0
)
)
model_upper <- lgb.train(
params = params_upper,
data = dtrain,
nrounds = model_params$nrounds
)
dapply <- test %>% select(!!colnames(train))
dapply <- data.matrix(dapply %>% select(-value))
dt_predictions <- test
dt_predictions$prediction <- predict(model_mean, dapply)
dt_predictions$prediction_lower <- predict(model_lower, dapply)
dt_predictions$prediction_upper <- predict(model_upper, dapply)
if (calc_shaps) {
rlang::check_installed(c("shapviz"),
reason = "calculate shap values for lightgbm"
)
shp <- shapviz::shapviz(model_mean, X_pred = dapply, X = dapply)
} else {
shp <- NULL
}
list(dt_predictions = dt_predictions, model = model_mean, importance = shp)
}
#' Train a Feedforward Neural Network (FNN) in a Counterfactual Scenario.
#'
#' Trains a feedforward neural network (FNN) model on the
#' specified training dataset and makes predictions on the test dataset in a
#' counterfactual scenario. The model uses meteorological variables and
#' sin/cosine-transformed features. Scales the data before training and rescales
#' predictions, as the model does not converge with unscaled data.
#'
#' This function provides flexibility for users with their own data pipelines
#' or workflows. For a simplified pipeline, consider using
#' \code{\link[ubair:run_counterfactual]{run_counterfactual()}}.
#'
#' Experiment with hyperparameters such as \code{learning_rate},
#' \code{batchsize}, \code{hidden_layers}, and \code{num_epochs} to improve
#' performance.
#'
#' Warning: Using many or large hidden layers in combination with a high number
#' of epochs can lead to long training times.
#'
#' @param train A data frame or tibble containing the training dataset,
#' including the target variable (`value`)
#' and meteorological variables specified in `params$meteo_variables`.
#' @param test A data frame or tibble containing the test dataset on which
#' predictions will be made,
#' using the same meteorological variables as in the training dataset.
#' @param params A list of parameters that define the following:
#' \describe{
#' \item{meteo_variables}{A character vector specifying the names of the
#' meteorological variables used as inputs.}
#' \item{fnn}{A list of hyperparameters for training the feedforward neural
#' network, including:
#' \itemize{
#' \item \code{activation_fun}: The activation function for the hidden
#' layers (e.g., "sigmoid", "tanh").
#' \item \code{momentum}: The momentum factor for training.
#' \item \code{learningrate_scale}: Factor for adjusting learning rate.
#' \item \code{output_fun}: The activation function for the output layer
#' \item \code{batchsize}: The size of the batches during training.
#' \item \code{hidden_dropout}: Dropout rate for the hidden layers to
#' prevent overfitting.
#' \item \code{visible_dropout}: Dropout rate for the input layer.
#' \item \code{hidden_layers}: A vector specifying the number of neurons
#' in each hidden layer.
#' \item \code{num_epochs}: Number of epochs (iterations) for training.
#' \item \code{learning_rate}: Initial learning rate.
#' }
#' }
#' }
#' @param calc_shaps Boolean value. If TRUE, calculate SHAP values for the
#' method used and format them so they can be visualised with
#' \code{\link[shapviz:sv_importance]{shapviz:sv_importance()}} and
#' \code{\link[shapviz:sv_dependence]{shapviz:sv_dependence()}}.
#' The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
#' test data.
#' @return A list with three elements:
#' \describe{
#' \item{\code{dt_predictions}}{A data frame containing the test data along
#' with the predicted values:
#' \describe{
#' \item{\code{prediction}}{The predicted values from the FNN model.}
#' \item{\code{prediction_lower}}{The same predicted values, as no
#' quantile model is available yet for FNN.}
#' \item{\code{prediction_upper}}{The same predicted values, as no
#' quantile model is available yet for FNN.}
#' }
#' }
#' \item{\code{model}}{The trained FNN model object from the
#' \code{deepnet::nn.train()} function.}
#' \item{\code{importance}}{SHAP importance values (if
#' \code{calc_shaps = TRUE}). Otherwise, `NULL`.}
#' }
#' @examples
#' data(mock_env_data)
#' params <- load_params()
#' res <- run_fnn(
#' train = mock_env_data[1:80, ],
#' test = mock_env_data[81:100, ], params,
#' calc_shaps = FALSE
#' )
#' @export
run_fnn <- function(train, test, params, calc_shaps) {
rlang::check_installed(c("deepnet"), reason = "to run a fnn")
train_transformed <- .transform_input(train, params)
test_transformed <- .transform_input(test, params)
scale_result <- scale_data(
train_data = train_transformed,
apply_data = test_transformed
)
train_matrix <- scale_result$train %>%
dplyr::select(-value) %>%
as.matrix()
test_matrix <- scale_result$apply %>%
dplyr::select(-value) %>%
as.matrix()
target_vector <- scale_result$train$value
fnn_function_parameter <- append(
list(x = train_matrix, y = target_vector),
params$fnn
)
model <- do.call(deepnet::nn.train, args = fnn_function_parameter)
prediction <- deepnet::nn.predict(model, test_matrix)
# no quantil model yet for feedforward neural network
test <- test %>%
mutate(
prediction = prediction,
prediction_lower = prediction,
prediction_upper = prediction
)
test <- rescale_predictions(
scale_result = scale_result,
dt_predictions = test
)
if (calc_shaps) {
rlang::check_installed(c("fastshap", "shapviz"),
reason = "calculate shap values for fnn"
)
shap_vals <- fastshap::explain(
model,
X = train_matrix,
nsim = 50,
newdata = test_matrix,
pred_wrapper = function(object, newdata) {
deepnet::nn.predict(
nn = object,
x = newdata
) %>% as.numeric()
},
shap_only = FALSE
)
shp <- shapviz::shapviz(shap_vals)
} else {
shp <- NULL
}
list(dt_predictions = test, model = model, importance = shp)
}
#' Make fourier features out of hour, weekday and day of the year
#'
#' @return Numeric matrix of sin and cos transformed temporal features of dimension
#' n x 6.
#' @noRd
.make_fourier_features <- function(df) {
df$weekday <- df$weekday %>% as.numeric()
fourier_list <- list(
sin((24 * (df$day_julian) + df$hour) / (24 * 365) * 2 * pi),
cos((24 * (df$day_julian) + df$hour) / (24 * 365) * 2 * pi),
sin((24 * (df$weekday - 1) + df$hour) / (24 * 7) * 2 * pi),
cos((24 * (df$weekday - 1) + df$hour) / (24 * 7) * 2 * pi),
sin(df$hour / 24 * 2 * pi),
cos(df$hour / 24 * 2 * pi)
)
res_matrix <- matrix(unlist(fourier_list), ncol = 6, byrow = FALSE)
colnames(res_matrix) <- c(
"sin_year", "cos_year", "sin_week", "cos_week",
"sin_hour", "cos_hour"
)
res_matrix
}
#' Transform Data to cyclic features for fnn/dynamic regression
#'
#' @return Numeric matrix combining the selected meteorological variables
#' (transformed wind vectors as U and V components) and the Fourier features.
#' @noRd
.transform_input <- function(input_data, params) {
input_meteo <- input_data %>% select(!!params$meteo_variables)
if ("WIG" %in% params$meteo_variables && "WIR" %in% params$meteo_variables) {
input_meteo <- .make_wind_vectors(input_meteo) %>% select(-WIG, -WIR)
}
fourier_features <- .make_fourier_features(input_data)
transformed <- cbind(input_meteo, fourier_features, input_data[, "value"])
transformed
}
#' Creates wind vectors from direction and speed
#'
#' Takes a dataframe with columns WIG (wind speed) and
#' WIR (wind direction in degrees 0 to 360) and creates wind vectors with U and
#' V component
#' @return Data frame of the same format as the input with additional columns
#' "WIND_U" and "WIND_V".
#' @noRd
.make_wind_vectors <- function(dt_prepared) {
dt_prepared <- dt_prepared %>% mutate(
"WIND_U" = sin(pi * WIR / 180) * WIG,
"WIND_V" = cos(pi * WIR / 180) * WIG
)
dt_prepared
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.