Nothing
#' Removes trend from data
#'
#' Takes a list of train and application data as prepared by
#' [ubair::split_data_counterfactual()]
#' and removes a polynomial, exponential or cubic spline spline trend function.
#' Trend is obtained only from train data. Use as part of preprocessing before
#' training a model based on decision trees, i.e. random forest and lightgbm.
#' For the other methods it may be helpful but they are generally able to
#' deal with trends themselves. Therefore we recommend to try out different
#' versions and guide decisisions using the model evaluation metrics from
#' [ubair::calc_performance_metrics()].
#'
#' Apply [ubair::retrend_predictions()] to predictions to return to the
#' original data units.
#'
#' @param split_data List of two named dataframes called train and apply
#' @param mode String which defines type of trend is present. Options are
#' "linear", "quadratic", "exponential", "spline", "none".
#' "none" returns original data
#' @param num_splines Defines the number of cubic splines if `mode="spline"`.
#' Choose num_splines=1 for cubic polynomial trend. If `mode!="spline"`, this
#' parameter is ignored
#' @param log_transform If `TRUE`, use a log-transformation before detrending
#' to ensure positivity of all predictions in the rest of the pipeline.
#' A exp transformation is necessary during retrending to return to the solution
#' space. Use only in combination with `log_transform` parameter in
#' [ubair::retrend_predictions()]
#' @return List of 3 elements. 2 dataframes: detrended train, apply and the
#' trend function
#' @examples
#' data(mock_env_data)
#' split_data <- list(
#' train = mock_env_data[1:80, ],
#' apply = mock_env_data[81:100, ]
#' )
#' detrended_list <- detrend(split_data, mode = "linear")
#' detrended_train <- detrended_list$train
#' detrended_apply <- detrended_list$apply
#' trend <- detrended_list$model
#' @export
#' @importFrom stats lm predict spline
#' @import splines
detrend <- function(split_data,
mode = "linear",
num_splines = 5,
log_transform = FALSE) {
dt_train_new <- data.table::copy(split_data$train)
dt_apply_new <- data.table::copy(split_data$apply)
stopifnot("log_transform needs to be boolean, i.e. either TRUE or FALSE" = class(log_transform) == "logical")
if (log_transform) {
dt_train_new$value <- log(dt_train_new$value)
dt_apply_new$value <- log(dt_apply_new$value)
}
if (mode == "linear") {
trend <- lm(value ~ date_unix, data = dt_train_new)
} else if (mode == "quadratic") {
trend <- lm(value ~ date_unix + I(date_unix^2), data = dt_train_new)
} else if (mode == "exponential") {
trend <- lm(value ~ log(date_unix), data = dt_train_new)
} else if (mode == "spline") {
stopifnot("Set num_splines larger or equal to 1" = num_splines >= 1)
knots <- seq(
from = min(dt_train_new$date_unix),
to = max(dt_train_new$date_unix),
length.out = num_splines + 1
)
knots <- knots[2:(length(knots) - 1)]
if (length(knots) <= 2) { # If 2 or less knots, just fit cubic polynomial
trend <- lm(value ~ date_unix + I(date_unix^2) + I(date_unix^3),
data = dt_train_new
)
} else { # Else use splines
trend <- lm(value ~ bs(date_unix, knots = knots), data = dt_train_new)
}
} else if (mode == "none") {
trend <- lm(value ~ 1 - 1, data = dt_train_new)
} else {
stop("mode needs to be any of the following strings: 'linear',
'quadratic', 'exponential', 'spline', 'none'")
}
trend_train_values <- predict(trend, newdata = dt_train_new)
dt_train_new$value <- dt_train_new$value - trend_train_values
trend_apply_values <- predict(trend, newdata = dt_apply_new)
dt_apply_new$value <- dt_apply_new$value - trend_apply_values
return(list(train = dt_train_new, apply = dt_apply_new, model = trend))
}
#' Restors the trend in the prediction
#'
#' Takes a dataframe of predictions as returned by any of
#' the 'run_model' functions and restores a trend which was previously
#' removed via [ubair::detrend()]. This is necessary for the predictions
#' and the true values to have the same units. The function is basically
#' the inverse function to [ubair::detrend()] and should only be used in
#' combination with it.
#'
#' @param dt_predictions Dataframe of predictions with columns `value`,
#' `prediction`, `prediction_lower`, `prediction_upper`
#' @param trend lm object generated by [ubair::detrend()]
#' @param log_transform Returns values to solution space, if they have been
#' log transformed during detrending. Use only in combination with `log_transform`
#' parameter in detrend function.
#' @return Retrended dataframe with same structure as `dt_predictions`
#' which is returned by any of the run_model() functions.
#' @examples
#' \donttest{
#' data(mock_env_data)
#' split_data <- list(
#' train = mock_env_data[1:80, ],
#' apply = mock_env_data[81:100, ]
#' )
#' params <- load_params()
#' detrended_list <- detrend(split_data,
#' mode = "linear"
#' )
#' trend <- detrended_list$model
#' detrended_train <- detrended_list$train
#' detrended_apply <- detrended_list$apply
#' result <- run_lightgbm(
#' train = detrended_train,
#' test = detrended_apply,
#' model_params = params$lightgbm,
#' alpha = 0.9,
#' calc_shaps = FALSE
#' )
#' retrended_predictions <- retrend_predictions(result$dt_predictions, trend)
#' }
#' @export
retrend_predictions <- function(dt_predictions, trend, log_transform = FALSE) {
stopifnot("log_transform needs to be boolean, i.e. TRUE or FALSE" = class(log_transform) == "logical")
stopifnot("trend object needs to be a linear model of class 'lm'" = class(trend) == "lm")
stopifnot(
"Not all of 'value', 'prediction', 'prediction_lower', 'prediction_upper' are columns in dt_predictions" =
all(c("value", "prediction", "prediction_lower", "prediction_upper")
%in% colnames(dt_predictions))
)
trend_value <- predict(trend, newdata = dt_predictions)
if (log_transform) {
dt_predictions[,
c("value", "prediction", "prediction_lower", "prediction_upper") := lapply(.SD, \(x) exp(x + trend_value)),
.SDcols = c("value", "prediction", "prediction_lower", "prediction_upper")
]
} else {
dt_predictions[,
c("value", "prediction", "prediction_lower", "prediction_upper") := lapply(.SD, \(x) x + trend_value),
.SDcols = c("value", "prediction", "prediction_lower", "prediction_upper")
]
}
dt_predictions
}
#' Standardize Training and Application Data
#'
#' This function standardizes numeric columns of the `train_data` and applies
#' the same scaling (mean and standard deviation) to the corresponding columns
#' in `apply_data`. It returns the standardized data along with the scaling
#' parameters (means and standard deviations). This is particularly important
#' for neural network approaches as they tend to be numerically unstable and
#' deteriorate otherwise.
#'
#' @param train_data A data frame containing the training dataset to be
#' standardized. It must contain numeric columns.
#' @param apply_data A data frame containing the dataset to which the scaling
#' from `train_data` will be applied.
#'
#' @return A list containing the following elements:
#' \item{train}{The standardized training data.}
#' \item{apply}{The `apply_data` scaled using the means and standard deviations
#' from the `train_data`.}
#' \item{means}{The means of the numeric columns in `train_data`.}
#' \item{sds}{The standard deviations of the numeric columns in `train_data`.}
#' @export
#' @importFrom dplyr mutate across where
#' @examples
#' data(mock_env_data)
#' detrended_list <- list(
#' train = mock_env_data[1:80, ],
#' apply = mock_env_data[81:100, ]
#' )
#' scale_result <- scale_data(
#' train_data = detrended_list$train,
#' apply_data = detrended_list$apply
#' )
#' scaled_train <- scale_result$train
#' scaled_apply <- scale_result$apply
scale_data <- function(train_data,
apply_data) {
means <- attr(
scale(train_data %>% select(where(is.numeric))),
"scaled:center"
)
sds <- attr(
scale(train_data %>% select(where(is.numeric))),
"scaled:scale"
)
train_data <- train_data %>%
dplyr::mutate_if(is.numeric, ~ as.numeric(scale(.)))
apply_data <- apply_data %>%
mutate(across(
names(sds),
~ as.numeric(scale(.,
center = means[dplyr::cur_column()],
scale = sds[dplyr::cur_column()]
))
))
list(
train = train_data,
apply = apply_data,
means = means,
sds = sds
)
}
#' Rescale predictions to original scale.
#'
#' This function rescales the predicted values (`prediction`, `prediction_lower`,
#' `prediction_upper`). The scaling is reversed using the means and
#' standard deviations that were saved from the training data. It is the inverse
#' function to [ubair::scale_data()] and should be used only in combination.
#'
#' @param scale_result A list object returned by [ubair::scale_data()],
#' containing the means and standard deviations used for scaling.
#' @param dt_predictions A data frame containing the predictions,
#' including columns `prediction`, `prediction_lower`, `prediction_upper`.
#'
#' @return A data frame with the predictions and numeric columns rescaled back
#' to their original scale.
#'
#' @export
#' @examples
#' \donttest{
#' data(mock_env_data)
#' scale_res <- scale_data(
#' train_data = mock_env_data[1:80, ],
#' apply_data = mock_env_data[81:100, ]
#' )
#' params <- load_params()
#' res <- run_lightgbm(
#' train = scale_res$train, test = scale_res$apply,
#' params$lightgbm, alpha = 0.9, calc_shaps = FALSE
#' )
#' dt_predictions <- res$dt_predictions
#' rescaled_predictions <- rescale_predictions(scale_res, dt_predictions)
#' }
rescale_predictions <- function(scale_result, dt_predictions) {
means <- scale_result$means
sds <- scale_result$sds
rescaled_predictions <- dt_predictions %>%
mutate(
prediction = prediction * sds["value"] + means["value"],
prediction_lower = prediction_lower * sds["value"] + means["value"],
prediction_upper = prediction_upper * sds["value"] + means["value"]
)
return(rescaled_predictions)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.