Nothing
#' Prepare Data for Training a model
#'
#' Prepares environmental data by filtering for relevant components,
#' converting the data to a wide format, and adding temporal features. Should be
#' called before
#' \code{\link[ubair:split_data_counterfactual]{split_data_counterfactual()}}
#'
#' @param env_data A data table in long format.
#' Must include the following columns:
#' \describe{
#' \item{Station}{Station identifier for the data.}
#' \item{Komponente}{The environmental component being measured
#' (e.g., temperature, NO2).}
#' \item{Wert}{The measured value of the component.}
#' \item{date}{Timestamp as `POSIXct` object in `YYYY-MM-DD HH:MM:SS` format.}
#' \item{Komponente_txt}{A textual description of the component.}
#' }
#' @param params A list of modelling parameters loaded from `params.yaml`.
#' Must include:
#' \describe{
#' \item{meteo_variables}{A vector of meteorological variable names.}
#' \item{target}{The name of the target variable.}
#' }
#' @return A `data.table` in wide format, with columns:
#' `date`, one column per component, and temporal features
#' like `date_unix`, `day_julian`, `weekday`, and `hour`.
#' @examples
#' env_data <- data.table::data.table(
#' Station = c("StationA", "StationA", "StationA"),
#' Komponente = c("NO2", "TMP", "NO2"),
#' Wert = c(50, 20, 40),
#' date = as.POSIXct(c("2023-01-01 10:00:00", "2023-01-01 11:00:00", "2023-01-02 12:00:00"))
#' )
#' params <- list(meteo_variables = c("TMP"), target = "NO2")
#' prepared_data <- prepare_data_for_modelling(env_data, params)
#' print(prepared_data)
#'
#' @export
prepare_data_for_modelling <- function(env_data, params) {
components <- c(params$meteo_variables, params$target)
dt_filtered <- .extract_components(env_data, components)
dt_wide <- .cast_to_wide(dt_filtered)
if (!params$target %in% names(dt_wide)) {
warning(sprintf("Target '%s' is not present in the data. Make sure it exists
and you have set the correct target name", params$target))
stop("Exiting function due to missing target data.")
}
dt_prepared <- dt_wide %>%
dplyr::rename(value = params$target) %>%
.add_date_variables(replace = TRUE) %>%
dplyr::filter(!is.na(value))
dt_prepared
}
#' Turn date feature into temporal features date_unix, day_julian, weekday and
#' hour
#'
#' @param df Data.table with column date formatted as date-time object
#' @param replace Boolean which determines whether to replace existing temporal variables
#' @return A data.table with all relevant temporal features for modelling
#' @noRd
.add_date_variables <- function(df, replace) {
names <- names(df)
if (replace) {
df$date_unix <- as.numeric(df$date)
df$day_julian <- lubridate::yday(df$date)
df$weekday <- .wday_monday(df$date, as.factor = TRUE)
df$hour <- lubridate::hour(df$date)
} else {
if (!"date_unix" %in% names) {
df$date_unix <- as.numeric(df$date)
}
if (!"day_julian" %in% names) {
df$day_julian <- lubridate::yday(df$date)
}
if (!"weekday" %in% names) {
df$weekday <- .wday_monday(df$date, as.factor = TRUE)
}
if (!"hour" %in% names) {
df$hour <- lubridate::hour(df$date)
}
}
return(df)
}
#' Reformat lubridate weekdays into weekdays with monday as day 1
#'
#' @param x Vector of date-time objects
#' @param as.factor Boolean that determines whether to return a factor or a numeric vector
#' @noRd
.wday_monday <- function(x, as.factor = FALSE) {
x <- lubridate::wday(x)
x <- x - 1
x <- ifelse(x == 0, 7, x)
if (as.factor) {
x <- factor(x, levels = 1:7, ordered = TRUE)
}
return(x)
}
#' Split Data into Training and Application Datasets
#'
#' Splits prepared data into training and application datasets based on
#' specified date ranges for a business-as-usual scenario. Data before
#' `application_start` and after `application_end` is used as training data,
#' while data within the date range is used for application.
#'
#' @param dt_prepared The prepared data table.
#' @param application_start The start date(date object) for the application
#' period of the business-as-usual simulation. This coincides with the start of
#' the reference window.
#' Can be created by e.g. lubridate::ymd("20191201")
#' @param application_end The end date(date object) for the application period
#' of the business-as-usual simulation. This coincides with the end of
#' the effect window.
#' Can be created by e.g. lubridate::ymd("20191201")
#' @return A list with two elements:
#' \describe{
#' \item{train}{Data outside the application period.}
#' \item{apply}{Data within the application period.}
#' }
#' @examples
#' dt_prepared <- data.table::data.table(
#' date = as.Date(c("2023-01-01", "2023-01-05", "2023-01-10")),
#' value = c(50, 60, 70)
#' )
#' result <- split_data_counterfactual(
#' dt_prepared,
#' application_start = as.Date("2023-01-03"),
#' application_end = as.Date("2023-01-08")
#' )
#' print(result$train)
#' print(result$apply)
#' @export
split_data_counterfactual <- function(dt_prepared,
application_start,
application_end) {
stopifnot(
inherits(application_start, "Date"),
inherits(application_end, "Date")
)
stopifnot(application_start <= application_end)
dt_train <- dt_prepared[date < application_start | date > application_end]
dt_apply <- dt_prepared[date >= application_start & date <= application_end]
list(train = dt_train, apply = dt_apply)
}
#' Extract Components for Modelling
#' Stop with error message if any selected meteo variable/component is not
#' contained in the data.
#'
#' @param env_data Daily aggregated data table.
#' @param components Vector of component names to extract.
#' @return A data.table filtered by the specified components.
#' @noRd
.extract_components <- function(env_data, components) {
if (!all(components %in% env_data$Komponente)) {
missing_components <- components[!components %in% env_data$Komponente]
stop(paste(
"Data does not contain all selected variables:", missing_components,
"\n Check data and meteo_variables/params.yaml."
))
}
env_data[Komponente %in% components, list(Komponente, Wert, date)]
}
#' @param dt_filtered Filtered data.table.
#' @return A wide-format data.table.
#' @noRd
#' @examples
#' dt_filtered <- data.table::data.table(
#' date = as.POSIXct(c("2023-01-01", "2023-01-01", "2023-01-02")),
#' Komponente = c("NO2", "TMP", "NO2"),
#' Wert = c(50, 20, 40)
#' )
#' wide_data <- .cast_to_wide(dt_filtered)
#' print(wide_data)
.cast_to_wide <- function(dt_filtered) {
data.table::dcast(dt_filtered,
formula = date ~ Komponente,
value.var = "Wert"
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.