# R/data_cleaning.R
# In ubair: Effects of External Conditions on Air Quality
#
# Documented in: clean_data, get_meteo_available

#' Clean and Optionally Aggregate Environmental Data
#'
#' Restricts a long-format table of environmental measurements to a single
#' station, adds a `year` column derived from `date`, removes duplicated
#' measurements and optionally aggregates the values to daily means.
#'
#' @param env_data A data table in long format.
#' Must include columns:
#' \describe{
#'   \item{Station}{Station identifier for the data.}
#'   \item{Komponente}{Measured environmental component e.g. temperature, NO2.}
#'   \item{Wert}{Measured value.}
#'   \item{date}{Timestamp as Date-Time object (`YYYY-MM-DD HH:MM:SS` format).}
#'   \item{Komponente_txt}{Textual description of the component.}
#' }
#' @param station Character. Name of the station to filter by. Must be a
#'   single value.
#' @param aggregate_daily Logical. If `TRUE`, aggregates data to daily mean
#'   values. Default is `FALSE`.
#' @return A `data.table`:
#' \itemize{
#'   \item If `aggregate_daily = TRUE`: one row per `Station`, `Komponente`,
#'         `Komponente_txt`, `day` and `year`, with `Wert` holding the daily
#'         mean of the measurements (missing values are ignored).
#'   \item If `aggregate_daily = FALSE`: the rows of the requested station
#'         with duplicates removed and an additional `year` column.
#' }
#' @details Duplicate rows (by `date`, `Komponente`, and `Station`) are
#' removed. A warning is issued if duplicates are found.
#'
#' The input is not modified; all work happens on a copy.
#'
#' An error is raised if `station` is not of length one, if `aggregate_daily`
#' is not a single non-missing value, or if `env_data` lacks one of the
#' columns `Station`, `Komponente` or `date` (plus `Wert` and
#' `Komponente_txt` when `aggregate_daily = TRUE`).
#' @examples
#' # Example data
#' env_data <- data.table::data.table(
#'   Station = c("DENW094", "DENW094", "DENW006", "DENW094"),
#'   Komponente = c("NO2", "O3", "NO2", "NO2"),
#'   Wert = c(45, 30, 50, 40),
#'   date = as.POSIXct(c(
#'     "2023-01-01 08:00:00", "2023-01-01 09:00:00",
#'     "2023-01-01 08:00:00", "2023-01-02 08:00:00"
#'   )),
#'   Komponente_txt = c(
#'     "Nitrogen Dioxide", "Ozone", "Nitrogen Dioxide", "Nitrogen Dioxide"
#'   )
#' )
#'
#' # Clean data for station DENW094 without aggregation
#' cleaned_data <- clean_data(env_data, station = "DENW094", aggregate_daily = FALSE)
#' print(cleaned_data)
#' @export
clean_data <- function(env_data, station, aggregate_daily = FALSE) {
  # A longer `station` vector would be recycled by `==` and silently select
  # the wrong rows, so insist on a single identifier.
  if (length(station) != 1L) {
    stop("`station` must be a single station identifier.", call. = FALSE)
  }
  if (length(aggregate_daily) != 1L || is.na(aggregate_daily)) {
    stop("`aggregate_daily` must be a single TRUE or FALSE.", call. = FALSE)
  }

  # Fail early with a readable message instead of an error from deep inside
  # the helpers. The aggregation step additionally needs the value column and
  # the component description.
  required_cols <- c("Station", "Komponente", "date")
  if (aggregate_daily) {
    required_cols <- c(required_cols, "Wert", "Komponente_txt")
  }
  missing_cols <- setdiff(required_cols, names(env_data))
  if (length(missing_cols) > 0L) {
    stop(
      "`env_data` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  env_data <- .add_year_column(env_data)

  # Build the row selection outside of the table's scope so that a column
  # that happens to be called `station` cannot shadow the argument.
  # which() drops NA comparisons, i.e. rows without a station are excluded.
  station_rows <- which(env_data[["Station"]] == station)
  env_data <- env_data[station_rows, ]

  env_data_unique <- unique(env_data, by = c("date", "Komponente", "Station"))

  if (nrow(env_data_unique) < nrow(env_data)) {
    warning(sprintf(
      "%d duplicate row(s) were removed.",
      nrow(env_data) - nrow(env_data_unique)
    ))
  }
  if (aggregate_daily) {
    env_data_unique <- .aggregate_data(env_data_unique)
  }

  env_data_unique
}

#' Get Available Meteorological Components
#'
#' Identifies unique meteorological components from the provided environmental
#' data, filtering only those that match the predefined UBA naming conventions.
#' These components include "GLO", "LDR", "RFE", "TMP", "WIG", "WIR", "WIND_U",
#' and "WIND_V".
#' @param env_data Data table containing environmental data.
#' Must contain column "Komponente"
#' @return A vector of available meteorological components, in order of first
#' appearance in `env_data`. It is empty if none of the known components is
#' present.
#' @details The column is looked up by its exact name; an error is raised if
#' `env_data` has no column called "Komponente".
#' @examples
#' # Example environmental data
#' env_data <- data.table::data.table(
#'   Komponente = c("TMP", "NO2", "GLO", "WIR"),
#'   Wert = c(25, 40, 300, 50),
#'   date = as.POSIXct(c(
#'     "2023-01-01 08:00:00", "2023-01-01 09:00:00",
#'     "2023-01-01 10:00:00", "2023-01-01 11:00:00"
#'   ))
#' )
#' # Get available meteorological components
#' meteo_components <- get_meteo_available(env_data)
#' print(meteo_components)
#' @export
get_meteo_available <- function(env_data) {
  # `$` would partially match e.g. `Komponente_txt` when `Komponente` is
  # absent, so check for the exact name and extract with `[[`.
  if (!"Komponente" %in% names(env_data)) {
    stop(
      "`env_data` must contain a column named \"Komponente\".",
      call. = FALSE
    )
  }

  meteo_components <- c(
    "GLO", "LDR", "RFE", "TMP", "WIG", "WIR", "WIND_U", "WIND_V"
  )
  components_present <- unique(env_data[["Komponente"]])
  components_present[components_present %in% meteo_components]
}

#' Add a `year` column derived from `date`
#'
#' Works on a copy made with `data.table::copy()`, so the table passed in is
#' left untouched.
#'
#' @param env_data A data.table with a `date` column.
#' @return A data.table with an added year column.
#' @noRd
.add_year_column <- function(env_data) {
  with_year <- data.table::copy(env_data)
  # Functional form of `:=`; adds the column to the copy by reference.
  with_year[, `:=`(year = lubridate::year(date))]
}

#' Add a `day` column holding `date` floored to the start of its day
#'
#' The `day` column is the grouping key used for later aggregations.
#'
#' @param env_data A table with a `date` column.
#' @return `env_data` with an additional `day` column.
#' @noRd
.add_day_column <- function(env_data) {
  dplyr::mutate(
    env_data,
    day = lubridate::floor_date(date, unit = "day")
  )
}

#' Aggregate measurements to daily means
#'
#' Adds a `day` column via `.add_day_column()` and averages `Wert`, ignoring
#' missing values, within each combination of `Station`, `Komponente`,
#' `Komponente_txt`, `day` and `year`.
#'
#' @param env_data A data.table with the columns `Station`, `Komponente`,
#'   `Komponente_txt`, `date`, `year` and `Wert`.
#' @return A data.table aggregated to daily mean values and a new column day.
#' @noRd
.aggregate_data <- function(env_data) {
  with_day <- .add_day_column(env_data)
  with_day[,
    list(Wert = mean(Wert, na.rm = TRUE)),
    by = c("Station", "Komponente", "Komponente_txt", "day", "year")
  ]
}