R/time_pad.R

Defines functions time_pad_worker time_pad

Documented in time_pad

#' Function to pad time series.
#'
#' \code{time_pad} use is similar to \strong{openair}'s \code{timeAverage}, but
#' the aggregation of values does not occur. \code{time_pad} does not drop 
#' non-numerical variables, can include identifiers post-pad, and can start and
#' end a padded time-series at a "nice place", for example, at the beginning of
#' an hour or day. 
#' 
#' \code{time_pad} pads a time series by calculating the maximum and minimum 
#' dates within a time series and then generating a uniform date sequence 
#' between the maximum and minimum dates. This date sequence is then joined to 
#' the input data frame and the missing values are represented as \code{NA}. 
#' 
#' @param df A tibble/data frame including parsed dates. The date 
#' variable/column must be named \code{date}.
#' 
#' @param interval Interval of returned time series. Some examples could be: 
#' \code{"min"} \code{"hour"}, \code{"day"}, \code{"month"}, \code{"year"} but 
#' multiples such as \code{"5 min"} work too. \code{interval} can also be a 
#' numeric value such as \code{0.5} which is useful for sub-second padding. 
#' 
#' @param by Should \code{time_pad} apply the padding function to groups within
#' \code{df}? This is helpful when there are many sites/other identifiers within
#' \code{df} which need to be padded individually. 
#' 
#' @param round What date-unit should the first and last observations be rounded
#' to? This allows the padded time-series to begin and end at a "nice place". 
#' Examples are \code{"hour"}, \code{"day"}, \code{"month"}, and \code{"year"}.
#' 
#' @param full Should the date joining use the \code{full_join} function? If 
#' \code{TRUE}, no input dates will be lost but the default is \code{FALSE}. 
#' 
#' @param warn Should the function give a warning when dates are duplicated? 
#' Default is \code{TRUE}. 
#' 
#' @seealso See \code{\link{round_date_interval}}, \code{timeAverage}, 
#' \code{\link{round_date}}, \code{\link{left_join}}, 
#' \code{\link{aggregate_by_date}}
#' 
#' @author Stuart K. Grange
#' 
#' @return Tibble. 
#' 
#' @examples
#' 
#' \dontrun{
#' 
#' # Pad time series so every minute is present
#' data_nelson_pad <- time_pad(data_nelson, interval = "min", round = "day")
#' 
#' # Keep identifying variables "site" and "sensor"
#' data_ozone_sensor_pad <- time_pad(
#'   data_ozone_sensor, 
#'   interval = "hour", 
#'   by = c("site", "sensor")
#' )
#' 
#' }
#' 
#' @export
time_pad <- function(df, interval = "hour", by = NA, round = NA, full = FALSE, 
                     warn = TRUE) {
  
  # Check input
  if (nrow(df) == 0) {
    cli::cli_abort("Input data frame has no observations.")
  }
  
  # Check if input has a date variable
  if (!"date" %in% names(df)) {
    cli::cli_abort(
      "Input must contain a date variable/column and must be named `date`."
    )
  }
  
  # Missing-ness test
  if (any(is.na(df$date))) {
    cli::cli_abort("`date` must not contain missing (`NA`) values.")
  }
  
  # Check class of date too
  if (!lubridate::is.POSIXct(df$date)) {
    cli::cli_abort("`date` must be a POSIXct date.")
  }
  
  # Clean an argument a bit
  interval <- interval %>% 
    stringr::str_to_lower() %>% 
    stringr::str_trim()
  
  # Switch interval if required
  interval <- dplyr::case_when(
    interval %in% c("second", "seconds") ~ "sec",
    interval %in% c("minute", "minutes") ~ "min",
    interval == "hours" ~ "hour",
    .default = interval
  )
  
  # Attempt to make interval a numeric value for sub-second padding
  interval <- tryCatch({
    as.numeric(interval)
  }, warning = function(w) {
    interval
  })
  
  # For dplyr's grouping
  if (is.na(by[1])) {
    by <- NULL
  }
  
  # Pad by group
  df <- df %>% 
    group_by(across(dplyr::all_of(by))) %>%
    dplyr::group_modify(
      ~time_pad_worker(
        ., 
        interval = interval, 
        by = by,
        round = round, 
        full = full,
        warn = warn
      ),
      .keep = FALSE
    ) %>% 
    ungroup()
  
  return(df)
  
}


time_pad_worker <- function(df, interval, by, round, full, warn) {
  
  # Find the start and end of the date sequence
  if (is.na(round)) {
    # No date rounding, use date values in df
    date_start <- min(df$date)
    date_end <- max(df$date)
  } else {
    # Date rounding
    date_start <- lubridate::floor_date(min(df$date), round)
    date_end <- lubridate::ceiling_date(max(df$date), round)
  }
  
  # Create the sequence of dates
  date_sequence <- seq(date_start, date_end, by = interval)
  
  # Remove final observation if ceiling rounded
  if (!is.na(round)) date_sequence <- date_sequence[-length(date_sequence)]
  
  # Make a tibble with a single date variable
  df_dates <- tibble(date = date_sequence)
  
  # Do the padding
  if (full) {
    df <- df %>% 
      dplyr::full_join(df_dates, ., by = join_by(date)) %>% 
      arrange(date)
  } else {
    df <- left_join(df_dates, df, by = join_by(date))
  }
  
  # Raise a warning if there are duplicated dates
  if (warn && any_duplicated(df$date)) {
    cli::cli_warn("Duplicated dates have been detected...")
  }
  
  return(df)
  
}
skgrange/threadr documentation built on May 11, 2024, 12:16 p.m.