#' Function to pad time series.
#'
#' \code{time_pad} use is similar to \strong{openair}'s \code{timeAverage}, but
#' the aggregation of values does not occur. \code{time_pad} does not drop
#' non-numerical variables, can include identifiers post-pad, and can start and
#' end a padded time-series at a "nice place", for example, at the beginning of
#' an hour or day.
#'
#' \code{time_pad} pads a time series by calculating the maximum and minimum
#' dates within a time series and then generating a uniform date sequence
#' between the maximum and minimum dates. This date sequence is then joined to
#' the input data frame and the missing values are represented as \code{NA}.
#'
#' @param df A tibble/data frame including parsed dates. The date
#' variable/column must be named \code{date}.
#'
#' @param interval Interval of returned time series. Some examples could be:
#' \code{"min"} \code{"hour"}, \code{"day"}, \code{"month"}, \code{"year"} but
#' multiples such as \code{"5 min"} work too. \code{interval} can also be a
#' numeric value such as \code{0.5} which is useful for sub-second padding.
#'
#' @param by Should \code{time_pad} apply the padding function to groups within
#' \code{df}? This is helpful when there are many sites/other identifiers within
#' \code{df} which need to be padded individually.
#'
#' @param round What date-unit should the first and last observations be rounded
#' to? This allows the padded time-series to begin and end at a "nice place".
#' Examples are \code{"hour"}, \code{"day"}, \code{"month"}, and \code{"year"}.
#'
#' @param full Should the date joining use the \code{full_join} function? If
#' \code{TRUE}, no input dates will be lost but the default is \code{FALSE}.
#'
#' @param warn Should the function give a warning when dates are duplicated?
#' Default is \code{TRUE}.
#'
#' @seealso See \code{\link{round_date_interval}}, \code{timeAverage},
#' \code{\link{round_date}}, \code{\link{left_join}},
#' \code{\link{aggregate_by_date}}
#'
#' @author Stuart K. Grange
#'
#' @return Tibble.
#'
#' @examples
#'
#' \dontrun{
#'
#' # Pad time series so every minute is present
#' data_nelson_pad <- time_pad(data_nelson, interval = "min", round = "day")
#'
#' # Keep identifying variables "site" and "sensor"
#' data_ozone_sensor_pad <- time_pad(
#' data_ozone_sensor,
#' interval = "hour",
#' by = c("site", "sensor")
#' )
#'
#' }
#'
#' @export
time_pad <- function(df, interval = "hour", by = NA, round = NA, full = FALSE,
warn = TRUE) {
# Check input
if (nrow(df) == 0) {
cli::cli_abort("Input data frame has no observations.")
}
# Check if input has a date variable
if (!"date" %in% names(df)) {
cli::cli_abort(
"Input must contain a date variable/column and must be named `date`."
)
}
# Missing-ness test
if (any(is.na(df$date))) {
cli::cli_abort("`date` must not contain missing (`NA`) values.")
}
# Check class of date too
if (!lubridate::is.POSIXct(df$date)) {
cli::cli_abort("`date` must be a POSIXct date.")
}
# Clean an argument a bit
interval <- interval %>%
stringr::str_to_lower() %>%
stringr::str_trim()
# Switch interval if required
interval <- dplyr::case_when(
interval %in% c("second", "seconds") ~ "sec",
interval %in% c("minute", "minutes") ~ "min",
interval == "hours" ~ "hour",
.default = interval
)
# Attempt to make interval a numeric value for sub-second padding
interval <- tryCatch({
as.numeric(interval)
}, warning = function(w) {
interval
})
# For dplyr's grouping
if (is.na(by[1])) {
by <- NULL
}
# Pad by group
df <- df %>%
group_by(across(dplyr::all_of(by))) %>%
dplyr::group_modify(
~time_pad_worker(
.,
interval = interval,
by = by,
round = round,
full = full,
warn = warn
),
.keep = FALSE
) %>%
ungroup()
return(df)
}
time_pad_worker <- function(df, interval, by, round, full, warn) {
# Find the start and end of the date sequence
if (is.na(round)) {
# No date rounding, use date values in df
date_start <- min(df$date)
date_end <- max(df$date)
} else {
# Date rounding
date_start <- lubridate::floor_date(min(df$date), round)
date_end <- lubridate::ceiling_date(max(df$date), round)
}
# Create the sequence of dates
date_sequence <- seq(date_start, date_end, by = interval)
# Remove final observation if ceiling rounded
if (!is.na(round)) date_sequence <- date_sequence[-length(date_sequence)]
# Make a tibble with a single date variable
df_dates <- tibble(date = date_sequence)
# Do the padding
if (full) {
df <- df %>%
dplyr::full_join(df_dates, ., by = join_by(date)) %>%
arrange(date)
} else {
df <- left_join(df_dates, df, by = join_by(date))
}
# Raise a warning if there are duplicated dates
if (warn && any_duplicated(df$date)) {
cli::cli_warn("Duplicated dates have been detected...")
}
return(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.