Nothing
#' Prepare a dataset for modeling by filling in temporal gaps in data collection
#'
#' In order to create a modeling dataset with feature lags that are temporally correct, the entry
#' function in \code{forecastML}, \code{\link{create_lagged_df}}, needs evenly-spaced time series with no
#' gaps in data collection. \code{fill_gaps()} can help here.
#' This function takes a \code{data.frame} with (a) dates, (b) the outcome being forecasted, and, optionally,
#' (c) dynamic features that change through time, (d) group columns for multiple time series modeling,
#' and (e) static or non-dynamic features for multiple time series modeling and returns a \code{data.frame}
#' with rows evenly spaced in time. Specifically, this function adds rows to the input dataset
#' while filling in (a) dates, (b) grouping information, and (c) static features. The (a) outcome and (b)
#' dynamic features will be \code{NA} for any missing time periods; these \code{NA} values can be left
#' as-is, user-imputed, or removed from modeling in the user-supplied modeling wrapper function for \code{\link{train_model}}.
#'
#' @param data A data.frame or object coercible to a data.frame with, minimally, dates and the outcome being forecasted.
#' @param date_col The column index--an integer--of the date index. This column should have class 'Date' or 'POSIXt'.
#' @param frequency Date/time frequency. A string taking the same input as \code{base::seq.Date(..., by = "frequency")}
#' or \code{base::seq.POSIXt..., by = "frequency")} e.g., '1 hour', '1 month', '7 days', '10 years' etc.
#' The highest frequency supported at present is '1 sec'.
#' @param groups Optional. A character vector of column names that identify the unique time series (i.e., groups/hierarchies)
#' when multiple time series are present.
#' @param static_features Optional. For grouped time series only. A character vector of column names that identify features that do not change through time.
#' These columns are expected to be used as model features but are not lagged (e.g., a ZIP code column). The most recent values for each
#' static feature for each group are used to fill in the resulting missing data in static features when new rows are
#' added to the dataset.
#' @return An object of class 'data.frame': The returned data.frame has the same number of columns and column order but
#' with additional rows to account for gaps in data collection. For grouped data, any new rows added to the returned data.frame will appear
#' between the minimum--or oldest--date for that group and the maximum--or most recent--date across all groups. If the user-supplied
#' forecasting algorithm(s) cannot handle missing outcome values or missing dynamic features, these should either be
#' imputed prior to \code{create_lagged_df()} or filtered out in the user-supplied modeling function for \code{\link{train_model}}.
#'
#' @section Methods and related functions:
#'
#' The output of \code{fill_gaps()} is passed into
#'
#' \itemize{
#' \item \code{\link{create_lagged_df}}
#' }
#'
#' @example /R/examples/example_fill_gaps.R
#'
#' @export
fill_gaps <- function(data, date_col = 1, frequency, groups = NULL,
static_features = NULL) {
if (!methods::is(data, c("data.frame"))) {
stop("The 'data' argument takes an object of class 'data.frame'.")
}
data <- as.data.frame(data)
if (length(date_col) != 1 || date_col > ncol(data)) {
stop("The 'data_col' argument should be an integer giving the column location of the date index.")
}
if (!methods::is(data[, date_col], "Date") && !methods::is(data[, date_col], c("POSIXt"))) {
stop("The date column identified by the 'data_col' argument should be an object of class 'Date' or 'POSIXt'.")
}
if (any(is.na(data[, date_col, drop = TRUE]))) {
stop("The date column identified by the 'date_col' argument has missing or 'NA' dates; remove them prior to running this function.")
}
if (missing(frequency)) {
stop("The 'frequency' argument is required to set the expected frequency of data/time collection e.g., '1 hour', '1 day', '3 months', '10 years' etc.;
see base::seq.Date() or base::seq.POSIXt() for valid date/time frequencies.")
}
if (!grepl("sec|min|hour|day|week|month|quarter|year", frequency)) {
stop("The 'frequency' argument should be a string containing one of 'sec', 'min', 'hour', 'day', 'week',
'month', 'quarter', or 'year'. This can optionally be preceded by a positive integer and a space
and/or followed by an 's'.")
}
if (is.null(groups) && !is.null(static_features)) {
stop("To avoid 0 variance features, static features--those that do not change through time--should only be modeled with grouped time series.")
}
# Used to re-order the returned dataset to match the input dataset.
col_names <- names(data)
date_name <- names(data)[date_col]
if (is.null(groups)) {
data <- data %>%
dplyr::arrange(!!rlang::sym(date_name))
} else {
data <- data %>%
dplyr::arrange(!!!rlang::syms(groups), !!rlang::sym(date_name))
}
# Create a merge template giving the date bounds for non-grouped or grouped data.
data_template <- data %>%
dplyr::group_by_at(dplyr::vars(!!!groups)) %>%
dplyr::summarize("date_min" = min(!!rlang::sym(date_name), na.rm = TRUE)) %>%
dplyr::ungroup()
data_template$date_max <- max(data[, date_name, drop = TRUE], na.rm = TRUE)
if (!is.null(static_features)) {
# Create a merge template to fill in the static features when left_join()-ing. The assumption
# here is that the last row or most recent observation has a non-missing static feature. This behavior
# guards against static features that are mostly static or have not changed recently but that may
# have changed in the distant past. The user will be made aware of this in the help docs.
data_static <- data %>%
dplyr::group_by_at(dplyr::vars(!!!groups)) %>%
dplyr::mutate("date_max" = max(!!rlang::sym(date_name), na.rm = TRUE)) %>%
dplyr::filter(!!rlang::sym(date_name) == .data$date_max) %>%
dplyr::select_at(dplyr::vars(groups, static_features))
}
data_template_list <- vector("list", nrow(data_template))
data_template_list <- lapply(seq_along(data_template_list), function(i) {
# Make a contiguous vector of dates.
date_seq <- data.frame(seq(data_template[i, "date_min", drop = TRUE],
data_template[i, "date_max", drop = TRUE],
by = frequency))
if (!is.null(groups)) {
# Make a dataset with the grouping columns that's the same length as the date sequence.
data_groups <- data_template[i, , drop = FALSE]
data_groups <- dplyr::select(data_groups, groups)
data_groups <- data_groups[rep(1, nrow(date_seq)), , drop = FALSE]
data_template_list <- dplyr::bind_cols(date_seq, data_groups)
names(data_template_list) <- c(date_name, groups)
} else {
data_template_list <- date_seq
names(data_template_list) <- date_name
}
data_template_list
})
data_template <- dplyr::bind_rows(data_template_list)
if (!is.null(static_features)) {
# If there are static features, merge these to the date and groups template
# with the gaps now removed.
data_template <- dplyr::left_join(data_template, data_static, by = groups)
# Now that the dates, groups, and any static features are date-complete with
# no gaps, we can remove static features from the input dataset followed by
# a left join with the data template.
data <- dplyr::select(data, -static_features)
}
data_out <- dplyr::left_join(data_template, data, by = c(date_name, groups))
if (is.null(groups)) {
data_out <- data_out %>%
dplyr::arrange(!!rlang::sym(date_name))
} else {
data_out <- data_out %>%
dplyr::arrange(!!!rlang::syms(groups), !!rlang::sym(date_name))
}
# Re-order the complete dataset to have the same column order as the input dataset.
data_out <- dplyr::select(data_out, col_names)
data_out <- as.data.frame(data_out)
return(data_out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.