Nothing
#' Standardize date variables
#'
#' When the format of the values in a column and/or the target columns are not
#' defined, we strongly recommend checking a few converted dates manually to
#' make sure that the dates extracted from a `character` vector or a `factor`
#' are correct.\cr
#'
#' Check for the presence of date values that could have multiple formats
#' from the \code{$multi_format_dates} element of the \code{report}.\cr
#'
#' @param data The input \code{<data.frame>} or \code{<linelist>}
#' @param target_columns A \code{<vector>} of the target date column names. When
#' the input data is a \code{<linelist>} object, this parameter can be set to
#' \code{linelist_tags} if you wish to standardize the date columns across
#' tagged columns only. Default is \code{NULL}.
#' @param format A \code{<vector>} of the expected formats in the date values
#' from the date columns. Default is \code{NULL}.
#' @param timeframe A \code{<vector>} of 2 values of type \code{<Date>}. If
#' provided, date values that do not fall within this timeframe will be set
#' to \code{NA}.
#' @param error_tolerance A \code{<numeric>} between 0 and 1 indicating the
#' proportion of entries which cannot be identified as dates to be tolerated;
#' if this proportion is exceeded, the original vector is returned, and a
#' message is issued; defaults to 0.4 (40 percent).
#' @param orders A \code{<list>} or \code{<vector>} of characters with the date
#' codes for fine-grained parsing of dates. This allows for parsing of mixed
#' dates. If a \code{<list>} is supplied, that \code{<list>} will be used for
#' successive tries in parsing. When this is not provided
#' (\code{orders = NULL}), the function will use the following order defined
#' in the guesser:
#'
#' ```
#' list(
#' quarter_partial_dates = c("Y", "Ym", "Yq"),
#' world_digit_months = c("Yq", "ymd", "ydm", "dmy", "mdy", "myd", "dym",
#' "Ymd", "Ydm", "dmY", "mdY", "mYd", "dYm"),
#' world_named_months = c("dby", "dyb", "bdy", "byd", "ybd", "ydb",
#' "dbY", "dYb", "bdY", "bYd", "Ybd", "Ydb"),
#' us_format = c("Omdy", "YOmd")
#' )
#' ```
#'
#' @returns The input dataset where the date columns have been standardized. The
#' date values that are out of the specified timeframe will be reported in
#' the report. Similarly, date values that comply with multiple formats will
#' also be featured in the report object.
#' @export
#'
#' @details Converting ambiguous character strings to dates is difficult for
#' many reasons:
#'
#' - dates may not use the standard Ymd format
#'
#' - within the same variable, dates may follow different formats
#'
#' - dates may be mixed with things that are not dates
#'
#' - the behavior of `as.Date` in the presence of non-date is hard to predict,
#' sometimes returning `NA`, sometimes issuing an error.
#'
#' This function tries to address all the above issues. Dates with the following
#' format should be automatically detected, irrespective of separators
#' (e.g. "-", " ", "/") and surrounding text:
#'
#' - "19 09 2018"
#' - "2018 09 19"
#' - "19 Sep 2018"
#' - "2018 Sep 19"
#' - "Sep 19 2018"
#'
#' \subsection{How it works}{
#'
#' This function relies heavily on [lubridate::parse_date_time()], which is an
#' extremely flexible date parser that works well for consistent date formats,
#' but can quickly become unwieldy and may produce spurious results.
#' `standardize_dates()` will use a list of formats in the `orders` argument to
#' run `parse_date_time()` with each format vector separately and take the first
#' correctly parsed date from all the trials.
#'
#' With the default orders shown above, the dates 03 Jan 2018, 07/03/1982, and
#' 08/20/85 are correctly interpreted as 2018-01-03, 1982-03-07, and 1985-08-20.
#' The examples section will show how you can manipulate the `orders` to be
#' customized for your situation.
#' }
#'
#' @examples
#' x <- c("03 Jan 2018", "07/03/1982", "08/20/85")
#' # The below will coerce values where the month is written in letters only
#' # into Date.
#' as.Date(lubridate::parse_date_time(x, orders = c("Ybd", "dby")))
#'
#' # coerce values where the month is written in letters or numbers into Date.
#' as.Date(lubridate::parse_date_time(x, orders = c("dmy", "Ymd")))
#'
#' # How to use standardize_dates()
#' dat <- standardize_dates(
#' data = readRDS(
#' system.file("extdata", "test_df.RDS", package = "cleanepi")
#' ),
#' target_columns = "date_first_pcr_positive_test",
#' format = NULL,
#' timeframe = NULL,
#' error_tolerance = 0.4,
#' orders = list(
#' world_named_months = c("Ybd", "dby"),
#' world_digit_months = c("dmy", "Ymd"),
#' US_format = c("Omdy", "YOmd")
#' )
#' )
standardize_dates <- function(data,
target_columns = NULL,
format = NULL,
timeframe = NULL,
error_tolerance = 0.4,
orders = NULL) {
checkmate::assert_data_frame(data, null.ok = FALSE, min.cols = 1L)
checkmate::assert_character(target_columns, null.ok = TRUE,
any.missing = FALSE)
checkmate::assert_character(format, null.ok = TRUE, any.missing = FALSE)
checkmate::assert_date(timeframe, any.missing = FALSE,
null.ok = TRUE, len = 2L, unique = TRUE)
checkmate::assert_numeric(error_tolerance, lower = 0L, upper = 1L,
max.len = 2L,
any.missing = FALSE, null.ok = TRUE)
checkmate::assert_list(orders, min.len = 1L, null.ok = TRUE)
# set the variable to store the ambiguous column
ambiguous_cols <- NULL
if (!is.null(target_columns)) {
# get the correct names in case some have been modified - see the
# `retrieve_column_names()` function for more details
target_columns <- retrieve_column_names(data, target_columns)
target_columns <- get_target_column_names(data, target_columns, cols = NULL)
if (!is.null(format)) {
# we assume that when the format is provided, all values in that column
# have the same format. Date standardization will be performed considering
# the entire column, not individual values.
format <- date_match_format_and_column(target_columns, format)
# loop through the target columns to standardise them
for (i in seq_along(target_columns)) {
data[[target_columns[i]]] <- as.Date(
data[[target_columns[i]]],
format = format[i]
)
# check for outliers and set them to NA
res <- date_convert(
data,
cols = target_columns[i],
error_tolerance,
timeframe = timeframe,
orders = orders
)
data <- res[["data"]]
if (res[["has_ambiguous_values"]]) {
ambiguous_cols <- c(ambiguous_cols, target_columns[i])
}
}
} else {
for (cols in target_columns) {
# convert to ISO8601 date using the inferred format
res <- date_convert(data, cols, error_tolerance, timeframe,
orders = orders)
data <- res[["data"]]
if (res[["has_ambiguous_values"]]) {
ambiguous_cols <- c(ambiguous_cols, cols)
}
}
}
} else {
res <- date_guess_convert(
data,
error_tolerance = error_tolerance,
timeframe = timeframe,
orders = orders
)
data <- res[["data"]]
ambiguous_cols <- res[["ambiguous_cols"]]
}
# alert on the presence of ambiguous values in some target columns
if (length(ambiguous_cols) > 0) {
cli::cli_inform(c(
"!" = tr_("Found {.cls numeric} values that could also be of type {.cls Date} in {cli::qty(length(ambiguous_cols))} column{?s}: {.field {toString(ambiguous_cols)}}."), # nolint: line_length_linter
i = tr_("It is possible to convert them into {.cls Date} using: {.code lubridate::as_date(x, origin = as.Date(\"1900-01-01\"))}"), # nolint: line_length_linter
"*" = tr_("where {.val x} represents here the vector of values from these columns ({.code data$target_column}).") # nolint: line_length_linter
))
}
return(data)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.