R/check_date_sequence.R

Defines functions is_date_sequence_ordered check_date_sequence

Documented in check_date_sequence is_date_sequence_ordered

#' Check whether the order of the sequence of date-events is valid
#'
#' @description Checks, row by row, whether the dates stored in a vector of
#' specified columns appear in the expected order.
#'
#' @param data A data frame
#' @param target_columns A vector of event column names. Users should specify at
#'    least 2 column names in the expected order. For example:
#'    target_columns = c("date_symptoms_onset", "date_hospitalization",
#'    "date_death").
#'    When the input data is a `linelist` object, this parameter can be set to
#'    `linelist_tags` if you wish to check the date sequence across tagged
#'    columns only.
#'    The values in these columns should be in the ISO8601 format (2024-12-31).
#'    Otherwise, use the `standardize_dates()` function to standardize them.
#'
#' @returns The input dataset. When found, the incorrect date sequences will be
#'    stored in the report where they can be accessed using
#'    `attr(data, "report")`. A warning listing the offending row numbers is
#'    also raised. An error is thrown when fewer than 2 usable columns remain
#'    after the column names have been resolved.
#' @export
#'
#' @examples
#' # import the data
#' data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi"))
#'
#' # standardize the date values
#' data <- data |>
#'   standardize_dates(
#'     target_columns  = c("date_first_pcr_positive_test", "date.of.admission"),
#'     error_tolerance = 0.4,
#'     format          = NULL,
#'     timeframe       = NULL
#'   )
#'
#' good_date_sequence <- check_date_sequence(
#'   data           = data,
#'   target_columns = c("date_first_pcr_positive_test", "date.of.admission")
#' )
check_date_sequence <- function(data, target_columns) {
  # validate `data` first: the upper bound on `target_columns` depends on the
  # number of columns of a genuine data frame
  checkmate::assert_data_frame(data, null.ok = FALSE)
  # a single value is still accepted here because the documented keyword
  # `linelist_tags` is passed as one element; the 2-column minimum is enforced
  # further down, once the names have been resolved
  checkmate::assert_vector(target_columns, any.missing = FALSE, min.len = 1L,
                           max.len = ncol(data), null.ok = FALSE,
                           unique = TRUE)

  # get the correct names in case some have been modified - see the
  # `retrieve_column_names()` function for more details
  target_columns <- retrieve_column_names(data, target_columns)
  target_columns <- get_target_column_names(data, target_columns, cols = NULL)

  # drop, with a warning, every requested column that is not part of the data
  # frame; `toString()` keeps several names readable in a single message
  is_known <- target_columns %in% names(data)
  if (!all(is_known)) {
    warning("\nRemoving unrecognised column name: ",
            toString(target_columns[!is_known]),
            call. = FALSE)
    target_columns <- target_columns[is_known]
  }

  # a sequence needs at least two events, whether or not columns were dropped
  if (length(target_columns) < 2L) {
    stop("\nAt least 2 event dates are required!", call. = FALSE)
  }

  # checking the date sequence
  # NOTE: `apply()` coerces the selected columns to a matrix before iterating
  # over rows, hence the requirement for ISO8601-formatted values
  tmp_data   <- dplyr::select(data, dplyr::all_of(target_columns))
  order_date <- apply(tmp_data, 1L, is_date_sequence_ordered)

  # `which()` ignores NA, so a row for which the helper yields NA is simply
  # not reported. Testing the length of `bad_order` (rather than
  # `all(order_date)`) also prevents an `if (NA)` error in that situation.
  bad_order  <- which(!order_date)
  if (length(bad_order) > 0L) {
    tmp_data <- tmp_data[bad_order, ]
    # adding incorrect records to the report
    data     <- add_to_report(x     = data,
                              key   = "incorrect_date_sequence",
                              value = tmp_data)
    warning("Detected ", length(bad_order),
            " incorrect date sequences at line(s): ",
            toString(bad_order),
            call. = FALSE)
  }

  return(data)
}

#' Check order of a sequence of date-events
#'
#' @param x A vector of Date values (or of ISO8601 date strings). Missing
#'    values are ignored when assessing the order.
#'
#' @returns `TRUE` if the non-missing elements of the vector are in
#'    non-decreasing order, `FALSE` otherwise. Vectors with fewer than two
#'    non-missing elements are considered ordered.
#' @keywords internal
is_date_sequence_ordered <- function(x) {
  # by default `is.unsorted()` returns NA as soon as `x` contains a missing
  # value; removing them guarantees a strict TRUE/FALSE answer and still
  # checks the order of the dates that are available
  return(!is.unsorted(x, na.rm = TRUE))
}

Try the cleanepi package in your browser

Any scripts or data that you put into this service are public.

cleanepi documentation built on June 22, 2024, 9:48 a.m.