R/timeliness_check.R
In DQA: Data Quality Assessment Tools

Documented in timeliness_check

#' @title Perform timeliness Check for Data Frame Columns
#' @description This function evaluates a source dataframe (`S_data`) against a set
#' of rules defined in a metadata dataframe (`M_data`).
#' it Checks timeliness rules (Temporal and availability conditions) on columns of a data frame, based on metadata specifications.
#' Supports flexible rule definition,date literal handling, and customizable output.
#'
#' @param S_data data.frame. The source data in which rules will be evaluated. Each column may be referenced by the rules.
#' @param M_data data.frame. Metadata describing variables and their timeliness rules. Must include columns \code{VARIABLE}, \code{Timeliness_Rule} and \code{TYPE}. Optionally includes \code{Timeliness_Error_Type}.
#' @param Result logical (default: \code{FALSE}). If \code{TRUE}, returns row-by-row evaluation results for each rule. If \code{FALSE}, returns a summary table for each rule.
#' @param show_column character vector (default: \code{NULL}). Names of columns from \code{S_data} to include in the result when \code{Result = TRUE}. Ignored otherwise.
#' @param date_parser_fun function (default: \code{smart_to_gregorian_vec}). Converting Persian dates to English,Function to convert date values or date literals to \code{Date} class. Must accept character vectors and return \code{Date} objects.
#' @param var_select character, numeric, or \code{"all"} (default: \code{"all"}). Subset of variables (rules) to check. Can be a character vector of variable names, numeric vector of row indices in \code{M_data}, or \code{"all"} to run all rules.
#' @param verbose logical (default: \code{FALSE}). If \code{TRUE}, prints diagnostic messages during rule processing and evaluation.
#'
#' @details
#' The metadata data.frame (\code{M_data}) **must** contain the following columns:
#' \itemize{
#'   \item \strong{VARIABLE}: Name of the variable in \code{S_data} to which the rule applies.
#'   \item \strong{Timeliness_Rule}: A logical rule provided as a string that defines temporal (date/time) conditions to be evaluated.
#'   \item \strong{TYPE}: Specifies the type of the variable (e.g., "numeric", "date", "character").
#'   \item \strong{Timeliness_Error_Type}: The error type for each rule will be reported in the summary output.Based on the importance and severity of the rule, it can include two options: "Warning" or "Error".
#' }
#'
#' For each variable described in \code{M_data}, the function:
#' \itemize{
#'   \item Preprocesses the rule: replaces 'val' with the variable name, parses date literals and substitutes them with placeholders.
#'   \item Converts referenced data columns to appropriate types (numeric, date) based on metadata.
#'   \item Evaluates the rule for each row, either vectorized or row-wise if needed.
#' }
#' If \code{Result = FALSE}, returns a summary table with counts and percentages of rows meeting/not meeting each condition. If \code{Result = TRUE}, returns a data.frame with boolean results for each rule, optionally including selected columns from the source data.
#'
#' @return
#' If \code{Result = FALSE}: a data.frame summary with columns:
#' \itemize{
#'   \item VARIABLE: Name of the variable/rule.
#'   \item Condition_Met: Number of rows where the rule is TRUE.
#'   \item Condition_Not_Met: Number of rows where the rule is FALSE.
#'   \item NA_Count: Number of rows with missing/indeterminate result.
#'   \item Total_Applicable: Number of non-NA rows.
#'   \item Total_Rows: Number of total rows.
#'   \item Percent_Met: Percentage of applicable rows meeting the condition.
#'   \item Percent_Not_Met: Percentage of applicable rows not meeting the condition.
#'   \item Timeliness_Error_Type: Error type from metadata (if available).
#' }
#' If \code{Result = TRUE}: a data.frame with one column per rule (variable), each containing logical values for every row, plus optional columns from the source data.
#' @examples
#' # Source data
#' S_data <- data.frame(
#'   VisitDate = c("2025-09-23", "2021-01-10", "2021-01-03","1404-06-28","1404-07-28",NA),
#'   Test_date = c("1404-07-01", "2021-01-09", "2021-01-14","1404-06-29","2025-09-19",NA)
#' )
#'
#' # META DATA
#' M_data <- data.frame(
#'   VARIABLE = c("VisitDate","Test_date"),
#'   Timeliness_Rule = c(
#'     "",
#'     "VisitDate<=Test_date",
#'     " Test_date-VisitDate <10 ",
#'     ""),
#'   TYPE=c("date","date"),
#'   Timeliness_Error_Type = c("Error","warning"),
#'   stringsAsFactors = FALSE
#' )
#'
#' result <- timeliness_check(
#'   S_data = S_data,
#'   M_data = M_data,
#'   Result = TRUE,
#'   show_column = FALSE
#' )
#'
#' print(result)
#'
#' result <- timeliness_check(
#'   S_data = S_data,
#'   M_data = M_data,
#'   Result = FALSE,
#'   var_select = c("VisitDate","Test_date")
#' )
#'
#' print(result)
#'
#' @export
timeliness_check <- function(
    S_data,
    M_data,
    Result = FALSE,
    show_column = NULL,
    date_parser_fun = smart_to_gregorian_vec,
    var_select = "all",
    verbose = FALSE
) {
  # Check required arguments and structure
  if (missing(S_data) || missing(M_data)) stop("S_data and M_data are required.")
  if (nrow(S_data) == 0) stop("S_data is empty.")
  if (nrow(M_data) == 0) stop("M_data is empty.")
  required_meta_cols <- c("VARIABLE", "Timeliness_Rule", "TYPE")
  if (!all(required_meta_cols %in% names(M_data))) stop("M_data must contain columns: VARIABLE, Timeliness_Rule, TYPE.")

  # Keep a full copy of metadata (for TYPE lookups)
  M_all <- M_data[!duplicated(M_data$VARIABLE, fromLast = TRUE), , drop = FALSE]
  # Subset metadata for selected variables/rules
  M_sub <- M_all
  if (!identical(var_select, "all")) {
    if (is.numeric(var_select)) {
      M_sub <- M_sub[var_select, , drop = FALSE]
    } else {
      M_sub <- M_sub[M_sub$VARIABLE %in% as.character(var_select), , drop = FALSE]
    }
  }
  if (nrow(M_sub) == 0) stop("No rules remain to run after var_select.")

  n <- nrow(S_data)
  date_literal_pattern <- "'1[34][0-9]{2}[-./][0-9]{1,2}[-./][0-9]{1,2}'"

  # Prepare rules: replace val, extract date literals, parse, detect referenced columns
  rule_list <- list()
  for (i in seq_len(nrow(M_sub))) {
    var_name <- as.character(M_sub$VARIABLE[i])
    rule_raw <- as.character(M_sub$Timeliness_Rule[i])
    if (is.na(rule_raw) || trimws(rule_raw) == "") next

    # Replace 'val' with actual column name
    rule_text <- gsub("\\bval\\b", var_name, rule_raw, perl = TRUE)

    # Find date literals and replace with placeholders
    placeholders <- list()
    dm <- gregexpr(date_literal_pattern, rule_text, perl = TRUE)
    if (dm[[1]][1] != -1) {
      lits <- unique(regmatches(rule_text, dm)[[1]])
      for (k in seq_along(lits)) {
        lit <- lits[k]
        date_str <- gsub("'", "", lit)
        greg_date <- tryCatch(date_parser_fun(date_str), error = function(e) NA)
        if (!is.na(greg_date)) {
          ph <- paste0(".DATE_", gsub("[^A-Za-z0-9_]", "_", var_name), "_", k)
          rule_text <- gsub(lit, ph, rule_text, fixed = TRUE)
          placeholders[[ph]] <- as.Date(greg_date)
        }
      }
    }

    # Parse rule expression
    parsed <- tryCatch(parse(text = rule_text), error = function(e) NULL)
    expr <- if (!is.null(parsed)) parsed[[1]] else NULL

    # Detect referenced columns in the rule
    vars_in_rule <- character(0)
    if (!is.null(expr)) {
      vars_in_rule <- intersect(all.vars(expr), names(S_data))
    } else {
      tmp <- tryCatch(parse(text = gsub("\\bval\\b", var_name, rule_raw)), error = function(e) NULL)
      if (!is.null(tmp)) vars_in_rule <- intersect(all.vars(tmp[[1]]), names(S_data))
    }

    rule_list[[length(rule_list) + 1]] <- list(
      var = var_name,
      raw = rule_raw,
      text = rule_text,
      expr = expr,
      placeholders = placeholders,
      vars_in_rule = unique(vars_in_rule),
      error_type = if ("Timeliness_Error_Type" %in% names(M_sub)) M_sub$Timeliness_Error_Type[i] else NA
    )
  }

  if (length(rule_list) == 0) {
    warning("No valid rule found for evaluation.")
    return(data.frame())
  }

  # Determine columns needed from S_data
  referenced_cols <- unique(unlist(lapply(rule_list, function(rd) rd$vars_in_rule)))
  vars_to_run <- unique(sapply(rule_list, function(rd) rd$var))
  needed_cols <- intersect(union(referenced_cols, vars_to_run), names(S_data))

  # Determine TYPEs from metadata
  type_map <- setNames(as.character(M_all$TYPE), M_all$VARIABLE)
  date_cols <- intersect(names(type_map[type_map == "date"]), needed_cols)
  numeric_cols <- intersect(names(type_map[type_map == "numeric"]), needed_cols)

  # Subset processed_data to only needed columns
  processed_data <- S_data[, needed_cols, drop = FALSE]

  # Apply type conversions
  if (length(date_cols) > 0) {
    for (cname in date_cols) {
      processed_data[[cname]] <- tryCatch(date_parser_fun(processed_data[[cname]]), error = function(e) processed_data[[cname]])
    }
  }
  if (length(numeric_cols) > 0) {
    for (cname in numeric_cols) {
      x <- processed_data[[cname]]
      if (is.numeric(x) || inherits(x, "Date")) next
      y <- if (is.factor(x)) as.character(x) else x
      if (is.character(y)) y <- trimws(gsub(",", "", y))
      processed_data[[cname]] <- suppressWarnings(as.numeric(y))
    }
  }

  # Evaluate each rule: try vectorized, else fall back to row-wise
  res_list <- list()
  for (rd in rule_list) {
    vname <- rd$var
    if (verbose) message("Evaluating rule for: ", vname)
    if (is.null(rd$expr)) {
      res_list[[vname]] <- rep(NA, n); next
    }

    # Minimal set of columns for this rule
    cols_needed_for_rule <- intersect(unique(c(rd$vars_in_rule, vname)), names(processed_data))
    cols_list <- as.list(processed_data[, cols_needed_for_rule, drop = FALSE])
    # Attach placeholders as needed
    if (length(rd$placeholders) > 0) {
      for (ph in names(rd$placeholders)) cols_list[[ph]] <- rd$placeholders[[ph]]
    }

    # Vectorized evaluation environment
    eval_env <- list2env(cols_list, parent = baseenv())

    vec_result <- tryCatch(eval(rd$expr, envir = eval_env), error = function(e) e)

    if (inherits(vec_result, "error")) {
      if (verbose) message("Vectorized eval failed for ", vname, " -> falling back to row-wise.")
      single_result <- sapply(seq_len(n), function(i) {
        row_env <- new.env(parent = baseenv())
        for (nm in names(cols_list)) {
          v <- cols_list[[nm]]
          val <- if (length(v) >= i) v[i] else v
          assign(nm, val, envir = row_env)
        }
        out <- tryCatch(eval(rd$expr, envir = row_env), error = function(e) NA)
        if (length(out) != 1) NA else as.logical(out)
      })
      res_list[[vname]] <- as.logical(single_result)
    } else {
      if (is.atomic(vec_result) && length(vec_result) == n) {
        res_list[[vname]] <- as.logical(vec_result)
      } else if (length(vec_result) == 1) {
        res_list[[vname]] <- rep(as.logical(vec_result), n)
      } else {
        if (verbose) message("Unexpected result shape for ", vname, " -> falling back to row-wise.")
        single_result <- sapply(seq_len(n), function(i) {
          row_env <- new.env(parent = baseenv())
          for (nm in names(cols_list)) {
            v <- cols_list[[nm]]; val <- if (length(v) >= i) v[i] else v
            assign(nm, val, envir = row_env)
          }
          out <- tryCatch(eval(rd$expr, envir = row_env), error = function(e) NA)
          if (length(out) != 1) NA else as.logical(out)
        })
        res_list[[vname]] <- as.logical(single_result)
      }
    }
  } # end rules loop

  # Format output
  res_dt <- as.data.frame(res_list, stringsAsFactors = FALSE)
  if (isTRUE(Result)) {
    if (!is.null(show_column)) {
      valid_cols <- intersect(show_column, names(S_data))
      if (length(valid_cols) > 0) {
        res_dt <- cbind(res_dt, S_data[, valid_cols, drop = FALSE])
      } else {
        warning("Selected show_column(s) do not exist in the data.")
      }
    }
    rownames(res_dt) <- NULL
    return(res_dt)
  } else {
    # When Result = FALSE: summary table, include Timeliness_Error_Type if present
    summary_list <- lapply(names(res_list), function(vn) {
      vec <- res_list[[vn]]
      na_count <- sum(is.na(vec))
      met <- sum(vec, na.rm = TRUE)
      total <- length(vec) - na_count
      error_type <- NA
      if ("Timeliness_Error_Type" %in% names(M_sub)) {
        row_match <- which(M_sub$VARIABLE == vn)
        if (length(row_match) > 0) error_type <- M_sub$Timeliness_Error_Type[row_match[1]]
      }
      data.frame(
        VARIABLE = vn,
        Condition_Met = met,
        Condition_Not_Met = total - met,
        NA_Count = na_count,
        Total_Applicable = total,
        Total_Rows = length(vec),
        Percent_Met = if (total > 0) round(100 * met / total, 2) else NA,
        Percent_Not_Met = if (total > 0) round(100 * (total - met) / total, 2) else NA,
        Timeliness_Error_Type = error_type,
        stringsAsFactors = FALSE
      )
    })
    return(do.call(rbind, summary_list))
  }
}