R/conformance.R
In DQAstats: Core Functions for Data Quality Assessment

Documented in value_conformance value_conformance_checks

# DQAstats - Perform data quality assessment (DQA) of electronic health
# records (EHR)
# Copyright (C) 2019-2024 Universitätsklinikum Erlangen
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


#' @title value_conformance helper function
#'
#' @description Internal function to perform value conformance checks.
#'
#' @inheritParams descriptive_results
#' @param results A list object. The list should contain the results of
#'   either 'rv$results_descriptive' or 'rv$results_plausibility_atemporal'.
#' @param scope A character. Either "plausibility" or "descriptive".
#' @inheritParams dqa
#'
#' @return A list with one entry for each dataelement containing the raw results
#'   of the value conformance checks. Each entry contains the following (nested)
#'   list items:
#'   \describe{
#'   \item{source_data}{A nested list with the raw value conformance check
#'   results for the source data system.}
#'   \item{target_data}{A nested list with the raw value conformance check
#'   results for the target data system.}
#'   }
#'
#' @examples
#' \donttest{# runtime ~ 5 sec.
#' utils_path <- system.file(
#'   "demo_data/utilities/",
#'   package = "DQAstats"
#' )
#' mdr_filename <- "mdr_example_data.csv"
#' rv <- list()
#' rv$mdr <- read_mdr(
#'   utils_path = utils_path,
#'   mdr_filename <- mdr_filename
#' )
#'
#' source_system_name <- "exampleCSV_source"
#' target_system_name <- "exampleCSV_target"
#'
#' rv <- c(rv, create_helper_vars(
#'   mdr = rv$mdr,
#'   source_db = source_system_name,
#'   target_db = target_system_name
#' ))
#' # save source/target vars
#' rv$source$system_name <- source_system_name
#' rv$target$system_name <- target_system_name
#' rv$source$system_type <- "csv"
#' rv$target$system_type <- "csv"
#'
#' rv$log$logfile_dir <- tempdir()
#'
#' # set headless (without GUI, progressbars, etc.)
#' rv$headless <- TRUE
#'
#' # set configs
#' demo_files <- system.file("demo_data", package = "DQAstats")
#' Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
#' Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)
#'
#' # get configs
#' rv$source$settings <- DIZutils::get_config_env(
#'   system_name = rv$source$system_name,
#'   logfile_dir = rv$log$logfile_dir,
#'   headless = rv$headless
#' )
#' rv$target$settings <- DIZutils::get_config_env(
#'   system_name = tolower(rv$target$system_name),
#'   logfile_dir = rv$log$logfile_dir,
#'   headless = rv$headless
#' )
#'
#' # set start_time (e.g. when clicking the 'Load Data'-button in shiny
#' rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")
#'
#' # define restricting date
#' rv$restricting_date$use_it <- FALSE
#'
#' # load source data
#' tempdat <- data_loading(
#'   rv = rv,
#'   system = rv$source,
#'   keys_to_test = rv$keys_source
#' )
#' rv$data_source <- tempdat$outdata
#'
#' # load target data
#' tempdat <- data_loading(
#'   rv = rv,
#'   system = rv$target,
#'   keys_to_test = rv$keys_target
#' )
#' rv$data_target <- tempdat$outdata
#'
#' rv$data_plausibility$atemporal <- get_atemp_plausis(
#'   rv = rv,
#'   atemp_vars = rv$pl$atemp_vars,
#'   mdr = rv$mdr,
#'   headless = rv$headless
#' )
#'
#' # add the plausibility raw data to data_target and data_source
#' for (i in names(rv$data_plausibility$atemporal)) {
#'   for (k in c("source_data", "target_data")) {
#'     w <- gsub("_data", "", k)
#'     raw_data <- paste0("data_", w)
#'     rv[[raw_data]][[i]] <-
#'       rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
#'     rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
#'   }
#'   gc()
#' }
#'
#' # calculate descriptive results
#' rv$results_descriptive <- descriptive_results(
#'   rv = rv,
#'   headless = rv$headless
#' )
#'
#' # calculate atemporal plausibilites
#' rv$results_plausibility_atemporal <- atemp_plausi_results(
#'   rv = rv,
#'   atemp_vars = rv$data_plausibility$atemporal,
#'   mdr = rv$mdr,
#'   headless = rv$headless
#' )
#'
#' # calculate unique plausibilites
#' rv$results_plausibility_unique <- uniq_plausi_results(
#'   rv = rv,
#'   uniq_vars = rv$pl$uniq_vars,
#'   mdr = rv$mdr,
#'   headless = rv$headless
#' )
#'
#' value_conformance(
#'   rv = rv,
#'   scope = "descriptive",
#'   results = rv$results_descriptive,
#'   headless = rv$headless,
#'   logfile_dir = rv$log$logfile_dir
#' )
#' }
#'
#' @export
#'
value_conformance <- function(
    rv,
    results,
    scope,
    headless = FALSE,
    logfile_dir) {

  stopifnot(
    scope %in% c("plausibility", "descriptive")
  )

  # workaround until DIZutils is on CRAN
  # (when using 'importFrom DIZutils %notin%', error exists due to
  # referencing package in NAMESPACE but not as Import in DESCRIPTION)
  "%notin%" <- utils::getFromNamespace(
    x = "%notin%",
    ns = "DIZtools"
  )

  # get names
  obj_names <- names(results)

  outlist <- future.apply::future_sapply(
    X = obj_names,
    FUN = function(i) {
      local({
        # initialize inner outlist
        outlist <- list()

        msg <- paste0("Performing value conformance check for '", i, "'")
        DIZtools::feedback(
          print_this = msg,
          findme = "5d061425eb",
          logfile_dir = logfile_dir,
          logjs = isFALSE(headless),
          headless = headless
        )

        desc_out <- results[[i]]$description
        stat_out <- results[[i]]$statistics

        # internal variable name
        int_name <- desc_out$source_data$internal_variable_name

        key_cols <- get_key_col(rv)
        key_col_name_src <- key_cols$source
        key_col_name_tar <- key_cols$target

        for (j in c("source_data", "target_data")) {
          d_out <- desc_out[[j]]
          s_out <- stat_out[[j]] %>%
            data.table::data.table()

          if (j == "source_data") {
            raw_data <- "data_source"
          } else if (j == "target_data") {
            raw_data <- "data_target"
          }

          # parse constraints
          constraints <- tryCatch(
            expr = {
              if (is.null(d_out$checks$constraints) ||
                  is.na(d_out$checks$constraints)) {
                c <- NA
              } else {
                c <- jsonlite::fromJSON(d_out$checks$constraints)
              }
              c
            }, error = function(e) {
              DIZtools::feedback(
                print_this = paste0("Error in fromJSON in function",
                                    " `value_conformance()`: ", e),
                findme = "f269cc7211",
                type = "Error",
                logfile_dir = logfile_dir,
                logjs = isFALSE(headless),
                headless = headless
              )
              c <- NA
              c
            }
          )

          if (j == "source_data") {
            tab <- rv$mdr[get("source_system_name") == rv$source$system_name &
                            get("designation") == i &
                            get("dqa_assessment") == 1, get(key_col_name_src)]
          } else {
            tab <- rv$mdr[get("source_system_name") == rv$target$system_name &
                            get("designation") == i &
                            get("dqa_assessment") == 1, get(key_col_name_tar)]
          }
          # internal_variable_name is equal for source and target
          # and only stored with source description
          ih <- desc_out$source_data$internal_variable_name

          # add logic to test for future dates by default, if no datetime
          # constraint is set
          if (d_out$checks$var_type == "datetime" &&
              is.na(constraints) &&
              scope == "descriptive") {
            constraints <- "future_dates"
          }

          if (any(!is.na(constraints))) {
            if (length(constraints[[1]]) > 0) {
              # initialize outlist
              outlist2 <- list()

              DIZtools::feedback(
                print_this = paste0(
                  "Analyzing variable '",
                  i,
                  "' with type '",
                  d_out$checks$var_type,
                  "' (",
                  j,
                  ")..."
                ),
                findme = "d9f44d5c97",
                logfile_dir = logfile_dir,
                logjs = isFALSE(headless),
                headless = headless
              )

              # categorical treatment (value_set)
              if (d_out$checks$var_type == "enumerated") {

                if (((nrow(s_out) == 1) && is.na(s_out[[1, 1]])) ||
                    (nrow(s_out) == 0)) {
                  outlist2$conformance_error <- TRUE
                  outlist2$conformance_results <-
                    "No data available to perform conformance checks."
                } else {
                  # get valueset from mdr
                  constraints <- constraints$value_set
                  # get levels from results
                  levels_results <-
                    s_out[, levels(get(colnames(s_out)[1]))]
                  # compare levels from results to constraints from valueset
                  #% (TRUE = constraint_error)
                  if (is.null(levels_results)) {
                    outlist2$conformance_error <- TRUE
                  } else {
                    outlist2$conformance_error <- any(levels_results %notin%
                                                        constraints)
                  }
                  # if TRUE, get those values, that do not fit
                  outlist2$conformance_results <-
                    ifelse(
                      isTRUE(outlist2$conformance_error),
                      paste0(
                        "Levels that are not conform with the value set:  \n",
                        paste(
                          levels_results[levels_results %notin% constraints],
                          collapse = "  \n"
                        )
                      ),
                      "No 'value conformance' issues found."
                    )
                }

                # numerics treatment (range: min, max, unit)
              } else if (d_out$checks$var_type %in% c("integer", "float")) {

                if (is.null(s_out) || nrow(s_out) == 0 ||
                    (any(is.na(s_out[, 1])) ||
                     (s_out[1, 1] == "NaN"))) {
                  outlist2$conformance_error <- TRUE
                  outlist2$conformance_results <-
                    "No data available to perform conformance checks."
                } else {

                  # set colnames (we need them here to correctly
                  # select the data)
                  colnames(s_out) <- c("name", "value")

                  error_flag <- FALSE

                  # TODO add value_thresholds here as tolerance-/border zone
                  result_min <- as.numeric(
                    s_out[get("name") == "Minimum", get("value")]
                  )
                  result_max <- as.numeric(
                    s_out[get("name") == "Maximum", get("value")]
                  )

                  # compare levels from results to constraints from valueset
                  #% (TRUE = constraint_error)
                  if (result_min < constraints$range$min) {
                    DIZtools::feedback(
                      paste(i, "from", j, ": result_min < range$min"),
                      findme = "21abaa37e2",
                      logfile_dir = logfile_dir,
                      logjs = isFALSE(headless),
                      headless = headless
                    )
                    error_flag <- TRUE
                  }

                  if (result_max > constraints$range$max) {
                    DIZtools::feedback(
                      paste(i, "from", j, ": result_max > range$max"),
                      findme = "44264e3a64",
                      logfile_dir = logfile_dir,
                      logjs = isFALSE(headless),
                      headless = headless
                    )
                    error_flag <- TRUE
                  }

                  outlist2$conformance_error <- error_flag
                  outlist2$conformance_results <-
                    ifelse(
                      isTRUE(error_flag),
                      "Extrem values are not conform with constraints.",
                      "No 'value conformance' issues found."
                    )
                }

                # string treatment (regex)
              } else if (d_out$checks$var_type == "string") {

                if (((nrow(s_out) == 1) && is.na(s_out[[1, 1]])) ||
                    (nrow(s_out) == 0)) {
                  outlist2$conformance_error <- TRUE
                  outlist2$conformance_results <-
                    "No data available to perform conformance checks."
                } else {
                  # get regex-pattern
                  pattern <- constraints$regex

                  # returns the number of not matching items
                  errors <- !grepl(
                    pattern = pattern,
                    x = as.character(
                      s_out[!is.na(get(int_name)), get(int_name)]
                    )
                  )
                  cnt_errors <- sum(errors)

                  error_flag <- ifelse(cnt_errors > 0, TRUE, FALSE)

                  outlist2$conformance_error <- error_flag
                  outlist2$conformance_results <-
                    ifelse(
                      isTRUE(error_flag),
                      paste0(
                        "Values that are not conform with ",
                        "regular expression:  \n",
                        paste(as.character(s_out[!is.na(get(int_name)),
                                                 get(int_name)])[errors],
                              collapse = "  \n")
                      ),
                      "No 'value conformance' issues found."
                    )
                }
              } else if (d_out$checks$var_type == "datetime") {
                if (is.na(s_out[[1, 2]])) {
                  outlist2$conformance_error <- TRUE
                  outlist2$conformance_results <-
                    "No data available to perform conformance checks."
                } else if (constraints == "future_dates" &&
                           scope == "descriptive") {
                  # check for future dates
                  fut_dat <- rv[[raw_data]][[tab]][
                    get(ih) > Sys.Date(),
                  ]

                  error_flag <- ifelse(nrow(fut_dat) > 0, TRUE, FALSE)

                  outlist2$conformance_error <- error_flag
                  outlist2$rule <- "No future dates allowed."
                  outlist2$conformance_results <-
                    ifelse(
                      isTRUE(error_flag),
                      paste0(
                        "Values that are not conform with ",
                        "rule 'No future dates allowed.':  \n",
                        paste(
                          as.character(
                            unique(
                              fut_dat[, get(ih)]
                            )
                          ),
                          collapse = "  \n")
                      ),
                      "No 'value conformance' issues found."
                    )

                } else {
                  ## Check if there only is a datetime_format in the
                  ## constraints (this is implicitly already checked
                  ## in load_database(), search for 'datetime_format'):
                  constraints_names <- names(constraints)
                  if (length(constraints_names) == 1 &&
                      constraints_names[[1]] == "datetime") {
                    if (is.null(s_out) || nrow(s_out) == 0 ||
                        (any(is.na(s_out[, 2])) ||
                         (s_out[4, 2] == "NaN"))) {
                      outlist2$conformance_error <- TRUE
                      outlist2$conformance_results <-
                        "No data available to perform conformance checks."
                    } else {

                      # set colnames (we need them here to correctly
                      # select the data)
                      colnames(s_out) <- c("name", "value")

                      error_flag <- FALSE

                      # TODO add value_thresholds here as tolerance-/border zone
                      result_min <- s_out[get("name") == "Min.", get("value")]
                      result_max <- s_out[get("name") == "Max.", get("value")]

                      # compare levels from results to constraints from valueset
                      #% (TRUE = constraint_error)
                      if (result_min < as.Date(constraints$datetime$min)) {
                        DIZtools::feedback(
                          paste(i, "from", j, ": result_min < datetime$min"),
                          findme = "21abaa38e2",
                          logfile_dir = logfile_dir,
                          logjs = isFALSE(headless),
                          headless = headless
                        )
                        error_flag <- TRUE
                      }

                      if (result_max > as.Date(constraints$datetime$max)) {
                        DIZtools::feedback(
                          paste(i, "from", j, ": result_max > datetime$max"),
                          findme = "44264e8a64",
                          logfile_dir = logfile_dir,
                          logjs = isFALSE(headless),
                          headless = headless
                        )
                        error_flag <- TRUE
                      }

                      outlist2$conformance_error <- error_flag
                      outlist2$conformance_results <-
                        ifelse(
                          isTRUE(error_flag),
                          "Extrem values are not conform with constraints.",
                          "No 'value conformance' issues found."
                        )

                      if (isTRUE(outlist2$conformance_error)) {
                        outlist2$rule <- constraints$datetime
                      }
                    }
                  }
                }
              } else {
                DIZtools::feedback(
                  print_this = paste0(
                    "Cannot check ",
                    scope,
                    " value conformance for variable '",
                    i,
                    "' because there is no check for vartype '",
                    d_out$checks$var_type,
                    "' implemented yet."
                  ),
                  type = "Warning",
                  findme = "4817d8aec4",
                  logfile_dir = logfile_dir,
                  logjs = isFALSE(headless),
                  headless = headless
                )
              }
              outlist[[j]] <- outlist2
            }
          } else {
            DIZtools::feedback(
              print_this = paste0(
                "Didn't perform value conformance checks, because there",
                " are no constraints in the MDR for designation '",
                i,
                "' (key = '",
                tab,
                "')."
              ),
              findme = "4b27c49aa9",
              logfile_dir = logfile_dir,
              logjs = isFALSE(headless),
              headless = headless
            )
          }
        }
        if (length(outlist) > 0) {
          return(outlist)
        } else {
          return()
        }
      })
    },
    simplify = FALSE,
    USE.NAMES = TRUE
  )

  # which list elements have a valid value?
  vec <- sapply(
    X = outlist,
    FUN = function(x) {
      return(length(x))
    }
  )

  return(outlist[vec > 0])
}


#' @title value_conformance_checks helper function
#'
#' @description Internal function to perform value conformance checks.
#'
#' @param results A list object. The list should contain the results of
#'   the function \code{value_conformance}.
#'
#' @return A data.table with the results of the automated comparison of the
#'   value conformance check results between the source data system and the
#'   target data system.
#'
#' @examples
#' \donttest{# runtime ~ 5 sec.
#' utils_path <- system.file(
#'   "demo_data/utilities/",
#'   package = "DQAstats"
#' )
#' mdr_filename <- "mdr_example_data.csv"
#' rv <- list()
#' rv$mdr <- read_mdr(
#'   utils_path = utils_path,
#'   mdr_filename <- mdr_filename
#' )
#'
#' source_system_name <- "exampleCSV_source"
#' target_system_name <- "exampleCSV_target"
#'
#' rv <- c(rv, create_helper_vars(
#'   mdr = rv$mdr,
#'   source_db = source_system_name,
#'   target_db = target_system_name
#' ))
#' # save source/target vars
#' rv$source$system_name <- source_system_name
#' rv$target$system_name <- target_system_name
#' rv$source$system_type <- "csv"
#' rv$target$system_type <- "csv"
#'
#' rv$log$logfile_dir <- tempdir()
#'
#' # set headless (without GUI, progressbars, etc.)
#' rv$headless <- TRUE
#'
#' # set configs
#' demo_files <- system.file("demo_data", package = "DQAstats")
#' Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
#' Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)
#'
#' # get configs
#' rv$source$settings <- DIZutils::get_config_env(
#'   system_name = rv$source$system_name,
#'   logfile_dir = rv$log$logfile_dir,
#'   headless = rv$headless
#' )
#' rv$target$settings <- DIZutils::get_config_env(
#'   system_name = tolower(rv$target$system_name),
#'   logfile_dir = rv$log$logfile_dir,
#'   headless = rv$headless
#' )
#'
#' # set start_time (e.g. when clicking the 'Load Data'-button in shiny
#' rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")
#'
#' # define restricting date
#' rv$restricting_date$use_it <- FALSE
#'
#' # load source data
#' tempdat <- data_loading(
#'   rv = rv,
#'   system = rv$source,
#'   keys_to_test = rv$keys_source
#' )
#' rv$data_source <- tempdat$outdata
#'
#' # load target data
#' tempdat <- data_loading(
#'   rv = rv,
#'   system = rv$target,
#'   keys_to_test = rv$keys_target
#' )
#' rv$data_target <- tempdat$outdata
#'
#' rv$data_plausibility$atemporal <- get_atemp_plausis(
#'   rv = rv,
#'   atemp_vars = rv$pl$atemp_vars,
#'   mdr = rv$mdr,
#'   headless = rv$headless
#' )
#'
#' # add the plausibility raw data to data_target and data_source
#' for (i in names(rv$data_plausibility$atemporal)) {
#'   for (k in c("source_data", "target_data")) {
#'     w <- gsub("_data", "", k)
#'     raw_data <- paste0("data_", w)
#'     rv[[raw_data]][[i]] <-
#'       rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
#'     rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
#'   }
#'   gc()
#' }
#'
#' # calculate descriptive results
#' rv$results_descriptive <- descriptive_results(
#'   rv = rv,
#'   headless = rv$headless
#' )
#'
#' # calculate atemporal plausibilites
#' rv$results_plausibility_atemporal <- atemp_plausi_results(
#'   rv = rv,
#'   atemp_vars = rv$data_plausibility$atemporal,
#'   mdr = rv$mdr,
#'   headless = rv$headless
#' )
#'
#' # calculate unique plausibilites
#' rv$results_plausibility_unique <- uniq_plausi_results(
#'   rv = rv,
#'   uniq_vars = rv$pl$uniq_vars,
#'   mdr = rv$mdr,
#'   headless = rv$headless
#' )
#'
#' rv$conformance$value_conformance <- value_conformance(
#'   rv = rv,
#'   scope = "descriptive",
#'   results = rv$results_descriptive,
#'   headless = rv$headless,
#'   logfile_dir = rv$log$logfile_dir
#' )
#'
#' value_conformance_checks(results = rv$conformance$value_conformance)
#' }
#' @export
#'
value_conformance_checks <- function(results) {
  # get names
  obj_names <- names(results)

  # initialize output table
  out <- data.table::data.table(
    "Variable" = character(0),
    "Check Source Data" = character(0),
    "Check Target Data" = character(0)
  )


  for (i in obj_names) {
    if (results[[i]]$source_data$conformance_results ==
        "No data available to perform conformance checks.") {
      error_source <- "no data available"
    } else {
      error_source <-
        ifelse(
          !is.null(results[[i]]$source_data),
          ifelse(
            results[[i]]$source_data$conformance_error,
            "failed",
            "passed"
          ),
          "ERROR"
        )
    }
    if (results[[i]]$target_data$conformance_results ==
        "No data available to perform conformance checks.") {
      error_target <- "no data available"
    } else {
      error_target <-
        ifelse(
          !is.null(results[[i]]$target_data),
          ifelse(
            results[[i]]$target_data$conformance_error,
            "failed",
            "passed"
          ),
          "ERROR"
        )
    }
    out <- rbind(
      out,
      data.table::data.table(
        "Variable" = i,
        "Check Source Data" = error_source,
        "Check Target Data" = error_target
      )
    )
  }
  return(out)
}