R/con_contradictions.R
In dataquieR: Data Quality in Epidemiological Research

Documented in con_contradictions

#' Checks user-defined contradictions in study data
#'
#' @description
#' This approach considers a contradiction if impossible combinations of data
#' are observed in one participant. For example, if age of a participant is
#' recorded repeatedly the value of age is (unfortunately) not able to decline.
#' Most cases of contradictions rest on comparison of two variables.
#'
#' Important to note, each value that is used for comparison may represent a
#' possible characteristic but the combination of these two values is considered
#' to be impossible. The approach does not consider implausible or inadmissible
#' values.
#'
#' [Descriptor]
#'
#' @details
#' ### Algorithm of this implementation:
#'
#'  - Select all variables in the data with defined contradiction rules (static
#'    metadata column CONTRADICTIONS)
#'  - Remove missing codes from the study data (if defined in the metadata)
#'  - Remove measurements deviating from limits defined in the metadata
#'  - Assign label to levels of categorical variables (if applicable)
#'  - Apply contradiction checks on predefined sets of variables
#'  - Identification of measurements fulfilling contradiction rules. Therefore
#'    two output data frames are generated:
#'    - on the level of observation to flag each contradictory value
#'      combination, and
#'    - a summary table for each contradiction check.
#'  - A summary plot illustrating the number of contradictions is generated.
#'
#' List function.
#'
#' @inheritParams .template_function_indicator
#'
#' @param resp_vars [variable list] the name of the measurement variables
#' @param threshold_value [numeric] from=0 to=100. a numerical value
#'                                                 ranging from 0-100
#' @param check_table [data.frame] contradiction rules table.  Table defining
#'                                 contradictions. See details for
#'                                 its required structure.
#' @param summarize_categories [logical] Needs a column 'tag' in the
#'                             `check_table`.
#'                             If set, a summary output is generated for the
#'                             defined categories plus one plot per
#'                             category.
#' @return
#' If `summarize_categories` is `FALSE`:
#' A [list] with:
#'   - `FlaggedStudyData`: The first output of the contradiction function is a
#'                         data frame of similar dimension regarding the number
#'                         of observations in the study data. In addition, for
#'                         each applied check on the variables an additional
#'                         column is added which flags observations with a
#'                         contradiction given the applied check.
#'   - `SummaryTable`: The second output summarizes this information into one
#'                     data frame. This output can be used to provide an
#'                     executive overview on the amount of contradictions. This
#'                     output is meant for automatic digestion within pipelines.
#'   - `SummaryData`: The third output is the same as `SummaryTable` but for
#'                    human readers.
#'   - `SummaryPlot`: The fourth output visualizes summarized information
#'     of `SummaryData`.
#'
#' if `summarize_categories` is `TRUE`, other objects are returned:
#' one per category named by that category (e.g. "Empirical") containing a
#' result for contradictions within that category only. Additionally, in the
#' slot `all_checks` a result as it would have been returned with
#' `summarize_categories` set to `FALSE`. Finally, a slot `SummaryData` is
#' returned containing sums per Category and an according [ggplot2::ggplot] in
#' `SummaryPlot`.
#'
#' @export
#'
#' @importFrom ggplot2 ggplot geom_bar scale_fill_manual theme_minimal
#'                     scale_y_continuous geom_hline coord_flip theme aes
#'                     geom_text xlab scale_x_continuous sec_axis
#' @importFrom stats setNames reorder
#' @seealso
#' [Online Documentation](
#' https://dataquality.qihs.uni-greifswald.de/VIN_con_impl_contradictions.html
#' )
con_contradictions <- function(resp_vars = NULL,
                               study_data,
                               label_col,
                               item_level = "item_level",
                               threshold_value,
                               check_table,
                               summarize_categories = FALSE,
                               # flip_mode = "flip" # TODO: Fix noflip graph
                               meta_data = item_level,
                               meta_data_v2) {
  # Preps ----------------------------------------------------------------------
  util_maybe_load_meta_data_v2()
  # labels used instead of variable names?
  if (!(missing(label_col)) && label_col != VAR_NAMES) {
    util_message(
      sprintf(paste("Labels of variables from %s will be used.",
                    "In this case columns A and B in check_tables must",
                    "refer to labels.", collapse = " "),
      dQuote(label_col))
    )
  } else {
    util_message(paste("Variable names will be used. In this case columns A",
                  "and B in check_tables must refer to variable names."))
  }

  # map meta to study
  prep_prepare_dataframes(.replace_hard_limits = TRUE,
                          .apply_factor_metadata = TRUE)

  util_correct_variable_use("resp_vars",
    allow_more_than_one = TRUE,
    allow_null = TRUE,
    allow_any_obs_na = TRUE
  )

  util_expect_data_frame(check_table, c("ID", "Function_name", "A", "A_levels",
                                        "A_value", "B", "B_levels",  "B_value",
                                        "Label"))

  # table of specified contradictions
  if (missing(check_table) || !is.data.frame(check_table)) {
    util_error(
      c("Missing check_table --",
        "cannot apply contradictions checks w/o contradiction rules"),
      applicability_problem = TRUE)
  }

  if (missing(threshold_value)) {
    threshold_value <- 0
    if (!.called_in_pipeline)
      util_message("No %s has been set, will use default %d",
                 dQuote("threshold_value"), threshold_value,
                 applicability_problem = TRUE)
  }

  ct <- check_table
  ct$Label <- as.character(ct$Label)

  # colors:
  cols <- c("0" = "#2166AC", "1" = "#B2182B")

  # columns:
  expected_cols <- c(
    "ID",
    "Label",
    "Function_name",
    "A",
    "B",
    "A_value",
    "B_value",
    "A_levels",
    "B_levels"
  )

  missing_cols <- !(expected_cols %in% colnames(ct))

  if (any(missing_cols)) {
    util_error(
      "Missing the following columns in the check_table: %s",
      dQuote(expected_cols[missing_cols]),
      applicability_problem = TRUE
    )
  }

  if (summarize_categories) { # TODO: Optimize graphics as in the redcap version of this function.
    # if we want to summarize contradictions per category
    if (!("tag" %in% colnames(ct))) {
      util_error(c(
        "Cannot summerize categories of contradictions,",
        "because these are not defined in the check_table as column 'tag'."),
        applicability_problem = TRUE)
    }
    splitted_tags <- lapply(strsplit(ct$tag, SPLIT_CHAR, fixed = TRUE), trimws)
    tags <- sort(unique(unlist(splitted_tags)))
    tags <- setNames(nm = tags)
    tags_ext <- tags
    tags_ext[["all_checks"]] <- NA
    result <- lapply(tags_ext, function(atag) {
      # generate one output per category (stratified)
      if (is.na(atag)) {
        new_ct <- ct[, -which(colnames(ct) == "tag"), drop = FALSE]
      } else {
        contains_tag <- function(x, tg) {
          any(x == tg, na.rm = TRUE)
        }
        rows_matching_tag <- vapply(splitted_tags, contains_tag, tg = atag,
                                    logical(1))
        new_ct <- ct[rows_matching_tag, -which(colnames(ct) == "tag"),
                     drop = FALSE]
      }
      con_contradictions(
        resp_vars = resp_vars, study_data = study_data,
        meta_data = meta_data, label_col = label_col,
        threshold_value = threshold_value, check_table = new_ct,
        summarize_categories = FALSE
        # , flip_mode = flip_mode TODO
      )
    })
    rx <- lapply(tags_ext, function(atag) {
      # and summarize the contradictions per category/tag
      if (is.na(atag)) {
        sum(rowSums(result[["all_checks"]]$FlaggedStudyData[, -1, drop = FALSE],
                    na.rm = TRUE) > 0) /
          nrow(result[["all_checks"]]$FlaggedStudyData) * 100
      } else {
        sum(rowSums(result[[atag]]$FlaggedStudyData[, -1, drop = FALSE],
                    na.rm = TRUE) > 0) /
          nrow(result[[atag]]$FlaggedStudyData) * 100
      }
    })
    rx <- data.frame(
      category = names(rx),
      percent = unlist(rx),
      GRADING = ordered(ifelse(unlist(rx) > threshold_value, 1, 0))
    )
    result$SummaryData <- rx
    # Plot for summarized contradiction checks -----------------------------------------------------
    p <- util_create_lean_ggplot(ggplot(rx, aes(x = seq_len(nrow(rx)), y = percent,
                        fill = as.ordered(GRADING))) +
      geom_bar(stat = "identity") +
      scale_fill_manual(values = cols, name = " ", guide = "none") +
      theme_minimal() +
      scale_y_continuous(name = "(%)",
                         limits = (c(0, max(1.2 * max(rx$percent),
                                            threshold_value))),
                         expand = expansion(mult = c(0,0.05))) +
      scale_x_continuous(breaks = seq_len(nrow(rx)),
                         sec.axis = sec_axis(~., # TDOO: checl ~
                                             breaks = seq_len(nrow(rx)),
                                             labels = rx$category),
                         trans = "reverse") +
      xlab("Category of applied contradiction checks") +
      geom_hline(yintercept = threshold_value, color = "red", linetype = 2) +
      geom_text(label = paste0(" ", round(rx$percent, 2), "%"),
                hjust = 0, vjust = 0.5) +
      coord_flip() + # TODO
      theme(axis.text.y.right = element_text(size = 14),
            axis.text.y.left = element_blank()),
      rx = rx,
      cols = cols,
      threshold_value = threshold_value
      )

    # p <- p + util_coord_flip(p = p) # TODO: estimate w and h, if p is not using discrete axes util_lazy_add_coord(p, fli)

    # https://stackoverflow.com/a/51795017
#    bp <- ggplot_build(p)
#    w <- 2 * length(bp$layout$panel_params[[1]]$x$get_labels())
#    if (w == 0) {
 #     w <- 10
#    }
#    w <- w + 2 +
#      max(nchar(bp$layout$panel_params[[1]]$y$get_labels()),
#          na.rm = TRUE)
#    h <- 2 * length(bp$layout$panel_params[[1]]$y$get_labels())
 #   if (h == 0) {
#      h <- 10
#    }
 #   h <- h + 15

#    p <- util_set_size(p, width_em = w, height_em = h)

    result$SummaryPlot <- p

    return(result)
  } else {
    ct$A_levels <- as.character(ct$A_levels)
    ct$B_levels <- as.character(ct$B_levels)

    # check and prep metadata
    if (!(CONTRADICTIONS %in% colnames(meta_data))) {
      util_error(
        c("Missing column %s in metadata cannot apply",
          "contradictions checks w/o contradiction rules"),
        dQuote(CONTRADICTIONS),
        applicability_problem = TRUE
      )
    }

    meta_data[["CONTRADICTIONS"]] <-
      as.character(meta_data[["CONTRADICTIONS"]])

    # no variables defined?
    if (length(resp_vars) == 0) {
      if (all(is.na(meta_data[[CONTRADICTIONS]]))) {
        util_error(paste0("No Variables with defined CONTRADICTIONS."),
                   applicability_problem = TRUE,
                   intrinsic_applicability_problem = TRUE)
      } else {
        util_message(paste0(
          "All variables with CONTRADICTIONS in the metadata are used."),
          applicability_problem = TRUE,
          intrinsic_applicability_problem = TRUE)
        resp_vars <- meta_data[[label_col]][!(is.na(meta_data[[CONTRADICTIONS]]))]
        resp_vars <- intersect(resp_vars, colnames(ds1))
      }
    } else {
      # contradictions defined at all?
      if (all(is.na(meta_data[[CONTRADICTIONS]][meta_data[[label_col]] %in%
                                                resp_vars]))) {
        util_error(paste0("No Variables with defined CONTRADICTIONS."),
                   applicability_problem = TRUE,
                   intrinsic_applicability_problem = TRUE)
      }
      # no contradictions for some variables?
      rvs_with_contr <- meta_data[[label_col]][!(is.na(meta_data[[CONTRADICTIONS]])) &
                                       meta_data[[label_col]] %in% resp_vars]
      if (length(rvs_with_contr) < length(resp_vars)) {
        util_message(paste0("The variables ", resp_vars[!(resp_vars %in% rvs_with_contr)],
                            " have no defined CONTRADICTIONS.",
                            collapse = ", "),
                     applicability_problem = TRUE)
      }
      resp_vars <- rvs_with_contr
    }

    # select contradiction checks
    # get checks from metadata
    cl <- meta_data[[CONTRADICTIONS]][!(is.na(meta_data$CONTRADICTIONS))]

    # is list ?
    cl <- unlist(cl)

    # select unique checks
    cl <- unique(as.numeric(unlist(strsplit(as.character(cl), SPLIT_CHAR,
                                            fixed = TRUE))))

    cl <- intersect(cl, ct$ID)

    cl <- cl[order(cl)]

    summary_df1 <- data.frame(Obs = 1:nrow(ds1))

    summary_df2 <- data.frame(
      Check_type = rep(NA, length(cl)),
      Check_ID = rep(NA, length(cl)),
      Study_variables = rep(NA, length(cl)),
      A_levels = rep(NA, length(cl)),
      B_levels = rep(NA, length(cl)),
      N = rep(NA, length(cl)),
      Percent = rep(NA, length(cl)),
      Grading = rep(NA, length(cl)),
      Label = rep(NA, length(cl))
    )

    for (i in seq_along(cl)) {
      prior_names <- names(summary_df1)

      # prepare columns and name of respective check
      summary_df1[i + 1] <- NA
      names(summary_df1) <- c(prior_names, paste0("grading_", cl[i]))

      # which check function is to be applied
      check <- paste(ct$Function_name[ct$ID == cl[i]])

      # parse levels
      a_lev <- gsub("'", "", ct$A_levels[ct$ID == cl[i]])
      a_lev <- unlist(strsplit(a_lev, SPLIT_CHAR, fixed = TRUE))
      a_lev <- trimws(a_lev)

      b_lev <- gsub("'", "", ct$B_levels[ct$ID == cl[i]])
      b_lev <- unlist(strsplit(b_lev, SPLIT_CHAR, fixed = TRUE))
      b_lev <- trimws(b_lev)
      # apply check
      summary_df1[i + 1] <-
        contradiction_functions[[check]](study_data = ds1,
        A = paste(ct$A[ct$ID == cl[i]]),
        A_levels = a_lev,
        A_value = ct$A_value[ct$ID == cl[i]],
        B = paste(ct$B[ct$ID == cl[i]]),
        B_levels = b_lev,
        B_value = ct$B_value[ct$ID == cl[i]]
      )

      # summarize checks
      summary_df2[i, 1] <- cl[i]
      summary_df2[i, 2] <- check
      summary_df2[i, 3] <- paste0(
        "A is: ", ct$A[ct$ID == cl[i]], "; ",
        "B is: ", ct$B[ct$ID == cl[i]]
      )
      summary_df2[i, 4] <- paste(a_lev, collapse = SPLIT_CHAR)
      summary_df2[i, 5] <- paste(b_lev, collapse = SPLIT_CHAR)
      summary_df2[i, 6] <- sum(summary_df1[, i + 1], na.rm = TRUE)
      summary_df2[i, 7] <- sum(summary_df1[, i + 1], na.rm = TRUE) /
        dim(ds1)[1] * 100
      summary_df2[i, 8] <- ifelse(summary_df2[i, 7] > threshold_value, 1, 0)
      summary_df2[i, 9] <- ct$Label[ct$ID == cl[i]]
    }

    summary_df2$Percent <- round(summary_df2$Percent, digits = 2)

    names(summary_df2) <- c(
      "Check ID", "Check type", "Variables A and B", "A Levels",
      "B Levels", "Contradictions (N)", "Contradictions (%)",
      "GRADING", "Label"
    )

    summary_df2$GRADING <- ordered(summary_df2$GRADING) # FIXME: Update GRADING -> Standard Name

    x <- util_as_numeric(reorder(summary_df2[, 1], -summary_df2[, 1]))
    lbs <- as.character(reorder(summary_df2[, 9], -summary_df2[, 1]))
    # plot summary_df2
    p <- util_create_lean_ggplot(ggplot(summary_df2, aes(x = x, y = .data[["Contradictions (%)"]],
                                 fill =
                                    as.ordered(GRADING))) +
      geom_bar(stat = "identity") +
      geom_text(
        label = paste0(" ", round(summary_df2[, 7], digits = 2), "%"),
        hjust = 0, vjust = 0.5
      ) +
      scale_fill_manual(values = cols, name = " ", guide = "none") +
      theme_minimal() +
      xlab("IDs of applied checks") +
      scale_y_continuous(name = "(%)",
                         limits = (c(0, max(1.2 * max(summary_df2[, 7]),
                                            threshold_value)))) +
      scale_x_continuous(breaks = x, sec.axis =
                           sec_axis(~., breaks = x, labels = lbs)) + # TODO: checl ~ ??
      geom_hline(yintercept = threshold_value, color = "red", linetype = 2) +
      coord_flip() + # TODO
      theme(text = element_text(size = 20)),
      summary_df2 = summary_df2,
      threshold_value = threshold_value,
      x = x,
      lbs = lbs,
      cols = cols)

    # p <- p + util_coord_flip(p = p) # TODO: estimate w and h, if p is not using discrete axes util_lazy_add_coord(p, fli)

    # create SummaryTable object
    st1 <- summary_df2
    st1$`Variables A and B` <- gsub("A is: ", "", st1$`Variables A and B`)
    st1$`Variables A and B` <- gsub("B is: ", "", st1$`Variables A and B`)
    st1$Variables <- unlist(lapply(st1$`Variables A and B`,
                                   function(x) unlist(strsplit(x, ";",
                                                               fixed =
                                                                 TRUE))[1]))
    st1$`Reference variable` <- unlist(lapply(st1$`Variables A and B`,
                                              function(x) unlist(
                                                strsplit(x, ";", fixed =
                                                           TRUE))[2]))
    st1$`Variables A and B` <- NULL
    st1 <- st1[, c(9, 10, 1:8)]
    #st1 <- dplyr::rename(st1, c("GRADING" = "Grading"))

#    suppressWarnings({
      # suppress wrong warnings: https://github.com/tidyverse/ggplot2/pull/4439/commits
      # find out size of the plot https://stackoverflow.com/a/51795017
#      bp <- ggplot_build(p)
#      w <- 2 * length(bp$layout$panel_params[[1]]$x$get_labels())
#      if (w == 0) {
#        w <- 10
 #     }
 #     w <- w + 2 +
 #       max(nchar(bp$layout$panel_params[[1]]$y$get_labels()),
 #           na.rm = TRUE)
 #     w <- w +
 #       max(nchar(bp$layout$panel_params[[1]]$y.sec$get_labels()),
 #           na.rm = TRUE)
 #     h <- 2 * length(bp$layout$panel_params[[1]]$y$get_labels())
#      if (h == 0) {
#        h <- 10
#      }
 #     h <- h + 15
#      p <- util_set_size(p, width_em = w, height_em = h)
#    })

    # Output
    return(list(
      FlaggedStudyData = summary_df1,
      SummaryTable = st1, # TODO: VariableGroupTable
      SummaryData = summary_df2,
      SummaryPlot = p
    ))
  }

  # Never called, just for documentation.
  return(list( # nocov start
    FlaggedStudyData = summary_df1,
    SummaryTable = st1,
    SummaryData = summary_df2,
    SummaryPlot = p
  )) # nocov end
}