R/int_datatype_matrix.R
In dataquieR: Data Quality in Epidemiological Research

Documented in int_datatype_matrix

#' Check declared data types of metadata in study data
#'
#' @description
#' Checks data types of the study data and for the data type
#' declared in the metadata
#'
#' [Indicator]
#'
#' @details
#' This is a preparatory support function that compares study data with
#' associated metadata. A prerequisite of this function is that the no. of
#' columns in the study data complies with the no. of rows in the metadata.
#'
#' For each study variable, the function searches for its data type declared in
#' static metadata and returns a heatmap like matrix indicating data type
#' mismatches in the study data.
#'
#' List function.
#'
#' @param resp_vars [variable] the names of the measurement variables, if
#'                             missing or `NULL`, all variables will be checked
#' @param study_data [data.frame] the data frame that contains the measurements
#' @param meta_data [data.frame] the data frame that contains metadata
#'                               attributes of study data
#' @param split_segments [logical] return one matrix per study segment
#' @param label_col [variable attribute] the name of the column in the metadata
#'                                       with labels of variables
#' @param max_vars_per_plot [integer] from=0. The maximum number of variables
#'                                            per single plot.
#' @param threshold_value [numeric] from=0 to=100. percentage failing
#'                                  conversions allowed to still classify a
#'                                  study variable convertible.
#' `inheritParams` `acc_distributions`
#' @return a list with:
#'   - `SummaryTable`: data frame about the applicability of each indicator
#'                   function (each function in a column).
#'                   its [integer] values can be one of the following four
#'                   categories:
#'                               0. Non-matching datatype,
#'                               1. Matching datatype,
#'   - `SummaryPlot`: [ggplot2] heatmap plot, graphical representation of
#'                       `SummaryTable`
#'   - `DataTypePlotList`: [list] of plots per (maybe artificial) segment
#'   - `ReportSummaryTable`: data frame underlying `SummaryPlot`
#' @export
#' @importFrom ggplot2 ggplot aes geom_tile scale_fill_manual facet_wrap
#'                     theme_minimal scale_x_discrete xlab guides
#'                     guide_legend theme element_text
#' @examples
#' \dontrun{
#' load(system.file("extdata/meta_data.RData", package = "dataquieR"), envir =
#'   environment())
#' load(system.file("extdata/study_data.RData", package = "dataquieR"), envir =
#'   environment())
#' study_data$v00000 <- as.character(study_data$v00000)
#' study_data$v00002 <- as.character(study_data$v00002)
#' study_data$v00002[3] <- ""
#' appmatrix <- int_datatype_matrix(study_data = study_data,
#'                                  meta_data = meta_data,
#'                                  label_col = LABEL)
#' study_data$v00002[5] <- "X"
#' appmatrix <- int_datatype_matrix(study_data = study_data,
#'                                  meta_data = meta_data,
#'                                  label_col = LABEL)
#' appmatrix$ReportSummaryTable
#' }
int_datatype_matrix <- function(resp_vars = NULL,
                                study_data, meta_data, split_segments =
                                     FALSE, label_col,
                                     max_vars_per_plot = 20,
                                threshold_value = 0
                                # , flip_mode = "noflip" # TODO: Improve style
                                ) {

  if (.called_in_pipeline) {
    return(list(
      SummaryData =
        "Dummy result, should not be displayed, internal error, please report."))
  }

  if (length(max_vars_per_plot) != 1 || !util_is_integer(max_vars_per_plot) ||
      is.na(max_vars_per_plot) || is.nan(max_vars_per_plot) ||
      max_vars_per_plot < 1) {
    util_error(c(
      "max_vars_per_plot must be one strictly positive non-complex integer",
      "value, may be Inf."
    ), applicability_problem = TRUE)
  }

  prep_prepare_dataframes(.replace_missings = FALSE, .adjust_data_type = FALSE)

  # correct variable use?
  util_correct_variable_use("resp_vars",
                            allow_more_than_one = TRUE,
                            allow_null          = TRUE,
                            allow_any_obs_na    = TRUE
  )

  # if no resp_vars specified use all columns of studydata
  if (length(resp_vars) == 0) {
    resp_vars <- names(ds1)
  } else {
    ds1 <- ds1[, resp_vars, drop = FALSE]
  }

  Variables <- resp_vars
  # this matrix will be shown to users
  app_matrix <- data.frame(Variables = Variables, stringsAsFactors = FALSE)
  # this matrix can be used to trigger computations
  # tri_matrix <- data.frame(Variables = Variables)
  # defined for merge of strata (optional)
  by_y <- label_col

  # DATA_TYPE defined in metadata?
  if (!(DATA_TYPE %in% names(meta_data))) {
    util_error(
      c("The attribute DATA_TYPE is not contained in the metadata but is",
        "required for this function."), applicability_problem = TRUE)
  }

  # variables with missing DATA_TYPE?
  if (any(is.na(meta_data$DATA_TYPE))) {
    whichnot <- as.character(meta_data[[label_col]][is.na(meta_data$DATA_TYPE)])
    util_error(paste0("The DATA_TYPE for variable(s) <<", whichnot,
                      ">> is not defined in the metadata."),
               applicability_problem = TRUE)
  }

  # check whether data types adhere to conventions
  if (!all(unique(meta_data$DATA_TYPE) %in% DATA_TYPES)) {
    whichnot <- dplyr::setdiff(unique(meta_data$DATA_TYPE), DATA_TYPES)
    util_warning(paste0("The data type(s): <<", whichnot,
                        ">> is not eligible in the metadata concept."),
                 applicability_problem = TRUE)
    util_error("Please map data types to: %s.", paste0(dQuote(DATA_TYPES),
                                                       collapse = ", "),
               applicability_problem = TRUE)
  }

  # DATA TYPE CONSISTENCY ------------------------------------------------------
  # This step compares the datatype as represented in study data with expected
  # datatypes in metadata and creates a binary vector indicating whether data
  # type fits or not
  dt_appl <-
             as.numeric(util_compare_meta_with_study(sdf = ds1, mdf = meta_data,
                                                     label_col = label_col,
                                                     check_convertible = TRUE,
                                                     check_conversion_stable = TRUE,
                                                     threshold_value =
                                                       threshold_value))
  dt_appl <- recode(dt_appl,
                    `1` = 0,
                    `2` = 1,
                    `0` = 2,
                    `3` = 3) # 3, convetible with drawbacks

  # two columns in heatmap: percentage missmatch, percentage non-convertible
  # percentage convertible?

  app_matrix$MATCH = dt_appl

  # SHOULD STRATIFICATION OF SEGMENTS BE USED? ---------------------------------

  # is STUDY_SEGMENT in metadata and none of the entries has NA?
  strata_defined <- STUDY_SEGMENT %in% names(meta_data)

  if (!strata_defined && split_segments) {
    util_warning(c(
      "Stratification for STUDY_SEGMENT is not possible due to missing",
      "metadata. Will split arbitrarily avoiding too large figures"
    ), applicability_problem = TRUE)
    nvars <- nrow(meta_data)
    meta_data$STUDY_SEGMENT <- paste0("Block #", ceiling(1:nvars /
                                                             max_vars_per_plot))
  } else if (strata_defined && split_segments) {
    if (any(is.na(meta_data$STUDY_SEGMENT))) {
      util_message(c(
        "Some STUDY_SEGMENT are NA. Will assign those to an artificial",
        "segment %s"), dQuote("Other"),
        applicability_problem = TRUE
      )
      meta_data$STUDY_SEGMENT[is.na(meta_data$STUDY_SEGMENT)] <- "Other"
    }
    too_big_blocks <- table(meta_data$STUDY_SEGMENT) > max_vars_per_plot
    too_big_blocks <- names(too_big_blocks)[too_big_blocks]
    for (too_big_block in too_big_blocks) {
      util_message(
        "Will split segemnt %s arbitrarily avoiding too large figures",
        dQuote(too_big_block),
        applicability_problem = FALSE
      )
      nvars <- sum(meta_data$STUDY_SEGMENT == too_big_block, na.rm = TRUE) # TODO: Use [[STUDY_SEGMENT]]
      meta_data$STUDY_SEGMENT[
        meta_data$STUDY_SEGMENT == too_big_block] <-
        paste0(
          meta_data$STUDY_SEGMENT[meta_data$STUDY_SEGMENT ==
                                        too_big_block],
          "#",
          ceiling(1:nvars / max_vars_per_plot)
        )
    }
  }

  if (!(STUDY_SEGMENT %in% names(meta_data))) {
    meta_data[[STUDY_SEGMENT]] <- "Study"
  }

  # merge relation to segments to app_matrix
  app_matrix <- merge(app_matrix, meta_data[, intersect(c(VAR_NAMES, LABEL,
                                                          STUDY_SEGMENT,
                                                          label_col),
                                                        colnames(meta_data)),
                                            FALSE],
    by.x = "Variables",
    by.y = by_y
  )
  app_matrix <- app_matrix[, !(names(app_matrix) %in% c(VAR_NAMES, LABEL,
                                                        label_col))]

  # PREPARE PLOT ---------------------------------------------------------------
  # reorder according VARIABLE_ORDER (optional)
  if (VARIABLE_ORDER %in% colnames(meta_data)) {
    meta_data <- meta_data[order(meta_data$VARIABLE_ORDER), ]
    meta_data[[VARIABLE_ORDER]][is.na(meta_data[[VARIABLE_ORDER]])] <-
      max(meta_data[[VARIABLE_ORDER]], na.rm = TRUE)
    meta_data[[VARIABLE_ORDER]][is.na(meta_data[[VARIABLE_ORDER]])] <-
      1
    app_matrix <- app_matrix[na.omit(match(meta_data[[label_col]],
                                     app_matrix$Variables)), ]
  }

  # assign factor levels
  app_matrix$Variables <- factor(app_matrix$Variables, levels =
                                   app_matrix$Variables)


  # reshape wide to long
#  app_matrix_long <- melt(app_matrix, id.vars = c("Variables",
#                                                  STUDY_SEGMENT))
  app_matrix_long <- stats::reshape(data = app_matrix,
                                     idvar = c("Variables",
                                               STUDY_SEGMENT),
                                     varying = colnames(app_matrix)[2],
                                     v.names = "value",
                                     times = colnames(app_matrix)[2],
                                     direction = "long")
  app_matrix_long$time <- as.factor(app_matrix_long$time)

  lev <- app_matrix$Variables
  app_matrix_long$Variables <- factor(app_matrix_long$Variables,
                                      levels = lev, ordered = FALSE)



  colnames(app_matrix_long) <- c("VARIABLES", "SEGMENT", "IMPLEMENTATION",
                                 "APP_SCORE")
  rownames(app_matrix_long) <- NULL


  # assign factor labels
  app_matrix_long$APP_SCORE <- factor(app_matrix_long$APP_SCORE,
    levels = c(2:0),
    ordered = TRUE,
    labels = c(
      "Non-matching datatype",
      "Non-Matching datatype, convertible",
      "Matching datatype"
    )
  )
  # PLOT -----------------------------------------------------------------------
  # colcode <- c("#B2182B", "#ef6548", "#92C5DE", "#2166AC", "#B0B0B0")
  colcode <- c("#B2182B", "#92C5DE", "#2166AC")
  names(colcode) <- levels(app_matrix_long$APP_SCORE)

  ratio <- dim(app_matrix)[1] / dim(app_matrix)[2]

  # ref_env <- environment() TODO

  plot_me <- function(m) {
    ggplot(m, aes(
      x = IMPLEMENTATION, y = VARIABLES,
      fill = APP_SCORE
    )) +
      geom_tile(colour = "white", linewidth = 0.8) + # https://github.com/tidyverse/ggplot2/issues/5051
      scale_fill_manual(values = colcode, name = " ") +
      {
        if (split_segments) facet_wrap(~SEGMENT, scales = "free_y")# TODO: check ~
      } +
      theme_minimal() +
      scale_x_discrete(position = "top") +
      xlab("") +
      guides(fill = guide_legend(
        ncol = 1, nrow = length(colcode),
        byrow = TRUE
      )) +
      theme(
        legend.position = "bottom",
        axis.text.x = element_text(angle = 90, hjust = 0),
        axis.text.y = element_text(size = 10),
        aspect.ratio = ratio
      ) # + util_coord_flip(ref_env = ref_env) # TODO: estimate w and h, since p is not using discrete axes
  }

  p <- plot_me(app_matrix_long)

  pl <- lapply(
    split(app_matrix_long, app_matrix_long$SEGMENT),
    plot_me
  )

  ReportSummaryTable <- as.data.frame(
    t(util_compare_meta_with_study(sdf = ds1, mdf = meta_data,
                                                     label_col = label_col,
                                                     check_convertible = TRUE,
                                                check_conversion_stable = TRUE,
                                                     return_percentages = TRUE,
                                                     threshold_value =
                                                       threshold_value)))
  #######
  match <- ReportSummaryTable$match
  ReportSummaryTable$match <- NULL
  ReportSummaryTable$convertible_mismatch_stable <-
    ReportSummaryTable$convertible_mismatch_stable / 100
  ReportSummaryTable$convertible_mismatch_unstable <-
    ReportSummaryTable$convertible_mismatch_unstable / 100
  ReportSummaryTable$nonconvertible_mismatch <-
    ReportSummaryTable$nonconvertible_mismatch / 100
  colnames(ReportSummaryTable)[colnames(ReportSummaryTable) ==
                                 "convertible_mismatch_stable"] <-
    "convertible mismatch, stable"
  colnames(ReportSummaryTable)[colnames(ReportSummaryTable) ==
                                 "convertible_mismatch_unstable"] <-
    "convertible mismatch, unstable"
  colnames(ReportSummaryTable)[colnames(ReportSummaryTable) ==
                                 "nonconvertible_mismatch"] <-
    "nonconvertible mismatch"

  ReportSummaryTable$N <- rep(1, nrow(ReportSummaryTable))
  ReportSummaryTable$Variables <- rownames(ReportSummaryTable)

  rownames(ReportSummaryTable) <- NULL

  attr(ReportSummaryTable, "continuous") <- TRUE
  attr(ReportSummaryTable, "colscale") <- c("#B2182B", "#92C5DE", "#2166AC")

  class(ReportSummaryTable) <- union("ReportSummaryTable",
                                     class(ReportSummaryTable))

  app_matrix$PCT_int_vfe_type <-
    setNames(100 * ReportSummaryTable$`nonconvertible mismatch`,
             nm = ReportSummaryTable$Variables)[app_matrix$Variables]

  app_matrix$Variables <- as.character(app_matrix$Variables)

  # create SummaryData table
  SummaryData <- app_matrix[, c("Variables", "PCT_int_vfe_type")]

  SummaryData$PCT_int_vfe_type <-
    paste0(round(SummaryData$PCT_int_vfe_type * nrow(ds1), digits = 0),
           " (",
           base::format(SummaryData$PCT_int_vfe_type, digits = 2, nsmall = 2), "%)")

  names(SummaryData)[names(SummaryData) == "PCT_int_vfe_type"] <-
    "Data type mismatch N (%)"

#  names(SummaryData)[names(SummaryData) == "PCT_int_vfe_type"] <-
#   util_translate_indicator_metrics("PCT_int_vfe_type",
#                                    short = FALSE, long = FALSE)

  SummaryData$`Convertible mismatch, stable N (%)` <-
    paste0( round(ReportSummaryTable$`convertible mismatch, stable` * nrow(ds1),
                  digits = 0), " (",
            base::format(100 * ReportSummaryTable$`convertible mismatch, stable`, digits = 2, nsmall = 2),
            "%)")
  SummaryData$`Convertible mismatch, unstable N (%)`<-
    paste0(round(ReportSummaryTable$`convertible mismatch, unstable` * nrow(ds1),
                 digits = 0), " (",
           base::format(100 * ReportSummaryTable$`convertible mismatch, unstable`, digits = 2, nsmall = 2),
           "%)")

  SummaryData$`Data type match N (%)` <-
    paste0(round(match / 100 * nrow(ds1), digits = 0), " (",
           base::format(match, digits = 2, nsmall = 2), "%)" )

  SummaryData$`Expected DATA_TYPE` <- setNames(meta_data[[DATA_TYPE]],
                        nm = meta_data[[label_col]])[SummaryData$Variables]

  SummaryData$`Observed DATA_TYPE` <-
    setNames(prep_datatype_from_data(resp_vars = # TODO: Improve
                                       SummaryData$Variables,
                                     study_data = ds1),
             nm = colnames(ds1))[SummaryData$Variables]

  SummaryData <- cbind(SummaryData,
                             app_matrix[, c("MATCH", "STUDY_SEGMENT"),
                                        drop = FALSE])

  SummaryData$MATCH <- factor(
    SummaryData$MATCH,
    levels = c(3:0),
    ordered = TRUE,
    labels = c(
      "Non-matching datatype, convertible, unstable",
      "Non-matching datatype",
      "Non-Matching datatype, convertible, stable",
      "Matching datatype"
    )
  )

  colnames(SummaryData)[colnames(SummaryData) == "MATCH"] <-
    "State, given threshold"

  #Add hover text to SummaryData table
  text_to_display <- c(Variables = "",
                       `Data type mismatch N (%)` =
                         paste0("No. observational units with expected and ",
                                "observed data types not matching"),
                       `Convertible mismatch, stable N (%)` =
                         paste0("No. observational units with expected and ",
                                "observed data types not matching. A conversion ",
                                "is possible without modifying the ",
                                "data ",
                                "(e.g., string '1.05' -> float 1.05)"),
                       `Convertible mismatch, unstable N (%)` =
                         paste0("No. observational units with expected and ",
                                "observed data types not matching. A conversion ",
                                "is possible but implies changes in",
                                " the data (e.g., string '2.06' -> integer 2)"),
                       `Match N (%)` =
                         paste0("No. observational units with ",
                                "matching expected-observed data types"),
                       `Expected DATA_TYPE` =
                         "Data type expected from the metadata",
                       `Observed DATA_TYPE` =
                         "Data type observed in the study data",
                       `State, given threshold` =
                         paste0("Classification of the variable in one of ",
                         "the categories: 1) Non-matching datatype; ",
                         "2) Non-matching datatype, convertible, unstable; ",
                         "3) Non-Matching datatype, convertible, stable; ",
                         "4) Matching datatype"),
                       `STUDY_SEGMENT`= ""  )
  attr(SummaryData, "description") <- text_to_display



  SummaryTable <- app_matrix
  SummaryTable$GRADING <- SummaryTable$MATCH == 2
  names(SummaryTable)[names(SummaryTable) == "MATCH"] <- "CLS_int_vfe_type"

  ReportSummaryTable[,
                     c("convertible mismatch, stable",
                       "convertible mismatch, unstable",
                       "nonconvertible mismatch",
                       "N")] <-
    round(ReportSummaryTable[, c("convertible mismatch, stable",
                           "convertible mismatch, unstable",
                           "nonconvertible mismatch",
                           "N")] / ReportSummaryTable$N * nrow(ds1), digits = 0)
  attr(ReportSummaryTable, "relative") <- TRUE

  return(list(
    SummaryPlot = p,
    DataTypePlotList = pl,
    SummaryTable = SummaryTable,
    SummaryData = SummaryData,
    ReportSummaryTable = ReportSummaryTable
  ))
}