Nothing
#' Checks user-defined contradictions in study data
#'
#' @description
#' This approach considers a contradiction if impossible combinations of data
#' are observed in one participant. For example, if age of a participant is
#' recorded repeatedly the value of age is (unfortunately) not able to decline.
#' Most cases of contradictions rest on comparison of two variables.
#'
#' Important to note, each value that is used for comparison may represent a
#' possible characteristic but the combination of these two values is considered
#' to be impossible. The approach does not consider implausible or inadmissible
#' values.
#'
#' @details
#' ### Algorithm of this implementation:
#'
#' - Remove missing codes from the study data (if defined in the metadata)
#' - Remove measurements deviating from limits defined in the metadata
#' - Assign label to levels of categorical variables (if applicable)
#' - Apply contradiction checks (given as `REDCap`-like rules in a separate
#' metadata table)
#' - Identification of measurements fulfilling contradiction rules. Therefore
#' two output data frames are generated:
#' - on the level of observation to flag each contradictory value
#' combination, and
#' - a summary table for each contradiction check.
#' - A summary plot illustrating the number of contradictions is generated.
#'
#' List function.
#'
#' @param study_data [data.frame] the data frame that contains the measurements
#' @param meta_data [data.frame] the data frame that contains metadata
#' attributes of study data
#' @param label_col [variable attribute] the name of the column in the metadata
#' with labels of variables
#' @param threshold_value [numeric] from=0 to=100. a numerical value
#' ranging from 0-100
#' @param meta_data_cross_item [data.frame] contradiction rules table. Table
#' defining contractions. See details for
#' its required structure.
#' @param summarize_categories [logical] Needs a column 'CONTRADICTION_TYPE' in
#' the `meta_data_cross_item`.
#' If set, a summary output is generated for the
#' defined categories plus one plot per
#' category. TODO: Not yet controllable by metadata.
#' @param use_value_labels [logical] Deprecated in favor of [DATA_PREPARATION].
#' If set to `TRUE`, labels can be used in the
#' `REDCap` syntax to specify contraction checks for
#' categorical variables. If set to `FALSE`,
#' contractions have to be specified using the coded
#' values. In case that this argument is not set in
#' the function call, it will be set to `TRUE` if
#' the metadata contains a column `VALUE_LABELS`
#' which is not empty.
#'
#' `inheritParams` `acc_distributions`
#'
#' @return
#' If `summarize_categories` is `FALSE`:
#' A [list] with:
#' - `FlaggedStudyData`: The first output of the contradiction function is a
#' data frame of similar dimension regarding the number
#' of observations in the study data. In addition, for
#' each applied check on the variables an additional
#' column is added which flags observations with a
#' contradiction given the applied check.
#' - `SummaryData`: The second output summarizes this information into one
#' data frame. This output can be used to provide an
#' executive overview on the amount of contradictions.
#' - `VariableGroupTable`: A subset of `SummaryData` used within the pipeline.
#' - `SummaryPlot`: The third output visualizes summarized information
#' of `SummaryData`.
#'
#' If `summarize_categories` is `TRUE`, other objects are returned:
#' One per category named by that category (e.g. "Empirical") containing a
#' result for contradiction checks within that category only. Additionally, in the
#' slot `all_checks`, a result as it would have been returned with
#' `summarize_categories` set to `FALSE`. Finally, a slot `SummaryData` is
#' returned containing sums per Category and an according [ggplot] in
#' `SummaryPlot`.
#'
#' @export
#'
#' @importFrom ggplot2 ggplot geom_bar scale_fill_manual theme_minimal
#' scale_y_continuous geom_hline coord_flip theme aes
#' geom_text xlab scale_x_continuous sec_axis
#' @importFrom stats setNames
#' @seealso
#' [Online Documentation](
#' https://dataquality.qihs.uni-greifswald.de/VIN_con_impl_contradictions_redcap.html
#' )
#' @examples
#' \dontrun{ # slow
#' load(system.file("extdata", "meta_data.RData", package = "dataquieR"))
#' load(system.file("extdata", "study_data.RData", package = "dataquieR"))
#' meta_data_cross_item <- prep_get_data_frame("meta_data_v2|cross-item_level")
#' label_col <- "LABEL"
#' threshold_value <- 1
#' con_contradictions_redcap(
#' study_data = study_data, meta_data = meta_data, label_col = label_col,
#' threshold_value = threshold_value, meta_data_cross_item = meta_data_cross_item
#' )
#' con_contradictions_redcap(
#' study_data = study_data, meta_data = meta_data, label_col = label_col,
#' threshold_value = threshold_value, meta_data_cross_item = meta_data_cross_item,
#' summarize_categories = TRUE
#' )
#' }
con_contradictions_redcap <- function(study_data, meta_data,
label_col, threshold_value,
meta_data_cross_item = "cross-item_level",
use_value_labels,
summarize_categories = FALSE # TODO: make addressable from dq_report2
# flip_mode = "flip" # TODO: Fix noflip graph
) {
# preps ----------------------------------------------------------------------
# map metadata to study data
prep_prepare_dataframes(.replace_hard_limits = FALSE,
.replace_missings = FALSE) # replacements are performed later
if (!missing(use_value_labels)) {
util_deprecate_soft(when = "2.0.1",
what = "con_contradictions_redcap(use_value_labels)",
details =
"Please use DATA_PREPARATION in meta_data_cross_item now."
)
}
# table of specified contradictions
util_expect_data_frame(meta_data_cross_item, list(
CONTRADICTION_TERM = is.character,
CHECK_LABEL = is.character
))
meta_data_cross_item <- util_normalize_cross_item(
meta_data = meta_data,
meta_data_cross_item = meta_data_cross_item,
label_col = label_col
)
# There might be rows without contradiction rules (NAs), which should be removed first.
if (any(is.na(meta_data_cross_item[[CONTRADICTION_TERM]]))) {
meta_data_cross_item <- meta_data_cross_item[-which(is.na(meta_data_cross_item[[CONTRADICTION_TERM]])), ]
}
if ("VARIABLE_LIST" %in% colnames(meta_data_cross_item)) {
old_vl <- meta_data_cross_item$VARIABLE_LIST
meta_data_cross_item$VARIABLE_LIST <- NULL
} else {
old_vl <- NULL
}
# generate VARIABLE_LIST entries
# TODO: Support "[" and "]" in variable labels
# TODO: handle also VAR_NAMES here. The VARIABLE_LIST below should then likely be mapped back to one soft of identifiers.
needles_var_names <- unique(c(meta_data[[VAR_NAMES]],
meta_data[[label_col]],
meta_data[[LABEL]],
meta_data[[LONG_LABEL]]))
needles <- paste0("[",
needles_var_names,
"]")
x <- vapply(setNames(needles, nm = needles_var_names),
grepl,
setNames(nm = meta_data_cross_item[[CONTRADICTION_TERM]]),
fixed = TRUE,
FUN.VALUE = logical(length = nrow(meta_data_cross_item))
)
if (is.vector(x)) {
x <- as.matrix(t(x))
}
# variablelist <- lapply(lapply(lapply(apply(x, 1, which, simplify = FALSE), # apply supports simplify from R 4.1.0
# names), sort), unique) # so use the following, less intuitive code line
variablelist <- unname(lapply(as.data.frame(t(x)), function(xx) unique(sort(colnames(x)[xx]))))
variablelist <-
lapply(variablelist, paste0, collapse = sprintf(" %s ", SPLIT_CHAR))
if (!is.null(old_vl)) { # FIXME: Remove!!
old_vars <- util_parse_assignments(old_vl)
new_vars <- util_parse_assignments(variablelist)
if (length(setdiff(variablelist, old_vl)) > 0) {
meta_data_cross_item$VARIABLE_LIST <- variablelist # TODO: To 000_globs.R
util_message("%s is incomplete for contradictions. %s will be used for finding all related variables.",
dQuote(VARIABLE_LIST),
dQuote(CONTRADICTION_TERM),
applicability_problem = TRUE)
} else if (length(setdiff(old_vl, variablelist)) > 0) {
meta_data_cross_item$VARIABLE_LIST <- variablelist # TODO: To 000_globs.R
util_message("%s features more variables than the corresponding %s. %s will be used for finding all related variables.",
dQuote(VARIABLE_LIST),
dQuote(CONTRADICTION_TERM),
dQuote(CONTRADICTION_TERM),
applicability_problem = TRUE)
} else {
meta_data_cross_item$VARIABLE_LIST <- old_vl # TODO: To 000_globs.R
}
}
if (missing(threshold_value)) {
threshold_value <- 0
if (!.called_in_pipeline)
util_message("No %s has been set, will use default %d",
dQuote("threshold_value"), threshold_value,
applicability_problem = TRUE)
} else {
util_expect_scalar(threshold_value, check_type = function(x) {
is.numeric(x) && x >= 0 && x <= 100
})
}
util_expect_scalar(summarize_categories, check_type = is.logical)
# parse redcap rules to obtain interpretable contradiction checks
compiled_rules <- lapply(setNames(nm = meta_data_cross_item[[CONTRADICTION_TERM]]),
util_parse_redcap_rule)
# colors
cols <- c("0" = "#2166AC", "1" = "#B2182B")
# summarize contradictions per category given in CONTRADICTION_TYPE -------------------------------------
if (summarize_categories) {
if (!(CONTRADICTION_TYPE %in% colnames(meta_data_cross_item))) {
util_error(c(
"Cannot summerize categories of contradictions,",
"because these are not defined in the meta_data_cross_item",
"as column %s."),
sQuote(CONTRADICTION_TYPE),
applicability_problem = TRUE)
}
split_tags <- lapply(strsplit(meta_data_cross_item[[CONTRADICTION_TYPE]], SPLIT_CHAR, fixed = TRUE), trimws)
tags <- sort(unique(unlist(split_tags)))
tags <- setNames(nm = tags)
tags_ext <- tags
tags_ext[["all_checks"]] <- NA
result <- lapply(tags_ext, function(atag) {
# generate one output per category (stratified)
if (is.na(atag)) {
new_ct <- meta_data_cross_item[, -which(colnames(meta_data_cross_item) == CONTRADICTION_TYPE), drop = FALSE]
} else {
contains_tag <- function(x, tg) {
any(x == tg, na.rm = TRUE)
}
rows_matching_tag <- vapply(split_tags, contains_tag, tg = atag,
logical(1))
new_ct <- meta_data_cross_item[rows_matching_tag, -which(colnames(meta_data_cross_item) == CONTRADICTION_TYPE),
drop = FALSE]
}
# recursive call of the function only for the contradiction checks of the currently selected category in "atag"
r <- try(con_contradictions_redcap(
study_data = study_data,
meta_data = meta_data, label_col = label_col,
threshold_value = threshold_value, meta_data_cross_item = new_ct,
summarize_categories = FALSE
# , flip_mode = flip_mode TODO
), silent = TRUE)
if (inherits(r, "try-error")) {
list(FlaggedStudyData = data.frame())
} else {
r
}
})
# summarize the outputs of the recursive calls
rx <- lapply(tags_ext, function(atag) {
if (is.na(atag)) {
round(sum(rowSums(result[["all_checks"]]$FlaggedStudyData[, -1, drop = FALSE],
na.rm = TRUE) > 0) /
nrow(result[["all_checks"]]$FlaggedStudyData) * 100, digits = 2)
} else {
round(sum(rowSums(result[[atag]]$FlaggedStudyData[, -1, drop = FALSE],
na.rm = TRUE) > 0) /
nrow(result[[atag]]$FlaggedStudyData) * 100, digits = 2)
}
})
rx <- data.frame(
category = names(rx),
PCT_con_con = unlist(rx),
GRADING = ordered(ifelse(unlist(rx) > threshold_value, 1, 0))
)
result$SummaryData <- rx
# Plot for summarized contradiction checks -----------------------------------------------------
# TODO: work on the changes here and below with position = "top"
p <- ggplot(rx, aes(x = seq_len(nrow(rx)), y = PCT_con_con,
fill = as.ordered(GRADING))) +
geom_bar(stat = "identity") +
scale_fill_manual(values = cols, name = " ", guide = "none") +
theme_minimal() +
scale_y_continuous(name = "(%)",
limits = (c(0, max(1.2 * suppressWarnings(
max(rx$PCT_con_con)),
threshold_value))),
expand = expansion(mult = c(0,0.05))) +
scale_x_continuous(breaks = seq_len(nrow(rx)),
labels = rx$category,
position = "top",
trans = "reverse") +
# xlab("Category of applied contradiction checks") +
xlab("") +
geom_hline(yintercept = threshold_value, color = "red", linetype = 2) +
geom_text(label = paste0(" ", rx$PCT_con_con, "%"),
hjust = 0, vjust = 0.5) +
coord_flip() + # TODO
theme(axis.text.y.right = element_text(size = 14),
axis.text.y.left = element_blank())
# p <- p + util_coord_flip(p = p) # TODO: estimate w and h, if p is not using discrete axes
# https://stackoverflow.com/a/51795017
bp <- ggplot_build(p)
w <- 2 * length(bp$layout$panel_params[[1]]$x$get_labels())
if (w == 0) {
w <- 10
}
w <- w + 2 +
max(nchar(bp$layout$panel_params[[1]]$y$get_labels()),
na.rm = TRUE)
h <- 2 * length(bp$layout$panel_params[[1]]$y$get_labels())
if (h == 0) {
h <- 10
}
h <- h + 15
p <- util_set_size(p, width_em = w, height_em = h)
result$SummaryPlot <- p
return(result)
} else {
# run contradiction checks without summarizing -------------------------------------------------------
# apply contradiction checks -------------------------------------------------------------------------
rule_match <- mapply(
SIMPLIFY = FALSE,
rule = compiled_rules,
prep = util_parse_assignments(
meta_data_cross_item[[DATA_PREPARATION]], multi_variate_text = TRUE),
FUN = function(rule, prep) {
prep <- as.character(names(prep))
use_value_labels <- ("LABEL" # meta_data_cross_item has been normalized, already
%in% prep)
if ("MISSING_NA" %in% prep) {
replace_missing_by <- "NA"
} else if ("MISSING_LABEL" %in% prep) {
replace_missing_by <- "LABEL"
} else if ("MISSING_INTERPRET" %in% prep) {
replace_missing_by <- "INTERPRET"
} else {
replace_missing_by <- ""
}
replace_limits <- ("LIMITS"
%in% prep)
if (is.list(rule) && !length(rule) && is.null(attr(rule, "class"))) {
r <- try(util_error("Parser error"), silent = TRUE)
} else {
r <- try(util_eval_rule(rule = rule,
ds1 = ds1,
meta_data = meta_data,
use_value_labels = use_value_labels,
replace_missing_by = replace_missing_by,
replace_limits = replace_limits
),
silent = TRUE)
}
if (inherits(r, "try-error")) {
rule_src <- attr(rule, "src")
if (length(rule_src) == 0) {
rule_src <- util_deparse1(rule)
}
util_warning("Could not evaluate rule %s: %s",
dQuote(rule_src),
conditionMessage(attr(r, "condition")))
r <- TRUE
}
r
})
rule_match <- lapply(rule_match, as.logical) # TODO: if a rule returned "", this is NA, if it did not return TRUE, FALSE or "", this would be a warning
list_element_length <- vapply(rule_match, length, FUN.VALUE = integer(1))
if (any(list_element_length == 0) && !all(list_element_length == 0)) {
# not all columns of same length, fix this for as.data.frame
rule_match[list_element_length == 0] <-
list(rep(TRUE, nrow(ds1)))
}
list_element_length <- vapply(rule_match, length, FUN.VALUE = integer(1))
if (any(list_element_length == 1)) {
# not all columns of same length, fix this for as.data.frame
rule_match[list_element_length == 1] <- lapply(
rule_match[list_element_length == 1],
function(to_recycle) {
rep(to_recycle, nrow(ds1))
})
}
if (length(unique(vapply(rule_match, length, FUN.VALUE = integer(1)))) > 1) {
util_error(c("Internal error: unexpected inhomogeneous length of rules result.",
"This is an internal error, please excuse and contact the dataquieR developers."))
}
if (length(unique(vapply(rule_match, length, FUN.VALUE = integer(1)))) == 0) {
summary_df1 <- data.frame(Obs = seq_len(nrow(ds1)))
} else {
summary_df1 <- cbind(data.frame(Obs = seq_len(nrow(ds1))),
as.data.frame(rule_match))
}
colnames(summary_df1)[-1] <- paste0("flag_con",
formatC(seq_len(nrow(meta_data_cross_item)),
width = nchar(nrow(meta_data_cross_item)),
format = "d",
flag = "0"))
summary_df2 <- meta_data_cross_item
summary_df2$NUM_con_con <- as.numeric(lapply(rule_match, sum, na.rm = TRUE))
summary_df2$PCT_con_con <- round(summary_df2$NUM_con_con / nrow(ds1) * 100,
digits = 2)
summary_df2$GRADING <- ifelse(summary_df2$PCT_con_con > threshold_value,
1, 0)
summary_df3 <- summary_df2[, c("VARIABLE_LIST", "NUM_con_con",
"PCT_con_con", "GRADING")]
# Plot for all contradiction checks --------------------------------------------------------
p <- ggplot(summary_df2, aes(x = seq_len(nrow(meta_data_cross_item)), y = PCT_con_con,
fill = as.ordered(GRADING))) +
geom_bar(stat = "identity") +
scale_fill_manual(values = cols, name = " ", guide = "none") +
theme_minimal() +
# xlab("Applied contradiction checks") +
xlab("") +
scale_y_continuous(name = "(%)",
limits = c(0, max(1.2 * suppressWarnings(
max(summary_df2$PCT_con_con)),
threshold_value)),
expand = expansion(mult = c(0,0.05))) +
scale_x_continuous(breaks = seq_len(nrow(meta_data_cross_item)),
labels = meta_data_cross_item$CHECK_LABEL,
position = "top",
trans = "reverse") +
geom_hline(yintercept = threshold_value, color = "red", linetype = 2) +
geom_text(label = paste0(" ", summary_df2$PCT_con_con, "%"),
hjust = 0, vjust = 0.5) +
coord_flip() + # TODO
theme(axis.text.y.right = element_text(size = 14),
axis.text.y.left = element_blank())
# p <- p + util_coord_flip(p = p) # TODO: estimate w and h, if p is not using discrete axes
suppressWarnings({
# suppress wrong warnings: https://github.com/tidyverse/ggplot2/pull/4439/commits
# find out size of the plot https://stackoverflow.com/a/51795017
bp <- ggplot_build(p)
w <- 2 * length(bp$layout$panel_params[[1]]$x$get_labels())
if (w == 0) {
w <- 10
}
w <- w + 2 +
max(nchar(bp$layout$panel_params[[1]]$y$get_labels()),
na.rm = TRUE)
w <- w +
max(nchar(bp$layout$panel_params[[1]]$y.sec$get_labels()),
na.rm = TRUE)
h <- 2 * length(bp$layout$panel_params[[1]]$y$get_labels())
if (h == 0) {
h <- 10
}
h <- h + 15
p <- util_set_size(p, width_em = w, height_em = h)
})
if (!prod(dim(summary_df3))) {
util_error("No contradiction check defined",
applicability_problem = TRUE,
intrinsic_applicability_problem = TRUE)
}
# Output
return(list(
FlaggedStudyData = summary_df1,
SummaryTable = summary_df2,
VariableGroupTable = summary_df3,
SummaryPlot = p
))
}
# Never called, just for documentation.
return(list( # nocov start
FlaggedStudyData = summary_df1,
SummaryTable = summary_df2,
VariableGroupTable = summary_df3,
SummaryPlot = p
)) # nocov end
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.