Nothing
#' Plots and checks for distributions
#'
#' @description
#' Data quality indicator checks "Unexpected location" and "Unexpected
#' proportion" with histograms and, if a grouping variable is included, plots of
#' empirical cumulative distributions for the subgroups.
#'
#' [Indicator]
#'
#' @details
#' # Algorithm of this implementation:
#'
#' - If no response variable is defined, select all variables of type float or
#' integer in the study data.
#' - Remove missing codes from the study data (if defined in the metadata).
#' - Remove measurements deviating from (hard) limits defined in the metadata
#' (if defined).
#' - Exclude variables containing only `NA` or only one unique value (excluding
#' `NA`s).
#' - Perform check for "Unexpected location" if defined in the metadata (needs a
#' LOCATION_METRIC (mean or median) and LOCATION_RANGE (range of expected values
#' for the mean and median, respectively)).
#' - Perform check for "Unexpected proportion" if defined in the metadata (needs
#' PROPORTION_RANGE (range of expected values for the proportions of the
#' categories)).
#' - Plot histogram(s).
#' - If group_vars is specified by the user, distributions within group-wise
#' ecdf are presented.
#'
#' @export
#'
#' @param resp_vars [variable list] the names of the measurement variables
#' @param group_vars [variable list] the name of the observer, device or
#' reader variable
#' @param study_data [data.frame] the data frame that contains the measurements
#' @param meta_data [data.frame] the data frame that contains metadata
#' attributes of study data
#' @param label_col [variable attribute] the name of the column in the metadata
#' with labels of variables
#' @param plot_ranges [logical] Should the plot show ranges and results from the
#' data quality checks? (default: TRUE)
#' @param check_param [enum] any | location | proportion. Which type of check
#' should be conducted (if possible): a check on the
#' location of the mean or median value of the study
#' data, a check on proportions of categories, or either
#' of them if the necessary metadata is available.
#' @param flip_mode [enum] default | flip | noflip | auto. Should the plot be
#' in default orientation, flipped, not flipped or
#' auto-flipped. Not all options are always supported.
#' In general, this con be controlled by
#' setting the `roptions(dataquieR.flip_mode = ...)`. If
#' called from `dq_report`, you can also pass
#' `flip_mode` to all function calls or set them
#' specifically using `specific_args`.
#'
#' @return A [list] with:
#' - `SummaryTable`: [data.frame] containing data quality checks for
#' "Unexpected location" (`FLG_acc_ud_loc`) and "Unexpected
#' proportion" (`FLG_acc_ud_prop`) for each response
#' variable in `resp_vars`.
#' - `SummaryData`: a [data.frame] containing data quality checks for
#' "Unexpected location" and / or "Unexpected proportion"
#' for a report
#' - `SummaryPlotList`: [list] of [ggplot]s for each response variable in
#' `resp_vars`.
#'
#' @importFrom ggplot2 ggplot aes geom_histogram geom_bar geom_vline stat_ecdf
#' geom_segment scale_x_continuous scale_color_manual
#' coord_flip theme_minimal theme element_text element_blank
#' labs ylab
#' @importFrom stats na.omit
#' @importFrom rlang .data
#' @importFrom grDevices hcl.colors
#' @seealso
#' [Online Documentation](
#' https://dataquality.qihs.uni-greifswald.de/VIN_acc_impl_distributions.html
#' )
acc_distributions <- function(resp_vars = NULL, group_vars = NULL,
study_data, meta_data,
label_col,
check_param = c("any", "location", "proportion"),
plot_ranges = TRUE,
flip_mode = "noflip") { # FIXME: EK: Please address the distribution issue of having too many different versions (after discussion with me)
# preps ----------------------------------------------------------------------
# map metadata to study data
prep_prepare_dataframes(.replace_hard_limits = TRUE)
# If no response variable is defined, all suitable variables will be selected.
if (length(resp_vars) == 0) {
util_message(
c("All variables defined to be integer or float in the metadata are used",
"by acc_distributions."), # TODO: extend for nominal and ordinal variables
applicability_problem = TRUE, intrinsic_applicability_problem = TRUE)
resp_vars <- meta_data[[label_col]][meta_data$DATA_TYPE %in%
c("integer", "float")]
resp_vars <- intersect(resp_vars, colnames(ds1))
if (length(resp_vars) == 0) {
util_error("No suitable variables were defined for acc_distributions.",
applicability_problem = TRUE)
}
}
# set up grouping variables, if needed
if (all(is.na(group_vars))) {
group_vars <- NULL
} else {
util_correct_variable_use("group_vars",
allow_null = TRUE,
allow_more_than_one = FALSE,
allow_any_obs_na = TRUE,
allow_all_obs_na = FALSE,
need_type = "!float",
need_scale = "nominal | ordinal"
)
# TODO: Still needed? Utility function?
if (length(group_vars) > 0) {
# all labelled variables
levlabs <- meta_data$VALUE_LABELS[meta_data[[label_col]] %in% group_vars]
if (any(grepl("=", levlabs) | is.na(levlabs))) {
# any variables without labels?
if (any(is.na(levlabs))) {
util_warning(paste0(
"Variables ", paste0(group_vars[is.na(levlabs)], collapse = ", "),
" have no assigned labels and levels and can therefore not be ",
"used as grouping variables in acc_distributions."
), applicability_problem = TRUE)
}
# only variables with labels
gvs_ll <- group_vars[!is.na(levlabs)]
gvs_ll <- gvs_ll[match(gvs_ll,
meta_data[[label_col]][
meta_data[[label_col]] %in% group_vars])]
levlabs <- levlabs[!is.na(levlabs)]
for (i in seq_along(gvs_ll)) {
ds1[[gvs_ll[i]]] <- util_assign_levlabs(
variable = ds1[[gvs_ll[i]]],
string_of_levlabs = levlabs[i],
splitchar = SPLIT_CHAR,
assignchar = " = "
)
}
}
# The grouping variable(s) should not be included as response variable(s).
if (any(group_vars %in% resp_vars)) {
resp_vars <- resp_vars[-which(resp_vars %in% group_vars)]
util_warning(paste("Removed grouping variable from response variables",
"for acc_distributions."),
applicability_problem = TRUE)
}
if (length(resp_vars) == 0) {
util_warning("No variables left to analyse for acc_distributions.",
applicability_problem = TRUE,
intrinsic_applicability_problem = TRUE)
return(list(SummaryTable = list(),
SummaryPlotList = list(),
SummaryData = list()))
}
}
}
util_correct_variable_use("resp_vars",
allow_more_than_one = TRUE,
allow_any_obs_na = TRUE,
allow_all_obs_na = FALSE,
min_distinct_values = 2,
do_not_stop = ifelse(length(resp_vars) > 1,
TRUE, FALSE),
need_scale = "!na"
)
# Which parameter of the distribution should be checked?
dq_param <- match.arg(check_param)
# DQ indicator "Unexpected location" requires metadata for the range of
# expected values for the location parameter and the location metric.
# DQ indicator "Unexpected proportion" requires metadata for the range of
# expected percentages for the categories.
if (!any(grepl("_RANGE", colnames(meta_data)))) {
util_message(paste("Metadata does not contain columns for expected ranges",
"of location or proportions. So acc_distributions can",
"generate plots, but can not perform data quality",
"checks."), applicability_problem = TRUE,
intrinsic_applicability_problem = TRUE)
# create empty metadata list
rvs_meta <- list("Metric" = setNames(nm = resp_vars,
rep(NA, length(resp_vars))),
"Range" = setNames(nm = resp_vars,
rep(NA, length(resp_vars))))
} else {
if (dq_param == "any") {
# split variables for location and proportion checks and call
# acc_distributions recursively
scale_level <- meta_data[[SCALE_LEVEL]][match(resp_vars,
meta_data[[label_col]])]
if (any(scale_level %in% c("interval", "ratio"))) {
rvs_loc <- resp_vars[scale_level %in%
c("interval", "ratio")]
res_loc <- suppressWarnings(acc_distributions(
resp_vars = rvs_loc,
group_vars = group_vars, study_data = study_data,
meta_data = meta_data, label_col = label_col,
check_param = "location", plot_ranges = plot_ranges,
flip_mode = flip_mode))
res_merged <- res_loc
}
if (any(!(scale_level %in% c("interval", "ratio")))) {
rvs_prop <- resp_vars[!(scale_level %in%
c("interval", "ratio"))]
res_prop <- suppressWarnings(acc_distributions(
resp_vars = rvs_prop,
group_vars = group_vars, study_data = study_data,
meta_data = meta_data, label_col = label_col,
check_param = "proportion", plot_ranges = plot_ranges,
flip_mode = flip_mode))
if (exists("res_merged")) {
res_merged$SummaryTable <- merge(res_merged$SummaryTable,
res_prop$SummaryTable,
all = TRUE, sort = FALSE)
res_merged$SummaryData <- merge(res_merged$SummaryData,
res_prop$SummaryData,
all = TRUE, sort = FALSE)
res_merged$SummaryPlotList <- c(res_merged$SummaryPlotList,
res_prop$SummaryPlotList)
} else {
res_merged <- res_prop
}
}
# order merged results to match the order of `resp_vars`
if (length(resp_vars) > 1) {
res_merged$SummaryTable <- res_merged$SummaryTable[
match(resp_vars, res_merged$SummaryTable$Variables), ]
res_merged$SummaryData <- res_merged$SummaryData[
match(resp_vars, res_merged$SummaryData$Variables), ]
res_merged$SummaryPlotList <- res_merged$SummaryPlotList[resp_vars]
}
return(res_merged)
} else if (dq_param == "location") {
rvs_meta <- util_prep_location_check(resp_vars = resp_vars,
meta_data = meta_data,
report_problems = "warning")
} else {
rvs_meta <- util_prep_proportion_check(resp_vars = resp_vars,
meta_data = meta_data,
study_data = ds1,
report_problems = "warning")
}
}
rvs_with_meta <- names(rvs_meta$Range)[which(!is.na(rvs_meta$Range))]
# Which variables are of type 'datetime'?
is_datetime_var <- vapply(resp_vars, function(rv) {
meta_data[["DATA_TYPE"]][meta_data[[label_col]] == rv] ==
DATA_TYPES$DATETIME
}, FUN.VALUE = logical(1))
# data quality indicator checks ----------------------------------------------
if (dq_param == "location") {
dq_check_list <- lapply(setNames(nm = resp_vars), function(rv) {
rv_data <- ds1[[rv]]
loc_func <- rvs_meta$Metric[[rv]]
if (!is.na(loc_func)) {
loc_val <- do.call(loc_func, args = list(x = rv_data, na.rm = TRUE))
loc_check <- !(redcap_env$`in`(loc_val, rvs_meta$Range[[rv]]))
} else {
loc_val <- NA
loc_check <- NA
}
loc_res <- setNames(nm = c("Variables",
# paste(c("FLG","VAL"), "acc_ud_loc", sep = "_"),
"FLG_acc_ud_loc", "values_from_data",
"loc_func",
"loc_range"),
list(rv,
loc_check,
loc_val,
loc_func,
util_find_var_by_meta(resp_vars = rv,
target = "LOCATION_RANGE",
meta_data = meta_data)))
loc_res
})
} else if (dq_param == "proportion") {
dq_check_list <- lapply(setNames(nm = resp_vars), function(rv) {
rv_data <- ds1[[rv]]
# compute proportions for variable `rv` in the study data
prop_table <- table(rv_data)
prop_table <- prop_table/length(rv_data[which(!is.na(rv_data))]) * 100
# If some categories were expected but not observed, we have to add zeroes
# to the table.
if (!all(names(rvs_meta$Range[[rv]]) %in% names(prop_table))) {
nonmiss_cat <- names(prop_table)
miss_cat <- setdiff(names(rvs_meta$Range[[rv]]), nonmiss_cat)
prop_table <- setNames(c(prop_table, rep(0, length(miss_cat))),
nm = c(nonmiss_cat, miss_cat))
}
# order categories as in value labels list
prop_table <- prop_table[union(names(rvs_meta$Range[[rv]]),
names(prop_table))]
# check for each category of the variable whether the proportion lies
# within the expected range
prop_in_range <- mapply(as.list(prop_table),
rvs_meta$Range[[rv]],
FUN = function(pp, int) {
if (inherits(int, "interval")) {
redcap_env$`in`(pp, int)
} else {
NA
}
})
# Which categories of the variable should be flagged, if any?
cat_flg <- names(prop_in_range)[which(!prop_in_range)]
if (length(cat_flg) == 0) cat_flg <- NA
prop_res <- setNames(nm = c(
"Variables",
# paste(c("FLG", "VAL"), "acc_ud_prop", sep = "_"),
"FLG_acc_ud_prop", "values_from_data",
"prop_range",
"flg_which"),
list(rv,
any(!(prop_in_range[which(!is.na(prop_in_range))])),
paste(
paste(names(prop_table), round(prop_table, 1), sep = " = "),
collapse = " | "),
util_find_var_by_meta(resp_vars = rv,
target = "PROPORTION_RANGE",
meta_data = meta_data),
cat_flg))
prop_res
})
# remove the list of flagged categories to arrange results as dataframe
flg_cat_prop <- lapply(dq_check_list, function(x) { x[["flg_which"]] })
dq_check_list <- lapply(dq_check_list, function(x) {
x[-which(names(x) == "flg_which")]
})
} else {
dq_check_list <- list()
}
res_dq_check <- do.call(rbind.data.frame, dq_check_list)
rownames(res_dq_check) <- NULL
# summary table --------------------------------------------------------------
res_out <- res_dq_check[, 1:4]
# GRADING for backwards compatibility
res_out$GRADING <- as.numeric(res_out[, 2])
res_out$GRADING[which(is.na(res_out$GRADING))] <- 0
# summary data ---------------------------------------------------------------
if (dq_param == "location") {
res_view <- data.frame("Variables" = res_dq_check$Variables,
"Measure of location" = res_dq_check$loc_func,
"Value" = res_dq_check$values_from_data,
"Range of expected values" =
res_dq_check$loc_range,
"Flag" = res_dq_check$FLG_acc_ud_loc,
check.names = FALSE)
} else if (dq_param == "proportion") {
res_view <- data.frame("Variables" = res_dq_check$Variables,
"Proportions" = res_dq_check$values_from_data,
"Range of expected values" =
res_dq_check$prop_range,
"Flag" = res_dq_check$FLG_acc_ud_prop,
check.names = FALSE)
}
# plot -----------------------------------------------------------------------
ref_env <- environment()
plot_list <- lapply(setNames(nm = resp_vars), function(rv) {
# omit NAs from data to prevent ggplot2 warning messages
ds1 <- ds1[!(is.na(ds1[[rv]])), , drop = FALSE]
# Should the plot be a histogram? If not, it will be a bar chart.
plot_histogram <-
meta_data[[SCALE_LEVEL]][which(meta_data[[label_col]] == rv)] %in%
c(SCALE_LEVELS$INTERVAL, SCALE_LEVELS$RATIO) #&&
#!(dq_param == "proportion" && rv %in% rvs_with_meta)
txtspec <- element_text(
colour = "black", hjust = .5,
vjust = .5, face = "plain"
)
col_bars <- "grey"
blue_red <- hcl.colors(10, "Plasma")[c(1,6)]
if (plot_histogram) {
# histogram --------------------------------------------------------------
# compute bin breaks
bin_breaks <- suppressMessages(util_optimize_histogram_bins(
x = ds1[[rv]],
nbins_max = 100
))
breaks_x <- bin_breaks[[1]]
# plot histogram
p <- ggplot(data = ds1[, rv, drop = FALSE], aes(x = .data[[rv]])) +
geom_histogram(breaks = breaks_x,
fill = col_bars,
color = col_bars)
if (!is_datetime_var[[rv]]) {
p <- p + scale_x_continuous(expand = expansion(mult = 0.1),
name = paste0(rv))
} else {
p <- p + ggplot2::scale_x_datetime(expand = expansion(mult = 0.1),
name = paste0(rv))
}
} else {
# bar chart --------------------------------------------------------------
col_bars_sep <- setNames(nm = union(names(rvs_meta$Range[[rv]]),
unique(ds1[[rv]])),
rep(col_bars,
length(union(names(rvs_meta$Range[[rv]]),
unique(ds1[[rv]])))))
if (plot_ranges & any(!is.na(flg_cat_prop[[rv]]))) {
# if bars outside of expected ranges shall be highlighted
col_bars_sep[flg_cat_prop[[rv]]] <- blue_red[2]
}
col_bars <- col_bars_sep
count_tab <- table(factor(ds1[[rv]]))
col_bars <- col_bars[names(count_tab)[which(count_tab > 0)]]
# bar for 0 counts has 0 height and needs no color
# (would cause a ggplot error)
exp_bars <- names(col_bars_sep) # expected bars
ds1[[rv]] <- factor(ds1[[rv]], levels = exp_bars)
if ("VALUE_LABELS" %in% colnames(meta_data) &&
!util_empty(meta_data$VALUE_LABELS[which(meta_data[[label_col]] ==
rv)])) {
val_lab <- util_parse_assignments(
split_on_any_split_char = TRUE, split_char = c(SPLIT_CHAR, '<'),
meta_data[[VALUE_LABELS]][which(meta_data[[label_col]] == rv)])
if (all(levels(ds1[[rv]]) %in% names(val_lab))) {
# TODO: Should the other be discarded?
levels(ds1[[rv]]) <- vapply(levels(ds1[[rv]]),
function(old_lev) {
val_lab[[old_lev]]
},
FUN.VALUE = character(1))
exp_bars <- levels(ds1[[rv]])
}
}
p <- ggplot(data = ds1[, rv, drop = FALSE],
aes(x = .data[[rv]])) +
geom_bar(fill = col_bars, width = 0.8)
p <- p +
scale_x_discrete(name = paste0(rv),
breaks = as.character(exp_bars),
drop = FALSE,
expand = expansion(add = 0.5, mult = 0.1))
}
# add ranges (if needed) ---------------------------------------------------
if (plot_ranges) {
# location
if (dq_param == "location" & rv %in% rvs_with_meta) {
# lower limit
ll <- rvs_meta$Range[[rv]][["low"]]
ll <- ifelse(is.infinite(ll), NA, ll)
if (!is.na(ll)) {
p <- p + geom_vline(xintercept = ll,
linetype = 2,
col = blue_red[1])
}
# upper limit
ul <- rvs_meta$Range[[rv]][["upp"]]
ul <- ifelse(is.infinite(ul), NA, ul)
if (!is.na(ul)) {
p <- p + geom_vline(xintercept = ul,
linetype = 2,
col = blue_red[1])
}
# add mean or median as line
loc_val <- dq_check_list[[rv]][["values_from_data"]]
if (!is.na(loc_val)) {
loc_flg <- dq_check_list[[rv]][["FLG_acc_ud_loc"]]
if (!is.na(loc_flg) & !loc_flg) {
loc_col <- blue_red[1]
} else {
loc_col <- blue_red[2]
}
p <- p + geom_vline(xintercept = loc_val,
col = loc_col)
}
}
# proportions
if (dq_param == "proportion" & rv %in% rvs_with_meta) {
# add lines for each category (if available)
for (pp in seq_along(rvs_meta$Range[[rv]])) {
if (inherits(rvs_meta$Range[[rv]][[pp]], "interval")) {
if (is.factor(ds1[[rv]])) {
pp_num <- pp
} else {
pp_num <- as.numeric(names(rvs_meta$Range[[rv]])[pp])
}
# lower limit
ll <- rvs_meta$Range[[rv]][[pp]][["low"]]
ll <- ifelse(is.infinite(ll), NA, ll)
if (!is.na(ll)) {
pp_coord <- data.frame(x1 = pp_num - 0.5,
x2 = pp_num + 0.5,
y1 = ll / 100 * nrow(ds1),
y2 = ll / 100 * nrow(ds1))
p <- p + geom_segment(aes(x = .data[["x1"]],
xend = .data[["x2"]],
y = .data[["y1"]],
yend = .data[["y2"]]),
data = pp_coord,
linetype = 2,
col = blue_red[1])
}
# upper limit
ul <- rvs_meta$Range[[rv]][[pp]][["upp"]]
ul <- ifelse(is.infinite(ul), NA, ul)
if (!is.na(ul)) {
pp_coord <- data.frame(x1 = pp_num - 0.5,
x2 = pp_num + 0.5,
y1 = ul / 100 * nrow(ds1),
y2 = ul / 100 * nrow(ds1))
p <- p + geom_segment(aes(x = .data[["x1"]],
xend = .data[["x2"]],
y = .data[["y1"]],
yend = .data[["y2"]]),
data = pp_coord,
linetype = 2,
col = blue_red[1])
}
}
}
}
}
fli <- util_coord_flip(p = p, ref_env = ref_env)
p <- p +
fli +
# TODO: estimate w and h, if p is not using discrete axes
ylab("") +
theme_minimal() +
theme(
title = txtspec,
axis.text.x = txtspec,
axis.text.y = txtspec,
axis.title.x = txtspec,
axis.title.y = txtspec
)
# ecdf by group_vars -------------------------------------------------------
if (length(group_vars) > 0 & plot_histogram) {
ds1[[group_vars]] <- as.factor(ds1[[group_vars]]) # there is only one
# grouping variable, we have ensured this by allow_more_than_one = FALSE
pp <- ggplot(data = ds1[, c(rv, group_vars), drop = FALSE],
aes(x = .data[[rv]], colour = .data[[group_vars]])) +
fli +
stat_ecdf(geom = "step") +
labs(x = "", y = paste0("ECDF: ", rv, " (by ",
paste0(group_vars, collapse = ", "), ")")) +
scale_x_continuous(expand = expansion(mult = 0.1),
limits = range(breaks_x)) +
theme_minimal() +
theme(
title = txtspec,
axis.text.x = txtspec,
axis.text.y = txtspec,
axis.title.x = txtspec,
axis.title.y = txtspec,
legend.title = element_blank()
)
if (util_ensure_suggested("colorspace",
"use the colorspace color scale",
err = FALSE)) {
pp <- pp +
colorspace::scale_color_discrete_sequential(palette = "Plasma",
na.value = "grey")
}
P <- p + pp +
plot_layout(ncol = 1) +
plot_annotation(tag_levels = 'A')
} else {
P <- p
}
util_set_size(P)
})
return(util_attach_attr(list(SummaryTable = res_out,
SummaryData = res_view,
SummaryPlotList = plot_list),
as_plotly = "util_as_plotly_acc_distributions"))
}
#' Plots and checks for distributions -- Location
#' @inherit acc_distributions
#' @export
#' @seealso
#' - [acc_distributions]
#' - [Online Documentation](
#' https://dataquality.qihs.uni-greifswald.de/VIN_acc_impl_distributions.html
#' )
acc_distributions_loc <- function(resp_vars = NULL,
study_data,
meta_data,
label_col,
check_param = "location",
plot_ranges = TRUE,
flip_mode = "noflip") {
rvs_meta <- util_prep_location_check(resp_vars = resp_vars,
meta_data = meta_data,
report_problems = "error")
acc_distributions(
resp_vars = resp_vars, group_vars = NULL, study_data = study_data,
meta_data = meta_data, label_col = label_col,
check_param = "location", plot_ranges = plot_ranges,
flip_mode = flip_mode
)
}
#' Plots and checks for distributions -- Location, ECDF
#' @inherit acc_distributions
#' @export
#' @seealso
#' - [acc_distributions]
#' - [Online Documentation](
#' https://dataquality.qihs.uni-greifswald.de/VIN_acc_impl_distributions.html
#' )
acc_distributions_loc_ecdf <- function(resp_vars = NULL,
group_vars = NULL,
study_data,
meta_data,
label_col,
check_param = "location",
plot_ranges = TRUE,
flip_mode = "noflip") {
rvs_meta <- util_prep_location_check(resp_vars = resp_vars,
meta_data = meta_data,
report_problems = "error")
acc_distributions(
resp_vars = resp_vars, group_vars = group_vars, study_data = study_data,
meta_data = meta_data, label_col = label_col,
check_param = "location", plot_ranges = plot_ranges,
flip_mode = flip_mode
)
}
#' Plots and checks for distributions -- Proportion
#' @inherit acc_distributions
#' @export
#' @seealso
#' - [acc_distributions]
#' - [Online Documentation](
#' https://dataquality.qihs.uni-greifswald.de/VIN_acc_impl_distributions.html
#' )
acc_distributions_prop <- function(resp_vars = NULL,
study_data,
meta_data,
label_col,
check_param = "proportion",
plot_ranges = TRUE,
flip_mode = "noflip") {
md_prep <- util_prep_proportion_check(
resp_vars = resp_vars,
meta_data = meta_data,
study_data = study_data,
report_problems = "error"
)
acc_distributions(
resp_vars = resp_vars, group_vars = NULL, study_data = study_data,
meta_data = meta_data, label_col = label_col,
check_param = "proportion", plot_ranges = plot_ranges,
flip_mode = flip_mode
)
}
util_has_no_group_vars <- function(resp_vars,
meta_data = "item_level",
label_col = LABEL) {
util_expect_scalar(resp_vars)
util_expect_data_frame(meta_data)
columns <- grep("^GROUP_VAR_.*", colnames(meta_data), value = TRUE)
resp_vars <- util_find_var_by_meta(resp_vars,
meta_data = meta_data,
label_col = label_col)
if (is.na(resp_vars)) {
return(TRUE)
}
rowSums(!util_empty(meta_data[meta_data[["VAR_NAMES"]] == resp_vars,
columns, FALSE])) == 0
}
#' Plots and checks for distributions -- only
#'
#' [Descriptor]
#'
#' @inherit acc_distributions
#' @export
#' @seealso
#' - [acc_distributions]
#' - [Online Documentation](
#' https://dataquality.qihs.uni-greifswald.de/VIN_acc_impl_distributions.html
#' )
acc_distributions_only <- function(resp_vars = NULL,
study_data,
meta_data,
label_col,
flip_mode = "noflip") {
loc_avail <- !inherits(try(util_prep_location_check(resp_vars = resp_vars,
meta_data = meta_data,
report_problems = "error"),
silent = TRUE), "try-error")
if (loc_avail) {
util_error("%s is already in a distribution location check plot figure",
dQuote(resp_vars),
applicability_problem = TRUE,
intrinsic_applicability_problem = TRUE)
}
prop_avail <- !inherits(try(util_prep_proportion_check(
resp_vars = resp_vars,
meta_data = meta_data,
study_data = study_data,
report_problems = "error"
), silent = TRUE), "try-error")
if (prop_avail) {
util_error("%s is already in a distribution proportion check plot figure",
dQuote(resp_vars),
applicability_problem = TRUE,
intrinsic_applicability_problem = TRUE)
}
if (!util_has_no_group_vars(resp_vars = resp_vars,
meta_data = meta_data,
label_col = label_col)) {
util_error("%s is already in a ecdf/distribution plot figure",
dQuote(resp_vars),
applicability_problem = TRUE,
intrinsic_applicability_problem = TRUE)
}
acc_distributions(
resp_vars = resp_vars, group_vars = NULL, study_data = study_data,
meta_data = meta_data, label_col = label_col,
flip_mode = flip_mode
)
}
#' Plots and checks for distributions -- only, but with ecdf
#'
#' [Descriptor]
#' @inherit acc_distributions
#' @export
#' @seealso
#' - [acc_distributions]
#' - [Online Documentation](
#' https://dataquality.qihs.uni-greifswald.de/VIN_acc_impl_distributions.html
#' )
acc_distributions_only_ecdf <- function(resp_vars = NULL,
study_data,
group_vars = NULL,
meta_data,
label_col,
flip_mode = "noflip") {
prep_prepare_dataframes()
util_correct_variable_use(group_vars)
loc_avail <- !inherits(try(util_prep_location_check(resp_vars = resp_vars,
meta_data = meta_data,
report_problems = "error"),
silent = TRUE), "try-error")
if (loc_avail) {
util_error("%s is already in a distribution location check plot figure",
dQuote(resp_vars),
applicability_problem = TRUE,
intrinsic_applicability_problem = TRUE)
}
prop_avail <- !inherits(try(util_prep_proportion_check(
resp_vars = resp_vars,
meta_data = meta_data,
study_data = study_data,
report_problems = "error"
), silent = TRUE), "try-error")
if (prop_avail) {
util_error("%s is already in a distribution proportion check plot figure",
dQuote(resp_vars),
applicability_problem = TRUE,
intrinsic_applicability_problem = TRUE)
}
acc_distributions(
resp_vars = resp_vars, group_vars = group_vars, study_data = study_data,
meta_data = meta_data, label_col = label_col,
flip_mode = flip_mode
)
}
#' @family plotly_shims
#' @concept plotly_shims
#' @keywords internal
util_as_plotly_acc_distributions <- function(res, ...) {
if (length(res$SummaryPlotList) != 1) {
return(plotly::ggplotly(ggplot2::ggplot() +
ggplot2::annotate("text", x = 0, y = 0,
label =
sprintf(paste("Internal error: I should have exactly 1 result, if",
"calling plotly for a dq_report2 otuput. I have %d."),
length(res$SummaryPlotList))) +
theme(
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.background = element_blank()
)))
}
res$SummaryPlotList <-
util_remove_dataquieR_result_class(res$SummaryPlotList)
# use res$SummaryPlot, not something from the enclosure
# of the result, that may contain study data.
util_ensure_suggested("plotly")
res$SummaryPlot <- res$SummaryPlotList[[1]]
if (inherits(res$SummaryPlot, "patchwork")) {
py1 <- try(plotly::ggplotly(res$SummaryPlot[[1]],
...), silent = TRUE)
py2 <- try(plotly::ggplotly(res$SummaryPlot[[2]],
...), silent = TRUE)
util_stop_if_not(!inherits(py1, "try-error"))
util_stop_if_not(!inherits(py2, "try-error"))
# https://plotly.com/r/subplots/#subplots-with-shared-yaxes
plotly::subplot(
plotly::add_annotations( # https://stackoverflow.com/a/59191142
py1,
text = "A",
x = 0,
y = 1,
yref = "paper",
xref = "paper",
xanchor = "left",
yanchor = "top",
yshift = 20,
showarrow = FALSE,
font = list(size = 15)
),
plotly::add_annotations( # https://stackoverflow.com/a/59191142
py2,
text = "B",
x = 0,
y = 1,
yref = "paper",
xref = "paper",
xanchor = "left",
yanchor = "top",
yshift = 20,
showarrow = FALSE,
font = list(size = 15)
),
nrows = 2,
shareX = TRUE)
} else {
plotly::ggplotly(res$SummaryPlot, ...)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.