Nothing
#' Estimate marginal means, see [emmeans::emmeans]
#'
#' @description
#' margins does calculations for quality indicator
#' Unexpected distribution wrt location (link). Therefore we pursue a combined
#' approach of descriptive and model-based statistics to investigate differences
#' across the levels of an auxiliary variable.
#'
#' CAT: Unexpected distribution w.r.t. location
#'
#' Marginal means
#'
#' Marginal means rests on model based results, i.e. a significantly different
#' marginal mean depends on sample size. Particularly in large studies, small
#' and irrelevant differences may become significant. The contrary holds if
#' sample size is low.
#'
#'
#' @details
#' Limitations
#'
#' Selecting the appropriate distribution is complex. Dozens of continuous,
#' discrete or mixed distributions are conceivable in the context of
#' epidemiological data. Their exact exploration is beyond the scope of this
#' data quality approach. The function above uses the help function
#' \link{util_dist_selection}
#' which discriminates four cases:
#' \itemize{
#' \item continuous data
#' \item binary data
#' \item count data with <= 20 categories
#' \item count data with > 20 categories
#' }
#' Nonetheless, only three different plot types are generated. The fourth case
#' is treated as continuous data. This is in fact a coarsening of the original
#' data but for the purpose of clarity this approach is chosen.
#'
#' @keywords accuracy
#'
#' @param resp_vars [variable] the name of the continuous measurement variable
#' @param group_vars [variable list] len=1-1. the name of the observer, device or
#' reader variable
#' @param co_vars [variable list] a vector of covariables, e.g. age and sex for
#' adjustment
#' @param threshold_type [enum] empirical | user | none. In case empirical is
#' chosen a multiplier of the scale measure is used,
#' in case of user a value of the mean or probability
#' (binary data) has to be defined see Implementation
#' and use of thresholds. In case of none, no thresholds
#' are displayed and no flagging of
#' unusual group levels is applied.
#' @param threshold_value [numeric] a multiplier or absolute value see
#' Implementation and use of thresholds
#' @param min_obs_in_subgroup [integer] from=0. optional argument if a
#' "group_var" is used.
#' This argument specifies the
#' minimum no. of observations that is
#' required to include a subgroup (level)
#' of the "group_var" in
#' the analysis. Subgroups with less
#' observations are excluded. The
#' default is 5.
#' @param study_data [data.frame] the data frame that contains the measurements
#' @param meta_data [data.frame] the data frame that contains metadata
#' attributes of study data
#' @param label_col [variable attribute] the name of the column in the metadata
#' with labels of variables
#'
#' @return a list with:
#' - SummaryTable: data frame underlying the plot
#' - SummaryData: data frame
#' - SummaryPlot: ggplot2 margins plot
#' @export
#' @importFrom ggplot2 ggplot geom_violin geom_boxplot geom_pointrange
#' element_blank element_text element_text
#' geom_pointrange geom_density coord_flip annotate
#' ggplot_build theme_minimal labs theme scale_colour_manual
#' geom_count aes geom_vline theme_set
#' @import patchwork
#' @importFrom utils tail head
#'
#' @examples
#' \dontrun{
#' # runs spuriously slow on rhub
#' load(system.file("extdata/study_data.RData", package = "dataquieR"))
#' load(system.file("extdata/meta_data.RData", package = "dataquieR"))
#' acc_margins(resp_vars = "DBP_0",
#' study_data = study_data,
#' meta_data = meta_data,
#' group_vars = "USR_BP_0",
#' label_col = LABEL,
#' co_vars = "AGE_0")
#' }
#' @seealso
#' [Online Documentation](
#' https://dataquality.qihs.uni-greifswald.de/VIN_acc_impl_margins.html
#' )
acc_margins <- function(resp_vars = NULL, group_vars = NULL, co_vars = NULL,
threshold_type = NULL, threshold_value,
min_obs_in_subgroup,
study_data, meta_data, label_col
# TODO: flip_mode =
) {
# preps ----------------------------------------------------------------------
# map metadata to study data
prep_prepare_dataframes(.replace_hard_limits = TRUE)
util_correct_variable_use("resp_vars", need_type = "integer|float")
util_correct_variable_use("group_vars",
allow_any_obs_na = TRUE,
need_type = "!float"
)
util_correct_variable_use("co_vars",
allow_more_than_one = TRUE,
allow_null = TRUE
)
if (is.null(co_vars)) {
co_vars <- NA
}
dollar <- "\uFE69"
colnames(ds1) <- gsub("$", dollar, fixed = TRUE, colnames(ds1))
if (any(grepl('$', fixed = TRUE, c(resp_vars, group_vars, co_vars)))) {
util_message(c("emmeans used by acc_margins does not support variable",
"names containing %s, replacing this symbol by an",
"equivalent unicode character %s"),
dQuote("$"),
dQuote(dollar), applicability_problem = TRUE,
intrinsic_applicability_problem = FALSE)
}
resp_vars <- gsub("$", dollar, fixed = TRUE, resp_vars)
group_vars <- gsub("$", dollar, fixed = TRUE, group_vars)
co_vars <- gsub("$", dollar, fixed = TRUE, co_vars)
rvs <- resp_vars
# no minimum observations specified
# TODO: util_expect_scalar(min_obs_in_subgroup, is.integer, ...)
if (missing(min_obs_in_subgroup) || length(min_obs_in_subgroup) != 1) {
if (!missing(min_obs_in_subgroup) ||
!.called_in_pipeline) util_message(
"No or many minimum observation count was specified and is set to n=5.",
applicability_problem = TRUE)
min_obs_in_subgroup <- 5
} else {
.min_obs_in_subgroup <- as.integer(min_obs_in_subgroup)
if (is.na(.min_obs_in_subgroup)) {
util_message(
"min_obs_in_subgroup is not integer: %s, setting it to default value 5.",
dQuote(try(as.character(min_obs_in_subgroup))),
applicability_problem = TRUE)
min_obs_in_subgroup <- 5
} else {
min_obs_in_subgroup <-
.min_obs_in_subgroup
}
}
if (min_obs_in_subgroup < 5) {
min_obs_in_subgroup <- 5
util_message("min_obs_in_subgroup cannot be set below 5.",
applicability_problem = TRUE)
}
# Label assignment -----------------------------------------------------------
# temporary study data
if (!is.null(group_vars)) {
# all labelled variables
levlabs <- meta_data$VALUE_LABELS[meta_data[[label_col]] %in% group_vars]
# only variables with labels
if (!all(is.na(levlabs))) {
gvs_ll <- group_vars
# assign labels only if assignment operator is found and the variable has
# labels
if (grepl("=", levlabs)) {
ds1[[gvs_ll]] <- util_assign_levlabs(
variable = ds1[[gvs_ll]],
string_of_levlabs = levlabs,
splitchar = SPLIT_CHAR,
assignchar = " = "
)
}
}
}
# missings -------------------------------------------------------------------
co_vars <- na.omit(co_vars)
# co_vars and group_vars
n_prior <- dim(ds1)[1]
ds1 <- ds1[rowSums(is.na(ds1[, c(co_vars, group_vars), drop = FALSE])) == 0, ,
drop = FALSE
]
n_post <- dim(ds1)[1]
if (n_post < n_prior) {
util_message(paste0(
"Due to missing values in ", paste0(co_vars, collapse = ", "),
" or ", group_vars, " N=", n_prior - n_post,
" observations were excluded."
),
applicability_problem = FALSE)
}
# rvs
n_prior <- dim(ds1)[1]
ds1 <- ds1[!is.na(ds1[[rvs]]), ]
n_post <- dim(ds1)[1]
if (n_post < n_prior) {
util_message(paste0(
"Due to missing values in ", paste0(rvs), " N=",
n_prior - n_post, " observations were excluded."
), applicability_problem = FALSE)
}
# type factor
ds1[[group_vars]] <- factor(ds1[[group_vars]])
if (!missing(threshold_value)) {
if (is.vector(threshold_value)) {
.threshold_value <- as.numeric(threshold_value)
} else {
.threshold_value <- NA
}
if (length(threshold_value) != 1 || is.na(.threshold_value)) {
util_message(
"threshold_value is not numeric(1): %s, setting it to default value 1.",
dQuote(head(try(as.character(threshold_value)), 1)),
applicability_problem = TRUE)
threshold_value <- 1
} else {
threshold_value <-
.threshold_value
}
}
if (is.null(threshold_type) || (!is.list(threshold_type)
&& length(threshold_type) != 1)) {
if (
!is.null(threshold_type)
||
!.called_in_pipeline
) util_message("No or many threshold type specified and set to empirical.",
applicability_problem = TRUE)
threshold_type <- "empirical"
}
threshold_type <- match.arg(threshold_type, c("empirical", "user", "none"))
# no relative distance (based on SD) to mean defined?
if (threshold_type %in% c("empirical", "none") & missing(threshold_value)) {
threshold_value <- 1
}
# threshold is user but no value defined -> switch to empirical
if (threshold_type == "user" & missing(threshold_value)) {
util_message(
c(
"Threshold was set to user but no value for the unit of measurements",
"was defined.\n",
"The function switches to one SD as default."),
applicability_problem = TRUE)
threshold_type == "empirical"
threshold_value <- 1
}
# summary data frame of observations by level of random effect and non-missing
# values in rvs
check_df <- util_table_of_vct(ds1[[group_vars]])
# Consider remaining obs/level after na.rm() ---------------------------------
# too few observations in >1 level of group_vars
if (min(check_df[, 2]) < min_obs_in_subgroup) {
critical_levels <- levels(check_df$Var1)[check_df$Freq <
min_obs_in_subgroup]
util_message(paste0(c(
"The following levels:", head(critical_levels, 100),
if (length(critical_levels) > 100) {", ..." }, "have <",
min_obs_in_subgroup, " observations and will be removed."
),
collapse = " "
), applicability_problem = FALSE)
ds1 <- ds1[!(ds1[[group_vars]] %in% critical_levels), ]
}
# if no co_vars are defined for adjustment only the intercept is modelled
if (length(co_vars) == 1 & is.na(co_vars)[1]) {
co_vars <- "1"
} else {
co_vars <- util_bQuote(co_vars)
}
#############
# Modelling #
#############
# determine distributional approximation
seldist <- util_dist_selection(ds1[[rvs]], meta_data = meta_data)
if (!(prod(dim(ds1)))) {
util_error("No data left")
}
# build model formula
fmla <- as.formula(paste0(
paste0(util_bQuote(rvs), "~"),
paste0(
paste0(co_vars, collapse = " + "),
" + ",
util_bQuote(group_vars)
)
))
# variable of type float or integer with > 20 categories
if (seldist$IsInteger == FALSE | seldist$IsInteger == TRUE &
seldist$NCategory > 20) {
# call linear model
model <- lm(fmla, data = ds1)
}
# variable of type binary
if (seldist$IsInteger == TRUE & seldist$IsMultCat == 0) {
if (is.factor(ds1[[rvs]])) {
ds1[[rvs]] <- as.integer(ds1[[rvs]])
}
# other levels than 0/1
if (!all(unique(ds1[[rvs]]) %in% c(0, 1))) {
mf <- as.numeric(tail(names(sort(table(ds1[[rvs]]))), 1))
# https://stackoverflow.com/questions/12187187/
# how-to-retrieve-the-most-repeated-value-in-a-
# column-present-in-a-data-frame
ds1[[rvs]][ds1[[rvs]] == mf] <- 0
# could be other than 1
lf <- as.numeric(head(names(sort(table(ds1[[rvs]]))), 1))
ds1[[rvs]][ds1[[rvs]] == lf] <- 1
util_message(paste0("The levels of ", rvs, " (", mf, " and ", lf,
") have been recoded to 0 and 1)"),
applicability_problem = FALSE)
}
# call logreg
model <- glm(fmla, data = ds1, family = binomial(link = "logit"))
}
# variable of type integer with > 2 categories
if (seldist$IsMultCat == 1 & seldist$NCategory <= 20) {
if (is.factor(ds1[[rvs]])) {
ds1[[rvs]] <- as.integer(ds1[[rvs]])
}
# call poisson model
model <- glm(fmla, data = ds1, family = poisson(link = "log"))
}
# emmeans::emmeans appears not working correctly for the overall mean. Somehow
# the package assumes an interaction term although there isn't.
# browser()
# call emmeans::emmeans ------------------------------------------------------
res_df <- data.frame(emmeans::emmeans(model, group_vars, type = "response"),
check.names = FALSE)
if (seldist$IsInteger == FALSE | seldist$IsInteger == TRUE &
seldist$NCategory > 20) {
res_df <- dplyr::rename(res_df, c("margins" = "emmean", "LCL" = "lower.CL",
"UCL" = "upper.CL"))
}
if (seldist$IsInteger == 1 & seldist$IsMultCat == 0) {
res_df <- dplyr::rename(res_df, c("margins" = "prob", "LCL" = "asymp.LCL",
"UCL" = "asymp.UCL"))
}
if (seldist$IsMultCat == 1 & seldist$NCategory <= 20) {
res_df <- dplyr::rename(res_df, c("margins" = "rate", "LCL" = "asymp.LCL",
"UCL" = "asymp.UCL"))
}
# adjusted overall mean
omv <- data.frame(emmeans::emmeans(model, "1", type = "response"))
if (seldist$IsInteger == FALSE | seldist$IsInteger == TRUE &
seldist$NCategory > 20) {
omv <- dplyr::rename(omv, c("margins" = "emmean", "LCL" = "lower.CL",
"UCL" = "upper.CL"))
}
if (seldist$IsInteger == 1 & seldist$IsMultCat == 0) {
omv <- dplyr::rename(omv, c("margins" = "prob", "LCL" = "asymp.LCL",
"UCL" = "asymp.UCL"))
}
if (seldist$IsMultCat == 1 & seldist$NCategory <= 20) {
omv <- dplyr::rename(omv, c("margins" = "rate", "LCL" = "asymp.LCL",
"UCL" = "asymp.UCL"))
}
res_df$overall <- omv$margins
##############
# THRESHOLDS #
##############
# continuous data or count data with mor than 20 categories
if (threshold_type %in% c("empirical", "none")) {
if (seldist$IsInteger == FALSE | seldist$IsInteger == TRUE &
seldist$NCategory > 20) {
th <- sd(ds1[[rvs]])
parn <- c(
paste("-", threshold_value, "TH", sep = ""), "Mean",
paste("+", threshold_value, "TH", sep = "")
)
}
# binary data
if (seldist$IsInteger == 1 & seldist$IsMultCat == 0) {
th <- mean(ds1[[rvs]])
th <- th * (1 - th)
parn <- c(
paste("-", threshold_value, "TH", sep = ""), "Prob.",
paste("+", threshold_value, "TH", sep = "")
)
}
# poisson
if (seldist$IsMultCat == 1 & seldist$NCategory <= 20) {
# th <- exp(as.numeric(coefficients(glm(v101 ~ 1,
# family = poisson(link="log"), data = expl_df))))
th <- 1
parn <- c(
paste("-", threshold_value, "TH", sep = ""), "Mean",
paste("+", threshold_value, "TH", sep = "")
)
}
pars <- as.vector(c(
omv$margins - threshold_value * th, omv$margins,
omv$margins + threshold_value * th
))
# store threshold
res_df$threshold <- threshold_value
# select abnormalties
res_df$GRADING <- ifelse(res_df$margins < pars[1] |
res_df$margins > pars[3], 1, 0)
} else if (threshold_type == "user") {
th <- threshold_value
# store threshold
res_df$threshold <- th
pars <- as.vector(c(th, th, th))
if (seldist$IsInteger == 1 & seldist$IsMultCat == 0) {
parn <- c("", paste0("Prob.=", threshold_value, sep = ""), "")
} else {
parn <- c("", paste0("Mean=", threshold_value, sep = ""), "")
}
# select abnormalties
res_df$GRADING <- mapply(
function(th, l, u) {
ifelse(th >= l & th <= u, 0, 1)
},
res_df$threshold, res_df$LCL, res_df$UCL
)
}
if (length(co_vars) > 0) {
if (length(co_vars) < 10) {
subtitle <- sprintf("adjusted for %s", paste0(co_vars, collapse = ", "))
} else {
subtitle <- sprintf("adjusted for %d variables", length(co_vars))
}
} else {
subtitle <- ""
}
############
# Graphics #
############
# use offset for annotation depending on variable scale
offs <- ifelse(th <= 50, th / 10, th / 20)
if (seldist$IsInteger == FALSE | seldist$IsInteger == TRUE &
seldist$NCategory > 20) {
# Plot 1: hybrid density/boxplot graph
warn_code <- c("1" = "#B2182B", "0" = "#2166AC")
p1 <- ggplot(data = ds1[, c(rvs, group_vars), drop = FALSE],
aes(x = .data[[group_vars]], y = .data[[rvs]])) +
geom_violin(alpha = 0.9, draw_quantiles = TRUE, fill = "gray99") +
geom_boxplot(width = 0.1, fill = "white", color = "gray", alpha = 0.5) +
geom_pointrange(
data = res_df, aes(
x = factor(.data[[group_vars]]),
y = margins,
ymin = LCL,
ymax = UCL,
color = as.factor(GRADING)
),
shape = 18, linewidth = 1,
inherit.aes = FALSE,
fatten = 5
) +
theme_minimal() +
labs(x = "", y = "") +
theme(
legend.position = "None", legend.title = element_blank(),
text = element_text(size = 16),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
) +
scale_colour_manual(values = warn_code)
if (threshold_type != "none") {
p1 <- p1 +
geom_hline(yintercept = pars[2], color = "red") +
geom_hline(yintercept = pars[-2], color = "red", linetype = 2)
} else {
p1 <- p1 +
geom_hline(yintercept = pars[2], color = "red")
}
}
if (seldist$IsInteger == 1 & seldist$NCategory <= 20) {
warn_code <- c("1" = "#B2182B", "0" = "#2166AC")
p1 <- ggplot(data = ds1[, c(rvs, group_vars), drop = FALSE],
aes(x = .data[[group_vars]], y = .data[[rvs]])) +
geom_count(aes(alpha = 0.9), color = "gray") +
geom_pointrange(
data = res_df, aes(
x = .data[[group_vars]],
y = margins,
ymin = LCL,
ymax = UCL,
color = as.factor(GRADING)
),
shape = 18, linewidth = 1,
inherit.aes = FALSE,
fatten = 5
) +
theme_minimal() +
labs(x = "", y = "") +
theme(
legend.position = "None", legend.title = element_blank(),
text = element_text(size = 16),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
) +
scale_colour_manual(values = warn_code)
if (threshold_type != "none") {
p1 <-
p1 +
geom_hline(yintercept = pars[2], color = "red") +
geom_hline(yintercept = pars[-2], color = "red", linetype = 2)
} else {
p1 <-
p1 +
geom_hline(yintercept = pars[2], color = "red")
}
}
# Plot 2: overall distributional plot flipped on y-axis of plot 1
get_y_scale <- ggplot(ds1[, rvs, drop = FALSE], aes(x = .data[[rvs]])) +
geom_density(alpha = 0.35)
aty <- mean(range(ggplot_build(get_y_scale)$data[[1]]$y))
p2 <- ggplot(ds1[, rvs, drop = FALSE], aes(.data[[rvs]])) +
geom_density(alpha = 0.35) +
coord_flip() +
theme_minimal() +
# coord_flip() +
labs(x = NULL, y = NULL) +
theme(axis.text.y = element_blank(), axis.text.x = element_blank(),
text = element_text(size = 16))
if (threshold_type != "none") {
p2 <-
p2 +
annotate(geom = "text", x = pars + offs, y = aty, label = parn) +
geom_vline(xintercept = pars[2], color = "red") +
geom_vline(xintercept = pars[-2], color = "red", linetype = 2)
} else {
p2 <-
p2 +
annotate(geom = "text", x = pars + offs, y = aty,
label = c("", parn[2], "")) +
geom_vline(xintercept = pars[2], color = "red")
}
# combine plots
# and add the title
res_plot <- # TODO: For all patchwork-calls, add information to reproduce the layout in plot.ly
p1 +
p2 +
plot_layout(nrow = 1,
widths = c(5, 1)
) +
plot_annotation(title = paste(group_vars, "margins in", rvs),
subtitle = subtitle)
SummaryTable <- data.frame(Variables = rvs, GRADING =
as.numeric(any(res_df$GRADING > 0)))
# length(unique(fit_df$GROUP)))
# output
return(util_attach_attr(list(
SummaryData = res_df,
SummaryTable = SummaryTable,
SummaryPlot = util_set_size(res_plot, width_em = 25 +
1.2 * length(unique(ds1[[group_vars]])),
height_em = 25)
), as_plotly = "util_as_plotly_acc_margins"))
}
util_as_plotly_acc_margins <- function(res, ...) {
res$SummaryPlot <- util_remove_dataquieR_result_class(res$SummaryPlot)
# use res$SummaryPlot, not res_plot to avoid dependeing on the enclosure
# of the result, that may contain study data.
util_ensure_suggested("plotly")
util_stop_if_not(inherits(res$SummaryPlot, "patchwork"))
rel_w <- res$SummaryPlot$patches$layout$widths /
sum(res$SummaryPlot$patches$layout$widths, na.rm = TRUE)
py1 <- try(plotly::ggplotly(res$SummaryPlot[[1]],
...), silent = TRUE)
py1$x$data[[2]]$mode <- NULL # suppress a warning on print (https://github.com/plotly/plotly.R/issues/2242)
py2 <- try(plotly::ggplotly(res$SummaryPlot[[2]],
...), silent = TRUE)
util_stop_if_not(!inherits(py1, "try-error"))
util_stop_if_not(!inherits(py2, "try-error"))
# https://plotly.com/r/subplots/#subplots-with-shared-yaxes
plotly::layout(plotly::subplot(py1,
py2,
nrows =
res$SummaryPlot$patches$layout$nrow,
shareY = TRUE,
widths =
rel_w),
title = list(text =
res$SummaryPlot$patches$annotation$title),
margin = 0.01)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.