Nothing
#' Utility function to create a margins plot from linear regression models
#'
#' @param resp_vars [variable] the name of the measurement variable
#' @param group_vars [variable] the name of the observer, device or
#' reader variable
#' @param co_vars [variable list] a vector of covariables, e.g. age and sex for
#' adjustment
#' @param threshold_type [enum] empirical | user | none. See `acc_margins`.
#' @param threshold_value [numeric] see `acc_margins`
#' @param min_obs_in_subgroup [integer] from=0. This optional argument specifies
#' the minimum number of observations that is required to
#' include a subgroup (level) of the `group_var` in the
#' analysis.
#' @param ds1 [data.frame] the data frame that contains the measurements, after
#' replacing missing value codes by `NA`, excluding
#' inadmissible values and transforming categorical
#' variables to factors.
#' @param label_col [variable attribute] the name of the column in the metadata
#' with labels of variables
#' @param levels [levels()] of the original ordinal variable, if applicable.
#' Used for axis tick labels.
#' @param adjusted_hint [character] hint, if adjusted for `co_vars`
#' @param title [character] title for the plot
#' @param n_violin_max [integer] from=0. This optional argument specifies
#' the maximum number of levels of the `group_var` for
#' which violin plots will be shown in the figure.
#' @param sort_group_var_levels [logical] Should the levels of the grouping
#' variable be sorted descending by the number of
#' observations (in the figure)?
#' @param include_numbers_in_figures [logical] Should the figure report the
#' number of observations for each level of the grouping
#' variable?
#'
#' @return A table and a matching plot.
#'
#' @importFrom ggplot2 ggplot geom_violin geom_boxplot geom_pointrange geom_text
#' element_blank element_text
#' geom_density coord_flip annotate
#' ggplot_build theme_minimal labs theme scale_colour_manual
#' geom_count aes geom_vline geom_hline
#' @import patchwork
#'
#' @noRd
util_margins_lm <- function(resp_vars = NULL, group_vars = NULL, co_vars = NULL,
threshold_type = NULL, threshold_value,
min_obs_in_subgroup = 5, ds1, label_col,
levels = NULL,
adjusted_hint = "",
title = "",
n_violin_max = getOption("dataquieR.max_group_var_levels_with_violins", dataquieR.max_group_var_levels_with_violins_default),
sort_group_var_levels =
getOption("dataquieR.acc_margins_sort",
dataquieR.acc_margins_sort_default),
include_numbers_in_figures =
getOption("dataquieR.acc_margins_num",
dataquieR.acc_margins_num_default)) {
# preps and checks -----------------------------------------------------------
# to avoid "no visible binding for global variable ‘sample_size’"
sample_size <- NULL
# ensure that the response variable is not constant
var_prop <- util_dist_selection(ds1[, resp_vars, drop = FALSE])
if (var_prop$NDistinct < 2) {
util_error("The response variable is constant (after data preparation).",
applicability_problem = TRUE,
intrinsic_applicability_problem = TRUE)
}
# ensure that there are enough observations for the model
countdf <- util_table_of_vct(ds1[[group_vars]])
if (any(countdf[, 2] < min_obs_in_subgroup)) {
util_error("Not enough observations per group (after data preparation).")
}
if (nrow(countdf) <= n_violin_max) {
show_violins <- TRUE
} else {
show_violins <- FALSE
}
# modelling ------------------------------------------------------------------
# if no co_vars are defined for adjustment only the intercept is modelled
if (length(co_vars) == 0) {
co_vars <- "1"
co_vars_bQ <- co_vars
} else {
co_vars_bQ <- util_bQuote(co_vars)
}
# build model formula
fmla <- as.formula(paste0(
paste0(util_bQuote(resp_vars), "~"),
paste0(
paste0(co_vars_bQ, collapse = " + "),
" + ",
util_bQuote(group_vars)
)
))
model <- lm(fmla, data = ds1)
res_df <- data.frame(emmeans::emmeans(model, group_vars, type = "response"),
check.names = FALSE)
# adjust for covariates, if needed
ds1$resp_var_adj <-
# estimated mean for each level of the grouping variable
res_df$emmean[match(ds1[[group_vars]], res_df[, group_vars])] +
# residuals: original value of the response variable - fitted value
model$residuals
summary_ds <- as.data.frame(
dplyr::summarize(
dplyr::group_by_at(ds1[, c(resp_vars, group_vars), drop = FALSE],
unname(group_vars)),
sample_size = dplyr::n()))
res_df <- merge(res_df, summary_ds, by = group_vars, all.x = TRUE)
res_df <- dplyr::rename(res_df, c("margins" = "emmean", "LCL" = "lower.CL",
"UCL" = "upper.CL"))
# adjusted overall mean
omv <- data.frame(emmeans::emmeans(model, "1", type = "response"))
omv <- dplyr::rename(omv, c("margins" = "emmean", "LCL" = "lower.CL",
"UCL" = "upper.CL"))
res_df$overall <- omv$margins # TODO: never used?
# thresholds -----------------------------------------------------------------
if (threshold_type %in% c("empirical", "none")) {
th <- sd(ds1[["resp_var_adj"]]) # TODO: here we use an empirical estimate,
# but in other margin functions we use estimates based on properties of the
# assumed distribution
parn <- c(
paste("-", threshold_value, "TH", sep = ""), "Mean",
paste("+", threshold_value, "TH", sep = "")
)
pars <- as.vector(c(
omv$margins - threshold_value * th, omv$margins,
omv$margins + threshold_value * th
))
res_df$threshold <- threshold_value
res_df$GRADING <- ifelse(res_df$margins < pars[1] |
res_df$margins > pars[3], 1, 0)
} else if (threshold_type == "user") {
th <- threshold_value
parn <- c("", paste0("Mean=", threshold_value, sep = ""), "")
pars <- as.vector(c(th, th, th))
res_df$threshold <- th
res_df$GRADING <- mapply(
function(th, l, u) {
ifelse(th >= l & th <= u, 0, 1)
},
res_df$threshold, res_df$LCL, res_df$UCL
)
}
# figure ---------------------------------------------------------------------
# use offset for annotation depending on variable scale
offs <- ifelse(th <= 50, th / 10, th / 20)
warn_code <- c("1" = "#B2182B", "0" = "#2166AC")
if (sort_group_var_levels) {
# order levels of the grouping variable by number of observations
tbl_gr <- util_table_of_vct(ds1[[group_vars]]) %>%
dplyr::arrange(dplyr::desc(.data$Freq))
ds1[[group_vars]] <- factor(ds1[[group_vars]],
levels = as.character(tbl_gr$Var1))
res_df <- res_df[match(levels(ds1[[group_vars]]),
as.character(res_df[, group_vars])), ]
}
# Plot 1: hybrid density/boxplot graph
.ds00 <- ds1[, c("resp_var_adj", group_vars), drop = FALSE]
if (all(!is.finite(.ds00$resp_var_adj))) {
util_error("No data left.")
}
p1 <-
util_create_lean_ggplot(ggplot(data = .ds00,
aes(x = .data[[group_vars]],
y = .data[["resp_var_adj"]])),
.ds00 = .ds00,
group_vars = group_vars)
if (show_violins) {
p1 <- util_create_lean_ggplot(p1 +
geom_violin(alpha = 0.9,
fill = "gray99") +
ggplot2::stat_ydensity( # FIXME: ggplot2 4.0.0 now draws actual data quantiles (not “density-inferred” ones), so the lines may shift very slightly compared to pre-4.0.0 outputs, but the intent (e.g., median at 0.5) is the same. ggplot2.tidyverse.org -- do we need to change some documentation?
geom = "violin",
alpha = 0.9,
fill = "gray99",
quantiles = 0.5
),
p1 = p1)
}
p1 <- util_create_lean_ggplot(p1 +
geom_boxplot(width = 0.1,
fill = "white",
color = "gray",
alpha = 0.5) +
geom_pointrange(
data = res_df, aes(
x = factor(.data[[group_vars]]),
y = margins,
ymin = LCL,
ymax = UCL,
color = as.factor(GRADING)#, n = sample_size
),
shape = 18,
linewidth = 1,
inherit.aes = FALSE,
size = .5) +
theme_minimal() +
labs(x = "", y = "") +
theme(
legend.position = "none",
legend.title = element_blank(),
text = element_text(size = 16),
axis.text.x = element_text(angle = 90,
vjust = 0.5,
hjust = 1)
) +
scale_colour_manual(values = warn_code),
p1 = p1,
res_df = res_df,
group_vars = group_vars,
warn_code = warn_code)
if (include_numbers_in_figures) {
.ds01 <- summary_ds[, group_vars]
.ds02 <- ds1[, "resp_var_adj", drop = FALSE]
p1 <- util_create_lean_ggplot(p1 +
geom_text(data = summary_ds,
aes(x = .ds01,
y = max(.ds02) + 1.2,
label = sample_size),
hjust = 0.5, angle = 90) +
annotate("text",
x = 0.5,
y = max(.ds02) + 1.2,
label = "N"),
p1 = p1,
summary_ds = summary_ds,
.ds01 = .ds01,
.ds02 = .ds02
)
}
if (!is.null(levels)) {
try(p1 <-
util_create_lean_ggplot(
p1 +
ggplot2::scale_y_continuous(breaks =
as.integer(names(levels)),
labels = levels),
p1 = p1,
levels = levels),
silent = TRUE)
}
if (threshold_type != "none") {
p1 <- util_create_lean_ggplot(p1 +
geom_hline(yintercept = pars[2],
color = "red") +
geom_hline(yintercept = pars[-2],
color = "red",
linetype = 2),
p1 = p1,
pars = pars)
} else {
p1 <- util_create_lean_ggplot(p1 +
geom_hline(yintercept = pars[2],
color = "red"),
p1 = p1,
pars = pars)
}
# Plot 2: overall distributional plot flipped on y-axis of plot 1
.ds03 <- ds1[, "resp_var_adj", drop = FALSE]
get_y_scale <- util_create_lean_ggplot(
ggplot(.ds03,
aes(x = .data[["resp_var_adj"]])) +
geom_density(alpha = 0.35),
.ds03 = .ds03)
build <- ggplot2::ggplot_build(get_y_scale)
data1 <- util_gg_get(build, "data")[[1]]
yvals <- util_gg_get(data1, "y")
aty <- mean(range(yvals))
p2 <- util_create_lean_ggplot(ggplot(.ds03,
aes(.data[["resp_var_adj"]])) +
geom_density(alpha = 0.35) +
coord_flip() + # util_lazy_add_coord(p, fli)
theme_minimal() +
labs(x = NULL, y = NULL) +
ggplot2::xlim(c(min(min(.ds03), pars),
max(max(.ds03) + 1.2,
pars + offs))) +
theme(axis.text.y = element_blank(),
axis.text.x = element_blank(),
text = element_text(size = 16)),
.ds03 = .ds03,
pars = pars,
offs = offs)
if (threshold_type != "none") {
p2 <- util_create_lean_ggplot(
p2 +
annotate(geom = "text",
x = pars + offs,
y = aty,
label = parn) +
geom_vline(xintercept = pars[2], color = "red") +
geom_vline(xintercept = pars[-2], color = "red", linetype = 2),
p2 = p2,
pars = pars,
offs = offs,
aty = aty,
parn = parn)
} else {
p2 <- util_create_lean_ggplot(
p2 +
annotate(geom = "text",
x = pars + offs,
y = aty,
label = c("", parn[2], "")) +
geom_vline(xintercept = pars[2], color = "red"),
p2 = p2,
pars = pars,
offs = offs,
aty = aty,
parn = parn)
}
# combine plots and add the title
res_plot <- # TODO: For all patchwork-calls, add information to reproduce the layout in plot.ly
util_create_lean_ggplot(
p1 +
p2 +
plot_layout(nrow = 1,
widths = c(5, 1)) +
plot_annotation(title = title,
subtitle = adjusted_hint),
p1 = p1,
p2 = p2,
title = title,
adjusted_hint = adjusted_hint)
# output ---------------------------------------------------------------------
return(list("plot_data" = res_df, "plot" = res_plot))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.