Nothing
#' Calculate and plot Mahalanobis distances for social science indices
#'
#' @description
#' A standard tool to calculate Mahalanobis distance.
#' In this approach the Mahalanobis distance is calculated for ordinal
#' variables (treated as continuous) to identify inattentive responses.
#' It calculates the distance for each observational unit from the sample mean.
#' The greater the distance, the atypical the responses.
#'
#' [Indicator]
#'
#' @details
#' # ALGORITHM OF THIS IMPLEMENTATION:
#' - Implementation is restricted to variables of type integer
#' - Remove missing codes from the study data (if defined in the metadata)
#' - The covariance matrix is estimated for all variables from `variable_group`
#' - The Mahalanobis distance of each observation is calculated
#' \eqn{MD^2_i = (x_i - \mu)^T \Sigma^{-1} (x_i - \mu)}
#' - The default to consider a value an outlier is "use the 0.975 quantile
#' of a chi-square distribution with p degrees of freedom" (Mayrhofer and Filzmoser, 2023)
#' List function.
#'
#' @inheritParams .template_function_indicator
#'
#' @param variable_group [variable list] the names of the continuous
#' measurement variables building
#' a group, for that multivariate outliers
#' make sense.
#' @param mahalanobis_threshold [numeric] TODO: ES
#'
#' @return a list with:
#' - `SummaryTable`: [data.frame] underlying the plot
#' - `SummaryPlot`: [ggplot2::ggplot2] outlier plot
#' - `FlaggedStudyData` [data.frame] contains the original data frame with
#' the additional columns `tukey`,
#' `3SD`,
#' `hubert`, and `sigmagap`. Every
#' observation
#' is coded 0 if no outlier was detected in
#' the respective column and 1 if an
#' outlier was detected. This can be used
#' to exclude observations with outliers.
#'
#' @export
#' @importFrom ggplot2 ggplot aes geom_path scale_color_manual geom_point
#' discrete_scale theme_minimal scale_alpha_manual
#'
#' @importFrom stats mahalanobis
#' @importFrom rlang .data
#' @seealso
#' [Online Documentation](
#' https://dataquality.qihs.uni-greifswald.de/VIN_acc_impl_multivariate_outlier.html
#' )
acc_mahalanobis <- function(variable_group = NULL,
label_col = VAR_NAMES,
study_data,
item_level = "item_level",
meta_data = item_level,
meta_data_v2,
mahalanobis_threshold =
suppressWarnings(as.numeric(getOption("dataquieR.MAHALANOBIS_THRESHOLD",
dataquieR.MAHALANOBIS_THRESHOLD_default)))) {
# preps ----------------------------------------------------------------------
mahalanobis_distance_check = !util_empty(mahalanobis_threshold)
if (.called_in_pipeline && !mahalanobis_distance_check) {
util_error("No Mahalanobis distance requested",
intrinsic_applicability_problem = TRUE)
}
if (length(mahalanobis_threshold) != 1 ||
!is.numeric(mahalanobis_threshold) ||
!is.finite(mahalanobis_threshold)) {
util_message(
c("The mahalanobis_threshold argument is not correct",
"default (%g) is used."),
dataquieR.MAHALANOBIS_THRESHOLD_default,
applicability_problem = TRUE)
mahalanobis_threshold <- dataquieR.MAHALANOBIS_THRESHOLD_default
}
util_maybe_load_meta_data_v2()
if (missing(label_col)) {
orig_label_col <- rlang::missing_arg()
} else {
orig_label_col <- force(label_col)
}
label_col <- attr(prep_get_labels("",
item_level = meta_data,
label_class = "SHORT",
label_col = label_col),
"label_col")
# map metadata to study data
prep_prepare_dataframes(.replace_hard_limits = TRUE)
util_correct_variable_use("variable_group",
allow_more_than_one = TRUE,
allow_any_obs_na = TRUE,
need_type = "integer | float",
need_scale = "interval | ratio | ordinal" #ES: should I leave only ordinal?
)
if (length(variable_group) == 1) {
util_error("Need at least two variables for Mahanobis distance.",
applicability_problem = TRUE)
}
# missing data in any of the variables?
vars <- variable_group[!is.na(variable_group)]
#Filter to only keep the columns in the variable_group
ds1_group <- ds1[, vars, drop = FALSE]
n_prior <- dim(ds1_group)[1]
ds1plot <- ds1_group[rowSums(is.na(ds1_group[, vars, drop = FALSE])) == 0, ,
drop = FALSE]
rm(ds1_group)
n_post <- dim(ds1plot)[1]
if (n_post == 0) {
util_error("No samples with without missing values in one of %s. Aborting.",
paste0(dQuote(vars), collapse = ", "),
applicability_problem = FALSE)
}
if (n_post < n_prior) {
util_message(paste0(
"Due to missing values",
" N=", n_prior - n_post,
" observational units were excluded."
), applicability_problem = FALSE)
}
if (length(variable_group) < 2) {
util_error(
"Fewer than two variables left, no Mahalanobis distance calculation possible.",
applicability_problem = TRUE)
}
# Mahalanobis ----------------------------------------------------------------
# no. variables used to calculate the MD
degree_freedom <- ncol(ds1plot[, variable_group]) #ES: check is this is actually the no. vars
# Estimate covariance of response variables
Sx <- cov(ds1plot[, variable_group, drop = FALSE])
# Calculation of the Mahalanobis distance
ds1plot$MD <-
mahalanobis(ds1plot[, variable_group],
colMeans(ds1plot[, variable_group,drop = FALSE]),
Sx)
# Threshold to identify multivariate outliers (argument mahalanobis_threshold)
# is 0.975 quantile of a chi-square distribution
# with p degrees of freedom, where p = variables (Mayrhofer and Filzmoser, 2023)
MD_outliers_threshold <- stats::qchisq(mahalanobis_threshold, df = degree_freedom)
#ES: This differs from careless function mahad that calculates MD even if there are missing values.
#create a column with outliers
ds1plot$MD_outliers <- NA
ds1plot$MD_outliers <- ifelse(ds1plot$MD > MD_outliers_threshold, 1, 0)
n_non_ol <- sum(ds1plot$MD_outliers == 0)
n_devs <- sum(ds1plot$MD_outliers == 1)
# create summary table
# grading <- max(n_devs, na.rm = TRUE)
st1 <- data.frame(Variables = paste0(variable_group, collapse = " | "))
st1$"MD_outliers (N)" <- n_devs
st1$"MD_outliers (%)" <- round(n_devs/length(ds1plot$MD)*100, digits = 2)
st1$"N" <- length(ds1plot$MD)
st1$"observational_units_removed" <- n_prior - n_post
st1$"mahalanobis_threshold" <- mahalanobis_threshold
SummaryData <- st1
SummaryTable <- st1
names(SummaryTable)[names(SummaryTable) == "MD_outliers (N)"] <- 'NUM_ssc_MD' # add indicator ssc_MD to DQ_OBS to make grading rules work, here.
names(SummaryTable)[names(SummaryTable) == 'MD_outliers (%)'] <- 'PCT_ssc_MD'
# new complete data with the MD (included exclude rows with missing values)
# in that case MD is NA
FlaggedStudyData <- merge(ds1, ds1plot,
by = intersect(colnames(ds1),
colnames(ds1plot)),
all = TRUE)
# Q-Q plot ----
p_df <- length(ds1plot$MD)
probabilities <- stats::ppoints(p_df)
theoretical_quantiles <- stats::qchisq(probabilities, df = p_df)
ds1plot <- ds1plot[order(ds1plot$MD), ]
point_colors <- ifelse(ds1plot$MD_outliers == 1, "#7f0000", "#2166AC") #ES: maybe a better way?
plot_data <- data.frame(
Theoretical = theoretical_quantiles,
Sample = ds1plot$MD
)
plot_title <- "Q-Q Plot: Sample Data vs. Chi-squared Distribution"
x_label <- paste0("Theoretical Quantiles (Chi-squared, df =", p_df, ")")
y_label <- "Sample Data (MD)"
p1 <- util_create_lean_ggplot({
ggplot(plot_data, aes(x = .data[["Theoretical"]],
y = .data[["Sample"]])) +
geom_point(alpha = 0.6,
color = point_colors) +
labs(
title = plot_title,
x = x_label,
y = y_label
) +
theme_minimal()
},
plot_data = plot_data,
point_colors = point_colors,
plot_title = plot_title,
x_label = x_label,
y_label = y_label)
p <- stats::qqplot(
x = theoretical_quantiles,
y = ds1plot$MD,
main = "Q-Q Plot: Sample Data vs. Chi-squared Distribution",
xlab = paste0("Theoretical Quantiles (Chi-squared, df =", p_df, ")"),
ylab = "Sample Data (MD)",
pch = 19,
col = grDevices::adjustcolor(point_colors, alpha.f = 0.6)
)
# Information for sizing
number_of_vars <- length(ds1plot$MD)
min_value <- min(ds1plot$MD, na.rm = TRUE)
max_value <- max(ds1plot$MD, na.rm = TRUE)
range <- max_value - min_value
no_char_y <- nchar(max_value)
#Sizing information - a default
attr(p, "sizing_hints") <- list(
figure_type_id = "multivar_plot",
number_of_vars = number_of_vars,
range = range,
no_char_y = no_char_y
)
rm(min_value, max_value, range, number_of_vars, no_char_y)
#############################################
#ES: not sure what this does ??
if (!missing(orig_label_col)) { # map back SummaryTable to requested label_col and tables to suitable table labels
for (slot in c("SummaryTable", "SummaryData")) {
tb <- get(slot)
vs <- lapply(util_parse_assignments(tb$Variables,
multi_variate_text = TRUE),
function(x) {
prep_map_labels(
x = x,
item_level = meta_data,
from = label_col,
to = orig_label_col,
ifnotfound = x,
warn_ambiguous = FALSE
)
}
)
tb$Variables <-
lapply(vs, paste0, collapse = sprintf(" %s ", SPLIT_CHAR))
tb[["Variables"]] <-
prep_map_labels(x = tb[["Variables"]],
item_level = meta_data,
to = orig_label_col,
from = label_col,
ifnotfound = tb[["Variables"]],
warn_ambiguous = FALSE)
assign(slot, tb)
}
}
##########################################
return(list(FlaggedStudyData = FlaggedStudyData,
SummaryTable = SummaryTable,
SummaryData = SummaryData,
SummaryPlot = util_set_size(p1,
width_em = 25 +
1.2 * length(variable_group),
height_em = 25
)))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.