Nothing
#' Heuristics to amend a SCALE_LEVEL column and a UNIT column in the metadata
#'
#' ...if missing
#'
#' @param resp_vars [variable list] the names of the measurement variables
#' @param study_data [data.frame] the data frame that contains the measurements
#' @param meta_data [data.frame] the data frame that contains metadata
#' attributes of study data
#' @param label_col [variable attribute] the name of the column in the metadata
#' with labels of variables
#'
#' @return [data.frame] modified metadata
#'
#' @export
#'
#' @examples
#' \dontrun{
#' prep_load_workbook_like_file("meta_data_v2")
#' prep_scalelevel_from_data_and_metadata(study_data = "study_data")
#' }
prep_scalelevel_from_data_and_metadata <- function(resp_vars = NULL,
study_data,
meta_data = "item_level",
label_col = LABEL) {
# TODO: shorten function name, e.g., 'prep_amend_scale_level'
if (
inherits(
try(
util_expect_data_frame(meta_data),
silent = TRUE),
"try-error")) {
w <-
"No item-level metadata provided at all. Will try to predict them."
if (requireNamespace("cli", quietly = TRUE)) {
w <- cli::bg_red(cli::col_br_yellow(w))
}
util_warning(w, immediate = TRUE)
util_warning(w, immediate = TRUE)
meta_data <- prep_study2meta(study_data)
}
prep_prepare_dataframes(
.replace_hard_limits = TRUE,
.amend_scale_level = FALSE, # prevent recursion
.replace_missings = TRUE
)
util_correct_variable_use(resp_vars,
allow_more_than_one = TRUE,
allow_null = TRUE,
overwrite = TRUE,
remove_not_found = TRUE)
if (is.null(resp_vars)) { # FIXME EK: Crashes, if set to e.g. one variable only
resp_vars <- colnames(ds1)
}
md <- meta_data[meta_data[[label_col]] %in% resp_vars,
intersect(c(label_col, SCALE_LEVEL, DATA_TYPE, UNIT,
VALUE_LABELS),
colnames(meta_data)), FALSE]
distsel <- util_dist_selection(ds1)
# TODO: What about missing codes like ".A" in metadata? They are considered
# to be negative values, even in string columns! (".A" < 0 gives TRUE)
# (problem with util_dist_selection, not actually here)
meta_info <-
merge(md, distsel, by.x = label_col, by.y = "Variables", all = TRUE)
for (cl in c(SCALE_LEVEL, DATA_TYPE, UNIT, VALUE_LABELS)) {
if (!(cl %in% colnames(meta_info))) {
meta_info[[cl]] <- NA_character_
}
}
meta_info <- meta_info[,
intersect(
c(
label_col,
"DATA_TYPE",
"VALUE_LABELS",
"IsInteger",
"IsMultCat",
"NCategory",
"AnyNegative",
"SCALE_LEVEL",
"UNIT"
),
colnames(meta_info)
), drop = FALSE]
varrefs <- util_variable_references(meta_data = meta_data)
# 1. Variables with value labels ---------------------------------------------
# - the value labels indicate an order: ordinal scale
# - the value labels are separated by `|`: nominal scale
assignments <- util_parse_assignments(meta_info$VALUE_LABELS,
split_char = c(SPLIT_CHAR, "<"),
multi_variate_text = TRUE)
meta_info$is_ordered <- vapply(assignments, attr, "split_char",
FUN.VALUE = character(1)) == "<"
if (any(meta_info$is_ordered)) { # check for counter-intuitive ordered lists
# e.g., 2 = low < 1 = high
# QUESTION TO STS: Should this check rather be part of prep_prepare_dataframes?
for (a in assignments[meta_info$is_ordered]) {
if (all(suppressWarnings(is.na(as.integer(names(a)))) ==
is.na(names(a))) && # integer codes
!all(diff(as.integer(names(a))) > 0)
) {
util_message("Found counter-intuitive %s: %s",
VALUE_LABELS,
prep_deparse_assignments(labels = vapply(a, identity,
FUN.VALUE = character(1)),
codes = names(a),
split_char = "<"))
}
}
}
meta_info[
util_empty(meta_info$SCALE_LEVEL) &
meta_info$DATA_TYPE %in% c(DATA_TYPES$STRING, DATA_TYPES$INTEGER) &
!util_empty(meta_info$VALUE_LABELS), SCALE_LEVEL] <-
ifelse(meta_info$is_ordered[util_empty(meta_info$SCALE_LEVEL) &
meta_info$DATA_TYPE %in%
c(DATA_TYPES$STRING,
DATA_TYPES$INTEGER) &
!util_empty(meta_info$VALUE_LABELS)],
SCALE_LEVELS$ORDINAL,
SCALE_LEVELS$NOMINAL)
# identical(attr(util_parse_assignments("a < 2", split_char = c(SPLIT_CHAR, "<")), "split_char"), "<")
# TODO: also consider STANDARDIZED_VOCABULARY (once it is implemented)
# 2. Variables used as grouping variables: nominal scale ---------------------
# for example examiners and devices
all_group_vars <- varrefs[
startsWith(varrefs, "GROUP_VAR_")]# TODO: What about the old schema using KEY_OBSERVER, etc.?
if (length(all_group_vars) > 0) {
used_as_group_var <- unique(sort(unlist(apply(meta_data[all_group_vars],
1:2,
util_find_var_by_meta,
meta_data = meta_data,
label_col = label_col,
target = label_col
), recursive = TRUE)))
meta_info$used_as_group_var <- meta_info[[label_col]] %in% used_as_group_var
meta_info[
util_empty(meta_info$SCALE_LEVEL) &
meta_info$used_as_group_var, SCALE_LEVEL] <-
SCALE_LEVELS$NOMINAL
}
# 3. Variables used as time variables: interval scale ------------------------
all_time_vars <- varrefs[
startsWith(varrefs, "TIME_VAR")]
if (length(all_time_vars) > 0) {
used_as_time_var <- unique(sort(unlist(apply(meta_data[all_time_vars],
1:2,
util_find_var_by_meta,
meta_data = meta_data,
label_col = label_col,
target = label_col
), recursive = TRUE)))
meta_info$used_as_time_var <- meta_info[[label_col]] %in% used_as_time_var
meta_info[
util_empty(meta_info$SCALE_LEVEL) &
meta_info$used_as_time_var, SCALE_LEVEL] <-
SCALE_LEVELS$INTERVAL
}
# 4. Variables of type 'datetime': interval scale ----------------------------
meta_info[
util_empty(meta_info$SCALE_LEVEL) &
meta_info$DATA_TYPE == DATA_TYPES$DATETIME, SCALE_LEVEL] <-
SCALE_LEVELS$INTERVAL
# 5. Variables of type 'string': nominal scale or 'na' -----------------------
string_vars <- meta_info[
util_empty(meta_info$SCALE_LEVEL) &
meta_info$DATA_TYPE %in% c(DATA_TYPES$STRING),
label_col]
if (length(string_vars) > 0) {
string_not_cat <- vapply(ds1[, string_vars, drop = FALSE],
FUN.VALUE = logical(1),
FUN = util_string_is_not_categorical)
meta_info[meta_info[[label_col]] %in% string_vars, SCALE_LEVEL] <-
ifelse(string_not_cat, SCALE_LEVELS$`NA`, SCALE_LEVELS$NOMINAL)
}
# 6. Classify remaining integer and float values -----------------------------
binaryrecodelimit <-
getOption("dataquieR.scale_level_heuristics_control_binaryrecodelimit",
dataquieR.scale_level_heuristics_control_binaryrecodelimit_default)
metriclevels <-
getOption("dataquieR.scale_level_heuristics_control_metriclevels",
dataquieR.scale_level_heuristics_control_metriclevels_default)
if (metriclevels < binaryrecodelimit) {
metriclevels <- ifelse(
dataquieR.scale_level_heuristics_control_metriclevels_default <
binaryrecodelimit,
binaryrecodelimit + 10,
dataquieR.scale_level_heuristics_control_metriclevels_default)
util_warning(paste(
"The threshold for metric variables was set too low and set to",
metriclevels,
"instead."),
applicability_problem = TRUE)
}
# number of distinct values > 'metriclevels' (default: 25) and
# minimum value >= 0: ratio scale
meta_info[
util_empty(meta_info$SCALE_LEVEL) &
meta_info$DATA_TYPE %in% c(DATA_TYPES$INTEGER, DATA_TYPES$FLOAT) &
((meta_info$IsInteger & meta_info$NCategory > metriclevels) |
!meta_info$IsInteger) &
!meta_info$AnyNegative, SCALE_LEVEL] <-
SCALE_LEVELS$RATIO
# number of distinct values > 'metriclevels' (default: 25) and
# minimum value < 0: interval scale
meta_info[
util_empty(meta_info$SCALE_LEVEL) &
meta_info$DATA_TYPE %in% c(DATA_TYPES$INTEGER, DATA_TYPES$FLOAT) &
((meta_info$IsInteger & meta_info$NCategory > metriclevels) |
!meta_info$IsInteger) &
meta_info$AnyNegative, SCALE_LEVEL] <-
SCALE_LEVELS$INTERVAL
# number of distinct values <= 'metriclevels' (default: 25) and
# number of distinct values > 'binaryrecodelimit' (default: 8): ordinal
meta_info[
util_empty(meta_info$SCALE_LEVEL) &
meta_info$DATA_TYPE %in% c(DATA_TYPES$INTEGER, DATA_TYPES$FLOAT) &
meta_info$IsInteger &
meta_info$NCategory <= metriclevels &
meta_info$NCategory > binaryrecodelimit, SCALE_LEVEL] <-
SCALE_LEVELS$ORDINAL
# number of distinct values <= 'binaryrecodelimit' (default: 8): nominal
meta_info[
util_empty(meta_info$SCALE_LEVEL) &
meta_info$DATA_TYPE %in% c(DATA_TYPES$INTEGER, DATA_TYPES$FLOAT) &
meta_info$IsInteger &
meta_info$NCategory <= binaryrecodelimit, SCALE_LEVEL] <-
SCALE_LEVELS$NOMINAL
### TODO: This should not be required here anymore, but let's test it first.
# write also something for variables that do not fit the `Stevens's` Typology
# e.g., notes, letters, free text, other structured content (xml, json, ...)
meta_info[
util_empty(meta_info$SCALE_LEVEL), SCALE_LEVEL] <-
SCALE_LEVELS$`NA`
meta_data[[SCALE_LEVEL]] <- setNames(meta_info[[SCALE_LEVEL]],
nm = meta_info[[label_col]])[meta_data[[label_col]]]
meta_data
}
# https://gitlab.com/libreumg/dataquier/-/issues/125
# https://gitlab.com/libreumg/dataquier/-/issues/148
# TODO: Create Test Issue: We should improve the handling of VALUE_LABLELS like male | female (i.e., w/o a coding, the actual values are in the data, which is not so uncommon) -> maybe, we should convert these to our standard case similarly to prep_apply_coding and prep_valuelabels_from_data
# TODO: Discuss: How to reduce the message about inconsistent "low < high" but "nominal" in the metadata
# TODO: Share Updated metadata with examples for all new features (< in value labels, value labels w/o number -- old feature actually, scale_level set vs. missing; add some lorem-ipsum column; add suitable variable role secondary for variables, which are now mostly missing)
# IDEA: Use a scale-free-network assumption as a foundation to expect exponentionally distributed values for key columns
# For testing:
#' prep_load_workbook_like_file("meta_data_v2")
#' View(prep_scalelevel_from_data_and_metadata(study_data = "study_data")[, union(SCALE_LEVEL, colnames(prep_scalelevel_from_data_and_metadata(study_data = "study_data")))])
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.