Nothing
#' @title Harmonize the values and labels of labelled vectors
#' @description Create a labelled vector with harmonized numeric coding and
#' value labels.
#' @details Create a labelled vector that contains in its metadata attributes
#' the original labelling, the original numeric coding and the current
#' labelling, with the numerical values representing the harmonized
#' coding.
#' @param x A labelled vector
#' @param harmonize_label A character vector of 1L containing the new,
#' harmonize variable label. Defaults to \code{NULL}, in which case
#' it uses the variable label of \code{x}, unless it is also \code{NULL}.
#' @param harmonize_labels A list of harmonization values
#' @param na_values A named vector of \code{na_values}, the
#' observations that are defined to be treated as missing in
#' the SPSS-style coding.
#' @param na_range A min, max range of \code{na_range}, the
#' continuous missing value range. In most surveys this should be left
#' \code{NULL}.
#' @param name_orig The original name of the variable. If left \code{NULL}
#' it uses the latest name of the object \code{x}.
#' @param id A survey ID, defaults to \code{survey_id}
#' @param remove Defaults to \code{NULL}. A character or regex that will be removed from all
#' old value labels, like \code{"\\("|\\)} for ( and ).
#' @param perl Use perl-like regex? Defaults to \code{FALSE}.
#' @importFrom labelled to_character labelled na_values val_labels
#' @importFrom labelled var_label
#' @importFrom tibble tibble as_tibble
#' @importFrom dplyr mutate left_join distinct select if_else
#' @importFrom tidyselect all_of
#' @importFrom haven labelled_spss
#' @importFrom assertthat assert_that
#' @importFrom rlang set_names .data
#' @family harmonization functions
#' @return A labelled vector that contains in its metadata attributes
#' the original labelling, the original numeric coding and the current
#' labelling, with the numerical values representing the harmonized
#' coding.
#' @examples
#' var1 <- labelled::labelled_spss(
#' x = c(1, 0, 1, 1, 0, 8, 9),
#' labels = c(
#' "TRUST" = 1,
#' "NOT TRUST" = 0,
#' "DON'T KNOW" = 8,
#' "INAP. HERE" = 9
#' ),
#' na_values = c(8, 9)
#' )
#'
#' harmonize_values(
#' var1,
#' harmonize_labels = list(
#' from = c("^tend\\sto|^trust", "^tend\\snot|not\\strust", "^dk|^don", "^inap"),
#' to = c("trust", "not_trust", "do_not_know", "inap"),
#' numeric_values = c(1, 0, 99997, 99999)
#' ),
#' na_values = c(
#' "do_not_know" = 99997,
#' "inap" = 99999
#' ),
#' id = "survey_id"
#' )
#' @export
#' @family harmonization functions
harmonize_values <- function(
x,
harmonize_label = NULL,
harmonize_labels = NULL,
na_values = c(
"do_not_know" = 99997,
"declined" = 99998,
"inap" = 99999
),
na_range = NULL,
id = "survey_id",
name_orig = NULL,
remove = NULL,
perl = FALSE
) {
if (is.null(na_values)) input_na_values <- NULL
if (is.null(na_values)[1] | is.na(na_values)[1]) {
if ("na_values" %in% names(harmonize_labels)) {
input_na_values <- na_values
harmonize_labels <- harmonize_labels[c("from", "to", "numeric_values")]
}
} else {
# what if there are no na_values given?
input_na_values <- na_values
harmonize_labels <- harmonize_labels[c("from", "to", "numeric_values")]
}
validate_label_list <- function(label_list) {
assert_that(
all(c("from", "to", "numeric_values") %in% names(label_list)),
msg = "The harmonize_labels list must have 'from', 'to', and 'numeric_values' vectors."
)
ll_lengths <- vapply(label_list[c("from", "to", "numeric_values")], length, numeric(1))
assert_that(
length(unique(ll_lengths)) == 1,
msg = paste0(
"The 'from', 'to', and 'numeric_values' vectors must be of equal length, currently it is: ",
as.character(paste(ll_lengths, collapse = ", "))
)
)
ll <- as_tibble(label_list[c("from", "to", "numeric_values")])
for (l in unique(label_list$to)) {
matched_numeric_value <- ll %>%
filter(.data$to == l) %>%
distinct(.data$to, .data$numeric_values) %>%
pull(.data$numeric_values)
assert_that(length(matched_numeric_value) == 1,
msg = paste0(
"in harmonized_list ",
l, " is matched with multiple numeric values: <",
paste(matched_numeric_value, collapse = ","), ">"
)
)
}
}
if (!is.null(harmonize_labels)) {
validate_label_list(label_list = harmonize_labels)
}
if (is.null(id)) {
# if not otherwise stated, inherit the ID of x, if present
if (!is.null(attr(x, "id"))) {
id <- attr(x, "id")
} else {
id <- "unknown"
}
}
## Get the original object name for recording it as metadata
original_x_name <- deparse(substitute(x))
## Harmonize SPSS variable labels, if they exist
if (is.null(harmonize_label)) harmonize_label <- labelled::var_label(x)
if (is.null(harmonize_label)) harmonize_label <- original_x_name
if (!is.null(harmonize_labels) & validate_harmonize_labels(harmonize_labels)) { ## see assertions.R
harmonize_labels <- tibble::as_tibble(harmonize_labels) %>%
select(all_of(c("from", "to", "numeric_values")))
}
original_values <- tibble::tibble(
x = vctrs::vec_data(x)
)
original_values$orig_labels <- if_else(
condition = original_values$x %in% labelled::val_labels(x),
true = as_character(x),
false = NA_character_
)
if (!is.null(remove)) {
harmonize_labels$from <- gsub(remove, "", harmonize_labels$from)
original_values$orig_labels <- gsub(remove, "", original_values$orig_labels)
}
if (is.na_range_to_values(x)) {
x <- na_range_to_values(x)
}
if (!is.null(harmonize_labels)) {
condition_string <- tolower(gsub("\\.|\\(|\\)", "", harmonize_labels$from))
codebook_string <- tolower(gsub("\\.|\\(|\\)", "", original_values$orig_labels))
original_values$orig_labels <- if_else(
# label == "" if not in the harmonization list
condition = grepl(paste(condition_string, collapse = "|"),
codebook_string,
perl = perl
),
true = codebook_string,
false = ""
)
code_table <- dplyr::distinct_all(original_values)
str <- code_table$orig_labels # string of original values
code_table$new_labels <- NA_character_
code_table$new_values <- NA_real_
for (o in seq_along(harmonize_labels$from)) {
from_string <- tolower(gsub("\\.|\\(|\\)", "", harmonize_labels$from[o]))
code_table$new_labels[which(grepl(from_string, str))] <- harmonize_labels$to[o]
code_table$new_values[which(grepl(from_string, str))] <- harmonize_labels$numeric_values[o]
# code_table$new_labels [which ( tolower(harmonize_labels$from[o]) == str) ] <- harmonize_labels$to[o]
# code_table$new_values [which ( tolower(harmonize_labels$from[o]) == str) ] <- harmonize_labels$numeric_values[o]
}
old_code <- function() {
## Can be removed after extensive testing -------
for (r in seq_along(harmonize_labels$to)) {
## harmonize the strings to new labelling by regex
str[which(grepl(tolower(harmonize_labels$from[r]), str, perl = perl))] <- harmonize_labels$to[r]
}
add_new_values <- harmonize_labels %>%
as_tibble() %>%
dplyr::select(tidyselect::all_of(
c("to", "numeric_values")
)) %>%
rlang::set_names(c("new_labels", "new_values"))
code_table <- dplyr::left_join(
code_table,
add_new_values,
by = "new_labels"
) %>%
filter(!is.na(.data$new_values))
}
code_table$original_values <- NULL
} else {
# no harmonization is given -----------------------
code_table <- get_labelled_attributes(x) ## see below main function
na_labels <- names(na_values)
if (length(na_labels) > 0) {
# if there is no valid label harmonization, still check for potential missings
potential_na_values <- sapply(na_labels, function(x) paste0("^", x, "|", x))
na_regex <- sapply(potential_na_values, function(s) grepl(s, val_label_normalize(code_table$new_labels), perl = perl))
for (c in seq_along(na_labels)) {
code_table$new_labels[which(na_regex[, c])] <- na_labels[c]
}
}
}
code_table <- code_table %>%
distinct_all() %>%
dplyr::arrange(.data$new_values)
new_value_table <- original_values %>%
dplyr::left_join(
code_table,
by = c("x", "orig_labels")
) %>%
dplyr::mutate(new_values = if_else(
condition = is.na(.data$new_labels),
true = x,
false = .data$new_values
)) %>% # invalid labels should be treated elsewhere
dplyr::arrange(.data$new_values)
original_values$x
## define new missing values, not with range
## A message should be given, these are just suspected to be missing
new_na_values <- new_value_table$new_values[which(new_value_table$new_values >= 99900)]
if (!is.null(na_values) & length(na_values) > 0) {
# The user gets a message if the state missing range (described with labels) is not empty.
potential_na_values <- original_values$x[which(original_values$x %in% as.numeric(na_values))]
if (length(potential_na_values) > 0) {
unique_potential_na_values <- paste(unique(potential_na_values), collapse = ", ")
warning_message <- glue::glue("Variable {original_x_name} has values {unique_potential_na_values} that you mark as missing.")
# message ( warning_message)
}
}
new_na_values <- input_na_values
# define new value - label pairs
new_labelling <- new_value_table %>%
dplyr::distinct(.data$new_values, .data$new_labels)
new_labels <- new_labelling$new_values
names(new_labels) <- new_labelling$new_labels
new_labelling <- new_labelling %>%
mutate(new_labels = ifelse(is.na(.data$new_labels),
ifelse(.data$new_values %in% input_na_values,
names(input_na_values), .data$new_labels
),
.data$new_labels
))
# define original labelling
original_labelling <- new_value_table %>%
dplyr::distinct(.data$new_values, .data$orig_labels)
original_labels <- original_labelling$new_values
names(original_labels) <- original_labelling$orig_labels
# define original numeric code
original_numerics <- new_value_table %>%
dplyr::distinct(.data$new_values, .data$x)
original_numeric_values <- original_numerics$new_values
names(original_numeric_values) <- original_numerics$x
# create new numerics
new_numerics <- tibble::tibble(x = original_values$x) %>%
dplyr::left_join(original_numerics, by = "x")
# ifelse ( is.null(labelled::na_values(x)),
# na_values,
# c())
na_value_labels <- na_values
return_value <- labelled_spss_survey(
x = new_numerics$new_values,
labels = labelled::val_labels(x),
label = harmonize_label,
na_values = labelled::na_values(x),
na_range = labelled::na_range(x),
id = id,
name_orig = original_x_name
)
add_to_value_range <- (!harmonize_labels$to %in% new_labelling$new_labels)
further_valid_labels <- harmonize_labels$numeric_values[add_to_value_range]
names(further_valid_labels) <- harmonize_labels$to[add_to_value_range]
new_valid_range_labels <- sort(
c(
new_labels,
further_valid_labels,
input_na_values[which(!input_na_values %in% new_labelling$new_labels)]
)
)
new_valid_range_labels_2 <- new_valid_range_labels[unique(names(new_valid_range_labels))]
new_valid_range_labels_3 <- new_valid_range_labels_2[!is.na(new_valid_range_labels_2)]
new_valid_range_labels_3
attr(return_value, "labels") <- new_valid_range_labels_3
attr(return_value, "na_values") <- unique(new_na_values)
attr(return_value, paste0(attr(return_value, "id"), "_values")) <- original_numeric_values
assertthat::assert_that(
inherits(return_value, "haven_labelled_spss")
)
return_value
}
#' @importFrom dplyr distinct_all
#' @keywords internal
get_labelled_attributes <- function(x) {
unlabelled_x <- sapply(attributes(x), function(i) {
attributes(i) <- NULL
x
})[, 1]
code_table <- tibble::tibble(
x = unlabelled_x,
new_values = unlabelled_x
)
code_table$orig_labels <- as_character(x)
code_table$new_labels <- as_character(x)
dplyr::distinct_at(code_table, dplyr::vars(
all_of(c("x", "new_values", "orig_labels", "new_labels"))
))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.