Nothing
#' @name recodeMissings
#'
#' @title Consistent recoding of (extended) missing values
#'
#' @description
#' A function to recode all missing values to either SPSS or Stata types,
#' uniformly (re)using the same codes across all variables.
#'
#' @details
#' When a dictionary is not provided, it is automatically constructed from the
#' available data and metadata, using negative numbers starting from -91 and up
#' to 27 letters starting with "a".
#'
#' If the dataset contains mixed variables with SPSS and Stata style missing
#' values, unless otherwise specified in a dictionary it uses other codes than
#' the existing ones.
#'
#' For the SPSS type of missing values, the resulting variables are coerced to a
#' declared labelled format.
#'
#' Unlike SPSS, Stata does not allow labels for character values. Both cannot be
#' transported from SPSS to Stata, it is either one or another. If labels are
#' more important to preserve than original values (especially the information
#' about the missing values), the argument `chartonum` replaces all character
#' values with suitable, non-overlapping numbers and adjusts the labels
#' accordingly.
#'
#' If no labels are found in the metadata, the original values are preserved.
#'
#' @examples
#' x <- data.frame(
#' A = declared(
#' c(1:5, -92),
#' labels = c(Good = 1, Bad = 5, NR = -92),
#' na_values = -92
#' ),
#' B = labelled(
#' c(1:5, haven::tagged_na('a')),
#' labels = c(DK = haven::tagged_na('a'))
#' ),
#' C = declared(
#' c(1, -91, 3:5, -92),
#' labels = c(DK = -91, NR = -92),
#' na_values = c(-91, -92)
#' )
#' )
#'
#' xrec <- recodeMissings(x, to = "Stata")
#'
#' attr(xrec, "dictionary")
#'
#' dictionary <- data.frame(
#' old = c(-91, -92, "a"),
#' new = c("c", "d", "c")
#' )
#' recodeMissings(x, to = "Stata", dictionary = dictionary)
#'
#' recodeMissings(x, to = "SPSS")
#'
#' dictionary$new <- c(-97, -98, -97)
#'
#' recodeMissings(x, to = "SPSS", dictionary = dictionary)
#'
#' recodeMissings(x, to = "SPSS", start = 991)
#'
#' recodeMissings(x, to = "SPSS", start = -8)
#'
#' @return A data frame with all missing values recoded consistently.
#'
#' @author Adrian Dusa
#'
#' @param dataset A data frame
#' @param to Software to recode missing values for
#' @param dictionary
#' A named vector, with corresponding Stata missing codes to SPSS missing values
#' @param start
#' A named vector, with corresponding Stata missing codes to SPSS missing values
#' @param ... Other internal arguments
#'
#' @export
`recodeMissings` <- function(
dataset, to = c("SPSS", "Stata", "SAS"), dictionary = NULL, start = -91, ...
) {
dots <- list(...)
to <- toupper(match.arg(to))
tospss <- to == "SPSS"
error_null <- ifelse(isFALSE(dots$error_null), FALSE, TRUE)
to_declared <- ifelse(isFALSE(dots$to_declared), FALSE, TRUE)
if (is.data.frame(dataset)) {
error <- TRUE
i <- 1
while (i <= ncol(dataset) & error) {
attrx <- attributes(dataset[[i]])
if (
any(
is.element(
c("labels", "na_value", "na_range"),
names(attrx)
)
)
) {
error <- FALSE
}
i <- i + 1
}
if (error && error_null) {
admisc::stopError(
paste(
"The input does not seem to contain any",
"metadata about values and labels."
)
)
}
}
else {
admisc::stopError(
"The input should be a data frame containing labelled variables."
)
}
dataDscr <- collectMetadata(dataset, error_null = error_null)
charvar <- unname(sapply(dataset, is.character))
spss <- unname(sapply(dataset, function(x) {
!is.null(attr(x, "labels", exact = TRUE)) &&
(
inherits(x, "haven_labelled_spss") || inherits(x, "declared")
)
}))
stata <- unname(sapply(dataset, function(x) {
is.double(x) &&
!is.null(attr(x, "labels", exact = TRUE)) &&
(
inherits(x, "haven_labelled") & !inherits(x, "haven_labelled_spss")
)
}))
allMissing <- list()
for (variable in names(dataset[, spss | stata, drop = FALSE])) {
x <- declared::undeclare(dataset[[variable]], drop = TRUE)
attributes(x) <- NULL
metadata <- dataDscr[[variable]]
labels <- metadata[["labels"]]
na_range <- metadata[["na_range"]]
missing <- metadata[["na_values"]]
if (!is.null(na_range)) {
misvals <- x[x >= na_range[1] & x <= na_range[2]]
missing <- c(missing, misvals[!is.na(misvals)])
if (!is.null(labels)) {
if (admisc::possibleNumeric(labels)) {
lbls <- admisc::asNumeric(labels)
missing <- c(
missing,
lbls[lbls >= na_range[1] & lbls <= na_range[2]]
)
}
}
}
missing <- sort(unique(missing))
tagged <- haven::is_tagged_na(labels)
if (any(tagged)) {
labels[tagged] <- haven::na_tag(labels[tagged])
}
if (!is.null(missing)) {
names(missing) <- ""
}
if (
is.element("labels", names(metadata)) &&
any(is.element(missing, labels))
) {
wel <- which(is.element(missing, labels))
names(missing)[wel] <- names(labels)[
match(missing[wel], labels)
]
}
allMissing[[variable]] <- missing
}
umispss <- unlist(unname(allMissing[spss[spss | stata]]))
umistata <- unlist(unname(allMissing[stata[spss | stata]]))
if (!is.null(umispss)) {
umispss[order(names(umispss), decreasing = TRUE)]
umispss <- umispss[!duplicated(umispss)]
}
if (!is.null(umistata)) {
umistata[order(names(umistata), decreasing = TRUE)]
umistata <- umistata[!duplicated(umistata)]
}
torecode <- data.frame(
spss = c(rep(TRUE, length(umispss)), rep(FALSE, length(umistata))),
label = c(names(umispss), names(umistata)),
old = c(unname(umispss), unname(umistata))
)
if (nrow(torecode) == 0) {
# There is no information about missing values
return(dataset)
}
torecode <- torecode[order(torecode$label, decreasing = TRUE), ]
torecode <- torecode[order(torecode$spss, decreasing = tospss), ]
torecode$new <- torecode$old
wi <- which(torecode$spss != tospss)
if (length(wi) > 0) {
for (i in wi) {
if (nzchar(torecode$label[i])) {
wl <- which(torecode$label == torecode$label[i])
if (length(wl) > 1) {
torecode$new[i] <- torecode$old[wl[1]]
}
}
}
}
torecode <- torecode[order(torecode$old), ]
torecode <- torecode[order(torecode$spss, decreasing = tospss), ]
nchars <- nchar(torecode$new)
torecode$new <- sapply(
strsplit(as.character(torecode$new), split = ""),
function(x) {
if (x[1] != "-") {
x <- unique(x)
}
paste(x, collapse = "")
}
)
torecode <- torecode[order(torecode$new, nchars), ]
torecode <- torecode[order(torecode$spss, decreasing = tospss), ]
torecode$new <- match(torecode$new, unique(torecode$new))
if (tospss) {
mcodes <- seq(max(5000, nrow(torecode) + 1)) + abs(start) - 1
if (start < 0) {
mcodes <- -1 * mcodes
}
torecode$new <- mcodes[torecode$new]
}
else {
toomany <- max(torecode$new) > length(letters)
if (toomany) {
# TODO: recode variable by variable...?
admisc::stopError("Too many overall missing values.")
}
torecode$new <- letters[torecode$new]
}
torecode$label[is.na(torecode$label)] <- ""
if (is.null(dictionary)) {
if (isTRUE(dots$return_dictionary)) {
return(torecode[, -1])
}
dictionary <- torecode
}
else {
if (length(setdiff(torecode$old, dictionary$old)) > 0) {
admisc::stopError(
"Missing values in the data not present in the dictionary."
)
}
}
# now recode the respective variables according to the dictionary
old <- dictionary$old
new <- dictionary$new
pnold <- admisc::possibleNumeric(old, each = TRUE)
old <- tolower(old)
if (is.character(new)) {
new <- tolower(new)
}
for (variable in names(dataset[, spss | stata, drop = FALSE])) {
x <- declared::undeclare(dataset[[variable]], drop = TRUE)
attributes(x) <- NULL # for haven_labelled with tagged NAs
metadata <- dataDscr[[variable]]
labels_x <- metadata[["labels"]]
na_values_x <- metadata[["na_values"]]
na_range_x <- metadata[["na_range"]]
if (!is.null(na_values_x) | !is.null(na_range_x)) {
if (tospss) {
selection <- logical(length(old))
if (!is.null(na_values_x)) {
selection <- is.element(old, na_values_x)
}
else if (!is.null(na_range_x)) {
na_range_x <- range(as.numeric(na_range_x))
if (any(pnold)) {
selection[pnold] <-
as.numeric(old[pnold]) >= min(na_range_x) &
as.numeric(old[pnold]) <= max(na_range_x)
}
}
if (any(selection)) {
old_x <- old[selection]
new_x <- new[selection]
# spss_x <- dictionary$spss[selection]
for (d in seq(length(old_x))) {
if (!is.null(na_values_x)) {
na_values_x[is.element(na_values_x, old_x[d])] <- new_x[d]
}
# if (spss_x[d]) {
x[is.element(x, old_x[d])] <- new_x[d]
labels_x[is.element(labels_x, old_x[d])] <- new_x[d]
# }
# else {
if (is.element(old_x[d], letters)) {
x[
haven::is_tagged_na(x, old_x[d])
] <- new_x[d]
labels_x[
haven::is_tagged_na(labels_x, old_x[d])
] <- new_x[d]
}
}
}
if (admisc::possibleNumeric(na_values_x)) {
na_values_x <- as.numeric(na_values_x)
}
if (!is.null(labels_x) && !admisc::possibleNumeric(labels_x)) {
x <- as.character(x)
if (length(na_values_x) > 0) {
na_values_x <- as.character(na_values_x)
}
}
callist <- list(
x = x,
labels = labels_x,
label = metadata[["label"]]
)
if (length(na_values_x) > 0) {
callist$na_values <- na_values_x
}
if (length(na_range_x) > 0) {
# update na_range, this is from SPSS to an SPSS type variable
updated <- logical(2)
copy_range <- na_range_x
if (na_range_x[1] == -Inf) {
updated[1] <- TRUE
if (start > 0) {
na_range_x[1] <- Inf
copy_range <- rev(copy_range)
updated <- rev(updated)
}
}
if (na_range_x[2] == Inf) {
updated[2] <- TRUE
if (start < 0) {
na_range_x[2] <- -Inf
copy_range <- rev(copy_range)
updated <- rev(updated)
}
}
na_range_x <- sort(na_range_x)
for (d in which(pnold)) {
if (identical(na_range_x[1], as.numeric(old[d]))) {
na_range_x[1] <- new[d]
updated[1] <- TRUE
}
if (identical(na_range_x[2], as.numeric(old[d]))) {
na_range_x[2] <- new[d]
updated[2] <- TRUE
}
}
if (!all(updated)) {
difference <- diff(copy_range)
# two scenarios, something like:
# 1. range c(-99, -95) and only one of them is in the dictionary
if (updated[1]) {
na_range_x[2] <- na_range_x[1] + difference
}
else if (updated[2]) {
na_range_x[1] <- na_range_x[2] - difference
}
else {
# 2. range is c(-99, -95), with say a value of -97 missing
# and none of them are in the dictionary
na_range_x <- range(new)
}
}
callist$na_range <- na_range_x
}
if (to_declared) {
dataset[[variable]] <- do.call(declared::declared, callist)
}
else {
dataset[[variable]] <- do.call(haven::labelled_spss, callist)
}
}
else if (is.numeric(x)) {
# it makes sense to check for character variables, since neither
# Stata nor SAS do not accept missing values for chars technically,
# a char var with missing value would be "valid" in SPSS but it doesn't
# matter if recoding to Stata or SAS, it's like it would not exist
attributes(x) <- NULL
selection <- logical(length(old))
if (!is.null(na_values_x)) {
selection <- is.element(old, na_values_x)
}
else if (!is.null(na_range_x)) {
pnold <- admisc::possibleNumeric(old, each = TRUE)
if (any(pnold)) {
selection[pnold] <-
as.numeric(old[pnold]) >= min(na_range_x) &
as.numeric(old[pnold]) <= max(na_range_x)
}
}
if (any(selection)) {
old_x <- old[selection]
new_x <- new[selection]
if (admisc::possibleNumeric(old_x)) {
old_x <- admisc::asNumeric(old_x)
}
for (d in seq(length(old_x))) {
x[is.element(x, old_x[d])] <- haven::tagged_na(new_x[d])
labels_x[
is.element(labels_x, old_x[d])
] <- haven::tagged_na(new_x[d])
if (is.element(tolower(old_x[d]), letters)) {
x[
haven::is_tagged_na(x, old_x[d])
] <- haven::tagged_na(new_x[d])
labels_x[
haven::is_tagged_na(labels_x, old_x[d])
] <- haven::tagged_na(new_x[d])
}
}
dataset[[variable]] <- haven::labelled(
x,
labels = labels_x,
label = metadata[["label"]]
)
}
}
}
}
attr(dataset, "dictionary") <- dictionary[, -1]
return(dataset)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.