# This file is part of the Minnesota Population Center's ipumsr.
# For copyright and licensing information, see the NOTICE and LICENSE files
# in this project's top-level directory, and also on-line at:
# https://github.com/mnpopcenter/ipumsr
#' Read metadata about an IPUMS extract from a DDI (.xml) file
#'
#' Reads the metadata about an IPUMS extract from a DDI file into R.
#' This includes variable and value labels, conditions of use for the
#' data, and column positions for the fixed-width data file.
#'
#' @param ddi_file Filepath to DDI xml file
#' @param data_layer If \code{ddi_file} is a .zip archive containing multiple
#' DDI (.xml) files, dplyr \code{\link[dplyr]{select}}-style notation
#' indicating which one to load.
#' @param lower_vars Logical indicating whether to convert variable names
#' to lowercase (default is FALSE, in line with IPUMS conventions)
#' @return An \code{ipums_ddi} object with metadata information.
#' @examples
#' # Example extract DDI
#' ddi_file <- ipums_example("cps_00006.xml")
#' ddi <- read_ipums_ddi(ddi_file)
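#' # The returned metadata can then be inspected or used to read the data,
#' # e.g. (illustrative sketch, assuming the matching .dat file is available):
#' # ipums_var_info(ddi)
#' # cps_data <- read_ipums_micro(ddi)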
#' @family ipums_metadata
#' @export
read_ipums_ddi <- function(ddi_file, data_layer = NULL, lower_vars = FALSE) {
data_layer <- enquo(data_layer)
custom_check_file_exists(ddi_file)
if (fostr_sub(ddi_file, -4) == ".zip") {
ddi_in_zip <- find_files_in(ddi_file, "xml", data_layer)
ddi_file_load <- unz(ddi_file, ddi_in_zip)
} else {
ddi_file_load <- ddi_file
}
ddi_xml <- xml2::read_xml(ddi_file_load)
# Basic information
conditions <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:stdyDscr/d1:dataAccs/d1:useStmt/d1:conditions"
)
citation <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:stdyDscr/d1:dataAccs/d1:useStmt/d1:citReq"
)
ipums_project <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:stdyDscr/d1:citation/d1:serStmt/d1:serName"
)
extract_notes <- xml_text_from_path_collapsed(
ddi_xml,
"/d1:codeBook/d1:stdyDscr/d1:notes"
)
extract_date <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:stdyDscr/d1:citation/d1:prodStmt/d1:prodDate/@date"
)
extract_date <- as.Date(extract_date)
# File information
files <- xml2::xml_find_all(ddi_xml, "/d1:codeBook/d1:fileDscr")
if (length(files) > 1) {
warning("Extracts with multiple files not supported, using first file.", call. = FALSE)
}
file_name <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileName"
)
file_type <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileStrc/@type"
)
file_encoding <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileType/@charset"
)
# Get rectype info if hierarchical
if (file_type == "hierarchical") {
rectypes <- xml_text_from_path_all(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileStrc/d1:recGrp/@rectype"
)
rectype_idvar <- xml_text_from_path_all(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileStrc/d1:recGrp/@recidvar"
)
rectype_idvar <- rectype_idvar[length(rectype_idvar)]
rectypes_keyvars <- xml2::xml_text(
xml2::xml_find_first(
xml2::xml_find_all(ddi_xml, "/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileStrc/d1:recGrp"),
"@keyvar"
)
)
rectypes_keyvars <- fostr_split(rectypes_keyvars, "[[:blank:]]+")
rectypes_keyvars <- purrr::map(rectypes_keyvars, ~.[!is.na(.)])
rectypes_keyvars <- tibble::tibble(
rectype = rectypes,
keyvars = rectypes_keyvars
)
# The extract engine does not provide value labels for record types,
# so get them from the file structure section instead
rt_lbls <- xml_text_from_path_all(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileStrc/d1:recGrp/d1:labl"
)
rectype_labels <- tibble::tibble(
val = rectypes,
lbl = rt_lbls
)
} else {
rectypes <- NULL
rectype_idvar <- NULL
rectype_labels <- NULL
rectypes_keyvars <- NULL
}
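# For a typical hierarchical household/person extract (an illustrative sketch
# only, not taken from a specific DDI), the objects above look roughly like:
#   rectypes         : c("H", "P")
#   rectype_idvar    : "RECTYPE"
#   rectypes_keyvars : tibble(rectype = c("H", "P"), keyvars = list("SERIAL", "SERIAL"))
#   rectype_labels   : tibble(val = c("H", "P"), lbl = c("Household", "Person"))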
# Get variable specific information
var_info <- get_var_info_from_ddi(ddi_xml, file_type, rectype_idvar, rectype_labels)
if (lower_vars) {
var_info$var_name <- tolower(var_info$var_name)
if (!is.null(rectype_idvar)) rectype_idvar <- tolower(rectype_idvar)
if (!is.null(rectypes_keyvars)) {
rectypes_keyvars$keyvars <- purrr::map(rectypes_keyvars$keyvars, tolower)
}
}
make_ddi_from_scratch(
file_name = file_name,
file_path = dirname(ddi_file),
file_type = file_type,
ipums_project = ipums_project,
extract_date = extract_date,
extract_notes = extract_notes,
rectypes = rectypes,
rectype_idvar = rectype_idvar,
rectypes_keyvars = rectypes_keyvars,
var_info = var_info,
conditions = conditions,
citation = citation,
file_encoding = file_encoding
)
}
xml_text_from_path_first <- function(xml, path) {
xml2::xml_text(xml2::xml_find_first(xml, path))
}
xml_text_from_path_collapsed <- function(xml, path, collapse = "\n\n") {
out <- xml2::xml_text(xml2::xml_find_all(xml, path))
paste(out, collapse = collapse)
}
xml_text_from_path_all <- function(xml, path) {
xml2::xml_text(xml2::xml_find_all(xml, path))
}
get_var_info_from_ddi <- function(ddi_xml, file_type, rt_idvar, rectype_labels) {
var_info_xml <- xml2::xml_find_all(ddi_xml, "/d1:codeBook/d1:dataDscr/d1:var")
if (length(var_info_xml) == 0) return(NULL)
var_name <- xml2::xml_attr(var_info_xml, "name")
start <- as.numeric(xml_text_from_path_first(var_info_xml, "d1:location/@StartPos"))
end <- as.numeric(xml_text_from_path_first(var_info_xml, "d1:location/@EndPos"))
width <- as.numeric(xml_text_from_path_first(var_info_xml, "d1:location/@width"))
var_label <- xml_text_from_path_first(var_info_xml, "d1:labl")
var_desc <- xml_text_from_path_first(var_info_xml, "d1:txt")
imp_decim <- as.numeric(xml2::xml_attr(var_info_xml, "dcml"))
var_type <- xml_text_from_path_first(var_info_xml, "d1:varFormat/@type")
var_intrvl <- xml2::xml_attr(var_info_xml, "intrvl")
var_type <- dplyr::case_when(
var_type == "numeric" & var_intrvl == "discrete" & (width < 10) ~ "integer",
var_type == "numeric" ~ "numeric",
var_type == "character" ~ "character",
TRUE ~ "character" # Default to character if it's unexpected
)
code_instr <- xml_text_from_path_first(var_info_xml, "d1:codInstr")
if (file_type == "hierarchical") {
rectype_by_var <- fostr_split(xml2::xml_attr(var_info_xml, "rectype"), " ")
} else {
rectype_by_var <- NA
}
# Value labels
# Some come from parsed code sections
lbls_from_code_instr <- parse_labels_from_code_instr(code_instr, var_type)
# For hierarchical, RECTYPE comes from elsewhere in the DDI
if (file_type == "hierarchical") {
# If var is numeric, need to convert
rt_type <- var_type[var_name == rt_idvar]
if (length(rt_type) == 1 && rt_type %in% c("numeric", "integer")) {
rectype_labels$val <- suppressWarnings(as.numeric(rectype_labels$val))
}
rectype_labels <- dplyr::filter(rectype_labels, !is.na(.data$val))
rectype_labels <- dplyr::arrange(rectype_labels, .data$val)
# Replace in the code_instructions
if (nrow(rectype_labels) > 0) {
lbls_from_code_instr[[which(var_name == rt_idvar)]] <- rectype_labels
}
}
val_labels <- purrr::pmap(
list(var_info_xml, var_type, lbls_from_code_instr),
function(vvv, vtype, extra_labels) {
lbls <- xml2::xml_find_all(vvv, "d1:catgry")
if (length(lbls) == 0) return(extra_labels)
lbls <- tibble::tibble(
val = xml_text_from_path_all(lbls, "d1:catValu"),
lbl = xml_text_from_path_all(lbls, "d1:labl")
)
if (vtype %in% c("numeric", "integer")) lbls$val <- as.numeric(lbls$val)
# Drop labels that are the same as the value
# But leading 0's can be ignored if numeric
if (vtype %in% c("numeric", "integer")) {
lnum <- suppressWarnings(as.numeric(lbls$lbl))
lbls <- dplyr::filter(lbls, (is.na(lnum) | .data$val != lnum))
} else {
lbls <- dplyr::filter(lbls, .data$val != .data$lbl)
}
out <- dplyr::bind_rows(lbls, extra_labels)
dplyr::arrange(out, .data$val)
})
make_var_info_from_scratch(
var_name = var_name,
var_label = var_label,
var_desc = var_desc,
val_labels = val_labels,
code_instr = code_instr,
start = start,
end = end,
imp_decim = imp_decim,
var_type = var_type,
rectypes = rectype_by_var
)
}
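# get_var_info_from_ddi() returns a tibble with one row per variable; the
# columns match the arguments of make_var_info_from_scratch() below, with
# val_labels as a list-column of val/lbl tibbles.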
#' Read metadata from a text codebook in an NHGIS or Terra area-level extract
#'
#' Reads a text-formatted codebook provided by some IPUMS extract systems, such
#' as NHGIS and Terra area-level extracts, into a format analogous to the DDIs
#' available for other projects.
#'
#' @return
#' An \code{ipums_ddi} object with information on the variables included in the
#' csv file of an NHGIS extract.
#' @param cb_file Filepath to the codebook (either the .zip file directly downloaded
#' from the website, or the path to the unzipped .txt file).
#' @param data_layer dplyr \code{\link[dplyr]{select}}-style notation for uniquely
#' identifying the data layer to load. Required for reading from .zip files
#' for extracts with multiple files.
#' @examples
#' # Example NHGIS extract
#' nhgis_file <- ipums_example("nhgis0008_csv.zip")
#' ddi <- read_ipums_codebook(nhgis_file)
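#' # Variable labels parsed from the codebook can then be inspected,
#' # e.g. (illustrative only):
#' # ipums_var_info(ddi)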
#' @family ipums_metadata
#' @export
read_ipums_codebook <- function(cb_file, data_layer = NULL) {
data_layer <- enquo(data_layer)
if (path_is_zip_or_dir(cb_file)) {
cb_name <- find_files_in(cb_file, "txt", multiple_ok = TRUE)
# There are 2 formats for extracts, so we have to do some work here.
# IPUMS Terra always(?) has 2 text files: one is a codebook for all files
# in the extract, and the other has a name ending in "info.txt" and
# isn't useful
if (length(cb_name) > 1) {
# First try to get rid of the "info" txt
cb_name <- cb_name[!fostr_detect(cb_name, "info\\.txt$")]
# If we still have multiple, then we should try to use the data_layer filter
# because we're probably in an NHGIS extract
if (length(cb_name) > 1) cb_name <- find_files_in(cb_file, "txt", data_layer)
}
if (length(cb_name) == 1) {
if (file_is_zip(cb_file)) {
cb <- readr::read_lines(unz(cb_file, cb_name))
} else {
cb <- readr::read_lines(file.path(cb_file, cb_name))
}
} else {
cb <- NULL
}
} else {
cb_name <- cb_file
if (file.exists(cb_name)) {
cb <- readr::read_lines(cb_name)
} else {
cb <- NULL
}
}
if (is.null(cb)) {
stop("Could not find text codebook.")
}
# Section markers are a line full of dashes (setting to 5+ to eliminate false positives)
section_markers <- which(fostr_detect(cb, "^[-]{5,}$"))
# Second line tells if it is NHGIS or IPUMS Terra codebook
if (fostr_detect(cb[2], "IPUMS Terra")) type <- "IPUMS Terra"
else if (fostr_detect(cb[2], "NHGIS")) type <- "NHGIS"
else stop("Unknown codebook format.")
# Get table names (var_desc) and variable labels (var_label)
# from data dictionary section using messy string parsing code
dd <- find_cb_section(cb, "^Data Dictionary$", section_markers)
if (type == "IPUMS Terra") {
data_file_rows <- which(fostr_detect(dd, "^Data File:"))
data_file_sections <- purrr::map2(data_file_rows, c(data_file_rows[-1], length(dd)), ~seq(.x + 1, .y - 1))
data_file_names <- fostr_named_capture_single(
dd[data_file_rows],
"Data File: (?<data_file>.+)"
)
# Only get var info from the file you're loading (specified in data_layer)
if (quo_is_null(data_layer)) {
this_file <- seq_along(data_file_names)
} else {
this_file <- which(data_file_names == tidyselect::vars_select(data_file_names, !!data_layer))
}
if (length(this_file) > 1) {
stop(custom_format_text(
"Multiple codebooks found, please specify which to use with the",
"`data_layer` argument", indent = 2, exdent = 2
))
}
var_info <- dd[data_file_sections[[this_file]]]
var_info <- fostr_named_capture(
var_info,
"(?<var_name>[[:alnum:]|[:punct:]]+):[[:blank:]]+(?<var_label>.+)$",
only_matches = TRUE
)
var_info <- make_var_info_from_scratch(
var_name = var_info$var_name,
var_label = var_info$var_label,
var_desc = ""
)
} else if (type == "NHGIS") {
# Check if file is a time series file
time_series_type <- fostr_named_capture_single(
cb,
"^Time series layout:[[:blank:]]+(?<ts_type>.+)$",
only_matches = TRUE
)
is_time_series <- length(time_series_type) > 0
context_start <- which(dd == "Context Fields ") + 1
context_end <- which(fostr_detect(dd, "^[[:blank:]]$")) - 1
context_end <- min(context_end[context_end > context_start])
context_rows <- seq(context_start, context_end)
context_vars <- dd[context_rows]
context_vars <- fostr_named_capture(
context_vars,
"(?<var_name>[[:alnum:]|[:punct:]]+):[[:blank:]]+(?<var_label>.+)$"
)
context_vars$var_desc <- ""
context_vars <- context_vars[!is.na(context_vars$var_name), ]
table_name_rows <- which(fostr_detect(dd, "^[[:blank:]]*Table [0-9]+:"))
table_sections <- purrr::map2(table_name_rows, c(table_name_rows[-1], length(dd)), ~seq(.x, .y - 1))
table_vars <- purrr::map_df(table_sections, function(rows) {
if (is_time_series) {
table_name_and_code <- fostr_named_capture(
dd[rows[1]],
"^[[:blank:]]*Table .+?:[[:blank:]]+\\((?<table_code>.+?)\\)[[:blank:]]+(?<table_name>.+)$"
)
nhgis_table_code <- table_name_and_code$table_code
table_name <- table_name_and_code$table_name
time_series_headers <- fostr_detect(dd[rows], "^[[:blank:]]+Time series")
vars <- dd[rows][!time_series_headers]
vars <- vars[-1] # First row was table name/code
vars <- fostr_named_capture(
vars,
"(?<var_name>[[:alnum:]|[:punct:]]+):[[:blank:]]+(?<var_label>.+)$",
only_matches = TRUE
)
vars$var_desc <- paste0(table_name, " (", nhgis_table_code, ")")
} else {
table_name <- fostr_named_capture_single(
dd[rows[1]],
"^[[:blank:]]*Table .+?:[[:blank:]]+(?<table_name>.+)$"
)
nhgis_table_code <- fostr_named_capture_single(
dd[rows[4]],
"^[[:blank:]]*NHGIS code:[[:blank:]]+(?<table_code>.+)$"
)
vars <- fostr_named_capture(
dd[rows[-1:-4]],
"(?<var_name>[[:alnum:]|[:punct:]]+):[[:blank:]]+(?<var_label>.+)$",
only_matches = TRUE
)
vars$var_desc <- paste0(table_name, " (", nhgis_table_code, ")")
}
vars
})
var_info <- make_var_info_from_scratch(
var_name = c(context_vars$var_name, table_vars$var_name),
var_label = c(context_vars$var_label, table_vars$var_label),
var_desc = c(context_vars$var_desc, table_vars$var_desc)
)
}
# Get License and Condition section
conditions_text <- find_cb_section(cb, "^Citation and Use of .+ Data", section_markers)
conditions_text <- paste(conditions_text, collapse = "\n")
out <- make_ddi_from_scratch(
file_name = cb_name,
file_type = "rectangular",
ipums_project = type,
var_info = var_info,
conditions = conditions_text
)
out
}
find_cb_section <- function(cb_text, section, section_markers) {
start <- which(fostr_detect(cb_text, section))
start <- start[(start - 1) %in% section_markers & (start + 1) %in% section_markers] + 2
end <- min(c(length(cb_text), section_markers[section_markers > start])) - 1
cb_text[seq(start, end)]
}
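# find_cb_section() expects the section title to be sandwiched between two
# dashed marker lines, e.g. (an illustrative codebook fragment, not verbatim):
#
#   --------------------------------------------------------------------------
#   Data Dictionary
#   --------------------------------------------------------------------------
#   GISJOIN:     GIS Join Match Code
#   ...
#
# It returns the lines between the closing marker and the next marker (or the
# end of the file).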
path_is_zip_or_dir <- function(file) {
ext <- tools::file_ext(file)
ext == "zip" || ext == ""
}
#' Create DDI structure (for internal use)
#'
#' Helper to make a new DDI structure (not very useful for end-users).
#'
#' @keywords internal
#' @export
make_ddi_from_scratch <- function(
file_name = NULL,
file_path = NULL,
file_type = NULL,
ipums_project = NULL,
extract_date = NULL,
extract_notes = NULL,
rectypes = NULL,
rectype_idvar = NULL,
rectypes_keyvars = NULL,
var_info = NULL,
conditions = NULL,
citation = NULL,
file_encoding = NULL
) {
out <- list(
file_name = file_name,
file_path = file_path,
file_type = file_type,
ipums_project = ipums_project,
extract_date = extract_date,
extract_notes = extract_notes,
rectypes = rectypes,
rectype_idvar = rectype_idvar,
rectypes_keyvars = rectypes_keyvars,
var_info = var_info,
conditions = conditions,
citation = citation,
file_encoding = file_encoding
)
class(out) <- "ipums_ddi"
out
}
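# A minimal usage sketch with hypothetical values, kept as a comment so it is
# not evaluated when the package is loaded:
# make_ddi_from_scratch(
#   file_name = "cps_00006.dat",
#   file_type = "rectangular",
#   ipums_project = "IPUMS-CPS",
#   var_info = make_var_info_from_scratch(var_name = "YEAR", var_type = "integer")
# )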
make_var_info_from_scratch <- function(
var_name = "",
var_label = "",
var_desc = "",
val_labels = list(tibble::tibble(val = numeric(0), lbl = character(0))),
code_instr = "",
start = NA,
end = NA,
imp_decim = 0,
var_type = "",
rectypes = NA
) {
tibble::tibble(
var_name = var_name,
var_label = var_label,
var_desc = var_desc,
val_labels = val_labels,
code_instr = code_instr,
start = start,
end = end,
imp_decim = imp_decim,
var_type = var_type,
rectypes = rectypes
)
}
make_empty_labels <- function(vt) {
tibble::tibble(
val = if (vt == "character") character(0) else numeric(0),
lbl = character(0)
)
}
# Helper to get labels out of free text from codInstr in xml
parse_labels_from_code_instr <- function(code, var_type) {
purrr::map2(code, var_type, function(x, vt) {
if (is.na(x)) return(make_empty_labels(vt))
lines <- fostr_split(x, "\n")[[1]]
labels <- parse_code_regex(lines, vt)
dplyr::arrange(labels, .data$val)
})
}
parse_code_regex <- function(x, vtype) {
if (vtype %in% c("numeric", "integer")) {
labels <- fostr_named_capture(
x,
"^(?<val>-?[0-9.,]+)(([[:blank:]][[:punct:]]|[[:punct:]][[:blank:]]|[[:blank:]]|=)+)(?<lbl>.+?)$",
only_matches = TRUE
)
labels$val <- as.numeric(fostr_replace_all(labels$val, ",", ""))
} else {
labels <- fostr_named_capture(
x,
"^(?<val>[[:graph:]]+)(([[:blank:]]+[[:punct:]|=]+[[:blank:]])+)(?<lbl>.+)$",
only_matches = TRUE
)
}
labels
}
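# For example (illustrative only), a numeric codInstr line such as
#   "01 = January"
# parses to val = 1, lbl = "January"; for a character variable, a line such as
#   "A = Alabama"
# parses to val = "A", lbl = "Alabama".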