Nothing
# This file is part of the ipumsr R package created by IPUMS.
# For copyright and licensing information, see the NOTICE and LICENSE files
# in this project's top-level directory, and also on-line at:
# https://github.com/ipums/ipumsr
#' `ipums_ddi` class
#'
#' @description
#' The `ipums_ddi` class provides a data structure for storing the metadata
#' information contained in IPUMS codebook files. These objects are primarily
#' used when loading IPUMS data, but can also be
#' used to explore metadata for an IPUMS extract.
#'
#' - For microdata projects, this information is provided in
#' [DDI codebook](https://ddialliance.org/learn/what-is-ddi)
#' (.xml) files.
#' - For NHGIS, this information is provided in .txt codebook files.
#'
#' The codebook file contains metadata about the extract files themselves,
#' including file name, file path, and extract date as well as information about
#' variables present in the data, including variable names, descriptions, data
#' types, implied decimals, and positions in the fixed-width files.
#'
#' This information is used to correctly parse IPUMS
#' fixed-width files and attach additional variable metadata to data upon load.
#'
#' Note that codebook metadata for NHGIS extracts can also be stored in
#' an `ipums_ddi` object, even though these codebooks are distributed as .txt
#' files, not .xml files. These files do not adhere to the same standards as
#' the DDI codebook files, so some `ipums_ddi` fields will be left blank when
#' reading NHGIS codebooks.
#'
#' ## Creating an `ipums_ddi` object
#'
#' - To create an `ipums_ddi` object from an IPUMS microdata extract, use
#' [read_ipums_ddi()].
#' - To create an `ipums_ddi` object from an IPUMS NHGIS extract, use
#' [read_nhgis_codebook()]
#'
#' ## Loading data
#'
#' - To load the data associated with an `ipums_ddi` object, use
#' [read_ipums_micro()], [read_ipums_micro_chunked()], or
#' [read_ipums_micro_yield()]
#'
#' ## View metadata
#'
#' - Use [ipums_var_info()] to explore variable-level metadata for the variables
#' included in a dataset.
#' - Use [ipums_file_info()] to explore file-level metadata for an extract.
#'
#' @name ipums_ddi-class
#'
#' @keywords internal
#'
#' @aliases ipums_ddi
NULL
#' Read metadata about an IPUMS microdata extract from a DDI codebook (.xml)
#' file
#'
#' @description
#' Reads the metadata about an IPUMS extract from a
#' [DDI codebook](https://ddialliance.org/learn/what-is-ddi) into an
#' [ipums_ddi] object.
#'
#' These metadata contains parsing instructions for the associated fixed-width
#' data file, contextual labels for variables and values in the data, and
#' general extract information.
#'
#' See *Downloading IPUMS files* below for information about downloading
#' IPUMS DDI codebook files.
#'
#' # Downloading IPUMS files
#' The DDI codebook (.xml) file provided with IPUMS microdata extracts can be
#' downloaded through the IPUMS extract interface or (for some collections)
#' within R using the IPUMS API.
#'
#' If using the IPUMS extract interface:
#' - Download the DDI codebook by right clicking on the **DDI** link in the
#' **Codebook** column of the extract interface and selecting **Save as...**
#' (on Safari, you may have to select **Download Linked File As...**).
#' Be sure that the codebook is downloaded in .xml format.
#'
#' If using the IPUMS API:
#' - For supported collections, use [download_extract()] to download a completed
#' extract via the IPUMS API. This automatically downloads both the DDI
#' codebook and the data file from the extract and
#' returns the path to the codebook file.
#'
#' @param ddi_file Path to a DDI .xml file downloaded from
#' [IPUMS](https://www.ipums.org/). See *Downloading IPUMS files* below.
#' @param lower_vars Logical indicating whether to convert variable names to
#' lowercase. Defaults to `FALSE` for consistency with IPUMS conventions.
#' @param data_layer,file_select `r lifecycle::badge("deprecated")` Reading
#' DDI files contained in a .zip archive has been deprecated. Please provide
#' the full path to the .xml file to be loaded in `ddi_file`.
#'
#' @return An [ipums_ddi] object with metadata information.
#'
#' @seealso [read_ipums_micro()], [read_ipums_micro_chunked()] and
#' [read_ipums_micro_yield()] to read data from IPUMS microdata extracts.
#'
#' [ipums_var_info()] and [ipums_file_info()] to view metadata about an
#' [ipums_ddi] object.
#'
#' [ipums_list_files()] to list files in an IPUMS extract.
#'
#' @export
#'
#' @examples
#' # Example codebook file
#' ddi_file <- ipums_example("cps_00157.xml")
#'
#' # Load data into an `ipums_ddi` obj
#' ddi <- read_ipums_ddi(ddi_file)
#'
#' # Use the object to load its associated data
#' cps <- read_ipums_micro(ddi)
#'
#' head(cps)
#'
#' # Or get metadata information directly
#' ipums_var_info(ddi)
#'
#' ipums_file_info(ddi)[1:2]
#'
#' # If variable metadata have been lost from a data source, reattach from
#' # its corresponding `ipums_ddi` object:
#' cps <- zap_ipums_attributes(cps)
#'
#' ipums_var_label(cps$STATEFIP)
#'
#' cps <- set_ipums_var_attributes(cps, ddi$var_info)
#'
#' ipums_var_label(cps$STATEFIP)
read_ipums_ddi <- function(ddi_file,
lower_vars = FALSE,
file_select = deprecated(),
data_layer = deprecated()) {
if (!missing(data_layer)) {
lifecycle::deprecate_warn(
"0.6.0",
"read_ipums_ddi(data_layer = )"
)
file_select <- enquo(data_layer)
} else if (!missing(file_select)) {
lifecycle::deprecate_warn(
"0.6.3",
"read_ipums_ddi(file_select = )"
)
file_select <- enquo(file_select)
} else {
file_select <- NULL
file_select <- enquo(file_select)
}
if (file_is_zip(ddi_file)) {
lifecycle::deprecate_warn(
"0.6.3",
I("Reading DDI files through a zip archive "),
details = "Please provide the full path to the DDI file to be loaded."
)
} else {
dir_read_deprecated(ddi_file)
}
custom_check_file_exists(ddi_file)
ddi_file_load <- find_files_in(
ddi_file,
"xml",
file_select,
none_ok = FALSE,
multiple_ok = FALSE
)
if (file_is_zip(ddi_file)) {
ddi_file_load <- unz(ddi_file, ddi_file_load)
} else if (file_is_dir(ddi_file)) {
ddi_file_load <- file.path(ddi_file, ddi_file_load)
}
ddi_xml <- xml2::read_xml(ddi_file_load, file_select = NULL)
# Basic information
conditions <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:stdyDscr/d1:dataAccs/d1:useStmt/d1:conditions"
)
citation <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:stdyDscr/d1:dataAccs/d1:useStmt/d1:citReq"
)
ipums_project <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:stdyDscr/d1:citation/d1:serStmt/d1:serName"
)
extract_notes <- xml_text_from_path_collapsed(
ddi_xml,
"/d1:codeBook/d1:stdyDscr/d1:notes"
)
extract_date <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:stdyDscr/d1:citation/d1:prodStmt/d1:prodDate/@date"
)
extract_date <- as.Date(extract_date)
# File information
files <- xml2::xml_find_all(ddi_xml, "/d1:codeBook/d1:fileDscr")
if (length(files) > 1) {
rlang::warn("Extracts with multiple files not supported, using first file.")
}
file_name <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileName"
)
file_type <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileStrc/@type"
)
file_encoding <- xml_text_from_path_first(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileType/@charset"
)
# Get rectype info if hierarchical
if (file_type == "hierarchical") {
rectypes <- xml_text_from_path_all(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileStrc/d1:recGrp/@rectype"
)
rectype_idvar <- xml_text_from_path_all(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileStrc/d1:recGrp/@recidvar"
)
rectype_idvar <- rectype_idvar[length(rectype_idvar)]
rectypes_keyvars <- xml2::xml_text(
xml2::xml_find_first(
xml2::xml_find_all(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileStrc/d1:recGrp"
),
"@keyvar"
)
)
rectypes_keyvars <- fostr_split(rectypes_keyvars, "[[:blank:]]+")
rectypes_keyvars <- purrr::map(rectypes_keyvars, ~ .[!is.na(.)])
rectypes_keyvars <- tibble::tibble(
rectype = rectypes,
keyvars = rectypes_keyvars
)
# For some reason our extract engine can't provide value labels for rectypes
# So get it from file structure area
rt_lbls <- xml_text_from_path_all(
ddi_xml,
"/d1:codeBook/d1:fileDscr/d1:fileTxt/d1:fileStrc/d1:recGrp/d1:labl"
)
rectype_labels <- tibble::tibble(
val = rectypes,
lbl = rt_lbls
)
} else {
rectypes <- NULL
rectype_idvar <- NULL
rectype_labels <- NULL
rectypes_keyvars <- NULL
}
# Get variable specific information
var_info <- get_var_info_from_ddi(
ddi_xml,
file_type,
rectype_idvar,
rectype_labels
)
if (lower_vars) {
var_info$var_name <- tolower(var_info$var_name)
if (!is.null(rectype_idvar)) {
rectype_idvar <- tolower(rectype_idvar)
}
if (!is.null(rectypes_keyvars)) {
rectypes_keyvars$keyvars <- purrr::map(rectypes_keyvars$keyvars, tolower)
}
}
if (file_is_dir(ddi_file)) {
file_path <- ddi_file
} else {
file_path <- dirname(ddi_file)
}
new_ipums_ddi(
file_name = file_name,
file_path = file_path,
file_type = file_type,
ipums_project = ipums_project,
extract_date = extract_date,
extract_notes = extract_notes,
rectypes = rectypes,
rectype_idvar = rectype_idvar,
rectypes_keyvars = rectypes_keyvars,
var_info = var_info,
conditions = conditions,
citation = citation,
file_encoding = file_encoding
)
}
xml_text_from_path_first <- function(xml, path) {
xml2::xml_text(xml2::xml_find_first(xml, path))
}
xml_text_from_path_collapsed <- function(xml, path, collapse = "\n\n") {
out <- xml2::xml_text(xml2::xml_find_all(xml, path))
paste(out, collapse = collapse)
}
xml_text_from_path_all <- function(xml, path) {
xml2::xml_text(xml2::xml_find_all(xml, path))
}
get_var_info_from_ddi <- function(ddi_xml,
file_type,
rt_idvar,
rectype_labels) {
var_info_xml <- xml2::xml_find_all(ddi_xml, "/d1:codeBook/d1:dataDscr/d1:var")
if (length(var_info_xml) == 0) {
return(NULL)
}
var_name <- xml2::xml_attr(var_info_xml, "name")
start <- as.numeric(
xml_text_from_path_first(var_info_xml, "d1:location/@StartPos")
)
end <- as.numeric(
xml_text_from_path_first(var_info_xml, "d1:location/@EndPos")
)
width <- as.numeric(
xml_text_from_path_first(var_info_xml, "d1:location/@width")
)
var_label <- xml_text_from_path_first(var_info_xml, "d1:labl")
var_desc <- xml_text_from_path_first(var_info_xml, "d1:txt")
imp_decim <- as.numeric(
xml2::xml_attr(var_info_xml, "dcml")
)
var_type <- xml_text_from_path_first(var_info_xml, "d1:varFormat/@type")
var_intrvl <- xml2::xml_attr(var_info_xml, "intrvl")
var_type <- dplyr::case_when(
var_type == "numeric" & var_intrvl == "discrete" & (width < 10) ~ "integer",
var_type == "numeric" ~ "numeric",
var_type == "character" ~ "character",
TRUE ~ "character" # Default to character if it's unexpected
)
code_instr <- fostr_replace(
xml_text_from_path_first(var_info_xml, "d1:codInstr"),
"^Codes",
""
)
if (file_type == "hierarchical") {
rectype_by_var <- fostr_split(xml2::xml_attr(var_info_xml, "rectype"), " ")
} else {
rectype_by_var <- NA
}
# Value labels
# Some come from parsed code sections
lbls_from_code_instr <- parse_labels_from_code_instr(code_instr, var_type)
# For hierarchical, RECTYPE comes from elsewhere in the DDI
if (file_type == "hierarchical") {
# If var is numeric, need to convert
rt_type <- var_type[var_name == rt_idvar]
if (length(rt_type) == 1 && rt_type %in% c("numeric", "integer")) {
rectype_labels$val <- suppressWarnings(as.numeric(rectype_labels$val))
}
rectype_labels <- dplyr::filter(rectype_labels, !is.na(.data$val))
rectype_labels <- dplyr::arrange(rectype_labels, .data$val)
# Replace in the code_instructions
if (nrow(rectype_labels) > 0) {
lbls_from_code_instr[[which(var_name == rt_idvar)]] <- rectype_labels
}
}
val_labels <- purrr::pmap(
list(var_info_xml, var_type, lbls_from_code_instr),
function(vvv, vtype, extra_labels) {
lbls <- xml2::xml_find_all(vvv, "d1:catgry")
if (length(lbls) == 0) {
return(extra_labels)
}
lbls <- tibble::tibble(
val = xml_text_from_path_all(lbls, "d1:catValu"),
lbl = xml_text_from_path_all(lbls, "d1:labl")
)
if (vtype %in% c("numeric", "integer")) lbls$val <- as.numeric(lbls$val)
# Drop labels that are the same as the value
# But leading 0's can be ignored if numeric
if (vtype %in% c("numeric", "integer")) {
lnum <- suppressWarnings(as.numeric(lbls$lbl))
lbls <- dplyr::filter(lbls, (is.na(lnum) | .data$val != lnum))
} else {
lbls <- dplyr::filter(lbls, .data$val != .data$lbl)
}
out <- dplyr::bind_rows(lbls, extra_labels)
dplyr::arrange(out, .data$val)
}
)
make_var_info_from_scratch(
var_name = var_name,
var_label = var_label,
var_desc = var_desc,
val_labels = val_labels,
code_instr = code_instr,
start = start,
end = end,
imp_decim = imp_decim,
var_type = var_type,
rectypes = rectype_by_var
)
}
#' Read metadata from an NHGIS codebook (.txt) file
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Read the variable metadata contained in the .txt codebook file included with
#' NHGIS extracts into an [ipums_ddi] object.
#'
#' Because NHGIS variable metadata do not
#' adhere to all the standards of microdata DDI files, some of the `ipums_ddi`
#' fields will not be populated.
#'
#' This function is marked as experimental while we determine whether
#' there may be a more robust way to standardize codebook and DDI reading across
#' IPUMS collections.
#'
#' @param cb_file Path to a .zip archive containing an NHGIS extract or to an
#' NHGIS codebook (.txt) file.
#' @param file_select If `cb_file` is a .zip archive or directory that contains
#' multiple codebook files, an expression identifying the file to read.
#' Accepts a character string specifying the file name, a
#' [tidyselect selection][selection_language], or an index position of the
#' file. Ignored if `cb_file` is the path to a single codebook file.
#' @param raw If `TRUE`, return a character vector containing the lines
#' of `cb_file` rather than an `ipums_ddi` object. Defaults to
#' `FALSE`.
#'
#' @return If `raw = FALSE`, an `ipums_ddi` object with information on the
#' variables contained in the data for the extract associated with the given
#' `cb_file`.
#'
#' If `raw = TRUE`, a character vector with one element for each
#' line of the given `cb_file`.
#'
#' @export
#'
#' @seealso [read_nhgis()] to read tabular data from an IPUMS NHGIS extract.
#'
#' [read_ipums_sf()] to read spatial data from an IPUMS extract.
#'
#' [ipums_list_files()] to list files in an IPUMS extract.
#'
#' @examples
#' # Example file
#' nhgis_file <- ipums_example("nhgis0972_csv.zip")
#'
#' # Read codebook as an `ipums_ddi` object:
#' codebook <- read_nhgis_codebook(nhgis_file)
#'
#' # Variable-level metadata about the contents of the data file:
#' ipums_var_info(codebook)
#'
#' ipums_var_label(codebook, "PMSA")
#'
#' # If variable metadata have been lost from a data source, reattach from
#' # the corresponding `ipums_ddi` object:
#' nhgis_data <- read_nhgis(nhgis_file, verbose = FALSE)
#'
#' nhgis_data <- zap_ipums_attributes(nhgis_data)
#' ipums_var_label(nhgis_data$PMSA)
#'
#' nhgis_data <- set_ipums_var_attributes(nhgis_data, codebook$var_info)
#' ipums_var_label(nhgis_data$PMSA)
#'
#' # You can also load the codebook in raw format to display in the console
#' codebook_raw <- read_nhgis_codebook(nhgis_file, raw = TRUE)
#'
#' # Use `cat` for human-readable output
#' cat(codebook_raw[1:20], sep = "\n")
read_nhgis_codebook <- function(cb_file,
file_select = NULL,
raw = FALSE) {
dir_read_deprecated(cb_file)
file_select <- enquo(file_select)
custom_check_file_exists(cb_file)
cb_name <- find_files_in(
cb_file,
"txt",
file_select = file_select,
multiple_ok = FALSE,
none_ok = FALSE
)
if (file_is_zip(cb_file)) {
cb <- readr::read_lines(unz(cb_file, cb_name), progress = FALSE)
} else if (file_is_dir(cb_file)) {
cb <- readr::read_lines(file.path(cb_file, cb_name), progress = FALSE)
} else {
cb <- readr::read_lines(cb_file, progress = FALSE)
}
if (raw) {
return(cb)
}
# Section markers are a line full of dashes
# (setting to 5+ to eliminate false positives)
section_markers <- which(fostr_detect(cb, "^[-]{5,}$"))
dd <- find_cb_section(cb, "^Data Dictionary$", section_markers)
context_start <- which(dd == "Context Fields ") + 1
context_end <- which(fostr_detect(dd, "^[[:blank:]]$")) - 1
context_end <- min(context_end[context_end > context_start])
context_rows <- seq(context_start, context_end)
context_vars <- fostr_named_capture(
dd[context_rows],
"(?<var_name>[[:alnum:]|[:punct:]]+):[[:blank:]]+(?<var_label>.+)$"
)
context_vars$var_desc <- ""
context_vars <- context_vars[!is.na(context_vars$var_name), ]
data_type_rows <- which(fostr_detect(dd, "(Data Type|Breakdown)"))
blank_rows <- which(fostr_detect(dd, "^[[:blank:]]+$"))
data_types <- dd[data_type_rows]
# If multiple data types, process variable info for each data
# type separately.
if (length(data_type_rows) > 0) {
data_type_sections <- purrr::map(
data_type_rows,
~ seq(.x, blank_rows[which(.x <= blank_rows)[1]] - 1)
)
# Combine multiple lines of data type/breakdown value info into single
# string to attach to var info
data_types <- purrr::map(
data_type_sections,
~ parse_breakdown(
dd[.x][length(.x):2] # Go in reverse so data types come before brkdowns
)
)
data_type_rows <- purrr::map2(
data_type_rows,
c(data_type_rows[-1], length(dd) + 1),
~ seq(.x, .y - 1)
)
table_name_rows <- purrr::map(
data_type_rows,
~ .x[fostr_detect(dd[.x], "^[[:blank:]]*(Table)|(Data Type)")]
)
table_sections <- purrr::map2(
data_type_rows,
table_name_rows,
function(dt, tn) {
purrr::map2(
tn,
c(tn[-1], max(dt) + 1),
~ seq(.x, .y - 1)
)
}
)
table_sections <- purrr::flatten(
purrr::map2(table_sections, data_types, ~ set_names(.x, .y))
)
if (any(fostr_detect(cb, "^Time series layout:"))) {
table_vars <- purrr::map_dfr(
table_sections,
~ read_nhgis_tst_tables(dd, .x)
)
} else {
table_vars <- purrr::map2_dfr(
table_sections,
names(table_sections),
~ read_nhgis_ds_tables(dd, .x, data_type = .y)
)
}
} else {
table_name_rows <- which(
fostr_detect(dd, "^[[:blank:]]*(Table)|(Data Type)")
)
table_sections <- purrr::map2(
table_name_rows,
c(table_name_rows[-1], length(dd)),
~ seq(.x, .y - 1)
)
if (any(fostr_detect(cb, "^Time series layout:"))) {
table_vars <- purrr::map_dfr(
table_sections,
~ read_nhgis_tst_tables(dd, .x)
)
} else {
table_vars <- purrr::map_dfr(
table_sections,
~ read_nhgis_ds_tables(dd, .x)
)
}
}
var_info <- make_var_info_from_scratch(
var_name = c(context_vars$var_name, table_vars$var_name),
var_label = c(context_vars$var_label, table_vars$var_label),
var_desc = c(context_vars$var_desc, table_vars$var_desc)
)
# Get License and Condition section
conditions_text <- find_cb_section(
cb,
"^Citation and Use of .+ Data",
section_markers
)
conditions_text <- paste(conditions_text, collapse = "\n")
new_ipums_ddi(
file_name = cb_name,
file_type = "rectangular",
ipums_project = "NHGIS",
var_info = var_info,
conditions = conditions_text
)
}
#' Parse NHGIS codebook lines with data type or breakdown info
#'
#' @description
#' Extracts the apporpriate name for data types and breakdown values from an
#' NHGIS codebook for maximal similarity to the content of NHGIS enhanced
#' header rows.
#'
#' Lines with a double colon ("::") have titles following the double colon.
#' Lines with single colons have titles following the first single colon.
#' Lines with no colons are typically data types and should have the entire
#' line extracted.
#'
#' @param bkdown_lines Lines corresponding to a single data type or breakdown
#' section of the codebook. Typically start with "Breakdown" or "Data Type"
#'
#' @return Character vector of length `bkdown_lines` with the extracted
#' data type or breakdown titles
#'
#' @noRd
parse_breakdown <- function(bkdown_lines) {
dt_bkdwn <- purrr::map(
bkdown_lines,
~ if (fostr_detect(.x, "::")) {
fostr_named_capture(.x, "::(?<x>[^\\(]+)")$x
} else if (fostr_detect(.x, ":")) {
fostr_named_capture(.x, ":(?<x>[^\\(]+)")$x
} else {
.x
}
)
paste0(trimws(dt_bkdwn), collapse = ": ")
}
#' Helper function to read codebook information for an NHGIS
#' extract that contains time series tables.
#'
#' @param dd Character vector of lines contained in the codebook's
#' "Data Dictionary" section.
#' @param table_rows Indices of rows that include table variable information
#' within the provided `dd`.
#'
#' @return tibble of variable information
#'
#' @noRd
read_nhgis_tst_tables <- function(dd, table_rows) {
table_name_and_code <- fostr_named_capture(
dd[table_rows[1]],
paste0(
"^[[:blank:]]*Table .+?:[[:blank:]]+\\((?<table_code>.+?)\\)",
"[[:blank:]]+(?<table_name>.+)$"
)
)
nhgis_table_code <- table_name_and_code$table_code
table_name <- table_name_and_code$table_name
time_series_headers <- fostr_detect(
dd[table_rows],
"^[[:blank:]]+Time series"
)
vars <- dd[table_rows][!time_series_headers]
vars <- vars[-1] # First row was table name/code
vars <- fostr_named_capture(
vars,
"(?<var_name>[[:alnum:]|[:punct:]]+):[[:blank:]]+(?<var_label>.+)$",
only_matches = TRUE
)
vars$var_desc <- paste0("Table ", nhgis_table_code, ": ", table_name)
vars
}
#' Helper function to read codebook information for an NHGIS
#' extract that contains datasets.
#'
#' @param dd Character vector of lines contained in the codebook's
#' "Data Dictionary" section.
#' @param table_rows Indices of rows that include table variable information
#' within the provided `dd`.
#'
#' @return tibble of variable information
#'
#' @noRd
read_nhgis_ds_tables <- function(dd, table_rows, data_type = NULL) {
if (fostr_detect(dd[table_rows[1]], "Data Type")) {
rows <- dd[table_rows]
# Start at first blank row. Rows before this are part of the
# Data type or breakdown value, not variables.
rows <- rows[seq(
min(which(fostr_detect(rows, "^[[:blank:]]+$"))),
length(rows)
)]
vars <- fostr_named_capture(
rows,
"(?<var_name>[[:alnum:]|[:punct:]]+):[[:blank:]]+(?<var_label>.+)$",
only_matches = TRUE
)
vars$var_desc <- ""
} else {
table_name <- fostr_named_capture_single(
dd[table_rows[1]],
"^[[:blank:]]*Table .+?:[[:blank:]]+(?<table_name>.+)$"
)
universe <- fostr_named_capture_single(
dd[table_rows[2]],
"^[[:blank:]]*Universe:[[:blank:]]+(?<universe>.+)$"
)
nhgis_table_code <- fostr_named_capture_single(
dd[table_rows[4]],
"^[[:blank:]]*NHGIS code:[[:blank:]]+(?<table_code>.+)$"
)
vars <- fostr_named_capture(
dd[table_rows[-1:-4]],
"(?<var_name>[[:alnum:]|[:punct:]]+):[[:blank:]]+(?<var_label>.+)$",
only_matches = TRUE
)
if (!is_null(data_type)) {
vars$var_label <- paste0(data_type, ": ", vars$var_label)
}
vars$var_desc <- paste0(
"Table ", nhgis_table_code, ": ", table_name,
" (Universe: ", universe, ")"
)
}
vars
}
find_cb_section <- function(cb_text, section, section_markers) {
start <- which(fostr_detect(cb_text, section))
start <- 2 + start[
start - 1 %in% section_markers & start + 1 %in% section_markers
]
end <- min(c(length(cb_text), section_markers[section_markers > start])) - 1
cb_text[seq(start, end)]
}
new_ipums_ddi <- function(file_name = NULL,
file_path = NULL,
file_type = NULL,
ipums_project = NULL,
extract_date = NULL,
extract_notes = NULL,
rectypes = NULL,
rectype_idvar = NULL,
rectypes_keyvars = NULL,
var_info = NULL,
conditions = NULL,
citation = NULL,
file_encoding = NULL) {
ddi <- list(
file_name = file_name,
file_path = file_path,
file_type = file_type,
ipums_project = ipums_project,
extract_date = extract_date,
extract_notes = extract_notes,
rectypes = rectypes,
rectype_idvar = rectype_idvar,
rectypes_keyvars = rectypes_keyvars,
var_info = var_info,
conditions = conditions,
citation = citation,
file_encoding = file_encoding
)
structure(ddi, class = "ipums_ddi")
}
make_var_info_from_scratch <- function(var_name = "",
var_label = "",
var_desc = "",
val_labels = NULL,
code_instr = "",
start = NA,
end = NA,
imp_decim = 0,
var_type = "",
rectypes = NA) {
val_labels <- val_labels %||% list(
tibble::tibble(val = numeric(0), lbl = character(0))
)
tibble::tibble(
var_name = var_name,
var_label = var_label,
var_desc = var_desc,
val_labels = val_labels,
code_instr = code_instr,
start = start,
end = end,
imp_decim = imp_decim,
var_type = var_type,
rectypes = rectypes
)
}
make_empty_labels <- function(vt) {
tibble::tibble(
val = if (vt == "character") character(0) else numeric(0),
lbl = character(0)
)
}
# Helper to get labels out of free text from codInstr in xml
parse_labels_from_code_instr <- function(code, var_type) {
purrr::map2(
code,
var_type,
function(x, vt) {
if (is.na(x)) {
return(make_empty_labels(vt))
}
lines <- fostr_split(x, "\n")[[1]]
labels <- parse_code_regex(lines, vt)
dplyr::arrange(labels, .data$val)
}
)
}
parse_code_regex <- function(x, vtype) {
if (vtype %in% c("numeric", "integer")) {
labels <- fostr_named_capture(
x,
"^(?<val>-?[0-9.,]+)(([[:blank:]][[:punct:]]|[[:punct:]][[:blank:]]|[[:blank:]]|=)+)(?<lbl>.+?)$",
only_matches = TRUE
)
labels$val <- as.numeric(fostr_replace_all(labels$val, ",", ""))
} else {
labels <- fostr_named_capture(
x,
"^(?<val>[[:graph:]]+)(([[:blank:]]+[[:punct:]|=]+[[:blank:]])+)(?<lbl>.+)$",
only_matches = TRUE
)
}
labels
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.