R/agg_read.R

Defines functions ipums_agg_empty_ddi warn_default_fwf_parsing convert_col_types parse_col_positions parse_col_recode parse_nhgis_do_file check_header_row read_ihgis_codebook_safe read_nhgis_codebook_safe read_nhgis_csv read_nhgis_fwf read_nhgis read_ipums_agg

Documented in read_ipums_agg read_nhgis

# This file is part of the ipumsr R package created by IPUMS.
# For copyright and licensing information, see the NOTICE and LICENSE files
# in this project's top-level directory, and also on-line at:
#   https://github.com/ipums/ipumsr

#' Read data from an IPUMS aggregate data extract
#'
#' @description
#' Read a .csv file from an extract downloaded from an IPUMS aggregate
#' data collection (IPUMS NHGIS or IPUMS IHGIS).
#'
#' To read spatial data from an NHGIS extract, use [read_ipums_sf()].
#'
#' @param data_file Path to a .zip archive containing an IPUMS NHGIS or
#'   IPUMS IHGIS extract or a single .csv file from such an extract.
#' @param file_select If `data_file` is a .zip archive that
#'   contains multiple files, an expression identifying the file to load.
#'   Accepts a character vector specifying the
#'   file name, a [tidyselect selection][selection_language], or an index
#'   position. This must uniquely identify a file.
#' @param vars Names of variables to include in the output. Accepts a
#'   vector of names or a [tidyselect selection][selection_language].
#'   If `NULL`, includes all variables in the file.
#' @param col_types One of `NULL`, a [`cols()`][readr::cols]
#'   specification or a string. If `NULL`, all column types will be inferred
#'   from the values in the first `guess_max` rows of each column.
#'   Alternatively, you can use a compact string representation to specify
#'   column types:
#'
#'   - c = character
#'   - i = integer
#'   - n = number
#'   - d = double
#'   - l = logical
#'   - f = factor
#'   - D = date
#'   - T = date time
#'   - t = time
#'   - ? = guess
#'   - _ or - = skip
#'
#'   See [`read_delim()`][readr::read_delim] for more details.
#' @param n_max Maximum number of lines to read.
#' @param guess_max For .csv files, maximum number of lines to use for guessing
#'   column types. Will never use more than the number of lines read.
#' @param var_attrs Variable attributes to add from the codebook (.txt) file
#'   included in the extract. Defaults to all available attributes.
#'
#'   See [`set_ipums_var_attributes()`] for more details.
#' @param remove_extra_header If `TRUE`, remove the additional descriptive
#'   header row included in some NHGIS .csv files.
#'
#'   This header row is not
#'   usually needed as it contains similar information to that
#'   included in the `"label"` attribute of each data column (if `var_attrs`
#'   includes `"var_label"`).
#' @param file_encoding Encoding for the file to be loaded. For NHGIS extracts,
#'   defaults to ISO-8859-1. For IHGIS extracts, defaults to UTF-8. If the
#'   default encoding produces unexpected characters, adjust the encoding here.
#' @param verbose Logical controlling whether to display output when loading
#'   data. If `TRUE`, displays IPUMS conditions, a progress bar, and
#'   column types. Otherwise, all are suppressed.
#'
#'   Will be overridden by `readr.show_progress` and `readr.show_col_types`
#'   options, if they are set.
#'
#' @return A [`tibble`][tibble::tbl_df-class] containing the data found in
#'   `data_file`
#'
#' @export
#'
#' @seealso
#' [read_ipums_sf()] to read spatial data from an IPUMS extract.
#'
#' [read_nhgis_codebook()] or [read_ihgis_codebook()] to read metadata about
#' an IPUMS aggregate data extract.
#'
#' [ipums_list_files()] to list files in an IPUMS extract.
#'
#' @examples
#' nhgis_file <- ipums_example("nhgis0972_csv.zip")
#' ihgis_file <- ipums_example("ihgis0014.zip")
#'
#' # Provide the .zip archive directly to load the data inside:
#' read_ipums_agg(nhgis_file)
#'
#' # For extracts that contain multiple files, use `file_select` to specify
#' # a single file to load. This accepts a tidyselect expression:
#' read_ipums_agg(ihgis_file, file_select = matches("AAA_g0"), verbose = FALSE)
#'
#' # Or an index position:
#' read_ipums_agg(ihgis_file, file_select = 2, verbose = FALSE)
#'
#' # Variable metadata is automatically attached to data, if available
#' ihgis_data <- read_ipums_agg(ihgis_file, file_select = 2, verbose = FALSE)
#' ipums_var_info(ihgis_data)
#'
#' # Column types are inferred from the data. You can
#' # manually specify column types with `col_types`. This may be useful for
#' # geographic codes, which should typically be interpreted as character values
#' read_ipums_agg(nhgis_file, col_types = list(MSA_CMSAA = "c"), verbose = FALSE)
#'
#' # You can also read in a subset of the data file:
#' read_ipums_agg(
#'   nhgis_file,
#'   n_max = 15,
#'   vars = c(GISJOIN, YEAR, D6Z002),
#'   verbose = FALSE
#' )
read_ipums_agg <- function(data_file,
                           file_select = NULL,
                           vars = NULL,
                           col_types = NULL,
                           n_max = Inf,
                           guess_max = min(n_max, 1000),
                           var_attrs = c("val_labels", "var_label", "var_desc"),
                           remove_extra_header = TRUE,
                           file_encoding = NULL,
                           verbose = TRUE) {
  if (length(data_file) != 1) {
    rlang::abort("`data_file` must be length 1")
  }

  # Defuse the selection so it can be forwarded unevaluated to the helpers
  file_select <- enquo(file_select)

  custom_check_file_exists(data_file)

  # Name of the single data .csv to load (metadata .csv files are excluded)
  csv_name <- find_files_in(
    data_file,
    name_ext = "csv",
    file_select = file_select,
    pattern_exclude = "(_datadict|_geog|_tables)\\.csv$",
    multiple_ok = FALSE,
    none_ok = FALSE
  )

  # Inputs whose base name starts with "nhgis" go through the NHGIS codebook
  # reader. Everything else is attempted as IHGIS, because IHGIS file names
  # are not guaranteed to identify the collection. A missing codebook is only
  # warned about when variable attributes were requested.
  is_nhgis <- fostr_detect(basename(data_file), "^nhgis")

  codebook <- if (is_nhgis) {
    read_nhgis_codebook_safe(
      data_file,
      filename = csv_name,
      verbose = !is_null(var_attrs),
      file_encoding = file_encoding
    )
  } else {
    read_ihgis_codebook_safe(
      data_file,
      verbose = !is_null(var_attrs),
      file_encoding = file_encoding
    )
  }

  # Turn the selected file name into something readr can read from:
  # a connection into the archive, a path inside the directory, or the
  # name as returned above.
  csv_source <- csv_name

  if (file_is_zip(data_file)) {
    csv_source <- unz(data_file, csv_name)
  } else if (file_is_dir(data_file)) {
    csv_source <- file.path(data_file, csv_name)
  }

  header <- check_header_row(data_file, file_select = !!file_select)

  # The column-name row is always skipped (names are re-supplied through
  # `col_names` below); the descriptive header row is skipped as well when
  # it is present and its removal was requested.
  n_skip <- if (header$has_extra_header && remove_extra_header) 2 else 1

  if (verbose) {
    message(short_conditions_text(codebook))
  }

  out <- readr::read_csv(
    csv_source,
    skip = n_skip,
    col_select = !!enquo(vars),
    col_names = header$col_names,
    locale = ipums_locale(codebook$file_encoding),
    progress = show_readr_progress(verbose),
    show_col_types = show_readr_coltypes(verbose),
    col_types = col_types,
    n_max = n_max,
    guess_max = guess_max,
    na = c("", "NA")
  )

  out <- set_ipums_var_attributes(out, codebook$var_info, var_attrs)

  out
}

#' Read tabular data from an NHGIS extract
#'
#' @description
#' `r lifecycle::badge("deprecated")`
#'
#' Read a .csv or fixed-width (.dat) file downloaded from the NHGIS extract
#' system.
#'
#' This function has been deprecated in favor of [read_ipums_agg()], which
#' can read .csv files from both IPUMS aggregate data collections
#' (IPUMS NHGIS and IPUMS IHGIS). Please use that function instead.
#'
#' Note that fixed-width file reading is not supported in `read_ipums_agg()` and
#' will likely be retired with `read_nhgis()`. We therefore encourage you to
#' create NHGIS extracts in .csv format going forward. For previously-submitted
#' fixed-width extracts, we suggest
#' regenerating them in .csv format and loading them with `read_ipums_agg()`.
#' Use the `data_format` argument of [define_extract_agg()] to create a
#' .csv extract for submission via the IPUMS API.
#'
#' To read spatial data from an NHGIS extract, use [read_ipums_sf()].
#'
#' @details
#' The .do file that is included when downloading an NHGIS fixed-width
#' extract contains the necessary metadata (e.g. column positions and implicit
#' decimals) to correctly parse the data file. `read_nhgis()` uses this
#' information to parse and recode the fixed-width data appropriately.
#'
#' If you no longer have access to the .do file, consider resubmitting the
#' extract that produced the data. You can also change the desired data
#' format to produce a .csv file, which does not require additional metadata
#' files to be loaded.
#'
#' For more about resubmitting an existing extract via the IPUMS API, see
#' `vignette("ipums-api", package = "ipumsr")`.
#'
#' @inheritParams read_ipums_agg
#' @param data_file Path to a .zip archive containing an NHGIS extract or
#'   a single file from an NHGIS extract.
#' @param do_file For fixed-width files, path to the .do file associated with
#'   the provided `data_file`. The .do file contains the parsing instructions
#'   for the data file.
#'
#'   By default, looks in the same path as `data_file` for
#'   a .do file with the same name. See Details section below.
#'
#' @return A [`tibble`][tibble::tbl_df-class] containing the data found in
#'   `data_file`
#'
#' @seealso
#' [read_ipums_sf()] to read spatial data from an IPUMS extract.
#'
#' [read_nhgis_codebook()] to read metadata about an IPUMS NHGIS extract.
#'
#' [ipums_list_files()] to list files in an IPUMS extract.
#'
#' @export
#'
#' @keywords internal
#'
#' @examples
#' # Example files
#' csv_file <- ipums_example("nhgis0972_csv.zip")
#' fw_file <- ipums_example("nhgis0730_fixed.zip")
#'
#' # Previously:
#' read_nhgis(csv_file)
#'
#' # For CSV files, please update to use the following:
#' read_ipums_agg(csv_file)
#'
#' # Fixed-width files are parsed with the correct column positions
#' # and column types automatically:
#' read_nhgis(fw_file, file_select = contains("ts"), verbose = FALSE)
read_nhgis <- function(data_file,
                       file_select = NULL,
                       vars = NULL,
                       col_types = NULL,
                       n_max = Inf,
                       guess_max = min(n_max, 1000),
                       do_file = NULL,
                       var_attrs = c("val_labels", "var_label", "var_desc"),
                       remove_extra_header = TRUE,
                       verbose = TRUE) {
  lifecycle::deprecate_warn("0.9.0", "read_nhgis()", "read_ipums_agg()")

  if (length(data_file) != 1) {
    rlang::abort("`data_file` must be length 1")
  }

  # Defuse the selection so it can be forwarded unevaluated to the readers
  file_select <- enquo(file_select)

  custom_check_file_exists(data_file)

  # List every candidate data file; `file_select` is applied later by the
  # format-specific reader.
  candidates <- find_files_in(
    data_file,
    name_ext = "csv|dat",
    pattern_exclude = "(_datadict|_geog|_tables)\\.csv$",
    multiple_ok = TRUE,
    none_ok = TRUE
  )

  found_csv <- any(grepl("[.]csv$", candidates))
  found_dat <- any(grepl("[.]dat$", candidates))

  if (!(found_csv || found_dat)) {
    rlang::abort("No .csv or .dat files found in the provided `data_file`.")
  }

  if (found_csv && found_dat) {
    rlang::abort(c(
      "Both .csv and .dat files found in the provided `data_file`.",
      "x" = paste0(
        "Only one type of data file can be present in the provided `data_file`."
      )
    ))
  }

  # Exactly one format is present from here on: dispatch on it
  out <- if (found_csv) {
    read_ipums_agg(
      data_file,
      file_select = !!file_select,
      vars = !!enquo(vars),
      col_types = col_types,
      n_max = n_max,
      guess_max = guess_max,
      var_attrs = var_attrs,
      remove_extra_header = remove_extra_header,
      verbose = verbose
    )
  } else {
    read_nhgis_fwf(
      data_file,
      file_select = !!file_select,
      col_types = col_types,
      col_select = !!enquo(vars),
      var_attrs = var_attrs,
      do_file = do_file,
      n_max = n_max,
      verbose = verbose,
      na = c(".", "", "NA")
    )
  }

  out
}

# Internal ---------------------

# Read a fixed-width (.dat) data file from an NHGIS extract.
#
# Steps: locate the single .dat file matching `file_select`, parse the
# associated .do file for column positions, column types and recode
# expressions, read the data with `readr::read_fwf()`, apply the recode
# expressions, and attach variable attributes from the codebook.
#
# data_file:   Path to a .zip archive, a directory, or a .dat file.
# file_select: Selection identifying the .dat file to load.
# do_file:     Path to the .do file. If `NULL`, a .do file with the same name
#              as the data file is looked up next to it. When `data_file` is
#              a .zip archive, a provided `do_file` is resolved against the
#              location of the extracted data file.
# col_types:   Column types; if `NULL`, the types parsed from the .do file
#              are used.
# var_attrs:   Variable attributes to attach from the codebook.
# verbose:     Whether to print IPUMS conditions and readr output.
# ...:         Further arguments forwarded to `readr::read_fwf()`.
#
# Returns the data as produced by `set_ipums_var_attributes()`.
read_nhgis_fwf <- function(data_file,
                           file_select = NULL,
                           do_file = NULL,
                           col_types = NULL,
                           var_attrs = c("val_labels", "var_label", "var_desc"),
                           verbose = TRUE,
                           ...) {
  file_select <- enquo(file_select)

  file <- find_files_in(
    data_file,
    name_ext = "dat",
    file_select = file_select,
    multiple_ok = FALSE,
    none_ok = FALSE
  )

  if (file_is_zip(data_file)) {
    # Cannot use fwf_empty() col_positions on an unz() connection
    # Must unzip file to allow for default fwf_empty() specification
    fwf_dir <- tempfile()

    # Remove the extracted copy when this function exits, however it exits
    on.exit(
      unlink(fwf_dir, recursive = TRUE),
      add = TRUE,
      after = FALSE
    )

    utils::unzip(data_file, exdir = fwf_dir)

    file <- file.path(fwf_dir, file)
  } else if (file_is_dir(data_file)) {
    file <- file.path(data_file, file)
  }

  # Remember whether the .do path was supplied, to tailor the error below
  default_do_file <- is_null(do_file)

  # Assume that a provided `do_file` is a relative path if `data_file` is
  # zipped. Otherwise, full path must be provided.
  if (file_is_zip(data_file)) {
    do_file <- do_file %||% fostr_replace(basename(file), "\\.dat$", ".do")
    do_file <- fostr_replace(file, basename(file), do_file)
  } else {
    do_file <- do_file %||% fostr_replace(file, "\\.dat$", ".do")
  }

  if (!file.exists(do_file)) {
    if (!default_do_file) {
      rlang::abort("Could not find the provided `do_file`.")
    }

    rlang::abort(c(
      "Could not find a .do file associated with the provided data file.",
      "i" = "Use the `do_file` argument to provide the path to the .do file."
    ))
  }

  # Any failure while parsing the .do file is reported with one uniform,
  # actionable message instead of the low-level parsing error.
  col_spec <- tryCatch(
    parse_nhgis_do_file(do_file),
    error = function(cnd) {
      rlang::abort(
        c(
          "Unexpected error parsing .do file",
          "x" = paste0(
            "This may occur if files have been reorganized from the original",
            " .zip format provided by NHGIS."
          ),
          "i" = paste0(
            "Check that `file_select` matches the intended file ",
            "or consider re-downloading this extract in .csv format."
          )
        ),
        call = expr(read_nhgis_fwf())
      )
    }
  )

  cb_ddi_info <- read_nhgis_codebook_safe(
    data_file,
    filename = file,
    verbose = !is_null(var_attrs)
  )

  if (verbose) {
    message(short_conditions_text(cb_ddi_info))
  }

  data <- readr::read_fwf(
    file,
    col_positions = col_spec$col_positions,
    col_types = col_types %||% col_spec$col_types,
    locale = ipums_locale(cb_ddi_info$file_encoding),
    progress = show_readr_progress(verbose),
    show_col_types = show_readr_coltypes(verbose),
    ...
  )

  col_recode <- col_spec$col_recode

  if (!is_null(col_recode)) {
    # Rescale column values based on expressions in .do file.
    # A plain loop updates `data` in this frame directly, so no
    # super-assignment from a nested closure is needed.
    for (i in seq_along(col_recode$cols)) {
      col <- col_recode$cols[[i]]

      # Columns that were not loaded are skipped
      if (is_null(data[[col]])) {
        next
      }

      # Coerce to numeric to guard against user-specified col_types
      data[[col]] <- as.numeric(data[[col]])
      data[[col]] <- eval(col_recode$exprs[[i]], data)
    }
  }

  data <- set_ipums_var_attributes(data, cb_ddi_info$var_info, var_attrs)

  data
}

read_nhgis_csv <- function(data_file,
                           file_select = NULL,
                           var_attrs = c("val_labels", "var_label", "var_desc"),
                           remove_extra_header = TRUE,
                           verbose = TRUE,
                           ...) {
  # Defuse the selection so it can be forwarded unevaluated to the helpers
  file_select <- enquo(file_select)

  # Name of the single data .csv to load (metadata .csv files are excluded)
  csv_name <- find_files_in(
    data_file,
    name_ext = "csv",
    file_select = file_select,
    pattern_exclude = "(_datadict|_geog|_tables)\\.csv$",
    multiple_ok = FALSE,
    none_ok = FALSE
  )

  # A missing codebook is only warned about when attributes were requested
  codebook <- read_nhgis_codebook_safe(
    data_file,
    filename = csv_name,
    verbose = !is_null(var_attrs)
  )

  if (verbose) {
    message(short_conditions_text(codebook))
  }

  # Turn the selected file name into something readr can read from
  csv_source <- csv_name

  if (file_is_zip(data_file)) {
    csv_source <- unz(data_file, csv_name)
  } else if (file_is_dir(data_file)) {
    csv_source <- file.path(data_file, csv_name)
  }

  header <- check_header_row(data_file, file_select = !!file_select)

  # The column-name row is always skipped (names are re-supplied through
  # `col_names` below); the descriptive header row is skipped as well when
  # it is present and its removal was requested.
  n_skip <- if (header$has_extra_header && remove_extra_header) 2 else 1

  out <- readr::read_csv(
    csv_source,
    skip = n_skip,
    col_names = header$col_names,
    locale = ipums_locale(codebook$file_encoding),
    progress = show_readr_progress(verbose),
    show_col_types = show_readr_coltypes(verbose),
    ...
  )

  out <- set_ipums_var_attributes(out, codebook$var_info, var_attrs)

  out
}

#' Load a codebook associated with a provided NHGIS data file
#'
#' Helper to load a codebook associated with a provided data file.
#' This is designed to handle a codebook that is bundled with a data file when
#' loading that data file. To load a codebook .txt file directly, see
#' `read_nhgis_codebook()`.
#'
#' This function is able to identify the correct codebook for a given data
#' file regardless of whether the data file is zipped or is the direct path
#' to a file. Codebooks are matched to data files by name, where the codebook
#' has the same file name with `_codebook` appended.
#'
#' An empty codebook is provided if no matching codebook can be found.
#'
#' @param data_file Path to a data file or a .zip archive containing an NHGIS
#'   extract.
#' @param filename Name of the .csv or .dat file to be loaded within the
#'   `data_file` zip archive. This allows codebooks to be identified when
#'   `data_file` contains multiple files.
#'
#'   We do not use `file_select` directly because it would not capture
#'   the case in which a string is provided containing the full .csv or .dat
#'   file name.
#' @param file_encoding Encoding stored in the `file_encoding` field of the
#'   returned object. If `NULL`, `"ISO-8859-1"` is used.
#' @param verbose Logical indicating whether to warn if codebook cannot be
#'   loaded.
#'
#' @return An `ipums_ddi` object
#'
#' @noRd
read_nhgis_codebook_safe <- function(data_file,
                                     filename,
                                     file_encoding = NULL,
                                     verbose = FALSE) {
  cb_files <- find_files_in(
    data_file,
    name_ext = "txt",
    multiple_ok = TRUE,
    none_ok = TRUE
  )

  # Treat the codebook as unread until one of the attempts below succeeds.
  # In particular, when no .txt files are found we go straight to the
  # fallback instead of referencing an undefined `cb_file`.
  cb_error <- TRUE

  if (length(cb_files) > 0) {
    # If any .txt files, find the one with the same base name as
    # the file being loaded. The extension pattern is anchored so that only
    # the trailing extension is stripped from the file name.
    data_name <- fostr_replace(
      basename(filename),
      paste0(ipums_file_ext(filename), "$"),
      ""
    )

    cb_file <- fostr_subset(cb_files, data_name)

    cb_ddi_info <- try(
      read_nhgis_codebook(data_file, file_select = tidyselect::all_of(cb_file)),
      silent = TRUE
    )

    cb_error <- inherits(cb_ddi_info, "try-error")
  }

  if (cb_error) {
    # If error, a direct file path may have been provided.
    # Attempt to load codebook for file with same base name in same directory
    cb_path <- fostr_replace(
      data_file,
      paste0(ipums_file_ext(data_file), "$"),
      "_codebook.txt"
    )

    cb_ddi_info <- try(
      read_nhgis_codebook(cb_path),
      silent = TRUE
    )

    cb_error <- inherits(cb_ddi_info, "try-error")
  }

  # If still no codebook info, return empty DDI
  if (cb_error) {
    cb_ddi_info <- ipums_agg_empty_ddi("nhgis")

    if (verbose) {
      rlang::warn(
        c(
          "Unable to read codebook associated with this file.",
          "i" = "To load a codebook manually, use `read_nhgis_codebook()`.",
          "i" = paste0(
            "To attach codebook information to loaded data, ",
            "use `set_ipums_var_attributes()`."
          )
        )
      )
    }
  }

  # Specify encoding. Unless overridden, all NHGIS extracts are assumed to be
  # ISO-8859-1 (i.e. latin1): an extract with county names contains n with
  # tildes, which allowed this encoding to be verified.
  cb_ddi_info$file_encoding <- file_encoding %||% "ISO-8859-1"

  cb_ddi_info
}

# Load the IHGIS codebook that accompanies `data_file`, falling back to an
# empty DDI object (optionally with a warning) when none can be read.
# The returned object's `file_encoding` is `file_encoding`, or "UTF-8" when
# that is `NULL`.
read_ihgis_codebook_safe <- function(data_file,
                                     file_encoding = NULL,
                                     verbose = FALSE) {
  # First attempt: `data_file` is itself something the codebook reader accepts
  codebook <- try(read_ihgis_codebook(data_file), silent = TRUE)

  if (inherits(codebook, "try-error")) {
    # Second attempt: `data_file` is probably the path to a .csv data file,
    # which the codebook reader cannot take directly. Look for a data
    # dictionary file in the same directory and read that instead.
    data_dir <- dirname(data_file)

    dd_name <- find_files_in(
      data_dir,
      name_ext = "csv$",
      file_select = quo(tidyselect::matches("_datadict")),
      multiple_ok = TRUE,
      none_ok = TRUE
    )

    dd_path <- file.path(data_dir, dd_name)

    codebook <- try(read_ihgis_codebook(dd_path), silent = TRUE)
  }

  if (inherits(codebook, "try-error")) {
    # Both attempts failed: the necessary files are probably missing, so
    # hand back an empty DDI.
    codebook <- ipums_agg_empty_ddi("ihgis")

    if (verbose) {
      rlang::warn(
        c(
          "Unable to read codebook associated with this file.",
          "i" = "To load a codebook manually, use `read_ihgis_codebook()`.",
          "i" = paste0(
            "To attach codebook information to loaded data, ",
            "use `set_ipums_var_attributes()`."
          )
        )
      )
    }
  }

  codebook$file_encoding <- file_encoding %||% "UTF-8"

  codebook
}

# Inspect the first data row of the selected .csv file.
#
# Returns a list with:
#   has_extra_header: TRUE when every column of that row was guessed to be
#                     character, which is taken to indicate the NHGIS
#                     "expanded" descriptive header row
#   header_vals:      the values of that row as an unnamed vector
#   col_names:        the column names of the file
check_header_row <- function(data_file, file_select = NULL) {
  file_select <- enquo(file_select)

  csv_name <- find_files_in(
    data_file,
    name_ext = "csv",
    file_select = file_select,
    pattern_exclude = "(_datadict|_geog|_tables)\\.csv$",
    multiple_ok = FALSE,
    none_ok = FALSE
  )

  # Turn the selected file name into something readr can read from
  csv_source <- csv_name

  if (file_is_zip(data_file)) {
    csv_source <- unz(data_file, csv_name)
  } else if (file_is_dir(data_file)) {
    csv_source <- file.path(data_file, csv_name)
  }

  # Only one row is needed; column types are guessed from that row alone
  first_row <- readr::read_csv(
    csv_source,
    n_max = 1,
    col_types = readr::cols(.default = readr::col_guess()),
    progress = FALSE,
    show_col_types = FALSE,
    na = c("", "NA")
  )

  all_character <- all(vapply(first_row, is.character, logical(1)))

  list(
    has_extra_header = all_character,
    header_vals = unname(unlist(first_row)),
    col_names = colnames(first_row)
  )
}

# Parse a .do file into the pieces needed to read a fixed-width file:
# column types, column positions and (possibly `NULL`) recode instructions.
parse_nhgis_do_file <- function(file) {
  # Leading/trailing whitespace is dropped so the line-start patterns used
  # by the parsers below can match
  lines <- trimws(readr::read_lines(file, progress = FALSE))

  positions <- parse_col_positions(lines)

  list(
    col_types = positions[["col_types"]],
    col_positions = positions[["col_positions"]],
    col_recode = parse_col_recode(lines)
  )
}

# Extract recode instructions from the lines of a .do file.
#
# Every line starting with "replace" is split at its "=" into a column name
# and an expression. Returns `NULL` when there are no such lines, otherwise
# a list with `cols` (upper-cased column names) and `exprs` (parsed,
# upper-cased expressions).
parse_col_recode <- function(do_lines) {
  replace_idx <- grep("^replace", do_lines)

  if (length(replace_idx) == 0) {
    return(NULL)
  }

  statements <- fostr_replace(do_lines[replace_idx], "^replace ", "")

  # Split "<COLUMN> = <EXPRESSION>" around the equals sign
  parts <- fostr_split(toupper(statements), "( +)?=( +)?")

  targets <- purrr::map_chr(parts, 1)
  expr_text <- purrr::map_chr(parts, 2)

  list(
    cols = targets,
    exprs = rlang::parse_exprs(expr_text)
  )
}

# Extract column types and positions from the lines of a .do file.
#
# The column definitions are the lines between the one starting with
# "quietly" and the one starting with "using". Each definition is split on
# runs of two or more whitespace characters into a type, a name and a
# "start-end" position range.
parse_col_positions <- function(do_lines) {
  first <- grep("^quietly", do_lines) + 1
  last <- grep("^using", do_lines) - 1

  fields <- fostr_split(do_lines[first:last], "\\s{2,}")

  readr_types <- convert_col_types(purrr::map_chr(fields, 1))
  var_names <- toupper(purrr::map_chr(fields, 2))
  ranges <- fostr_split(purrr::map_chr(fields, 3), "-")

  range_start <- as.numeric(purrr::map_chr(ranges, 1))
  range_end <- as.numeric(purrr::map_chr(ranges, 2))

  list(
    col_types = readr_types,
    col_positions = readr::fwf_positions(range_start, range_end, var_names)
  )
}

# Translate the type names used in a .do file into a compact readr column
# type string (one letter per column, concatenated).
convert_col_types <- function(types) {
  # Any type starting with "str" followed by further characters is
  # collapsed to plain "str"
  normalized <- fostr_replace(types, "^str.+", "str")

  readr_codes <- dplyr::recode(
    normalized,
    str = "c",
    byte = "i",
    int = "i",
    long = "i",
    float = "d",
    double = "d"
  )

  paste0(readr_codes, collapse = "")
}

# Warn that fixed-width data parsed with manually determined column
# positions may disagree with the codebook.
warn_default_fwf_parsing <- function() {
  headline <- paste0(
    "Data loaded from NHGIS fixed-width files may not be consistent with ",
    "the information included in the data codebook when parsing column ",
    "positions manually."
  )

  advice <- paste0(
    "Please consult the .txt and .do files associated with this extract ",
    "to ensure data is recoded correctly."
  )

  rlang::warn(c(headline, "i" = advice))
}

# Build a DDI object for `collection` that carries only the project name,
# a "rectangular" file type and the generic usage conditions text.
ipums_agg_empty_ddi <- function(collection) {
  proj_name <- get_proj_name(collection)

  conditions_text <- paste0(
    "Use of data from ", proj_name,
    " is subject to conditions including that users ",
    "should cite the data appropriately. ",
    "Please see ", get_proj_config(collection)$home_url,
    " for more information."
  )

  new_ipums_ddi(
    ipums_project = proj_name,
    file_type = "rectangular",
    conditions = conditions_text
  )
}

Try the ipumsr package in your browser

Any scripts or data that you put into this service are public.

ipumsr documentation built on June 8, 2025, 1:30 p.m.