tests/testthat/test_nhgis.R

nhgis_single_csv <- ipums_example("nhgis0972_csv.zip")
nhgis_single_shp <- ipums_example("nhgis0972_shape_small.zip")

nhgis_multi_ds <- ipums_example("nhgis0731_csv.zip")
nhgis_multi_shp <- ipums_example("nhgis0712_shape_small.zip")

nhgis_multi_fwf <- ipums_example("nhgis0730_fixed.zip")

rows <- 71
vars_data <- 25

# Read single files -------------------------------

test_that("Can read NHGIS extract: single dataset", {
  # Use snapshot since several forms of output are produced by default
  # (IPUMS conditions, col spec, and additional col spec info)
  expect_snapshot(
    nhgis_csv <- read_nhgis(nhgis_single_csv)
  )

  expect_equal(nrow(nhgis_csv), rows)
  expect_equal(ncol(nhgis_csv), vars_data)
  expect_equal(
    attr(nhgis_csv$D6Z001, "label"),
    "Total area: 1989 to March 1990"
  )
  expect_equal(
    attr(nhgis_csv$D6Z001, "var_desc"),
    "Table D6Z: Year Structure Built (Universe: Housing Units)"
  )
  expect_equal(
    nhgis_csv$PMSA[1:2],
    c("Akron, OH PMSA", "Anaheim--Santa Ana, CA PMSA")
  )
  expect_s3_class(nhgis_csv, c("spec_tbl_df", "tbl_df", "tbl", "data.frame"))
})

test_that("Can read NHGIS extract: fixed-width files", {
  expect_error(
    read_nhgis(nhgis_multi_fwf, verbose = FALSE),
    "Multiple files found"
  )

  nhgis_fwf1 <- read_nhgis(nhgis_multi_fwf, file_select = 1, verbose = FALSE)
  nhgis_csv1 <- read_nhgis(nhgis_multi_ds, file_select = 1, verbose = FALSE)

  # Test that .do file col typing is working.
  # These coltypes differ from what would be expected by readr defaults.
  expect_type(nhgis_fwf1$AIHHTLI, "character")
  expect_type(nhgis_fwf1$MEMI, "character")
  expect_type(nhgis_fwf1$PCI, "character")
  expect_type(nhgis_fwf1$NATIONA, "character")

  # Test that .do file recoding is working:
  # This should be coercible to integer because it was not rescaled on import:
  expect_true(
    dplyr::near(
      nhgis_fwf1$AJWBE001,
      as.integer(nhgis_fwf1$AJWBE001)
    )
  )
  # This was rescaled and should be a decimal:
  expect_false(
    dplyr::near(
      nhgis_fwf1$AJWCE001,
      as.integer(nhgis_fwf1$AJWCE001)
    )
  )

  expect_equal(nrow(nhgis_fwf1), nrow(nhgis_csv1))
  expect_equal(ncol(nhgis_fwf1), ncol(nhgis_csv1) - 1)

  expect_equal(
    colnames(nhgis_fwf1),
    read_nhgis_codebook(nhgis_multi_fwf, file_select = 1)$var_info$var_name
  )

  # This has some coltype differences which are to be expected
  # as we can directly provide coltypes in read_nhgis_fwf()
  # whereas read_nhgis_csv() uses readr defaults
  expect_identical(
    purrr::map(nhgis_fwf1, attributes),
    purrr::map(dplyr::select(nhgis_csv1, -"GISJOIN"), attributes)
  )

  expect_identical(
    purrr::map(zap_ipums_attributes(nhgis_fwf1), "identity"),
    purrr::map(
      zap_ipums_attributes(dplyr::select(nhgis_csv1, -"GISJOIN")),
      "identity"
    )
  )

  expect_error(
    read_nhgis(
      nhgis_multi_fwf,
      file_select = 1,
      do_file = FALSE,
      verbose = FALSE
    )
  )

  expect_error(
    read_nhgis(
      nhgis_multi_fwf,
      file_select = 1,
      do_file = "asdf",
      verbose = FALSE
    ),
    "Could not find the provided `do_file`."
  )

  expect_equal(
    attributes(nhgis_fwf1$AJWBE001),
    list(
      label = "Estimates: Total",
      var_desc = "Table AJWB: Sex by Age (Universe: Total population)"
    )
  )

  expect_equal(attr(nhgis_fwf1$YEAR, "label"), "Data File Year")
  expect_s3_class(nhgis_fwf1, c("spec_tbl_df", "tbl_df", "tbl", "data.frame"))
})

test_that("Can read NHGIS extract: single time series table", {
  tst <- read_nhgis(
    nhgis_multi_fwf,
    file_select = contains("ts"),
    verbose = FALSE
  )

  tst2 <- read_nhgis(
    nhgis_multi_ds,
    file_select = contains("ts"),
    verbose = FALSE
  )

  expect_identical(
    purrr::map(tst, attributes),
    purrr::map(tst2, attributes)
  )

  expect_identical(
    purrr::map(zap_ipums_attributes(tst), "identity"),
    purrr::map(zap_ipums_attributes(tst2), "identity")
  )

  expect_equal(nrow(tst), 84)
  expect_equal(tst$GISJOIN[[1]], "G010")
  expect_equal(tst$STATE[[1]], "Alabama")
  expect_equal(tst$A00AA1820[[1]], 127901)

  expect_equal(
    attributes(tst$A00AA2020),
    list(
      label = "2020: Persons: Total",
      var_desc = "Table A00: Total Population"
    )
  )
})

# Select files when multiple exist -------------------------------

test_that("Can select data files by index", {
  nhgis1 <- read_nhgis(nhgis_multi_ds, file_select = 1, verbose = FALSE)
  nhgis2 <- read_nhgis(nhgis_multi_ds, file_select = 2, verbose = FALSE)

  expect_equal(dim(nhgis1), c(1, 115))
  expect_equal(dim(nhgis2), c(84, 28))

  expect_false(identical(nhgis1, nhgis2))

  expect_true("AJWBE001" %in% colnames(nhgis1))
  expect_true("A00AA1790" %in% colnames(nhgis2))

  expect_equal(
    attributes(nhgis1$GISJOIN),
    list(label = "GIS Join Match Code", var_desc = "")
  )

  expect_equal(
    attributes(nhgis2$GISJOIN),
    list(label = "GIS Join Match Code", var_desc = "")
  )
})

test_that("Can select data files with tidyselect", {
  nhgis1 <- read_nhgis(
    nhgis_multi_ds,
    file_select = contains("ds239"),
    verbose = FALSE
  )

  nhgis2 <- read_nhgis(
    nhgis_multi_ds,
    file_select = contains("ts_nominal"),
    verbose = FALSE
  )

  expect_equal(dim(nhgis1), c(1, 115))
  expect_equal(dim(nhgis2), c(84, 28))

  expect_false(identical(nhgis1, nhgis2))

  expect_true("AJWBE001" %in% colnames(nhgis1))
  expect_true("A00AA1790" %in% colnames(nhgis2))

  expect_equal(
    attributes(nhgis1$GISJOIN),
    list(label = "GIS Join Match Code", var_desc = "")
  )

  expect_equal(
    attributes(nhgis2$GISJOIN),
    list(label = "GIS Join Match Code", var_desc = "")
  )
})

test_that("Can still find codebook if file_select matches data file only", {
  nhgis <- read_nhgis(
    nhgis_multi_ds,
    file_select = matches("nation.csv"),
    verbose = FALSE
  )

  nhgis2 <- read_nhgis(
    nhgis_multi_ds,
    file_select = "nhgis0731_csv/nhgis0731_ds239_20185_nation.csv",
    verbose = FALSE
  )

  expect_true(all.equal(nhgis, nhgis2))

  expect_equal(
    attributes(nhgis$GISJOIN),
    list(label = "GIS Join Match Code", var_desc = "")
  )
})

test_that("Can still find codebook if direct data file path provided", {
  temp_dir <- tempfile()
  dir.create(temp_dir)
  unzipped <- utils::unzip(nhgis_single_csv, exdir = temp_dir)
  on.exit(unlink(temp_dir, recursive = TRUE), add = TRUE, after = FALSE)

  data_file <- unzipped[fostr_detect(unzipped, ".csv$")]

  x <- read_nhgis(data_file, verbose = FALSE)

  expect_equal(
    ipums_var_info(x)$var_name,
    colnames(x)
  )

  expect_equal(
    ipums_var_info(x)$var_label,
    unname(purrr::map_chr(x, ~ attributes(.x)$label))
  )
})

# Can read data when provided in zip, dir, and shp formats -------

test_that("We can specify available readr options in read_nhgis()", {
  expect_silent(
    nhgis_data <- read_nhgis(
      nhgis_single_csv,
      verbose = FALSE,
      col_types = readr::cols(.default = readr::col_character()),
      remove_extra_header = TRUE
    )
  )

  expect_warning(
    nhgis_data_fwf <- read_nhgis(
      nhgis_multi_fwf,
      file_select = 1,
      verbose = FALSE,
      vars = 1:4,
      col_types = "ccil"
    ),
    "One or more parsing issues"
  )

  nhgis_data_fwf2 <- read_nhgis(
    nhgis_multi_fwf,
    file_select = 1,
    verbose = FALSE,
    vars = c(YEAR, STUSAB)
  )

  expect_equal(nrow(nhgis_data), rows)
  expect_equal(ncol(nhgis_data), vars_data)
  expect_equal(unique(purrr::map_chr(nhgis_data, class)), "character")

  expect_equal(nrow(nhgis_data_fwf), 1)
  expect_equal(ncol(nhgis_data_fwf), 4)
  expect_equal(
    unname(purrr::map_chr(nhgis_data_fwf, class)),
    c("character", "character", "integer", "logical")
  )

  expect_equal(colnames(nhgis_data_fwf2), c("YEAR", "STUSAB"))

  expect_equal(
    nrow(read_nhgis(nhgis_single_csv, n_max = 10, verbose = FALSE)),
    10
  )
})

test_that("We get informative error messages when reading NHGIS extracts", {
  skip_if_not_installed("tidyselect", "1.2.1")

  expect_error(
    read_nhgis("FAKE_FILE.zip", verbose = FALSE),
    "Could not find file"
  )

  expect_error(
    read_nhgis("~/Desktop/FAKE_FILE.zip", verbose = FALSE),
    "Could not find file .+/Desktop/FAKE_FILE.zip"
  )

  expect_error(
    nhgis <- read_nhgis(nhgis_multi_ds, verbose = FALSE),
    "Multiple files found, please use the `file_select`"
  )
  expect_error(
    nhgis <- read_nhgis(
      nhgis_multi_ds,
      file_select = contains("nhgis"),
      verbose = FALSE
    ),
    "Multiple files found, please use the `file_select`"
  )
  expect_error(
    nhgis <- read_nhgis(
      nhgis_multi_ds,
      file_select = 1:2,
      verbose = FALSE
    ),
    "Multiple files found, please use the `file_select`"
  )
  expect_snapshot(
    read_nhgis(nhgis_multi_ds, file_select = 3, verbose = FALSE),
    error = TRUE
  )
  expect_error(
    read_nhgis(
      nhgis_multi_ds,
      file_select = contains("not-in-file"),
      verbose = FALSE
    ),
    "The provided `file_select` did not select any of the available files:"
  )
})

test_that("Can read NHGIS codebook", {
  expect_error(
    read_nhgis_codebook(nhgis_multi_ds),
    "Multiple files found"
  )

  # Single dataset ----------------

  d_csv <- read_nhgis(nhgis_single_csv, verbose = FALSE)

  cb_csv <- read_nhgis_codebook(nhgis_single_csv)

  # FWF with ACS ----------------

  # Includes multiple data types which changes codebook slightly

  d_fwf <- read_nhgis(
    nhgis_multi_fwf,
    file_select = 1,
    verbose = FALSE
  )

  cb_fwf <- read_nhgis_codebook(nhgis_multi_fwf, file_select = 1)

  # Time series table ----------

  d_fwf_tst <- read_nhgis(
    nhgis_multi_fwf,
    file_select = 2,
    verbose = FALSE
  )

  cb_fwf_ts <- read_nhgis_codebook(nhgis_multi_fwf, file_select = 2)

  expect_equal(cb_csv$var_info$var_name, colnames(d_csv))
  expect_equal(cb_fwf$var_info$var_name, colnames(d_fwf))
  expect_equal(cb_fwf_ts$var_info$var_name, colnames(d_fwf_tst))

  # Raw cb lines:
  expect_equal(
    length(read_nhgis_codebook(nhgis_single_csv, raw = TRUE)),
    117
  )

  # Confirm that data types and universe are added to labels
  expect_equal(
    attr(d_fwf$AJWBE009, "label"),
    "Estimates: Male: 21 years"
  )
  expect_equal(
    attr(d_fwf$AJWBE009, "var_desc"),
    "Table AJWB: Sex by Age (Universe: Total population)"
  )
})

test_that("Can read unzipped NHGIS files", {
  test_path <- vcr::vcr_test_path("fixtures", "nhgis_unzipped")

  x1 <- suppressWarnings(read_nhgis(test_path, file_select = 1, verbose = FALSE))
  x2 <- read_nhgis(
    file.path(test_path, "test1.csv"),
    var_attrs = NULL,
    verbose = FALSE
  )

  expect_equal(x1$a, "b")
  expect_equal(x1, x2)
})

Try the ipumsr package in your browser

Any scripts or data that you put into this service are public.

ipumsr documentation built on Sept. 12, 2024, 7:38 a.m.