#' Parse raw data, calculate new varibles, export as csv
#'
#' This function parses the data downloded by [acs_download()] and creates a csv
#' file for the selected parameters.
#'
#' @param year \[integer(1)]: The year of the desired ACS sample. For
#' example, use 2010 for the 2010 1-year ACS or the 2006-2010 5-yer ACS.
#' @param span \[integer(1)]: The span of years for ACS estimates. ACS contains
#' 1-, 3-, and 5-year surveys.
#' @param geo \[`charater(1)`]: The 2-letter state abbreviation or 2-digit FIPS
#' code for the state for which data will be downloaded. For geogrpahies that
#' do not nest within states, use `"us"`.
#' @param sum_levels \[`character`]: The Census Bureau codes for the summary
#' level to include in the table. (eg. "010" = United States) For full list
#' see <https://factfinder.census.gov/help/en/summary_level_code_list.htm>.
#' @param keep_vars \[`character`]: A character vector of ACS variable codes to
#' be included in the output table, using the format `"b25003_001"`.
#' @param acs_dir \[`character(1)`]: The root directory in which all the ACS
#' data has been downloaded with [acs_download()]. Defaults to current working directory.
#' @param .f A function or formula to be passed to [`purrr::as_mapper()`]. The
#' function must take a single dataframe as the only argument, and return a
#' single dataframe.
#'
#' If a __function__, it is used as is.
#'
#' If a __formula__, e.g. `~ .x + 2`, it is converted to a function. There are
#' three ways to refer to the arguments:
#'
#' * For a single argument function, use `.` * For a two argument function,
#' use `.x` and `.y` * For more arguments, use `..1`, `..2`, `..3` etc
#'
#' This syntax allows you to create very compact anonymous functions.
#'
#' @export
#'
acs_transform <- function(year, span, geo, sum_levels, keep_vars, acs_dir = ".", .f = NULL) {
validate_args(
year = year,
span = span,
)
if (is.null(.f)) {
.f <- purrr::as_mapper(~return(.x))
} else {
.f <- purrr::as_mapper(.f)
}
# TODO: validate requested sumlevels. Are they available for requested geo
# type (state or US) and span (1 or 5 year). Add to validators.R file. Should
# also allow sum_level an name or number, and use internal lookup to get the
# other. If they request something invalid, it should print a list of possible
# sum_levels as a message
# Might also make a function that returns a table to help people learn the
# summary level codes. eg. a table with cols: year, span, sumlevel code,
# sumlevel name
# https://factfinder.census.gov/help/en/summary_level_code_list.htm
# ACS 1, state
# state_1_sumlevels <- c("040", "050", "060", "160", "312", "500", "795", "960", "970")
# us_1_sumlevels <- c("010", "020", "030", "250", "310", "314", "330", "335", "350", "355", "400")
# state_5_sumlevel
# us_5_sumlevels
geo_abb <- swap_geo_id(geo, span, "abb")
geo_name <- swap_geo_id(geo, span, "name")
trct_blkgrp <- any(sum_levels %in% c("140", "150"))
non_trct_blkgrp <- any(!sum_levels %in% c("140", "150"))
raw_dir <- glue("{acs_dir}/Raw/{year}_{span}")
docs_dir <- glue("{raw_dir}/_docs")
if (trct_blkgrp && non_trct_blkgrp) {
data_dir <- glue_chr("{raw_dir}/{geo_name}/{c('tract_blockgroup', 'non_tract_blockgroup')}")
} else {
data_dir <- dplyr::case_when(
span == 1L ~ glue_chr("{raw_dir}/{geo_name}"),
geo_abb == "us" ~ glue_chr("{raw_dir}/{geo_name}/non_tract_blockgroup"),
non_trct_blkgrp ~ glue_chr("{raw_dir}/{geo_name}/non_tract_blockgroup"),
trct_blkgrp ~ glue_chr("{raw_dir}/{geo_name}/tract_blockgroup")
)
}
if (!fs::dir_exists(data_dir[1])) {
stop_glue("The folllowing folder does not exist.
{data_dir}
Make sure to run `acs_download()` for {geo} {year} {span}-year")
}
clean_dir <- glue("{acs_dir}/Clean/{year}_{span}")
fs::dir_create(clean_dir, recurse = TRUE)
geos_table_slim <- get_geos_table(data_dir[1], docs_dir, year, span, geo_abb, sum_levels)
# create a named list of table_vars (names are seq numbers), and keep only the
# ones that contains variables we want to keep
seq_col_lookup <- get_seq_col_lookup(docs_dir[1], year) %>%
purrr::keep(~length(dplyr::intersect(keep_vars, .x)) > 0)
# Iterate over the seq numbers and for each do the following: import that seq
# number's value files for estimates and margins, filter to just the requested
# geographies (sumlevels), combine estiamtes and margins
pb <- dplyr::progress_estimated(length(seq_col_lookup))
values <- purrr::map_dfc(
.x = names(seq_col_lookup),
.f = import_values,
seq_col_lookup = seq_col_lookup,
geos_table = geos_table_slim,
keep_vars = keep_vars,
data_dir = data_dir,
year = year,
span = span,
geo_abb = geo_abb,
.pb = pb
)
# Possible that there are no rows returned (eg. place in WY for 1yr)
if (nrow(values) == 0) {
warn_glue("No data for {geo_abb} {year} {span}yr")
return(invisible(NULL))
}
# There are some tables that appear in multiple Seq files for some reason
# eg. B05002 is in 23 and 171, so in the map_dfc() they get a "1" added to
# the column name and we need to remove these
first_cols <- c("year", "span", "sum_level", "geoid_full", "geoid", "geo_name")
acs_var_pat <- "[bc]\\d{5}[a-z]*_[em]\\d{3}"
keep_acs_vars <- c(
stringr::str_replace(keep_vars, "_", "_e"),
stringr::str_replace(keep_vars, "_", "_m")
)
values %>%
dplyr::mutate(
year = as.integer(year),
span = as.integer(span)
) %>%
add_na_cols(keep_acs_vars) %>%
dplyr::select(first_cols, dplyr::matches(acs_var_pat)) %>%
dplyr::mutate_at(dplyr::vars(dplyr::matches(acs_var_pat)), as.double) %>%
.f() %>% # apply user-provided variable calculation function
fst::write_fst(glue("{clean_dir}/{geo_abb}_{year}_{span}.fst"))
}
import_values <- function(seq,
seq_col_lookup,
geos_table,
keep_vars,
data_dir,
year,
span,
geo_abb,
.pb = NULL) {
if ((!is.null(.pb)) && inherits(.pb, "Progress") && (.pb$i < .pb$n)) {
.pb$tick()$print()
}
# These are always the first columns in the values files
first_cols <- c("fileid", "filetype", "stusab", "character", "sequence", "logrecno")
# get the acs_vars for this seq number - these are the last columns
seq_cols <- seq_col_lookup[[seq]]
seq_keep_vars <- dplyr::intersect(keep_vars, seq_cols)
geo_cols <- c("geoid_full", "geoid", "sum_level", "geo_name")
col_names <- c(first_cols, seq_cols)
col_types <- c(rep("character", 6), rep("numeric", length(seq_cols)))
est_file <- dplyr::case_when(
year == 2005L ~ glue("{data_dir}/{geo_abb}{seq}e.{year}-{span}yr"),
year >= 2006L ~ glue("{data_dir}/e{year}{span}{geo_abb}{seq}.txt")
)
mar_file <- dplyr::case_when(
year == 2005L ~ glue("{data_dir}/{geo_abb}{seq}m.{year}-{span}yr"),
year >= 2006L ~ glue("{data_dir}/m{year}{span}{geo_abb}{seq}.txt")
)
# If this is being run for geos that include tract/blockgroup and
# non-tract/blockgroup data, then data_dir will be length two and so you'll
# ahve two file paths here to teh same type of file in each folder
estimates <- purrr::map_dfr(est_file, read_acs_csv, col_names, col_types)
# For some geos/seqs the dataset is empty b/c they're for PR specific tables
if (!length(estimates)) {
return(tibble::tibble())
}
estimates <- estimates %>%
dplyr::select("logrecno", seq_keep_vars) %>%
dplyr::rename_at(seq_keep_vars, stringr::str_replace, pattern = "(.*)(\\d{3})$", replacement = "\\1e\\2")
margins <- purrr::map_dfr(mar_file, read_acs_csv, col_names, col_types) %>%
dplyr::select_at(seq_keep_vars, stringr::str_replace, pattern = "(.*)(\\d{3})$", replacement = "\\1m\\2")
# with estimates and margins sorted indentically can bind columns instead of join
estimates %>%
dplyr::bind_cols(margins) %>%
dplyr::right_join(geos_table, by = "logrecno") %>%
dplyr::select(-"logrecno") %>%
dplyr::select(geo_cols, dplyr::everything()) %>%
dplyr::as_tibble()
}
read_acs_csv <- function(file, col_names, col_types) {
# fread raises errors for empty files, so skip
if (fs::file_info(file)[["size"]] == 0) {
return(tibble::tibble())
}
suppressWarnings( # warns for length=0 files, but this is handled above
data.table::fread(
file = file,
sep = ",",
header = FALSE,
col.names = col_names,
colClasses = col_types,
na.strings = c("", ".", "..0"),
stringsAsFactors = FALSE,
data.table = FALSE,
showProgress = FALSE
)
)
}
# Not all variables will be present in all years of data, so to make sure all
# the files are consistent when the user's variable creation function is applied
# all their requested variables are added t the dataframe with NA if they
# weren't in the data.
add_na_cols <- function(.data, col_names) {
new_cols <- purrr::discard(col_names, ~.x %in% names(.data))
new_cols_df <- new_cols %>%
purrr::map(~rep(NA_real_, nrow(.data))) %>%
purrr::set_names(new_cols)
dplyr::bind_cols(.data, new_cols_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.