# Chunk options applied to every chunk in this document.
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

# Format numbers for display as whole values with comma separators.
#   .: the values to format.
# Returns whatever the `scales::label_comma(accuracy = 1)` labeller produces
# for `.`.
commanism <- function(.) {
  as_whole_number <- scales::label_comma(accuracy = 1)
  as_whole_number(.)
}
A reusable approach incorporates setup and post-processing, which take place before and after calling for data from the API. Let's say that you are interested in the fraction of women who have given birth in the U.S., broken down by age.
Use `search_in_columns()` to find tables that are relevant to our question.
#| label: search-for-women-and-births
# Search the ACS group metadata, column by column, using the patterns below.
WB_TABLES <- hercacstables::search_in_columns(
  hercacstables::METADATA_FOR_ACS_GROUPS,
  # the group must end in a digit, so only "all races" tables
  Group = "\\d$",
  # matches woman and women
  Universe = "wom[ae]n",
  # the census doesn't use "age" much
  Description = c("birth", "year")
)
#| label: show-womens-and-birth-tables
#| echo: false
# Render the search results as a table.
WB_TABLES |>
  knitr::kable()
It looks like table B13016 is exactly what we are looking for.
First, create a glossary table that maps from the Census variables you need to the real-world meanings that you actually care about.
The following example glosses 12 variables from "B13016." Each specific variable encodes four columns' worth of data.
#| label: setup-births-glossary
# Build the glossary for group B13016: keep the 1-year rows whose detail
# columns "A" and "B" are both filled in, then translate those details into
# the columns used for analysis.
GLOSSARY_OF_WOMEN_AND_BIRTHS <-
  hercacstables::unpack_group_details("B13016") |>
  dplyr::filter(
    # there are details for 1- and 5-year datasets
    .data$Dataset == "ACS1",
    # both detail columns must be non-empty
    dplyr::if_all(c("A", "B"), \(detail) nchar(detail) > 0)
  ) |>
  dplyr::mutate(
    # TRUE when detail column A contains "had"
    `Gave Birth` = stringr::str_detect(.data$A, "had"),
    # two digits at the very start of B
    `Lower Age` = stringr::str_extract(.data$B, "^\\d{2}"),
    # first two-digit run in B that does not begin at the start of the string
    `Upper Age` = stringr::str_extract(.data$B, "(?<!^)\\d{2}"),
    # the extracted ages are strings; convert both bounds to integers
    dplyr::across(tidyselect::ends_with("Age"), as.integer)
  ) |>
  dplyr::select(
    # the raw detail columns are no longer needed
    !tidyselect::any_of(c("A", "B"))
  )
#| label: show-births-glossary
#| echo: false
# Render the glossary as a table.
GLOSSARY_OF_WOMEN_AND_BIRTHS |>
  knitr::kable()
Having defined a reusable glossary, we can now define a reusable fetching function.
#| label: define-fetch-births-data
# Fetch the glossary's variables from the 1-year ACS.
#   ...: forwarded unchanged to `hercacstables::fetch_data()`; the chunks in
#     this document use it for `year`, `for_geo`, `for_items`, and `state`.
# Returns the result of `hercacstables::fetch_data()`.
fetch_women_and_births <- function(...) {
  glossed_variables <- GLOSSARY_OF_WOMEN_AND_BIRTHS$Variable

  hercacstables::fetch_data(
    variables = glossed_variables,
    survey_type = "acs",
    table_or_survey_code = "acs1",
    ...
  )
}
Run the `fetch_data()` call in a chunk by itself so that its result can be cached.
In this example, we will ask for the most recent data for the whole country.
#| label: fetch-births-data
#| cache: true
# Nationwide figures for the most recent 1-year ACS vintage.
RAW_WOMEN_AND_BIRTHS <- fetch_women_and_births(
  for_geo = "us",
  for_items = "*",
  year = hercacstables::most_recent_vintage("acs", "acs1")
)
#| label: show-raw-birth-data
#| echo: false
# Show the raw API output with the counts formatted for reading.
RAW_WOMEN_AND_BIRTHS |>
  dplyr::mutate(Value = commanism(.data$Value)) |>
  knitr::kable(align = "rlrrr")
The raw data do not answer our question all by themselves.
We are interested in rates, not counts.
Using the `dplyr::summarize()` function will let us get rid of the superfluous information about marital status.
#| label: wrangle-birth-data
# Turn raw API output into birth rates per age bracket.
#   .raw_api_output: a data frame with at least `Group`, `Index`, and `Value`
#     columns, such as the result of `fetch_women_and_births()`.
#   ...: names (as strings) of extra columns to group by, in addition to the
#     two age-bound columns.
# Returns one row per group with `Recent Mothers`, `All Women`, and `Rate`.
wrangle_women_and_births <- function(.raw_api_output, ...) {
  grouping_columns <- c(..., "Lower Age", "Upper Age")

  # attach the real-world meaning of each variable to its value
  glossed <- dplyr::inner_join(
    .raw_api_output,
    GLOSSARY_OF_WOMEN_AND_BIRTHS,
    by = c("Group", "Index")
  )

  dplyr::summarize(
    glossed,
    # `Gave Birth` is logical, so the product zeroes out the other rows
    `Recent Mothers` = sum(.data$Value * .data$`Gave Birth`),
    `All Women` = sum(.data$Value),
    Rate = .data$`Recent Mothers` / .data$`All Women`,
    .by = tidyselect::all_of(grouping_columns)
  )
}

WOMEN_AND_BIRTHS <- wrangle_women_and_births(RAW_WOMEN_AND_BIRTHS)
#| label: show-wrangled-births
#| echo: false
# Show the wrangled data with counts and rates formatted for reading.
WOMEN_AND_BIRTHS |>
  dplyr::mutate(
    dplyr::across(c("Recent Mothers", "All Women"), commanism),
    Rate = scales::label_percent(accuracy = 1)(.data$Rate)
  ) |>
  knitr::kable(align = "r")
The benefit of this approach is that we can reuse the fetching and wrangling functions.
In this example, we pull data for three different counties in Wisconsin.
#| label: pull-se-wi-births
#| cache: true
# County FIPS codes within Wisconsin (state FIPS 55), mapped to county names.
# One lookup drives both the request and the relabelling, so the codes and
# the names cannot drift apart.
SE_WI_COUNTIES <- c(
  "059" = "Kenosha",
  "079" = "Milwaukee",
  "101" = "Racine"
)

SE_WI_WOMEN_AND_BIRTHS <- fetch_women_and_births(
  state = 55,
  for_geo = "county",
  for_items = names(SE_WI_COUNTIES),
  year = 2023
) |>
  wrangle_women_and_births("county") |>
  dplyr::mutate(
    # swap each FIPS code for its county name; unlisted codes become NA
    county = unname(SE_WI_COUNTIES[as.character(.data$county)])
  )
For a simpler table, we'll exclude the counts of mothers and women, then pivot the rate data by county.
#| label: show-se-wi-wrangled-births
#| echo: false
# Keep only the rates, format them as percentages, and spread the counties
# across columns.
SE_WI_WOMEN_AND_BIRTHS |>
  dplyr::select("county", "Lower Age", "Upper Age", "Rate") |>
  dplyr::mutate(Rate = scales::label_percent(accuracy = 1)(.data$Rate)) |>
  tidyr::pivot_wider(names_from = "county", values_from = "Rate") |>
  knitr::kable(align = "r")
In this example, we pull data about Texas from three years. We have to make three separate calls, one for each year. It's an idiosyncrasy of this particular API.
#| label: pull-tx-births
#| cache: true
# One request per year: fetch each year separately, stack the results, and
# then wrangle them together, grouped by year.
TEXAS_WOMEN_AND_BIRTHS <- purrr::map(
  2021:2023,
  \(vintage) fetch_women_and_births(
    for_geo = "state",
    for_items = 48L,
    year = vintage
  )
) |>
  purrr::list_rbind() |>
  wrangle_women_and_births("Year")
For a simpler table, we'll exclude the counts of mothers and women, then pivot the rate data by year.
#| label: show-tx-wrangled-births
#| echo: false
# Keep only the rates, format them as percentages, and spread the years
# across columns.
TEXAS_WOMEN_AND_BIRTHS |>
  dplyr::select("Year", "Lower Age", "Upper Age", "Rate") |>
  dplyr::mutate(Rate = scales::label_percent(accuracy = 1)(.data$Rate)) |>
  tidyr::pivot_wider(names_from = "Year", values_from = "Rate") |>
  knitr::kable(align = "r")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.