R/census_geo_api.R

Defines functions census_geo_api

Documented in census_geo_api

#' Census Data download function.
#'
#' \code{census_geo_api} retrieves U.S. Census geographic data for a given state.
#'
#' This function allows users to download U.S. Census geographic data (2010 or 2020), 
#' at either the county, tract, block, or place level, for a particular state. 
#'
#' @inheritParams get_census_data
#' @param state A required character object specifying which state to extract Census data for, 
#'  e.g., \code{"NJ"}.
#' @param geo A character object specifying what aggregation level to use. 
#'  Use `"block"`, `"block_group"`, `"county"`, `"place"`, `"tract"`, or `"zcta"`. 
#'  Default is \code{"tract"}. Warning: extracting block-level data takes very long.
#' @param age A \code{TRUE}/\code{FALSE} object indicating whether to condition on 
#'  age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race).
#'  If \code{TRUE}, function will return Pr(Geolocation, Age | Race). 
#'  If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).
#' @param sex A \code{TRUE}/\code{FALSE} object indicating whether to condition on 
#'  sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). 
#'  If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). 
#'  If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).
#' @param year A character object specifying the year of U.S. Census data to be downloaded.
#'  Use \code{"2010"}, or \code{"2020"}. Default is \code{"2020"}.
#'  Warning: 2020 U.S. Census data is downloaded only when \code{\var{age}} and 
#'  \code{\var{sex}} are both \code{FALSE}.
#' @param retry The number of retries at the census website if network interruption occurs.
#' @param save_temp File indicating where to save the temporary outputs. 
#'  Defaults to NULL. If specified, the function will look for an .RData file
#'  with the same format as the expected output. 
#' @param counties A vector of counties contained in your data. If \code{NULL}, all counties are pulled.
#' Useful for smaller predictions where only a few counties are considered. Must be zero padded.
#' @return Output will be an object of class \code{list}, indexed by state names. It will 
#'  consist of the original user-input data with additional columns of Census geographic data.
#'
#' @examples
#' \dontshow{data(voters)}
#' \dontrun{census_geo_api(states = c("NJ", "DE"), geo = "block")}
#' \dontrun{census_geo_api(states = "FL", geo = "tract", age = TRUE, sex = TRUE)}
#' \dontrun{census_geo_api(states = "MA", geo = "place", age = FALSE, sex = FALSE,
#'  year = "2020")}
#'
#' @references
#' Relies on `get_census_api()`, `get_census_api_2()`, and `vec_to_chunk()` functions authored by Nicholas Nagle, 
#' available [here](https://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html).
#' 
#' @importFrom furrr future_map_dfr
#' @importFrom purrr map_dfr
#' @keywords internal

census_geo_api <- function(
    key = Sys.getenv("CENSUS_API_KEY"),
    state,
    geo = c("tract", "block", "block_group", "county", "place", "zcta"),
    age = FALSE,
    sex = FALSE,
    year = c("2020", "2010"),
    retry = 3,
    save_temp = NULL,
    counties = NULL
) {
  key <- validate_key(key)
  
  geo <- tolower(geo)
  geo <- rlang::arg_match(geo)
  
  year <- as.character(year)
  year <- rlang::arg_match(year)
  
  census <- NULL
  state <- as_state_abbreviation(state)
  
  df.out <- NULL
  state.fips <- as_fips_code(state)
  
  vars <- census_geo_api_names(year = year, age = age, sex = sex)
  census_data_url <- census_geo_api_url(year = year)
  
  if (geo == "place") {
    region <- paste("for=place:*&in=state:", state.fips, sep = "")
    census <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region, retry)
  }
  
  if (geo == "county") {
    if (is.null(counties)) {
      region <- paste("for=county:*&in=state:", state.fips, sep = "")
    } else {
      counties_paste <- paste0(counties, collapse = ",")
      region <- paste("for=county:",counties_paste,"&in=state:", state.fips, sep = "")
    }

    census <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region, retry)
  }
  
  if (geo == "tract") {
    if (is.null(counties)) {
      region_county <- paste("for=county:*&in=state:", state.fips, sep = "")
    } else {
      counties_paste <- paste0(counties, collapse = ",")
      region_county <- paste("for=county:",counties_paste,"&in=state:", state.fips, sep = "")
    }
    
    county_df <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region_county, retry)
    
    if(is.null(counties)) {
      county_list <- county_df$county
    } else {
      county_list <- intersect(counties, county_df$county)
    }
    
    if(length(county_list) > 0) {
      census <- furrr::future_map_dfr(seq_along(county_list), function(county) {
        message(paste("County ", county, " of ", length(county_list), ": ", county_list[county], sep = ""))
        region_county <- paste("for=tract:*&in=state:", state.fips, "+county:", county_list[county], sep = "")
        get_census_api(data_url = census_data_url, key = key, var.names = unlist(vars), region = region_county, retry)
      }, .progress = TRUE)
    } else {
      message('There were no intersecting counties in your voter.file data (tract)')
    } 
  }
  
  if (geo == "block_group") {
    if (is.null(counties)) {
      region_county <- paste("for=county:*&in=state:", state.fips, sep = "")
    } else {
      counties_paste <- paste0(counties, collapse = ",")
      region_county <- paste("for=county:",counties_paste,"&in=state:", state.fips, sep = "")
    }
    
    county_df <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region_county, retry)
    
    if(is.null(counties)) {
      county_list <- county_df$county
    } else {
      county_list <- intersect(counties, county_df$county)
    }
    
    if(length(county_list) > 0) {
      message('Running block_group by county...')
      
      census <- purrr::map_dfr(
        1:length(county_list), 
        function(county) {
          # too verbose, commenting out
          message(paste("County ", county, " of ", length(county_list), ": ", county_list[county], sep = ""))
          
          blockgroup <- paste("for=block+group:*&in=state:", state.fips, "+county:", county_list[county], sep = "")
  
          # message(region_tract)
          blockgroup_df <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = blockgroup, retry)
          names(blockgroup_df)[4] <- "block_group" # Fix name, it comes in with a space from api. 
          blockgroup_df
        }
      )
      message("\n") # new line for progress bar
    } else {
      message('There were no intersecting counties in your voter.file data (block)')
    }
  }
  
  if (geo == "block") {
    if (is.null(counties)) {
      region_county <- paste("for=county:*&in=state:", state.fips, sep = "")
    } else {
      counties_paste <- paste0(counties, collapse = ",")
      region_county <- paste("for=county:",counties_paste,"&in=state:", state.fips, sep = "")
    }
    
    county_df <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region_county, retry)
    
    if(is.null(counties)) {
      county_list <- county_df$county
    } else {
      county_list <- intersect(counties, county_df$county)
    }
    
    if(length(county_list) > 0) {
      message('Running block by county...')
      
      census <- purrr::map_dfr(
        1:length(county_list), 
        function(county) {
          # too verbose, commenting out
          message(paste("County ", county, " of ", length(county_list), ": ", county_list[county], sep = ""))
          
          region_tract <- paste("for=tract:*&in=state:", state.fips, "+county:", county_list[county], sep = "")
          # message(region_tract)
          tract_df <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region_tract, retry)
          tract_list <- tract_df$tract
          
          furrr::future_map_dfr(1:length(tract_list), function(tract) {
            message(paste("Tract ", tract, " of ", length(tract_list), ": ", tract_list[tract], sep = ""))
            
            region_block <- paste("for=block:*&in=state:", state.fips, "+county:", county_list[county], "+tract:", tract_list[tract], sep = "")
            get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region_block, retry)
          }, .progress = TRUE)
        }
      )
      message("\n") # new line for progress bar
    } else {
      message('There were no intersecting counties in your voter.file data (block)')
    } 
  }
  
  if (geo == "zcta") {
    census <- census_geo_api_zcta(
      census_data_url = census_data_url,
      key = key,
      vars = vars,
      state = state,
      counties = counties,
      retry = retry
    )
  }
  
  census <- dplyr::mutate(census, state = as_state_abbreviation(state))
  
  r_columns <- purrr::map(vars, function(vars) rowSums(census[vars]))
  
  census <- dplyr::bind_cols(census, r_columns)
  census <- dplyr::group_by(census, dplyr::across(dplyr::any_of("state")))
  census <- dplyr::mutate(
    census,
    dplyr::across(
      # Divide all r_columns by the total population of the corresponding race
      dplyr::all_of(names(r_columns)),
      function(x) {
        x / sum(
          dplyr::pick(
            sub("^.+_(.{3})$", "r_\\1", dplyr::cur_column(), perl = TRUE)
          )
        )
      }
    )
  )
  census <- dplyr::ungroup(census)
  
  census
}
kosukeimai/wru documentation built on April 8, 2024, 6:03 p.m.