R/helpers.R

Defines functions summary_files get_census_api_key variables_from_table_decennial variables_from_table_acs census_api_key use_tigris

Documented in census_api_key summary_files

# Function to get the correct geom for a Census dataset
# if geometry = TRUE
use_tigris <- function(geography, year, cb = TRUE, resolution = "500k",
                       state = NULL, county = NULL, starts_with = NULL, ...) {

  # Handle missing 2022 CB files for now
  # if (year == 2022) {
  #   year <- 2021
  # }

  if (year %in% 2011:2012) {
    cb <- FALSE
  }

  if (year == 2009) {
    year <- 2000
  }

  if (geography == "state") {

    st <- states(cb = cb, resolution = resolution, year = year, class = "sf", ...)

    if (year == 1990) {
      st <- mutate(st, GEOID = ST)
      st <- st %>%
        group_by(GEOID) %>%
        summarize() %>%
        st_cast("MULTIPOLYGON")

    } else if (year %in% c(2000, 2010)) {

      if (cb) {
      st <- mutate(st, GEOID = STATE)

      if (year == 2000) {
        st <- st %>%
          group_by(GEOID) %>%
          summarize() %>%
          st_cast("MULTIPOLYGON")
      }
    # when cb = FALSE the variable name for the state fips code column is year specific
    } else if (year == 2000) {
      st <- mutate(st, GEOID = STATEFP00)
    } else if (year == 2010)
      st <- mutate(st, GEOID = STATEFP10)
    }
    if (year == 2014) {
      st <- st_zm(st)
    }

    return(st)

  } else if (geography == "county") {

    ct <- counties(cb = cb, state = state, resolution = resolution, year = year,
             class = "sf", ...)

    if (year == 1990) {
      ct <- mutate(ct, GEOID = paste0(ST, CO))
      ct <- ct %>%
        group_by(GEOID) %>%
        summarize() %>%
        st_cast("MULTIPOLYGON")
    } else if (year %in% c(2000, 2010)) {
      if (cb) {
        ct <- mutate(ct, GEOID = paste0(STATE, COUNTY))
        if (year == 2000) {
          ct <- ct %>%
            group_by(GEOID) %>%
            summarize() %>%
            st_cast("MULTIPOLYGON")
        }
        # when cb = FALSE the variable name for the fips code columns are year specific
      } else if (year == 2000) {
        ct <- mutate(ct, GEOID = CNTYIDFP00)
      } else if (year == 2010)
        ct <- mutate(ct, GEOID = GEOID10)
    }

    if (year == 2014) {
      ct <- st_zm(ct)
    }

    return(ct)

  } else if (geography == "tract") {

    tr <- tracts(cb = cb, state = state, county = county, year = year,
           class = "sf", ...)

    if (year == 1990) {
      tr <- tr %>%
        mutate(TRACTSUF = ifelse(is.na(TRACTSUF), "00", TRACTSUF)) %>%
        mutate(GEOID = paste0(ST, CO, TRACTBASE, TRACTSUF))
    } else if (year %in% c(2000, 2010)) {
      if (cb) {
        if (year == 2000) {
          tr <- mutate(tr, TRACT = str_pad(TRACT, 6, "right", "0"))
        }
        tr <- mutate(tr, GEOID = paste0(STATE, COUNTY, TRACT))
        # when cb = FALSE the variable name for the fips code columns are year specific
      } else if (year == 2000) {
        tr <- mutate(tr, GEOID = CTIDFP00)
      } else if (year == 2010)
        tr <- mutate(tr, GEOID = GEOID10)
    }



    if (any(duplicated(tr$GEOID))) {
      tr <- tr %>%
        group_by(GEOID) %>%
        summarize() %>%
        st_cast("MULTIPOLYGON")
    }

    if (year == 2014) {
      tr <- st_zm(tr)
    }

    return(tr)

  } else if (geography == "block group") {

    bg <- block_groups(cb = cb, state = state, county = county, year = year,
                 class = "sf", ...)

    if (cb) {
      if (year == 2000) {
        bg <- bg %>%
          mutate(TRACT = str_pad(TRACT, 6, "right", "0")) %>%
          mutate(GEOID = paste0(STATE, COUNTY, TRACT, BLKGROUP))
      } else if (year == 2010) {
        bg <- mutate(bg, GEOID = paste0(STATE, COUNTY, TRACT, BLKGRP))
      }
      # when cb = FALSE the variable name for the fips code columns are year specific
    } else if (year == 2000) {
      bg <- mutate(bg, GEOID = BKGPIDFP00)
    } else if (year == 2010)
      bg <- mutate(bg, GEOID = GEOID10)

    if (any(duplicated(bg$GEOID))) {
      bg <- bg %>%
        group_by(GEOID) %>%
        summarize() %>%
        st_cast("MULTIPOLYGON")
    }

    if (year == 2014) {
      bg <- st_zm(bg)
    }

    return(bg)

  } else if (geography %in% c("zcta", "zip code tabulation area", "zip code tabulation area (or part)")) {

    # For right now, to get it to work, it has to be cb = FALSE for 2010, 2011, and 2012
    # Re-visit this in the future.

    if (year %in% 2010:2012) cb <- FALSE

    # No ZCTA geometry for 2011, so use 2010 instead
    if (year == 2011) year <- 2010

    # Similarly, we don't have cb ZCTAs for 2021 yet, so use 2020 instead
    if (year == 2021 && cb) {
      year <- 2020
    }

    # We can't pull ZCTA shapes by state for 2020 but this is
    # an available hierarchy in the DHC.  So set state to NULL.
    if (year == 2020) {
      state <- NULL
    }

    z <- zctas(cb = cb, starts_with = starts_with, year = year,
               class = "sf", state = state, ...)

    if (year == 2000) {
      if (cb) {
        z <- rename(z, GEOID = ZCTA)
      } else {
        z <- rename(z, GEOID = ZCTA5CE00)
      }
    } else if (year >= 2020) {
      z <- rename(z, GEOID = GEOID20)
    } else {
      z <- rename(z, GEOID = GEOID10)
    }

    return(z)

  } else if (geography == "block") {

    bl <- blocks(state = state, county = county, year = year, class = "sf", ...)

    if (year == 2010) {
      bl <- rename(bl, GEOID = GEOID10)
    } else if (year == 2000) {
      bl <- rename(bl, GEOID = BLKIDFP00)
    } else if (year == 2020) {
      bl <- rename(bl, GEOID = GEOID20)
    }

    return(bl)

  } else if (geography == "place") {

    pl <- places(state = state, year = year, cb = cb, class = "sf", ...)

    return(pl)

  } else if (geography == "metropolitan statistical area/micropolitan statistical area" || geography == "cbsa") {

    # 2022 CBSA files for CB not yet available
    if (cb && year == 2022) {
      year <- 2021
    }

    cbsa <- core_based_statistical_areas(cb = cb, year = year, class = "sf", ...)

    return(cbsa)

  } else if (geography == "congressional district") {

    cd <- congressional_districts(cb = cb, year = year, class = "sf", ...)

    return(cd)

  } else if (geography == "public use microdata area") {

    # Right now, PUMAs are not defined for 2020 and are not in the CB file
    # Use the 2019 CB pumas for 2020 and 2021 as they align with the boundaries
    # used; 2022 should switch to the new 2020 PUMA boundaries which are available       in the 2020 CB file
    if (year %in% 2020:2021) {
      if (cb) {
        year <- 2019
      }
    } else if (year >= 2022) {
      if (cb) {
        year <- 2020
      }
    }

    state_ids <- c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
                   "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA",
                   "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY",
                   "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX",
                   "UT", "VT", "VA", "WA", "WV", "WI", "WY", "DC", "PR")

    if (length(state) > 1) {
      pm <- purrr::map(state, function(x) {
        pumas(state = x, cb = cb, year = year, class = "sf", ...)
      }) %>%
        rbind_tigris()
    } else if (is.null(state)) {

      if (!year %in% 2019:2020 && !cb) {
        pm <- purrr::map(state_ids, function(x) {
          pumas(state = x, cb = cb, year = year, class = "sf", ...)
        }) %>%
          rbind_tigris()
      } else {
        pm <- pumas(state = state, cb = cb, year = year, class = "sf", ...)
      }
    } else {
      pm <- pumas(state = state, cb = cb, year = year, class = "sf", ...)
    }

    if ("GEOID20" %in% names(pm)) {
      pm <- rename(pm, GEOID = GEOID20)
    } else {
      pm <- rename(pm, GEOID = GEOID10)
    }



    return(pm)

  } else if (geography == "state legislative district (upper chamber)") {

    slu <- state_legislative_districts(state = state, house = "upper", cb = cb, year = year,
                                       class = "sf", ...)

    return(slu)

  } else if (geography == "state legislative district (lower chamber)") {

    slc <- state_legislative_districts(state = state, house = "lower", cb = cb, year = year,
                                       class = "sf", ...)

    return(slc)

  } else if (geography == c("american indian area/alaska native area/hawaiian home land")) {

    nv <- native_areas(cb = cb, year = year, class = "sf", ...)

    return(nv)

  } else if (geography == "county subdivision") {

    cs <- county_subdivisions(state = state, county = county, cb = cb,
                              year = year, class = "sf", ...)

    if ("GEO_ID" %in% names(cs)) {
      cs$GEOID <- paste0(cs$STATE, cs$COUNTY, cs$COUSUB)
    }

    if ("GEOID10" %in% names(cs)) {
      cs$GEOID <- cs$GEOID10
    }

    return(cs)

  } else if (geography == "combined statistical area") {

    csa <- combined_statistical_areas(cb = cb, class = "sf", year = year, ...)

    return(csa)

  } else if (geography == "urban area") {

    # Right now, urban areas are not defined for 2020 and are not in the CB file
    if (year == 2020) cb <- FALSE

    # The 2020 decennial Census uses the 2020 urban areas, but the 2020 ACS
    # does not, so we will need to figure that out in the get_decennial side
    # Pass that through as a keyword argument

    ua <- urban_areas(cb = cb, year = year, class = "sf", ...)

    if ("GEOID20" %in% names(ua)) {
      ua <- rename(ua, GEOID = GEOID20)
    } else {
      ua <- rename(ua, GEOID = GEOID10)
    }


    return(ua)

  } else if (geography == "school district (elementary)") {

    if (year < 2016) {
      sde <- school_districts(state = state, type = "elementary", year = year,
                              class = "sf", ...)
    } else {
      sde <- school_districts(state = state, type = "elementary", cb = cb, year = year,
                              class = "sf", ...)
    }



    return(sde)

  } else if (geography == "school district (secondary)") {

    if (year < 2016) {
      sds <- school_districts(state = state, type = "secondary", year = year,
                              class = "sf", ...)
    } else {
      sds <- school_districts(state = state, type = "secondary", cb = cb, year = year,
                              class = "sf", ...)
    }

    return(sds)

  } else if (geography == "school district (unified)") {

    if (year < 2016) {
      sdu <- school_districts(state = state, type = "unified", year = year,
                              class = "sf", ...)
    } else {
      sdu <- school_districts(state = state, type = "unified", cb = cb, year = year,
                              class = "sf", ...)
    }

    return(sdu)

  } else if (geography == "new england city and town area") {

    ne <- new_england(type = "necta", cb = cb, year = year,
                      class = "sf", ...)

    return(ne)

  } else if (geography == "combined new england city and town area") {

    nec <- new_england(type = "combined", cb = cb, year = year,
                      class = "sf", ...)

    return(nec)

  } else if (geography == "us") {

    nat <- nation(year = year, class = "sf", ...)

    nat <- dplyr::mutate(nat, GEOID = "1")

    return(nat)

  } else if (geography == "region") {

    reg <- regions(year = year, class = "sf", ...)

    return(reg)

  } else if (geography == "division") {

    div <- divisions(year = year, class = "sf", ...)

    return(div)

  } else if (geography == "alaska native regional corporation") {

    anrc <- alaska_native_regional_corporations(year = year, ...)

    return(anrc)

  } else if (geography == "voting district") {

    if (!is.null(county) && length(county) == 1) {
      vtds <- voting_districts(state = state, county = county,
                               year = 2020, cb = cb, ...)
    } else {
      vtds <- voting_districts(state = state, year = 2020, cb = cb, ...)
    }

    vtds <- dplyr::rename(vtds, GEOID = GEOID20)

    return(vtds)

  } else {

    # Leave this in as a legacy piece in case something changes
    stop(sprintf("Geometry for %s is not yet supported.  Use the tigris package and join as normal instead.",
                 geography), call. = FALSE)

  }
}

#' Install a CENSUS API Key in Your \code{.Renviron} File for Repeated Use
#' @description This function will add your CENSUS API key to your \code{.Renviron} file so it can be called securely without being stored
#' in your code. After you have installed your key, it can be called any time by typing \code{Sys.getenv("CENSUS_API_KEY")} and can be
#' used in package functions by simply typing CENSUS_API_KEY If you do not have an \code{.Renviron} file, the function will create on for you.
#' If you already have an \code{.Renviron} file, the function will append the key to your existing file, while making a backup of your
#' original file for disaster recovery purposes.
#' @param key The API key provided to you from the Census formated in quotes. A key can be acquired at \url{http://api.census.gov/data/key_signup.html}
#' @param install if TRUE, will install the key in your \code{.Renviron} file for use in future sessions.  Defaults to FALSE.
#' @param overwrite If this is set to TRUE, it will overwrite an existing CENSUS_API_KEY that you already have in your \code{.Renviron} file.
#' @importFrom utils write.table read.table
#' @examples
#'
#' \dontrun{
#' census_api_key("111111abc", install = TRUE)
#' # First time, reload your environment so you can use the key without restarting R.
#' readRenviron("~/.Renviron")
#' # You can check it with:
#' Sys.getenv("CENSUS_API_KEY")
#' }
#'
#' \dontrun{
#' # If you need to overwrite an existing key:
#' census_api_key("111111abc", overwrite = TRUE, install = TRUE)
#' # First time, relead your environment so you can use the key without restarting R.
#' readRenviron("~/.Renviron")
#' # You can check it with:
#' Sys.getenv("CENSUS_API_KEY")
#' }
#' @export

census_api_key <- function(key, overwrite = FALSE, install = FALSE){

  if (install) {
    home <- Sys.getenv("HOME")
    renv <- file.path(home, ".Renviron")
    if(file.exists(renv)){
      # Backup original .Renviron before doing anything else here.
      file.copy(renv, file.path(home, ".Renviron_backup"))
    }
    if(!file.exists(renv)){
      file.create(renv)
    }
    else{
      if(isTRUE(overwrite)){
        message("Your original .Renviron will be backed up and stored in your R HOME directory if needed.")
        oldenv=read.table(renv, stringsAsFactors = FALSE)
        newenv <- oldenv[-grep("CENSUS_API_KEY", oldenv),]
        write.table(newenv, renv, quote = FALSE, sep = "\n",
                    col.names = FALSE, row.names = FALSE)
      }
      else{
        tv <- readLines(renv)
        if(any(grepl("CENSUS_API_KEY",tv))){
          stop("A CENSUS_API_KEY already exists. You can overwrite it with the argument overwrite=TRUE", call.=FALSE)
        }
      }
    }

    keyconcat <- paste0("CENSUS_API_KEY='", key, "'")
    # Append API key to .Renviron file
    write(keyconcat, renv, sep = "\n", append = TRUE)
    message('Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY"). \nTo use now, restart R or run `readRenviron("~/.Renviron")`')
    return(key)
  } else {
    message("To install your API key for use in future sessions, run this function with `install = TRUE`.")
    Sys.setenv(CENSUS_API_KEY = key)
  }

}


# Function to generate a vector of variables from an ACS table
variables_from_table_acs <- function(table, year, survey, cache_table) {

  # Look to see if table exists in cache dir
  cache_dir <- user_cache_dir("tidycensus")

  dset <- paste0(survey, "_", year, ".rds")

  dset <- gsub("/", "_", dset)



  if (cache_table) {
    message(sprintf("Loading %s variables for %s from table %s and caching the dataset for faster future access.", toupper(survey), year, table))
    df <- load_variables(year, survey, cache = TRUE)
  } else {
    if (file.exists(file.path(cache_dir, dset))) {
      df <- load_variables(year, survey, cache = TRUE)
    } else {
      message(sprintf("Loading %s variables for %s from table %s. To cache this dataset for faster access to ACS tables in the future, run this function with `cache_table = TRUE`. You only need to do this once per ACS dataset.", toupper(survey), year, table))
      df <- load_variables(year, survey, cache = FALSE)
    }
  }

  # For backwards compatibility
  names(df) <- tolower(names(df))

  specific <- paste0(table, "_")

  # Find all variables that match the table
  sub <- df[grepl(specific, df$name), ]

  vars <- sub$name

  return(vars)

}


# Function to generate a vector of variables from an Census table
variables_from_table_decennial <- function(table, year, sumfile, cache_table) {

  # Look to see if table exists in cache dir
  cache_dir <- user_cache_dir("tidycensus")

  dset <- paste0(sumfile, "_", year, ".rds")

  if (cache_table) {

    df <- load_variables(year, sumfile, cache = TRUE)
    names(df) <- tolower(names(df))

    # Check to see if we need to look in sf3 for 2000
    if (year == 2000) {
      if (!any(grepl(table, df$name))) {
        df <- load_variables(year, dataset = "sf3", cache = TRUE)
        names(df) <- tolower(names(df))
      }
    }


    message(sprintf("Loading %s variables for %s from table %s and caching the dataset for faster future access.", toupper(sumfile), year, table))

  } else {
    if (file.exists(file.path(cache_dir, dset))) {
      df <- load_variables(year, sumfile, cache = TRUE)
      names(df) <- tolower(names(df))

      # Check to see if we need to look in sf3 for 2000
      if (year == 2000) {
        if (!any(grepl(table, df$name))) {
          df <- load_variables(year, dataset = "sf3", cache = TRUE)
          names(df) <- tolower(names(df))
        }
      }

    } else {
      message(sprintf("Loading %s variables for %s from table %s. To cache this dataset for faster access to Census tables in the future, run this function with `cache_table = TRUE`. You only need to do this once per Census dataset.", toupper(sumfile), year, table))
      df <- load_variables(year, sumfile, cache = FALSE)
      names(df) <- tolower(names(df))

      # Check to see if we need to look in sf3 for 2000
      if (year == 2000) {
        if (!any(grepl(table, df$name))) {
          df <- load_variables(year, dataset = "sf3", cache = TRUE)
          names(df) <- tolower(names(df))
        }
      }
    }
  }

  # Find all variables that match the table

  if (year == 2020 && sumfile %in% c("pl", "dhc", "dp", "ddhca")) {
    vars <- df %>%
      filter(grepl(paste0(table, "_[0-9]+"), name)) %>%
      pull(name)
  } else {
    vars <- df %>%
      filter(grepl(paste0(table, "[0-9]+"), name)) %>%
      pull(name)
  }


  return(vars)

}


# Check to see if a Census API key is installed
get_census_api_key <- function(key) {

  # If a key is supplied, return it
  if (!is.null(key)) {
    return(key)

  } else if (Sys.getenv("CENSUS_API_KEY") == "") {
    rlang::warn(message = c('*' = stringr::str_wrap("You have not set a Census API key. Users without a key are limited to 500 queries per day and may experience performance limitations."),
                            'i' = stringr::str_wrap("For best results, get a Census API key at http://api.census.gov/data/key_signup.html and then supply the key to the `census_api_key()` function to use it throughout your tidycensus session.")),
                .frequency = "once",
                .frequency_id = "api_key_warning")

    return(NULL)

  } else {

    return(Sys.getenv('CENSUS_API_KEY'))

  }
}


#' Identify summary files for a given decennial Census year
#'
#' @param year The year of the decennial Census
#'
#' @return A vector of available summary files for a given decennial Census year. To access data for a given summary file, supply the desired value to the \code{sumfile} parameter in \code{get_decennial()}.
#' @export
summary_files <- function(year) {
  if (year == 2000) {
    sumfiles <- c("sf1", "sf2", "sf3", "sf4", "pl",
                  "sf2profile", "sf3profile", "sf4profile",
                  "aian", "aianprofile", "as", "mp",
                  "gu", "vi", "cd110h", "cd110s", "cd110hprofile",
                  "cd110sprofile", "sldh", "slds", "sldhprofile",
                  "sldsprofile", "cqr")
  } else if (year == 2010) {
    sumfiles <- c("sf1", "sf2", "pl", "plnat",
                  "aian", "as", "mp",
                  "gu", "vi", "cd113", "cd113profile",
                  "cd115", "cd115profile", "cd116")
  } else if (year == 2020) {
    sumfiles <- c("pl", "dhc", "dp", "pes", "dpas", "ddhca",
                  "dpmp", "dpgu", "dpvi",
                  "dhcvi", "dhcgu", "dhcvi", "dhcas", "cd118")
  } else {
    stop("Invalid year for `get_decennial()`; valid years are 2000, 2010, and 2020.", call. = FALSE)
  }

  sumfiles
}
walkerke/tidycensus documentation built on March 28, 2024, 7:04 p.m.