Nothing
#' Internal function: Get the API response, return a data frame
#'
#' @param apiurl, key, get, region, time, etc
#' @keywords internal
#' @export
getFunction <- function(apiurl, name, key, get, region, regionin, time, year, date, period, monthly, show_call, convert_variables, category_code, data_type_code, naics, pscode, naics2012, naics2007, naics2002, naics1997, sic, ...) {
# Return API's built in error message if invalid call
apiCheck <- function(req) {
if (req$status_code==400) {
error_message <- (gsub("<[^>]*>", "", httr::content(req, as="text")))
if (error_message == "error: missing 'for' argument") {
stop("This dataset requires you to specify a geography with the 'region' argument.")
}
stop(paste("The Census Bureau returned the following error message:\n", error_message,
"\n Your API call was: ", print(req$url)))
}
# Some time series don't give error messages, just don't resolve (e.g. SAIPE)
if (req$status_code==204) stop("204, no content was returned.\nSee ?listCensusMetadata to learn more about valid API options.", call. = FALSE)
if (identical(httr::content(req, as = "text"), "")) stop(paste("No output to parse. \n Your API call was: ", print(req$url)), call. = FALSE)
}
apiParse <- function (req) {
if (jsonlite::validate(httr::content(req, as="text"))[1] == FALSE) {
error_message <- (gsub("<[^>]*>", "", httr::content(req, as="text")))
stop(paste("The Census Bureau returned the following error message:\n", error_message, "\nYour api call was: ", req$url))
} else {
# Show call if option is true
if (show_call == TRUE) {
print(paste("Your successful api call was: ", req$url))
print("For more information, visit the documentation at https://www.hrecht.com/censusapi/")
}
raw <- jsonlite::fromJSON(httr::content(req, as = "text"))
}
}
# Function to clean up column names - particularly ones with periods in them
cleanColnames <- function(dt) {
# No trailing punct
colnames(dt) <- gsub("\\.[[:punct:]]*$", "", colnames(dt))
# All punctuation becomes underscore
colnames(dt) <- gsub("[[:punct:]]", "_", colnames(dt))
# Get rid of repeat underscores
colnames(dt) <- gsub("(_)\\1+", "\\1", colnames(dt))
return(dt)
}
responseFormat <- function(raw) {
# Make first row the header
colnames(raw) <- raw[1, ]
df <- data.frame(raw)
df <- df[-1,]
df <- cleanColnames(df)
# Make all columns character
df[] <- lapply(df, as.character)
# Make columns numeric based on column names - unfortunately best strategy without additional API calls given structure of data across endpoints
if (convert_variables == TRUE) {
string_col_parts <- "_TTL|_NAME|NAICS2012|NAICS2017|NAICS2012_TTL|NAICS2017_TTL|fage4|FAGE4|LABEL|_DESC|CAT"
# For ACS data, do not make columns numeric if they are ACS annotation variables - ending in MA or EA or SS
if (grepl("acs/acs", name, ignore.case = T)) {
# Do not make known string/label variables numeric
numeric_cols <- grep("[0-9]", names(df), value=TRUE)
string_cols <- grep(paste0("MA|EA|SS|", string_col_parts), numeric_cols, value = TRUE, ignore.case = T)
# Small Area Health Insurance Estimates
} else if (grepl("healthins/sahie", name, ignore.case = T)) {
numeric_cols <- grep("[0-9]|_PT|NIPR|PCTIC|PCTUI|NIC|NUI", names(df), value=TRUE, ignore.case = T)
string_cols <- grep(string_col_parts, numeric_cols, value = TRUE, ignore.case = T)
# Small Area Income and Poverty Estimates
} else if (grepl("poverty/saipe", name, ignore.case = T)) {
numeric_cols <- grep("[0-9]|SAEMHI|SAEPOV", names(df), value=TRUE, ignore.case = T)
string_cols <- grep(string_col_parts, numeric_cols, value = TRUE, ignore.case = T)
# Population and Housing Estimates
} else if (grepl("pep/", name, ignore.case = T)) {
numeric_cols <- grep("[0-9]|POP|DENSITY|HUEST", names(df), value=TRUE, ignore.case = T)
string_cols <- grep(string_col_parts, numeric_cols, value = TRUE, ignore.case = T)
# County Business Patterns
} else if (name == "cbp" | name == "zbp") {
# Exact matches for CBP variables
numeric_cols <- grep("[0-9]|\\<EMP\\>|\\<ESTAB\\>|PAYANN", names(df), value=TRUE, ignore.case = T)
string_cols <- grep(string_col_parts, numeric_cols, value = TRUE, ignore.case = T)
# Decennial Response Rates
} else if (name == "dec/responserate") {
numeric_cols <- grep("[0-9]|CINT|MIN|MED|AVG|MAX|DRR|CRR", names(df), value=TRUE, ignore.case = T)
string_cols <- grep(string_col_parts, numeric_cols, value = TRUE, ignore.case = T)
# International trade
} else if (grepl("timeseries/intltrade/", name, ignore.case = T)) {
numeric_cols <- grep("[0-9]", names(df), value=TRUE, ignore.case = T)
string_col_parts <- paste0(string_col_parts, "|UNIT_QY|_FLAG")
string_cols <- grep(string_col_parts, numeric_cols, value = TRUE, ignore.case = T)
# Microdata weighting variables
} else if (grepl("cps/", name, ignore.case = T) |
name %in% c("acs/acs5/pums", "acs/acs5/pumspr", "acs/acs1/pums", "acs/acs1/pumspr")) {
numeric_cols <- grep("[0-9]|PWSSWGT|HWHHWGT|PWFMWGT|PWLGWGT|PWCMPWGT
|PWORWGT|PWVETWGT|WGTP|PWGTP", names(df), value=TRUE, ignore.case = T)
string_cols <- grep(string_col_parts, numeric_cols, value = TRUE, ignore.case = T)
} else {
# Do not make known string/label variables numeric
numeric_cols <- grep("[0-9]", names(df), value=TRUE)
string_cols <- grep(string_col_parts, numeric_cols, value = TRUE, ignore.case = T)
}
# Convert string "NULL" or "N/A" values to true NA
df[(df == "NULL" | df == "N/A" | df == "NA")] <- NA
for(col in setdiff(numeric_cols, string_cols)) df[,col] <- as.numeric(df[,col])
}
row.names(df) <- NULL
return(df)
}
# Assemble call
req <- httr::GET(apiurl, query = list(key = key, get = get, "for" = region, "in" = regionin, category_code = category_code, data_type_code = data_type_code, time = time, YEAR = year, DATE = date, PERIOD = period, MONTHLY = monthly, NAICS=naics, PSCODE=pscode, NAICS2012 = naics2012, NAICS2007 = naics2007, NAICS2002 = naics2002, NAICS1997 = naics1997, SIC = sic, ...))
# Check the API call for a valid response
apiCheck(req)
# If check didn't fail, parse the content
raw <- apiParse(req)
# Format the response into a nice data frame
df <- responseFormat(raw)
}
#' Retrieve Census data from a given API
#'
#' @param name The programmatic name of your dataset, e.g. `timeseries/poverty/saipe`
#' or `acs/acs5`. See `listCensusApis()` for options.
#' @param vintage Vintage (year) of dataset, e.g. 2014. Not required for timeseries APIs.
#' @param vars List of variables to get. Required.
#' @param region Geography to get.
#' @param regionin Optional hierarchical geography to limit region.
#' @param time,year,date,period,monthly Optional arguments used for some time series APIs.
#' @param category_code,data_type_code,naics,pscode,naics2012,naics2007,naics2002,naics1997,sic
#' Optional arguments used in economic data APIs.
#' @param show_call List the underlying API call that was sent to the Census Bureau.
#' @param convert_variables Convert likely numeric variables into numeric data.
#' Default is true. If false, results will be characters, which is the type returned by
#' the Census Bureau.
#' @param key Your Census API key, obtained at https://api.census.gov/data/key_signup.html.
#' This function will default to a `CENSUS_KEY` stored in your .Renviron if available.
#' @param ... Other valid arguments to pass to the Census API. Note: the APIs are case sensitive.
#' @keywords api
#' @examples
#' \dontrun{
#' # Get total population and median household income for places (cities, towns, villages)
#' # in one state from the 5-year ACS.
#' acs_simple <- getCensus(
#' name = "acs/acs5",
#' vintage = 2020,
#' vars = c("NAME", "B01001_001E", "B19013_001E"),
#' region = "place:*",
#' regionin = "state:01")
#' head(acs_simple)
#'
#' # Get all data from the B19013 variable group.
#' # This returns estimates as well as margins of error and annotation flags.
#' acs_group <- getCensus(
#' name = "acs/acs5",
#' vintage = 2020,
#' vars = c("B01001_001E", "group(B19013)"),
#' region = "place:*",
#' regionin = "state:01")
#' head(acs_group)
#'
#' # Retreive 2010 Decennial Census block-level data within a specific tract,
#' # using the regionin argument to precisely specify the Census tract.
#' decennial_2010 <- getCensus(
#' name = "dec/sf1",
#' vintage = 2010,
#' vars = c("NAME","P001001"),
#' region = "block:*",
#' regionin = "state:36+county:027+tract:010000")
#' head(decennial_2010)
#'
#' # Get poverty rates for children and for people of all ages over time using the
#' # Small Area Income and Poverty Estimates API
#' saipe <- getCensus(
#' name = "timeseries/poverty/saipe",
#' vars = c("NAME", "SAEPOVRT0_17_PT", "SAEPOVRTALL_PT"),
#' region = "state:01",
#' year = "2000:2019")
#' head(saipe)
#'
#' # Get County Business Patterns data for a specific NAICS sector.
#' cbp_2016 <- getCensus(
#' name = "cbp",
#' vintage = "2016",
#' vars = c("EMP", "ESTAB", "NAICS2012_TTL", "GEO_TTL"),
#' region = "state:*",
#' naics2012 = "23")
#' head(cbp_2016)
#' }
#'
#' @export
getCensus <-
function(name,
vintage = NULL,
key = Sys.getenv("CENSUS_KEY"),
vars,
region = NULL,
regionin = NULL,
time = NULL,
year = NULL,
date = NULL,
period = NULL,
monthly = NULL,
show_call = FALSE,
convert_variables = TRUE,
category_code = NULL,
data_type_code = NULL,
naics = NULL,
pscode = NULL,
naics2012 = NULL,
naics2007 = NULL,
naics2002 = NULL,
naics1997 = NULL,
sic = NULL,
...) {
constructURL <- function(name, vintage) {
if (is.null(vintage)) {
apiurl <- paste("https://api.census.gov/data", name, sep="/")
} else {
apiurl <- paste("https://api.census.gov/data", vintage, name, sep="/")
}
# Handle messy urls
lastchar <- substr(apiurl, nchar(apiurl), nchar(apiurl))
if (lastchar=="?" | lastchar=="/") {
apiurl <- substr(apiurl, 1, nchar(apiurl)-1)
}
apiurl
}
# Check for key in environment
key_env <- Sys.getenv("CENSUS_KEY")
if ((key_env == "" & key == key_env)) {
stop("'key' argument is missing. A Census API key is required and can be requested at https://api.census.gov/data/key_signup.html.\nPlease add your Census key to your .Renviron - see instructions at https://github.com/hrecht/censusapi#api-key-setup")
}
apiurl <- constructURL(name, vintage)
# Census API max vars per call = 50
# Splitting function based on work by Nicholas Nagle, https://rpubs.com/nnnagle/19337
if(length(vars)>50){
# Split vars into list
vars <- split(vars, ceiling(seq_along(vars)/50))
get <- lapply(vars, function(x) paste(x, sep='', collapse=","))
data <- lapply(get, function(x) getFunction(apiurl, name, key, x, region, regionin, time, year, date, period, monthly, show_call, convert_variables, category_code, data_type_code, naics, pscode, naics2012, naics2007, naics2002, naics1997, sic, ...))
data <- Reduce(function(x, y) merge(x, y, all = TRUE, sort = FALSE), data)
} else {
get <- paste(vars, sep='', collapse=',')
data <- getFunction(apiurl, name, key, get, region, regionin, time, year, date, period, monthly, show_call, convert_variables, category_code, data_type_code, naics, pscode, naics2012, naics2007, naics2002, naics1997, sic, ...)
}
# If there are any duplicate columns (ie if you put a variable in vars twice) remove the duplicates
data <- data[, !duplicated(colnames(data))]
# Reorder columns so that lowercase column names (geographies) are first
data <- data[,c(which(grepl("[a-z]", colnames(data))), which(!grepl("[a-z]", colnames(data))))]
return(data)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.