#' Get general information about available datasets
#'
#' Scrapes <https://api.census.gov/data.json> and returns a dataframe that
#' includes columns for dataset title, description, name, vintage, url, dataset
#' type, and other useful fields.
#'
#' @family metadata
#' @param name Optional complete or partial API dataset programmatic name. For
#' example, "acs", "acs/acs5", "acs/acs5/subject". If using a partial name,
#' this needs to be the left-most part of the dataset name before `/`, e.g.
#' "timeseries/eits" or "dec" or "acs/acs5".
#' @param vintage Optional vintage (year) of dataset.
#' @returns A data frame with the following columns:
#' * title: Short written description of the dataset.
#' * name: Programmatic name of the dataset.
#' * vintage: Year of the survey, for use with microdata and aggregate datasets.
#' * type: Dataset type, which is either "Aggregate", "Microdata", or "Timeseries".
#' * temporal: Time period of the dataset. Warning: not always documented.
#' * spatial: Spatial region of the dataset. Warning: not always documented.
#' * url: Base URL of the dataset endpoint.
#' * modified: Date last modified. Warning: sometimes out of date.
#' * description: Long written description of the dataset.
#' * contact: Email address for specific questions about the Census Bureau survey.
#' @examplesIf has_api_key()
#' # Get information about every dataset available in the APIs
#' apis <- listCensusApis()
#' head(apis)
#'
#' # Get information about all vintage 2022 datasets
#' apis_2022 <- listCensusApis(vintage = 2022)
#' head(apis_2022)
#'
#' # Get information about all timeseries datasets
#' apis_timeseries <- listCensusApis(name = "timeseries")
#' head(apis_timeseries)
#'
#' # Get information about 2020 Decennial Census datasets
#' apis_decennial_2020 <- listCensusApis(name = "dec", vintage = 2020)
#' head(apis_decennial_2020)
#'
#' # Get information about one particular dataset
#' api_sahie <- listCensusApis(name = "timeseries/healthins/sahie")
#' head(api_sahie)
#'
#' @export
listCensusApis <- function(name = NULL,
vintage = NULL) {
constructURL <- function(name, vintage) {
# Get data.json
if (is.null(name) & is.null(vintage)) {
u <- "https://api.census.gov/data.json"
} else if (is.null(vintage) & !is.null(name)) {
u <- paste0("https://api.census.gov/data/", name, ".json")
} else if (is.null(name) & !is.null(vintage)) {
u <- paste0("https://api.census.gov/data/", vintage, ".json")
} else if (!is.null(name) & !is.null(vintage)) {
u <- paste0("https://api.census.gov/data/", vintage, "/", name, ".json")
}
}
# Return API's built in error message if invalid call
apiCheck <- function(req) {
if (!(req$status_code %in% c(200, 201, 202))) {
if (req$status_code == 404) {
stop(paste("Invalid metadata request, (404) not found.",
"\n Your API call was: ", print(req$url)), call. = FALSE)
} else if (req$status_code==400) {
stop(paste("The Census Bureau returned the following error message:\n", req$error_message,
"\n Your API call was: ", print(req$url)))
} else if (req$status_code==204) {
stop(paste("204, no content was returned. \n Your API call was: ", print(req$url)), call. = FALSE)
} else if (identical(httr::content(req, as = "text"), "")) {
stop(paste("No output to parse. \n Your API call was: ", print(req$url)), call. = FALSE)
}
}
}
apiParse <- function (req) {
if (jsonlite::validate(httr::content(req, as="text"))[1] == FALSE) {
error_message <- (gsub("<[^>]*>", "", httr::content(req, as="text")))
stop(paste("The Census Bureau returned the following error message:\n", error_message, "\nYour api call was: ", req$url))
} else {
raw <- jsonlite::fromJSON(httr::content(req, as = "text"))
}
}
u <- constructURL(name = name, vintage = vintage)
req <- httr::GET(u)
# Check the API call for a valid response
apiCheck(req)
# If check didn't fail, parse the content
raw <- apiParse(req)
#raw <- jsonlite::fromJSON(u)
datasets <- jsonlite::flatten(raw$dataset)
# Format variable names and values
colnames(datasets) <- gsub("c_", "", colnames(datasets))
datasets$name <- apply(datasets, 1, function(x) paste(x$dataset, collapse = "/"))
datasets$url <- apply(datasets, 1, function(x) x$distribution$accessURL)
names(datasets)[names(datasets) == "contactPoint.hasEmail"] <- "contact"
# Add a dataset type variable built from binary variables
# If requesting a subset of datasets it won't have all of these fields so assign
# values one by one
datasets$type <- NA
if ("isMicrodata" %in% colnames(datasets)) {
datasets$type <- ifelse(datasets$isMicrodata %in% TRUE, "Microdata", datasets$type)
}
if ("isAggregate" %in% colnames(datasets)) {
datasets$type <- ifelse(datasets$isAggregate %in% TRUE, "Aggregate", datasets$type)
}
if ("isTimeseries" %in% colnames(datasets)) {
datasets$type <- ifelse(datasets$isTimeseries %in% TRUE, "Timeseries", datasets$type)
}
# Keep only valuable columns - many are not useful (empty or the same for all datasets)
if ("vintage" %in% colnames(datasets)) {
dt <- datasets[, c("title", "name", "vintage", "type", "temporal",
"spatial", "url", "modified", "description", "contact")]
# Give some logic to the row ordering
dt <- dt[order(-dt$vintage, dt$name),]
} else {
dt <- datasets[, c("title", "name", "type", "temporal", "spatial",
"url", "modified", "description", "contact")]
dt <- dt[order(dt$name),]
}
dt$contact <- gsub("mailto:", "", dt$contact)
rownames(dt) <- NULL
return(dt)
}
#' Get metadata about a specified API endpoint
#'
#' Get information about a Census Bureau API dataset, including its available
#' variables, geographies, variable groups, and value labels
#'
#' @family metadata
#' @param name API programmatic name - e.g. acs/acs5. Use `listCensusApis()` to
#' see valid dataset names.
#' @param vintage Vintage (year) of dataset. Not required for timeseries APIs.
#' @param type Type of metadata to return. Options are:
#' * "variables" (default) - list of variable names and descriptions
#' for the dataset.
#' * "geographies" - available geographies.
#' * "groups" - available variable groups. Only available
#' for some datasets.
#' * "values" - encoded value labels for a given variable. Pair with
#' "variable_name". Only available for some datasets.
#' @param group An optional variable group code, used to return metadata for a
#' specific group of variables only. Variable groups are not used for all
#' APIs.
#' @param variable_name A name of a specific variable used to return value
#' labels for that variable. Value labels are not used for all APIs.
#' @param include_values Use with `type = "variables"`. Include value metadata
#' for all variables in a dataset if value metadata exists. Default is
#' "FALSE".
#' @returns A data frame with metadata about the specified API endpoint.
#' @examplesIf has_api_key()
#' # type: variables # List the variables available in the Small Area
#' # Health Insurance Estimates.
#' variables <- listCensusMetadata(
#' name = "timeseries/healthins/sahie", type = "variables")
#' head(variables)
#'
#' # type: variables for a single variable group
#' # List the variables that are included in the B17020 group in the
#' # 5-year American Community Survey.
#' variable_group <- listCensusMetadata(
#' name = "acs/acs5", vintage = 2022,
#' type = "variables", group = "B17020")
#' head(variable_group)
#'
#' # type: variables, with value labels
#' # Create a data dictionary with all variable names and encoded values
#' # for a microdata API.
#' variable_values <- listCensusMetadata(
#' name = "cps/voting/nov",
#' vintage = 2020,
#' type = "variables",
#' include_values = TRUE)
#' head(variable_values)
#'
#' # type: geographies
#' # List the geographies available in the 5-year American Community Survey.
#' geographies <- listCensusMetadata(
#' name = "acs/acs5",
#' vintage = 2022,
#' type = "geographies")
#' head(geographies)
#'
#' # type: groups
#' # List the variable groups available in the 5-year American
#' # Community Survey.
#' groups <- listCensusMetadata(
#' name = "acs/acs5",
#' vintage = 2022,
#' type = "groups")
#' head(groups)
#'
#' # type: values for a single variable
#' # List the value labels of the NAICS2017 variable in the County
#' # Business Patterns dataset.
#' naics_values <- listCensusMetadata(
#' name = "cbp",
#' vintage = 2021,
#' type = "values",
#' variable = "NAICS2017")
#' head(naics_values)
#'
#' @export
listCensusMetadata <-
function(name,
vintage = NULL,
type = "variables",
group = NULL,
variable_name = NULL,
include_values = FALSE) {
constructURL <- function(name, vintage) {
if (is.null(vintage)) {
apiurl <- paste("https://api.census.gov/data", name, sep="/")
} else {
apiurl <- paste("https://api.census.gov/data", vintage, name, sep="/")
}
# Handle messy urls
lastchar <- substr(apiurl, nchar(apiurl), nchar(apiurl))
if (lastchar=="?" | lastchar=="/") {
apiurl <- substr(apiurl, 1, nchar(apiurl)-1)
}
apiurl
}
# Return API's built in error message if invalid call
apiCheck <- function(req) {
if (!(req$status_code %in% c(200, 201, 202))) {
if (req$status_code == 404) {
stop(paste("Invalid metadata request, (404) not found.",
"\n Your API call was: ", print(req$url)), call. = FALSE)
} else if (req$status_code==400) {
stop(paste("The Census Bureau returned the following error message:\n", req$error_message,
"\n Your API call was: ", print(req$url)))
} else if (req$status_code==204) {
stop(paste("204, no content was returned. \n Your API call was: ", print(req$url)), call. = FALSE)
} else if (identical(httr::content(req, as = "text"), "")) {
stop(paste("No output to parse. \n Your API call was: ", print(req$url)), call. = FALSE)
}
}
}
apiParse <- function (req) {
if (jsonlite::validate(httr::content(req, as="text"))[1] == FALSE) {
error_message <- (gsub("<[^>]*>", "", httr::content(req, as="text")))
stop(paste("The Census Bureau returned the following error message:\n", error_message, "\nYour api call was: ", req$url))
} else {
raw <- jsonlite::fromJSON(httr::content(req, as = "text"))
}
}
apiurl <- constructURL(name, vintage)
if (type %in% c("variables")) {
# Too nested and irregular for automatic conversion
if (!is.null(group)) {
u <- paste(apiurl, "/groups/", group, ".json", sep="")
req <- httr::GET(u)
# Check the API call for a valid response
apiCheck(req)
# If check didn't fail, parse the content
raw <- apiParse(req)
cols <- unique(unlist(lapply(raw$variables, names)))
makeDf <- function(d) {
df <- data.frame(d)
df[, setdiff(cols, names(df))] <- NA
return(df)
}
dts <- lapply(raw$variables, function(x) {makeDf(x)})
} else {
u <- paste(apiurl, "variables.json", sep="/")
req <- httr::GET(u)
# Check the API call for a valid response
apiCheck(req)
# If check didn't fail, parse the content
raw <- apiParse(req)
# JSON of variables has irregular structure that gets standardized in the HTML view
# Particularly the datetime filed used in some APIs
# Generally, predicateOnly = parameter, exclude predicateOnly (parameters)
# Manual fill with NAs as needed to avoid adding a dplyr::bind_rows or similar dependency
# Get the list of possible column names
cols <- unique(unlist(lapply(raw$variables, names)))
# Remove invalid dashes in variable names - problem present in Microdata APIs
cols <- gsub("-", "_", cols)
if (include_values == FALSE | !("values" %in% cols)) {
# Warn the user if they've asked for value labels but none are present
if (include_values == TRUE & !("values" %in% cols)) {
warning("You've set `include_values` to TRUE but this dataset does not contain variable values. Variable values will not be returned")
}
cols <- cols[!(cols %in% c("validValues", "values", "datetime"))]
# Remove attributes that have nested lists
makeDf <- function(d) {
names(d) <- gsub("-", "_", names(d))
if ("validValues" %in% names(d)) {
d$validValues <- NULL
}
if ("values" %in% names(d)) {
d$values <- NULL
}
if ("datetime" %in% names(d)) {
d$datetime <- NULL
}
df <- data.frame(d)
df[, setdiff(cols, names(df))] <- NA
return(df)
}
dts <- lapply(raw$variables, makeDf)
} else if (include_values == TRUE) {
# Prepare for value code and label if the value metadata is present
if ("values" %in% cols) {
# print("VALUES ARE PRESENT")
cols <- c(cols, "values_code", "values_label")
cols <- cols[cols != "values"]
}
makeDf <- function(d) {
names(d) <- gsub("-", "_", names(d))
# As of right now, not using the "range" metadata in some of the microdata,
# only item labels
if ("values" %in% names(d) & "item" %in% names(d$values)) {
# print(("YES VALUES META")
# Make data frame of value labels
temp_vals <- utils::stack(d$values$item)
# Column cleaning
colnames(temp_vals) <- c("label", "code")
temp_vals <- temp_vals[, c("code", "label")]
# Use character, not factor
temp_vals$code <- as.character(temp_vals$code)
# Assign back to parent
d$values <- temp_vals
df <- as.data.frame(d)
names(df) <- gsub("\\.", "_", names(df))
} else {
# print("NO VALUES META")
# Set values to null in case it exists but without `item` labels
d$values <- NULL
df <- as.data.frame(d)
}
df[, setdiff(cols, names(df))] <- NA
return(df)
}
dts <- lapply(raw$variables, makeDf)
}
}
temp <- Filter(is.data.frame, dts)
dt <- do.call(rbind, temp)
# Clean up row names aka variable names
dt <- cbind(name = row.names(dt), dt)
row.names(dt) <- NULL
# If there are periods in the name field from concatenated numbers, remove
dt$name <- gsub("\\..*", "", dt$name)
dt[] <- lapply(dt, as.character)
} else if (type %in% c("geographies", "geography")) {
u <- paste(apiurl, "geography.json", sep="/")
req <- httr::GET(u)
# Check the API call for a valid response
apiCheck(req)
# If check didn't fail, parse the content
raw <- apiParse(req)
dt <- raw$fips
} else if (type %in% c("groups")) {
u <- paste(apiurl, "groups.json", sep="/")
req <- httr::GET(u)
# Check the API call for a valid response
apiCheck(req)
# If check didn't fail, parse the content
raw <- apiParse(req)
dt <- raw[[1]]
if (is.null(dim(dt))) {
stop("Groups are not available for the selected API endpoint.")
}
} else if (type == "values") {
u <- paste0(apiurl, "/variables/", variable_name, ".json")
req <- httr::GET(u)
# Check the API call for a valid response
apiCheck(req)
# If check didn't fail, parse the content
raw <- apiParse(req)
if (length(raw$values) == 0 | !("item" %in% names(raw$values))) {
stop(paste("Value labels are not available for the selected variable:", variable_name))
}
dt <- utils::stack(raw$values$item)
colnames(dt) <- c("label", "code")
dt <- dt[, c("code", "label")]
} else {
stop(paste('For "type", you entered: "', type, '". Did you mean "variables", "geographies", "groups", or "values"?', sep = ""))
}
return(dt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.