R/metadata_functions.R

Defines functions listCensusMetadata listCensusApis

Documented in listCensusApis listCensusMetadata

#' Get useful dataset metadata on all available APIs as a data frame
#'
#' Scrapes {https://api.census.gov/data.json} and returns a dataframe
#' that includes: title, description, name, vintage, url, dataset type, and other useful fields.
#'
#' @keywords metadata
#' @export
#' @examples
#' \dontrun{
#' apis <- listCensusApis()
#' head(apis)
#' }
listCensusApis <- function() {
	# Get data.json
	u <- "https://api.census.gov/data.json"
	raw <- jsonlite::fromJSON(u)
	datasets <- jsonlite::flatten(raw$dataset)

	# Format variable names and values
	colnames(datasets) <- gsub("c_", "", colnames(datasets))
	datasets$name <- apply(datasets, 1, function(x) paste(x$dataset, collapse = "/"))
	datasets$url <- apply(datasets, 1, function(x) x$distribution$accessURL)

	names(datasets)[names(datasets) == "contactPoint.hasEmail"] <- "contact"

	# Add a dataset type variable built from binary variables
	datasets$type <- ifelse(datasets$isMicrodata %in% TRUE , "Microdata",
													ifelse(datasets$isTimeseries %in% TRUE, "Timeseries",
																 ifelse(datasets$isAggregate %in% TRUE, "Aggregate",
																 			 NA)))

	# Keep only valuable columns - many are not useful (empty or the same for all datasets)
	dt <- datasets[, c("title", "name", "vintage", "type", "temporal", "url", "modified", "description", "contact")]
	dt$contact <- gsub("mailto:", "", dt$contact)

	# Give some logic to the row ordering
	dt <- dt[order(-dt$vintage, dt$name),]
	rownames(dt) <- NULL
	return(dt)
}

#' Get information about a specific API as a data frame
#'
#' @param name API programmatic name - e.g. acs/acs5. See list of names with listCensusApis().
#' @param vintage Vintage (year) of dataset. Not required for timeseries APIs
#' @param type Type of metadata to return. Options are:
#'   * `variables` (default) - list of variable names and descriptions for the dataset.
#'   * `geographies` - available geographies.
#'   * `groups` - available variable groups. Not used for all datasets.
#'   * `values` - encoded value labels for a given variable. Pair with
#'      `variable_name`. Not used for all datasets.
#' @param group An optional variable group code, used to return metadata for a specific group
#' of variables only. Variable groups are not used for all APIs.
#' @param variable_name A name of a specific variable used to return value labels for that
#' variable. Value labels are not published for all APIs.
#' @param include_values Use with `type = "variables"`. Include value metadata
#' for all variables in a dataset if value metadata exists. Default is "FALSE".
#' @keywords metadata
#' @examples
#' \dontrun{
#'
#' # List the variables available in the Small Area Health Insurance Estimates.
#' sahie_variables <- listCensusMetadata(
#'   name = "timeseries/healthins/sahie",
#'   type = "variables")
#'  head(sahie_variables)
#'
#' # List the geographies available in the 5-year 2020 American Community Survey.
#' acs_geographies <- listCensusMetadata(
#'   name = "acs/acs5",
#'   vintage = 2020,
#'   type = "geographies")
#'  head(acs_geographies)
#'
#' # List the variable groups available in the 5-year 2020 American Community Survey.
#' acs_groups <- listCensusMetadata(
#'   name = "acs/acs5",
#'   vintage = 2020,
#'   type = "groups")
#'  head(acs_groups)

#' # Create a data dictionary with all variable names and encoded values for
#' # a microdata API.
#' cbp_dict <- listCensusMetadata(
#'   name = "cbp",
#'   vintage = 2020,
#'   type = "variables",
#'   include_values = TRUE)
#'  head(cbp_dict)
#'
#' # List the value labels of the NAICS2017 variable in the 2020 County
#' # Business Patterns dataset.
#' cbp_naics_values <- listCensusMetadata(
#'   name = "cbp",
#'   vintage = 2020,
#'   type = "values",
#'   variable = "NAICS2017")
#'  head(cbp_naics_values)
#'
#' # List of variables that are included in the B17020 group in the
#' # 5-year American Community Survey.
#' group_B17020 <- listCensusMetadata(
#'   name = "acs/acs5",
#'   vintage = 2017,
#'   type = "variables",
#'   group = "B17020")
#'  head(group_B17020)
#' }
#' @export
listCensusMetadata <-
	function(name,
					 vintage = NULL,
					 type = "variables",
					 group = NULL,
					 variable_name = NULL,
					 include_values = FALSE) {

		constructURL <- function(name, vintage) {
			if (is.null(vintage)) {
				apiurl <- paste("https://api.census.gov/data", name, sep="/")
			} else {
				apiurl <- paste("https://api.census.gov/data", vintage, name, sep="/")
			}

			# Handle messy urls
			lastchar <- substr(apiurl, nchar(apiurl), nchar(apiurl))
			if (lastchar=="?" | lastchar=="/") {
				apiurl <- substr(apiurl, 1, nchar(apiurl)-1)
			}
			apiurl
		}

		# Return API's built in error message if invalid call
		apiCheck <- function(req) {
			if (!(req$status_code %in% c(200, 201, 202))) {
				if (req$status_code == 404) {
					stop(paste("Invalid metadata request, (404) not found.",
										 "\n Your API call was: ", print(req$url)), call. = FALSE)
				} else if (req$status_code==400) {
					stop(paste("The Census Bureau returned the following error message:\n", req$error_message,
										 "\n Your API call was: ", print(req$url)))
				} else if (req$status_code==204) {
					stop(paste("204, no content was returned. \n Your API call was: ", print(req$url)), call. = FALSE)
				} else if (identical(httr::content(req, as = "text"), "")) {
					stop(paste("No output to parse. \n Your API call was: ", print(req$url)), call. = FALSE)
				}
			}
		}

		apiParse <- function (req) {
			if (jsonlite::validate(httr::content(req, as="text"))[1] == FALSE) {
				error_message <- (gsub("<[^>]*>", "", httr::content(req, as="text")))
				stop(paste("The Census Bureau returned the following error message:\n", error_message, "\nYour api call was: ", req$url))
			} else {
				raw <- jsonlite::fromJSON(httr::content(req, as = "text"))
			}
		}

		apiurl <- constructURL(name, vintage)

		if (type %in% c("variables")) {
			# Too nested and irregular for automatic conversion

			if (!is.null(group)) {
				u <- paste(apiurl, "/groups/", group, ".json", sep="")

				req <- httr::GET(u)
				# Check the API call for a valid response
				apiCheck(req)

				# If check didn't fail, parse the content
				raw <- apiParse(req)

				cols <- unique(unlist(lapply(raw$variables, names)))
				makeDf <- function(d) {
					df <- data.frame(d)
					df[, setdiff(cols, names(df))] <- NA
					return(df)
				}
				dts <- lapply(raw$variables, function(x) {makeDf(x)})

			} else {
				u <- paste(apiurl, "variables.json", sep="/")
				req <- httr::GET(u)
				# Check the API call for a valid response
				apiCheck(req)

				# If check didn't fail, parse the content
				raw <- apiParse(req)
				# JSON of variables has irregular structure that gets standardized in the HTML view
				# Particularly the datetime filed used in some APIs
				# Generally, predicateOnly = parameter, exclude predicateOnly (parameters)

				# Manual fill with NAs as needed to avoid adding a dplyr::bind_rows or similar dependency

				# Get the list of possible column names
				cols <- unique(unlist(lapply(raw$variables, names)))

				# Remove invalid dashes in variable names - problem present in Microdata APIs
				cols <- gsub("-", "_", cols)

				if (include_values == FALSE | !("values" %in% cols)) {

					# Warn the user if they've asked for value labels but none are present
					if (include_values == TRUE & !("values" %in% cols)) {
						warning("You've set `include_values` to TRUE but this dataset does not contain variable values. Variable values will not be returned")
					}
					cols <- cols[!(cols %in% c("validValues", "values", "datetime"))]

					# Remove attributes that have nested lists
					makeDf <- function(d) {
						names(d) <- gsub("-", "_", names(d))
						if ("validValues" %in% names(d)) {
							d$validValues <- NULL
						}
						if ("values" %in% names(d)) {
							d$values <- NULL
						}
						if ("datetime" %in% names(d)) {
							d$datetime <- NULL
						}
						df <- data.frame(d)

						df[, setdiff(cols, names(df))] <- NA
						return(df)
					}

					dts <- lapply(raw$variables, makeDf)

				} else if (include_values == TRUE) {

			  	# Prepare for value code and label if the value metadata is present
					if ("values" %in% cols) {
						# print("VALUES ARE PRESENT")
						cols <- c(cols, "values_code", "values_label")
						cols <- cols[cols != "values"]
					}

					makeDf <- function(d) {
						names(d) <- gsub("-", "_", names(d))

						# As of right now, not using the "range" metadata in some of the microdata,
						# only item labels
						if ("values" %in% names(d) & "item" %in% names(d$values)) {
							# print(("YES VALUES META")
							# Make data frame of value labels
							temp_vals <- utils::stack(d$values$item)

							# Column cleaning
							colnames(temp_vals) <- c("label", "code")
							temp_vals <- temp_vals[, c("code", "label")]
							# Use character, not factor
							temp_vals$code <- as.character(temp_vals$code)

							# Assign back to parent
							d$values <- temp_vals

							df <- as.data.frame(d)
							names(df) <- gsub("\\.", "_", names(df))

						} else {
							# print("NO VALUES META")
							# Set values to null in case it exists but without `item` labels
							d$values <- NULL

							df <- as.data.frame(d)
						}

						df[, setdiff(cols, names(df))] <- NA
						return(df)
					}

					dts <- lapply(raw$variables, makeDf)
				}
			}

			temp <- Filter(is.data.frame, dts)
			dt <- do.call(rbind, temp)

			# Clean up row names aka variable names
			dt <- cbind(name = row.names(dt), dt)
			row.names(dt) <- NULL
			# If there are periods in the name field from concatenated numbers, remove
			dt$name <- gsub("\\..*", "", dt$name)
			dt[] <- lapply(dt, as.character)


		} else if (type %in% c("geographies", "geography")) {
			u <- paste(apiurl, "geography.json", sep="/")
			req <- httr::GET(u)
			# Check the API call for a valid response
			apiCheck(req)

			# If check didn't fail, parse the content
			raw <- apiParse(req)
			dt <- raw$fips
		} else if (type %in% c("groups")) {
			u <- paste(apiurl, "groups.json", sep="/")
			req <- httr::GET(u)
			# Check the API call for a valid response
			apiCheck(req)

			# If check didn't fail, parse the content
			raw <- apiParse(req)
			dt <- raw[[1]]
			if (is.null(dim(dt))) {
				stop("Groups are not available for the selected API endpoint.")
			}
		} else if (type == "values") {
			u <- paste0(apiurl, "/variables/", variable_name, ".json")
			req <- httr::GET(u)
			# Check the API call for a valid response
			apiCheck(req)

			# If check didn't fail, parse the content
			raw <- apiParse(req)
			if (length(raw$values) == 0 | !("item" %in% names(raw$values))) {
				stop(paste("Value labels are not available for the selected variable:", variable_name))
			}
			dt <- utils::stack(raw$values$item)
			colnames(dt) <- c("label", "code")
			dt <- dt[, c("code", "label")]

		}	else {
			stop(paste('For "type", you entered: "', type, '". Did you mean "variables", "geographies", "groups", or "values"?', sep = ""))
		}
		return(dt)
	}

Try the censusapi package in your browser

Any scripts or data that you put into this service are public.

censusapi documentation built on Aug. 9, 2022, 5:07 p.m.