# 1/18/2018 : clean up
# package functions ============================================================
#' Read decennial census data
#'
#' @description This function retrieves data from summary file 1 (with urban/rural update)
#' of decennial censuses. In addition to selected geographic headers and table contents,
#' it also returns total population and coordinates of selected geographic
#' areas, as well as summary levels and geographic components.
#'
#'
#' @param year year of the decennial census
#' @param states vector of state abbreviations, for example "IN" or c("MA", "RI").
#' @param table_contents selected references of contents in census tables. Users
#' can choose a name for each reference, such as in
#' c("abc = PCT012F139", "fff = P0030008", "rural_p = P0020005").
#' Try to make names meaningful. To find the references of table contents
#' of interest, search with function \code{\link{search_tablecontents}}.
#' @param areas For metro area, in the format like "New York metro".
#' For county, city, or town, must use the exact name as those in
#' \code{\link{dict_fips}} in the format like "kent county, RI",
#' "Boston city, MA", and "Lincoln town, RI". And special examples like
#' "Salt Lake City city, UT" must keep the "city" after "City".
#' @param geo_headers vector of references of selected geographci headers to be
#' included in the return. Search with \code{\link{search_geoheaders}}
#' @param summary_level select which summary level to keep, default to keep all. It takes strings
#' including "state", "county", "county subdivision", "place", "tract", "block group",
#' and "block" for the most common levels. It also take code for level. Search all codes with
#' \code{\link{search_summarylevels}}.
#' @param geo_comp select which geographic component to keep, "*" to keep every geo-component,
#' "total" for "00", "urban" for "01", "urbanized area" for "04",
#' "urban cluster" for "28", "rural" for "43". For all other geographic component,
#' use code,
#' which can be found with \code{\link{search_geocomponents}}. Availability
#' of geocomponent depends on summary level. State level contains all
#' geographic component. County subdivision and higher level have "00",
#' "01", and "43". Census tract and lower level have only "00".
#' @param show_progress show progress of file reading if TRUE. Turn off if FALSE, which
#' is useful in RMarkdown output.
#'
#' @return A data.table whose columns include the selected geoheaders and
#' table contents plus SUMLEV, GEOCOMP, and state.
#'
#'
#' @examples
#' \dontrun{
#' # read one table and one area from one state
#' aaa = read_decennial(
#' year = 2010,
#' states = "UT",
#' table_contents = c("urban = P0020002", "rural = P0020005"),
#' geo_headers = "CBSA",
#' summary_level = "tract"
#' )
#'
#'
#' # read multiple table contents and areas from multiple states
#' bbb = read_decennial(
#' year = 2010,
#' states = c("UT", "RI"),
#' table_contents = c("urban = P0020002", "rural = P0020005"),
#' areas = c(
#' "place = ut62360",
#' "Providence city, RI",
#' "cousub = ri41500",
#' "cbsa = 39300"
#' ),
#' summary_level = "block"
#' )
#'
#'
#' # read table contents of all county subdivisions in Providence metro
#' ccc <- read_decennial(
#' year = 2010,
#' states = "US",
#' table_contents = c("urban = P0020002", "rural = P0020005"),
#' geo_headers = "CBSA",
#' summary_level = "county subdivision",
#' geo_comp = "*"
#' )
#' }
#'
#' @export
#'
read_decennial <- function(year,
states,
table_contents = NULL,
areas = NULL,
geo_headers = NULL,
summary_level = NULL,
geo_comp = "total",
show_progress = TRUE){
# check if the path to census is set
if (Sys.getenv("PATH_TO_CENSUS") == ""){
message(paste(
"Please set up the path to downloaded census data, following the instruction at",
"https://github.com/GL-Li/totalcensus."
))
return(NULL)
}
if (is.null(summary_level)) summary_level <- "*"
# allow lowerscase input
states <- toupper(states)
# check whether to download data
path_to_census <- Sys.getenv("PATH_TO_CENSUS")
# check if need to download generated data from census2010
generated_data <- paste0(path_to_census, "/generated_data")
if (!file.exists(generated_data)){
download_generated_data()
} else {
version_file <- paste0(generated_data, "/version.txt")
if (!file.exists(version_file)){
download_generated_data()
} else {
version = readChar(version_file, 5)
if (version != "0.6.0"){
download_generated_data()
}
}
}
# check whether to download census data
if (year == 2010) {
ext <- ".ur1"
} else if (year == 2000){
ext <- ".uf1"
}
not_downloaded <- c()
for (st in states){
geo_file <- paste0(
path_to_census, "/census", year, "/", st, "/", tolower(st), "geo",
year, ext
)
if (!file.exists(geo_file)){
not_downloaded <- c(not_downloaded, st)
}
}
if (length(not_downloaded) > 0){
cat(paste0(
"Do you want to download decennial census ",
year,
" summary files of states ",
paste0(not_downloaded, collapse = ", "),
" and save it to your computer?. It is necessary for extracting the data."
))
continue <- switch(
menu(c("yes", "no")),
TRUE,
FALSE
)
if (continue){
download_census("decennial", year, not_downloaded)
} else {
stop("You choose not to download data.")
}
}
# turn off warning, fread() gives warnings when read non-scii characters.
options(warn = -1)
if (is.null(areas) + is.null(geo_headers) == 0){
stop("Must keep at least one of arguments areas and geo_headers NULL")
}
# add population to table contents so that it will never empty
# if (any(grepl("P0010001", table_contents))){
# message("P0010001 is the population column.")
# }
# table_contents <- table_contents[!grepl("P0010001", table_contents)]
# table_contents <- c("population = P0010001", table_contents) %>%
# unique()
if (!is.null(table_contents)){
content_names <- organize_tablecontents(table_contents) %>%
.[, name]
table_contents <- organize_tablecontents(table_contents) %>%
.[, reference]
}
if (!is.null(areas)){
dt <- read_decennial_areas_(
year, states, table_contents, areas, summary_level, geo_comp,
show_progress
)
} else {
dt <- read_decennial_geoheaders_(
year, states, table_contents, geo_headers, summary_level, geo_comp,
show_progress
)
}
if (!is.null(table_contents)){
setnames(dt, table_contents, content_names)
}
options(warn = 0)
return(dt)
}
# internal functions ============================================================
read_decennial_areas_ <- function(year,
states,
table_contents = NULL,
areas,
summary_level = "*",
geo_comp = "total",
show_progress = TRUE){
# read decennial census data of selected areas
#
# Args_____
# year : year of the decennial
# states : vector of abbreviations of states such as c("MA", "RI")
# table_contents : vector of reference of available table contents
# areas : For metro area, in the format like "New York metro".
# For county, city, or town, must use the exact name as those in
# \code{\link{dict_fips}} in the format like "kent county, RI",
# "Boston city, MA", and "Lincoln town, RI". And special examples like
# "Salt Lake City city, UT" must keep the "city" after "City".
# summary_level : summary level like "050"
# geo_comp : geographic component such as "00", "01", and "43"
# show_progress : whether to show progress in fread()
#
# Return_____
# A data.table
#
# Examples_____
# aaa = read_decennial_areas_(
# year = 2010,
# states = c("ri", "MA"),
# table_contents = c("P0030003", "P0080036", "PCT012G002", "PCT012G181"),
# areas = c("Lincoln town, ri", "PLACE = RI59000", "providence metro"),
# summary_level = "block"
# )
#
# bbb <- read_decennial_areas_(
# year = 2010,
# states = c("ut", "ri"),
# table_contents = c("P0030003", "P0080036", "PCT012G002", "PCT012G181"),
# areas = c("Lincoln Town, RI",
# "Kent county, RI",
# "Salt Lake City city, UT",
# "Salt Lake metro",
# "Providence city, RI"),
# summary_level = "block group"
# )
#=== prepare arguments ===
# convert areas to the form of data.table
# geoheader code state name
# 1: PLACE 62360 UT Providence city, UT
# 2: COUNTY 005 RI Newport County, RI
dt_areas <- convert_areas(areas)
states <- toupper(states)
# toupper(NULL) ---> character(0) will cause trouble
if (!is.null(table_contents)) table_contents <- toupper(table_contents)
# this is used to extract geographic headers
if (!is.null(areas)) geo_headers <- unique(dt_areas[, geoheader])
# switch summary level and geocomponent
summary_level <- switch_summarylevel(summary_level, year)
geo_comp <- switch_geocomp(geo_comp)
# lookup of the year
lookup <- get(paste0("lookup_decennial_", year))
for (content in table_contents) {
if (!tolower(content) %in% tolower(lookup$reference)){
stop(paste("The table content reference", content, "does not exist."))
}
}
# === read files ===
lst_state <- list()
for (st in states) {
# read geography. do NOT read geo_headers from decennial data, instead read
# from GEOID_coord_XX later on, which is generated from Census 2010 and
# has much more geo_header data
geo <- read_decennial_geo_(year, st,
c(geo_headers, "STATE"),
summary_level,
show_progress = show_progress) %>%
# convert STATE fips to state abbreviation
.[, state := convert_fips_to_names(STATE)] %>%
setkey(LOGRECNO)
# read data from each file
if(!is.null(table_contents)){
# get files for table contents
dt <- read_decennial_tablecontents_(year, st, table_contents,
show_progress)
decennial <- merge(geo, dt)
} else {
decennial <- geo
}
# To determine what PLACE or COUSUB a tract or block group (partially)
# blongs, replace PLACE and COUSUB with those obtained from census 2010
# in generate_geoid_coordinate.R
decennial <- add_geoheader(decennial, st, geo_headers, summary_level,
survey = "decennial")
lst_state[[st]] <- decennial[SUMLEV %like% summary_level & GEOCOMP %like% geo_comp]
}
combined <- rbindlist(lst_state) %>%
.[, ":=" (LOGRECNO = NULL, STATE = NULL)] %>%
convert_geocomp_name() %>%
# convert NA in state to nothing for selection below
.[is.na(state), state := ""]
# select data for argument areas
selected <- map(
1:nrow(dt_areas),
function(x) combined[get(dt_areas[x, geoheader]) %like% dt_areas[x, code] &
state %like% dt_areas[x, state]] %>%
.[, area := dt_areas[x, name]]
) %>%
rbindlist() %>%
# no use of the geoheaders
.[, unique(dt_areas[, geoheader]) := NULL] %>%
convert_geocomp_name()
# add acs_NAME
if (summary_level != "*"){
selected <- add_acsname(selected)
}
if (summary_level != "*"){
begin <- c("area", "GEOID", "NAME", "acs_NAME", "population")
} else {
begin <- c("area", "NAME", "population")
}
end <- c("GEOCOMP", "SUMLEV", "state", "STUSAB", "lon", "lat")
# reorder columns
# begin <- c("area", "lon", "lat")
# end <- c("GEOCOMP", "SUMLEV")
# setcolorder(selected, c(begin, setdiff(names(selected), c(begin, end)), end))
setcolorder(selected, c(begin, table_contents, end))
return(selected)
}
read_decennial_geoheaders_ <- function(year,
states,
table_contents = NULL,
geo_headers = NULL,
summary_level = "*",
geo_comp = "*",
show_progress = TRUE){
# read decennial census data of selected geoheaders
#
# Args_____
# year : end year of the 5-year survey
# states : vector of abbreviations of states such as c("MA", "RI")
# table_contents : vector of reference of available table contents
# geo_headers : vetor of geographic header such as c("PLACE", "CBSA")
# summary_level : summary level like "050"
# geo_comp : geographic component such as "00", "01", and "43"
# show_progress : whether to show progress in fread()n
#
# Return_____
# A data.table
#
# Examples_____
# aaa = read_decennial_geoheaders_(
# year = 2010,
# states = c("ri", "MA"),
# table_contents = c("P0030003", "P0080036", "PCT012G002", "PCT012G181"),
# geo_headers = c("COUSUB", "PLACE", "CBSA"),
# summary_level = "block"
# )
#
# bbb <- read_decennial_geoheaders_(
# year = 2010,
# states = c("ut", "ri"),
# table_contents = c("P0030003", "P0080036", "PCT012G002", "PCT012G181"),
# geo_headers = c("COUSUB", "PLACE", "CBSA"),
# summary_level = "block group"
# )
#=== prepare arguments ===
states <- toupper(states)
# toupper(NULL) ---> character(0) will cause trouble
if (!is.null(table_contents)) table_contents <- toupper(table_contents)
if (!is.null(geo_headers)) geo_headers <- toupper(geo_headers)
# switch summary level to code
summary_level <- switch_summarylevel(summary_level, year)
geo_comp <- switch_geocomp(geo_comp)
# lookup of the year
lookup <- get(paste0("lookup_decennial_", year))
for (content in table_contents) {
if (!tolower(content) %in% tolower(lookup$reference)){
stop(paste("The table content reference", content, "does not exist."))
}
}
# === read files ===
lst_state <- list()
for (st in states) {
# read geography. do NOT read geo_headers from decennial data, instead read
# from GEOID_coord_XX later on, which is generated from decennial 2010 and
# has much more geo_header data
geo <- read_decennial_geo_(year, st,
c(geo_headers, "STATE"),
summary_level,
show_progress = show_progress) %>%
# convert STATE fips to state abbreviation
.[, state := convert_fips_to_names(STATE)] %>%
setkey(LOGRECNO)
# read data from each file
if(!is.null(table_contents)){
# get files for table contents, follow the notation of read_tablecontent.R
dt <- read_decennial_tablecontents_(year, st, table_contents,
show_progress)
decennial <- merge(geo, dt)
} else {
decennial <- geo
}
# To determine what PLACE or COUSUB a tract or block group (partially)
# blongs, replace PLACE and COUSUB with those obtained from decennial 2010
# in generate_census_data.R
decennial <- add_geoheader(decennial, st, geo_headers, summary_level,
survey = "decennial")
lst_state[[st]] <- decennial[SUMLEV %like% summary_level & GEOCOMP %like% geo_comp]
}
combined <- rbindlist(lst_state) %>%
.[, LOGRECNO := NULL] %>%
convert_geocomp_name()
if (!"STATE" %in% geo_headers){
combined[, STATE := NULL]
}
# add acs_NAME
if (summary_level != "*"){
combined <- add_acsname(combined)
}
if (summary_level != "*"){
begin <- c("GEOID", "NAME", "acs_NAME")
} else {
begin <- c("NAME")
}
end <- c("GEOCOMP", "SUMLEV", "state", "STUSAB", "lon", "lat")
setcolorder(combined,
c(begin, geo_headers, "population", table_contents, end))
return(combined)
}
# Read geographic header record file of ONE state
read_decennial_geo_ <- function(year,
state,
geo_headers = NULL,
summary_level = "*",
show_progress = TRUE) {
# This function reads the geographic record file of one state and returns a
# data.table with columns of LOGRECNO, SUNLEV, GEOCOMP plus geoheaders
# given in argument geo_headers. In addition, it generate GEOID for
# summary_levels used in ACS summary files with utility function add_geoid()
#=== prepare arguments ===
path_to_census <- Sys.getenv("PATH_TO_CENSUS")
state <- toupper(state)
geoheaders_alway_keep <- c("LOGRECNO", "GEOCOMP", "NAME", "POP100",
"STUSAB", "INTPTLON", "INTPTLAT")
if (summary_level != "*"){
geoid_geoheaders <- get_geoheaders_of_summarylevel(summary_level)
geoheaders <- c(toupper(geo_headers), geoheaders_alway_keep,
geoid_geoheaders) %>%
unique() %>%
# SUMLEV processed seperately
setdiff("SUMLEV")
} else {
geoheaders <- c(toupper(geo_headers), geoheaders_alway_keep) %>%
unique() %>%
setdiff("SUMLEV")
}
if (show_progress) {
cat(paste("Reading", state, "geographic header record file\n"))
}
#=== read files and select columns ===
if (year == 2010){
file_extension = ".ur1"
dict_geoheader = dict_decennial_geoheader_2010
} else if (year == 2000){
file_extension = ".uf1"
dict_geoheader = dict_decennial_geoheader_2000
}
file <- paste0(path_to_census, "/census", year, "/", state, "/", tolower(state),
"geo", year, file_extension)
# use "Latin-1" for encoding special spanish latters such as ñ in Cañada
geo <- fread(file, header = FALSE, sep = "\n", encoding = "Latin-1" ,
showProgress = show_progress) %>%
# select summary level here
.[, SUMLEV := str_sub(V1, 9, 11)] %>%
.[SUMLEV %like% summary_level]
# add all selected geoheaders to output
for (ref in geoheaders) {
# identify numeric geohearders
if (ref %in% c("LOGRECNO", "INTPTLAT", "INTPTLON", "AREALAND",
"AREAWATR", "POP100", "HU100")) {
# place variable in () to add new columns
geo[, (ref) := as.numeric(str_sub(
geo[, V1],
dict_geoheader[reference == ref, start],
dict_geoheader[reference == ref, end]
))]
} else {
geo[, (ref) := str_trim(str_sub(
geo[, V1],
dict_geoheader[reference == ref, start],
dict_geoheader[reference == ref, end]
))]
}
}
geo[, V1 := NULL]
setnames(geo, c("POP100", "INTPTLON", "INTPTLAT"),
c("population", "lon", "lat"))
if (summary_level != "*"){
add_geoid(geo, summary_level)
gh_to_keep <- c("LOGRECNO", "GEOID", "NAME", "population", "STUSAB",
geo_headers, "GEOCOMP", "SUMLEV", "lon", "lat") %>%
unique()
} else {
gh_to_keep <- c("LOGRECNO", "NAME", "population", "STUSAB",
geo_headers, "GEOCOMP", "SUMLEV", "lon", "lat") %>%
unique()
}
setkey(geo, LOGRECNO)
return(geo[, gh_to_keep, with = FALSE])
}
# read selected table content of a state
# These contents can be from different tables, for example,
# c("PCT012F139", "P0030008", "P0100059", "P0150008", "H0070016", "PCT012F138").
# The rows of short contents that have no corresponding LOGRECNOs are filled with
# NAs.
read_decennial_1_file_tablecontents_ <- function(year,
state,
file_seg,
table_contents = NULL,
show_progress = TRUE){
# Read selected table content in one data file of a state and return a
# data.table, which has columns of LOGRECNO and selected table contents
# Args_____
# year : integrer, year of the decennial census, such as 2010
# state : string, abbreviation of a state, such as "IN"
# filg_seg : segment number of file such as "01" and "34"
# table_contents : vector of references of table contents in the same file
# segment, for example c("P0030001", "P0030008")
# show_progress : wheather to show progress in fread()
#
# Return_____
# a data.table keyed with LOGRECNO
#=== prepare arguments ===
path_to_census <- Sys.getenv("PATH_TO_CENSUS")
#=== read data ===
lookup <- get(paste0("lookup_decennial_", year))
# determine location of the flds in file_seg
all_contents <- lookup[file_segment == file_seg, reference]
# The line below is a bug, which is not the true order of table contents
# loc <- which(all_contents %in% table_contents)
# the new loc
loc <- integer(length(table_contents))
for (i in 1:length(table_contents)){
idx <- which(all_contents %in% table_contents[i])
loc[i] <- idx
}
cols <- paste0("V", loc)
if (year == 2010){
file_extension = ".ur1"
} else if (year == 2000){
file_extension = ".uf1"
}
file <- paste0(path_to_census, "/census", year, "/", state, "/", tolower(state),
"000", file_seg, year, file_extension)
if (show_progress) {
cat(paste("Reading", state, "file", file_seg, "\n"))
}
# fread assigns column names as "V1", "V2", ... when header = FALSE
dt <- fread(file, header = FALSE, select = c("V5", cols), showProgress = show_progress) %>%
setnames(names(.), c("LOGRECNO", table_contents)) %>%
setkey(LOGRECNO)
return(dt)
}
read_decennial_tablecontents_ <- function(year,
state,
table_contents = NULL,
show_progress = TRUE){
# Read table_contents from multiple files of a state.
# Args_____
# year : integrer, year of the decennial census, such as 2010
# state : string, abbreviation of a state, such as "IN"
# table_contents : vector of references of table contents can be in different
# file segment, for example c("P0030001", "P0030008")
# show_progress : wheather to show progress in fread()
#
# Return_____
# a data.table keyed with LOGRECNO
#
# Example_____
# read_decennial_tablecontents_(
# 2010, "RI",
# table_contents = c("P0150008", "P0030001",
# "P0030003", "P0080036",
# "PCT012G002", "PCT012G181")
# )
# LOGRECNO P0030001 P0030003 P0080036 P0150008 PCT012G002 PCT012G181
# 1: 1 1052567 60189 71 2988 16942 32
# 2: 2 955043 59629 70 2955 16340 31
# 3: 3 952101 59613 70 2955 16315 31
# 4: 4 2942 16 0 0 25 0
# 5: 5 97524 560 1 33 602 1
# ---
# 31758: 31758 29191 635 3 16 343 0
# 31759: 31759 22787 222 0 5 232 0
# 31760: 31760 41186 2621 6 41 889 1
# 31761: 31761 0 0 0 0 0 0
# 31762: 31762 35809 242 0 12 225 0
# locate data files for the content
lookup <- get(paste0("lookup_decennial_", year))
file_content <- lookup_tablecontents(table_contents, lookup)
dt <- purrr::map2(file_content[, file_seg],
file_content[, table_contents],
function(x, y) read_decennial_1_file_tablecontents_(
year, state, file_seg = x, table_contents = y,
show_progress = show_progress
)) %>%
purrr::reduce(merge, all = TRUE)
return(dt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.