#' @title Download (if necessary) and merge GEO files for ACS
#'
#' @description
#' Returns a data.frame of all states merged geo info and saves geo.RData in `folder`
#' (the current working directory by default).
#'
#' @details
#' Uses download.geo() then read.geo(), and then does some cleanup.
#'
#' Note that if this finds the geographic file in folder already, it will not download it again even if that file was corrupt.
#'
#' Can read and compile geo data for the entire USA including PR and DC.
#'
#' This takes some time for the entire USA:
#'
#' 2 to 10 minutes, depending.
#'
#' When `new.geo` is FALSE, a previously built geo is reused if one can be found:
#' first a data.frame called `geo` already in memory, otherwise the `geo` stored
#' in geo.RData in `folder`. Objects called `geo` that are not data.frames are ignored.
#' If neither is available, the geo files are downloaded (if necessary) and parsed
#' as if `new.geo` were TRUE. A warning is given if the reused geo does not
#' include every state in `mystates`.
#'
#' @param mystates Character vector of 2-character state abbreviations, required.
#' @param new.geo Logical value, optional, FALSE by default. If FALSE, if geo exists in memory
#'   (or in geo.RData in `folder`) don't download and parse again.
#' @param folder Defaults to current working directory.
#' @param end.year optional character year to specify last year of 5-year summary file.
#' @param testing Default to FALSE. If TRUE, provides info on progress of download.
#' @param silent Default is FALSE. If TRUE, progress messages are not printed.
#'
#' @return Returns a data.frame of all states geo info.
#'
#' ACS 2008-2012, tract and block group counts:
#'
#' table(geo$SUMLEVEL)
#'
#' 140 150
#'
#' 74001 220333
#' @details
#'
#' Remaining fields in geo:
#'
#' "STUSAB" "SUMLEVEL" "GEOID" "FIPS" "KEY"
#'
#' NOTE: do not really need GEOID or KEY.
#'
#' GEOID is redundant, but might be useful for joining to shapefiles/ boundaries
#'
#' Also, could specify here if "NAME" field from geo files should be dropped -
#' it might be useful but takes lots of RAM and encoding of
#' Spanish characters in Puerto Rico caused a problem in Mac OSX.
#'
#' NOTE FROM CENSUS:
#'
#' The ACS Summary File GEOID contains the necessary information to
#' connect to the TIGER/Line Shapefiles,
#' but it needs to be modified in order to exactly match up.
#' Notice that the ACS GEOID, 05000US10001, contains
#' the TIGER/Line GEOID string, 10001.
#' In order to create an exact match of both GEOIDs,
#' it is necessary to
#' remove all of the characters before and including
#' the letter S in the ACS Summary File.
#' By removing these characters, the new GEOID in the
#' ACS Summary File exactly matches the field GEOID in
#' the TIGER/Line Shapefiles.
#' @seealso [get.acs()] which uses this, and [download.geo()]
#'
#' @export
#'
get.read.geo <- function(mystates,
                         new.geo = FALSE,
                         folder = getwd(),
                         end.year = acsdefaultendyearhere_func(),
                         testing = FALSE,
                         silent = FALSE) {
  validate.end.year(end.year)

  # missing() only works on this function's own arguments, so record it here
  # for use further down (a cached geo can be returned without mystates).
  states.given <- !missing(mystates)
  geo.path <- file.path(folder, "geo.RData")

  # Print a progress message (optionally prefixed by a timestamp) unless silent.
  progress <- function(msg, stamp = TRUE) {
    if (!silent) {
      if (stamp) {
        cat(as.character(Sys.time()), ' ')
      }
      cat(msg)
    }
  }

  ############################## #
  # REUSE GEO FROM PRIOR WORK IF ALLOWED AND AVAILABLE
  ############################## #
  if (!new.geo) {
    cached <- NULL
    # Look for `geo` the same way a bare reference to `geo` would be resolved
    # from inside this function, but skip anything that is not list-like
    # (e.g. a function that happens to be called geo), and then insist on a
    # data.frame, so an unrelated object is never returned as the geo table.
    in.memory <- get0("geo", envir = environment(), mode = "list", inherits = TRUE)
    if (is.data.frame(in.memory)) {
      cached <- in.memory
      progress('Using geo info from prior work, already in memory\n')
    } else if (file.exists(geo.path)) {
      # Load into a scratch environment: loading straight into this function's
      # frame would let any object stored in the file overwrite the arguments.
      scratch <- new.env(parent = emptyenv())
      load(geo.path, envir = scratch)
      progress('Loading geo.RData\n', stamp = FALSE)
      if (is.data.frame(scratch[["geo"]])) {
        cached <- scratch[["geo"]]
      }
    }

    if (is.null(cached)) {
      # Said !new.geo, but can't find old geo, so must do all parsing after all.
      # Repeating download.geo() is ok since it checks for need to download.
      progress(
        ' Cannot find geo in memory or disk, so redoing parsing (downloading first if necessary)\n',
        stamp = FALSE
      )
      new.geo <- TRUE
    } else {
      # A geo built earlier for fewer states would otherwise be reused silently.
      if (states.given && "STUSAB" %in% names(cached)) {
        absent <- setdiff(tolower(mystates), tolower(cached[["STUSAB"]]))
        if (length(absent) > 0) {
          warning(
            "Reused geo does not appear to include all requested states (not found: ",
            paste(absent, collapse = ", "),
            "). Use new.geo = TRUE to rebuild it.",
            call. = FALSE
          )
        }
      }
      return(cached)
    }
  }

  ############################## #
  # DOWNLOAD (IF NEEDED) AND PARSE NEW GEO
  ############################## #
  progress("Started to download geo files \n")
  # note this won't re-download geofiles that have already been downloaded into folder
  download.geo(
    mystates,
    end.year = end.year,
    folder = folder,
    testing = testing,
    silent = silent
  )
  # e.g., download.geo( c("pr", "dc") )
  progress("Finished downloading geo files \n")

  progress("Started parsing geo files \n")
  geo <- read.geo(mystates,
                  folder = folder,
                  end.year = end.year,
                  silent = silent)
  progress("Finished parsing geo files \n")
  gc()

  ############################## #
  # CLEAN UP THE geo DATA
  ############################## #
  progress("Started cleaning up geo files \n")

  # Cut trailing white space from GEOID (may be redundant if read.geo already trimmed it)
  geo$GEOID <- sub(" +$", "", geo$GEOID)

  # NAME is a padded field: trim the white space at its end only.
  # (Spaces inside the name are kept so that words are never glued together.)
  if ("NAME" %in% names(geo)) {
    geo$NAME <- sub(" +$", "", geo$NAME)
  }

  # Add leading zeroes and create FIPS (don't actually need all these fields once FIPS and KEY are created)
  # An alternative way to create fips would be to extract it from the geoid field:
  # geo$FIPS <- gsub("[[:alnum:]]*US", "", geo$GEOID)
  # Rows with no block group (BLKGRP is NA) get a FIPS without a block group digit.
  geo$BLKGRP[is.na(geo$BLKGRP)] <- ""
  geo$TRACT <- analyze.stuff::lead.zeroes(geo$TRACT, 6)
  geo$STATE <- analyze.stuff::lead.zeroes(geo$STATE, 2)
  geo$COUNTY <- analyze.stuff::lead.zeroes(geo$COUNTY, 3)
  geo$FIPS <- with(geo, paste0(STATE, COUNTY, TRACT, BLKGRP))

  # THE UNIQUE ID THAT WILL BE USED FOR JOINS IS A COMBO OF STATE AND LOGRECNO
  # NOTE LOWER CASE STUSAB USED TO CREATE KEY
  geo$KEY <- paste0(tolower(geo$STUSAB), geo$LOGRECNO)

  # These can be dropped once FIPS and KEY exist (saves memory).
  # drop = FALSE keeps a data.frame even if only one column were to remain.
  no.longer.needed <- c("LOGRECNO", "STATE", "COUNTY", "TRACT", "BLKGRP")
  keep <- names(geo)[!(names(geo) %in% no.longer.needed)]
  geo <- geo[, keep, drop = FALSE]
  progress("Finished cleaning up geo files \n")

  ############################## #
  # SAVE ON DISK so that a restarted or later run can reuse it instead of parsing again
  ############################## #
  progress("Started saving geo files on disk \n")
  save(geo, file = geo.path)
  progress("Finished saving geo.RData file on disk \n")

  geo
}
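# (Residue from the web page this file was copied from; kept as a comment so the file parses as R.)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.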