print_update <- function(geog, source, survey, year, race){
cat("Getting data: ", survey, " | ", str_pad(source, 8, side="right"), " | ", str_pad(geog, 9, side="right"), "| ", year, " | ", race, "\n")
}
count_suppressed <- function(df){
# create a new dataframe without Margins of error
df_estimates <- select(df, ends_with("E"))
## return a dataframe with a new column in which the counts of estimates (not MOEs) with NA values is returned
add_column(df, SUPPRESSED = apply(df_estimates, MARGIN = 1, function(x) sum(is.na(x))), .before = 0)
}
assemble_peer_acs <- function(table = NULL, variables = NULL, # Use either singular `table` or vector of `variables`
years = 2018, # Required. can be a vector of years.
peers = NULL, # Pass in Peer MSA table. Requires variables `GEOID` and `NAME_short`.
racial = FALSE, # = TRUE for iterating through racial tables or variables
race = NULL, # Only use to attempt to override the default value in output's race column
try_suppressed = TRUE, # = FALSE to not attempt replacing suppressed 1year data with 5year. (when 5 year doesn't exist)
avg_weight = NULL, # If null, counties will SUM for regional total. Specify a weight variable to trigger weighted average.
racecode = NULL, # Used internally only, for racial iterations when racial = TRUE
state_fips = "17", counties = c("031", "043", "089", "093", "097", "111", "197")) # Can be overridden for non-CMAP use
{
# PART 1: Check for valid inputs ---------------------------------------------
# determine table or variables
if (is.null(table) && is.null(variables)){
stop("Either a table or vector of variables must be supplied", call. = FALSE)
}
if (!is.null(table) && !is.null(variables)){
stop("Both a table and a vector of variables cannot be supplied. Provide one or the other", call. = FALSE)
}
# define mode, name descriptive data source for print-back, and select last character(s) for racial check
if (!is.null(table)){
mode <- "table"
source <- table
}else{
mode <- "variables"
if(length(variables)<6) {
source <- paste(variables, collapse = ", ")
} else{
source <- paste0(paste(variables[1:5], collapse = ", "),"...")
}
}
# if user did not pass peer MSA table, look for `PEERS` in outer scope. Accept it, error if needed, or use sample peer list as needed.
if(is.null(peers)){
if(exists("PEERS")){
if(is_tibble(PEERS)){
peers <- PEERS
message(" User did not pass variable `peers` but `PEERS` is defined in global environment. Using `PEERS`.")
}else{
stop("`user did not pass variable `peers`. Function attempted to use `PEERS` from global environment but it is not a tibble.", call. = FALSE)
}
}else{
peers <- tribble(
~GEOID, ~NAME_short,
16980, "Chicago (MSA)",
14460, "Boston",
35620, "New York",
47900, "Washington, D.C.")
message(" User did not pass variable `peers` so using Chicago, Boston, NYC, and DC by default.")
}
}
# confirm correct variables exist in peers tibble
if(!("GEOID" %in% names(peers)) | !("NAME_short" %in% names(peers))){
stop("`peers` must be a tibble containing variables `GEOID` and `NAME_short`.", call. = FALSE)
}
# PART 2: Create internal data -----------------------------------------------
# create empty datasets
data <- tibble()
suppressed_counties <- tibble()
races <- tribble(
~CODE, ~RACE,
"B", "Black or African-American",
"D", "Asian",
"H", "White (non-hispanic)",
"I", "Hispanic or Latino")
peers_geoids <- c(peers$GEOID)
# PART 3: In racial mode, iterate through function repeatedly -----------------
if(racial){
for(racecode in races$CODE){
# if in table mode, adjust table name for each race while not passing variables
if(mode == "table"){
table2 <- paste0(table,racecode)
variables2 <- NULL
# if in variable mode, adjust variable names for each race while not passing table
}else if(mode == "variables"){
variables2 <- sub(x=variables,pattern = "_", replacement = paste0(racecode, "_"))
#LOCS_TO_INSERT <- str_locate(variables,"_")[,1]
table2 <- NULL
# if mode is something else, error and stop.
}else stop("Error: unknown mode", call. = FALSE)
# run function for this race
race_specific_data <- assemble_peer_acs(table = table2, variables = variables2, racial = FALSE, # these attributes have been modified
race = race, years = years, racecode = racecode, try_suppressed = try_suppressed, # these have not been modified
avg_weight = avg_weight, state_fips = state_fips, counties = counties, peers = peers) # these have not been modified
# Remove race code from variable names so that join aligns
names(race_specific_data) <- gsub(x = names(race_specific_data), pattern = paste0(racecode, "_"), replacement = "_")
# add this data to the master set
data <- bind_rows(data, race_specific_data)
}
return(data)
}
# PART 4: best descriptive race variable ------------------------------
# if this is a racial iteration, look up correct race value
if(!is.null(racecode)){
if(!is.null(race)){
message(paste0(" User inputted `race`='",race,"' but race appears to be '", races$RACE[match(racecode, races$CODE)], "'. User input overridden."))
}
race = races$RACE[match(racecode, races$CODE)]
# if not a racial iteration, and in variable mode...
}else if(mode == "variables"){
# if user did not specify race, put in placeholder.
if(is.null(race)){
message(" It is not known whether the called variables are race-specific. `Race`='see var name'. Consider replacing.")
race = "see var name"
# if user did specify race, accept their value.
}else{
message(paste0(" Accepting user input of `race`=", race,"."))
}
# if not a racial iteration, and in table mode...
}else if(mode == "table"){
# identify last character in table name
last_char <- substr(table, nchar(table), nchar(table))
# check if last character of table is a race (must be A through I):
if (last_char %in% LETTERS[1:9]){
# if last character is a race known to this function, look up the correct race
if(last_char %in% races$CODE){
# but display an error if user also tried to input a race
if(!is.null(race)){
message(paste0(" User inputted `race`='",race,"' but race appears to be '", races$RACE[match(last_char, races$CODE)], "'. User input overridden."))
}
race = races$RACE[match(last_char, races$CODE)]
# if last character is a race unknown to this function and user did not pass race variable, put in placeholder.
}else if(is.null(race)){
message(paste0(" Table appears to have race code '", last_char, "' which is not known to this function. `Race`='see var name'. Consider replacing."))
race = "see var name"
# if last character is a race unknown to this function and user DID pass race variable, accept it.
}else{
message(paste0(" Table appears to have race code '", last_char, "' which is not known to this function. `Race`='",race,"'."))
}
# if last character of table is NOT a race:
}else{
# and user did not enter a race:
if(is.null(race)){
race = "All"
# if user entered a race, accept it.
}else{
message(paste0(" Table does not appear to be a racial table, but user inputted `race`=",race,". User input accepted."))
}
}
}else{stop("Error in function mode detection.", call. = FALSE)}
# PART 5: Look up 1 year ACS data --------------------------------------------
for(year in years){
# get CMAP7 county data
print_update("County", source, "acs1", year, race)
raw_7co <- suppressMessages(get_acs(geography="county", table=table, variables = variables, survey="acs1", year=year, output="wide",
state=state_fips, county=counties, cache_table=TRUE)) %>%
mutate(SURVEY = "acs1", GEO = "county")
# Shorten county names
raw_7co$NAME <- gsub(x = raw_7co$NAME, pattern = " County, Illinois", replacement = "")
# get peer MSA data
print_update("MSA", source, "acs1", year, race)
raw_msa <- suppressMessages(get_acs(geography="metropolitan statistical area/micropolitan statistical area",
table=table, variables = variables, survey="acs1", year=year, output="wide", cache_table=TRUE)) %>%
filter(GEOID %in% peers_geoids) %>%
merge(peers, by="GEOID") %>%
mutate(SURVEY = "acs1", GEO="MSA") %>%
select(-NAME) %>%
select(NAME=NAME_short, everything())
# get national data
print_update("National", source, "acs1", year, race)
raw_us <- suppressMessages(get_acs(geography="us", table=table, variables = variables, survey="acs1", year=year, output="wide", cache_table=TRUE) %>%
mutate(SURVEY = "acs1", GEO="nation"))
# combine data
raw <- bind_rows(raw_7co, raw_msa, raw_us) %>%
mutate(YEAR = year, RACE = race) %>%
select(YEAR, SURVEY, GEO, GEOID, RACE, everything())
# count suppressed data
raw <- count_suppressed(raw)
# Identify suppressed geographies
suppressed_geogs <- select(filter(raw, SUPPRESSED>0), GEOID, GEO, NAME)
# if there are suppressed geogs, report which
if(nrow(suppressed_geogs)>0){
message(paste(" Geogs with suppressed data:", paste(suppressed_geogs$NAME, collapse=", ")))
# add counties to master suppressed counties list
suppressed_counties <- suppressed_geogs %>%
filter(GEO == "county") %>%
select(GEOID, NAME) %>%
bind_rows(suppressed_counties) %>%
distinct()
}
# add data to primary dataset
data <- bind_rows(data, raw)
}
# PART 6: Replace suppressed with 5 year ACS data ----------------------------------
if(nrow(suppressed_counties)>0 && try_suppressed == TRUE){
message(paste(" Attempting to replace suppressed data:", paste0(suppressed_counties$NAME, collapse = ", "), "..."))
# make a tibble of three digit county FIPS codes:
suppressed_counties_list <- transmute(suppressed_counties, GEOID_SHORT = substr(GEOID, 3, 5))
# and convert it to a list
suppressed_counties_list <- suppressed_counties_list$GEOID_SHORT
# create empty dataset for ACS5 values
was_suppressed <- tibble()
for(year in years){
# print update
print_update("County", source, "acs5", year, race)
# get ACS5 data
raw_was_suppressed <- suppressMessages(get_acs(geography="county", table=table, variables = variables, survey="acs5", year=year, output="wide",
state=state_fips, county=suppressed_counties_list, cache_table=TRUE)) %>%
mutate(SURVEY = "acs5", GEO = "county", YEAR = year, RACE = race) %>%
select(YEAR, SURVEY, GEO, GEOID, RACE, everything())
# Shorten county names
raw_was_suppressed$NAME <- gsub(x = raw_was_suppressed$NAME, pattern = " County, Illinois", replacement = "")
# add it to master ACS5 table
was_suppressed <- bind_rows(was_suppressed, raw_was_suppressed)
}
# check for suppressed values in 5 year data
was_suppressed <- count_suppressed(was_suppressed)
# add 5 year data to main data
data <- bind_rows(data, was_suppressed)
}
# PART 7: Summarize multi-county region ----------------------------------
# add E for estimate onto end of weight variable specified by user
avg_weight2 <- paste0(avg_weight,"E")
# if this is a racial iteration, insert race code as well:
if (!is.null(racecode)){
avg_weight2 <- sub(x = avg_weight2, pattern = "_", replacement = paste0(racecode,"_"))
}
# calculate correct type of sum
if (is.null(avg_weight)){
summary_type <- "sum"
message(" Calculating regional total using sum.")
}else{
# check to see if variable specified exists in data
if(avg_weight2 %in% names(data)) {
summary_type <- "weighted.mean"
message(paste0(" Calculating regional total using mean weighted on ", avg_weight,"."))
}else{
message(" `avg_weight` not a column in data. Returning data without row for regional totals.")
return(data)
}
}
# create sub-list of the correct rows to total, preferring ACS5 rows where they exist
counties_to_total <- filter(data, GEO == "county") %>%
group_by(YEAR) %>%
arrange(desc(SURVEY), .by_group = TRUE) %>% # brings acs5 years to top
distinct(NAME, .keep_all = TRUE) # %>% # drops acs1 years where acs5 years exist
# create a string of surveys used for each summary
surveys_used <- ungroup(counties_to_total) %>%
distinct(SURVEY) %>%
summarise(SURVEY = paste(SURVEY, collapse=",")) %>%
as.character()
# identify how many counties are being used to generate regional total
# (should be all in region unless get_suppressed = FALSE and some data are suppressed)
counties_counted <- summarise(counties_to_total, count = sum(SUPPRESSED == 0)) %>%
select(count) %>%
min() %>%
as.character()
# calculate the totals based on the summary type selected
if(summary_type == "sum"){
# generate totals row using summarize at
totals_rows <- select(counties_to_total, -SUPPRESSED, -GEO, -GEOID, -NAME, -RACE) %>%
summarize_at(vars(ends_with("E")), sum, na.rm = TRUE) %>%
mutate(SURVEY = surveys_used, GEO = "region", NAME = "Chicago (CMAP7)", RACE = race, COUNT = counties_counted, GEOID = "0")
}else{
# generate weighted averages
totals_rows <- select(counties_to_total, -SUPPRESSED, -GEO, -GEOID, -NAME, -RACE) %>%
summarize_at(vars(ends_with("E")), funs(weighted.mean(., w=UQ(rlang::sym(avg_weight2)))), na.rm = TRUE) %>% # https://stackoverflow.com/questions/26003574/use-dynamic-variable-names-in-dplyr
mutate(SURVEY = surveys_used, GEO = "region", NAME = "Chicago (CMAP7)", RACE = race, COUNT = counties_counted, GEOID = "0") %>%
# the above code created a mean of the weight variable, weighted on itself. This is confusing and unnecessary. Drop it.
select(-!!avg_weight2)
}
# add the total row
data <- bind_rows(data, totals_rows)
# organize data
data <- select(data, YEAR, GEO, GEOID, NAME, RACE, SURVEY, COUNT, SUPPRESSED, everything()) %>%
arrange(YEAR, GEO)
# PART 8: Return -------------------------------------------------------
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.