library("tidyverse")
library("lubridate")
library("censusapi")
library("usmap")
data("countypop")
# requires an API key in .Renviron (or maybe not, if making fewer than 500 queries per day)
# listCensusMetadata("pep/int_population", type = "variables", vintage = 2000)
# listCensusMetadata("pep/int_population", type = "geography", vintage = 2000)
# listCensusMetadata("pep/population", type = "variables", vintage = 2019)
## things to fix
## 46113 Shannon County, SD becomes 46102 Oglala Lakota County in 2015
## 51515 Bedford city merges back into 51019 Bedford County in 2013
# need to have an API key in an environment variable for this to work.
# getCensus("pep/int_population", vintage = 2000,
#           vars = c("GEONAME", "DATE_", "DATE_DESC", "POP"),
#           region = "county") %>%
#   saveRDS(file = here::here("data-raw/pre2010-census-raw.rds"))
pre2010 <- readRDS(file = here::here("data-raw/pre2010-census-raw.rds")) %>%
  rename(DATE_CODE = DATE_,
         county_fips = county,
         state_fips = state) %>%
  filter(DATE_CODE %in% as.character(2:11)) %>% # remove the weirdos (the April 1 census/base rows; keep the July 1, 2000-2009 estimates)
  mutate(fips = paste0(state_fips, county_fips),
         DATE = mdy(str_extract(DATE_DESC, "^\\d{1,2}\\/\\d{1,2}\\/\\d{4}"))) %>%
  left_join(select(countypop, fips, abbr, county), by = "fips")
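# Optional sanity check (a sketch, not part of the original script): counties that
# failed to match usmap::countypop are left with NA abbr/county here -- the
# renamed/merged fips codes noted above are the expected offenders.
# pre2010 %>%
#   filter(is.na(abbr)) %>%
#   distinct(fips, GEONAME)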
# getCensus("pep/population", vintage = 2019,
# vars = c("DATE_CODE","DATE_DESC", "POP"),
# region = "county") %>%
# saveRDS(file = here::here("data-raw/post2010-census-raw.rds"))
post2010 <- readRDS(file = here::here("data-raw/post2010-census-raw.rds")) %>%
  rename(county_fips = county,
         state_fips = state) %>%
  filter(DATE_CODE %in% as.character(3:12)) %>% # remove the weirdos (the two April 1, 2010 census/base rows; keep the July 1, 2010-2019 estimates)
  mutate(fips = paste0(state_fips, county_fips),
         DATE = mdy(str_extract(DATE_DESC, "^\\d{1,2}\\/\\d{1,2}\\/\\d{4}"))) %>%
  left_join(select(countypop, fips, abbr, county), by = "fips")
county_popn <- bind_rows(pre2010, post2010) %>%
  mutate(year = year(DATE),
         # treat the renamed/merged counties as a single unit across the whole period
         fips = if_else(fips %in% c("46102", "46113"), "46102/46113", fips),
         fips = if_else(fips %in% c("51019", "51515"), "51019/51515", fips),
         abbr = if_else(fips %in% c("46102/46113"), "SD", abbr),
         county = if_else(fips %in% c("46102/46113"), "Oglala Lakota", county)) %>%
  group_by(fips, year) %>%
  summarize(state_fips = first(state_fips),
            state_abbr = first(abbr),
            county = first(county),
            pop = sum(as.numeric(POP))) %>%
  ungroup() %>%
  filter(!(state_fips %in% c("02", "15", "72"))) # remove AK, HI, and PR
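# Optional check (a sketch, not part of the original script): with the DATE_CODE
# filters above, every remaining fips should contribute the same number of years
# (one row per year, 2000-2019), so this tabulation should collapse to a single row.
# county_popn %>%
#   count(fips) %>%
#   count(n, name = "n_fips")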
## Code for checking wnvdata census.data -- why is it smaller?
# library(wnvdata)
# test <- county_popn %>%
#   mutate(year = lubridate::year(DATE),
#          fips = as.character(as.numeric(fips))) %>%
#   anti_join(census.data, by = c("year", "fips"))
# test1 <- county_popn %>%
#   mutate(year = lubridate::year(DATE)) %>%
#   filter(year == 2010,
#          !(state_fips %in% c("02","72","15")))
# View(anti_join(test1, neurownv2000_2018, by = "fips"))
# View(anti_join(neurownv2000_2018, test1, by = "fips"))
# pull in forecast template from CDC (also provided in data-raw)
target <- readr::read_csv(here::here("data-raw/wnv_forecasting_template_(4-20-2020).csv"))
# need to be able to match against the "location" field, and ensure that a fips code
# is present for matching against other data
# location should be statename-countyname (e.g. "Colorado-Larimer")
# Below are the data manipulations used to finish processing the Census data
# Note that cdc.raw refers to data provided by the CDC as part of the forecasting challenge.
# These data are not provided here as they are sensitive and are not authorized to be shared.
# Work on getting CENSUS data merged
#county_popn$state = sapply(county_popn$GEONAME, splitter, ', ', 2, 1)
# Add a column with state for each abbreviation
#state.lookup.file = sprintf('%s/../data-raw/stateCodes.csv', drew.path)
# stateCodes file I have has the wrong fips code for states
# state.lookup = readr::read_csv(here::here("data-raw/stateCodes.csv"),
#                                col_names = c("state_fips", "state2", "state_abbr")) %>%
#   mutate(state_fips = sprintf("%02d", state_fips))
# State-level records from the Census all-geocodes file
state.lookup <- readxl::read_excel(here::here("data-raw/all-geocodes-v2017.xlsx"),
                                   sheet = "nationalpluspr_17",
                                   range = "A6:G43915",
                                   col_names = c("summary_level", "state_fips", "county_fips",
                                                 "county_subdivision", "place_code",
                                                 "consolidated_city_code", "name")) %>%
  filter(summary_level == "040") %>% # "040" = state-level summary records
  rename(state = name)
# Add state names to county_popn
county_popn <- county_popn %>%
  left_join(select(state.lookup, state_fips, state), by = "state_fips")
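# Optional check (a sketch, not part of the original script): state_fips codes
# missing from the geocodes lookup would leave NA in the new state column.
# county_popn %>%
#   filter(is.na(state)) %>%
#   distinct(state_fips)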
# old code - the statesLookup file I have has incorrect fips codes
# Update states from those with no GEONAME, but that have an abbreviation
# county_popn.b = merge(county_popn, state.lookup, by = 'state_fips', all.x = TRUE)
# county_popn.b$missing = is.na(county_popn.b$state)
# index.b = grep(TRUE, county_popn.b$missing)
# county_popn.b$state[index.b] = as.character(county_popn.b$state2[index.b])
# Update 'state' for District of Columbia records
# already fixed
# dc.index = grep("District of", county_popn.b$county)
# county_popn.b$state[dc.index] = "District of Columbia"
# Update records for Shannon County, South Dakota (missing county field); Use new county name
# shannon.co = grep("Shannon County, South Dakota", county_popn.b$GEONAME)
# county_popn.b$county[shannon.co] = "Oglala Lakota"
# Drop AK, HI, and Puerto Rico and NA
#county_popn2 = county_popn.b[county_popn.b$state %in% as.character(cdc.states), ] # Down to the 49, this worked. # up to 62160 rows. More than in the CDC data set. Good.
# Drop the word "COUNTY" from counties
county_popn$county2 = gsub(" County", "", county_popn$county)
# Drop the word "Parish" from counties
county_popn$county2 = gsub(" Parish", "", county_popn$county2)
# Correct "city" to "City" in county_popn (independent cities use lowercase "city" in the Census names)
county_popn$county2 = gsub("city", "City", county_popn$county2)
# Create location join field
county_popn$location = sprintf("%s-%s", county_popn$state, county_popn$county2)
# Change LaSalle to La Salle to match CDC data
county_popn$location = gsub("Illinois-LaSalle", "Illinois-La Salle", county_popn$location)
county_popn$location = gsub("Louisiana-LaSalle", "Louisiana-La Salle", county_popn$location)
# Change De Kalb, La Porte, Lagrange to match CDC data (only for Indiana, the others are DeKalb in the cdc data)
county_popn$location = gsub("Indiana-DeKalb", "Indiana-De Kalb", county_popn$location)
county_popn$location = gsub("Indiana-LaPorte", "Indiana-La Porte", county_popn$location)
county_popn$location = gsub("Indiana-LaGrange", "Indiana-Lagrange", county_popn$location)
# Correct others to match CDC
county_popn$location = gsub("New Mexico-DoÐa Ana", "New Mexico-Dona Ana", county_popn$location) #**# Watch for problems with encoding for special character
county_popn$location = gsub("New Mexico-De Baca", "New Mexico-DeBaca", county_popn$location)
# check against CDC prediction target file
# target %>%
#   filter(type == "Point") %>%
#   anti_join(county_popn, by = "location") %>%
#   View()
# Add Location-year field
#county_popn$year = sapply(as.character(county_popn2$DATE), splitter, "-", 1, 0)
county_popn$location_year = sprintf("%s:%s", county_popn$location, county_popn$year)
# # this will only work on drew's computer, private package that protects data
# data(neurownv2000_2019, package = "unlwnvepi")
# cdc.raw <- neurownv2000_2019
# # Add join field to cdc data
# #cdc.raw object is not added in this script because it is proprietary, the reference is here to document what was done with it.
# # it was used to confirm that the census data could be merged with the forecast challenge data
# cdc.raw$location_year = sprintf("%s:%s", cdc.raw$location, cdc.raw$year)
# cdc.raw %>%
#   anti_join(county_popn, by = "location_year") %>%
#   View() # no rows, check!
#cdc.raw has 59052 rows - mine has 62160 maybe because includes 2019
#county_popn2 has 62160 rows now that we merge on GEONAME and abbr
#unique(test2$county.y) # 49 entries.
# Test merge with CDC data & Check that everything merged correctly
# test = merge(county_popn2, cdc.raw, by = 'location_year', all.y = TRUE)
# max(is.na(test$POP)) # At least some have NA
# test2 = test[is.na(test$POP), ] # Down to 29 # Down to 180, now. #Formerly 940 failed joins
# Reformat census data to essential fields
census.data <- county_popn %>%
  select(district_year = location_year, location, year, POP = pop, fips)
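# Optional final check (a sketch, not part of the original script): district_year is
# the join key used against the CDC data, so it should identify rows uniquely.
# any(duplicated(census.data$district_year))  # TRUE would flag duplicated join keys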
# census.data = cbind(county_popn2$location_year, county_popn2$location, county_popn2$year, county_popn2$POP)
# colnames(census.data) = c("district_year", "location", "year", "POP")
# census.data = as.data.frame(census.data)
# census.data$year = as.numeric(as.character(census.data$year))
# census.data$POP = as.numeric(as.character(census.data$POP))
#
# # merge in fips to facilitate joins
# census.data.2 = base::merge(census.data, fips.lookup, by = "location")
# census.data = census.data.2 # convert back to census.data name after troubleshooting any problems
usethis::use_data(census.data, overwrite = TRUE) # Must be in the wnv_data directory