##################################################################################################
#
# Program name: build_data
# Description: This program builds the U.S. geographic datasets for the package.
#
###################################################################################################
# ---- Preliminaries ------------------------------------------------------------------------------
library(tidyverse)
library(geosphere)
devtools::load_all()
# ---- Census regions -----------------------------------------------------------------------------
# Load data
census_df = readr::read_csv(
file = "./data-raw/census_codes.csv"
)
# Save data
usethis::use_data(census_df, overwrite = TRUE)
# ---- US states ----------------------------------------------------------------------------------
# Load data
state_df = readr::read_csv(
file = "./data-raw/state_list.csv"
)
# Wrangle the data
state_df = state_df %>%
dplyr::select(state, state_code) %>%
dplyr::filter(
!(state_code %in% c("GU", "PR", "VI"))
)
# Save data
usethis::use_data(state_df, overwrite = TRUE)
# ---- Adjacent counties --------------------------------------------------------------------------
# Load data
adjacent_county_df = readr::read_tsv(
file = "./data-raw/county_adjacency.txt",
col_names = c("county", "fips_code", "neighbor", "neighbor_fips_code")
)
temp_df = readr::read_csv(
file = "./data-raw/pop_center_csv.csv"
) %>%
mutate(fips_code = statefp*1000 + county_num)
# Initial wrangling of data
adjacent_county_df = adjacent_county_df %>%
# Fill NAs
mutate(
county = zoo::na.locf(county),
fips_code = zoo::na.locf(fips_code)
) %>%
# Correct one-off encoding cases in US (ignore PR)
dplyr::mutate(
county = ifelse(fips_code == '35013', "Dona Ana County, NM", county),
neighbor = ifelse(neighbor_fips_code == '35013', "Dona Ana County, NM", neighbor)
)
#%>%
# Create fips numeric - string distinctions
#dplyr::mutate(
# fips_string = fips_code,
# fips_code = as.numeric(fips_code),
# neighbor_fips_string = neighbor_fips_code,
# neighbor_fips_code = as.numeric(neighbor_fips_code)
#)
# Split state and county names and merge back onto file
# County splits
temp_county = as.data.frame(
matrix(
unlist(
strsplit(adjacent_county_df$county, ",")
),
ncol = 2,
byrow = TRUE
),
stringsAsFactors = FALSE
) %>%
dplyr::rename(county_name = V1, county_state = V2) %>%
dplyr::mutate(county_name = trimws(county_name), county_state = trimws(county_state))
# Neighbor splits
temp_neighbor = as.data.frame(
matrix(
unlist(
strsplit(adjacent_county_df$neighbor, ",")
),
ncol = 2,
byrow = TRUE
),
stringsAsFactors = FALSE
) %>%
dplyr::rename(neighbor_name = V1, neighbor_state = V2) %>%
dplyr::mutate(neighbor_name = trimws(neighbor_name), neighbor_state = trimws(neighbor_state))
# Merge back onto main data
adjacent_county_df = adjacent_county_df %>%
dplyr::bind_cols(
temp_county,
temp_neighbor
)
# Final wrangling of data
adjacent_county_df = adjacent_county_df %>%
# Remove non-U.S. states
dplyr::filter(
!(county_state %in% c("AS", "GU", "MP", "PR", "VI", "MH", "FM", "PW"))
) %>%
# Remove non-continental U.S. (Hawaiian island counties have no adjacent counties)
dplyr::filter(
!(county_state %in% c("AK", "HI"))
) %>%
# Merge on county coordinates
dplyr::mutate(
temp_fips_code = as.numeric(fips_code)
) %>%
dplyr::left_join(
y = temp_df %>%
select(fips_code, population, lat, long),
by = c("temp_fips_code" = "fips_code")
) %>%
dplyr::select(-one_of("temp_fips_code")) %>%
# Merge on neighbor county coordinates
dplyr::mutate(
temp_neighbor_fips_code = as.numeric(neighbor_fips_code)
) %>%
dplyr::left_join(
y = temp_df %>%
select(fips_code, population, lat, long) %>%
rename(
neighbor_fips_code = fips_code,
neighbor_population = population,
neighbor_lat = lat,
neighbor_long = long
),
by = c("temp_neighbor_fips_code" = "neighbor_fips_code")
) %>%
dplyr::select(-one_of("temp_neighbor_fips_code")) %>%
# Reorder columns
dplyr::select(
county_name, county_state, fips_code,
population, lat, long,
neighbor_name, neighbor_state, neighbor_fips_code,
neighbor_population, neighbor_lat, neighbor_long
) %>%
# Remove county matches to themselves
dplyr::filter(
fips_code != neighbor_fips_code
)
# Save data
usethis::use_data(adjacent_county_df, overwrite = TRUE)
# Remove temporary files
rm(temp_county, temp_neighbor, temp_df)
# ---- Counties -----------------------------------------------------------------------------------
# Wrangle adjacent county data and merge on coordinates
county_df = adjacent_county_df %>%
dplyr::distinct(
county_name, county_state, fips_code,
population, lat, long
)
# Save data
usethis::use_data(county_df, overwrite = TRUE)
# Remove temporary files
rm(temp_df)
# ---- ZIP codes ----------------------------------------------------------------------------------
# Load data
zip_df = readr::read_csv(
file = "./data-raw/zip_county_crosswalk_2018.csv"
)
temp_df = readr::read_csv(
file = "./data-raw/zip_code_location_and_population_csv.csv"
)
# Wrangle ZIP data
zip_df = zip_df %>%
# Keep single ZIP-county observation (ZIP may map to > 1 counties)
dplyr::arrange(zip, dplyr::desc(tot_ratio)) %>%
dplyr::distinct(zip, .keep_all = TRUE) %>%
# Cast as numerics
dplyr::mutate(
fips_code = county,
zip_code = zip,
) %>%
dplyr::select(
zip_code, fips_code
) %>%
# Merge on state information and restrict to U.S. ZIP codes
dplyr::inner_join(
y = county_df %>%
dplyr::select(fips_code, county_state) %>%
dplyr::rename(state = county_state),
by = c("fips_code" = "fips_code")
)
# Append on ZCTA (subset of ZIP) data
zip_df = zip_df %>%
dplyr::left_join(
y = temp_df %>%
dplyr::select(
-dplyr::one_of(c("zip_numeric", "land_meters", "water_meters"))
),
by = c("zip_code" = "zip_char")
)
# Save data
usethis::use_data(zip_df, overwrite = TRUE)
# Remove temporary files
rm(temp_df)
# ---- State border coordinates -------------------------------------------------------------------
# Load the data
border_coord_df = readr::read_csv(
file = "./data-raw/state_borders.csv"
) %>%
dplyr::mutate(st1st2 = gsub("IL-IA", "IA-IL", st1st2))
# Wrangle the data
border_coord_df = border_coord_df %>%
dplyr::select(-dplyr::one_of(c("st1_fips", "st2_fips"))) %>%
# Correct longitude
dplyr::mutate(
st1st2 = gsub(" ", "", st1st2),
long = -1*long
) %>%
# Un-pack first and second state
dplyr::mutate(
state_one = sapply(st1st2, FUN = function(x) { unlist(strsplit(x, "-",))[1] }),
state_two = sapply(st1st2, FUN = function(x) { unlist(strsplit(x, "-",))[2] })
) %>%
dplyr::arrange(
bordindx, milemark
)
# Merge on next value for constructing partition
border_coord_df = border_coord_df %>%
dplyr::group_by(bordindx) %>%
dplyr::mutate(
next_lat = dplyr::lead(lat),
next_long = dplyr::lead(long),
next_milemark = dplyr::lead(milemark)
) %>%
dplyr::ungroup() %>%
dplyr::mutate(diff_mmilemark = next_milemark - milemark)
# Refine border partitions
for(i in c(1:nrow(border_coord_df))) {
temp_df = border_coord_df[i, ]
if(is.na(temp_df$next_milemark)) {
between_points = temp_df %>%
data.frame() %>%
dplyr::select(lat, long, bordindx, st1st2)
} else {
dist = floor(temp_df$next_milemark - temp_df$milemark)
alphas = c(0:dist)/dist
between_points = data.frame(
lat = (1-alphas)*temp_df$lat + alphas*temp_df$next_lat,
long = (1-alphas)*temp_df$long + alphas*temp_df$next_long,
stringsAsFactors = FALSE
) %>%
dplyr::mutate(
bordindx = temp_df$bordindx,
st1st2 = temp_df$st1st2
)
}
if(i == 1) {
temp_df_storage = between_points
} else {
temp_df_storage = dplyr::bind_rows(
temp_df_storage,
between_points
)
}
}
# Re-name file and get uniques
border_coord_df = temp_df_storage %>%
dplyr::distinct(lat, long, bordindx, st1st2)
# Save the file
usethis::use_data(border_coord_df, overwrite = TRUE)
# Remove temporary files
rm(temp_df_storage, between_points, temp_df)
# ---- Cross-border county pairs ------------------------------------------------------------------
# Wrangle the data
cbcp_df = adjacent_county_df %>%
dplyr::filter(county_state != neighbor_state) %>%
dplyr::mutate(
cbcp_id = ifelse(
fips_code <= neighbor_fips_code,
paste0(fips_code, "_", neighbor_fips_code),
paste0(neighbor_fips_code, "_", fips_code)
),
state_border_id = ifelse(
county_state <= neighbor_state,
paste0(county_state, "-", neighbor_state),
paste0(neighbor_state, "-", county_state)
)
)
# Calculate distances between counties
cbcp_df = cbcp_df %>%
dplyr::mutate(
dist_bt_centers = county_dist(fips_code, neighbor_fips_code)
)
# Reduce variables
cbcp_df = cbcp_df %>%
dplyr::select(
fips_code, county_state,
cbcp_id, state_border_id,
neighbor_fips_code, neighbor_state,
dist_bt_centers
)
# Save the file
usethis::use_data(cbcp_df, overwrite = TRUE)
# ---- State-border strip county assignments -----------------------------------------------------
# Wrangle the data
sbscp_df = adjacent_county_df %>%
dplyr::filter(county_state != neighbor_state) %>%
dplyr::mutate(
state_border_id = ifelse(
county_state <= neighbor_state,
paste0(county_state, "-", neighbor_state),
paste0(neighbor_state, "-", county_state)
)
) %>%
dplyr::select(fips_code, county_state, state_border_id)
# Calculate closest distances to borders
sbscp_df = sbscp_df %>%
dplyr::mutate(
dist_to_border = county_to_state_border(
fips_code = fips_code
),
border_dist_ref = county_to_state_border(
fips_code = fips_code,
return_state_border_id = TRUE
)
)
# Calculate number of mathces for each fips_code (some will = 0; closest to non-adjacent border!)
sbscp_df = sbscp_df %>%
dplyr::group_by(fips_code) %>%
dplyr::mutate(
num_obs = dplyr::n(),
num_matches = sum(as.numeric(border_dist_ref == state_border_id))
) %>%
dplyr::ungroup()
# Keep closest observation or alphabetically first observation if zero matches
sbscp_df = sbscp_df %>%
dplyr::arrange(fips_code, state_border_id) %>%
dplyr::distinct(fips_code, .keep_all = TRUE) %>%
dplyr::mutate(
final_id = ifelse(
num_matches >= 1,
border_dist_ref,
state_border_id
)
)
# Limit variable set
sbscp_df = sbscp_df %>%
dplyr::select(fips_code, county_state, final_id, dist_to_border) %>%
dplyr::rename(state_border_id = final_id)
# Add on information for state border strip (final analysis should restrict based on this)
sbscp_df = sbscp_df %>%
dplyr::group_by(state_border_id) %>%
dplyr::mutate(
num_counties_in_strip = dplyr::n(),
num_states_in_strip = dplyr::n_distinct(county_state)
) %>%
dplyr::ungroup()
# Save the file
usethis::use_data(sbscp_df, overwrite = TRUE)
# ---- Couplet and relaxed couplet county assignemnts ---------------------------------------------
# Wrangle the data
cpcp_df = adjacent_county_df %>%
dplyr::filter(county_state != neighbor_state) %>%
dplyr::mutate(
fips_pair = ifelse(
fips_code <= neighbor_fips_code,
paste0(fips_code, "-", neighbor_fips_code),
paste0(neighbor_fips_code, "-", fips_code)
),
fips_obs_1 = ifelse(
fips_code <= neighbor_fips_code,
fips_code,
neighbor_fips_code
),
fips_obs_2 = ifelse(
fips_code <= neighbor_fips_code,
neighbor_fips_code,
fips_code
)
) %>%
dplyr::select(
fips_code, neighbor_fips_code, county_state, neighbor_state,
fips_pair, fips_obs_1, fips_obs_2
)
# Append number of times each county appears in data
cpcp_df = cpcp_df %>%
dplyr::group_by(fips_obs_1) %>%
dplyr::mutate(total_fips_obs_1 = dplyr::n()) %>%
dplyr::ungroup() %>%
dplyr::group_by(fips_obs_2) %>%
dplyr::mutate(total_fips_obs_2 = dplyr::n()) %>%
dplyr::ungroup() %>%
dplyr::mutate(fips_pair_obs = total_fips_obs_1 + total_fips_obs_2) %>%
dplyr::arrange(fips_pair_obs, fips_pair, fips_code)
# Construct couplets - algorithm begins with county pair with least number of matches
temp_df = cpcp_df
couplet_vector = NULL
for(x in unique(temp_df$fips_pair)) {
inner_temp_df = temp_df %>%
dplyr::filter(fips_pair == x)
if(nrow(inner_temp_df) >= 2) {
couplet_vector = c(couplet_vector, x)
temp_df = temp_df %>%
dplyr::filter(!(fips_code %in% unique(inner_temp_df$fips_code)))
}
}
# Assign couplet pair identifier to matched counties
cpcp_df = cpcp_df %>%
dplyr::mutate(
cpcp_id = ifelse(
fips_pair %in% couplet_vector,
fips_pair,
NA
)
)
# Assign couplet pair identifier of neighboring to unmatched counties
cpcp_df = cpcp_df %>%
dplyr::left_join(
y = cpcp_df %>%
dplyr::distinct(fips_code, .keep_all = TRUE) %>%
dplyr::select(fips_code, cpcp_id) %>%
dplyr::rename(neighbor_cpcp_id = cpcp_id),
by = c("neighbor_fips_code" = "fips_code")
) %>%
dplyr::mutate(
relaxed_cpcp_id = ifelse(
!is.na(cpcp_id),
cpcp_id,
neighbor_cpcp_id
),
sort_flag = ifelse(
!is.na(cpcp_id),
0,
ifelse(
!is.na(relaxed_cpcp_id),
1,
2
)
)
) %>%
dplyr::arrange(
fips_code, sort_flag, relaxed_cpcp_id,
) %>%
dplyr::distinct(
fips_code, .keep_all = TRUE
) %>%
dplyr::select(
fips_code, county_state, cpcp_id, relaxed_cpcp_id
)
# Add on metadata for pairings
cpcp_df = cpcp_df %>%
dplyr::mutate(
cpcp_remove_flag = as.numeric(is.na(cpcp_id)),
relaxed_cpcp_remove_flag = as.numeric(is.na(relaxed_cpcp_id))
)
# Save the file
usethis::use_data(cpcp_df, overwrite = TRUE)
# Remove temporary files
rm(temp_df, couplet_vector, inner_temp_df)
# ---- Max neighbor county assignments -----------------------------------------------------------
# Wrangle the data
mxcp_df = adjacent_county_df %>%
dplyr::filter(county_state != neighbor_state)
# Count number of times neighbor is adjacent to another county
temp_df = mxcp_df %>%
dplyr::group_by(neighbor_fips_code) %>%
summarise(num_neighbor = dplyr::n())
# Merge on matches for counties and neighbors
mxcp_df = mxcp_df %>%
dplyr::left_join(
y = temp_df %>%
dplyr::rename(num_county = num_neighbor),
by = c("fips_code" = "neighbor_fips_code")
) %>%
dplyr::left_join(
y = temp_df,
by = c("neighbor_fips_code" = "neighbor_fips_code")
) %>%
dplyr::select(
fips_code, county_state, neighbor_fips_code,
num_county, num_neighbor
)
# Assign cluster for each adjacent pair based on observation with more adjacent counties
mxcp_df = mxcp_df %>%
dplyr::mutate(
mxcp_id = ifelse(
num_county >= num_neighbor,
fips_code,
neighbor_fips_code
),
mxcp_counts = ifelse(
num_county >= num_neighbor,
num_county,
num_neighbor
)
)
# Count number of distinct counties in each cluster
mxcp_df = mxcp_df %>%
dplyr::group_by(mxcp_id) %>%
dplyr::mutate(distinct_in_cluster = dplyr::n_distinct(fips_code)) %>%
dplyr::ungroup()
# First-pass matches: most distinct observations in cluster
temp_df = mxcp_df %>%
dplyr::arrange(fips_code, dplyr::desc(distinct_in_cluster)) %>%
dplyr::distinct(fips_code, .keep_all = TRUE) %>%
dplyr::group_by(mxcp_id) %>%
dplyr::mutate(num_in_cluster = dplyr::n()) %>%
dplyr::ungroup() %>%
data.frame()
# First-pass matches: set singleton clusters to NA
temp_df = temp_df %>%
dplyr::mutate(
mxcp_id = ifelse(
num_in_cluster < 2,
NA,
mxcp_id
)
)
# Second-pass matches: merge on first-pass and neighbor assignments
mxcp_df = mxcp_df %>%
dplyr::inner_join(
y = temp_df %>%
dplyr::select(fips_code, mxcp_id) %>%
dplyr::rename(county_mxcp_id = mxcp_id),
by = c("fips_code" = "fips_code")
) %>%
dplyr::inner_join(
y = temp_df %>%
dplyr::select(fips_code, mxcp_id, num_in_cluster) %>%
dplyr::rename(
neighbor_mxcp_id = mxcp_id,
neighbor_num_in_cluster = num_in_cluster
),
by = c("neighbor_fips_code" = "fips_code")
)
# Second-pass matches: assign to neighbor's cluster, sorted by largest
mxcp_df = mxcp_df %>%
dplyr::mutate(
relaxed_mxcp_id = ifelse(
!is.na(county_mxcp_id),
county_mxcp_id,
neighbor_mxcp_id
),
relaxed_mxcp_id = ifelse(
is.na(relaxed_mxcp_id),
ifelse(
fips_code <= neighbor_fips_code,
fips_code,
neighbor_fips_code
),
relaxed_mxcp_id
),
sort_flag = ifelse(
!is.na(relaxed_mxcp_id),
0,
1
)
) %>%
dplyr::arrange(
fips_code, sort_flag,
dplyr::desc(neighbor_num_in_cluster), relaxed_mxcp_id
) %>%
dplyr::distinct(
fips_code, .keep_all = TRUE
)
# Count observations in each cluster
mxcp_df = mxcp_df %>%
dplyr::select(
fips_code, county_state,
county_mxcp_id, relaxed_mxcp_id
) %>%
dplyr::group_by(relaxed_mxcp_id) %>%
dplyr::mutate(num_relaxed = dplyr::n()) %>%
dplyr::ungroup() %>%
dplyr::mutate(
relaxed_mxcp_id = ifelse(
num_relaxed < 2,
NA,
relaxed_mxcp_id
)
) %>%
dplyr::rename(mxcp_id = county_mxcp_id) %>%
dplyr::mutate(
mxcp_remove_flag = as.numeric(is.na(mxcp_id)),
relaxed_mxcp_remove_flag = as.numeric(is.na(relaxed_mxcp_id))
)
# Save the file
usethis::use_data(mxcp_df, overwrite = TRUE)
# Remove temporary files
rm(temp_df)
# ---- County border distance strips --------------------------------------------------------------
# Get county distances to border and closest border segment
bscp_df = adjacent_county_df %>%
dplyr::filter(county_state != neighbor_state) %>%
dplyr::distinct(fips_code, .keep_all = TRUE) %>%
dplyr::mutate(
dist_to_border = county_to_state_border(fips_code)
) %>%
dplyr::select(fips_code, county_state, dist_to_border)
# Merge on closest border segment
bscp_df = bscp_df %>%
dplyr::left_join(
y = county_x_mile_match(
fips_code = bscp_df$fips_code,
segment_length = 50
) %>%
dplyr::select(fips_code, min_dist, segment_id) %>%
dplyr::rename(dist_to_segment = min_dist),
by = c("fips_code" = "fips_code")
) %>%
dplyr::rename(bscp_id = segment_id)
# Save the file
usethis::use_data(bscp_df, overwrite = TRUE)
# ---- Cross-border counties ----------------------------------------------------------------------
# Restrict to border counties and merge on identifiers from methods
cbcounty_df = adjacent_county_df %>%
dplyr::filter(county_state != neighbor_state) %>%
dplyr::distinct(fips_code, .keep_all = TRUE) %>%
dplyr::select(fips_code, county_state) %>%
dplyr::left_join(
y = sbscp_df %>%
dplyr::select(
fips_code,
state_border_id,
dist_to_border
),
by = c("fips_code" = "fips_code")
) %>%
dplyr::left_join(
y = bscp_df %>%
dplyr::select(
fips_code,
bscp_id,
dist_to_segment,
) %>%
dplyr::rename(
segment_id = bscp_id
),
by = c("fips_code" = "fips_code")
) %>%
dplyr::left_join(
y = cpcp_df %>%
dplyr::select(
fips_code,
cpcp_id,
relaxed_cpcp_id
),
by = c("fips_code" = "fips_code")
) %>%
dplyr::left_join(
y = mxcp_df %>%
dplyr::select(
fips_code,
mxcp_id,
relaxed_mxcp_id
),
by = c("fips_code" = "fips_code")
)
# Save the file
usethis::use_data(cbcounty_df, overwrite = TRUE)
# ---- Cross-border ZIP codes ---------------------------------------------------------------------
# Distance from each ZIP code to x mile long border strips
temp_df1 = x_mile_match(zip_df$zip_code, segment_length = 20)
# Distance from each ZIP code to one mile long border strips
temp_df2 = x_mile_match(zip_df$zip_code, segment_length = 1)
# Append on distance to closest border, closest border, and closest 20 mile strip
cbzip_df = zip_df %>%
dplyr::select(zip_code, fips_code, state) %>%
dplyr::left_join(
y = temp_df2 %>%
dplyr::select(-one_of("segment_id")) %>%
dplyr::rename(
dist_to_border = min_dist,
nearest_border = st1st2,
nearest_index = bordindx
),
by = c("zip_code" = "zip_code")
) %>%
dplyr::left_join(
y = temp_df1 %>%
dplyr::select(zip_code, min_dist, segment_id) %>%
dplyr::rename(
dist_to_segment_id = min_dist,
cb_segment_id = segment_id
),
by = c("zip_code" = "zip_code")
)
# Append on cross-border county matches
cbzip_df = cbzip_df %>%
dplyr::left_join(
y = sbscp_df %>%
dplyr::select(fips_code, state_border_id),
by = c("fips_code" = "fips_code")
) %>%
dplyr::left_join(
y = cpcp_df %>%
dplyr::select(fips_code, cpcp_id, relaxed_cpcp_id),
by = c("fips_code" = "fips_code")
) %>%
dplyr::left_join(
y = mxcp_df %>%
dplyr::select(fips_code, mxcp_id , relaxed_mxcp_id),
by = c("fips_code" = "fips_code")
) %>%
dplyr::left_join(
y = adjacent_county_df %>%
dplyr::filter(county_state != neighbor_state) %>%
dplyr::distinct(fips_code, .keep_all = TRUE) %>%
dplyr::mutate(border_county_flag = 1) %>%
dplyr::select(fips_code, border_county_flag),
by = c("fips_code" = "fips_code")
) %>%
dplyr::mutate(
border_county_flag = as.numeric(!is.na(border_county_flag))
)
# Save the file
usethis::use_data(cbzip_df, overwrite = TRUE)
# Remove temporary files
rm(temp1_df, temp2_df)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.