Nothing
#' Gets Census race counts necessary for BISG.
#'
#' This function gets the columns necessary to run BISG in compliance with
#' the Census' Bureau's surname lists. It can extract columns for either the
#' decennial Census or the ACS.
#'
#' @param geography The geographic level at which to obtain Census data. If
#' obtaining data from the decennial Census, can be up to "block". If
#' obtaining data from the ACS, can only be up to "block group".
#' @param state The state from which to obtain Census data, as a string.
#' @param county The county(ies) from which to obtain Census data. If NULL,
#' data is obtained from all counties in the state.
#' @param year The year to obtain Census data from. If 2010, uses decennial
#' data. Otherwise, uses the 5-year ACS summary data.
#' @param cache A bool denoting whether the Census data should be cached.
#' @return A tibble containing the race counts per geography unit for white,
#' black, Hispanic/Latino, Asian, and other voters.
#'
#' @import dplyr
#' @importFrom rlang :=
#' @importFrom tidycensus get_decennial get_acs
#' @importFrom tigris list_counties
#' @export get_census_race_counts
get_census_race_counts <- function(
geography, state, county = NULL, year = 2010, cache = FALSE
) {
# Get FIPS county codes if they're not provided
if (is.null(county)) {
# Filter county codes from tidycensus by state
county <- tigris::list_counties(state)[["county_code"]]
}
# Decennial data
if (year == 2010) {
# Variable names for SF1 dataset
variables <- c("P005003",
"P005004",
"P005005",
"P005006",
"P005007",
"P005008",
"P005009",
"P005010")
# Use tidycensus to extract Census counts
counts <- suppressMessages(tidycensus::get_decennial(
geography = geography,
variables = variables,
year = year,
state = state,
county = county,
output = "wide",
cache_table = cache)) %>%
dplyr::mutate(
fips = .data$GEOID,
whi = .data$P005003,
bla = .data$P005004,
his = .data$P005010,
asi = .data$P005006 + .data$P005007,
oth = .data$P005005 + .data$P005008 + .data$P005009) %>%
dplyr::select(dplyr::all_of(c("fips", "whi", "bla", "his", "asi", "oth")))
} else {
# Variable names for ACS
variables <- c("B03002_003",
"B03002_004",
"B03002_005",
"B03002_006",
"B03002_007",
"B03002_008",
"B03002_009",
"B03002_010",
"B03002_011",
"B03002_012")
# Use tidycensus to extract counts for the ACS
counts <- suppressMessages(tidycensus::get_acs(
geography = geography,
variables = variables,
year = year,
output = "wide",
state = state,
county = county,
survey = "acs5",
cache_table = cache)) %>%
dplyr::mutate(
fips = .data$GEOID,
whi = .data$B03002_003E,
bla = .data$B03002_004E,
his = .data$B03002_012E,
asi = .data$B03002_006E + .data$B03002_007E,
oth = .data$B03002_008E + .data$B03002_009E +
.data$B03002_010E + .data$B03002_011E
) %>%
dplyr::select(
'fips', any_of("whi", "bla", "his", "asi", "oth")
)
}
return(counts)
}
#' Computes the probability a person is located in a specific geographic unit.
#'
#' This is a utility function for performing BISG. It operates on a dataframe
#' obtained from the Census Bureau via the provided eiCompare helper function.
#'
#' @param counts A tibble containing counts (divided amongst constituent groups)
#' per geographic units (rows).
#' @param cols The columns denoting the constituent groups within each
#' geographic units.
#' @return A vector containing the probability of selecting a person from each
#' geographic unit.
#'
#' @export compute_p_g
compute_p_g <- function(counts, cols = c("whi", "bla", "his", "asi", "oth")) {
total_by_geo <- rowSums(counts[, cols])
p_g <- total_by_geo / sum(total_by_geo)
return(p_g)
}
#' Computes the probability a person is of a specific racial group, conditioned
#' on geographic unit.
#'
#' This is a utility function for performing BISG. It operates on a dataframe
#' obtained from the Census Bureau via the provided eiCompare helper function.
#'
#' @param counts A tibble containing counts (divided amongst constituent groups)
#' per geographic units (rows).
#' @param cols The columns denoting the constituent groups within each
#' geographic units.
#' @return A tibble with dimensions equal to counts, with each entry denoting
#' the probability of selecting a person per racial group (columns) conditioned
#' on racial group (rows).
#'
#' @importFrom dplyr across mutate
#' @export compute_p_r_cond_g
compute_p_r_cond_g <- function(
counts, cols = c("whi", "bla", "his", "asi", "oth")
) {
p_r_g <- counts %>%
dplyr::mutate(
dplyr::across(.cols = cols, .fns = ~ . / rowSums(counts[, cols]))
) %>%
dplyr::mutate(dplyr::across(.cols = cols, .fns = ~ ifelse(is.na(.), 0, .)))
return(p_r_g)
}
#' Computes the probability a person is in a specific geographic unit,
#' conditioned on a racial group.
#'
#' This is a utility function for performing BISG. It operates on a dataframe
#' obtained from the Census Bureau via the provided eiCompare helper function.
#'
#' @param geo_counts A tibble containing counts (divided amongst constituent groups)
#' per geographic units (rows).
#' @param cols The columns denoting the constituent groups within each
#' geographic units.
#' @return A tibble with dimensions equal to counts, with each entry denoting
#' the probability of selecting a person per geographic unit (rows)
#' conditioned on racial group (columns).
#'
#' @importFrom dplyr across all_of mutate
#' @export compute_p_r_cond_g
compute_p_g_cond_r <- function(
geo_counts, cols = c("whi", "bla", "his", "asi", "oth")
) {
# Need P(G) and P(R|G) to invert for P(G|R) according to Bayes' Theorem
p_g <- compute_p_g(geo_counts, cols)
p_r_g <- compute_p_r_cond_g(geo_counts, cols)
# Apply Bayes' Theorem
p_g_r <- p_r_g %>%
dplyr::mutate(
dplyr::across(.cols = cols, .fns = ~ . * dplyr::all_of(p_g))
) %>%
dplyr::mutate(dplyr::across(.cols = cols, .fns = ~ . / sum(.)))
return(p_g_r)
}
#' Computes the probability a person is of a specific racial group, conditioned
#' on surname.
#'
#' This is a utility function for performing BISG. It operates on a voter file,
#' and utilizes WRU's function for calculating probability of race by surname.
#'
#' @param voter_file A tibble containing a list of voters (by row), and a
#' column that denotes their surname.
#' @param surname_col A string denoting which column contains the voter surname.
#' @return A tibble with rows denoting voters and columns denoting the
#' probability that each voter is of a particular racial group.
#' @param surname_counts A dataframe denoting the frequency with which surnames
#' correspond to different race/ethnicities. If NULL, the Census surname list is
#' used with categories and merging functions from wru. The dataframe should
#' contain one column with surnames (specified with the y surname_col_counts
#' parameter) and one column for each race/ethnicity group (specified with the
#' race_cols parameter).
#' @param surname_col_counts A string denoting the column in the surname_counts
#' tibble that refers to the geographic unit.
#' @param race_cols name of race columns in \code{surname_counts} data.frame.
#' @param impute_missing Boolean indicating weather to fill in missing names
#' with mean probability across all surnames.
#' @import dplyr
#' @import wru
#' @export compute_p_r_cond_s
compute_p_r_cond_s <- function(
voter_file,
surname_col,
surname_counts = NULL,
surname_col_counts = "surname",
race_cols = c("whi", "bla", "his", "asi", "oth"),
impute_missing = TRUE
) {
# If no surname table given, use wru default
if (is.null(surname_counts)) {
# Get merge_surnames function out from wru
merge_surnames_copy <- utils::getFromNamespace("merge_surnames", "wru")
p_r_s <- suppressWarnings(suppressMessages(
merge_surnames_copy(
voter.file = dplyr::rename(voter_file,
surname = dplyr::all_of(surname_col)),
surname.year = 2010,
clean.surname = TRUE,
impute.missing = TRUE
)
)) %>%
dplyr::rename(
!!surname_col := 'surname',
whi = 'p_whi',
bla = 'p_bla',
his = 'p_his',
asi = 'p_asi',
oth = 'p_oth'
) %>%
dplyr::select(
all_of(surname_col), all_of(race_cols)
)
} else {
surname_counts[,race_cols] <-
surname_counts[,race_cols] / rowSums(surname_counts[,race_cols])
surname_counts <- surname_counts[,c(surname_col_counts, race_cols)]
p_r_s <- dplyr::left_join(
x = voter_file,
y = surname_counts,
by = stats::setNames(surname_col_counts, surname_col)
)
# Fill in missing names with mean probability across all surnames
# (surnames weighted equally!)
if (impute_missing) {
p_r_s[which(is.na(p_r_s[[race_cols[1]]])), race_cols] <-
colMeans(surname_counts[,race_cols])
}
}
return(p_r_s)
}
#' Computes the probability a person is of a specific racial group, conditioned
#' on surname and geolocation.
#'
#' This is a utility function for performing BISG. It operates on a voter file,
#' and counts obtained from the Census Bureau via eiCompare's helper function.
#'
#' @param voter_file A tibble containing a list of voters (by row), and a
#' column that denotes their surname.
#' @param geo_counts A tibble containing counts (divided amongst constituent groups)
#' per geographic units (rows).
#' @param surname_col A string denoting which column contains the voter surname.
#' @param geo_col A string denoting which column contains the geographic unit
#' ID.
#' @param surname_counts A dataframe denoting the frequency with which surnames
#' correspond to different race/ethnicities. If NULL, the Census surname list is
#' used with categories and merging functions from wru. The dataframe should
#' contain one column with surnames (specified with the y surname_col_counts
#' parameter) and one column for each race/ethnicity group (specified with the
#' race_cols parameter).
#' @param race_cols A list of strings denoting the columns containing racial
#' groups.
#' @param geo_col_counts A string denoting the column in the geo_counts tibble
#' that refers to the geographic unit.
#' @param surname_col_counts A string denoting the column in the surname_counts
#' tibble that refers to the geographic unit.
#' @return A tibble with rows denoting voters and columns denoting the
#' probability that each voter is of a particular racial group.
#'
#' @import dplyr
#' @importFrom stats setNames
#' @importFrom stringr str_length str_sub
#' @export compute_p_r_cond_s_g
compute_p_r_cond_s_g <- function(
voter_file,
geo_counts,
surname_col,
geo_col,
surname_counts = NULL,
race_cols = c("whi", "bla", "his", "asi", "oth"),
geo_col_counts = "fips",
surname_col_counts = "surname"
) {
# Compute probability of race conditioned on surname
p_r_s <- compute_p_r_cond_s(
voter_file = voter_file,
surname_col = surname_col,
surname_counts = surname_counts,
surname_col_counts = surname_col_counts,
race_cols = race_cols
) %>%
rename_with(~ paste0("sur_", .), race_cols)
p_r_s_unique <- unique(p_r_s)
# Compute probability of geography conditioned on race, for all geographies
p_g_r_all <- compute_p_g_cond_r(geo_counts = geo_counts, cols = race_cols)
# Check if the IDs for voter file and the counts have the same length
fips_length <- unique(stringr::str_length(geo_counts[[geo_col_counts]]))
voter_file[[geo_col]] <- stringr::str_sub(
voter_file[[geo_col]],
end = fips_length)
# Merge p(G|R) into the voter file by FIPS code
p_g_r <- dplyr::left_join(
x = voter_file,
y = p_g_r_all,
by = stats::setNames(geo_col_counts, geo_col)
) %>%
rename_with(~ paste0("geo_", .), race_cols)
p_r_s_g_join <- left_join(p_g_r,
p_r_s_unique,
by = stats::setNames(surname_col, surname_col))
p_r_s_g_mult <-
(p_r_s_g_join[, paste0("geo_", race_cols)] * p_r_s_g_join[, paste0("sur_", race_cols)]) %>%
rename_with(~ gsub("geo_", "", .x)) %>%
dplyr::rowwise() %>%
dplyr::mutate(total = sum(dplyr::across(.cols = race_cols))) %>%
dplyr::ungroup() %>%
dplyr::mutate(dplyr::across(.cols = race_cols, .fns = ~ . / total)) %>%
dplyr::select(dplyr::all_of(race_cols))
# # Compute the final probability of race
# p_r_s_g <- (p_r_s[, race_cols] * p_g_r[, race_cols]) %>%
# dplyr::rowwise() %>%
# dplyr::mutate(total = sum(dplyr::across(.cols = race_cols))) %>%
# dplyr::ungroup() %>%
# dplyr::mutate(dplyr::across(.cols = race_cols, .fns = ~ . / total)) %>%
# dplyr::select(dplyr::all_of(race_cols))
p_r_s_g <- cbind(p_r_s_g_join[,c(surname_col, geo_col)], p_r_s_g_mult)
return(p_r_s_g)
}
#' @title Performs Bayesian Improved Surname Geocoding(BISG)
#' @description Performs BISG, i.e computes the probability a
#' person is of a specific racial group, conditioned on surname and geolocation.
#'
#' @param voter_file A tibble containing a list of voters (by row), and a
#' column that denotes their surname.
#' @param surname_col A string denoting which column contains the voter surname.
#' @param geo_col A string denoting which column contains the geographic unit
#' ID.
#' @param geo_counts A tibble containing counts (divided among constituent
#' groups) per geographic units (rows). If NULL, these counts will be obtained
#' using the eiCompare helper function and the other parameters.
#' @param surname_counts A dataframe denoting the frequency with which surnames
#' correspond to different race/ethnicities. If NULL, the Census surname list is
#' used with categories and merging functions from wru. The dataframe should
#' contain one column with surnames (specified with the y surname_col_counts
#' parameter) and one column for each race/ethnicity group (specified with the
#' race_cols parameter).
#' @param geography The geographic level at which to obtain Census data. If
#' obtaining data from the decennial Census, can be up to "block". If
#' obtaining data from the ACS, can only be up to "block group".
#' @param state The state from which to obtain Census data, as a string.
#' @param county The county(ies) from which to obtain Census data. If NULL,
#' data is obtained from all counties in the state.
#' @param year The year to obtain Census data from. If 2010, uses decennial
#' data. Otherwise, uses the 5-year ACS summary data.
#' @param geo_col_counts A string denoting the column in the counts tibble that
#' refers to the geographic unit.
#' @param surname_col_counts A string denoting the column in the surname_counts
#' tibble that refers to the geographic unit.
#' @param race_cols A list of strings denoting the columns containing racial
#' groups.
#' @param impute_missing A bool denoting whether voter file entries that do not
#' match at the geographic level should be imputed with either the surname
#' probabilities, or should be imputed with probabilities calculated at a
#' broader geographic unit.
#' @param verbose A boolean denoting the verbosity.
#' @param cache A boolean denoting whether Census data should be cached.
#' @return A tibble with rows denoting voters and columns denoting the
#' probability that each voter is of a particular racial group.
#' @importFrom dplyr bind_cols
#' @export bisg
#' @examples
#'
#' library(eiExpand)
#'
#' # Load WA example data from eiExpand #
#' data("wa_geocoded")
#' data("wa_block_data")
#'
#' # Use lat/long to merge with block shapes,
#' # Bring in block identifier #
#' voter_file_geo <- eiCompare::merge_voter_file_to_shape(
#' voter_file = wa_geocoded,
#' shape_file = wa_block_data,
#' coords = c("longitude","latitude"),
#' voter_id = "id") %>%
#' dplyr::as_tibble() %>%
#' dplyr::select("id", "county", "precinct", "surname", "GEOID20")
#'
#' # Run BISG #
#' bisg_vf <- bisg(
#' voter_file = voter_file_geo,
#' surname_col = "surname",
#' geo_col = "GEOID20",
#' geo_counts = wa_block_data %>% as.data.frame(),
#' geography = "block",
#' state = "WA",
#' geo_col_counts = "GEOID20",
#' impute_missing = FALSE
#' )
#'
#' # Preview individual race predictions #
#' dplyr::glimpse(bisg_vf)
#' #'
#'
#'
#'
#'
#'
bisg <- function(
voter_file,
surname_col,
geo_col,
geo_counts = NULL,
surname_counts = NULL,
geography = NULL,
state = NULL,
county = NULL,
year = NULL,
geo_col_counts = "fips",
surname_col_counts = "surname",
race_cols = c("whi", "bla", "his", "asi", "oth"),
impute_missing = TRUE,
verbose = FALSE,
cache = FALSE
) {
# If geo_counts aren't provided, obtain them via tidycensus
if (is.null(geo_counts)) {
if (verbose) {
message("Obtaining counts from Census Bureau using tidycensus.")
}
geo_counts <- get_census_race_counts(
geography = geography,
state = state,
county = county,
year = year,
cache = cache
)
}
# Merge voter file with WRU surname database
if (verbose) {
message("Merging surnames.")
}
# Compute the probability of race conditioned on geography and surname
if (verbose) {
message("Calculating BISG probabilities.")
}
p_r_s_g <- compute_p_r_cond_s_g(
voter_file = voter_file,
geo_counts = geo_counts,
surname_counts = surname_counts,
surname_col = surname_col,
geo_col = geo_col,
race_cols = race_cols,
geo_col_counts = geo_col_counts,
surname_col_counts = surname_col_counts
)
# If necessary, impute the records located in blocks recorded as having no
# population
if (impute_missing) {
no_geocode_match <- is.na(p_r_s_g[[race_cols[1]]])
# Imputing is only necessary if an entry didn't match
while (any(no_geocode_match)) {
if (verbose) {
message("Some voters didn't match to a geography.")
}
# Case 1: user did not provide a geography
if (is.null(geography)) {
if (verbose) {
message(paste("Geography not provided.",
"Imputing with surname probabilities."))
}
p_r_s <- compute_p_r_cond_s(
voter_file = voter_file,
surname_col = surname_col,
surname_counts = surname_counts,
surname_col_counts = surname_col_counts,
race_cols = race_cols
)
p_r_s_g[no_geocode_match, race_cols] <- p_r_s[no_geocode_match,
race_cols]
} else {
# Case 2: user provides geography; extract new counts
geography <- swap_census_geography(geography)
if (verbose) {
message(paste("Re-performing BISG at the", geography, "level."))
}
# Get Census counts at a broader spatial extent
geo_counts <- get_census_race_counts(
geography = geography,
state = state,
county = county,
year = year,
cache = cache
)
# Re-perform BISG
new_p_r_s_g <- compute_p_r_cond_s_g(
voter_file = voter_file[no_geocode_match, ],
geo_counts = geo_counts,
surname_col = surname_col,
geo_col = geo_col,
race_cols = race_cols,
geo_col_counts = geo_col_counts
)
p_r_s_g[no_geocode_match, ] <- new_p_r_s_g
}
no_geocode_match <- is.na(p_r_s_g$whi)
}
}
# Combine BISG probabilities back into voter file
p_r_s_g_unique <- unique(p_r_s_g)
voter_file <- left_join(
voter_file,
p_r_s_g_unique,
by = stats::setNames(c(surname_col, geo_col), c(surname_col, geo_col))
)
#voter_file <- dplyr::bind_cols(voter_file, p_r_s_g)
if (verbose) {
message("BISG complete.")
}
return(voter_file)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.