#' long voter-vote to voter-based data
#'
#' This drops offices where multiple votes are cast for the same contest_code. This
#' is necessary to make a voter uniquely identify a vote for a contest (column).
#' Some offices like congress and state leg are separated out into district-name pairs,
#' with a specific column for a district and the vote in that corresponding district.
#' Absentees are also dropped because they do not have precint information.
#'
#' @param df A standardized long dataset with contest_code and contest_name. Should
#' have the variables \code{elec}, \code{county}, \code{contest_code}, \code{voter_id},,
#' \code{precinct}, \code{precinct_id}.
#' @param contests A vector of contest_codes to select and put into the wide.
#'
#' @return A dataset at the individual voter level. For some offices with a district,
#' these are collapsed to a pair-district pairing. If a voter abstained the
#' _vote variable is \code{NA} (indeed it was not in the long dataset), but
#' we try to impute the district by the voter's precinct and ballot style.
#'
#' @importFrom data.table as.data.table dcast melt
#' @import dtplyr
#'
#' @export
cast_to_wide <- function(df = raw,
contests = c("PTY", "PRS", "USH", "USS", "GOV", "LGV", "SOS", "ATG", "SSI",
"HOU", "SEN", "JPR", "SHF", "COR", "CLR", "AUD",
"CTR", "CCL", "CCD", "CCA", "CCC", "SCH", "WAT", "Q", "LRCA")) {
# dt
df <- lazy_dt(df)
# slow
office_votes <- df %>%
count(elec, county, contest_code, voter_id) %>%
group_by(elec, county, contest_code) %>%
summarize(max_votes = max(n)) %>%
ungroup() %>%
arrange(-max_votes)
# votes for code, by precinct. since every record in raw is a vote, length captures the number of votes
# office with one vote
df_one <- semi_join(df, filter(office_votes, max_votes == 1),
by = c("elec", "county", "contest_code"))
# Non-absentee precinct
p_absentee <- df %>%
filter(str_detect(precinct, regex("(Absentee|Failsafe)", ignore_case = TRUE))) %>%
distinct(elec, precinct_id)
# select offices and non-absentee
select_offices <- filter(df_one, contest_type %in% contests) %>%
anti_join(p_absentee, by = c("elec", "precinct_id"))
# separate columns for district
dist_offices <- c("HOU", "SEN", "USH", "SOL", "CCD")
# dist num in one office, candidates in another. For SMD
dt_fmt <- select_offices %>%
filter(contest_type %in% dist_offices) %>%
mutate(dist = as.numeric(parse_number(contest_code))) %>% # DISTRICT
rename(vote = choice_name)
dist_wide <- dt_fmt %>%
as.data.table() %>%
dcast(elec + county + precinct_id + precinct + ballot_style + voter_id ~ contest_type,
value.var = c("vote", "dist")) %>%
lazy_dt() %>%
rename_at(vars(matches("vote_")), function(x) str_c(str_remove(x, "vote_"), "_vote")) %>%
rename_at(vars(matches("dist_")), function(x) str_c(str_remove(x, "dist_"), "_dist"))
# impute dist when the vote (therefore the indicator of district) is missing
dists_long <- dist_wide %>%
as.data.table() %>%
melt(id.vars = c("elec", "precinct_id", "ballot_style"),
measure.vars = patterns("_dist$"),
na.rm = TRUE,
value.name = "dist",
variable.name = "office") %>% # ignore abstentions
lazy_dt() %>%
distinct()
if (nrow(as.data.table(distinct(dists_long, elec, precinct_id, ballot_style, office))) !=
nrow(as.data.table(dists_long))) {
n_temp <- nrow(as.data.table(dists_long))
dists_long <- distinct(dists_long, elec, precinct_id, ballot_style, office, .keep_all = TRUE)
warning(glue("multiple districts in the same office. Force dropping {n_temp - nrow(dists_long)} district-candidates!"))
}
# check if each precinct - district number is unique
district_table <- dists_long %>%
as.data.table() %>%
dcast(elec + precinct_id + ballot_style ~ office, value.var = "dist") %>%
lazy_dt()
# join and coalesce
dist_wide_add <- left_join(dist_wide, district_table,
by = c("elec", "precinct_id", "ballot_style"),
suffix = c(".a", ".b"))
# a lot of code just to coalesce any districts in any office that had a missing
# due to abstension
newdists <- dist_wide_add %>%
as_tibble() %>%
pivot_longer(c(matches("\\.(a|b)")),
names_pattern = "([A-z]+_dist)\\.(a|b)",
names_to = c("type", ".value"),
values_to = "type",
values_drop_na = TRUE) %>%
mutate(dist = coalesce(a, b)) %>%
pivot_wider(id_cols = c(elec, voter_id), names_from = type, values_from = dist)
# merge in new districts (and drop old ones)
dist_wide_imp <- dist_wide_add %>%
as_tibble() %>%
select(-matches("\\.(a|b)")) %>%
left_join(newdists, by = c("elec", "voter_id"))
# other offices (use the 7-digit code on its own)
df_wide_oth <- select_offices %>%
filter(!contest_type %in% dist_offices) %>%
as_tibble() %>%
pivot_wider(id_cols = c(elec, voter_id),
names_from = contest_code,
values_from = choice_name)
as_tibble(left_join(dist_wide_imp, df_wide_oth, by = c("elec", "voter_id")))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.