Nothing
#' Surname probability merging function.
#'
#' \code{merge_surnames} merges surnames in user-input dataset with corresponding
#' race/ethnicity probabilities from U.S. Census Surname List and Spanish Surname List.
#'
#' This function allows users to match surnames in their dataset with the U.S.
#' Census Surname List (from 2000 or 2010) and Spanish Surname List to obtain
#' Pr(Race | Surname) for each of the five major racial groups.
#'
#' By default, the function matches surnames to the Census list as follows:
#' 1) Search raw surnames in Census surname list;
#' 2) Remove any punctuation and search again;
#' 3) Remove any spaces and search again;
#' 4) Remove suffixes (e.g., Jr) and search again;
#' 5) Split double-barreled surnames into two parts and search first part of name;
#' 6) Split double-barreled surnames into two parts and search second part of name;
#' 7) For any remaining names, impute probabilities using distribution
#' for all names not appearing on Census list.
#'
#' Each step only applies to surnames not matched in a previous ste.
#' Steps 2 through 7 are not applied if \code{clean.surname} is FALSE.
#'
#' Note: Any name appearing only on the Spanish Surname List is assigned a
#' probability of 1 for Hispanics/Latinos and 0 for all other racial groups.
#'
#' @param voter.file An object of class \code{data.frame}. Must contain a field
#' named 'surname' containing list of surnames to be merged with Census lists.
#' @param surname.year An object of class \code{numeric} indicating which year
#' Census Surname List is from. Accepted values are \code{2010} and \code{2000}.
#' Default is \code{2020}.
#' @param name.data An object of class \code{data.frame}. Must contain a leading
#' column of surnames, and 5 subsequent columns, with Pr(Race | Surname) for each
#' of the five major racial categories.
#' @param clean.surname A \code{TRUE}/\code{FALSE} object. If \code{TRUE},
#' any surnames in \code{\var{voter.file}} that cannot initially be matched
#' to surname lists will be cleaned, according to U.S. Census specifications,
#' in order to increase the chance of finding a match. Default is \code{TRUE}.
#' @param impute.missing A \code{TRUE}/\code{FALSE} object. If \code{TRUE},
#' race/ethnicity probabilities will be imputed for unmatched names using
#' race/ethnicity distribution for all other names (i.e., not on Census List).
#' Default is \code{TRUE}.
#' @return Output will be an object of class \code{data.frame}. It will
#' consist of the original user-input data with additional columns that
#' specify the part of the name matched with Census data (\code{\var{surname.match}}),
#' and the probabilities Pr(Race | Surname) for each racial group
#' (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black,
#' \code{\var{p_his}} for Hispanic/Latino,
#' \code{\var{p_asi}} for Asian and Pacific Islander, and
#' \code{\var{p_oth}} for Other/Mixed).
#'#'
#' @examples
#' data(voters)
#' \dontrun{try(merge_surnames(voters))}
#'
#' @keywords internal
merge_surnames <- function(voter.file, surname.year = 2020, name.data, clean.surname = TRUE, impute.missing = TRUE) {
if ("surname" %in% names(voter.file) == FALSE) {
stop('Data does not contain surname field.')
}
## Census Surname List
if (surname.year == 2000) {
surnames2000$surname <- as.character(surnames2000$surname)
surnames <- surnames2000
} else if (surname.year == 2010) {
surnames2010$surname <- as.character(surnames2010$surname)
surnames <- surnames2010
} else {
surnames <- name.data
colnames(surnames) <- colnames(surnames2010)
surnames$surname <- as.character(surnames$surname)
}
p_eth <- c("p_whi", "p_bla", "p_his", "p_asi", "p_oth")
## Convert Surnames in Voter File to Upper Case
df <- voter.file
df$surname.match <- df$surname.upper <- toupper(as.character(df$surname))
## Merge Surnames with Census List (No Cleaning Yet)
df <- merge(df[names(df) %in% p_eth == FALSE], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)
if (nrow(df[df$surname.upper %in% surnames$surname == F, ]) == 0) {
return(df[, c(names(voter.file), "surname.match", p_eth)])
}
df[df$surname.upper %in% surnames$surname == F, ]$surname.match <- ""
df1 <- df[df$surname.upper %in% surnames$surname, ] #Matched surnames
df2 <- df[df$surname.upper %in% surnames$surname == F, ] #Unmatched surnames
## Clean Surnames (if Specified by User)
if (clean.surname) {
## Remove All Punctuation and Try Merge Again
df2$surname.match <- gsub("[^[:alnum:] ]", "", df2$surname.upper)
df2 <- merge(df2[names(df2) %in% p_eth == FALSE], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)
if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {
df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, ])
df2 <- df2[df2$surname.match %in% surnames$surname == F, ]
if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""}
}
## Remove All Spaces and Try Merge Again
df2$surname.match <- gsub(" ", "", df2$surname.match)
df2 <- merge(df2[names(df2) %in% p_eth == FALSE], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)
if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {
df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, ])
df2 <- df2[df2$surname.match %in% surnames$surname == FALSE, ]
if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""}
}
## Remove Jr/Sr/III Suffixes
suffix <- c("JUNIOR", "SENIOR", "THIRD", "III", "JR", " II", " J R", " S R", " IV")
for (i in 1:length(suffix)) {
df2$surname.match <- ifelse(substr(df2$surname.match, nchar(df2$surname.match) - (nchar(suffix)[i] - 1), nchar(df2$surname.match)) == suffix[i],
substr(df2$surname.match, 1, nchar(df2$surname.match) - nchar(suffix)[i]),
df2$surname.match)
}
df2$surname.match <- ifelse(nchar(df2$surname.match) >= 7,
ifelse(substr(df2$surname.match, nchar(df2$surname.match) - 1, nchar(df2$surname.match)) == "SR",
substr(df2$surname.match, 1, nchar(df2$surname.match) - 2),
df2$surname.match),
df2$surname.match) #Remove "SR" only if name has at least 7 characters
df2 <- merge(df2[names(df2) %in% p_eth == FALSE], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)
if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {
df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, ])
df2 <- df2[df2$surname.match %in% surnames$surname == FALSE, ]
if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""}
}
## Names with Hyphens or Spaces, e.g. Double-Barreled Names
df2$surname2 <- df2$surname1 <- NA
df2$surname1[grep("-", df2$surname.upper)] <- sapply(strsplit(grep("-", df2$surname.upper, value = TRUE), "-"), "[", 1)
df2$surname2[grep("-", df2$surname.upper)] <- sapply(strsplit(grep("-", df2$surname.upper, value = TRUE), "-"), "[", 2)
df2$surname1[grep(" ", df2$surname.upper)] <- sapply(strsplit(grep(" ", df2$surname.upper, value = TRUE), " "), "[", 1)
df2$surname2[grep(" ", df2$surname.upper)] <- sapply(strsplit(grep(" ", df2$surname.upper, value = TRUE), " "), "[", 2)
## Use first half of name to merge in priors
df2$surname.match <- as.character(df2$surname1)
df2 <- merge(df2[names(df2) %in% c(p_eth) == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)[names(df2)]
if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {
df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, names(df2) %in% names(df1)])
df2 <- df2[df2$surname.match %in% surnames$surname == F, ]
if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""}
}
## Use second half of name to merge in priors for rest
df2$surname.match <- as.character(df2$surname2)
df2 <- merge(df2[names(df2) %in% c(p_eth, "surname1", "surname2") == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)[names(df2) %in% c("surname1", "surname2") == F]
if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {
df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, names(df2) %in% names(df1)])
df2 <- df2[df2$surname.match %in% surnames$surname == F, ]
if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""}
}
}
## Impute priors for names not on Census lists
if (impute.missing) {
if (nrow(df2) > 0) {
df2$surname.match <- ""
df2$p_whi <- .6665; df2$p_bla <- .0853; df2$p_his <- .1367; df2$p_asi <- .0797; df2$p_oth <- .0318
message(paste("Probabilities were imputed for", nrow(df2), ifelse(nrow(df2) == 1, "surname", "surnames"), "that could not be matched to Census list."))
}
} else message(paste(nrow(df2), ifelse(nrow(df2) == 1, "surname was", "surnames were"), "not matched."))
df <- rbind(df1, df2)
return(df[, c(names(voter.file), "surname.match", p_eth)])
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.