Summary: Assessing match quality middle initials in the CenSoc-Numident (V2) file.
## Library packages library(tidyverse) library(data.table) library(censocdev) library(cowplot) library(gt) library(ipumsr) library(here) library(janitor) library(broom)
## read in BUNMD bunmd <- fread("/censoc/data/censoc_v2/bunmd_v2.csv") bunmd %>% summarize(mean(mname == ""))
## read in numident ## special file with ssn numident <- fread("/censoc/data/censoc_v2.1/censoc_numident_v2.1.csv") %>% janitor::clean_names() ## numident numident <- numident %>% left_join(bunmd %>% select(ssn, mname), by = "ssn") ## read in 1940 census census_1940 <- fread("/ipums-repo2019/1940/TSV/P.tsv", select = c("NAMEFRST", "HISTID", "SERIALP", "AGE", "INCWAGE", "SEX", "EDUC", "RACE", "RELATE", "MARST", "SEI")) %>% janitor::clean_names() censoc_numident <- numident %>% left_join(census_1940, by = "histid")
get_second_word <- function(x) { x.split <- strsplit(x, split = " ") x1.list <- lapply(x.split,`[`, 2) # returns NA if no name x1 <- unlist(x1.list) return(x1) } clean_key <- function(key){ return(gsub(" +|(?!_)[[:punct:]]","", key, perl = T)) }
## Select First letter of middle name (census) censoc_numident$middle_name_census <- get_second_word(censoc_numident$namefrst) censoc_numident$middle_name_census <- clean_key(censoc_numident$middle_name_census) censoc_numident$middle_name_census <- substring(censoc_numident$middle_name_census, 1, 1) ## Select First letter of middle name (ss5) censoc_numident$middle_name_numident <- substring(censoc_numident$mname, first = 1, 1)
censoc_numident %>% count(middle_name_numident) %>% arrange(desc(n)) censoc_numident %>% count(middle_name_census) %>% arrange(desc(n)) censoc_numident <- censoc_numident %>% mutate(middle_name_numident_recode = case_when( str_detect(middle_name_numident, "[:alpha:]") ~ middle_name_numident, TRUE ~ NA_character_ ), middle_name_census_recode = case_when( str_detect(middle_name_census, "[:alpha:]") ~ middle_name_census, TRUE ~ NA_character_ )) censoc_numident %>% summarize(mean(!is.na(middle_name_numident_recode)), mean(!is.na(middle_name_census_recode)), mean(!is.na(middle_name_census_recode) & !is.na(middle_name_numident_recode)))
censoc_numident_middle <- censoc_numident %>% filter(sex.x == 1) %>% filter(!is.na(middle_name_census_recode) & !is.na(middle_name_numident_recode)) censoc_numident_middle <- censoc_numident_middle %>% mutate(agreement = case_when( middle_name_census_recode == middle_name_numident_recode ~ 1, TRUE ~ 0))
censoc_numident_middle %>% summarize(mean(agreement)) censoc_numident_middle %>% group_by(link_abe_exact_conservative) %>% summarize(mean(agreement))
match_rate <- censoc_numident_middle %>% filter(link_abe_exact_conservative == 1) %>% group_by(byear) %>% summarize(match_rate = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(match = "conservative") match_rate_standard <- censoc_numident_middle %>% group_by(byear) %>% summarize(match_rate = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(match = "standard") match_rate_standard_not_conservative <- censoc_numident_middle %>% filter(link_abe_exact_conservative == 0) %>% group_by(byear) %>% summarize(match_rate = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(match = "standard, not conservative") middle_initial_agreement_plot <- match_rate %>% bind_rows(match_rate_standard) %>% bind_rows(match_rate_standard_not_conservative) %>% filter(byear %in% c(1900:1939)) %>% ggplot(aes(x = byear, y = match_rate, ymin = match_rate - 1.96*se, ymax = match_rate + 1.96*se, color = match, shape = match)) + geom_pointrange() + geom_line() + theme_cowplot() + ggsci::scale_color_lancet() + labs(x = "Birth Cohort", y = "Middle Initial Agreement Rate", title = "Middle Initial Agreement, CenSoc-Numident (Men)") + theme(legend.position = "bottom", legend.title = element_blank()) + ylim(0.3, 1) ## save plot ggsave(middle_initial_agreement_plot, filename = here("vignettes/assess_match_quality/figs/numident_middle_initial_plot.pdf"), height = 5, width = 7)
censoc_numident_middle <- censoc_numident_middle %>% recode_education(educ_var = educ) educ_middle_conservative <- censoc_numident_middle %>% filter(link_abe_exact_conservative == 1) %>% group_by(educ_yrs) %>% summarize(agreement_mean = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(method = "conservative") educ_middle_standard_noconserv <- censoc_numident_middle %>% filter(link_abe_exact_conservative == 0) %>% group_by(educ_yrs) %>% summarize(agreement_mean = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(method = "standard, not conservative") educ_middle <- censoc_numident_middle %>% group_by(educ_yrs) %>% summarize(agreement_mean = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(method = "standard") middle_initial_plot <- educ_middle %>% bind_rows(educ_middle_conservative) %>% # bind_rows(educ_middle_standard_noconserv) %>% filter(!is.na(educ_yrs)) %>% ggplot(aes(x = educ_yrs, y = agreement_mean, ymin = agreement_mean - 1.96*se, ymax = agreement_mean + 1.96*se, color = method, shape = method)) + geom_pointrange() + geom_line() + theme_cowplot() + ggsci::scale_color_lancet() + labs(x = "Years of education", y = "Middle Initial Agreement Rate", title = "Middle Initial Agreement, CenSoc-Numident (Men)") + theme(legend.position = "bottom") + theme(legend.position = "bottom", legend.title = element_blank()) middle_initial_plot <- educ_middle %>% bind_rows(educ_middle_conservative) %>% # bind_rows(educ_middle_standard_noconserv) %>% filter(!is.na(educ_yrs)) %>% ggplot(aes(x = educ_yrs, y = agreement_mean, ymin = agreement_mean - 1.96*se, ymax = agreement_mean + 1.96*se, color = method)) + geom_pointrange() + geom_line() + theme_cowplot() + ggsci::scale_color_lancet() + labs(x = "Years of education", y = "Middle Initial Agreement Rate", title = "Middle Initial Agreement, CenSoc-Numident (Men)") + theme(legend.position = "bottom") + theme(legend.position = "bottom", legend.title = element_blank()) ## save plot ggsave(middle_initial_plot, filename = here("vignettes/assess_match_quality/figs/numident_education_middle_initial_plot.pdf"), height = 5, width = 7)
middle_initial_agreement_byear_flex <- censoc_numident_middle %>% filter(link_abe_exact_conservative == 1) %>% group_by(byeardiff_census_minus_numident) %>% summarize(middle = mean(agreement), se = sd(agreement)/n()) %>% ggplot(aes(x = byeardiff_census_minus_numident, y = middle, ymin = middle - 1.96*se, ymax = middle + 1.96*se, )) + geom_pointrange(size = 1) + theme_cowplot() + ylim(0, 1) + labs(x = "Birth Year Difference (Census - Numident)", y = "Middle Initial Agreemen") ## save plot ggsave(middle_initial_agreement_byear_flex, filename = here("vignettes/assess_match_quality/figs/numident_byearflex_middle_initial_plot.pdf"), height = 6, width = 8)
standard <- censoc_numident_middle %>% filter(byear %in% 1900:1920) conservative <- censoc_numident_middle %>% filter(link_abe_exact_conservative == 1) %>% filter(byear %in% 1900:1920) standard_not_conservative <- censoc_numident_middle %>% filter(link_abe_exact_conservative == 0) %>% filter(byear %in% 1900:1920) education_standard <- standard %>% group_by(agreement) %>% do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% unnest(fitHour) %>% mutate(method = "standard") education_conservative <- conservative %>% group_by(agreement) %>% do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% unnest(fitHour) %>% mutate(method = "conservative") education_standard_conservative <- standard_not_conservative %>% group_by(agreement) %>% do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% unnest(fitHour) %>% mutate(method = "standard, not conservative") education_pooled_standard <- tidy(lm(death_age ~ educ_yrs + as.factor(byear), standard)) %>% mutate(method = "standard") education_pooled_conservative <- tidy(lm(death_age ~ educ_yrs + as.factor(byear), conservative)) %>% mutate(method = "conservative") education_pooled_standard_noconservative <- tidy(lm(death_age ~ educ_yrs + as.factor(byear), standard_not_conservative)) %>% mutate(method = "standard, not conservative") association_between_educ_and_longevity <- education_standard %>% bind_rows(education_conservative) %>% bind_rows(education_standard_conservative) %>% bind_rows(education_pooled_standard) %>% bind_rows(education_pooled_conservative) %>% bind_rows(education_pooled_standard_noconservative) %>% mutate(agreement = as.factor(agreement)) %>% filter(term == "educ_yrs") %>% mutate(agreement = case_when( agreement == 1 ~ "Agree", agreement == 0 ~"Disagree", TRUE ~ "Pooled")) %>% ggplot(aes(x = method, y = estimate, ymin = estimate - 1.96 * std.error, ymax = estimate + 1.96*std.error, color = agreement, shape = agreement)) + geom_pointrange(position = position_dodge(0.1)) + ggsci::scale_color_lancet() + cowplot::theme_cowplot() + theme(legend.position = "bottom") + ylim(0, 0.15) + labs(x = "Match Method", y = "Coefficient", title = "Association between years of education and longevity (OLS)", subtitle = "CenSoc-Numident, Birth cohorts of 1900-1920 (Men Only)", col = "Middle Initial Agreement", shape = "Middle Initial Agreement") ## save plot ggsave(association_between_educ_and_longevity, filename = here("vignettes/assess_match_quality/figs/numident_association_between_educ_and_longevity.pdf"), height = 6, width = 8.5)
## different in birth year education_standard <- censoc_numident_middle %>% group_by(byeardiff_census_minus_numident) %>% do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% unnest(fitHour) %>% filter(term == "educ_yrs") %>% mutate(method = "standard") education_conservative <- censoc_numident_middle %>% filter(byear %in% 1900:1920) %>% filter(link_abe_exact_conservative == 1) %>% group_by(byeardiff_census_minus_numident) %>% do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% unnest(fitHour) %>% filter(term == "educ_yrs") %>% mutate(method = "conservative") education_gradient_birth_difference <- education_standard %>% bind_rows(education_conservative) %>% ggplot(aes(x = byeardiff_census_minus_numident, y = estimate, ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error, color = method)) + theme_cowplot() + geom_pointrange(position = position_dodge(0.15)) + ggsci::scale_color_lancet() + labs(x = "Census Birth Year - Numident Birth Year", y = "Regression Coefficient", color = "Match Method" ) + theme(legend.position = "bottom") ## save plot ggsave(education_gradient_birth_difference, filename = here("vignettes/assess_match_quality/figs/numident_education_gradient_birth_difference.pdf"), height = 5, width = 7)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.