Summary: Assessing match quality for men in the CenSoc-Numident (V2) file.
## Library packages library(tidyverse) library(data.table) library(censocdev) library(cowplot) library(gt) library(ipumsr) library(here) library(janitor) library(broom)
dmf_full <- fread("/censoc/data/dmf/dmf.csv")
## read in numident ## special file with ssn dmf <- fread("/censoc/data/censoc_v2.1/censoc_dmf_v2.1.csv") %>% janitor::clean_names() ## dmf dmf <- dmf %>% left_join(dmf_full %>% select(ssn, mname), by = "ssn") ## read in 1940 census census_1940 <- fread("/ipums-repo2019/1940/TSV/P.tsv", select = c("NAMEFRST", "HISTID", "SERIALP", "AGE", "INCWAGE", "SEX", "EDUC", "RACE", "RELATE", "MARST", "SEI")) %>% janitor::clean_names() ## dmf censoc_dmf <- dmf %>% left_join(census_1940, by = "histid")
get_second_word <- function(x) { x.split <- strsplit(x, split = " ") x1.list <- lapply(x.split,`[`, 2) # returns NA if no name x1 <- unlist(x1.list) return(x1) } clean_key <- function(key){ return(gsub(" +|(?!_)[[:punct:]]","", key, perl = T)) }
## Select First letter of middle name (census) censoc_dmf$middle_name_census <- get_second_word(censoc_dmf$namefrst) censoc_dmf$middle_name_census <- clean_key(censoc_dmf$middle_name_census) censoc_dmf$middle_name_census <- substring(censoc_dmf$middle_name_census, 1, 1) ## Select First letter of middle name (ss5) censoc_dmf$middle_name_dmf <- substring(censoc_dmf$mname, first = 1, 1)
## recode dmf censoc_dmf <- censoc_dmf %>% mutate(middle_name_dmf_recode = case_when( str_detect(middle_name_dmf, "[:alpha:]") ~ middle_name_dmf, TRUE ~ NA_character_ ), middle_name_census_recode = case_when( str_detect(middle_name_census, "[:alpha:]") ~ middle_name_census, TRUE ~ NA_character_ )) ## recode middle name analysis censoc_dmf %>% summarize(mean(!is.na(middle_name_dmf_recode)), mean(!is.na(middle_name_census_recode)), mean(!is.na(middle_name_census_recode) & !is.na(middle_name_dmf_recode)))
censoc_dmf_middle <- censoc_dmf %>% filter(!is.na(middle_name_census_recode) & !is.na(middle_name_dmf_recode)) censoc_dmf_middle <- censoc_dmf_middle %>% mutate(agreement = case_when( middle_name_census_recode == middle_name_dmf_recode ~ 1, TRUE ~ 0))
## raw count for whole dataset censoc_dmf_middle %>% summarize(mean(agreement)) censoc_dmf_middle %>% group_by(link_abe_exact_conservative) %>% summarize(mean(agreement))
match_rate <- censoc_dmf_middle %>% filter(link_abe_exact_conservative == 1) %>% group_by(byear) %>% summarize(match_rate = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(match = "conservative") match_rate_standard <- censoc_dmf_middle %>% group_by(byear) %>% summarize(match_rate = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(match = "standard") match_rate_standard_not_conservative <- censoc_dmf_middle %>% filter(link_abe_exact_conservative == 0) %>% group_by(byear) %>% summarize(match_rate = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(match = "standard, not conservative") middle_initial_agreement_plot <- match_rate %>% bind_rows(match_rate_standard) %>% bind_rows(match_rate_standard_not_conservative) %>% filter(byear %in% c(1900:1939)) %>% ggplot(aes(x = byear, y = match_rate, ymin = match_rate - 1.96*se, ymax = match_rate + 1.96*se, color = match, shape = match)) + geom_pointrange() + geom_line() + theme_cowplot() + ggsci::scale_color_lancet() + labs(x = "Birth Cohort", y = "Middle Initial Agreement Rate", title = "Middle Initial Agreement, CenSoc-DMF") + theme(legend.position = "bottom", legend.title = element_blank()) + ylim(0.3, 1) ## save plot ggsave(middle_initial_agreement_plot, filename = here("vignettes/assess_match_quality/figs/dmf_middle_initial_plot.pdf"), height = 5, width = 7)
## dmf middle initial censoc_dmf_middle <- censoc_dmf_middle %>% recode_education(educ_var = educ) ## education middle initial educ_middle_conservative <- censoc_dmf_middle %>% filter(link_abe_exact_conservative == 1) %>% group_by(educ_yrs) %>% summarize(agreement_mean = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(method = "conservative") educ_middle_standard_noconserv <- censoc_dmf_middle %>% filter(link_abe_exact_conservative == 0) %>% group_by(educ_yrs) %>% summarize(agreement_mean = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(method = "standard, not conservative") educ_middle <- censoc_dmf_middle %>% group_by(educ_yrs) %>% summarize(agreement_mean = mean(agreement), se = sd(agreement)/sqrt(n())) %>% mutate(method = "standard") middle_initial_plot <- educ_middle %>% bind_rows(educ_middle_conservative) %>% # bind_rows(educ_middle_standard_noconserv) %>% filter(!is.na(educ_yrs)) %>% ggplot(aes(x = educ_yrs, y = agreement_mean, ymin = agreement_mean - 1.96*se, ymax = agreement_mean + 1.96*se, color = method)) + geom_pointrange() + geom_line() + theme_cowplot() + ggsci::scale_color_lancet() + labs(x = "Years of education", y = "Middle Initial Agreement Rate", title = "Middle Initial Agreement, CenSoc-dmf (Men)") + theme(legend.position = "bottom") + theme(legend.position = "bottom", legend.title = element_blank())
middle_initial_agreement <- censoc_dmf_middle %>% filter(link_abe_exact_conservative == 1) %>% group_by(byeardiff_census_minus_dmf) %>% summarize(middle = mean(agreement), se = sd(agreement)/n()) %>% ggplot(aes(x = byeardiff_census_minus_dmf, y = middle, ymin = middle - 1.96*se, ymax = middle + 1.96*se, )) + geom_pointrange() + theme_cowplot() + ylim(0, 1) + labs(x = "Birth Year Difference (Census - DMF)", y = "Middle Initial Agreemen") censoc_dmf_middle %>% filter(link_abe_exact_conservative == 1) %>% group_by(byeardiff_census_minus_dmf) %>% tally()
## different in birth year education_standard <- censoc_dmf_middle %>% group_by(byeardiff_census_minus_dmf) %>% do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% unnest(fitHour) %>% filter(term == "educ_yrs") %>% mutate(method = "standard") education_conservative <- censoc_dmf_middle %>% filter(byear %in% 1900:1920) %>% filter(link_abe_exact_conservative == 1) %>% group_by(byeardiff_census_minus_dmf) %>% do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% unnest(fitHour) %>% filter(term == "educ_yrs") %>% mutate(method = "conservative") education_gradient_birth_difference <- education_standard %>% bind_rows(education_conservative) %>% ggplot(aes(x = byeardiff_census_minus_dmf, y = estimate, ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error, color = method)) + theme_cowplot() + geom_pointrange(position = position_dodge(0.15)) + ggsci::scale_color_lancet() + labs(x = "Census Birth Year - DMF Birth Year", y = "Regression Coefficient", color = "Match Method" ) + theme(legend.position = "bottom") ## save plot ggsave(education_gradient_birth_difference, filename = here("vignettes/assess_match_quality/figs/dmf_education_gradient_birth_difference.pdf"), height = 5, width = 7)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.