Summary: Assessing match quality for men in the CenSoc-Numident (V2) file.

## Library packages 
library(tidyverse)
library(data.table)
library(censocdev)
library(cowplot)
library(gt)
library(ipumsr)
library(here)
library(janitor)
library(broom)
dmf_full <- fread("/censoc/data/dmf/dmf.csv")
## read in numident 
## special file with ssn 
dmf <- fread("/censoc/data/censoc_v2.1/censoc_dmf_v2.1.csv") %>% 
  janitor::clean_names()

## dmf  
dmf <- dmf %>% 
  left_join(dmf_full %>% select(ssn, mname), by = "ssn")

## read in 1940 census 
census_1940 <- fread("/ipums-repo2019/1940/TSV/P.tsv", select = c("NAMEFRST", "HISTID", "SERIALP", "AGE", "INCWAGE", "SEX", "EDUC", "RACE", "RELATE", "MARST", "SEI")) %>% 
  janitor::clean_names()

## dmf 
censoc_dmf <- dmf %>% 
  left_join(census_1940, by = "histid")
get_second_word <- function(x)
{
  x.split <- strsplit(x, split = " ")
  x1.list <- lapply(x.split,`[`, 2) # returns NA if no name
  x1 <- unlist(x1.list)
  return(x1)
}

clean_key <- function(key){
  return(gsub(" +|(?!_)[[:punct:]]","", key, perl = T))
}
## Select First letter of middle name (census)
censoc_dmf$middle_name_census <- get_second_word(censoc_dmf$namefrst)
censoc_dmf$middle_name_census <- clean_key(censoc_dmf$middle_name_census)
censoc_dmf$middle_name_census <- substring(censoc_dmf$middle_name_census, 1, 1)

## Select First letter of middle name (ss5)
censoc_dmf$middle_name_dmf <- substring(censoc_dmf$mname, first = 1, 1)
## recode dmf 
censoc_dmf <- censoc_dmf %>% 
  mutate(middle_name_dmf_recode = case_when(
    str_detect(middle_name_dmf, "[:alpha:]") ~ middle_name_dmf,
    TRUE ~ NA_character_
  ),
  middle_name_census_recode = case_when(
    str_detect(middle_name_census, "[:alpha:]") ~ middle_name_census,
    TRUE ~ NA_character_
  ))

## recode middle name analysis 
censoc_dmf %>% 
  summarize(mean(!is.na(middle_name_dmf_recode)),
            mean(!is.na(middle_name_census_recode)),
            mean(!is.na(middle_name_census_recode) & !is.na(middle_name_dmf_recode)))
censoc_dmf_middle <- censoc_dmf %>% 
  filter(!is.na(middle_name_census_recode) & !is.na(middle_name_dmf_recode))

censoc_dmf_middle <- censoc_dmf_middle %>% 
  mutate(agreement = case_when(
    middle_name_census_recode == middle_name_dmf_recode ~ 1,
    TRUE ~ 0))
## raw count for whole dataset
censoc_dmf_middle %>% 
  summarize(mean(agreement))

censoc_dmf_middle %>% 
  group_by(link_abe_exact_conservative) %>% 
  summarize(mean(agreement))
match_rate <- censoc_dmf_middle %>% 
  filter(link_abe_exact_conservative == 1) %>% 
  group_by(byear) %>% 
  summarize(match_rate = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
  mutate(match = "conservative")

match_rate_standard <- censoc_dmf_middle %>% 
  group_by(byear) %>% 
  summarize(match_rate = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
  mutate(match = "standard")

match_rate_standard_not_conservative <- censoc_dmf_middle %>% 
  filter(link_abe_exact_conservative == 0) %>% 
  group_by(byear) %>% 
  summarize(match_rate = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
  mutate(match = "standard, not conservative")

middle_initial_agreement_plot <- match_rate %>% 
  bind_rows(match_rate_standard) %>% 
  bind_rows(match_rate_standard_not_conservative) %>% 
  filter(byear %in% c(1900:1939)) %>% 
  ggplot(aes(x = byear, y = match_rate, ymin = match_rate - 1.96*se, ymax = match_rate + 1.96*se,  color = match, shape = match)) + 
  geom_pointrange() + 
  geom_line() + 
  theme_cowplot() + 
  ggsci::scale_color_lancet() + 
  labs(x = "Birth Cohort",
       y = "Middle Initial Agreement Rate",
       title = "Middle Initial Agreement, CenSoc-DMF") + 
  theme(legend.position = "bottom", legend.title = element_blank()) + 
  ylim(0.3, 1)


## save plot 
ggsave(middle_initial_agreement_plot, filename = here("vignettes/assess_match_quality/figs/dmf_middle_initial_plot.pdf"), height = 5, width = 7)
## dmf middle initial 
censoc_dmf_middle <- censoc_dmf_middle %>% 
  recode_education(educ_var = educ)

## education middle initial 
educ_middle_conservative <- censoc_dmf_middle %>% 
  filter(link_abe_exact_conservative == 1) %>% 
  group_by(educ_yrs) %>% 
  summarize(agreement_mean = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
  mutate(method = "conservative")

educ_middle_standard_noconserv <- censoc_dmf_middle %>% 
  filter(link_abe_exact_conservative == 0) %>% 
  group_by(educ_yrs) %>% 
  summarize(agreement_mean = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
  mutate(method = "standard, not conservative")

educ_middle <- censoc_dmf_middle %>% 
  group_by(educ_yrs) %>% 
  summarize(agreement_mean = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
    mutate(method = "standard")

middle_initial_plot <- educ_middle %>% 
  bind_rows(educ_middle_conservative) %>% 
 #  bind_rows(educ_middle_standard_noconserv) %>% 
  filter(!is.na(educ_yrs)) %>% 
ggplot(aes(x = educ_yrs,
           y = agreement_mean,
           ymin = agreement_mean - 1.96*se,
           ymax = agreement_mean + 1.96*se,
           color = method)) + 
  geom_pointrange() + 
  geom_line() + 
  theme_cowplot() + 
  ggsci::scale_color_lancet() + 
  labs(x = "Years of education",
       y = "Middle Initial Agreement Rate",
       title = "Middle Initial Agreement, CenSoc-dmf (Men)") + 
  theme(legend.position = "bottom") + 
  theme(legend.position = "bottom", legend.title = element_blank())
 middle_initial_agreement <- censoc_dmf_middle %>% 
    filter(link_abe_exact_conservative == 1) %>% 
  group_by(byeardiff_census_minus_dmf) %>% 
  summarize(middle = mean(agreement),
            se = sd(agreement)/n()) %>% 
  ggplot(aes(x = byeardiff_census_minus_dmf, 
             y = middle,
             ymin = middle - 1.96*se,
             ymax = middle + 1.96*se,
             )) + 
  geom_pointrange() + 
  theme_cowplot() + 
  ylim(0, 1) + 
  labs(x = "Birth Year Difference (Census - DMF)",
       y = "Middle Initial Agreemen")

 censoc_dmf_middle %>% 
    filter(link_abe_exact_conservative == 1) %>% 
  group_by(byeardiff_census_minus_dmf) %>% 
   tally()
## different in birth year 
education_standard <- censoc_dmf_middle %>% 
  group_by(byeardiff_census_minus_dmf) %>%
  do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% 
  unnest(fitHour) %>% 
  filter(term == "educ_yrs") %>% 
  mutate(method = "standard")

education_conservative <- censoc_dmf_middle %>% 
    filter(byear %in% 1900:1920) %>% 
    filter(link_abe_exact_conservative == 1) %>% 
  group_by(byeardiff_census_minus_dmf) %>%
  do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% 
  unnest(fitHour) %>% 
  filter(term == "educ_yrs") %>% 
  mutate(method = "conservative")

education_gradient_birth_difference <- education_standard %>%  
  bind_rows(education_conservative) %>% 
  ggplot(aes(x = byeardiff_census_minus_dmf, 
             y = estimate,
             ymin = estimate - 1.96*std.error,
             ymax = estimate + 1.96*std.error,
         color = method)) + 
  theme_cowplot() + 
  geom_pointrange(position = position_dodge(0.15)) + 
  ggsci::scale_color_lancet() + 
  labs(x = "Census Birth Year - DMF Birth Year",
       y = "Regression Coefficient",
       color = "Match Method"
  ) + 
  theme(legend.position = "bottom")

## save plot 
ggsave(education_gradient_birth_difference, filename = here("vignettes/assess_match_quality/figs/dmf_education_gradient_birth_difference.pdf"), height = 5, width = 7)


caseybreen/wcensoc documentation built on Nov. 21, 2024, 5:15 a.m.