[^updated]: Last updated: `r format(Sys.time(), '%d %B, %Y')`

| Page| Variable | Label |
|----:|:-------------|:---------------------------------------------|
| \hyperlink{page.2}{2} | \hyperlink{page.2}{HISTID} |Historical unique identifier |
| \hyperlink{page.3}{3} | \hyperlink{page.3}{byear} |Year of birth |
| \hyperlink{page.4}{4} | \hyperlink{page.4}{bmonth} |Month of birth |
| \hyperlink{page.5}{5} | \hyperlink{page.5}{dyear} |Year of death |
| \hyperlink{page.6}{6} | \hyperlink{page.6}{dmonth} |Month of death |
| \hyperlink{page.7}{7} | \hyperlink{page.7}{death_age} |Age at death (years) |
| \hyperlink{page.8}{8} | \hyperlink{page.8}{link_abe_exact_conservative} |Flag for conservative ABE match |
| \hyperlink{page.9}{9} | \hyperlink{page.9}{weight} |CenSoc weight |
| \hyperlink{page.10}{10} | \hyperlink{page.10}{weight_conservative} |CenSoc weight (Conservative Sample) |

\vspace{250pt}

Summary: The CenSoc-DMF Version 2.1 dataset (N = 7,762,221) links the full-count 1940 Census to the Death Master File (DMF), a collection of death records reported to the Social Security Administration. Records were linked using the standard and conservative ABE method developed by Abramitzky, Boustan, and Eriksson (2012, 2014, 2017). To merge 1940 Census variables with this dataset, researchers can obtain a copy of the 1940 Census from IPUMS-USA and link on the individual-level, unique identifier HISTID variable.

\newpage

\huge HISTID \normalsize \vspace{12pt}

Label: Historical Unique Identifier

Description: HISTID is a unique individual-level identifier. It can be used to merge the CenSoc-DMF file with the 1940 Full-Count Census from IPUMS.

\newpage

\huge byear \normalsize \vspace{12pt}

Label: Birth Year

Description: byear reports a person's year of birth, as recorded in the Social Security Death Master File.

## Library Packages
library(tidyverse)
library(data.table)

## Load the CenSoc-DMF v2.1 release from disk; the tables and plots below
## all read from this object.
censoc_dmf_v2.1 <- read_csv(
  file = "/data/censoc/censoc_data_releases/censoc_dmf/censoc_dmf_v2.1/censoc_dmf_v2.1.csv"
)
## Line-and-point plot of the number of records per birth year.
## count() returns one row per byear with the tally in column `n`.
byear_plot <- censoc_dmf_v2.1 %>%
  count(byear) %>%
  ggplot(aes(x = byear, y = n)) +
  geom_line() +
  geom_point() +
  theme_minimal(base_size = 15) +
  theme(legend.position = "bottom") +
  ## Title and axis labels are set once here; the former xlab("title")
  ## placeholder was always overridden by labs(x = "Year").
  labs(title = "Year of Birth", x = "Year", y = "Count") +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5))

\vspace{75pt}

## Display the birth-year plot
byear_plot

\newpage \huge bmonth \normalsize \vspace{12pt}

Label: Birth Month

Description: bmonth reports a person's month of birth, as recorded in the Social Security Death Master File.

## run in the console and copy and paste into documentation
## Frequency table of birth month; rows with bmonth == 0 are dropped first.
## Month names are looked up from the month number (base R `month.name`)
## rather than assigned by row position, so a missing month or an unexpected
## code cannot shift the labels or abort the table; an unrecognised code gets
## an NA label instead.
## NOTE(review): assumes bmonth codes January-December as 1-12 -- confirm.
bmonth_tabulated <- censoc_dmf_v2.1 %>%
  filter(bmonth != 0) %>%
  count(bmonth) %>%
  mutate(
    freq = signif(n * 100 / sum(n), 2),
    label = month.name[match(bmonth, seq_len(12))]
  ) %>%
  select(bmonth, label, n, `freq %` = freq) %>%
  knitr::kable()

\vspace{30pt}

## Display the birth-month frequency table
bmonth_tabulated

\newpage

\huge dyear \normalsize \vspace{12pt}

Label: Death Year

Description: dyear reports a person's year of death, as recorded in the Social Security Death Master File.

## Line-and-point plot of the number of records per death year.
## count() returns one row per dyear with the tally in column `n`.
dyear_plot <- censoc_dmf_v2.1 %>%
  count(dyear) %>%
  ggplot(aes(x = dyear, y = n)) +
  geom_line() +
  geom_point() +
  theme_minimal(base_size = 15) +
  theme(legend.position = "bottom") +
  ## Title and axis labels are set once here; the former xlab("title")
  ## placeholder was always overridden by labs(x = "Year").
  labs(title = "Year of Death", x = "Year", y = "Count") +
  ## NOTE(review): the y axis is pinned to 0-300,000; yearly counts above that
  ## cap would fall outside the scale limits -- confirm it still fits the data.
  scale_y_continuous(labels = scales::comma, limits = c(0, 300000)) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5))

\vspace{75pt}

## Display the death-year plot
dyear_plot

\newpage

\huge dmonth \normalsize \vspace{12pt}

Label: Death Month

Description: dmonth reports a person's month of death, as recorded in the Social Security Death Master File.

## Frequency table of death month.
## Month names are looked up from the month number (base R `month.name`)
## rather than assigned by row position, so a missing month or an unexpected
## code (e.g. 0 or NA) cannot shift the labels or abort the table; an
## unrecognised code gets an NA label instead.
## NOTE(review): assumes dmonth codes January-December as 1-12 -- confirm.
dmonth_tabulated <- censoc_dmf_v2.1 %>%
  count(dmonth) %>%
  mutate(
    freq = signif(n * 100 / sum(n), 2),
    label = month.name[match(dmonth, seq_len(12))]
  ) %>%
  select(dmonth, label, n, `freq %` = freq) %>%
  knitr::kable()

\vspace{30pt}

## Display the death-month frequency table
dmonth_tabulated

\newpage

\huge death_age \normalsize \vspace{12pt}

Label: Age at Death (Years)

Description: death_age reports a person's age at death in years, calculated using the birth and death information recorded in the Social Security Death Master File.

## Line-and-point plot of the number of records at each age at death.
## count() returns one row per death_age with the tally in column `n`.
death_age_plot <- censoc_dmf_v2.1 %>%
  count(death_age) %>%
  ggplot(aes(x = death_age, y = n)) +
  geom_line() +
  geom_point() +
  theme_minimal(base_size = 15) +
  theme(legend.position = "bottom") +
  ## Title and axis labels are set once here; the former xlab("title")
  ## placeholder was always overridden by labs(x = "Age at Death").
  labs(title = "Age at Death", x = "Age at Death", y = "Count") +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5))

\vspace{75pt}

## Display the age-at-death plot
death_age_plot

\newpage

\huge link_abe_exact_conservative

\normalsize

\vspace{12pt}

Label: Flag for conservative ABE match

Description: A flag variable reporting whether a match was established using the conservative ABE method with exact names.

## Frequency table of the conservative-match flag, highest flag value first.
## Labels are attached by row position after sorting, so the first label
## belongs to the largest flag value and the second to the smallest.
link_abe_exact_conservative_tabulated <- censoc_dmf_v2.1 %>%
  count(link_abe_exact_conservative) %>%
  mutate(freq = signif(n * 100 / sum(n), 3)) %>%
  arrange(desc(link_abe_exact_conservative)) %>%
  mutate(
    label = c(
      "Conservative and Standard ABE Link",
      "Standard ABE Link Only"
    )
  ) %>%
  select(link_abe_exact_conservative, label, n, `freq %` = freq) %>%
  knitr::kable()

\vspace{50pt}

## Display the conservative-match flag table
link_abe_exact_conservative_tabulated

\newpage

\huge weight \normalsize

\vspace{12pt}

Label: Sample Weights

Description: A post-stratification person-weight to Human Mortality Database (HMD) totals for persons (1) born between 1895 and 1939, (2) dying between 1975 and 2005, and (3) dying between ages 65 and 100. Please see the CenSoc Methods Protocol for more details on the weighting procedure.

## Two-column (Value, Label) table giving the rounded minimum and maximum of
## the non-missing weights, followed by an NA row labelled
## "No Weight Assigned".
weights_tabulated <- censoc_dmf_v2.1 %>%
  filter(!is.na(weight)) %>%
  summarise(
    `Min Weight` = round(min(weight), 2),
    `Max Weight` = round(max(weight), 2)
  ) %>%
  ## The summary is a single row, so both columns pivot straight to long form.
  pivot_longer(everything(), names_to = "Label", values_to = "Value") %>%
  select(Value, Label) %>%
  add_row(Label = "No Weight Assigned", Value = NA) %>%
  knitr::kable()

## Horizontal boxplot of the non-missing weights. The y axis carries no
## information, so its title, tick labels and tick marks are blanked.
weight.plot <- censoc_dmf_v2.1 %>%
  filter(!is.na(weight)) %>%
  ggplot(aes(x = weight)) +
  geom_boxplot(fill = "grey92") +
  ylim(-1, 1) +
  theme_minimal(base_size = 15) +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(title = "Distribution of Weights", x = "Weight")

\vspace{50pt}

## Display the min/max weight table
weights_tabulated

\vspace{50pt}

## Display the weight boxplot
weight.plot

\newpage

\huge weight_conservative \normalsize

\vspace{12pt}

Label: Sample Weights (Conservative Sample)

Description: A post-stratification person-weight to Human Mortality Database (HMD) totals (only for matches established via the conservative ABE algorithm) for persons (1) born between 1895 and 1939, (2) dying between 1975 and 2005, and (3) dying between ages 65 and 100. Please see the CenSoc Methods Protocol for more details on the weighting procedure.

## Two-column (Value, Label) table giving the rounded minimum and maximum of
## the non-missing conservative weights among records flagged
## link_abe_exact_conservative == 1, followed by an NA row labelled
## "No Weight Assigned".
## NOTE(review): the object name is spelled "conservtive"; it is kept as-is
## because the statement that prints the table uses the same spelling.
weights_conservtive_tabulated <- censoc_dmf_v2.1 %>%
  filter(
    link_abe_exact_conservative == 1,
    !is.na(weight_conservative)
  ) %>%
  summarise(
    `Min Weight` = round(min(weight_conservative), 2),
    `Max Weight` = round(max(weight_conservative), 2)
  ) %>%
  ## The summary is a single row, so both columns pivot straight to long form.
  pivot_longer(everything(), names_to = "Label", values_to = "Value") %>%
  select(Value, Label) %>%
  add_row(Label = "No Weight Assigned", Value = NA) %>%
  knitr::kable()

## Horizontal boxplot of the non-missing conservative weights among records
## flagged link_abe_exact_conservative == 1. The y axis carries no
## information, so its title, tick labels and tick marks are blanked.
weight_conservative.plot <- censoc_dmf_v2.1 %>%
  filter(
    link_abe_exact_conservative == 1,
    !is.na(weight_conservative)
  ) %>%
  ggplot(aes(x = weight_conservative)) +
  geom_boxplot(fill = "grey92") +
  ylim(-1, 1) +
  theme_minimal(base_size = 15) +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(title = "Distribution of Weights", x = "Weight")

\vspace{50pt}

## Display the min/max conservative-weight table
weights_conservtive_tabulated

\vspace{50pt}

## Display the conservative-weight boxplot
weight_conservative.plot


caseybreen/wcensoc documentation built on Nov. 21, 2024, 5:15 a.m.