data-raw/prep_since_1969_adjusted.R

# dict: https://seer.cancer.gov/popdata/popdic.html -----------------------

library(tidyverse)

# Option 1: Use the 19 age groups
# url = "https://seer.cancer.gov/popdata/yr1969_2016.19ages/us.1969_2016.19ages.adjusted.txt.gz"
# fil_gz = "data-raw/us.1969_2016.19ages.adjusted.txt.gz"
# fil    = "data-raw/us.1969_2016.19ages.adjusted.txt"

# Option 2: Use the single year age groups
url = "https://seer.cancer.gov/popdata/yr1969_2016.singleages/us.1969_2016.singleages.adjusted.txt.gz"
fil_gz = "data-raw/us.1969_2016.singleages.adjusted.txt.gz"
fil    = "data-raw/us.1969_2016.singleages.adjusted.txt"

if (!file.exists(fil_gz)) {
    download.file(url, fil_gz)
    R.utils::gunzip(fil_gz, fil, remove = FALSE)
}

# Load raw data -----------------------------------------------------------

read_fwf(fil,
         fwf_widths(c(4, 2, 2, 3, 2, 1, 1, 1, 2, 8),
                    c("year", "usps", "state_fips", "county_fips", "registry",
                      "race", "origin", "sex", "age", "pop"))) %>%
    print() -> raw

raw %>% sample_n(1000) %>% distinct(race)
raw %>% sample_n(1000) %>% distinct(origin)
raw %>% sample_n(1000) %>% distinct(sex)
raw %>% sample_n(1000) %>% distinct(registry)
raw %>% sample_n(1000) %>% distinct(age)

# Aggregate to county-year level ------------------------------------------

raw %>%
    # sample_n(100) %>% # Speed up the pipe
    mutate(county_fips = paste0(state_fips, county_fips)) %>%
    mutate(pop = pop %>% as.integer()) %>%
    arrange(state_fips, usps, county_fips, year) %>%
    group_by(state_fips, usps, county_fips, year) %>%
    summarise(pop = pop %>% sum()) %>%
    ungroup() %>%
    print() -> pop

raw %>%
    # sample_n(100) %>% # Speed up the pipe
    mutate(county_fips = paste0(state_fips, county_fips)) %>%
    mutate(pop = pop %>% as.integer()) %>%
    mutate(race = case_when(
        race == 1 ~ "white",
        race == 2 ~ "black",
        race == 3 ~ "other"
    )) %>%
    group_by(state_fips, usps, county_fips, year, race) %>%
    summarise(pop = pop %>% sum()) %>%
    spread(race, pop) %>%
    print() -> pop_by_race

raw %>%
    # sample_n(100) %>% # Speed up the pipe
    mutate(county_fips = paste0(state_fips, county_fips)) %>%
    mutate(pop = pop %>% as.integer()) %>%
    mutate(age = age %>% as.integer()) %>%
    mutate(age_grp = case_when(
        age %in% 0:18  ~ "age_1_18",
        age %in% 19:44 ~ "age_19_44",
        age %in% 45:64 ~ "age_45_64",
        age %in% 65:85 ~ "age_65_85p"
    )) %>%
    group_by(state_fips, usps, county_fips, year, age_grp) %>%
    summarise(pop = pop %>% sum()) %>%
    spread(age_grp, pop) %>%
    print() -> pop_by_age_grp

raw %>%
    # sample_n(100) %>% # Speed up the pipe
    mutate(county_fips = paste0(state_fips, county_fips)) %>%
    mutate(pop = pop %>% as.integer()) %>%
    mutate(sex_grp = if_else(sex == 2, "female", "male")) %>%
    group_by(state_fips, usps, county_fips, year, sex_grp) %>%
    summarise(pop = pop %>% sum()) %>%
    spread(sex_grp, pop) %>%
    print() -> pop_by_sex

pop %>%
    left_join(fips::state) %>%
    select(state_fips, usps, state, county_fips, year, pop) %>%
    left_join(pop_by_race) %>%
    left_join(pop_by_age_grp) %>%
    left_join(pop_by_sex) %>%
    print() -> pop_since_1969_adjusted

devtools::use_data(pop_since_1969_adjusted, overwrite = TRUE)
jjchern/county.pop documentation built on May 24, 2019, 5:06 a.m.