data-raw/us_second_cancer.R

## code to prepare `us_second_cancer` dataset goes here
library(tidyverse)
library(lubridate)

nn <- 100000

set.seed(2048)

spc <- tibble::tibble(fake_id = as.character(sample(nn:(nn*10), size = nn, replace = FALSE)), #using character variable as ID
              #sex sampled 50:50
              sex = sample(c("Male", "Female"), size = nn, replace = TRUE),
              #race (76% White, 14% Black, 10% Other)
              race = sample(c(rep("White", 76), rep("Black", 14), rep("Other", 11)), size = nn, replace = TRUE),
              #SEER registry (for now using no weight 1 for all)
              registry = sample(c(rep("SEER Reg 01 - San Francisco-Oakland SMSA", 3),
                                  rep("SEER Reg 20 - Detroit (Metropolitan)", 3),
                                  rep("SEER Reg 02 - Connecticut", 2),
                                  rep("SEER Reg 21 - Hawaii", 1)
                                  ), 
                                size = nn, replace = TRUE),
              #age
              fc_age = sample(c(rep(0:20, 5),
                                rep(21:40, 15),
                                rep(41:59, 20),
                                rep(60:64, 25),
                                rep(65:69, 30),
                                rep(70:74, 19),
                                rep(75:79, 17),
                                rep(80:84, 10),
                                rep(85:89, 6),
                                rep(90:100, 1),
                                rep(100:105, 1))
                                   ,size = nn, replace = TRUE),
              #grouping of fc_age
              fc_agegroup = dplyr::case_when(fc_age < 5                  ~ "00 - 04" ,
                                      fc_age >= 5  & fc_age < 10  ~ "05 - 09" ,
                                      fc_age >= 10 & fc_age < 15  ~ "10 - 14" ,
                                      fc_age >= 15 & fc_age < 20  ~ "15 - 19" ,
                                      fc_age >= 20 & fc_age < 25  ~ "20 - 24" ,
                                      fc_age >= 25 & fc_age < 30  ~ "25 - 29" ,
                                      fc_age >= 30 & fc_age < 35  ~ "30 - 34" ,
                                      fc_age >= 35 & fc_age < 40  ~ "35 - 39" ,
                                      fc_age >= 40 & fc_age < 45  ~ "40 - 44" ,
                                      fc_age >= 45 & fc_age < 50  ~ "45 - 49" ,
                                      fc_age >= 50 & fc_age < 55  ~ "50 - 54" ,
                                      fc_age >= 55 & fc_age < 60  ~ "55 - 59" ,
                                      fc_age >= 60 & fc_age < 65  ~ "60 - 64" ,
                                      fc_age >= 65 & fc_age < 70  ~ "65 - 69" ,
                                      fc_age >= 70 & fc_age < 75  ~ "70 - 74" ,
                                      fc_age >= 75 & fc_age < 80  ~ "75 - 79" ,
                                      fc_age >= 80 & fc_age < 85  ~ "80 - 84" ,
                                      fc_age >= 85                ~ "85 - 120",
                                      TRUE ~ NA_character_),
              #tmp var for creating missings
              miss = sample(c(1, rep(0, 500)), size = nn, replace = TRUE),
              #tmp var for random month
              ran_mon = sample(1:12, size = nn, replace = TRUE))

tumors <- tibble::tibble(fake_id = as.character(sample(spc$fake_id, size = nn * 1.14, replace = TRUE)),
              #Site of  cancer FC, weighted by approximated relative incidence
              t_site_icd = sample(c( rep("C14",  1), 
                                     rep("C18", 31), 
                                     rep("C34", 91), 
                                     rep("C44", 30), 
                                     rep("C50", 75), 
                                     rep("C54", 14), 
                                     rep("C64", 19), 
                                     rep("C80",  8))
                                   , size = nn*1.14, replace = TRUE))
              

tumors2 <- tumors %>%
  dplyr::mutate(t_year = 1990 + sample(1:29, size = n(), replace = TRUE),
         t_month = sample(1:12, size = n(), replace = TRUE),
         t_datediag = lubridate::make_date(year = t_year, month = t_month, day = 15),
         t_yeardiag = dplyr::case_when(year(t_datediag) >= 1970 &  year(t_datediag) < 1975  ~ "1970 - 1974",
                                year(t_datediag) >= 1975 &  year(t_datediag) < 1980  ~ "1975 - 1979",   
                                year(t_datediag) >= 2015 &  year(t_datediag) < 2020  ~ "2015 - 2019",
                                year(t_datediag) >= 1980 &  year(t_datediag) < 1985  ~ "1980 - 1984",
                                year(t_datediag) >= 1985 &  year(t_datediag) < 1990  ~ "1985 - 1989",
                                year(t_datediag) >= 1990 &  year(t_datediag) < 1995  ~ "1990 - 1994",
                                year(t_datediag) >= 1995 &  year(t_datediag) < 2000  ~ "1995 - 1999",
                                year(t_datediag) >= 2000 &  year(t_datediag) < 2005  ~ "2000 - 2004",
                                year(t_datediag) >= 2005 &  year(t_datediag) < 2010  ~ "2005 - 2009",
                                year(t_datediag) >= 2010 &  year(t_datediag) < 2015  ~ "2010 - 2014",
                                TRUE ~ NA_character_),
         t_dco = sample(c(rep("DCO case", 1), rep("histology", 9)), size = n(), replace = TRUE),
         t_hist = sample(c(8000:9989), size = n(), replace = TRUE)
         ) %>%
  #calculate new renumbered variable #group by case_id_var
  dplyr::arrange(fake_id, t_datediag) %>%
  tidytable::mutate(SEQ_NUM := as.integer(tidytable::row_number()), .by = fake_id)

cancer_pre <- tumors2 %>% 
  tibble::as_tibble() %>%
  dplyr::left_join(., spc, by = "fake_id") %>%
  dplyr::arrange(fake_id, SEQ_NUM)
  

cancer_pre2 <- cancer_pre %>%
  #year of birth (first cancer diagnosis year minus age)
  dplyr::mutate(p_yeardob = case_when(SEQ_NUM == 1 ~ t_year - fc_age, TRUE ~ NA_real_)) %>%
  tidyr::fill(p_yeardob) %>%
  #calculate year of death by taking year of last SEQ_NUM and adding a random number of years
  dplyr::group_by(fake_id) %>%
  dplyr::mutate(p_lastdiag = dplyr::last(t_datediag),
         p_lastyear = dplyr::last(t_year)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(p_yeardod = case_when(SEQ_NUM == 1 ~ p_lastyear + sample(0:25, size = n(), replace = TRUE),
                               TRUE ~ NA_real_))%>%
  tidyr::fill(p_yeardod) %>%
  #add random missings to date of death
  dplyr::mutate(p_yeardod = dplyr::case_when(miss == 0 ~ p_yeardod,
                               TRUE ~ NA_real_)) %>%
  #make dates of years
  dplyr::mutate(
    datebirth = lubridate::make_date(year = p_yeardob, month = 01, day = 01),
    datedeath = lubridate::make_date(year = p_yeardod, month = ran_mon, day = 01),
    #calculate life status for year 2019
    p_alive = dplyr::case_when(p_yeardod > 2019 ~ "Alive",
                        TRUE ~ "Dead"),
    #set date of death missing if Alive
    datedeath = dplyr::case_when(p_alive == "Dead" ~ datedeath,
                            TRUE ~ NA_Date_)) %>%
  #dod: make sure that t_diagdat > p_datedeath, if not take p_lastdiag as datedeath
  dplyr::mutate(datedeath = dplyr::case_when(datedeath < p_lastdiag ~ p_lastdiag,
                                             TRUE ~ datedeath)) %>%
  #minimum year of death for missing dod
  dplyr::mutate(p_dodmin = dplyr::case_when(p_alive == "Dead" & is.na(datedeath)  ~ p_lastdiag,
                                       TRUE ~ NA_Date_))

  
us_second_cancer <- cancer_pre2 %>%
  #sort columns
  dplyr::select(fake_id, SEQ_NUM, registry, sex, race, datebirth, t_datediag, t_site_icd, t_dco, t_hist, fc_age, datedeath, 
         p_alive, p_dodmin, fc_agegroup, t_yeardiag,
         -t_year, -t_month, -p_yeardob, -p_yeardod, -p_lastyear, -p_lastdiag, -miss, -ran_mon)

rm(cancer_pre, cancer_pre2, spc, tumors, tumors2)

usethis::use_data(us_second_cancer, overwrite = TRUE)
marianschmidt/msSPChelpR documentation built on Feb. 1, 2024, 6:45 a.m.