R/getCases.R

Defines functions getCases

Documented in getCases

# !diagnostics off

getCases <- function(outcome,cpsCode,icd9Code,icd10Code,rep_code,inccanv=NULL,sitec=NULL,
                     initialPath=file.path("s:/cps")){
  require(dplyr)
  message("getCases() was updated on July 30th 2019 and includes followup through June 2015.  Please update the function if followup is extended")


  # Pull the date of interview variables for a censoring cohort - include prevalent cancer variable
  foo <- rbind(variables$men,variables$women)
  foo$FullPath <- file.path(initialPath,foo$subPath,foo$File)
  dtint <- dplyr::filter(foo,substr(Variables,1,5)=="DTINT")
  files <- dtint$FullPath
  keep <- c("ID",dtint$Variables,"CA92","CA82")
  rm(foo)

  men <- lapply(dtint$FullPath[dtint$Sex=="MEN"],function(x){
    foo <- readRDS(x)
    foo <- foo[,names(foo) %in% keep]
  }) %>% Reduce(function(x,y) full_join(x,y,"ID"),.)
  men$SEX <- "MEN"
  women <- lapply(dtint$FullPath[dtint$Sex=="WOMEN"],function(x){
    foo <- readRDS(x)
    foo <- foo[,names(foo) %in% keep]
  }) %>% Reduce(function(x,y) full_join(x,y,"ID"),.)
  women$SEX <- "WOMEN"
  dtint <- dplyr::bind_rows(men,women)
  rm(keep,men,women)


  # Master and person-year files
  master  <- rbind(readRDS(file.path(initialPath,"master/cps2smgmst17f.rds")),
                   readRDS(file.path(initialPath,"master/cps2smgmst17m.rds")))
  master$MASTER <- 1
  pyear <- rbind(readRDS(file.path(initialPath,"nutrition/pyear/nut92fempyr1216.rds")),
                 readRDS(file.path(initialPath,"nutrition/pyear/nut92menpyr1216.rds")))


  # Mortality files - input codes included in function call

  mortality <- readRDS(file.path(initialPath,"mortality/smgmrt17d190411.rds"))
  mortality$DEAD <- 1

  # Reported files
  path <- file.path(initialPath, "nutrition/reported")

  # Reported
  report15 <- readRDS(file.path(path,"cancer15apr2019reported.rds"))
  report15 <- reportedFun(report15,"15",rep_code)
  report13 <- readRDS(file.path(path,"cancer13aug2018reported.rds"))
  report13 <- reportedFun(report13,"13",rep_code)
  report11 <- readRDS(file.path(path,"cancer11aug2018reported.rds"))
  report11 <- reportedFun(report11,"11",rep_code)
  report09 <- readRDS(file.path(path,"cancer09aug2018reported.rds"))
  report09 <- reportedFun(report09,"09",rep_code)
  report07 <- readRDS(file.path(path,"cancer07aug2018reported.rds"))
  report07 <- reportedFun(report07,"07",rep_code)
  report05 <- readRDS(file.path(path,"cancer05aug2018reported.rds"))
  report05 <- reportedFun(report05,"05",rep_code)
  report03 <- readRDS(file.path(path,"cancer03mar2011reported.rds"))
  report03 <- reportedFun(report03,"03",rep_code)
  report01 <- readRDS(file.path(path,"cancer01mar2011reported.rds"))
  report01 <- reportedFun(report01,"01",rep_code)
  report99 <- readRDS(file.path(path,"cancer99sep2007reported.rds"))
  report99 <- reportedFun(report99,"99",rep_code)
  report97 <- readRDS(file.path(path,"cancer97feb2015reported.rds"))
  report97 <- reportedFun(report97,"97",rep_code)
  reported <- Reduce(function(x,y) full_join(x,y,"ID"),
                     list(report97,report99,report01,report03,report05,
                          report07,report09,report11,report13,report15))
  reported <- reported[!duplicated(reported$ID),]
  rm(report97,report99,report01,report03,report05,
     report07,report09,report11,report13,report15)


  # Verified file
  verified <- readRDS(file.path(initialPath,"nutrition/verified/mar2019verified.rds"))

  # Recode diagnosis date for everyone
  verified$DODDY[verified$DODDY=="99"] <- "15"
  verified$DODMO[verified$DODMO=="99"] <- "6"
  verified$DODDY[verified$DODDY > "30" & verified$DODMO %in% c("04","06","09","11")] <- "30"
  verified$DODDY[verified$DODDY > "28" & verified$DODMO==2] <- "28"
  verified$DXDATE <- as.Date(paste(verified$DODYR, verified$DODMO, verified$DODDY, sep="-"),
                             format="%Y-%m-%d")

  # Select only first diagnoses
  verified <- filter(verified[order(verified$DXDATE),],!duplicated(ID) &
                       DXDATE < as.Date("2015-06-30"))


  # Selected cancer and all other cancers diagnosis
  verified$VERFCANCER <- with(verified,ifelse(
    INCCANV %in% inccanv | SITEC %in% sitec,1,0))
  verified$VERFOTHER <- ifelse(verified$VERFCANCER==1,0,1)

  # Drop SEX because the variable messes up my merges
  verified <- dplyr::select(verified,-SEX)

  # I want to keep all the variables in the verified file
  verifiedNames <- names(verified)
  verifiedNames <- verifiedNames[!verifiedNames %in% c("VERFCANCER","VERFOTHER","ID")]

  # Rename othver variables and keep
  ver <- select(verified[verified$VERFCANCER==1,],-VERFOTHER)
  othver <- verified[verified$VERFOTHER==1,]
  othver$OTHDXDATE <- othver$DXDATE
  othver$OTHVERCAN <- othver$INCCANV
  othver <- othver[,c("ID","OTHDXDATE","OTHVERCAN","VERFOTHER")]


  cohort <- dplyr::filter(Reduce(function(x,y) full_join(x,y,"ID"),
                                 list(dtint,ver,othver,master,mortality,pyear,reported)),
                          NUT_92==1 & MASTER==1)
  names(cohort) <- tolower(names(cohort))
  rm(dtint,master,mortality,pyear,reported,verified,othver,ver)


# Convert dates
  dates <- c("dtint82","dtint92","dtint97","dtint99","dtint01","dtint03","dtint05",
             "dtint07","dtint09","dtint11","dtint13","dtint15","bdaydate","datedd")
  cohort[,dates] <- lapply(cohort[,dates],function(x) as.Date(x,origin="1960-01-01"))

  # Hard code some missing data
  na <- c("dead","verfcancer","verfother","report97","report99","report01","report03",
          "report05","report07","report09","report11","report13","report15")
  cohort[,na] <- lapply(cohort[,na],function(x) ifelse(is.na(x),0,x))

  # Clean diagnosis dates
  # Clean up the diagnosis dates
   cohort$dxdate <- dxdateClean(cohort,"dxdate")


  # End of followup
  cohort$dead <- ifelse(cohort$dead==1 & cohort$datedd > "2015-06-30",0,cohort$dead)
  cohort$dthdate <- ifelse(cohort$dead==1,cohort$datedd,NA)

  # lost to followup
  cohort$lostfu <- ifelse(apply(cohort[,c("nut_97","nut_99","nut_01","nut_03",
                                          "nut_05","nut_07","nut_09","nut_11",
                                          "nut_13","nut_15")],1,sum)==0,1,0)
  cohort$lostfu[cohort$lostfu==1 & cohort$datedd <= "1997-12-31"] <- 0


  # Fatal cases
  # any cancer
  cohort$anycancer <- with(cohort,ifelse(
    (codetype=='1' & code1>='31' & code1 <='99') |
      (codetype=='2' & code1>='140' & code1 <= '2099') |
      (codetype=='2' & code1>='230' & code1 <= '2399') |
      (codetype=='3' & code1>='C00' & code1 <= 'D099') |
      (codetype=='3' & code1>='D37' & code1 <= 'D499'),1,0))
  cohort$cancerdth <- with(cohort,ifelse(
    (codetype=="1" & code1 %in% cpsCode) |
      (codetype=="2" & code1 %in% icd9Code) |
      (codetype=="3" & code1 %in% icd10Code),1,ifelse(
        anycancer==0 &
          (codetype=="1" & code2 %in% cpsCode) |
          (codetype=="2" & code2 %in% icd9Code) |
          (codetype=="3" & code2 %in% icd10Code),1,ifelse(
            anycancer==0 &
              (codetype=="1" & code3 %in% cpsCode) |
              (codetype=="2" & code3 %in% icd9Code) |
              (codetype=="3" & code3 %in% icd10Code),1,0))))

  # some deaths occurred after the end of followup date
  cohort$anycancer[cohort$dead==0] <- 0
  cohort$cancerdth[cohort$dead==0] <- 0

  # death only cases
  cohort$cancerdied <- ifelse(cohort$verfcancer==0 & cohort$cancerdth==1,1,0)

  # Incident case:  verified + died (don't include self-only)
  cohort$incidence <- ifelse(cohort$cancerdied==1 | cohort$verfcancer==1,1,0)
  cohort$incidence[is.na(cohort$incidence)] <- 0

  # Outcome definitions
  report <- paste0("report",c("97","99","01","03","05","07","09","11","13","15"))
  cohort$selfreport <- ifelse(apply(cohort[,report],1,sum) != 0,1,0)

  # Flag the self-only cases
  cohort$self_only <- ifelse(cohort$incidence==0 & cohort$selfreport==1,1,0)


  # Censoring ---------------------------------------------------------------

  # Censor criteria
  # 1.  Last survey returned
  # 2.  Date of any diagnosis/death
  # 3.  Reported cancer of interest


  cohort$endfu <- as.Date("2015-06-30",origin="1970-01-01")

  cohort$dateft <- apply(cohort[,c("datedd","endfu")],1,function(x){
    min(as.Date(x))
  })
  cohort$dateft <- with(cohort,ifelse(dead==1, dthdate,dateft))
  cohort$dateft <- with(cohort,ifelse(verfcancer==1 & !is.na(dxdate),dxdate,dateft))
  cohort$dateft <- with(cohort,ifelse(verfother==1, othdxdate,dateft))
  cohort$dateft <- as.Date(cohort$dateft,origin="1970-01-01")


  # 2013
  cohort$cen13 <- 0
  cohort$cen13 <- with(cohort,ifelse(
    ((!is.na(dtint13) & dateft>dtint13) | (is.na(dtint13) & dateft>=as.Date("2013-10-31"))) &
      (nut_13==1 & nut_15==0) &
      ((dead==0) | (dead==1 & datedd>as.Date("2015-06-30"))), 1, 0))
  cohort$cen13[cohort$report13==1 & cohort$verfcancer==0 & cohort$cancerdied==0] <- 1 # censor on reported
  cohort$change13 <- ifelse(cohort$cen13 == 1,1,0)
  cohort$dateft[cohort$change13==1] <- cohort$dtint13[cohort$change13==1]
  cohort$dateft[cohort$change13==1 & is.na(cohort$dtint13)] <- as.Date("2013-10-31")


  # 2011
  cohort$cen11 <- 0
  cohort$cen11 <- with(cohort,ifelse(
    ((!is.na(dtint11) & dateft>dtint11) | (is.na(dtint11) & dateft>=as.Date("2011-10-31"))) &
      (nut_11==1 & nut_13==0 & nut_15==0) &
      ((dead==0) | (dead==1 & datedd>as.Date("2013-10-31"))), 1, 0))
  cohort$cen11[cohort$report11==1 & cohort$verfcancer==0 & cohort$cancerdied==0] <- 1 # censor on reported
  cohort$change11 <- ifelse(cohort$cen11 == 1,1,0)
  cohort$dateft[cohort$change11==1] <- cohort$dtint11[cohort$change11==1]
  cohort$dateft[cohort$change11==1 & is.na(cohort$dtint11)] <- as.Date("2011-10-31")

  # 2009
  cohort$cen09 <- 0
  cohort$cen09 <- with(cohort,ifelse(
    ((!is.na(dtint09) & dateft>dtint09) | (is.na(dtint09) & dateft>=as.Date("2009-10-31"))) &
      (nut_09==1 & nut_11==0 & nut_13==0 & nut_15==0) &
      ((dead==0) | (dead==1 & datedd>as.Date("2011-10-31"))), 1, 0))
  cohort$cen09[cohort$report09==1 & cohort$verfcancer==0 & cohort$cancerdied==0] <- 1 # censor on reported
  cohort$change09 <- ifelse(cohort$cen09 == 1,1,0)
  cohort$dateft[cohort$change09==1] <- cohort$dtint09[cohort$change09==1]
  cohort$dateft[cohort$change09==1 & is.na(cohort$dtint09)] <- as.Date("2009-10-31")


  # 2007
  cohort$cen07 <- 0
  cohort$cen07 <- with(cohort,ifelse(
    ((!is.na(dtint07) & dateft>dtint07) | (is.na(dtint07) & dateft>=as.Date("2007-10-31"))) &
      (nut_07==1 & nut_09==0 & nut_11==0 & nut_13==0 & nut_15==0) &
      ((dead==0) | (dead==1 & datedd>as.Date("2009-10-31"))), 1, 0))
  cohort$cen07[cohort$report07==1 & cohort$verfcancer==0 & cohort$cancerdied==0] <- 1 # censor on reported
  cohort$change07 <- ifelse(cohort$cen07 == 1,1,0)
  cohort$dateft[cohort$change07==1] <- cohort$dtint07[cohort$change07==1]
  cohort$dateft[cohort$change07==1 & is.na(cohort$dtint07)] <- as.Date("2007-10-31")


  # 2005
  cohort$cen05 <- 0
  cohort$cen05 <- with(cohort,ifelse(
    ((!is.na(dtint05) & dateft>dtint05) | (is.na(dtint05) & dateft>=as.Date("2005-10-31"))) &
      (nut_05==1 & nut_07==0 & nut_09==0 & nut_11==0 & nut_13==0 & nut_15==0) &
      ((dead==0) | (dead==1 & datedd>as.Date("2007-10-31"))), 1, 0))
  cohort$cen05[cohort$report05==1 & cohort$verfcancer==0 & cohort$cancerdied==0] <- 1 # censor on reported
  cohort$change05 <- ifelse(cohort$cen05 == 1,1,0)
  cohort$dateft[cohort$change05==1] <- cohort$dtint05[cohort$change05==1]
  cohort$dateft[cohort$change05==1 & is.na(cohort$dtint05)] <- as.Date("2005-10-31")


  # 2003
  cohort$cen03 <- 0
  cohort$cen03 <- with(cohort,ifelse(
    ((!is.na(dtint03) & dateft>dtint03) | (is.na(dtint03) & dateft>=as.Date("2003-10-31"))) &
      (nut_03==1 & nut_05==0 & nut_07==0 & nut_09==0 & nut_11==0 & nut_13==0 & nut_15==0) &
      ((dead==0) | (dead==1 & datedd>as.Date("2005-10-31"))), 1, 0))
  cohort$cen03[cohort$report03==1 & cohort$verfcancer==0 & cohort$cancerdied==0] <- 1 # censor on reported
  cohort$change03 <- ifelse(cohort$cen03 == 1,1,0)
  cohort$dateft[cohort$change03==1] <- cohort$dtint03[cohort$change03==1]
  cohort$dateft[cohort$change03==1 & is.na(cohort$dtint03)] <- as.Date("2003-10-31")


  # 2001
  cohort$cen01 <- 0
  cohort$cen01 <- with(cohort,ifelse(
    ((!is.na(dtint01) & dateft>dtint01) | (is.na(dtint01) & dateft>=as.Date("2001-12-31"))) &
      (nut_01==1 & nut_03==0 & nut_05==0 & nut_07==0 & nut_09==0 & nut_11==0 & nut_13==0 & nut_15==0) &
      ((dead==0) | (dead==1 & datedd>as.Date("2003-10-31"))), 1, 0))
  cohort$cen01[cohort$report01==1 & cohort$verfcancer==0 & cohort$cancerdied==0] <- 1 # censor on reported
  cohort$change01 <- ifelse(cohort$cen01 == 1,1,0)
  cohort$dateft[cohort$change01==1] <- cohort$dtint01[cohort$change01==1]
  cohort$dateft[cohort$change01==1 & is.na(cohort$dtint01)] <- as.Date("2001-10-31")



  # 1999
  cohort$cen99 <- 0
  cohort$cen99 <- with(cohort,ifelse(
    ((!is.na(dtint99) & dateft>dtint99) | (is.na(dtint99) & dateft>=as.Date("1999-12-31"))) &
      (nut_99==1 & nut_01==0 & nut_03==0 & nut_05==0 & nut_07==0 & nut_09==0 &
         nut_11==0 & nut_13==0 & nut_15==0) &
      ((dead==0) | (dead==1 & datedd>as.Date("2001-12-31"))), 1, 0))
  cohort$cen99[cohort$report99==1 & cohort$verfcancer==0 & cohort$cancerdied==0] <- 1 # censor on reported
  cohort$change99 <- ifelse(cohort$cen99 == 1,1,0)
  cohort$dateft[cohort$change99==1] <- cohort$dtint99[cohort$change99==1]
  cohort$dateft[cohort$change99==1 & is.na(cohort$dtint99)] <- as.Date("1999-10-31")


  # 1997
  cohort$cen97 <- 0
  cohort$cen97 <- with(cohort,ifelse(
    ((!is.na(dtint97) & dateft>dtint97) | (is.na(dtint97) & dateft>=as.Date("1997-12-31"))) &
      (nut_97==1 & nut_99==0 & nut_01==0 & nut_03==0 & nut_05==0 & nut_07==0 & nut_09==0 &
         nut_11==0 & nut_13==0 & nut_15==0) &
      ((dead==0) | (dead==1 & datedd>as.Date("1999-12-31"))), 1, 0))
  cohort$cen97[cohort$report97==1 & cohort$verfcancer==0 & cohort$cancerdied==0] <- 1 # censor on reported
  cohort$change97 <- ifelse(cohort$cen97 == 1,1,0)
  cohort$dateft[cohort$change97==1] <- cohort$dtint97[cohort$change97==1]
  cohort$dateft[cohort$change97==1 & is.na(cohort$dtint97)] <- as.Date("1997-10-31")


  cohort$changeft <- ifelse(apply(cohort[,c("change97","change99","change01","change03",
                                            "change05","change07","change09","change11",
                                            "change13")],1,sum)>0,1,0)

  # flag prevalent cancers
  cohort[,c("prev82","prev92")] <- 0
  cohort$prev82[!cohort$ca82 %in% c("N","73")] <- 1
  cohort$prev92[cohort$ca92 != 0] <- 1
  cohort$prevcancer <- with(cohort,ifelse(prev92==1 | prev82==1,1,0))

  # Change case value for censored cases
  cohort$incidence[cohort$incidence==1 & cohort$changeft==1] <- 0
  cohort[cohort$incidence==0,c("cancerdth","cancerdied","cancerdth2","selfreport","verfcancer")] <- 0

  # Change some variable names for my outcomes to make them more descriptive
  cohort[,paste0(outcome,"_","FINAL")] <- cohort$incidence
  cohort[,paste0(outcome,"_","DEATH")] <- cohort$cancerdth
  cohort[,paste0(outcome,"_","DIED")] <- cohort$cancerdied
  cohort[,paste0(outcome,"_","SELF")] <- cohort$selfreport
  cohort[,paste0(outcome,"_","VERIFIED")] <- cohort$verfcancer
  renames <- paste0(outcome,"_",c("FINAL","DEATH","DIED","SELF","VERIFIED"))


  # Only want to return some of the variables
  keep <- c("sex","code1","code2","code3","dead","dthdate","lostfu","anycancer",
            "dateft","prev82","prev92","prevcancer","self_only","othdxdate","verfother",
            "datedd")

  names(cohort) <- toupper(names(cohort))
  cohort <- cohort[,toupper(c("id",verifiedNames,keep,renames))]

  return(cohort)
}
buddha2490/MargotFun documentation built on Nov. 4, 2019, 8:16 a.m.