# inst/fcds_import-clean.R

library(dplyr)
fcds:::requires_package(c("here", "readxl", "readr"), "fcds_import-clean.R")
library(here)
library(readr)
library(readxl)

# Read the raw STAT extract; it is recoded further down in this script.
dat <- here("data-raw", "STAT_dataset_2018.dat") %>%
   read_csv()

# Build the ICD-O-3 lookup: keep the first row seen for every
# `Histology/Behavior` code, retain only the code and its description, and
# store the result under data/ for later use.
icd <- here("data-raw", "sitetype.icdo3.20180323.xls") %>%
   read_excel() %>%
   distinct(`Histology/Behavior`, .keep_all = TRUE) %>%
   select(`Histology/Behavior`, `Histology/Behavior Description`)
saveRDS(icd, file = here("data", "icd.rds"))

# Recode the raw coded columns of `dat` into labelled columns -----------------
#
# Each new column is derived from a single coded source column with
# dplyr::recode(). The closing select() keeps the patient id (renamed to
# `patient_id`) followed by the columns from `dx_year` through `morphology`,
# i.e. the columns created here (assuming none of those names already exists
# in the raw file).
#
# NOTE(review): recode() matches a backticked code such as `01` as a number
# when the source column is numeric and as the literal string when it is
# character, so the result depends on the types read_csv() guessed -- confirm
# against the raw file whenever the codes change.

# Label sets used by more than one column are defined once here and spliced
# into recode() with `!!!`, so the copies cannot drift apart.

# State of residence at diagnosis / state of birth.
lbl_us_state <- c(
  `00` = "Florida",
  `01` = "Other US States and Territories",
  `02` = "Not Applicable"
)

# Country of residence at diagnosis / country of birth.
lbl_country <- c(
  `01` = "US States and Territories",
  `02` = "Other Countries",
  `99` = "Unknown"
)

# SEER Summary Stage. Although the codes are identical for SEER Summary Stage
# 1977 and 2000, the methodology for coding these variables is distinct.
# Please refer to the SEER Summary Staging Guide and the SEER Summary Staging
# Manual 2000. The label for code 8 is taken from
# https://seer.cancer.gov/tools/ssm/SSM2018-General-Instructions.pdf
# because it does not appear in any of the documentation.
lbl_seer_stage <- c(
  `0` = "In Situ",
  `1` = "Local",
  `2` = "Regional/Direct extension",
  `3` = "Regional/Nodes only",
  `4` = "Regional/Direct extension & nodes",
  `5` = "Regional NOS",
  `7` = "Distant/Systemic Disease",
  `8` = "Benign/Borderline",
  `9` = "Unknown"
)

# How the derived SEER Summary Stage value was obtained.
lbl_stage_source <- c(
  `1` = "Derived from Collaborative Stage",
  `2` = "Derived from EOD"
)

# Tobacco use status, shared by all four tobacco columns.
lbl_tobacco_use <- c(
  `0` = "Never used",
  `1` = "Current user",
  `2` = "Former user, quit within 1 year of Dx",
  `3` = "Former user, quit more than 1 year of Dx",
  `4` = "Former user, unknown when quit",
  `9` = "Unknown"
)

dat <- dat %>%
  mutate(
    # Diagnosis period and its mid-point year (both stored as strings).
    dx_year = recode(
      Date_of_Dx_Year_Recoded,
      "1115" = "2011-2015",
      "0610" = "2006-2010",
      "0105" = "2001-2005",
      "9600" = "1996-2000",
      "9195" = "1991-1995",
      "8690" = "1986-1990",
      "8185" = "1981-1985"
    ),
    dx_year_mid = recode(
      Date_of_Dx_Year_Recoded,
      "1115" = "2013",
      "0610" = "2008",
      "0105" = "2003",
      "9600" = "1998",
      "9195" = "1993",
      "8690" = "1988",
      "8185" = "1983"
    ),

    # Demographics.
    sex = recode(
      Sex_Recoded,
      `1` = "Male",
      `2` = "Female",
      `9` = "Unknown"
    ),
    race = recode(
      Race_Recoded,
      `1` = "White",
      `2` = "Black",
      `3` = "Other",
      `9` = "Unknown"
    ),
    hispanic = recode(
      Ethnicity_Recoded,
      `0` = "Not Hispanic",
      `8` = "Hispanic",
      `9` = "Unknown"
    ),

    # Residence at diagnosis and place of birth.
    address = recode(Addr_at_DX_State_Recoded, !!!lbl_us_state),
    birth_state = recode(Birthplace_State_Abrv_Recoded, !!!lbl_us_state),
    dx_country = recode(Addr_at_Dx_Country_Recoded, !!!lbl_country),
    birth_country = recode(Birthplace_Country_Recoded, !!!lbl_country),

    marital_status = recode(
      # coding for unknown is different from what is specified in the layout
      # page
      Marital_Status_Recoded,
      `1` = "Married; Unmarried or Domestic Partner",
      `2` = "Single, Separated, Divorced or Widowed",
      `9` = "Unknown"
    ),
    primary_payer = recode(
      Dx_Primary_Payor_Recoded,
      `01` = "Not insured",
      `02` = "Insurance",
      `03` = "Medicaid",
      `04` = "Medicare",
      `05` = "Tricare",
      `99` = "Unknown"
    ),

    # SEER Summary Stage: the 2000 value is preferred and the 1977 value is
    # used only where the 2000 value is missing.
    seer_stage_2000 = recode(SEER_Summ_Stage_2000_N759, !!!lbl_seer_stage),
    seer_stage_1977 = recode(SEER_Summ_Stage_1977_N760, !!!lbl_seer_stage),
    seer_stage = coalesce(seer_stage_2000, seer_stage_1977),

    fcds_site_group = recode(
      # Using most-specific summary-level group from
      # https://seer.cancer.gov/siterecode/icdo3_dwhoheme/
      # (i.e. level 2 header for a level 3 item, if all level 3 items are
      # grouped together)
      FCDS_Site_Group,
      `0110` = "Oral Cavity and Pharynx",
      `0011` = "Esophagus",
      `0012` = "Stomach",
      `0013` = "Small Intestine",
      `1422` = "Colon excluding Rectum",
      `2324` = "Rectum and Rectosigmoid Junction",
      `0025` = "Anus, Anal Canal & Anorectum",
      `2627` = "Liver and Intrahepatic Bile Duct",
      `0028` = "Gall Bladder",
      `0029` = "Other Biliary",
      `0030` = "Pancreas",
      `3133` = "Retroperitoneum, Peritoneum, Omentum & Mesentery, Other Digestive Organs",
      ## Where is Nose, Nasal Cavity and Middle Ear?
      `0035` = "Larynx",
      `0036` = "Lung and Bronchus",
      ## Where are Pleura, and "Trachea ... and Other Respiratory"?
      `0039` = "Bones and Joints",
      `0040` = "Soft Tissue including Heart",
      `0041` = "Melanoma of the Skin",
      `0042` = "Other Non-Epithelial Skin",
      `0043` = "Breast",
      `0044` = "Cervix Uteri",
      `4546` = "Corpus and Uterus, NOS",
      `0047` = "Ovary",
      `4850` = "Vagina, Vulva, Other Female Genital Organs",
      `0051` = "Prostate Gland",
      `0052` = "Testes",
      ## Where are Penis and other Male Genital Organs?
      `0055` = "Urinary Bladder",
      `0056` = "Kidney & Renal Pelvis",
      `5758` = "Ureter, Other Urinary Organs",
      `0059` = "Eye and Orbit",
      `6061` = "Brain and Other Nervous System",
      `0062` = "Thyroid Gland",
      ## Missing "Other Endocrine including Thymus" (possibly in
      ## Benign/Borderline)
      `6465` = "Hodgkin Lymphoma",
      `6667` = "Non-Hodgkin Lymphoma",
      `0068` = "Multiple Myeloma",
      `6971` = "Lymphocytic Leukemia",
      `7275` = "Myeloid and Monocytic Leukemia",
      `7677` = "Other Leukemia",
      `0078` = "Mesothelioma",
      `0079` = "Kaposi Sarcoma",
      `0080` = "Other",
      `8183` = "Benign/Borderline (Brain, Nervous System, Endocrine)"
    ),
    fcds_site_specific = recode(
      # Labels as listed in
      # https://fcds.med.miami.edu/downloads/datarequest/STAT%202018%20layout.pdf
      FCDS_Site_Group,
      `0110` = paste(
        "Lip, Tongue, Salivary Glands , Floor of Mouth, Gum & Other Mouth,",
        "Nasopharynx, Tonsil, Oropharynx, Hypopharynx, Other Buccal Cavity & Pharynx"
      ),
      `0011` = "Esophagus",
      `0012` = "Stomach",
      `0013` = "Small Intestine",
      `1422` = paste(
        "Cecum, Appendix, Ascending Colon, Hepatic Flexure, Transverse Colon,",
        "Splenic Flexure, Descending Colon, Sigmoid Colon, Large Intestine, NOS"
      ),
      `2324` = "Rectosigmoid Junction, Rectum",
      `0025` = "Anus, Anal Canal & Anorectum",
      `2627` = "Liver, Intrahepatic Bile Duct",
      `0028` = "Gall Bladder",
      `0029` = "Other Biliary",
      `0030` = "Pancreas",
      `3133` = "Retroperitoneum, Peritoneum, Omentum & Mesentery, Other Digestive Organs",
      `0035` = "Larynx",
      `0036` = "Lung & Bronchus",
      `0039` = "Bones & Joints",
      `0040` = "Soft Tissue (Including Heart)",
      `0041` = "Melanoma of the Skin",
      `0042` = "Other Non-Epithelial Skin",
      `0043` = "Breast",
      `0044` = "Cervix Uteri",
      `4546` = "Corpus Uteri, Uterus, NOS",
      `0047` = "Ovary",
      `4850` = "Vagina, Vulva, Other Female Genital Organs",
      `0051` = "Prostate Gland",
      `0052` = "Testes",
      `0055` = "Urinary Bladder",
      `0056` = "Kidney & Renal Pelvis",
      `5758` = "Ureter, Other Urinary Organs",
      `0059` = "Eye & Orbit",
      `6061` = "Brain, Other Nervous System",
      `0062` = "Thyroid Gland",
      `6465` = "Hodgkin's Disease Nodal, Hodgkin's Disease Extra Nodal",
      `6667` = "Non-Hodgkin's Nodal, Non-Hodgkin's Extra Nodal",
      `0068` = "Multiple Myeloma",
      `6971` = "Acute Lymphocytic Leukemia, Chronic Lymphocytic Leukemia, Other Lymphocytic Leukemia",
      `7275` = "Acute Myeloid Leukemia, Chronic Myeloid Leukemia, Other Myeloid/Monocytic Leukemia, Acute Monocytic Leukemia",
      `7677` = "Other Acute Leukemia, Aleukemic, Subleukemic & NOS",
      `0078` = "Mesothelioma",
      `0079` = "Kaposi Sarcoma",
      `0080` = "Other",
      `8183` = "Benign/Borderline- Brain, Cranial Nerves Other Nervous System, Other Endocrine including Thymus (Benign/Border)"
    ),

    # Each age-group code is labelled with a five-year band; 85 is the
    # open-ended top band and 999 is unknown.
    age_group = recode(
      FCDS_Age_Group,
      `4` = "0 - 4",
      `9` = "5 - 9",
      `14` = "10 - 14",
      `19` = "15 - 19",
      `24` = "20 - 24",
      `29` = "25 - 29",
      `34` = "30 - 34",
      `39` = "35 - 39",
      `44` = "40 - 44",
      `49` = "45 - 49",
      `54` = "50 - 54",
      `59` = "55 - 59",
      `64` = "60 - 64",
      `69` = "65 - 69",
      `74` = "70 - 74",
      `79` = "75 - 79",
      `84` = "80 - 84",
      `85` = "85+",
      `999` = "Unknown"
    ),

    # County at diagnosis: the raw code is kept unchanged as `county_fips`
    # and additionally translated into a county name.
    county_fips = County_at_DX_N90,
    county_name = recode(
      County_at_DX_N90,
      `1` = "Alachua",
      `3` = "Baker",
      `5` = "Bay",
      `7` = "Bradford",
      `9` = "Brevard",
      `11` = "Broward",
      `13` = "Calhoun",
      `15` = "Charlotte",
      `17` = "Citrus",
      `19` = "Clay",
      `21` = "Collier",
      `23` = "Columbia",
      `27` = "DeSoto",
      `29` = "Dixie",
      `31` = "Duval",
      `33` = "Escambia",
      `35` = "Flagler",
      `37` = "Franklin",
      `39` = "Gadsden",
      `41` = "Gilchrist",
      `43` = "Glades",
      `45` = "Gulf",
      `47` = "Hamilton",
      `49` = "Hardee",
      `51` = "Hendry",
      `53` = "Hernando",
      `55` = "Highlands",
      `57` = "Hillsborough",
      `59` = "Holmes",
      `61` = "Indian River",
      `63` = "Jackson",
      `65` = "Jefferson",
      `67` = "Lafayette",
      `69` = "Lake",
      `71` = "Lee",
      `73` = "Leon",
      `75` = "Levy",
      `77` = "Liberty",
      `79` = "Madison",
      `81` = "Manatee",
      `83` = "Marion",
      `85` = "Martin",
      `86` = "Miami-Dade",
      `87` = "Monroe",
      `89` = "Nassau",
      `91` = "Okaloosa",
      `93` = "Okeechobee",
      `95` = "Orange",
      `97` = "Osceola",
      `99` = "Palm Beach",
      `101` = "Pasco",
      `103` = "Pinellas",
      `105` = "Polk",
      `107` = "Putnam",
      `113` = "Santa Rosa",
      `115` = "Sarasota",
      `117` = "Seminole",
      `109` = "St. Johns",
      `111` = "St. Lucie",
      `119` = "Sumter",
      `121` = "Suwannee",
      `123` = "Taylor",
      `125` = "Union",
      `127` = "Volusia",
      `129` = "Wakulla",
      `131` = "Walton",
      `133` = "Washington",
      `999` = "Unknown"
    ),

    # Tumor characteristics.
    grade = recode(
      Grade_N440,
      `1` = "Grade I",
      `2` = "Grade II",
      `3` = "Grade III",
      `4` = "Grade IV",
      `5` = "T-cell",
      `6` = "B-cell",
      `7` = "Null cell",
      `8` = "NK cell",
      `9` = "Unknown"
    ),
    laterality = recode(
      Laterality_N410,
      `0` = "Not a paired site",
      `1` = "Right:origin of primary",
      `2` = "Left:origin of primary",
      `3` = "Only one side involved, right or left origin unspecified",
      `4` = paste(
        "Bilateral involvement at time of diagnosis, lateral origin unknown for a single primary;",
        "or both ovaries involved simultaneously, single histology; bilateral retinoblastomas;",
        "bilateral Wilms' tumors"
      ),
      `5` = "Paired site: midline tumor",
      `9` = "Paired site, but no information concerning laterality"
    ),
    dx_confirmation = recode(
      Diagnostic_Confirmation_N490,
      `1` = "Positive histology",
      `2` = "Positive cytology",
      `3` = "Positive histology PLUS - positive immunophenotyping AND/OR positive genetic studies",
      `4` = "Positive microscopic confirmation, method not specified",
      `5` = "Positive laboratory test/marker study",
      `6` = "Direct visualization without microscopic confirmation",
      `7` = "Radiography and/or other imaging techniques without microscopic confirmation",
      `8` = "Clinical diagnosis only",
      `9` = "Unknown whether or not microscopically confirmed; death certificate only"
    ),
    reporting_source = recode(
      Type_of_Reporting_Source_N500,
      `1` = "Hospital inpatient",
      `2` = "Radiation Treatment Centers or Medical Oncology Centers",
      `3` = "Laboratory only",
      `4` = "Physician's office/private medical practitioner",
      `5` = "Nursing/convalescent home/hospice",
      `6` = "Autopsy only",
      `7` = "Death certificate only",
      `8` = "Other hospital outpatient units/surgery centers"
    ),
    cancer_status = recode(
      Cancer_Status_N1770,
      `1` = "No evidence of tumor",
      `2` = "Evidence of tumor",
      `9` = "Unknown"
    ),
    icd03_conversion = recode(
      # Code specifying how the conversion of site and morphology codes from
      # ICD-O-2 to ICD-O-3 was accomplished.
      ICDO3_Conversion_FL_N2116,
      `0` = "Originally coded in ICD-O-3",
      `1` = "Converted without review",
      `3` = "Converted with review"
    ),
    seer_stage_derived_1977 = recode(Derived_SS1977_FL_N3040, !!!lbl_stage_source),
    seer_stage_derived_2000 = recode(Derived_SS2000_FL_N3050, !!!lbl_stage_source),

    # Tobacco use, one column per product type.
    tobacco_cigarette = recode(FCDS_Tob_Use_Cigarette_N1300, !!!lbl_tobacco_use),
    tobacco_other = recode(FCDS_Tob_Use_OthSmoke_N1300, !!!lbl_tobacco_use),
    tobacco_smokeless = recode(FCDS_Tob_Use_Smokeless_Tob_N1300, !!!lbl_tobacco_use),
    tobacco_no = recode(FCDS_Tob_Use_NOS_N1300, !!!lbl_tobacco_use),

    behavior = recode(
      Behavior_Code_ICDO3_N523,
      `0` = "Benign",
      `1` = "Borderline",
      `2` = "Insitu",
      `3` = "Invasive"
    ),
    # "<histologic type>/<behavior code>"; presumably the same format as the
    # `Histology/Behavior` key of the ICD-O-3 lookup -- TODO confirm.
    morphology = paste0(Histologic_Type_ICDO3_N522, "/", Behavior_Code_ICDO3_N523)
  ) %>%
  select(patient_id = Patient_ID_N20, dx_year:morphology)

# Write the recoded data set to data/ as a gzip-compressed RDS file.
dat %>%
   saveRDS(file = here("data/stat_dataset_2018_clean.rds"), compress = "gzip")
# GerkeLab/fcds documentation built on July 30, 2020, 7:04 p.m.