# data-raw/make-data.R

### Clean datasets to include in the package
# Required packages:
library(readxl)
library(stringr)
library(devtools)
library(usethis)
library(haven)
library(stringi)
library(tidyverse)

########################################################################################################################
### Dataset #1: ICD-10 codes, 2009 version
# Download dataset.
# The workbook is a binary file, so it is transferred with mode = "wb": on
# Windows the default text-mode transfer rewrites line endings and corrupts
# the file, and download.file() only switches to binary on its own for a fixed
# list of extensions (.zip, .gz, ...) that does not include .xls.
download.file(
  url = "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD10/allvalid2009(detailed%20titles%20headings).xls",
  destfile = "data-raw/allvalid2009.xls",
  mode = "wb"
)

# Read data in Excel format, skipping the first 7 rows of the sheet and
# naming the three columns explicitly
icd10_2009 <- readxl::read_excel(
  "data-raw/allvalid2009.xls",
  skip = 7,
  col_names = c("Status", "Code", "ICD.title")
)

# Lines where the code contains the character "-" are headers, not codes.
# They are displayed first (for a visual check) and then removed.
header_rows <- grepl("-", icd10_2009[["Code"]])
icd10_2009[header_rows, ]
icd10_2009 <- icd10_2009[!header_rows, ]

# Produce a "Code.clean" variable with no punctuation
# (everything that is not a letter or a digit is dropped)
icd10_2009[["Code.clean"]] <- stringr::str_replace_all(
  string = icd10_2009[["Code"]],
  pattern = "[^[:alnum:]]",
  replacement = ""
)

# Re-order the columns (by name, so the order does not depend on positions)
icd10_2009 <- icd10_2009[, c("Code", "Code.clean", "ICD.title", "Status")]

# Convert all character columns to ASCII format.
# NOTE: with iconv()'s default `sub = NA`, a string containing a character
# that cannot be represented in ASCII is converted to NA as a whole.
for (column in c("Code", "Code.clean", "ICD.title", "Status")) {
  icd10_2009[[column]] <- iconv(icd10_2009[[column]], from = "UTF-8", to = "ASCII")
}

# Save data in R format
usethis::use_data(icd10_2009, overwrite = TRUE)

########################################################################################################################
### Dataset #2: ICD-10 codes, 2011 version
# Download dataset.
# The workbook is a binary file, so it is transferred with mode = "wb": on
# Windows the default text-mode transfer rewrites line endings and corrupts
# the file, and download.file() only switches to binary on its own for a fixed
# list of extensions (.zip, .gz, ...) that does not include .xls.
download.file(
  url = "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD10/allvalid2011%20%28detailed%20titles%20headings%29.xls",
  destfile = "data-raw/allvalid2011.xls",
  mode = "wb"
)

# Read data in Excel format, skipping the first 7 rows of the sheet and
# naming the three columns explicitly
icd10_2011 <- readxl::read_excel(
  "data-raw/allvalid2011.xls",
  skip = 7,
  col_names = c("Status", "Code", "ICD.title")
)

# Lines where the code contains the character "-" are headers, not codes.
# They are displayed first (for a visual check) and then removed.
header_rows <- grepl("-", icd10_2011[["Code"]])
icd10_2011[header_rows, ]
icd10_2011 <- icd10_2011[!header_rows, ]

# Produce a "Code.clean" variable with no punctuation
# (everything that is not a letter or a digit is dropped)
icd10_2011[["Code.clean"]] <- stringr::str_replace_all(
  string = icd10_2011[["Code"]],
  pattern = "[^[:alnum:]]",
  replacement = ""
)

# Re-order the columns (by name, so the order does not depend on positions)
icd10_2011 <- icd10_2011[, c("Code", "Code.clean", "ICD.title", "Status")]

# Convert all character columns to ASCII format.
# NOTE: with iconv()'s default `sub = NA`, a string containing a character
# that cannot be represented in ASCII is converted to NA as a whole.
for (column in c("Code", "Code.clean", "ICD.title", "Status")) {
  icd10_2011[[column]] <- iconv(icd10_2011[[column]], from = "UTF-8", to = "ASCII")
}

# Save data in R format
usethis::use_data(icd10_2011, overwrite = TRUE)

########################################################################################################################
### Dataset #3: ICD-9 codes, 2015 version
# Fetch the zipped archive and extract its contents into data-raw
download.file(
  url = "https://www.cms.gov/Medicare/Coding/ICD9ProviderDiagnosticCodes/Downloads/ICD-9-CM-v32-master-descriptions.zip",
  destfile = "data-raw/ICD-9-CM-v32-master-descriptions.zip"
)
unzip(
  zipfile = "data-raw/ICD-9-CM-v32-master-descriptions.zip",
  exdir = "data-raw"
)

# Load the ICD-9 diagnostic codes from the extracted workbook; the first row
# of the sheet is skipped and the three columns are named explicitly
icd9_2015 <- readxl::read_excel(
  path = "data-raw/CMS32_DESC_LONG_SHORT_DX.xlsx",
  skip = 1,
  col_names = c("Code", "Long_description", "Short_description")
)

# Re-encode each of the three character columns from UTF-8 to ASCII
icd9_2015 <- dplyr::mutate(
  icd9_2015,
  dplyr::across(
    dplyr::all_of(c("Code", "Long_description", "Short_description")),
    function(x) iconv(x, from = "UTF-8", to = "ASCII")
  )
)

# Store the dataset in the package's data directory
usethis::use_data(icd9_2015, overwrite = TRUE)

########################################################################################################################
### Dataset #4 ICD-10-CM codes, 2018 version
download.file(
  url = "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD10CM/2018/2018-ICD-10-CM-Codes-File.zip",
  destfile = "data-raw/2018-ICD-10-CM-Codes-File.zip"
)

# Unzip files
unzip("data-raw/2018-ICD-10-CM-Codes-File.zip", exdir = "data-raw")

# Read files.
# Each line is read into a single column and then split by position:
# characters 1-7 hold the code and the description starts at character 9.
icd10cm_2018 <- readr::read_tsv(file = "data-raw/icd10cm_codes_2018.txt", col_names = FALSE)
# Codes shorter than seven characters would otherwise keep the blanks that pad
# the fixed-width field, so both pieces are trimmed (as done for the 2022 data).
icd10cm_2018[["Code"]] <- trimws(substr(icd10cm_2018[[1]], 1, 7))
# substring() with no upper bound keeps the whole description, whatever its length
icd10cm_2018[["Description"]] <- trimws(substring(icd10cm_2018[[1]], 9))
icd10cm_2018[[1]] <- NULL

# Convert all character columns to ASCII format.
# NOTE: with iconv()'s default `sub = NA`, a string containing a character
# that cannot be represented in ASCII is converted to NA as a whole.
icd10cm_2018[["Code"]] <- iconv(icd10cm_2018[["Code"]], from = "UTF-8", to = "ASCII")
icd10cm_2018[["Description"]] <- iconv(icd10cm_2018[["Description"]], from = "UTF-8", to = "ASCII")

# Save data in R format
usethis::use_data(icd10cm_2018, overwrite = TRUE)

########################################################################################################################
### Dataset #5 ICD-10-CM codes, 2017 version
download.file(
  url = "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD10CM/2017/icd10cm_codes_2017.txt",
  destfile = "data-raw/icd10cm_codes_2017.txt"
)

# Read files.
# Each line is read into a single column and then split by position:
# characters 1-7 hold the code and the description starts at character 9.
icd10cm_2017 <- readr::read_tsv(file = "data-raw/icd10cm_codes_2017.txt", col_names = FALSE)
# Codes shorter than seven characters would otherwise keep the blanks that pad
# the fixed-width field, so both pieces are trimmed (as done for the 2022 data).
icd10cm_2017[["Code"]] <- trimws(substr(icd10cm_2017[[1]], 1, 7))
# substring() with no upper bound keeps the whole description, whatever its length
icd10cm_2017[["Description"]] <- trimws(substring(icd10cm_2017[[1]], 9))
icd10cm_2017[[1]] <- NULL

# Convert all character columns to ASCII format.
# NOTE: with iconv()'s default `sub = NA`, a string containing a character
# that cannot be represented in ASCII is converted to NA as a whole.
icd10cm_2017[["Code"]] <- iconv(icd10cm_2017[["Code"]], from = "UTF-8", to = "ASCII")
icd10cm_2017[["Description"]] <- iconv(icd10cm_2017[["Description"]], from = "UTF-8", to = "ASCII")

# Save data in R format
usethis::use_data(icd10cm_2017, overwrite = TRUE)

########################################################################################################################
### Dataset #6: Adult same-day discharges, 2010 (from Stata)
# Read the Stata file directly from its URL, then strip the Stata metadata
# (display formats, variable labels and value labels) in a single pipeline
nhds2010 <- haven::read_dta("https://www.stata-press.com/data/r17/nhds2010.dta") %>%
  haven::zap_formats() %>%
  haven::zap_label() %>%
  haven::zap_labels()

# Store the dataset in the package's data directory
usethis::use_data(nhds2010, overwrite = TRUE)

########################################################################################################################
### Dataset #7: Australian mortality data, 2010 (from Stata)
# Read the Stata file directly from its URL, then strip the Stata metadata
# (display formats, variable labels and value labels) in a single pipeline
australia10 <- haven::read_dta("https://www.stata-press.com/data/r17/australia10.dta") %>%
  haven::zap_formats() %>%
  haven::zap_label() %>%
  haven::zap_labels()

# Store the dataset in the package's data directory
usethis::use_data(australia10, overwrite = TRUE)

########################################################################################################################
### Dataset #8 ICD-10-CM codes, 2022 version
download.file(
  url = "https://www.cms.gov/files/zip/2022-code-descriptions-tabular-order-updated-02012022.zip",
  destfile = "data-raw/tmp.zip"
)
unzip(zipfile = "data-raw/tmp.zip", exdir = "data-raw")

# Read files: one element per line of the text file
icd10cm_2022 <- readLines(con = "data-raw/Code Descriptions/icd10cm_codes_2022.txt")

# Locate the first blank of every line: the code is what comes before it and
# the description is what comes after it.
# stri_locate_first() returns a matrix with one ROW per input string and two
# columns (start, end), so the per-line positions are a COLUMN of that matrix.
# Indexing a row instead (e.g. `[2, ]`) would reuse the position found in the
# second line for every line and truncate longer codes.
where_to_split <- stri_locate_first(str = icd10cm_2022, regex = " ")
split_position <- where_to_split[, 1]

icd10cm_2022 <- data.frame(
  Code = stri_sub(icd10cm_2022, from = 1L, to = split_position),
  Description = stri_sub(icd10cm_2022, from = split_position)
)
# Both pieces still include the separating blank(s): trim them away
icd10cm_2022 <- mutate(
  icd10cm_2022,
  Code = stri_trim_both(Code),
  Description = stri_trim_both(Description)
)

# Save data in R format
usethis::use_data(icd10cm_2022, overwrite = TRUE)

########################################################################################################################
### Remove unnecessary files
# Remove the folder extracted from the 2022 archive together with everything
# in it. unlink() is used because file.remove() does not delete directories on
# Windows; a path that does not exist is silently ignored.
unlink("data-raw/Code Descriptions", recursive = TRUE)

# Remove the downloaded and extracted files left in data-raw.
# `pattern` is a regular expression: the dot is escaped and the match is
# anchored at the end of the name, so that only files with one of these
# extensions are deleted (an unescaped, unanchored ".txt" would also match
# names that merely contain those letters somewhere).
lf <- list.files(
  path = "data-raw",
  full.names = TRUE,
  pattern = "\\.(xls|xlsx|txt|zip|pdf)$"
)
invisible(file.remove(lf))
# ellessenne/comorbidity documentation built on May 4, 2023, 10:36 p.m.