R/lookups_and_mappings.R

Defines functions extend_read_v2_drugs_bnf extend_bnf_lkp reformat_icd10_phecode_map_1_2 reformat_icd9_icd10 reformat_read_ctv3_icd10 reformat_read_v2_icd10 get_phecode_icd10_map get_phecode_icd9_map get_phecode_definitions read_all_lkps_maps get_ukb_self_report_med_to_atc_map get_ukb_all_lkps_maps get_nhsbsa_snomed_bnf build_all_lkps_maps all_lkps_maps_to_db

Documented in all_lkps_maps_to_db build_all_lkps_maps get_nhsbsa_snomed_bnf get_phecode_definitions get_phecode_icd10_map get_phecode_icd9_map get_ukb_all_lkps_maps get_ukb_self_report_med_to_atc_map read_all_lkps_maps

# EXPORTED ----------------------------------------------------------------

#' Build a SQLite database of clinical code look up and mapping tables
#'
#' Write the output from \code{\link{build_all_lkps_maps}} to a SQLite database.
#'
#' @param all_lkps_maps A named list of look up and mapping tables, created
#'   by \code{\link{build_all_lkps_maps}}.
#' @param db_path Where the database will be created. If an SQLite database file
#'   already exists here, then the lookup and mapping tables will be added to
#'   this. If \code{NULL} (default), then no database will be created/modified.
#' @param overwrite If \code{TRUE}, overwrite tables in the database if they
#'   already exist. Default value is \code{FALSE}.
#'
#' @return Returns \code{db_path} invisibly
#' @seealso [build_all_lkps_maps()]
#' @export
#' @examples
#' # build dummy all_lkps_maps resource (supressing warning messages)
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # write to SQLite database file
#' db_path <- suppressMessages(
#'   all_lkps_maps_to_db(
#'     all_lkps_maps = all_lkps_maps_dummy,
#'     db_path = tempfile()
#'   )
#' )
#'
#' # connect to SQLite database
#' con <- DBI::dbConnect(RSQLite::SQLite(), db_path)
#'
#' # create named list of tbl_dbi objects
#' all_lkps_maps_dummy_db <- ukbwranglr::db_tables_to_list(con)
#'
#' head(all_lkps_maps_dummy_db$icd10_lkp)
#'
#' # import to R with dplyr::collect()
#' dplyr::collect(all_lkps_maps_dummy_db$icd10_lkp)
all_lkps_maps_to_db <- function(all_lkps_maps = build_all_lkps_maps(),
                                db_path = "all_lkps_maps.db",
                                overwrite = FALSE) {
  # If database already exists at db_path, check if tables to be written are
  # already present
  if (file.exists(db_path)) {
    message(paste0(
      "Existing file found at ",
      db_path
    ))

    check_tables_do_not_already_exist <- TRUE
  } else {
    check_tables_do_not_already_exist <- FALSE
  }

  # connect to db
  con <- DBI::dbConnect(RSQLite::SQLite(), db_path)
  on.exit(DBI::dbDisconnect(con))

  if (check_tables_do_not_already_exist) {
    tables_to_be_written <- c(
      CLINICAL_CODE_MAPPINGS_MAP$mapping_table,
      CODE_TYPE_TO_LKP_TABLE_MAP$lkp_table
    )

    tables_already_in_db <- DBI::dbListTables(con)

    tables_already_present_in_db <- subset(
      tables_to_be_written,
      tables_to_be_written %in% tables_already_in_db
    )

    if (!overwrite) {
      assertthat::assert_that(
        rlang::is_empty(tables_already_present_in_db),
        msg = paste0(
          "Error! The following tables are already present in the database at ",
          db_path,
          ": ",
          stringr::str_c(
            tables_already_present_in_db,
            sep = "",
            collapse = ", "
          )
        )
      )
    } else if (overwrite &
      !rlang::is_empty(tables_already_present_in_db)) {
      warning(
        "The following tables are already present in the database at ",
        db_path,
        " and will be overwritten: ",
        stringr::str_c(
          tables_already_present_in_db,
          sep = "",
          collapse = ", "
        )
      )
    }
  }

  # write to db
  message(paste0(
    "Writing lookup and mapping tables to SQLite database at ",
    db_path
  ))
  for (table_name in names(all_lkps_maps)) {
    message(table_name)
    DBI::dbWriteTable(
      conn = con,
      name = table_name,
      value = all_lkps_maps[[table_name]],
      # ensure table is not inadvertently overwritten/appended to
      overwrite = overwrite,
      append = FALSE
    )
  }

  message(paste0(
    "Success! Connect to database with `con <- DBI::dbConnect(RSQLite::SQLite(), '",
    db_path,
    "')`, then access all tables with `all_lkps_maps <- ukbwranglr::db_tables_to_list(con)`"
  ))
  invisible(db_path)
}

#' Build named list of clinical code look up and mapping tables
#'
#' Downloads the lookup and mapping tables from
#' \href{https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=592}{UK Biobank resource
#' 592} as well as the NHSBSA BNF-SNOMED mapping table (available
#' \href{https://www.nhsbsa.nhs.uk/prescription-data/understanding-our-data/bnf-snomed-mapping}{here})
#' and OPCS4, self-reported medical conditions/medications/operations from the
#' UK Biobank codings file (available
#' \href{https://biobank.ctsu.ox.ac.uk/crystal/exinfo.cgi?src=accessing_data_guide}{here}).
#'
#' @param all_lkps_maps UK Biobank resource 592, as returned by
#'   \code{\link{get_ukb_all_lkps_maps}}.
#' @param ukb_codings The UK Biobank codings file, as returned by
#'   \code{\link[ukbwranglr]{get_ukb_codings}}.
#' @param bnf_dmd Optional: path to the NHSBSA BNF-SNOMED mapping table (see
#'   [get_nhsbsa_snomed_bnf()]).
#' @param self_report_med_to_atc_map Optional: path to a UK Biobank
#'   self-reported medication to ATC map (see
#'   [get_ukb_self_report_med_to_atc_map()]).
#' @param ctv3sctmap2 Optional: path to the NHS TRUD mapping file for Read 3 to
#'   SNOMEDCT ("ctv3sctmap2_uk_20200401000001.txt").
#' @param phecode_1_2_lkp Optional: path to the phecode v1.2 lookup file
#'   (see [get_phecode_definitions()]).
#' @param icd10_phecode_1_2 Optional: path to the phecode v1.2 to ICD10 mapping
#'   file (see [get_phecode_icd10_map()]).
#' @param icd9_phecode_1_2 Optional: path to the phecode v1.2 to ICD10 mapping
#'   file (see [get_phecode_icd9_map()]).
#'
#' @return Returns a named list of data frames.
#' @seealso [all_lkps_maps_to_db()]
#' @export
#' @examples
#' # build dummy all_lkps_maps using just UKB resource 592 and UKB codings file
#' build_all_lkps_maps(
#'   all_lkps_maps = read_all_lkps_maps_dummy(),
#'   ukb_codings = read_ukb_codings_dummy(),
#'   bnf_dmd = NULL,
#'   self_report_med_to_atc_map = NULL,
#'   ctv3sctmap2 = NULL,
#'   phecode_1_2_lkp = NULL,
#'   icd10_phecode_1_2 = NULL,
#'   icd9_phecode_1_2 = NULL
#' )
build_all_lkps_maps <-
  function(all_lkps_maps = read_all_lkps_maps(),
           ukb_codings = ukbwranglr::get_ukb_codings(),
           bnf_dmd = get_nhsbsa_snomed_bnf(),
           self_report_med_to_atc_map = get_ukb_self_report_med_to_atc_map(),
           ctv3sctmap2 = NULL,
           phecode_1_2_lkp = get_phecode_definitions(),
           icd10_phecode_1_2 = get_phecode_icd10_map(),
           icd9_phecode_1_2 = get_phecode_icd9_map()) {
    # ukb resource 592 ----------------

    ## remove metadata footer rows and add row index column -------------------
    all_lkps_maps <- all_lkps_maps %>%
      purrr::map(rm_footer_rows_all_lkps_maps_df) %>%
      purrr::map(~ tibble::rowid_to_column(
        .data = .x,
        var = ".rowid"
      ))

    ## reformat tables individually ---------------

    ### icd9_icd10 -------------------

    all_lkps_maps$icd9_icd10 <- reformat_icd9_icd10(all_lkps_maps$icd9_icd10)

    ### read2_icd10 ------------------------

    all_lkps_maps$read_v2_icd10 <- reformat_read_v2_icd10(all_lkps_maps$read_v2_icd10,
      icd10_lkp = all_lkps_maps$icd10_lkp
    )

    ### read3_icd10 ------------------------

    all_lkps_maps$read_ctv3_icd10 <- reformat_read_ctv3_icd10(all_lkps_maps$read_ctv3_icd10)

    ## extend tables -----------------
    message("Extending tables in UKB resource 592")
    all_lkps_maps$read_v2_drugs_bnf <-
      extend_read_v2_drugs_bnf(all_lkps_maps)
    all_lkps_maps$bnf_lkp <-
      extend_bnf_lkp(all_lkps_maps)

    # opcs4 -------------------------

    ## from ukb codings file
    opcs4_lkp <- make_lkp_from_ukb_codings(
      ukb_codings = ukb_codings,
      Coding = "240",
      Value_col_new_name = "opcs4_code"
    )

    # UKB self-reported medical conditions/medications/operations ------------

    ## from ukb codings file
    self_report_cancer <-
      make_lkp_from_ukb_codings(
        ukb_codings = ukb_codings,
        Coding = "3",
        Value_col_new_name = "data_coding_3"
      )

    self_report_medication <-
      make_lkp_from_ukb_codings(
        ukb_codings = ukb_codings,
        Coding = "4",
        Value_col_new_name = "data_coding_4"
      )

    self_report_operation <-
      make_lkp_from_ukb_codings(
        ukb_codings = ukb_codings,
        Coding = "5",
        Value_col_new_name = "data_coding_5"
      )

    self_report_non_cancer <-
      make_lkp_from_ukb_codings(
        ukb_codings = ukb_codings,
        Coding = "6",
        Value_col_new_name = "data_coding_6"
      )

    # Add 'extra' tables -------------------------

    ## NHSBSA-SNOMED BNF -----------------------------

    if (!is.null(bnf_dmd)) {
      message("Reformatting bnf_dmd map")
      # read file and tidy names
      bnf_dmd_map_unzipped_dir <- file.path(tempdir(), "bnf_dmd")

      utils::unzip(bnf_dmd,
        files = NULL,
        exdir = bnf_dmd_map_unzipped_dir
      )

      bnf_dmd_file <- list.files(bnf_dmd_map_unzipped_dir)
      assertthat::assert_that(length(bnf_dmd_file) == 1,
        msg = "Error! Unexpected number of files after unzipping NHBSA BNF-SNOMED file"
      )

      bnf_dmd <-
        readxl::read_excel(file.path(bnf_dmd_map_unzipped_dir, bnf_dmd_file))

      names(bnf_dmd) <-
        ukbwranglr:::remove_special_characters_and_make_lower_case(names(bnf_dmd)) %>%
        stringr::str_remove_all("plus_")
    }

    ## Self-reported med to ATC map ------------

    if (!is.null(self_report_med_to_atc_map)) {
      message("Reformatting self_report_med_to_atc_map")
      # read file
      self_report_med_to_atc_map <- readxl::read_excel(
        self_report_med_to_atc_map,
        skip = 2,
        col_names = c(
          "self_report_medication",
          "data_coding_4",
          "atc_code",
          "drug_name",
          "rm_1",
          "rm_2"
        )
      )

      # drop redundant cols
      self_report_med_to_atc_map <- self_report_med_to_atc_map[, -c(5, 6)]

      # append drug_name in brackets
      self_report_med_to_atc_map$self_report_medication <-
        paste0(
          self_report_med_to_atc_map$self_report_medication,
          " (",
          self_report_med_to_atc_map$drug_name,
          ")"
        )
    }

    ## NHS TRUD Read 3 to SNOMEDCT mapping table ---------
    if (!is.null(ctv3sctmap2)) {
      read_ctv3_sct <- readr::read_tsv(ctv3sctmap2)
    }

    ## Phecode lookup ----------------
    if (!is.null(phecode_1_2_lkp)) {
      phecode_lkp <- readr::read_csv(phecode_1_2_lkp,
        progress = FALSE,
        col_types = readr::cols(.default = "c")
      )
    }

    ## Phecode to ICD10 map ---------------------
    if (!is.null(icd10_phecode_1_2)) {
      message("Reformatting ICD10-Phecode map")
      icd10_phecode <- readr::read_csv(icd10_phecode_1_2,
        progress = FALSE,
        col_types = readr::cols(.default = "c")
      )

      icd10_phecode <- reformat_icd10_phecode_map_1_2(icd10_phecode,
        all_lkps_maps = all_lkps_maps
      )
    }

    ## Phecode to ICD9 map ------------------
    message("Reformatting ICD9-Phecode map")
    if (!is.null(icd9_phecode_1_2)) {
      icd9_phecode <- readr::read_csv(icd9_phecode_1_2) %>%
        dplyr::mutate("icd9" = stringr::str_remove(.data[["icd9"]],
          pattern = "\\."
        ))
    }

    # Combine -----------------------
    all_lkps_maps <- c(
      all_lkps_maps,
      list(
        opcs4_lkp = opcs4_lkp,
        self_report_cancer = self_report_cancer,
        self_report_medication = self_report_medication,
        self_report_operation = self_report_operation,
        self_report_non_cancer = self_report_non_cancer
      )
    )

    # Append 'extra' lookup/mapping tables
    if (!is.null(bnf_dmd)) {
      all_lkps_maps <- c(
        all_lkps_maps,
        list(bnf_dmd = bnf_dmd)
      )
    }

    if (!is.null(self_report_med_to_atc_map)) {
      all_lkps_maps <- c(
        all_lkps_maps,
        list(self_report_med_to_atc_map = self_report_med_to_atc_map)
      )
    }

    if (!is.null(ctv3sctmap2)) {
      all_lkps_maps <- c(
        all_lkps_maps,
        list(read_ctv3_sct = read_ctv3_sct)
      )
    }

    if (!is.null(phecode_1_2_lkp)) {
      all_lkps_maps <- c(
        all_lkps_maps,
        list(phecode_lkp = phecode_lkp)
      )
    }

    if (!is.null(icd10_phecode_1_2)) {
      all_lkps_maps <- c(
        all_lkps_maps,
        list(icd10_phecode = icd10_phecode)
      )
    }

    if (!is.null(icd9_phecode_1_2)) {
      all_lkps_maps <- c(
        all_lkps_maps,
        list(icd9_phecode = icd9_phecode)
      )
    }

    # convert all to tibbles (avoids potential problems when writing to SQLite
    # database)
    all_lkps_maps <- all_lkps_maps %>%
      purrr::map(tibble::as_tibble)

    message("Success!")
    return(all_lkps_maps)
  }

#' Download and read the NHSBSA BNF_SNOMED mapping file
#'
#' Mapping table available from
#' \href{https://www.nhsbsa.nhs.uk/prescription-data/understanding-our-data/bnf-snomed-mapping}{here}.
#'
#' @param path Path where file will be downloaded to.
#'
#' @return File path to downloaded file.
#' @export
#' @examples
#' \dontrun{
#' get_nhsbsa_snomed_bnf()
#' }
get_nhsbsa_snomed_bnf <- function(path = file.path(
                                    tempdir(),
                                    "bnf_dmd.zip"
                                  )) {
  download_file(
    download_url = "https://www.nhsbsa.nhs.uk/sites/default/files/2021-08/BNF%20Snomed%20Mapping%20data%2020210819.zip",
    path = path
  )
}

#' Get UK Biobank resource 592 directly from UKB
#' website
#'
#' Downloads the UK Biobank code mappings file (\code{all_lkps_maps_v4.xlsx},
#' \href{https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=592}{resource 592})
#' directly from the UKB website.
#'
#' \strong{Note:} This is a large object (>450 MB)
#'
#' @param dir_path Directory path to download to.
#'
#' @return File path to downloaded `all_lkps_maps_v4.xlsx`.
#' @export
#' @examples
#' \dontrun{
#' # download UKB resource 592, returning file path invisibly
#' file_path <- get_ukb_all_lkps_maps()
#'
#' # view path to downloaded file
#' file_path
#' }
get_ukb_all_lkps_maps <- function(dir_path = tempdir()) {
  message("Getting UKB resource 592")
  # name of resource 592 excel file
  primarycare_codings <- "all_lkps_maps_v4.xlsx"

  # filepaths in tempdir
  primarycare_codings_zip_filepath <-
    file.path(tempdir(), "primarycare_codings.zip")
  primarycare_codings_excel_filepath <-
    file.path(tempdir(), primarycare_codings)

  # download primary care codings file to tempdir, if not already there
  if (!file.exists(primarycare_codings_zip_filepath)) {
    message("Downloading primarycare_codings.zip (UKB resource 592) to tempdir")
    utils::download.file(
      "https://biobank.ndph.ox.ac.uk/ukb/ukb/auxdata/primarycare_codings.zip",
      primarycare_codings_zip_filepath,
      mode = "wb"
    )
  }

  # extract excel file only from zip
  message("Extracting all_lkps_maps_v4.xlsx from zip file to tempdir")
  utils::unzip(primarycare_codings_zip_filepath,
    files = primarycare_codings,
    exdir = dir_path
  )

  # return file path
  return(file.path(dir_path, primarycare_codings))
}

#' Download and read a UKB welf-reported medication code to ATC mapping file
#'
#' Mapping table obtained from [Wray et al
#' 2019](https://www.nature.com/articles/s41467-019-09572-5#Sec23),
#' Supplementary Data 1.
#'
#' @param path Path where file will be downloaded to.
#'
#' @return File path to downloaded file.
#' @export
#' @examples
#' \dontrun{
#' get_nhsbsa_snomed_bnf()
#' }
get_ukb_self_report_med_to_atc_map <- function(path = file.path(
                                                 tempdir(),
                                                 "self_report_med_to_atc_map.xlsx"
                                               )) {
  download_file(
    download_url = "https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-019-09572-5/MediaObjects/41467_2019_9572_MOESM3_ESM.xlsx",
    path = path
  )
}

#' Read UK Biobank resource 592 into a named list
#'
#' Reads the UK Biobank code mappings file (`all_lkps_maps_v4.xlsx`,
#' \href{https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=592}{resource 592}) into
#' a named list of data frames.
#'
#' \strong{Note:} This is a large object (>450 MB)
#'
#' @param path Path to `all_lkps_maps_v4.xlsx`. By default, this is downloaded
#'   from the UK Biobank website using [get_ukb_all_lkps_maps()].
#'
#' @return A named list of data frames.
#' @export
#' @examples
#' read_all_lkps_maps(dummy_all_lkps_maps_path())
read_all_lkps_maps <- function(path = get_ukb_all_lkps_maps()) {
  read_excel_to_named_list(
    path = path,
    to_include = NULL,
    to_exclude = c("Description", "Contents"),
    col_types = "text"
  )
}

#' Download the Phecode 1.2 definitions file
#'
#' Download link obtained from https://phewascatalog.org/phecodes.
#'
#' @param path Path where file will be downloaded to.
#'
#' @return File path to downloaded file.
#' @export
#' @examples
#' \dontrun{
#' get_phecode_definitions()
#' }
get_phecode_definitions <- function(path = file.path(
                                      tempdir(),
                                      "phecode_definitions1.2.csv.zip"
                                    )) {
  download_file(
    download_url = "https://phewascatalog.org/files/phecode_definitions1.2.csv.zip",
    path = path
  )
}

#' Download the Phecode 1.2 to ICD9 mapping file
#'
#' Download link obtained from https://phewascatalog.org/phecodes.
#'
#' @param path Path where file will be downloaded to.
#'
#' @return File path to downloaded file.
#' @export
#' @examples
#' \dontrun{
#' get_phecode_icd9_map()
#' }
get_phecode_icd9_map <- function(path = file.path(
                                   tempdir(),
                                   "phecode_icd9_map_unrolled.csv.zip"
                                 )) {
  download_file(
    download_url = "https://phewascatalog.org/files/phecode_icd9_map_unrolled.csv.zip",
    path = path
  )
}

#' Download the Phecode 1.2 to ICD10 (beta) mapping file
#'
#' Download link obtained from https://phewascatalog.org/phecodes.
#'
#' @param path Path where file will be downloaded to.
#'
#' @return File path to downloaded file.
#' @export
#' @examples
#' \dontrun{
#' get_phecode_icd10_map()
#' }
get_phecode_icd10_map <- function(path = file.path(
                                    tempdir(),
                                    "Phecode_map_v1_2_icd10_beta.csv.zip"
                                  )) {
  download_file(
    download_url = "https://phewascatalog.org/files/Phecode_map_v1_2_icd10_beta.csv.zip",
    path = path
  )
}

# PRIVATE -----------------------------------------------------------------

#' Reformat the Read 2 to ICD10 mapping table
#'
#' Converts values in the `icd10_code` column to 'ALT_CODE' format ICD10 codes
#' that are recognised in the `icd10_lkp` lookup table. This involves dividing
#' cells containing more than one ICD10 code over multiple rows (e.g.
#' 'A414+J038' becomes 2 rows), and removing appended 'D'/'A' characters (which
#' indicate dagger/asterisk codes) to a separate column called
#' `icd10_dagger_asterisk` (e.g.'A010D I398A' becomes 'A010' and 'I398' under
#' `icd10_code`, with 'D' and 'A' recorded under `icd10_dagger_asterisk`).
#'
#' **NOTE:** A number of undivided 3 character ICD10 codes appear (incorrectly)
#' without an 'X' appended in this mapping table. For example, 'A64X' appears
#' (incorrectly) as 'A50-A64' in 2 rows. 'A65X' appears as 'A65-A69', 'A70X' as
#' 'A70-A74', 'A89X' as 'A80-A89', 'A99X' as 'A92-A99' etc. This function
#' converts appends 'X' to these codes to match how they appear in the
#' `icd10_lkp` table.
#'
#' ICD10 code 'C836' ('Diffuse non-Hodkin's lymphoma - Undifferentiated
#' (diffuse)') is also removed, as this code has been removed from ICD10 and does
#' not exist in the `icd10_lkp` table.
#'
#' @param read_v2_icd10 The Read 2 to ICD10 mapping table.
#' @param icd10_lkp Data frame. ICD10 lookup table (note, must have a '.rowid'
#'   column).
#'
#' @return A data frame.
#' @noRd
reformat_read_v2_icd10 <- function(read_v2_icd10,
                                   icd10_lkp) {

  # Undivided 3 character ICD10 ALT_CODE X/no X map (as named list)
  icd10_lkp_alt_x_map <- get_icd10_code_alt_code_x_map(
    icd10_lkp = icd10_lkp,
    undivided_3char_only = TRUE,
    as_named_list = "names_no_x"
  )

  # remove icd10_code 'C836' (see notes above)
  read_v2_icd10 <- read_v2_icd10 %>%
    dplyr::filter(.data[["icd10_code"]] != "C836")

  # replace spaces and '+' with commas
  read_v2_icd10 <- read_v2_icd10 %>%
    dplyr::mutate("icd10_code" = stringr::str_replace_all(.data[["icd10_code"]],
      pattern = "[\\s|\\+]",
      replacement = ","
    ))

  # split by comma, then unnest
  read_v2_icd10 <- read_v2_icd10 %>%
    dplyr::mutate("icd10_code" = stringr::str_split(.data[["icd10_code"]],
      pattern = ","
    )) %>%
    tidyr::unnest(cols = "icd10_code")

  # remove 'D' and 'A' final characters from ICD10 codes, and place in separate
  # column `icd10_dagger_asterisk`
  read_v2_icd10 <- read_v2_icd10 %>%
    dplyr::mutate(
      "icd10_dagger_asterisk" = rm_or_extract_appended_icd10_dxa(
        icd10_codes = .data[["icd10_code"]],
        keep_x = TRUE,
        rm_extract = "extract"
      )
    ) %>%
    dplyr::mutate("icd10_code" = rm_or_extract_appended_icd10_dxa(
      icd10_codes = .data[["icd10_code"]],
      keep_x = TRUE,
      rm_extract = "rm"
    ))

  # expand icd10 code ranges, which are flagged as '2' under `icd10_code_def` (e.g. 'E100-E109')
  read_v2_icd10 <- read_v2_icd10 %>%
    expand_icd10_ranges(
      icd10_lkp = icd10_lkp,
      icd10_lkp_alt_x_map = icd10_lkp_alt_x_map
    )

  # Make sure undivided 3 character ICD10 codes have an 'X' appended
  read_v2_icd10 <- read_v2_icd10 %>%
    dplyr::mutate("icd10_code" = dplyr::recode(.data[["icd10_code"]], !!!icd10_lkp_alt_x_map))

  # check all ICD10 codes now exist in `icd10_lkp`
  check_codes_exist(
    codes = unique(read_v2_icd10$icd10_code),
    lkp_codes = icd10_lkp$ALT_CODE,
    code_type = "icd10"
  )

  # return result
  return(read_v2_icd10)
}

#' Reformat mapping table `read_ctv3_icd10`
#'
#' Removes 'D' and 'A' from the ends of ICD10 codes, and separates these into a
#' separate column called `icd10_dagger_asterisk`. The 'D' and 'A' indicate
#' whether the code is a 'dagger' or 'asterisk' respectively. However, these
#' codes are listed without the appended 'D'/'A' in the `icd10_lkp` table.
#'
#' @param read_ctv3_icd10 A data frame.
#'
#' @return
#' @noRd
reformat_read_ctv3_icd10 <- function(read_ctv3_icd10) {
  read_ctv3_icd10 %>%
    dplyr::mutate(
      "icd10_dagger_asterisk" = rm_or_extract_appended_icd10_dxa(
        icd10_codes = .data[["icd10_code"]],
        keep_x = TRUE,
        rm_extract = "extract"
      )
    ) %>%
    dplyr::mutate("icd10_code" = rm_or_extract_appended_icd10_dxa(
      icd10_codes = .data[["icd10_code"]],
      keep_x = TRUE,
      rm_extract = "rm"
    ))
}

#' Reformat mapping table `icd9_icd10`
#'
#' For ICD9 codes without an equivalent ICD10 code, the ICD10 code is recorded
#' as 'UNDEF', with `NA` for the description (and vice versa). This function
#' converts values of 'UNDEF' in the `ICD9` and `ICD10` columns to `NA`, and
#' checks that `NA` only appears for such cases (i.e. a ICD code cannot be `NA`
#' if it has a description and vice versa).
#'
#' @param icd9_icd10 The `icd9_icd10` mapping table
#'
#' @return A data frame
#' @noRd
reformat_icd9_icd10 <- function(icd9_icd10) {
  # convert 'UNDEF' ICD9/10 codes to `NA`
  icd9_icd10 <- icd9_icd10 %>%
    dplyr::mutate(dplyr::across(
      tidyselect::all_of(c(
        "ICD9",
        "ICD10"
      )),
      ~ ifelse(.x == "UNDEF",
        yes = NA_character_,
        no = .x
      )
    ))

  # check that description and ICD code are either both `NA` or both not `NA`
  icd9_icd10_nabular <- icd9_icd10 %>%
    dplyr::mutate(dplyr::across(
      tidyselect::everything(),
      ~ is.na(.x)
    ))

  assertthat::assert_that(all(icd9_icd10_nabular$ICD9 == icd9_icd10_nabular$DESCRIPTION_ICD9))
  assertthat::assert_that(all(icd9_icd10_nabular$ICD10 == icd9_icd10_nabular$DESCRIPTION_ICD10))

  # return result
  return(icd9_icd10)
}

reformat_icd10_phecode_map_1_2 <- function(icd10_phecode,
                                           all_lkps_maps) {

  # get vector of all present icd10 codes in ALT_CODE format. Also returns a
  # message listing ICD10 codes with modifiers that will map to >1 ICD10 code in
  # ALT_CODE format. Also raises warning if any unrecognised ICD10 codes are
  # present.
  icd10_codes_in_icd10_phecode <- codemapper::reformat_icd10_codes(
    icd10_codes = icd10_phecode$ICD10,
    all_lkps_maps = all_lkps_maps,
    input_icd10_format = "ICD10_CODE",
    output_icd10_format = "ALT_CODE",
    unrecognised_codes = "warning",
    strip_x = FALSE
  )

  # append `ALT_CODE`
  icd10_phecode <- all_lkps_maps$icd10_lkp %>%
    dplyr::select(tidyselect::all_of(c(
      "ICD10_CODE",
      "ALT_CODE"
    ))) %>%
    dplyr::filter(.data[["ALT_CODE"]] %in% !!icd10_codes_in_icd10_phecode) %>%
    dplyr::collect() %>%
    dplyr::right_join(icd10_phecode,
      by = c("ICD10_CODE" = "ICD10")
    )

  # remove empty PHECODE rows
  icd10_phecode <- icd10_phecode %>%
    dplyr::filter(!is.na(.data[["PHECODE"]]))

  # result
  return(icd10_phecode)
}

extend_bnf_lkp <- function(all_lkps_maps) {
  # each drug entry in `bnf_lkp` gets repeated 8 times: chapter, section,
  # paragraph, subparagraph, chemical_substance, product_name, further_info,
  # full

  bnf_lkp <- all_lkps_maps[["bnf_lkp"]] %>%
    dplyr::collect()

  bnf_lkp %>%
    dplyr::mutate(
      "code_chapter" = stringr::str_sub(
        string = .data[["BNF_Presentation_Code"]],
        start = 1,
        end = 2
      ),
      "code_section" = stringr::str_sub(
        string = .data[["BNF_Presentation_Code"]],
        start = 1,
        end = 4
      ),
      "code_paragraph" = stringr::str_sub(
        string = .data[["BNF_Presentation_Code"]],
        start = 1,
        end = 6
      ),
      "code_subparagraph" = stringr::str_sub(
        string = .data[["BNF_Presentation_Code"]],
        start = 1,
        end = 7
      ),
      "code_chemical_substance" = stringr::str_sub(
        string = .data[["BNF_Presentation_Code"]],
        start = 1,
        end = 9
      ),
      "code_product_name" = stringr::str_sub(
        string = .data[["BNF_Presentation_Code"]],
        start = 1,
        end = 11
      ),
      "code_further_info" = stringr::str_sub(
        string = .data[["BNF_Presentation_Code"]],
        start = 1,
        end = 13
      )
    ) %>%
    dplyr::rename("code_full" = .data[["BNF_Presentation_Code"]]) %>%
    tidyr::pivot_longer(
      cols = tidyselect::starts_with("code"),
      names_to = "BNF_Code_Level",
      values_to = "BNF_Code"
    ) %>%
    dplyr::select(
      .data[["BNF_Code_Level"]],
      .data[["BNF_Code"]],
      tidyselect::everything()
    ) %>%
    dplyr::mutate("BNF_Code_Level" = stringr::str_remove(
      .data[["BNF_Code_Level"]],
      "code_"
    )) %>%
    dplyr::distinct(.data[["BNF_Code"]],
      .keep_all = TRUE
    ) %>%
    dplyr::mutate(
      "Description" = dplyr::case_when(
        .data[["BNF_Code_Level"]] == "chapter" ~ .data[["BNF_Chapter"]],
        .data[["BNF_Code_Level"]] == "section" ~ .data[["BNF_Section"]],
        .data[["BNF_Code_Level"]] == "paragraph" ~ .data[["BNF_Paragraph"]],
        .data[["BNF_Code_Level"]] == "subparagraph" ~ .data[["BNF_Subparagraph"]],
        .data[["BNF_Code_Level"]] == "chemical_substance" ~ .data[["BNF_Chemical_Substance"]],
        .data[["BNF_Code_Level"]] == "product_name" ~ .data[["BNF_Product"]],
        .data[["BNF_Code_Level"]] == "further_info" ~ .data[["BNF_Presentation"]],
        .data[["BNF_Code_Level"]] == "full" ~ .data[["BNF_Presentation"]]
      )
    ) %>%
    dplyr::mutate(
      "BNF_Presentation" = dplyr::case_when(
        !.data[["BNF_Code_Level"]] %in% c(
          "full",
          "further_info"
        ) ~ as.character(NA),
        TRUE ~ .data[["BNF_Presentation"]]
      ),
      "BNF_Product" = dplyr::case_when(
        !.data[["BNF_Code_Level"]] %in% c(
          "full",
          "further_info",
          "product_name"
        ) ~ as.character(NA),
        TRUE ~ .data[["BNF_Product"]]
      ),
      "BNF_Chemical_Substance" = dplyr::case_when(
        !.data[["BNF_Code_Level"]] %in% c(
          "full",
          "further_info",
          "product_name",
          "chemical_substance"
        ) ~ as.character(NA),
        TRUE ~ .data[["BNF_Chemical_Substance"]]
      ),
      "BNF_Subparagraph" = dplyr::case_when(
        !.data[["BNF_Code_Level"]] %in% c(
          "full",
          "further_info",
          "product_name",
          "chemical_substance",
          "subparagraph"
        ) ~ as.character(NA),
        TRUE ~ .data[["BNF_Subparagraph"]]
      ),
      "BNF_Paragraph" = dplyr::case_when(
        !.data[["BNF_Code_Level"]] %in% c(
          "full",
          "further_info",
          "product_name",
          "chemical_substance",
          "subparagraph",
          "paragraph"
        ) ~ as.character(NA),
        TRUE ~ .data[["BNF_Paragraph"]]
      ),
      "BNF_Section" = dplyr::case_when(
        !.data[["BNF_Code_Level"]] %in% c(
          "full",
          "further_info",
          "product_name",
          "chemical_substance",
          "subparagraph",
          "paragraph",
          "section"
        ) ~ as.character(NA),
        TRUE ~ .data[["BNF_Section"]]
      ),
    ) %>%
    dplyr::select(tidyselect::all_of(c(
      "BNF_Code",
      "BNF_Code_Level",
      "BNF_Chapter",
      "BNF_Section",
      "BNF_Paragraph",
      "BNF_Subparagraph",
      "BNF_Chemical_Substance",
      "BNF_Product",
      "BNF_Presentation",
      "Description"
    )))
}

extend_read_v2_drugs_bnf <- function(all_lkps_maps) {
  # get required tables
  read_v2_drugs_bnf <- all_lkps_maps[["read_v2_drugs_bnf"]] %>%
    dplyr::collect()
  read_v2_drugs_lkp <- all_lkps_maps[["read_v2_drugs_lkp"]] %>%
    dplyr::collect()
  bnf_lkp <- all_lkps_maps[["bnf_lkp"]] %>%
    dplyr::collect()

  # extend 'bnf_lkp'
  bnf_lkp_extended <- extend_bnf_lkp(all_lkps_maps)

  # extend `read_v2_drugs_bnf`
  expected_nrow <- nrow(read_v2_drugs_bnf)

  result <- read_v2_drugs_bnf %>%
    # add read code descriptions
    dplyr::left_join(read_v2_drugs_lkp,
      by = "read_code"
    ) %>%
    # extract bnf chapter, section etc from `bnf_code` col in `read_v2_drugs_bnf`
    dplyr::mutate(
      "bnf_chapter_code" = stringr::str_sub(
        stringr::str_remove_all(.data[["bnf_code"]], pattern = "\\."),
        start = 1,
        end = 2
      ),
      "bnf_section_code" = stringr::str_sub(
        stringr::str_remove_all(.data[["bnf_code"]], pattern = "\\."),
        start = 1,
        end = 4
      ),
      "bnf_paragraph_code" = stringr::str_sub(
        stringr::str_remove_all(.data[["bnf_code"]], pattern = "\\."),
        start = 1,
        end = 6
      ),
      "bnf_subparagraph_code" = paste0(
        .data[["bnf_paragraph_code"]],
        stringr::str_sub(
          stringr::str_remove_all(.data[["bnf_code"]], pattern = "\\."),
          start = 8,
          end = 8
        )
      )
    ) %>%
    # add BNF details
    dplyr::left_join(bnf_lkp_extended[, c("BNF_Code", "BNF_Chapter")],
      by = c("bnf_chapter_code" = "BNF_Code")
    ) %>%
    dplyr::left_join(bnf_lkp_extended[, c("BNF_Code", "BNF_Section")],
      by = c("bnf_section_code" = "BNF_Code")
    ) %>%
    dplyr::left_join(bnf_lkp_extended[, c("BNF_Code", "BNF_Paragraph")],
      by = c("bnf_paragraph_code" = "BNF_Code")
    ) %>%
    dplyr::left_join(bnf_lkp_extended[, c("BNF_Code", "BNF_Subparagraph")],
      by = c("bnf_subparagraph_code" = "BNF_Code")
    )

  # check nrows remains the same
  assertthat::assert_that(expected_nrow == nrow(result),
    msg = "Error! Unexpected number of rows when extending `read_v2_drugs_bnf`"
  )

  return(result)
}
rmgpanw/codemapper documentation built on Aug. 30, 2023, 4:07 p.m.