R/clinical_codes.R

Defines functions check_codes_exist handle_unrecognised_codes check_rowid_col_present check_all_lkps_maps_path check_mapping_args check_codes expand_icd10_ranges get_icd10_code_range get_icd10_code_alt_code_x_map rm_or_extract_appended_icd10_dxa filter_cols standardise_output_fn reformat_standardised_codelist get_preferred_description_code_for_lookup_sheet get_col_for_lookup_sheet get_lookup_sheet get_value_for_mapping_sheet get_from_to_mapping_sheet default_col_filters reformat_icd10_codes get_mapping_df map_codes code_descriptions_like lookup_codes codes_starting_with

Documented in code_descriptions_like codes_starting_with default_col_filters get_mapping_df lookup_codes map_codes reformat_icd10_codes

# OVERVIEW ----------------------------------------------------------------

# Functions to map between different clinical codes e.g. between Read2 and
# Read3, or Read3 and ICD-10. These rely on the code mapping file provided by UK
# Biobank (resource 592: https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=592) and
# the BNF to SNOMED mapping file from the NHSBSA website
# (https://www.nhsbsa.nhs.uk/prescription-data/understanding-our-data/bnf-snomed-mapping)

# EXPORTED FUNCTIONS ------------------------------------------------------

# Exploring and mapping clinical codes ------------------------------------

#' Get codes that start with...
#'
#' This is case \emph{sensitive} (important for read codes especially).
#'
#' @param codes character. A vector of code strings to search for matching
#'   codes.
#' @param code_type character. The type of clinical code system to be searched.
#'   Must be one of `r knitr::combine_words(CODE_TYPE_TO_LKP_TABLE_MAP$code, and = "or ")`.
#' @param all_lkps_maps Either a named list of lookup and mapping tables (either
#'   data frames or `tbl_dbi` objects), or the path to a SQLite database
#'   containing these tables (see also [build_all_lkps_maps()] and
#'   [all_lkps_maps_to_db()]). If `NULL`, will attempt to connect to an SQLite
#'   database named 'all_lkps_maps.db' in the current working directory, or to a
#'   a SQLite database specified by an environmental variable named
#'   'ALL_LKPS_MAPS_DB' (see
#'   [here](https://resources.numbat.space/using-rprofile-and-renviron.html#renviron)
#'    for how to set environment variables using a `.Renviron` file). The latter
#'   method will be used in preference.
#' @param codes_only bool. If \code{TRUE}, return a character vector of
#'   \emph{unique} codes. If \code{FALSE} (default), return a data frame of all
#'   results including code descriptions (useful for manual validation).
#' @param preferred_description_only bool. Return only preferred descriptions
#'   for clinical codes with synonyms. Default value is \code{TRUE}.
#' @param col_filters A named list where each name in the list refers to the
#'   name of a lookup or mapping table. Each item is also a named list, where
#'   the names refer to column names in the corresponding table, and the items
#'   are vectors of values to filter for. For example, `list(my_lookup_table =
#'   list(colA = c("A", "B"))` will result in `my_lookup_table` being filtered
#'   for rows where `colA` is either 'A' or 'B'. Uses `default_col_filters()` by
#'   default. Set to `NULL` to remove all filters.
#'
#' @inheritParams lookup_codes
#' @export
#' @family Clinical code lookups and mappings
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # lookup codes
#' lookup_codes(
#'   codes = c("E10", "E11"),
#'   code_type = "icd10",
#'   all_lkps_maps = all_lkps_maps_dummy
#' )
codes_starting_with <- function(codes,
                                code_type,
                                all_lkps_maps = NULL,
                                codes_only = FALSE,
                                preferred_description_only = TRUE,
                                standardise_output = TRUE,
                                col_filters = default_col_filters()) {
  # validate args
  match.arg(
    arg = code_type,
    choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
  )

  # connect to database file path if `all_lkps_maps` is a string, or `NULL`
  if (is.character(all_lkps_maps)) {
    con <- check_all_lkps_maps_path(all_lkps_maps)
    all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
    on.exit(DBI::dbDisconnect(con))
  } else if (is.null(all_lkps_maps)) {
    if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
      message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
      con <-
        check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else if (file.exists("all_lkps_maps.db")) {
      message("Attempting to connect to all_lkps_maps.db in current working directory")
      con <- check_all_lkps_maps_path("all_lkps_maps.db")
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else {
      stop(
        "No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
      )
    }
  }

  check_codes(codes)

  assertthat::assert_that(is.logical(codes_only),
    msg = "`code_only` must be either 'TRUE' or 'FALSE'"
  )

  assertthat::assert_that(!(codes_only & standardise_output),
    msg = "Error! `codes_only` and `standardise_output` cannot both be `TRUE`"
  )

  # TODO check all sheets are present
  validate_all_lkps_maps()

  # determine relevant lookup sheet
  lkp_table <- get_lookup_sheet(code_type = code_type)

  # determine code column for lookup sheet
  code_col <- get_col_for_lookup_sheet(
    lookup_sheet = lkp_table,
    column = "code_col"
  )

  # determine relevant column indicating whether code description is preferred
  # (for code types with synonymous code descriptions like read 2 and read 3)
  preferred_description_col <-
    get_col_for_lookup_sheet(
      lookup_sheet = lkp_table,
      column = "preferred_synonym_col"
    )

  # get preferred code, if appropriate
  if (!is.na(preferred_description_col)) {
    preferred_description_code <-
      get_preferred_description_code_for_lookup_sheet(lookup_sheet = lkp_table)
  }

  # reformat codes - escape '.', prefix with anchor and append '.*'
  codes <-
    stringr::str_replace_all(codes, pattern = "\\.", replacement = "\\\\.")
  codes <- paste0("^", codes, ".*")

  # combine into single string, separated by "|"
  codes <- stringr::str_c(codes, sep = "", collapse = "|")

  # get children (filter for codes which match ANY of those in `codes` arg)
  result <- all_lkps_maps[[lkp_table]] %>%
    dplyr::collect() %>%
    dplyr::filter(stringr::str_detect(.data[[code_col]],
      pattern = stringr::regex(codes,
        ignore_case = FALSE
      )
    ))

  # filter on `col_filters` parameters
  if (!is.null(col_filters)) {
    result <- filter_cols(
      df = result,
      df_name = lkp_table,
      col_filters = col_filters
    )
  }

  # filter for preferred code descriptions only if requested
  if (preferred_description_only &
    !is.na(preferred_description_col)) {
    result <- result %>%
      dplyr::filter(.data[[preferred_description_col]] == preferred_description_code)
  }

  # return result
  if (nrow(result) == 0) {
    message("No matching codes found. Returning `NULL`")
    return(NULL)
  } else {
    # return either unique codes only, or df including code descriptions
    if (codes_only) {
      return(unique(result[[code_col]]))
    } else if (standardise_output) {
      # Note, not all mapping sheets in UKB resource 592 contain descriptions
      # (e.g. 'read_v2_icd9'). Therefore need to use `lookup_codes` if
      # `standardise_output` is `TRUE`
      codes <- unique(result[[code_col]])

      return(
        lookup_codes(
          codes = codes,
          code_type = code_type,
          all_lkps_maps = all_lkps_maps,
          preferred_description_only = preferred_description_only,
          unrecognised_codes = "error"
        )
      )
    } else {
      return(result)
    }
  }
}

#' Look up descriptions for clinical codes
#'
#' Returns a data frame including descriptions for the codes of interest
#'
#' @param codes character. Vector of codes to lookup
#' @param standardise_output bool. If \code{TRUE} (default), outputs a data
#'   frame with columns named 'code', 'description' and 'code_type'. Otherwise
#'   returns a data frame with all columns for the relevant lookup sheet from
#'   (\href{https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=592}{UK Biobank
#'   resource 592}).
#' @param unrecognised_codes Either 'error' (default) or 'warning'. If any input
#'   `codes` are unrecognised, then either an error or warning will be raised.
#' @param .return_unrecognised_codes If `TRUE`, return a vector of unrecognised
#'   codes only.
#' @inheritParams codes_starting_with
#'
#' @return data frame
#' @export
#' @family Clinical code lookups and mappings
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # look up ICD10 codes
#' lookup_codes(
#'   codes = c("E10", "E11"),
#'   code_type = "icd10",
#'   all_lkps_maps = all_lkps_maps_dummy
#' )
lookup_codes <- function(codes,
                         code_type,
                         all_lkps_maps = NULL,
                         preferred_description_only = TRUE,
                         standardise_output = TRUE,
                         unrecognised_codes = "error",
                         col_filters = default_col_filters(),
                         .return_unrecognised_codes = FALSE) {
  # validate args
  check_codes(codes)

  match.arg(
    arg = code_type,
    choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
  )

  # connect to database file path if `all_lkps_maps` is a string, or `NULL`
  if (is.character(all_lkps_maps)) {
    con <- check_all_lkps_maps_path(all_lkps_maps)
    all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
    on.exit(DBI::dbDisconnect(con))
  } else if (is.null(all_lkps_maps)) {
    if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
      message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
      con <-
        check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else if (file.exists("all_lkps_maps.db")) {
      message("Attempting to connect to all_lkps_maps.db in current working directory")
      con <- check_all_lkps_maps_path("all_lkps_maps.db")
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else {
      stop(
        "No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
      )
    }
  }

  validate_all_lkps_maps()

  # determine relevant lookup sheet
  lkp_table <- get_lookup_sheet(code_type = code_type)

  # determine code column for lookup sheet
  code_col <- get_col_for_lookup_sheet(
    lookup_sheet = lkp_table,
    column = "code_col"
  )

  # determine description column for lookup sheet
  description_col <-
    get_col_for_lookup_sheet(
      lookup_sheet = lkp_table,
      column = "description_col"
    )

  # determine relevant column indicating whether code description is preferred
  # (for code types with synonymous code descriptions like read 2 and read 3)
  preferred_description_col <-
    get_col_for_lookup_sheet(
      lookup_sheet = lkp_table,
      column = "preferred_synonym_col"
    )

  # get preferred code, if appropriate
  if (!is.na(preferred_description_col)) {
    preferred_description_code <-
      get_preferred_description_code_for_lookup_sheet(lookup_sheet = lkp_table)
  }

  # lookup - filter lookup sheet for codes
  result <- all_lkps_maps[[lkp_table]] %>%
    dplyr::filter(.data[[code_col]] %in% codes) %>%
    dplyr::collect()

  # filter on `col_filters` parameters
  if (!is.null(col_filters)) {
    result <- filter_cols(
      df = result,
      df_name = lkp_table,
      col_filters = col_filters
    )
  }

  # check for unrecognised codes
  missing_codes <- subset(codes, !codes %in% result[[code_col]])

  if (.return_unrecognised_codes) {
    # optionally return vector of unrecognised codes only
    message(paste0(
      "Returning unrecognised codes only. N unrecognised: ",
      length(missing_codes)
    ))
    return(missing_codes)
  }

  handle_unrecognised_codes(
    unrecognised_codes = unrecognised_codes,
    missing_codes = missing_codes,
    table_name = lkp_table,
    code_type = code_type
  )

  # filter for preferred code descriptions only if requested
  if (!is.null(preferred_description_only)) {
    if (preferred_description_only &
      !is.na(preferred_description_col)) {
      result <- result %>%
        dplyr::filter(.data[[preferred_description_col]] == preferred_description_code)
    }
  }

  # standardise output if requested
  if (standardise_output) {
    result <- standardise_output_fn(
      result,
      lkp_table = lkp_table,
      code_col = code_col,
      description_col = description_col,
      code_type = code_type
    )
  }

  # return result
  if (nrow(result) == 0) {
    message("No matching codes found. Returning `NULL`")
    return(NULL)
  } else {
    # return either unique codes only, or df including code descriptions
    return(result)
  }
}


#' Search for codes that match a description
#'
#' Returns a data frame with clinical codes that match the supplied regular
#' expression. Ignores case by default.
#'
#' @param reg_expr a regular expression to search for
#' @inheritParams stringr::regex
#' @inheritParams codes_starting_with
#'
#' @return data frame by default, or a character vector of codes if
#'   \code{codes_only} is \code{TRUE}.
#' @export
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # lookup ICD10 code descriptions matching 'cyst'
#' code_descriptions_like(
#'   reg_expr = "cyst",
#'   code_type = "icd10",
#'   all_lkps_maps = all_lkps_maps_dummy
#' )
code_descriptions_like <- function(reg_expr,
                                   code_type,
                                   all_lkps_maps = NULL,
                                   ignore_case = TRUE,
                                   codes_only = FALSE,
                                   preferred_description_only = TRUE,
                                   standardise_output = TRUE,
                                   col_filters = default_col_filters()) {
  # validate args
  check_codes(reg_expr)

  match.arg(
    arg = code_type,
    choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
  )

  # connect to database file path if `all_lkps_maps` is a string, or `NULL`
  if (is.character(all_lkps_maps)) {
    con <- check_all_lkps_maps_path(all_lkps_maps)
    all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
    on.exit(DBI::dbDisconnect(con))
  } else if (is.null(all_lkps_maps)) {
    if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
      message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
      con <-
        check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else if (file.exists("all_lkps_maps.db")) {
      message("Attempting to connect to all_lkps_maps.db in current working directory")
      con <- check_all_lkps_maps_path("all_lkps_maps.db")
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else {
      stop(
        "No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
      )
    }
  }

  validate_all_lkps_maps()

  # determine relevant lookup sheet
  lkp_table <- get_lookup_sheet(code_type = code_type)

  # determine code and description columns for lookup sheet
  code_col <- get_col_for_lookup_sheet(
    lookup_sheet = lkp_table,
    column = "code_col"
  )

  description_col <-
    get_col_for_lookup_sheet(
      lookup_sheet = lkp_table,
      column = "description_col"
    )

  # determine relevant column indicating whether code description is preferred
  # (for code types with synonymous code descriptions like read 2 and read 3)
  preferred_description_col <-
    get_col_for_lookup_sheet(
      lookup_sheet = lkp_table,
      column = "preferred_synonym_col"
    )

  # get preferred code, if appropriate
  if (!is.na(preferred_description_col)) {
    preferred_description_code <-
      get_preferred_description_code_for_lookup_sheet(lookup_sheet = lkp_table)
  }

  # search for codes
  result <- all_lkps_maps[[lkp_table]] %>%
    dplyr::collect() %>%
    dplyr::filter(stringr::str_detect(
      string = .data[[description_col]],
      pattern = stringr::regex(
        pattern = reg_expr,
        ignore_case = ignore_case
      )
    ))

  # filter on `col_filters` parameters
  if (!is.null(col_filters)) {
    result <- filter_cols(
      df = result,
      df_name = lkp_table,
      col_filters = col_filters
    )
  }

  # filter for preferred code descriptions only if requested
  if (!is.null(preferred_description_only)) {
    if (preferred_description_only &
      !is.na(preferred_description_col)) {
      result <- result %>%
        dplyr::filter(.data[[preferred_description_col]] == preferred_description_code)
    }
  }

  # standardise output, if requested
  if (standardise_output) {
    result <- standardise_output_fn(
      result,
      lkp_table = lkp_table,
      code_col = code_col,
      description_col = description_col,
      code_type = code_type
    )

    code_col <- "code"
  }

  # return result
  if (nrow(result) == 0) {
    message("No matching codes found. Returning `NULL`")
    return(NULL)
  } else {
    if (codes_only) {
      return(unique(result[[code_col]]))
    } else {
      return(result)
    }
  }
}


#' Map clinical codes from one coding system to another
#'
#' Uses the code mapping file provided by UK Biobank
#' (\href{https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=592}{resource 592}).
#'
#' The values for arguments \code{from} and \code{to} must be one of
#' \code{read2}, \code{read3}, \code{icd9}, \code{icd10}, \code{bnf},
#' \code{dmd}, \code{read2_drugs} or \code{opcs4}.
#'
#' @param codes A character vector of codes to be mapped.
#' @param from Coding system that \code{codes} belong to.
#' @param to Coding system to map \code{codes} to.
#' @param unrecognised_codes Either 'error' (default) or 'warning'. If any input
#'   `codes` are unrecognised for the coding system being mapped from, then
#'   either an error or warning will be raised.
#' @param preferred_description_only If `TRUE`, return only preferred
#'   descriptions for clinical codes with synonyms. Can only be \code{TRUE} if
#'   \code{standardise_output} is also \code{TRUE}. Default value is
#'   \code{NULL}.
#' @param reverse_mapping If 'error' (default), an error raised if attempting to
#'   map between coding systems for which a mapping table does not exist. If
#'   'warning', will raise a warning and attempt to use an existing mapping
#'   table in the opposite direction (for example, a mapping from ICD10 to Read
#'   3 would be attempted using the Read 3-to-ICD10 mapping table).
#' @inheritParams codes_starting_with
#' @inheritParams lookup_codes
#'
#' @export
#' @family Clinical code lookups and mappings
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # map codes from Read 2 to ICD10
#' map_codes(
#'   codes = "G20..",
#'   from = "read2",
#'   to = "icd10",
#'   all_lkps_maps = all_lkps_maps_dummy
#' )
map_codes <- function(codes,
                      from,
                      to,
                      all_lkps_maps = NULL,
                      codes_only = FALSE,
                      standardise_output = TRUE,
                      unrecognised_codes = "error",
                      preferred_description_only = NULL,
                      reverse_mapping = "error",
                      col_filters = default_col_filters()) {
  # validate args
  check_codes(codes)

  # connect to database file path if `all_lkps_maps` is a string, or `NULL`
  if (is.character(all_lkps_maps)) {
    con <- check_all_lkps_maps_path(all_lkps_maps)
    all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
    on.exit(DBI::dbDisconnect(con))
  } else if (is.null(all_lkps_maps)) {
    if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
      message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
      con <-
        check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else if (file.exists("all_lkps_maps.db")) {
      message("Attempting to connect to all_lkps_maps.db in current working directory")
      con <- check_all_lkps_maps_path("all_lkps_maps.db")
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else {
      stop(
        "No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
      )
    }
  }

  validate_all_lkps_maps()

  assertthat::assert_that(is.logical(codes_only),
    msg = "`code_only` must be either 'TRUE' or 'FALSE'"
  )

  assertthat::assert_that(!(codes_only & standardise_output),
    msg = "Error! `codes_only` and `standardise_output` cannot both be `TRUE`"
  )

  if (!is.null(preferred_description_only)) {
    assertthat::assert_that(!(
      preferred_description_only == TRUE & standardise_output == FALSE
    ),
    msg = "Error! `preferred_description_only` cannot be `TRUE` unless `standardise_output` is also `TRUE`"
    )
  }

  # check mapping args and get required details - mapping_table, from_col and
  # to_col
  mapping_params <- check_mapping_args(
    from = from,
    to = to,
    reverse_mapping = reverse_mapping
  )

  from_col <- mapping_params$from_col
  to_col <- mapping_params$to_col
  mapping_table <- mapping_params$mapping_table

  # determine relevant column indicating whether code description is preferred
  # (for code types with synonymous code descriptions like read 2 and read 3)
  preferred_description_col <-
    get_value_for_mapping_sheet(
      mapping_table = mapping_table,
      value = "preferred_synonym_col"
    )

  # get preferred code, if appropriate
  if (!is.na(preferred_description_col)) {
    preferred_description_code <-
      get_value_for_mapping_sheet(
        mapping_table = mapping_table,
        value = "preferred_code"
      )
  }

  # do mapping
  result <- all_lkps_maps[[mapping_table]] %>%
    dplyr::filter(.data[[from_col]] %in% codes) %>%
    dplyr::filter(!is.na(.data[[to_col]])) %>%
    dplyr::collect()

  # filter on `col_filters` parameters
  if (!is.null(col_filters)) {
    result <- filter_cols(
      df = result,
      df_name = mapping_table,
      col_filters = col_filters
    )
  }

  # check for unrecognised codes
  missing_codes <- subset(codes, !codes %in% result[[from_col]])

  handle_unrecognised_codes(
    unrecognised_codes = unrecognised_codes,
    missing_codes = missing_codes,
    table_name = mapping_table,
    code_type = from
  )

  # return result
  if (nrow(result) == 0) {
    message("No codes found after mapping. Returning `NULL`")
    return(NULL)
  } else {
    # return either unique codes only, or df including descriptions
    if (codes_only) {
      result <- unique(result[[to_col]])

      return(result)
    } else if (standardise_output) {
      # Note, not all mapping sheets in UKB resource 592 contain descriptions
      # (e.g. 'read_v2_icd9'). Therefore need to use `lookup_codes` if
      # `standardise_output` is `TRUE`

      codes <- unique(result[[to_col]])

      return(
        lookup_codes(
          codes = codes,
          code_type = to,
          all_lkps_maps = all_lkps_maps,
          preferred_description_only = preferred_description_only,
          unrecognised_codes = "error"
        )
      )
    } else {
      return(result)
    }
  }
}

#' Get a 'from-to' mapping data frame
#'
#' Returns a data frame with 'from' and 'to' columns for a specified pair of
#' coding systems.
#'
#' @param from A clinical coding system to map from.
#' @param to A clinical coding system to map to.
#' @inheritParams codes_starting_with
#' @inheritParams map_codes
#' @param rename_from_to Optionally supply a named vector to rename the 'from'
#'   and 'to' columns. For example `c(from = "original_codes", to =
#'   "new_codes")`. By default, the columns will be named using the values for
#'   `from` and `to` arguments.
#' @param na.rm If `TRUE` (default), remove any rows with `NA` from the returned
#'   mapping data frame. The mapping tables may sometimes include `NA` values to
#'   explicitly show which 'from' codes have not been mapped.
#'
#' @return A data frame with column names 'from' and 'to'.
#' @export
#'
#' @family Clinical code lookups and mappings
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # get mapping data frame for Read 2 to ICD10
#' get_mapping_df(
#'   from = "read3",
#'   to = "icd10",
#'   all_lkps_maps = all_lkps_maps_dummy
#' )
get_mapping_df <- function(from,
                           to,
                           all_lkps_maps = NULL,
                           rename_from_to = NULL,
                           na.rm = TRUE,
                           reverse_mapping = "error",
                           col_filters = default_col_filters()) {
  # validate args -----------
  # get mapping sheet, from and to cols
  # check mapping args and get required details - mapping_table, from_col and
  # to_col
  mapping_params <- check_mapping_args(
    from = from,
    to = to,
    reverse_mapping = reverse_mapping
  )

  from_col <- mapping_params$from_col
  to_col <- mapping_params$to_col
  mapping_table <- mapping_params$mapping_table

  # rename_from_to
  rename_from_to_error_msg <-
    "Error! `rename_from_to` should be a named character vector of length 2, with names 'from' and 'to'"

  if (!is.null(rename_from_to)) {
    assertthat::assert_that(is.character(rename_from_to) &&
      (length(rename_from_to) == 2) &&
      all(c("from", "to") %in% names(rename_from_to)),
    msg = rename_from_to_error_msg
    )
  }

  # connect to database file path if `all_lkps_maps` is a string, or `NULL`
  if (is.character(all_lkps_maps)) {
    con <- check_all_lkps_maps_path(all_lkps_maps)
    all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
    on.exit(DBI::dbDisconnect(con))
  } else if (is.null(all_lkps_maps)) {
    if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
      message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
      con <-
        check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else if (file.exists("all_lkps_maps.db")) {
      message("Attempting to connect to all_lkps_maps.db in current working directory")
      con <- check_all_lkps_maps_path("all_lkps_maps.db")
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else {
      stop(
        "No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
      )
    }
  }

  # get just distinct combinations of from_col and to_col for mapping_table
  from_to_cols <- c(
    from_col,
    to_col
  )

  result <- all_lkps_maps[[mapping_table]] %>%
    dplyr::collect()


  # filter on `col_filters` parameters
  if (!is.null(col_filters)) {
    result <- filter_cols(
      df = result,
      df_name = mapping_table,
      col_filters = col_filters
    )
  }

  # keep required columns only
  result <- result %>%
    dplyr::select(tidyselect::all_of(from_to_cols))

  # remove rows with `NA` values
  if (na.rm) {
    result <- tidyr::drop_na(result)
  }

  # distinct rows only (e.g. read 2 'J5310' maps to both primary and secondary
  # descriptions for read 3 'J5311')
  result <- result %>%
    dplyr::distinct(dplyr::across(tidyselect::everything()),
      .keep_all = TRUE
    )

  # rename
  if (!is.null(rename_from_to)) {
    new_from_to_cols <- c(
      rename_from_to["from"],
      rename_from_to["to"]
    )
  } else {
    new_from_to_cols <- c(
      from,
      to
    )
  }

  result <- result %>%
    ukbwranglr:::rename_cols(
      old_colnames = from_to_cols,
      new_colnames = new_from_to_cols
    )

  return(result)
}

# Utilities ---------------------------------------------------------------

#' Reformat ICD-10 codes
#'
#' The lookup sheet in
#' \href{https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=592}{UKB resource 592}
#' for ICD-10 ("icd10_lkp") has a column called `ALT_CODE`, which is an
#' alternative format for ICD-10 codes. This is the format used in the mapping
#' sheets for this resource, as well as in
#' \href{https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=41270}{Field ID 41270}.
#' *Note however that in contrast to the ICD10 lookup and mapping sheets,
#' undivided 3 character codes do not have an 'X' appended in the UK Biobank
#' dataset*. This function converts from one format to the other, with the
#' option to strip the final 'X' from undivided 3 character codes if converting
#' to `ALT_CODE` format.
#'
#' @param icd10_codes character vector of ICD-10 codes
#' @param input_icd10_format character. Must be either "ICD10_CODE" or
#'   "ALT_CODE".
#' @param output_icd10_format character. Must be either "ICD10_CODE" or
#'   "ALT_CODE".
#' @inheritParams codes_starting_with
#' @inheritParams lookup_codes
#' @param strip_x If `TRUE` and converting to `ALT_CODE` format, 'X' is removed
#'   from the end of undivided 3 character codes (default is `FALSE`).
#'
#' @return character vector of ICD-10 codes, reformatted as specified by
#'   \code{output_icd10_format}.
#' @export
#' @family Clinical code lookups and mappings
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # reformat from ICD10_CODE to ALT_CODE
#' reformat_icd10_codes(
#'   icd10_codes = c("E10.9"),
#'   all_lkps_maps = all_lkps_maps_dummy,
#'   input_icd10_format = "ICD10_CODE",
#'   output_icd10_format = "ALT_CODE"
#' )
#'
#' # reformat from ALT_CODE to ICD10_CODE
#' reformat_icd10_codes(
#'   icd10_codes = c("E109"),
#'   all_lkps_maps = all_lkps_maps_dummy,
#'   input_icd10_format = "ALT_CODE",
#'   output_icd10_format = "ICD10_CODE"
#' )
reformat_icd10_codes <- function(icd10_codes,
                                 all_lkps_maps = NULL,
                                 input_icd10_format = "ICD10_CODE",
                                 output_icd10_format = "ALT_CODE",
                                 unrecognised_codes = "error",
                                 strip_x = FALSE) {
  # validate args
  match.arg(
    arg = input_icd10_format,
    choices = c("ICD10_CODE", "ALT_CODE")
  )

  match.arg(
    arg = output_icd10_format,
    choices = c("ICD10_CODE", "ALT_CODE")
  )


  assertthat::assert_that(input_icd10_format != output_icd10_format,
    msg = "Error for `reformat_icd10_codes()`! Input and output icd10 formats cannot be the same"
  )

  if (strip_x & (output_icd10_format == "ICD10_CODE")) {
    stop("`strip_x` can only be `TRUE` if `output_icd10_format` is 'ALT_CODE'")
  }

  # connect to database file path if `all_lkps_maps` is a string, or `NULL`
  if (is.character(all_lkps_maps)) {
    con <- check_all_lkps_maps_path(all_lkps_maps)
    all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
    on.exit(DBI::dbDisconnect(con))
  } else if (is.null(all_lkps_maps)) {
    if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
      message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
      con <-
        check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else if (file.exists("all_lkps_maps.db")) {
      message("Attempting to connect to all_lkps_maps.db in current working directory")
      con <- check_all_lkps_maps_path("all_lkps_maps.db")
      all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
      on.exit(DBI::dbDisconnect(con))
    } else {
      stop(
        "No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
      )
    }
  }

  validate_all_lkps_maps()

  icd10_mapping_df <- all_lkps_maps$icd10_lkp %>%
    dplyr::filter(.data[[input_icd10_format]] %in% icd10_codes) %>%
    dplyr::select(tidyselect::all_of(c(
      "ICD10_CODE",
      "ALT_CODE"
    ))) %>%
    dplyr::collect()

  # handle any unrecognised codes
  missing_codes <-
    subset(
      icd10_codes,
      !icd10_codes %in% icd10_mapping_df[[input_icd10_format]]
    ) %>%
    unique()

  handle_unrecognised_codes(
    unrecognised_codes = unrecognised_codes,
    missing_codes = missing_codes,
    table_name = "icd10_lkp",
    code_type = input_icd10_format
  )

  # Note: some ICD10_CODE values have multiple associated ALT_CODEs - these
  # include a modifier description in `MODIFIER_5` (e.g. ICD-10 codes M00,
  # M77, M07, M72, M65, S52 or S72). The number of output codes may therefore
  # be larger/smaller than the input number. Raise informative message if this
  # is is the case.

  non_unique_icd10_codes <- icd10_mapping_df %>%
    dplyr::count(.data[["ICD10_CODE"]]) %>%
    dplyr::filter(.data[["n"]] > 1) %>%
    dplyr::pull(.data[["ICD10_CODE"]])

  if (length(non_unique_icd10_codes) > 0) {
    input_icd10_not_1_to_1_mapping <- icd10_mapping_df %>%
      dplyr::filter(.data[["ICD10_CODE"]] %in% non_unique_icd10_codes) %>%
      dplyr::pull(.data[[input_icd10_format]]) %>%
      unique()

    more_or_fewer_returned_codes <-
      ifelse(input_icd10_format == "ICD10_CODE",
        yes = "*more*",
        no = "*fewer*"
      )

    message(
      "The following ",
      length(input_icd10_not_1_to_1_mapping),
      " input ICD10 codes do not have a 1-to-1 ICD10_CODE-to-ALT_CODE mapping: '",
      stringr::str_c(
        input_icd10_not_1_to_1_mapping,
        sep = "",
        collapse = "', '"
      ),
      "'. There will therefore be ",
      more_or_fewer_returned_codes,
      " output than input codes"
    )
  }

  # get requested icd10 format
  result <- unique(icd10_mapping_df[[output_icd10_format]])

  # optionally remove appended 'X' for undivided 3 character codes in `ALT_CODE` format
  if (strip_x & (output_icd10_format == "ALT_CODE")) {
    message("Removing 'X' from any undivided 3 character ICD10 codes")
    result <- stringr::str_remove(
      result,
      "X$"
    )
  }

  # return result
  return(result)
}

#' Default filtering parameters for lookup and mapping tables.
#'
#' To be used as `col_filters` argument in 'Clinical code lookups and mappings'
#' functions. Returns a named list where each name in the list refers to the
#' name of a lookup or mapping table. Each item is also a named list, where the
#' names refer to column names in the corresponding table, and the items are
#' vectors of values to filter for.
#'
#' @return A named list.
#' @export
#'
#' @family Clinical code lookups and mappings
#' @examples
#' default_col_filters()
default_col_filters <- function() {
  list(
    read_ctv3_icd10 = list(
      mapping_status = c("E", "G", "D"),
      refine_flag = c("C", "P"),
      element_num = c("0"),
      block_num = c("0")
    ),
    read_v2_icd10 = list(icd10_code_def = c("1", "15", "3", "5", "7", "8")),
    read_ctv3_read_v2 = list(IS_ASSURED = "1"),
    read_v2_read_ctv3 = list(IS_ASSURED = "1")
  )
}

# PRIVATE FUNCTIONS -------------------------------------------------------


#' Helper function for \code{\link{map_codes}}
#'
#' Returns name of the appropriate mapping sheet from the UKB code mappings
#' excel file (resource 592) for mapping from one clinical coding system to
#' another.
#'
#' @param from character
#' @param to character
#'
#' @return character (scalar)
#' @noRd
#' @family Clinical code lookups and mappings
get_from_to_mapping_sheet <- function(from, to) {
  CLINICAL_CODE_MAPPINGS_MAP[(CLINICAL_CODE_MAPPINGS_MAP[["from"]] == from &
    CLINICAL_CODE_MAPPINGS_MAP[["to"]] == to), ][["mapping_table"]]
}

#' Helper function for \code{\link{map_codes}}
#'
#' Returns the requested value for a 'mapping_table' in
#' \code{CLINICAL_CODE_MAPPINGS_MAP}.
#'
#' @param mapping_table character
#' @param value character. column name from
#'   \code{CLINICAL_CODE_MAPPINGS_MAP} (apart from
#'   "mapping_table").
#'
#' @return character (scalar)
#' @noRd
#' @family Clinical code lookups and mappings
get_value_for_mapping_sheet <- function(mapping_table,
                                        value) {
  # validate args
  match.arg(
    arg = mapping_table,
    choices = CLINICAL_CODE_MAPPINGS_MAP[["mapping_table"]]
  )

  match.arg(
    arg = value,
    choices = subset(
      names(CLINICAL_CODE_MAPPINGS_MAP),
      subset = names(CLINICAL_CODE_MAPPINGS_MAP) != "mapping_table"
    )
  )

  # return specified `value`
  CLINICAL_CODE_MAPPINGS_MAP[CLINICAL_CODE_MAPPINGS_MAP[["mapping_table"]] == mapping_table, ][[value]]
}

#' Get name of lookup sheet for a clinical code system
#'
#' Helper function for \code{\link{lookup_codes}} and \code{\link{codes_starting_with}}
#'
#' @param code_type character
#'
#' @return character (scalar)
#' @noRd
#' @family Clinical code lookups and mappings
get_lookup_sheet <- function(code_type) {
  # validate args
  match.arg(code_type,
    choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
  )

  # get lookup sheet
  CODE_TYPE_TO_LKP_TABLE_MAP %>%
    dplyr::filter(.data[["code"]] == code_type) %>%
    .$lkp_table
}

#' Get name of code, description or preferred synonym column for a lookup sheet
#'
#' Helper function for \code{\link{lookup_codes}} and \code{\link{codes_starting_with}}
#'
#' @param lookup_sheet character
#' @param column character
#'
#' @return character (scalar)
#'
#' @family Clinical code lookups and mappings
#' @noRd
get_col_for_lookup_sheet <- function(lookup_sheet,
                                     column) {
  # validate args
  match.arg(
    arg = lookup_sheet,
    choices = CODE_TYPE_TO_LKP_TABLE_MAP$lkp_table
  )

  match.arg(
    arg = column,
    choices = c("code_col", "description_col", "preferred_synonym_col")
  )

  # get column name for lookup sheet
  CODE_TYPE_TO_LKP_TABLE_MAP %>%
    dplyr::filter(.data[["lkp_table"]] == lookup_sheet) %>%
    .[[column]]
}

#' Get preferred description code for a lookup sheet
#'
#' Helper function for \code{\link{lookup_codes}} and \code{\link{codes_starting_with}}
#'
#' @param lookup_sheet character
#'
#' @return character (scalar)
#'
#' @family Clinical code lookups and mappings
#' @noRd
get_preferred_description_code_for_lookup_sheet <-
  function(lookup_sheet) {
    # validate args
    match.arg(
      arg = lookup_sheet,
      choices = CODE_TYPE_TO_LKP_TABLE_MAP$lkp_table
    )

    # get preferred description code for lookup sheet
    CODE_TYPE_TO_LKP_TABLE_MAP %>%
      dplyr::filter(.data[["lkp_table"]] == lookup_sheet) %>%
      .[["preferred_code"]]
  }

#' Reformat a dataframe of clinical codes to work with
#' \code{\link[ukbwranglr]{extract_phenotypes}}
#'
#' A utility function that helps reformat the output from \code{\link{map_codes}}
#' or \code{\link{lookup_codes}} to work with
#' \code{\link[ukbwranglr]{extract_phenotypes}}. See also output
#' from \code{\link[ukbwranglr]{example_clinical_codes}} for an example of
#' the format that this function will output.
#'
#' @param standardised_codelist a data frame with column names "code",
#'   "description", "code_type".
#' @param code_type character (scalar). The clinical code type e.g. "read2"
#' @param disease character (scalar), e.g. "Secondary polycythaemia"
#' @param disease_category character (scalar). The subcategory of \code{disease}
#'   that these codes belong to e.g. "Diagnosis of Secondary polycythaemia".
#' @param author character (scalar), e.g. "caliber".
#'
#' @return A data frame with the following column names: 'disease',
#'   'description', 'category', 'code_type', 'code' and 'author'.
#' @noRd
reformat_standardised_codelist <- function(standardised_codelist,
                                           code_type,
                                           disease,
                                           disease_category,
                                           author) {
  # validate args
  assertthat::assert_that(is.data.frame(standardised_codelist),
    msg = "Error! standardised_codelist must be a data frame (or tibble/data table"
  )

  assertthat::is.string(code_type)
  assertthat::is.string(disease)
  assertthat::is.string(disease_category)
  assertthat::is.string(author)

  match.arg(code_type,
    choices = ukbwranglr:::CLINICAL_EVENTS_SOURCES$data_coding
  )

  assertthat::assert_that(all(
    names(standardised_codelist) == c("code", "description", "code_type")
  ),
  msg = "Error! `standardised_codelist` must be a data frame with the following headings: 'code', 'description', 'code_type'"
  )

  assertthat::assert_that(
    all(
      standardised_codelist$code_type %in% unique(ukbwranglr:::CLINICAL_EVENTS_SOURCES$data_coding)
    ),
    msg = paste0(
      "Error! `standardised_codelist$code_type` contains unrecognised code types. Recognised code types: ",
      stringr::str_c(
        unique(ukbwranglr:::CLINICAL_EVENTS_SOURCES$data_coding),
        sep = "",
        collapse = ", "
      )
    )
  )

  # reformat to work with `extract_phenotypes()`
  standardised_codelist <- standardised_codelist %>%
    dplyr::mutate(
      "code_type" = code_type,
      "disease" = disease,
      "category" = disease_category,
      "author" = author,
    ) %>%
    dplyr::select(
      .data[["disease"]],
      .data[["description"]],
      .data[["category"]],
      .data[["code_type"]],
      .data[["code"]],
      .data[["author"]]
    )

  return(standardised_codelist)
}

standardise_output_fn <-
  function(df,
           lkp_table,
           code_col,
           description_col,
           code_type) {
    names(df)[which(names(df) == code_col)] <- "code"
    names(df)[which(names(df) == description_col)] <- "description"

    # Some ICD-10 descriptions include a modifier e.g. "E10" = "Type 1 diabetes
    # mellitus", whereas "E10.0" = "Type 1 diabetes mellitus with coma". "With
    # coma" is contained in the modifier columns "MODIFIER-4". See 'S27' for an
    # example code where additional description is contained in the "MODIFER-5"
    # column. The returned "description" column from `standardise_output == TRUE`
    # therefore combines the 'DESCRIPTION' column with one of these 2 columns
    # (whichever is not NA). There are no codes with a modifier description in
    # both "MODIFIER_4" and "MODIFIER_5".

    if (lkp_table == "icd10_lkp") {
      df$description <- dplyr::case_when(
        !is.na(df$MODIFIER_4) ~ paste(df$description, df$MODIFIER_4),
        !is.na(df$MODIFIER_5) ~ paste(df$description, df$MODIFIER_5),
        TRUE ~ df$description
      )
    }

    # return code, description and code_type cols only
    df <- df[c("code", "description")]
    df[["code_type"]] <- code_type

    return(df)
  }

#' Filter lookup/mapping table for specified values in columns
#'
#' Helper function that enables filtering of lookup/mapping tables for certain
#' values (e.g. filter a mapping table for only 'exact' code mappings). Uses an
#' `%in%` filter statement. Note that no error is currently raised if attempting
#' to filter for any values that do not exist in `df` columns.
#'
#' @param df Lookup or mapping df to be filtered
#' @param df_name Name of lookup/mapping df (e.g. "icd10_lkp").
#' @param col_filters Either `NULL` (default, in which case `df` is returned
#'   unchanged) or a named list. First level names are names of lookup/mapping
#'   tables. Each item is also a named list of vectors. Columns in `df` that
#'   match the list names are filtered for values in the corresponding vectors
#'   (using `%in%`). An error is raised
#'
#' @return A dataframe
#' @noRd
filter_cols <- function(df,
                        df_name,
                        col_filters = NULL) {
  # if `col_filters` is `NULL`, return `df` unchanged (exit early)
  if (is.null(col_filters)) {
    return(df)
  }

  # get relevant columns/filter values from `col_filters`
  col_filters <- col_filters[[df_name]]

  # if `df_name` is not present in `names(col_filters)` return `df` unchanged (exit early)
  if (is.null(col_filters)) {
    return(df)
  }

  # check that selected element of `col_filters` is a named list of vectors
  stopifnot(is.list(col_filters))
  if (is.null(names(col_filters))) {
    stop("Each item in `col_filters` must be named")
  }

  if (any(names(col_filters) == "")) {
    stop("Each item in `col_filters` must be named")
  }

  col_filters_item_types <- col_filters %>%
    purrr::map_lgl(is.vector)

  assertthat::assert_that(sum(!col_filters_item_types) == 0,
    msg = "Each item in `col_filters` must be a vector"
  )

  # check that column names exist in df. Raise error is any are unrecognised.
  unrecognised_colnames <-
    subset(names(col_filters), !names(col_filters) %in% names(df))

  if (length(unrecognised_colnames) > 0) {
    stop(
      paste0(
        "The following ",
        length(unrecognised_colnames),
        " column names specified by `col_filters` are not present in `",
        df_name,
        "`: ",
        stringr::str_c(
          unrecognised_colnames,
          sep = "",
          collapse = ", "
        )
      ),
      call. = FALSE
    )
  }

  # filter `df` for specified values in each column listed by `col_filters`
  for (i in names(col_filters)) {
    col_filter_values <- col_filters[[i]]

    # check that type matches
    df_col_class <- class(df[[i]])
    col_filter_values_class <- class(col_filter_values)
    assertthat::assert_that(
      df_col_class %in% col_filter_values_class,
      msg = paste0(
        "Cannot filter column ",
        i,
        " in ",
        df_name,
        " as classes do not match. Column `",
        i,
        "`` is class ",
        df_col_class,
        ", but filter values specified by `col_filters` are of class ",
        col_filter_values_class
      )
    )

    df <- df %>%
      dplyr::filter(.data[[i]] %in% !!col_filter_values)
  }

  # return result
  return(df)
}

#' Remove or extract appended ICD10 letters
#'
#' @param icd10_codes Character vector of ICD10 codes
#' @param rm_extract Either 'rm' (remove 'D'/'X'/'A' from end of codes) or
#'   'extract' (return these characters only, or `NA` if the code does not end
#'   with letter characters).
#'
#' @return Character vector
#' @noRd
#' @examples
#' rm_or_extract_appended_icd10_letters(c(
#'   "A00",
#'   "A408",
#'   "A390D",
#'   "A38X",
#'   "G01XA"
#' ))
rm_or_extract_appended_icd10_dxa <- function(icd10_codes,
                                             keep_x = TRUE,
                                             rm_extract = "rm") {
  # validate args
  match.arg(rm_extract,
    choices = c("rm", "extract")
  )

  stopifnot(is.character(icd10_codes))

  # either remove or extract appended 'D'/'X'/'A' from `icd10_codes`
  pattern <- ifelse(keep_x,
    yes = "[D|A]*$",
    no = "[D|X|A]*$"
  )

  switch(rm_extract,
    rm = stringr::str_remove(icd10_codes,
      pattern = pattern
    ),
    extract = stringr::str_extract(icd10_codes,
      pattern = pattern
    ) %>%
      ifelse(. == "",
        yes = NA_character_,
        no = .
      )
  )
}

#' Get a mapping table for ICD10 codes in ALT_CODE format, with and without 'X'
#' appended for undivided 3 character codes
#'
#' @param icd10_lkp The lookup table for ICD10 codes, `icd10_lkp`.
#' @param undivided_3char_only If `TRUE` return only undivided 3 character ICD10
#'   codes. Default is `FALSE`.
#' @param as_named_list If `NULL`, returns a data frame with columns `ALT_CODE`
#'   and `ALT_CODE_minus_x`. If 'names_no_x' or 'names_with_x', returns a named
#'   list with either `ALT_CODE` or `ALT_CODE_minus_x` set as names (and the
#'   other as values) respectively.
#'
#' @return A data frame or a named list (see argument `as_named_list`).
#' @noRd
get_icd10_code_alt_code_x_map <- function(icd10_lkp,
                                          undivided_3char_only = FALSE,
                                          as_named_list = NULL) {

  # validate args
  match.arg(as_named_list,
    choices = c("names_no_x", "names_with_x")
  )

  # make mapping df
  icd10_lkp_alt_x_map <- icd10_lkp %>%
    dplyr::select(.data[["ALT_CODE"]]) %>%
    dplyr::filter(!is.na(.data[["ALT_CODE"]])) %>%
    dplyr::collect() %>%
    dplyr::mutate("ALT_CODE_minus_x" = stringr::str_remove(.data[["ALT_CODE"]],
      pattern = "X$"
    ))

  # check that all codes are unique
  stopifnot(
    length(icd10_lkp_alt_x_map$ALT_CODE) == dplyr::n_distinct(icd10_lkp_alt_x_map$ALT_CODE)
  )
  stopifnot(
    length(icd10_lkp_alt_x_map$ALT_CODE_minus_x) == dplyr::n_distinct(icd10_lkp_alt_x_map$ALT_CODE_minus_x)
  )

  # check that all codes are the same comparing `ALT_CODE` and
  # `ALT_CODE_minus_x`, after removing the X in `ALT_CODE`
  stopifnot(all(
    icd10_lkp_alt_x_map$ALT_CODE_minus_x == stringr::str_remove(icd10_lkp_alt_x_map$ALT_CODE,
      pattern = "X$"
    )
  ))

  if (undivided_3char_only) {
    icd10_lkp_alt_x_map <- icd10_lkp_alt_x_map %>%
      dplyr::filter(.data[["ALT_CODE_minus_x"]] != .data[["ALT_CODE"]])
  }

  # convert to named list
  if (!is.null(as_named_list)) {
    icd10_lkp_alt_x_map <- switch(as_named_list,
      names_no_x = icd10_lkp_alt_x_map %>%
        tidyr::pivot_wider(
          names_from = "ALT_CODE_minus_x",
          values_from = "ALT_CODE"
        ) %>%
        as.list(),
      names_with_x = icd10_lkp_alt_x_map %>%
        tidyr::pivot_wider(
          names_from = "ALT_CODE",
          values_from = "ALT_CODE_minus_x"
        ) %>%
        as.list()
    )
  }

  # return result
  return(icd10_lkp_alt_x_map)
}

#' Get a vector of ICD10 codes in ALT_CODE format for a specified start/end
#' range of ICD10 codes
#'
#' Note that `start_icd10_code` and `end_icd10_code` must be of the same length,
#' unless one ends with 'X'. For example, expanding the range 'A80-A81' is
#' equivalent to expanding both 'A800-A809' and 'A810-A819'.
#'
#' @param start_icd10_code String
#' @param end_icd10_code String
#' @param icd10_lkp The ICD10 lookup table. Must have a `.rowid` column.
#'
#' @noRd
#' @return A character vector of
get_icd10_code_range <- function(start_icd10_code,
                                 end_icd10_code,
                                 icd10_lkp) {
  # validate args
  stopifnot(is.character(start_icd10_code))
  stopifnot(is.character(end_icd10_code))

  assertthat::assert_that(
    stringr::str_length(stringr::str_remove(
      start_icd10_code,
      "X$"
    )) == stringr::str_length(stringr::str_remove(
      end_icd10_code,
      "X$"
    )),
    msg = paste0(
      "`start_icd10_code` and `end_icd10_code` must have the same number of characters. Start/end values provided: ",
      start_icd10_code,
      " and ",
      end_icd10_code
    )
  )

  check_rowid_col_present(
    df = icd10_lkp,
    df_name = "icd10_lkp"
  )

  check_codes_exist(
    codes = c(start_icd10_code, end_icd10_code),
    lkp_codes = icd10_lkp$ALT_CODE,
    code_type = "icd10",
    table_name = "icd10_lkp",
    return_unrecognised_codes = FALSE
  )

  # get start and end row indices
  start_rowid <- icd10_lkp %>%
    dplyr::filter(.data[["ALT_CODE"]] == !!start_icd10_code) %>%
    dplyr::pull(.data[[".rowid"]])

  end_rowid <- icd10_lkp %>%
    dplyr::filter(.data[["ALT_CODE"]] == !!end_icd10_code) %>%
    dplyr::pull(.data[[".rowid"]])

  # check start/end row indices are scalar, and create range of row index integers
  assertthat::is.number(start_rowid)
  assertthat::is.number(end_rowid)

  icd10_lkp_rowids <- start_rowid:end_rowid

  # filter for selected row index integers
  result <- icd10_lkp %>%
    dplyr::filter(.data[[".rowid"]] %in% icd10_lkp_rowids) %>%
    dplyr::pull(.data[["ALT_CODE"]])

  pattern <- stringr::str_c(paste0("^", result),
    sep = "",
    collapse = "|"
  )

  # expand (e.g. for 'A80-A81', at this stage all 'A80' should be present
  # ('A800-A809'), but for 'A81', only 'A81' wil be present - needs expanding
  # to 'A801-A819')
  result <- icd10_lkp %>%
    dplyr::filter(stringr::str_detect(.data[["ALT_CODE"]],
      pattern = pattern
    )) %>%
    dplyr::pull(.data[["ALT_CODE"]])

  return(result)
}

#' Expand a `read_v2_icd10` data frame containing code ranges in the
#' `icd10_code` column
#'
#' To be used within `reformat_read_v2_icd10()`.
#'
#' @param read_v2_icd10 Data frame
#' @param icd10_lkp Data frame
#' @param icd10_lkp_alt_x_map A named list of undivided 3 character ICD10 codes,
#'   where the names do not have an 'X' appended and the values do.
#'
#' @return A data frame
#' @noRd
expand_icd10_ranges <- function(read_v2_icd10,
                                icd10_lkp,
                                icd10_lkp_alt_x_map) {
  # validate args
  assertthat::assert_that(all(
    c(
      "read_code",
      "icd10_code",
      "icd10_code_def"
    ) %in% names(read_v2_icd10)
  ),
  msg = "Unexpected column names in `read_v2_icd10`"
  )

  # separate by '-'
  read_v2_icd10 <- read_v2_icd10 %>%
    tidyr::separate(
      .data[["icd10_code"]],
      into = c("start_icd10_code", "end_icd10_code"),
      sep = "-",
      remove = FALSE,
      fill = "right"
    )

  read_v2_icd10 <- read_v2_icd10 %>%
    dplyr::mutate("start_icd10_code" = ifelse(is.na(.data[["end_icd10_code"]]),
      yes = NA_character_,
      no = .data[["start_icd10_code"]]
    ))

  # strip any appended 'D/X/A' (last character(s) e.g. 'A89X' and 'A170D' become
  # 'A89' and 'A170'. 'G01XA' would become 'G01', although note that this code
  # does not appear together with a '-')
  read_v2_icd10 <- read_v2_icd10 %>%
    dplyr::mutate(dplyr::across(
      tidyselect::all_of(c(
        "start_icd10_code",
        "end_icd10_code"
      )),
      ~ rm_or_extract_appended_icd10_dxa(
        icd10_codes = .x,
        keep_x = TRUE,
        rm_extract = "rm"
      )
    ))

  # Make sure undivided 3 character ICD10 codes have an 'X' appended
  read_v2_icd10 <- read_v2_icd10 %>%
    dplyr::mutate(dplyr::across(
      tidyselect::all_of(c(
        "start_icd10_code",
        "end_icd10_code"
      )),
      ~ dplyr::recode(
        .x,
        !!!icd10_lkp_alt_x_map
      )
    ))

  # expand range
  read_v2_icd10 %>%
    dplyr::rowwise() %>%
    dplyr::mutate(icd10_range_new = ifelse(
      is.na(.data[["start_icd10_code"]]),
      yes = list(NA_character_),
      no = list(
        get_icd10_code_range(
          start_icd10_code = .data[["start_icd10_code"]],
          end_icd10_code = .data[["end_icd10_code"]],
          icd10_lkp = icd10_lkp
        )
      )
    )) %>%
    dplyr::ungroup() %>%
    tidyr::unnest(cols = "icd10_range_new") %>%
    dplyr::mutate("icd10_code" = ifelse(is.na(.data[["icd10_range_new"]]),
      yes = .data[["icd10_code"]],
      no = .data[["icd10_range_new"]]
    )) %>%
    dplyr::select(-tidyselect::all_of(c(
      "start_icd10_code",
      "end_icd10_code",
      "icd10_range_new"
    )))
}

## Validation helpers ---------------------------

check_codes <- function(codes) {
  assertthat::assert_that(is.character(codes),
    msg = "Error! `codes` must be a character vector"
  )

  assertthat::assert_that(sum(is.na(codes)) == 0,
    msg = "Error! `codes` cannot contain `NA` values"
  )
}

check_mapping_args <- function(from,
                               to,
                               reverse_mapping = "error") {
  match.arg(reverse_mapping,
    choices = c("error", "warning")
  )

  match.arg(
    arg = from,
    choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
  )
  # choices = CLINICAL_CODE_MAPPINGS_MAP$from)

  match.arg(
    arg = to,
    choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
  )
  # choices = CLINICAL_CODE_MAPPINGS_MAP$to)

  assertthat::assert_that(!from == to,
    msg = "Error! `from` and `to` args cannot be the same"
  )

  # get appropriate mapping sheet
  swap_mapping_cols <- FALSE
  mapping_table <- get_from_to_mapping_sheet(from = from, to = to)

  # if above returns `character(0)`, try to map the other way
  if (rlang::is_empty(mapping_table)) {
    swap_mapping_cols <- TRUE
    mapping_table <- get_from_to_mapping_sheet(from = to, to = from)
  }

  # if still returns `character(0)`, error
  if (rlang::is_empty(mapping_table)) {
    stop("Error! Invalid (or unavailable) code mapping request")
  } else if (swap_mapping_cols) {
    switch(reverse_mapping,
      error = stop("No mapping sheet available for this request"),
      warning = warning(
        "No mapping sheet available for this request. Attempting to map anyway using: ",
        mapping_table
      )
    )
  }

  # get from_col and to_col column names for mapping sheet
  # swap if appropriate
  if (swap_mapping_cols) {
    from_col <-
      get_value_for_mapping_sheet(
        mapping_table = mapping_table,
        value = "to_col"
      )
    to_col <-
      get_value_for_mapping_sheet(
        mapping_table = mapping_table,
        value = "from_col"
      )
  } else {
    from_col <-
      get_value_for_mapping_sheet(
        mapping_table = mapping_table,
        value = "from_col"
      )
    to_col <-
      get_value_for_mapping_sheet(
        mapping_table = mapping_table,
        value = "to_col"
      )
  }

  # return result
  return(list(
    from_col = from_col,
    to_col = to_col,
    mapping_table = mapping_table
  ))
}

check_all_lkps_maps_path <- function(file_path) {
  # check file exists
  assertthat::assert_that(file.exists(file_path),
    msg = paste0(
      "Error! No file found at ",
      file_path
    )
  )

  # check file ends with '.db'
  assertthat::assert_that(stringr::str_detect(
    file_path,
    ".+\\.db"
  ))

  # return con object if tests pass
  DBI::dbConnect(RSQLite::SQLite(), file_path)
}

check_rowid_col_present <- function(df,
                                    df_name) {
  assertthat::assert_that(".rowid" %in% names(df),
    msg = paste0(
      "'.rowid' column not present in `",
      df_name,
      "`"
    )
  )
  assertthat::assert_that(
    class(df[[".rowid"]]) == "integer",
    msg = paste0(
      "'.rowid' column in `",
      df_name,
      "should be class 'integer', not '",
      class(df[[".rowid"]]),
      "'"
    )
  )
}

#' Helper function - raise error or warning if unrecognised codes are present
#'
#' Raises an error or warning for unrecognised codes.
#'
#' @param unrecognised_codes Either 'error' or 'warning'. Determines how to
#'   handle unrecognised codes
#' @param missing_codes character vector of unrecognised codes.
#' @param code_type The type of clinical coding system
#' @param table_name Name of lookup/mapping table from which codes are missing
#'
#' @return Called for side effects.
#' @noRd
#' @family Clinical code lookups and mappings
handle_unrecognised_codes <-
  function(unrecognised_codes,
           missing_codes,
           table_name,
           code_type) {
    match.arg(unrecognised_codes,
      choices = c("error", "warning")
    )

    # make sure missing_codes are unique, if not already
    missing_codes <- unique(missing_codes)

    # only display first 25 codes
    if (length(missing_codes) > 25) {
      missing_codes_to_print <- utils::head(missing_codes,
        n = 25
      )
    } else {
      missing_codes_to_print <- missing_codes
    }

    missing_codes_to_print <- stringr::str_c(missing_codes_to_print,
      sep = "",
      collapse = "', '"
    )

    if (length(missing_codes) > 25) {
      missing_codes_to_print <- paste0(
        missing_codes_to_print,
        " (first 25 only shown)"
      )
    }

    if (length(missing_codes) > 0) {
      missing_codes_message <- paste0(
        "The following ",
        length(missing_codes),
        " codes were not found for '",
        code_type,
        "' in table '",
        table_name,
        "': '",
        stringr::str_c(missing_codes_to_print, sep = "", collapse = "', '"),
        "'"
      )

      switch(unrecognised_codes,
        error = stop(missing_codes_message),
        warning = warning(missing_codes_message)
      )
    }
  }

#' Utility function - check if all of a set of codes are recognised
#'
#' @param codes Codes being checked
#' @param lkp_codes Character vector of lookup codes. Any `codes` not present in
#'   `lkp_codes` are considered to be unrecognised.
#' @param code_type String
#' @param return_unrecognised_codes If `TRUE`, return a character vector of
#'   unrecognised codes. Default is `FALSE`.check
#' @param unrecognised_codes Either 'error' or 'warning'. Determines how to
#'   handle unrecognised codes.
#' @param table_name Name of lookup/mapping table from which `lkp_codes` was
#'   obtained.
#'
#' @noRd
#' @return Return vector of unrecognised codes if `return_missing_codes` is
#'   `TRUE`, otherwise called for side effect (error if any unrecognised codes)
check_codes_exist <- function(codes,
                              lkp_codes,
                              table_name,
                              code_type,
                              return_unrecognised_codes = FALSE,
                              unrecognised_codes = "error") {
  # check for unrecognised('missing') codes
  missing_codes <- subset(codes, !codes %in% lkp_codes)

  # return missing codes, if requested
  if (return_unrecognised_codes) {
    return(missing_codes)
  }

  # ...otherwise return error
  handle_unrecognised_codes(
    unrecognised_codes = unrecognised_codes,
    missing_codes = missing_codes,
    table_name = table_name,
    code_type = code_type
  )
}
rmgpanw/codemapper documentation built on Aug. 30, 2023, 4:07 p.m.