# OVERVIEW ----------------------------------------------------------------
# Functions to map between different clinical codes e.g. between Read2 and
# Read3, or Read3 and ICD-10. These rely on the code mapping file provided by UK
# Biobank (resource 592: https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=592) and
# the BNF to SNOMED mapping file from the NHSBSA website
# (https://www.nhsbsa.nhs.uk/prescription-data/understanding-our-data/bnf-snomed-mapping)
# EXPORTED FUNCTIONS ------------------------------------------------------
# Exploring and mapping clinical codes ------------------------------------
#' Get codes that start with...
#'
#' This is case \emph{sensitive} (important for read codes especially).
#'
#' @param codes character. A vector of code strings to search for matching
#' codes.
#' @param code_type character. The type of clinical code system to be searched.
#' Must be one of `r knitr::combine_words(CODE_TYPE_TO_LKP_TABLE_MAP$code, and = "or ")`.
#' @param all_lkps_maps Either a named list of lookup and mapping tables (either
#' data frames or `tbl_dbi` objects), or the path to a SQLite database
#' containing these tables (see also [build_all_lkps_maps()] and
#' [all_lkps_maps_to_db()]). If `NULL`, will attempt to connect to an SQLite
#' database named 'all_lkps_maps.db' in the current working directory, or to a
#' a SQLite database specified by an environmental variable named
#' 'ALL_LKPS_MAPS_DB' (see
#' [here](https://resources.numbat.space/using-rprofile-and-renviron.html#renviron)
#' for how to set environment variables using a `.Renviron` file). The latter
#' method will be used in preference.
#' @param codes_only bool. If \code{TRUE}, return a character vector of
#' \emph{unique} codes. If \code{FALSE} (default), return a data frame of all
#' results including code descriptions (useful for manual validation).
#' @param preferred_description_only bool. Return only preferred descriptions
#' for clinical codes with synonyms. Default value is \code{TRUE}.
#' @param col_filters A named list where each name in the list refers to the
#' name of a lookup or mapping table. Each item is also a named list, where
#' the names refer to column names in the corresponding table, and the items
#' are vectors of values to filter for. For example, `list(my_lookup_table =
#' list(colA = c("A", "B"))` will result in `my_lookup_table` being filtered
#' for rows where `colA` is either 'A' or 'B'. Uses `default_col_filters()` by
#' default. Set to `NULL` to remove all filters.
#'
#' @inheritParams lookup_codes
#' @export
#' @family Clinical code lookups and mappings
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # lookup codes
#' lookup_codes(
#' codes = c("E10", "E11"),
#' code_type = "icd10",
#' all_lkps_maps = all_lkps_maps_dummy
#' )
codes_starting_with <- function(codes,
code_type,
all_lkps_maps = NULL,
codes_only = FALSE,
preferred_description_only = TRUE,
standardise_output = TRUE,
col_filters = default_col_filters()) {
# validate args
match.arg(
arg = code_type,
choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
)
# connect to database file path if `all_lkps_maps` is a string, or `NULL`
if (is.character(all_lkps_maps)) {
con <- check_all_lkps_maps_path(all_lkps_maps)
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (is.null(all_lkps_maps)) {
if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
con <-
check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (file.exists("all_lkps_maps.db")) {
message("Attempting to connect to all_lkps_maps.db in current working directory")
con <- check_all_lkps_maps_path("all_lkps_maps.db")
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else {
stop(
"No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
)
}
}
check_codes(codes)
assertthat::assert_that(is.logical(codes_only),
msg = "`code_only` must be either 'TRUE' or 'FALSE'"
)
assertthat::assert_that(!(codes_only & standardise_output),
msg = "Error! `codes_only` and `standardise_output` cannot both be `TRUE`"
)
# TODO check all sheets are present
validate_all_lkps_maps()
# determine relevant lookup sheet
lkp_table <- get_lookup_sheet(code_type = code_type)
# determine code column for lookup sheet
code_col <- get_col_for_lookup_sheet(
lookup_sheet = lkp_table,
column = "code_col"
)
# determine relevant column indicating whether code description is preferred
# (for code types with synonymous code descriptions like read 2 and read 3)
preferred_description_col <-
get_col_for_lookup_sheet(
lookup_sheet = lkp_table,
column = "preferred_synonym_col"
)
# get preferred code, if appropriate
if (!is.na(preferred_description_col)) {
preferred_description_code <-
get_preferred_description_code_for_lookup_sheet(lookup_sheet = lkp_table)
}
# reformat codes - escape '.', prefix with anchor and append '.*'
codes <-
stringr::str_replace_all(codes, pattern = "\\.", replacement = "\\\\.")
codes <- paste0("^", codes, ".*")
# combine into single string, separated by "|"
codes <- stringr::str_c(codes, sep = "", collapse = "|")
# get children (filter for codes which match ANY of those in `codes` arg)
result <- all_lkps_maps[[lkp_table]] %>%
dplyr::collect() %>%
dplyr::filter(stringr::str_detect(.data[[code_col]],
pattern = stringr::regex(codes,
ignore_case = FALSE
)
))
# filter on `col_filters` parameters
if (!is.null(col_filters)) {
result <- filter_cols(
df = result,
df_name = lkp_table,
col_filters = col_filters
)
}
# filter for preferred code descriptions only if requested
if (preferred_description_only &
!is.na(preferred_description_col)) {
result <- result %>%
dplyr::filter(.data[[preferred_description_col]] == preferred_description_code)
}
# return result
if (nrow(result) == 0) {
message("No matching codes found. Returning `NULL`")
return(NULL)
} else {
# return either unique codes only, or df including code descriptions
if (codes_only) {
return(unique(result[[code_col]]))
} else if (standardise_output) {
# Note, not all mapping sheets in UKB resource 592 contain descriptions
# (e.g. 'read_v2_icd9'). Therefore need to use `lookup_codes` if
# `standardise_output` is `TRUE`
codes <- unique(result[[code_col]])
return(
lookup_codes(
codes = codes,
code_type = code_type,
all_lkps_maps = all_lkps_maps,
preferred_description_only = preferred_description_only,
unrecognised_codes = "error"
)
)
} else {
return(result)
}
}
}
#' Look up descriptions for clinical codes
#'
#' Returns a data frame including descriptions for the codes of interest
#'
#' @param codes character. Vector of codes to lookup
#' @param standardise_output bool. If \code{TRUE} (default), outputs a data
#' frame with columns named 'code', 'description' and 'code_type'. Otherwise
#' returns a data frame with all columns for the relevant lookup sheet from
#' (\href{https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=592}{UK Biobank
#' resource 592}).
#' @param unrecognised_codes Either 'error' (default) or 'warning'. If any input
#' `codes` are unrecognised, then either an error or warning will be raised.
#' @param .return_unrecognised_codes If `TRUE`, return a vector of unrecognised
#' codes only.
#' @inheritParams codes_starting_with
#'
#' @return data frame
#' @export
#' @family Clinical code lookups and mappings
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # look up ICD10 codes
#' lookup_codes(
#' codes = c("E10", "E11"),
#' code_type = "icd10",
#' all_lkps_maps = all_lkps_maps_dummy
#' )
lookup_codes <- function(codes,
code_type,
all_lkps_maps = NULL,
preferred_description_only = TRUE,
standardise_output = TRUE,
unrecognised_codes = "error",
col_filters = default_col_filters(),
.return_unrecognised_codes = FALSE) {
# validate args
check_codes(codes)
match.arg(
arg = code_type,
choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
)
# connect to database file path if `all_lkps_maps` is a string, or `NULL`
if (is.character(all_lkps_maps)) {
con <- check_all_lkps_maps_path(all_lkps_maps)
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (is.null(all_lkps_maps)) {
if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
con <-
check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (file.exists("all_lkps_maps.db")) {
message("Attempting to connect to all_lkps_maps.db in current working directory")
con <- check_all_lkps_maps_path("all_lkps_maps.db")
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else {
stop(
"No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
)
}
}
validate_all_lkps_maps()
# determine relevant lookup sheet
lkp_table <- get_lookup_sheet(code_type = code_type)
# determine code column for lookup sheet
code_col <- get_col_for_lookup_sheet(
lookup_sheet = lkp_table,
column = "code_col"
)
# determine description column for lookup sheet
description_col <-
get_col_for_lookup_sheet(
lookup_sheet = lkp_table,
column = "description_col"
)
# determine relevant column indicating whether code description is preferred
# (for code types with synonymous code descriptions like read 2 and read 3)
preferred_description_col <-
get_col_for_lookup_sheet(
lookup_sheet = lkp_table,
column = "preferred_synonym_col"
)
# get preferred code, if appropriate
if (!is.na(preferred_description_col)) {
preferred_description_code <-
get_preferred_description_code_for_lookup_sheet(lookup_sheet = lkp_table)
}
# lookup - filter lookup sheet for codes
result <- all_lkps_maps[[lkp_table]] %>%
dplyr::filter(.data[[code_col]] %in% codes) %>%
dplyr::collect()
# filter on `col_filters` parameters
if (!is.null(col_filters)) {
result <- filter_cols(
df = result,
df_name = lkp_table,
col_filters = col_filters
)
}
# check for unrecognised codes
missing_codes <- subset(codes, !codes %in% result[[code_col]])
if (.return_unrecognised_codes) {
# optionally return vector of unrecognised codes only
message(paste0(
"Returning unrecognised codes only. N unrecognised: ",
length(missing_codes)
))
return(missing_codes)
}
handle_unrecognised_codes(
unrecognised_codes = unrecognised_codes,
missing_codes = missing_codes,
table_name = lkp_table,
code_type = code_type
)
# filter for preferred code descriptions only if requested
if (!is.null(preferred_description_only)) {
if (preferred_description_only &
!is.na(preferred_description_col)) {
result <- result %>%
dplyr::filter(.data[[preferred_description_col]] == preferred_description_code)
}
}
# standardise output if requested
if (standardise_output) {
result <- standardise_output_fn(
result,
lkp_table = lkp_table,
code_col = code_col,
description_col = description_col,
code_type = code_type
)
}
# return result
if (nrow(result) == 0) {
message("No matching codes found. Returning `NULL`")
return(NULL)
} else {
# return either unique codes only, or df including code descriptions
return(result)
}
}
#' Search for codes that match a description
#'
#' Returns a data frame with clinical codes that match the supplied regular
#' expression. Ignores case by default.
#'
#' @param reg_expr a regular expression to search for
#' @inheritParams stringr::regex
#' @inheritParams codes_starting_with
#'
#' @return data frame by default, or a character vector of codes if
#' \code{codes_only} is \code{TRUE}.
#' @export
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # lookup ICD10 code descriptions matching 'cyst'
#' code_descriptions_like(
#' reg_expr = "cyst",
#' code_type = "icd10",
#' all_lkps_maps = all_lkps_maps_dummy
#' )
code_descriptions_like <- function(reg_expr,
code_type,
all_lkps_maps = NULL,
ignore_case = TRUE,
codes_only = FALSE,
preferred_description_only = TRUE,
standardise_output = TRUE,
col_filters = default_col_filters()) {
# validate args
check_codes(reg_expr)
match.arg(
arg = code_type,
choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
)
# connect to database file path if `all_lkps_maps` is a string, or `NULL`
if (is.character(all_lkps_maps)) {
con <- check_all_lkps_maps_path(all_lkps_maps)
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (is.null(all_lkps_maps)) {
if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
con <-
check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (file.exists("all_lkps_maps.db")) {
message("Attempting to connect to all_lkps_maps.db in current working directory")
con <- check_all_lkps_maps_path("all_lkps_maps.db")
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else {
stop(
"No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
)
}
}
validate_all_lkps_maps()
# determine relevant lookup sheet
lkp_table <- get_lookup_sheet(code_type = code_type)
# determine code and description columns for lookup sheet
code_col <- get_col_for_lookup_sheet(
lookup_sheet = lkp_table,
column = "code_col"
)
description_col <-
get_col_for_lookup_sheet(
lookup_sheet = lkp_table,
column = "description_col"
)
# determine relevant column indicating whether code description is preferred
# (for code types with synonymous code descriptions like read 2 and read 3)
preferred_description_col <-
get_col_for_lookup_sheet(
lookup_sheet = lkp_table,
column = "preferred_synonym_col"
)
# get preferred code, if appropriate
if (!is.na(preferred_description_col)) {
preferred_description_code <-
get_preferred_description_code_for_lookup_sheet(lookup_sheet = lkp_table)
}
# search for codes
result <- all_lkps_maps[[lkp_table]] %>%
dplyr::collect() %>%
dplyr::filter(stringr::str_detect(
string = .data[[description_col]],
pattern = stringr::regex(
pattern = reg_expr,
ignore_case = ignore_case
)
))
# filter on `col_filters` parameters
if (!is.null(col_filters)) {
result <- filter_cols(
df = result,
df_name = lkp_table,
col_filters = col_filters
)
}
# filter for preferred code descriptions only if requested
if (!is.null(preferred_description_only)) {
if (preferred_description_only &
!is.na(preferred_description_col)) {
result <- result %>%
dplyr::filter(.data[[preferred_description_col]] == preferred_description_code)
}
}
# standardise output, if requested
if (standardise_output) {
result <- standardise_output_fn(
result,
lkp_table = lkp_table,
code_col = code_col,
description_col = description_col,
code_type = code_type
)
code_col <- "code"
}
# return result
if (nrow(result) == 0) {
message("No matching codes found. Returning `NULL`")
return(NULL)
} else {
if (codes_only) {
return(unique(result[[code_col]]))
} else {
return(result)
}
}
}
#' Map clinical codes from one coding system to another
#'
#' Uses the code mapping file provided by UK Biobank
#' (\href{https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=592}{resource 592}).
#'
#' The values for arguments \code{from} and \code{to} must be one of
#' \code{read2}, \code{read3}, \code{icd9}, \code{icd10}, \code{bnf},
#' \code{dmd}, \code{read2_drugs} or \code{opcs4}.
#'
#' @param codes A character vector of codes to be mapped.
#' @param from Coding system that \code{codes} belong to.
#' @param to Coding system to map \code{codes} to.
#' @param unrecognised_codes Either 'error' (default) or 'warning'. If any input
#' `codes` are unrecognised for the coding system being mapped from, then
#' either an error or warning will be raised.
#' @param preferred_description_only If `TRUE`, return only preferred
#' descriptions for clinical codes with synonyms. Can only be \code{TRUE} if
#' \code{standardise_output} is also \code{TRUE}. Default value is
#' \code{NULL}.
#' @param reverse_mapping If 'error' (default), an error raised if attempting to
#' map between coding systems for which a mapping table does not exist. If
#' 'warning', will raise a warning and attempt to use an existing mapping
#' table in the opposite direction (for example, a mapping from ICD10 to Read
#' 3 would be attempted using the Read 3-to-ICD10 mapping table).
#' @inheritParams codes_starting_with
#' @inheritParams lookup_codes
#'
#' @export
#' @family Clinical code lookups and mappings
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # map codes from Read 2 to ICD10
#' map_codes(
#' codes = "G20..",
#' from = "read2",
#' to = "icd10",
#' all_lkps_maps = all_lkps_maps_dummy
#' )
map_codes <- function(codes,
from,
to,
all_lkps_maps = NULL,
codes_only = FALSE,
standardise_output = TRUE,
unrecognised_codes = "error",
preferred_description_only = NULL,
reverse_mapping = "error",
col_filters = default_col_filters()) {
# validate args
check_codes(codes)
# connect to database file path if `all_lkps_maps` is a string, or `NULL`
if (is.character(all_lkps_maps)) {
con <- check_all_lkps_maps_path(all_lkps_maps)
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (is.null(all_lkps_maps)) {
if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
con <-
check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (file.exists("all_lkps_maps.db")) {
message("Attempting to connect to all_lkps_maps.db in current working directory")
con <- check_all_lkps_maps_path("all_lkps_maps.db")
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else {
stop(
"No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
)
}
}
validate_all_lkps_maps()
assertthat::assert_that(is.logical(codes_only),
msg = "`code_only` must be either 'TRUE' or 'FALSE'"
)
assertthat::assert_that(!(codes_only & standardise_output),
msg = "Error! `codes_only` and `standardise_output` cannot both be `TRUE`"
)
if (!is.null(preferred_description_only)) {
assertthat::assert_that(!(
preferred_description_only == TRUE & standardise_output == FALSE
),
msg = "Error! `preferred_description_only` cannot be `TRUE` unless `standardise_output` is also `TRUE`"
)
}
# check mapping args and get required details - mapping_table, from_col and
# to_col
mapping_params <- check_mapping_args(
from = from,
to = to,
reverse_mapping = reverse_mapping
)
from_col <- mapping_params$from_col
to_col <- mapping_params$to_col
mapping_table <- mapping_params$mapping_table
# determine relevant column indicating whether code description is preferred
# (for code types with synonymous code descriptions like read 2 and read 3)
preferred_description_col <-
get_value_for_mapping_sheet(
mapping_table = mapping_table,
value = "preferred_synonym_col"
)
# get preferred code, if appropriate
if (!is.na(preferred_description_col)) {
preferred_description_code <-
get_value_for_mapping_sheet(
mapping_table = mapping_table,
value = "preferred_code"
)
}
# do mapping
result <- all_lkps_maps[[mapping_table]] %>%
dplyr::filter(.data[[from_col]] %in% codes) %>%
dplyr::filter(!is.na(.data[[to_col]])) %>%
dplyr::collect()
# filter on `col_filters` parameters
if (!is.null(col_filters)) {
result <- filter_cols(
df = result,
df_name = mapping_table,
col_filters = col_filters
)
}
# check for unrecognised codes
missing_codes <- subset(codes, !codes %in% result[[from_col]])
handle_unrecognised_codes(
unrecognised_codes = unrecognised_codes,
missing_codes = missing_codes,
table_name = mapping_table,
code_type = from
)
# return result
if (nrow(result) == 0) {
message("No codes found after mapping. Returning `NULL`")
return(NULL)
} else {
# return either unique codes only, or df including descriptions
if (codes_only) {
result <- unique(result[[to_col]])
return(result)
} else if (standardise_output) {
# Note, not all mapping sheets in UKB resource 592 contain descriptions
# (e.g. 'read_v2_icd9'). Therefore need to use `lookup_codes` if
# `standardise_output` is `TRUE`
codes <- unique(result[[to_col]])
return(
lookup_codes(
codes = codes,
code_type = to,
all_lkps_maps = all_lkps_maps,
preferred_description_only = preferred_description_only,
unrecognised_codes = "error"
)
)
} else {
return(result)
}
}
}
#' Get a 'from-to' mapping data frame
#'
#' Returns a data frame with 'from' and 'to' columns for a specified pair of
#' coding systems.
#'
#' @param from A clinical coding system to map from.
#' @param to A clinical coding system to map to.
#' @inheritParams codes_starting_with
#' @inheritParams map_codes
#' @param rename_from_to Optionally supply a named vector to rename the 'from'
#' and 'to' columns. For example `c(from = "original_codes", to =
#' "new_codes")`. By default, the columns will be named using the values for
#' `from` and `to` arguments.
#' @param na.rm If `TRUE` (default), remove any rows with `NA` from the returned
#' mapping data frame. The mapping tables may sometimes include `NA` values to
#' explicitly show which 'from' codes have not been mapped.
#'
#' @return A data frame with column names 'from' and 'to'.
#' @export
#'
#' @family Clinical code lookups and mappings
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # get mapping data frame for Read 2 to ICD10
#' get_mapping_df(
#' from = "read3",
#' to = "icd10",
#' all_lkps_maps = all_lkps_maps_dummy
#' )
get_mapping_df <- function(from,
to,
all_lkps_maps = NULL,
rename_from_to = NULL,
na.rm = TRUE,
reverse_mapping = "error",
col_filters = default_col_filters()) {
# validate args -----------
# get mapping sheet, from and to cols
# check mapping args and get required details - mapping_table, from_col and
# to_col
mapping_params <- check_mapping_args(
from = from,
to = to,
reverse_mapping = reverse_mapping
)
from_col <- mapping_params$from_col
to_col <- mapping_params$to_col
mapping_table <- mapping_params$mapping_table
# rename_from_to
rename_from_to_error_msg <-
"Error! `rename_from_to` should be a named character vector of length 2, with names 'from' and 'to'"
if (!is.null(rename_from_to)) {
assertthat::assert_that(is.character(rename_from_to) &&
(length(rename_from_to) == 2) &&
all(c("from", "to") %in% names(rename_from_to)),
msg = rename_from_to_error_msg
)
}
# connect to database file path if `all_lkps_maps` is a string, or `NULL`
if (is.character(all_lkps_maps)) {
con <- check_all_lkps_maps_path(all_lkps_maps)
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (is.null(all_lkps_maps)) {
if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
con <-
check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (file.exists("all_lkps_maps.db")) {
message("Attempting to connect to all_lkps_maps.db in current working directory")
con <- check_all_lkps_maps_path("all_lkps_maps.db")
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else {
stop(
"No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
)
}
}
# get just distinct combinations of from_col and to_col for mapping_table
from_to_cols <- c(
from_col,
to_col
)
result <- all_lkps_maps[[mapping_table]] %>%
dplyr::collect()
# filter on `col_filters` parameters
if (!is.null(col_filters)) {
result <- filter_cols(
df = result,
df_name = mapping_table,
col_filters = col_filters
)
}
# keep required columns only
result <- result %>%
dplyr::select(tidyselect::all_of(from_to_cols))
# remove rows with `NA` values
if (na.rm) {
result <- tidyr::drop_na(result)
}
# distinct rows only (e.g. read 2 'J5310' maps to both primary and secondary
# descriptions for read 3 'J5311')
result <- result %>%
dplyr::distinct(dplyr::across(tidyselect::everything()),
.keep_all = TRUE
)
# rename
if (!is.null(rename_from_to)) {
new_from_to_cols <- c(
rename_from_to["from"],
rename_from_to["to"]
)
} else {
new_from_to_cols <- c(
from,
to
)
}
result <- result %>%
ukbwranglr:::rename_cols(
old_colnames = from_to_cols,
new_colnames = new_from_to_cols
)
return(result)
}
# Utilities ---------------------------------------------------------------
#' Reformat ICD-10 codes
#'
#' The lookup sheet in
#' \href{https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=592}{UKB resource 592}
#' for ICD-10 ("icd10_lkp") has a column called `ALT_CODE`, which is an
#' alternative format for ICD-10 codes. This is the format used in the mapping
#' sheets for this resource, as well as in
#' \href{https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=41270}{Field ID 41270}.
#' *Note however that in contrast to the ICD10 lookup and mapping sheets,
#' undivided 3 character codes do not have an 'X' appended in the UK Biobank
#' dataset*. This function converts from one format to the other, with the
#' option to strip the final 'X' from undivided 3 character codes if converting
#' to `ALT_CODE` format.
#'
#' @param icd10_codes character vector of ICD-10 codes
#' @param input_icd10_format character. Must be either "ICD10_CODE" or
#' "ALT_CODE".
#' @param output_icd10_format character. Must be either "ICD10_CODE" or
#' "ALT_CODE".
#' @inheritParams codes_starting_with
#' @inheritParams lookup_codes
#' @param strip_x If `TRUE` and converting to `ALT_CODE` format, 'X' is removed
#' from the end of undivided 3 character codes (default is `FALSE`).
#'
#' @return character vector of ICD-10 codes, reformatted as specified by
#' \code{output_icd10_format}.
#' @export
#' @family Clinical code lookups and mappings
#' @examples
#' # build dummy all_lkps_maps
#' all_lkps_maps_dummy <- build_all_lkps_maps_dummy()
#'
#' # reformat from ICD10_CODE to ALT_CODE
#' reformat_icd10_codes(
#' icd10_codes = c("E10.9"),
#' all_lkps_maps = all_lkps_maps_dummy,
#' input_icd10_format = "ICD10_CODE",
#' output_icd10_format = "ALT_CODE"
#' )
#'
#' # reformat from ALT_CODE to ICD10_CODE
#' reformat_icd10_codes(
#' icd10_codes = c("E109"),
#' all_lkps_maps = all_lkps_maps_dummy,
#' input_icd10_format = "ALT_CODE",
#' output_icd10_format = "ICD10_CODE"
#' )
reformat_icd10_codes <- function(icd10_codes,
all_lkps_maps = NULL,
input_icd10_format = "ICD10_CODE",
output_icd10_format = "ALT_CODE",
unrecognised_codes = "error",
strip_x = FALSE) {
# validate args
match.arg(
arg = input_icd10_format,
choices = c("ICD10_CODE", "ALT_CODE")
)
match.arg(
arg = output_icd10_format,
choices = c("ICD10_CODE", "ALT_CODE")
)
assertthat::assert_that(input_icd10_format != output_icd10_format,
msg = "Error for `reformat_icd10_codes()`! Input and output icd10 formats cannot be the same"
)
if (strip_x & (output_icd10_format == "ICD10_CODE")) {
stop("`strip_x` can only be `TRUE` if `output_icd10_format` is 'ALT_CODE'")
}
# connect to database file path if `all_lkps_maps` is a string, or `NULL`
if (is.character(all_lkps_maps)) {
con <- check_all_lkps_maps_path(all_lkps_maps)
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (is.null(all_lkps_maps)) {
if (Sys.getenv("ALL_LKPS_MAPS_DB") != "") {
message(paste0("Attempting to connect to ", Sys.getenv("ALL_LKPS_MAPS_DB")))
con <-
check_all_lkps_maps_path(Sys.getenv("ALL_LKPS_MAPS_DB"))
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else if (file.exists("all_lkps_maps.db")) {
message("Attempting to connect to all_lkps_maps.db in current working directory")
con <- check_all_lkps_maps_path("all_lkps_maps.db")
all_lkps_maps <- ukbwranglr::db_tables_to_list(con)
on.exit(DBI::dbDisconnect(con))
} else {
stop(
"No/invalid path supplied to `all_lkps_maps` and no file called 'all_lkps_maps.db' found in current working directory. See `?all_lkps_maps_to_db()`"
)
}
}
validate_all_lkps_maps()
icd10_mapping_df <- all_lkps_maps$icd10_lkp %>%
dplyr::filter(.data[[input_icd10_format]] %in% icd10_codes) %>%
dplyr::select(tidyselect::all_of(c(
"ICD10_CODE",
"ALT_CODE"
))) %>%
dplyr::collect()
# handle any unrecognised codes
missing_codes <-
subset(
icd10_codes,
!icd10_codes %in% icd10_mapping_df[[input_icd10_format]]
) %>%
unique()
handle_unrecognised_codes(
unrecognised_codes = unrecognised_codes,
missing_codes = missing_codes,
table_name = "icd10_lkp",
code_type = input_icd10_format
)
# Note: some ICD10_CODE values have multiple associated ALT_CODEs - these
# include a modifier description in `MODIFIER_5` (e.g. ICD-10 codes M00,
# M77, M07, M72, M65, S52 or S72). The number of output codes may therefore
# be larger/smaller than the input number. Raise informative message if this
# is is the case.
non_unique_icd10_codes <- icd10_mapping_df %>%
dplyr::count(.data[["ICD10_CODE"]]) %>%
dplyr::filter(.data[["n"]] > 1) %>%
dplyr::pull(.data[["ICD10_CODE"]])
if (length(non_unique_icd10_codes) > 0) {
input_icd10_not_1_to_1_mapping <- icd10_mapping_df %>%
dplyr::filter(.data[["ICD10_CODE"]] %in% non_unique_icd10_codes) %>%
dplyr::pull(.data[[input_icd10_format]]) %>%
unique()
more_or_fewer_returned_codes <-
ifelse(input_icd10_format == "ICD10_CODE",
yes = "*more*",
no = "*fewer*"
)
message(
"The following ",
length(input_icd10_not_1_to_1_mapping),
" input ICD10 codes do not have a 1-to-1 ICD10_CODE-to-ALT_CODE mapping: '",
stringr::str_c(
input_icd10_not_1_to_1_mapping,
sep = "",
collapse = "', '"
),
"'. There will therefore be ",
more_or_fewer_returned_codes,
" output than input codes"
)
}
# get requested icd10 format
result <- unique(icd10_mapping_df[[output_icd10_format]])
# optionally remove appended 'X' for undivided 3 character codes in `ALT_CODE` format
if (strip_x & (output_icd10_format == "ALT_CODE")) {
message("Removing 'X' from any undivided 3 character ICD10 codes")
result <- stringr::str_remove(
result,
"X$"
)
}
# return result
return(result)
}
#' Default filtering parameters for lookup and mapping tables.
#'
#' To be used as `col_filters` argument in 'Clinical code lookups and mappings'
#' functions. Returns a named list where each name in the list refers to the
#' name of a lookup or mapping table. Each item is also a named list, where the
#' names refer to column names in the corresponding table, and the items are
#' vectors of values to filter for.
#'
#' @return A named list.
#' @export
#'
#' @family Clinical code lookups and mappings
#' @examples
#' default_col_filters()
default_col_filters <- function() {
list(
read_ctv3_icd10 = list(
mapping_status = c("E", "G", "D"),
refine_flag = c("C", "P"),
element_num = c("0"),
block_num = c("0")
),
read_v2_icd10 = list(icd10_code_def = c("1", "15", "3", "5", "7", "8")),
read_ctv3_read_v2 = list(IS_ASSURED = "1"),
read_v2_read_ctv3 = list(IS_ASSURED = "1")
)
}
# PRIVATE FUNCTIONS -------------------------------------------------------
#' Helper function for \code{\link{map_codes}}
#'
#' Returns name of the appropriate mapping sheet from the UKB code mappings
#' excel file (resource 592) for mapping from one clinical coding system to
#' another.
#'
#' @param from character
#' @param to character
#'
#' @return character (scalar)
#' @noRd
#' @family Clinical code lookups and mappings
get_from_to_mapping_sheet <- function(from, to) {
CLINICAL_CODE_MAPPINGS_MAP[(CLINICAL_CODE_MAPPINGS_MAP[["from"]] == from &
CLINICAL_CODE_MAPPINGS_MAP[["to"]] == to), ][["mapping_table"]]
}
#' Helper function for \code{\link{map_codes}}
#'
#' Returns the requested value for a 'mapping_table' in
#' \code{CLINICAL_CODE_MAPPINGS_MAP}.
#'
#' @param mapping_table character
#' @param value character. column name from
#' \code{CLINICAL_CODE_MAPPINGS_MAP} (apart from
#' "mapping_table").
#'
#' @return character (scalar)
#' @noRd
#' @family Clinical code lookups and mappings
get_value_for_mapping_sheet <- function(mapping_table,
value) {
# validate args
match.arg(
arg = mapping_table,
choices = CLINICAL_CODE_MAPPINGS_MAP[["mapping_table"]]
)
match.arg(
arg = value,
choices = subset(
names(CLINICAL_CODE_MAPPINGS_MAP),
subset = names(CLINICAL_CODE_MAPPINGS_MAP) != "mapping_table"
)
)
# return specified `value`
CLINICAL_CODE_MAPPINGS_MAP[CLINICAL_CODE_MAPPINGS_MAP[["mapping_table"]] == mapping_table, ][[value]]
}
#' Get name of lookup sheet for a clinical code system
#'
#' Helper function for \code{\link{lookup_codes}} and \code{\link{codes_starting_with}}
#'
#' @param code_type character
#'
#' @return character (scalar)
#' @noRd
#' @family Clinical code lookups and mappings
get_lookup_sheet <- function(code_type) {
# validate args
match.arg(code_type,
choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
)
# get lookup sheet
CODE_TYPE_TO_LKP_TABLE_MAP %>%
dplyr::filter(.data[["code"]] == code_type) %>%
.$lkp_table
}
#' Get name of code, description or preferred synonym column for a lookup sheet
#'
#' Helper function for \code{\link{lookup_codes}} and \code{\link{codes_starting_with}}
#'
#' @param lookup_sheet character
#' @param column character
#'
#' @return character (scalar)
#'
#' @family Clinical code lookups and mappings
#' @noRd
get_col_for_lookup_sheet <- function(lookup_sheet,
column) {
# validate args
match.arg(
arg = lookup_sheet,
choices = CODE_TYPE_TO_LKP_TABLE_MAP$lkp_table
)
match.arg(
arg = column,
choices = c("code_col", "description_col", "preferred_synonym_col")
)
# get column name for lookup sheet
CODE_TYPE_TO_LKP_TABLE_MAP %>%
dplyr::filter(.data[["lkp_table"]] == lookup_sheet) %>%
.[[column]]
}
#' Get preferred description code for a lookup sheet
#'
#' Helper function for \code{\link{lookup_codes}} and \code{\link{codes_starting_with}}
#'
#' @param lookup_sheet character
#'
#' @return character (scalar)
#'
#' @family Clinical code lookups and mappings
#' @noRd
get_preferred_description_code_for_lookup_sheet <-
function(lookup_sheet) {
# validate args
match.arg(
arg = lookup_sheet,
choices = CODE_TYPE_TO_LKP_TABLE_MAP$lkp_table
)
# get preferred description code for lookup sheet
CODE_TYPE_TO_LKP_TABLE_MAP %>%
dplyr::filter(.data[["lkp_table"]] == lookup_sheet) %>%
.[["preferred_code"]]
}
#' Reformat a dataframe of clinical codes to work with
#' \code{\link[ukbwranglr]{extract_phenotypes}}
#'
#' A utility function that helps reformat the output from \code{\link{map_codes}}
#' or \code{\link{lookup_codes}} to work with
#' \code{\link[ukbwranglr]{extract_phenotypes}}. See also output
#' from \code{\link[ukbwranglr]{example_clinical_codes}} for an example of
#' the format that this function will output.
#'
#' @param standardised_codelist a data frame with column names "code",
#' "description", "code_type".
#' @param code_type character (scalar). The clinical code type e.g. "read2"
#' @param disease character (scalar), e.g. "Secondary polycythaemia"
#' @param disease_category character (scalar). The subcategory of \code{disease}
#' that these codes belong to e.g. "Diagnosis of Secondary polycythaemia".
#' @param author character (scalar), e.g. "caliber".
#'
#' @return A data frame with the following column names: 'disease',
#' 'description', 'category', 'code_type', 'code' and 'author'.
#' @noRd
reformat_standardised_codelist <- function(standardised_codelist,
code_type,
disease,
disease_category,
author) {
# validate args
assertthat::assert_that(is.data.frame(standardised_codelist),
msg = "Error! standardised_codelist must be a data frame (or tibble/data table"
)
assertthat::is.string(code_type)
assertthat::is.string(disease)
assertthat::is.string(disease_category)
assertthat::is.string(author)
match.arg(code_type,
choices = ukbwranglr:::CLINICAL_EVENTS_SOURCES$data_coding
)
assertthat::assert_that(all(
names(standardised_codelist) == c("code", "description", "code_type")
),
msg = "Error! `standardised_codelist` must be a data frame with the following headings: 'code', 'description', 'code_type'"
)
assertthat::assert_that(
all(
standardised_codelist$code_type %in% unique(ukbwranglr:::CLINICAL_EVENTS_SOURCES$data_coding)
),
msg = paste0(
"Error! `standardised_codelist$code_type` contains unrecognised code types. Recognised code types: ",
stringr::str_c(
unique(ukbwranglr:::CLINICAL_EVENTS_SOURCES$data_coding),
sep = "",
collapse = ", "
)
)
)
# reformat to work with `extract_phenotypes()`
standardised_codelist <- standardised_codelist %>%
dplyr::mutate(
"code_type" = code_type,
"disease" = disease,
"category" = disease_category,
"author" = author,
) %>%
dplyr::select(
.data[["disease"]],
.data[["description"]],
.data[["category"]],
.data[["code_type"]],
.data[["code"]],
.data[["author"]]
)
return(standardised_codelist)
}
standardise_output_fn <-
function(df,
lkp_table,
code_col,
description_col,
code_type) {
names(df)[which(names(df) == code_col)] <- "code"
names(df)[which(names(df) == description_col)] <- "description"
# Some ICD-10 descriptions include a modifier e.g. "E10" = "Type 1 diabetes
# mellitus", whereas "E10.0" = "Type 1 diabetes mellitus with coma". "With
# coma" is contained in the modifier columns "MODIFIER-4". See 'S27' for an
# example code where additional description is contained in the "MODIFER-5"
# column. The returned "description" column from `standardise_output == TRUE`
# therefore combines the 'DESCRIPTION' column with one of these 2 columns
# (whichever is not NA). There are no codes with a modifier description in
# both "MODIFIER_4" and "MODIFIER_5".
if (lkp_table == "icd10_lkp") {
df$description <- dplyr::case_when(
!is.na(df$MODIFIER_4) ~ paste(df$description, df$MODIFIER_4),
!is.na(df$MODIFIER_5) ~ paste(df$description, df$MODIFIER_5),
TRUE ~ df$description
)
}
# return code, description and code_type cols only
df <- df[c("code", "description")]
df[["code_type"]] <- code_type
return(df)
}
#' Filter lookup/mapping table for specified values in columns
#'
#' Helper function that enables filtering of lookup/mapping tables for certain
#' values (e.g. filter a mapping table for only 'exact' code mappings). Uses an
#' `%in%` filter statement. Note that no error is currently raised if attempting
#' to filter for any values that do not exist in `df` columns.
#'
#' @param df Lookup or mapping df to be filtered
#' @param df_name Name of lookup/mapping df (e.g. "icd10_lkp").
#' @param col_filters Either `NULL` (default, in which case `df` is returned
#' unchanged) or a named list. First level names are names of lookup/mapping
#' tables. Each item is also a named list of vectors. Columns in `df` that
#' match the list names are filtered for values in the corresponding vectors
#' (using `%in%`). An error is raised
#'
#' @return A dataframe
#' @noRd
filter_cols <- function(df,
df_name,
col_filters = NULL) {
# if `col_filters` is `NULL`, return `df` unchanged (exit early)
if (is.null(col_filters)) {
return(df)
}
# get relevant columns/filter values from `col_filters`
col_filters <- col_filters[[df_name]]
# if `df_name` is not present in `names(col_filters)` return `df` unchanged (exit early)
if (is.null(col_filters)) {
return(df)
}
# check that selected element of `col_filters` is a named list of vectors
stopifnot(is.list(col_filters))
if (is.null(names(col_filters))) {
stop("Each item in `col_filters` must be named")
}
if (any(names(col_filters) == "")) {
stop("Each item in `col_filters` must be named")
}
col_filters_item_types <- col_filters %>%
purrr::map_lgl(is.vector)
assertthat::assert_that(sum(!col_filters_item_types) == 0,
msg = "Each item in `col_filters` must be a vector"
)
# check that column names exist in df. Raise error is any are unrecognised.
unrecognised_colnames <-
subset(names(col_filters), !names(col_filters) %in% names(df))
if (length(unrecognised_colnames) > 0) {
stop(
paste0(
"The following ",
length(unrecognised_colnames),
" column names specified by `col_filters` are not present in `",
df_name,
"`: ",
stringr::str_c(
unrecognised_colnames,
sep = "",
collapse = ", "
)
),
call. = FALSE
)
}
# filter `df` for specified values in each column listed by `col_filters`
for (i in names(col_filters)) {
col_filter_values <- col_filters[[i]]
# check that type matches
df_col_class <- class(df[[i]])
col_filter_values_class <- class(col_filter_values)
assertthat::assert_that(
df_col_class %in% col_filter_values_class,
msg = paste0(
"Cannot filter column ",
i,
" in ",
df_name,
" as classes do not match. Column `",
i,
"`` is class ",
df_col_class,
", but filter values specified by `col_filters` are of class ",
col_filter_values_class
)
)
df <- df %>%
dplyr::filter(.data[[i]] %in% !!col_filter_values)
}
# return result
return(df)
}
#' Remove or extract appended ICD10 letters
#'
#' @param icd10_codes Character vector of ICD10 codes
#' @param rm_extract Either 'rm' (remove 'D'/'X'/'A' from end of codes) or
#' 'extract' (return these characters only, or `NA` if the code does not end
#' with letter characters).
#'
#' @return Character vector
#' @noRd
#' @examples
#' rm_or_extract_appended_icd10_letters(c(
#' "A00",
#' "A408",
#' "A390D",
#' "A38X",
#' "G01XA"
#' ))
rm_or_extract_appended_icd10_dxa <- function(icd10_codes,
keep_x = TRUE,
rm_extract = "rm") {
# validate args
match.arg(rm_extract,
choices = c("rm", "extract")
)
stopifnot(is.character(icd10_codes))
# either remove or extract appended 'D'/'X'/'A' from `icd10_codes`
pattern <- ifelse(keep_x,
yes = "[D|A]*$",
no = "[D|X|A]*$"
)
switch(rm_extract,
rm = stringr::str_remove(icd10_codes,
pattern = pattern
),
extract = stringr::str_extract(icd10_codes,
pattern = pattern
) %>%
ifelse(. == "",
yes = NA_character_,
no = .
)
)
}
#' Get a mapping table for ICD10 codes in ALT_CODE format, with and without 'X'
#' appended for undivided 3 character codes
#'
#' @param icd10_lkp The lookup table for ICD10 codes, `icd10_lkp`.
#' @param undivided_3char_only If `TRUE` return only undivided 3 character ICD10
#' codes. Default is `FALSE`.
#' @param as_named_list If `NULL`, returns a data frame with columns `ALT_CODE`
#' and `ALT_CODE_minus_x`. If 'names_no_x' or 'names_with_x', returns a named
#' list with either `ALT_CODE` or `ALT_CODE_minus_x` set as names (and the
#' other as values) respectively.
#'
#' @return A data frame or a named list (see argument `as_named_list`).
#' @noRd
get_icd10_code_alt_code_x_map <- function(icd10_lkp,
undivided_3char_only = FALSE,
as_named_list = NULL) {
# validate args
match.arg(as_named_list,
choices = c("names_no_x", "names_with_x")
)
# make mapping df
icd10_lkp_alt_x_map <- icd10_lkp %>%
dplyr::select(.data[["ALT_CODE"]]) %>%
dplyr::filter(!is.na(.data[["ALT_CODE"]])) %>%
dplyr::collect() %>%
dplyr::mutate("ALT_CODE_minus_x" = stringr::str_remove(.data[["ALT_CODE"]],
pattern = "X$"
))
# check that all codes are unique
stopifnot(
length(icd10_lkp_alt_x_map$ALT_CODE) == dplyr::n_distinct(icd10_lkp_alt_x_map$ALT_CODE)
)
stopifnot(
length(icd10_lkp_alt_x_map$ALT_CODE_minus_x) == dplyr::n_distinct(icd10_lkp_alt_x_map$ALT_CODE_minus_x)
)
# check that all codes are the same comparing `ALT_CODE` and
# `ALT_CODE_minus_x`, after removing the X in `ALT_CODE`
stopifnot(all(
icd10_lkp_alt_x_map$ALT_CODE_minus_x == stringr::str_remove(icd10_lkp_alt_x_map$ALT_CODE,
pattern = "X$"
)
))
if (undivided_3char_only) {
icd10_lkp_alt_x_map <- icd10_lkp_alt_x_map %>%
dplyr::filter(.data[["ALT_CODE_minus_x"]] != .data[["ALT_CODE"]])
}
# convert to named list
if (!is.null(as_named_list)) {
icd10_lkp_alt_x_map <- switch(as_named_list,
names_no_x = icd10_lkp_alt_x_map %>%
tidyr::pivot_wider(
names_from = "ALT_CODE_minus_x",
values_from = "ALT_CODE"
) %>%
as.list(),
names_with_x = icd10_lkp_alt_x_map %>%
tidyr::pivot_wider(
names_from = "ALT_CODE",
values_from = "ALT_CODE_minus_x"
) %>%
as.list()
)
}
# return result
return(icd10_lkp_alt_x_map)
}
#' Get a vector of ICD10 codes in ALT_CODE format for a specified start/end
#' range of ICD10 codes
#'
#' Note that `start_icd10_code` and `end_icd10_code` must be of the same length,
#' unless one ends with 'X'. For example, expanding the range 'A80-A81' is
#' equivalent to expanding both 'A800-A809' and 'A810-A819'.
#'
#' @param start_icd10_code String
#' @param end_icd10_code String
#' @param icd10_lkp The ICD10 lookup table. Must have a `.rowid` column.
#'
#' @noRd
#' @return A character vector of
get_icd10_code_range <- function(start_icd10_code,
end_icd10_code,
icd10_lkp) {
# validate args
stopifnot(is.character(start_icd10_code))
stopifnot(is.character(end_icd10_code))
assertthat::assert_that(
stringr::str_length(stringr::str_remove(
start_icd10_code,
"X$"
)) == stringr::str_length(stringr::str_remove(
end_icd10_code,
"X$"
)),
msg = paste0(
"`start_icd10_code` and `end_icd10_code` must have the same number of characters. Start/end values provided: ",
start_icd10_code,
" and ",
end_icd10_code
)
)
check_rowid_col_present(
df = icd10_lkp,
df_name = "icd10_lkp"
)
check_codes_exist(
codes = c(start_icd10_code, end_icd10_code),
lkp_codes = icd10_lkp$ALT_CODE,
code_type = "icd10",
table_name = "icd10_lkp",
return_unrecognised_codes = FALSE
)
# get start and end row indices
start_rowid <- icd10_lkp %>%
dplyr::filter(.data[["ALT_CODE"]] == !!start_icd10_code) %>%
dplyr::pull(.data[[".rowid"]])
end_rowid <- icd10_lkp %>%
dplyr::filter(.data[["ALT_CODE"]] == !!end_icd10_code) %>%
dplyr::pull(.data[[".rowid"]])
# check start/end row indices are scalar, and create range of row index integers
assertthat::is.number(start_rowid)
assertthat::is.number(end_rowid)
icd10_lkp_rowids <- start_rowid:end_rowid
# filter for selected row index integers
result <- icd10_lkp %>%
dplyr::filter(.data[[".rowid"]] %in% icd10_lkp_rowids) %>%
dplyr::pull(.data[["ALT_CODE"]])
pattern <- stringr::str_c(paste0("^", result),
sep = "",
collapse = "|"
)
# expand (e.g. for 'A80-A81', at this stage all 'A80' should be present
# ('A800-A809'), but for 'A81', only 'A81' wil be present - needs expanding
# to 'A801-A819')
result <- icd10_lkp %>%
dplyr::filter(stringr::str_detect(.data[["ALT_CODE"]],
pattern = pattern
)) %>%
dplyr::pull(.data[["ALT_CODE"]])
return(result)
}
#' Expand a `read_v2_icd10` data frame containing code ranges in the
#' `icd10_code` column
#'
#' To be used within `reformat_read_v2_icd10()`.
#'
#' @param read_v2_icd10 Data frame
#' @param icd10_lkp Data frame
#' @param icd10_lkp_alt_x_map A named list of undivided 3 character ICD10 codes,
#' where the names do not have an 'X' appended and the values do.
#'
#' @return A data frame
#' @noRd
expand_icd10_ranges <- function(read_v2_icd10,
icd10_lkp,
icd10_lkp_alt_x_map) {
# validate args
assertthat::assert_that(all(
c(
"read_code",
"icd10_code",
"icd10_code_def"
) %in% names(read_v2_icd10)
),
msg = "Unexpected column names in `read_v2_icd10`"
)
# separate by '-'
read_v2_icd10 <- read_v2_icd10 %>%
tidyr::separate(
.data[["icd10_code"]],
into = c("start_icd10_code", "end_icd10_code"),
sep = "-",
remove = FALSE,
fill = "right"
)
read_v2_icd10 <- read_v2_icd10 %>%
dplyr::mutate("start_icd10_code" = ifelse(is.na(.data[["end_icd10_code"]]),
yes = NA_character_,
no = .data[["start_icd10_code"]]
))
# strip any appended 'D/X/A' (last character(s) e.g. 'A89X' and 'A170D' become
# 'A89' and 'A170'. 'G01XA' would become 'G01', although note that this code
# does not appear together with a '-')
read_v2_icd10 <- read_v2_icd10 %>%
dplyr::mutate(dplyr::across(
tidyselect::all_of(c(
"start_icd10_code",
"end_icd10_code"
)),
~ rm_or_extract_appended_icd10_dxa(
icd10_codes = .x,
keep_x = TRUE,
rm_extract = "rm"
)
))
# Make sure undivided 3 character ICD10 codes have an 'X' appended
read_v2_icd10 <- read_v2_icd10 %>%
dplyr::mutate(dplyr::across(
tidyselect::all_of(c(
"start_icd10_code",
"end_icd10_code"
)),
~ dplyr::recode(
.x,
!!!icd10_lkp_alt_x_map
)
))
# expand range
read_v2_icd10 %>%
dplyr::rowwise() %>%
dplyr::mutate(icd10_range_new = ifelse(
is.na(.data[["start_icd10_code"]]),
yes = list(NA_character_),
no = list(
get_icd10_code_range(
start_icd10_code = .data[["start_icd10_code"]],
end_icd10_code = .data[["end_icd10_code"]],
icd10_lkp = icd10_lkp
)
)
)) %>%
dplyr::ungroup() %>%
tidyr::unnest(cols = "icd10_range_new") %>%
dplyr::mutate("icd10_code" = ifelse(is.na(.data[["icd10_range_new"]]),
yes = .data[["icd10_code"]],
no = .data[["icd10_range_new"]]
)) %>%
dplyr::select(-tidyselect::all_of(c(
"start_icd10_code",
"end_icd10_code",
"icd10_range_new"
)))
}
## Validation helpers ---------------------------
check_codes <- function(codes) {
assertthat::assert_that(is.character(codes),
msg = "Error! `codes` must be a character vector"
)
assertthat::assert_that(sum(is.na(codes)) == 0,
msg = "Error! `codes` cannot contain `NA` values"
)
}
check_mapping_args <- function(from,
to,
reverse_mapping = "error") {
match.arg(reverse_mapping,
choices = c("error", "warning")
)
match.arg(
arg = from,
choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
)
# choices = CLINICAL_CODE_MAPPINGS_MAP$from)
match.arg(
arg = to,
choices = CODE_TYPE_TO_LKP_TABLE_MAP$code
)
# choices = CLINICAL_CODE_MAPPINGS_MAP$to)
assertthat::assert_that(!from == to,
msg = "Error! `from` and `to` args cannot be the same"
)
# get appropriate mapping sheet
swap_mapping_cols <- FALSE
mapping_table <- get_from_to_mapping_sheet(from = from, to = to)
# if above returns `character(0)`, try to map the other way
if (rlang::is_empty(mapping_table)) {
swap_mapping_cols <- TRUE
mapping_table <- get_from_to_mapping_sheet(from = to, to = from)
}
# if still returns `character(0)`, error
if (rlang::is_empty(mapping_table)) {
stop("Error! Invalid (or unavailable) code mapping request")
} else if (swap_mapping_cols) {
switch(reverse_mapping,
error = stop("No mapping sheet available for this request"),
warning = warning(
"No mapping sheet available for this request. Attempting to map anyway using: ",
mapping_table
)
)
}
# get from_col and to_col column names for mapping sheet
# swap if appropriate
if (swap_mapping_cols) {
from_col <-
get_value_for_mapping_sheet(
mapping_table = mapping_table,
value = "to_col"
)
to_col <-
get_value_for_mapping_sheet(
mapping_table = mapping_table,
value = "from_col"
)
} else {
from_col <-
get_value_for_mapping_sheet(
mapping_table = mapping_table,
value = "from_col"
)
to_col <-
get_value_for_mapping_sheet(
mapping_table = mapping_table,
value = "to_col"
)
}
# return result
return(list(
from_col = from_col,
to_col = to_col,
mapping_table = mapping_table
))
}
check_all_lkps_maps_path <- function(file_path) {
# check file exists
assertthat::assert_that(file.exists(file_path),
msg = paste0(
"Error! No file found at ",
file_path
)
)
# check file ends with '.db'
assertthat::assert_that(stringr::str_detect(
file_path,
".+\\.db"
))
# return con object if tests pass
DBI::dbConnect(RSQLite::SQLite(), file_path)
}
check_rowid_col_present <- function(df,
df_name) {
assertthat::assert_that(".rowid" %in% names(df),
msg = paste0(
"'.rowid' column not present in `",
df_name,
"`"
)
)
assertthat::assert_that(
class(df[[".rowid"]]) == "integer",
msg = paste0(
"'.rowid' column in `",
df_name,
"should be class 'integer', not '",
class(df[[".rowid"]]),
"'"
)
)
}
#' Helper function - raise error or warning if unrecognised codes are present
#'
#' Raises an error or warning for unrecognised codes.
#'
#' @param unrecognised_codes Either 'error' or 'warning'. Determines how to
#' handle unrecognised codes
#' @param missing_codes character vector of unrecognised codes.
#' @param code_type The type of clinical coding system
#' @param table_name Name of lookup/mapping table from which codes are missing
#'
#' @return Called for side effects.
#' @noRd
#' @family Clinical code lookups and mappings
handle_unrecognised_codes <-
function(unrecognised_codes,
missing_codes,
table_name,
code_type) {
match.arg(unrecognised_codes,
choices = c("error", "warning")
)
# make sure missing_codes are unique, if not already
missing_codes <- unique(missing_codes)
# only display first 25 codes
if (length(missing_codes) > 25) {
missing_codes_to_print <- utils::head(missing_codes,
n = 25
)
} else {
missing_codes_to_print <- missing_codes
}
missing_codes_to_print <- stringr::str_c(missing_codes_to_print,
sep = "",
collapse = "', '"
)
if (length(missing_codes) > 25) {
missing_codes_to_print <- paste0(
missing_codes_to_print,
" (first 25 only shown)"
)
}
if (length(missing_codes) > 0) {
missing_codes_message <- paste0(
"The following ",
length(missing_codes),
" codes were not found for '",
code_type,
"' in table '",
table_name,
"': '",
stringr::str_c(missing_codes_to_print, sep = "", collapse = "', '"),
"'"
)
switch(unrecognised_codes,
error = stop(missing_codes_message),
warning = warning(missing_codes_message)
)
}
}
#' Utility function - check if all of a set of codes are recognised
#'
#' @param codes Codes being checked
#' @param lkp_codes Character vector of lookup codes. Any `codes` not present in
#' `lkp_codes` are considered to be unrecognised.
#' @param code_type String
#' @param return_unrecognised_codes If `TRUE`, return a character vector of
#' unrecognised codes. Default is `FALSE`.check
#' @param unrecognised_codes Either 'error' or 'warning'. Determines how to
#' handle unrecognised codes.
#' @param table_name Name of lookup/mapping table from which `lkp_codes` was
#' obtained.
#'
#' @noRd
#' @return Return vector of unrecognised codes if `return_missing_codes` is
#' `TRUE`, otherwise called for side effect (error if any unrecognised codes)
check_codes_exist <- function(codes,
lkp_codes,
table_name,
code_type,
return_unrecognised_codes = FALSE,
unrecognised_codes = "error") {
# check for unrecognised('missing') codes
missing_codes <- subset(codes, !codes %in% lkp_codes)
# return missing codes, if requested
if (return_unrecognised_codes) {
return(missing_codes)
}
# ...otherwise return error
handle_unrecognised_codes(
unrecognised_codes = unrecognised_codes,
missing_codes = missing_codes,
table_name = table_name,
code_type = code_type
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.