# these are taken from 'icd', which is where they are derived
re_icd9_decimal_strict_bare <- "[[:digit:]]{3}(?:\\.(?:[[:digit:]]{1,2})?)?|V(?:0[1-9]|[1-9][[:digit:]])(?:\\.(?:[[:digit:]]{1,2})?)?|E[[:digit:]]{3}(?:\\.(?:[[:digit:]]{1})?)?" # nolint
re_icd9_decimal_bare <- "[[:digit:]]{1,3}(?:\\.(?:[[:digit:]]{1,2})?)?|[Vv](?:0[1-9]|[1-9][[:digit:]]?)(?:\\.(?:[[:digit:]]{1,2})?)?|[Ee][[:digit:]]{1,3}(?:\\.(?:[[:digit:]]{1})?)?" # nolint
re_icd9_major_strict_bare <- "(?:[[:digit:]]{3}|V(?:0[1-9]|[1-9][[:digit:]])|E[[:digit:]]{3})" # nolint
re_icd9_major_bare <- "(?:[[:digit:]]{1,3}|[Vv](?:0[1-9]|[1-9][[:digit:]]?)|[Ee][[:digit:]]{1,3})" # nolint
re_icd10_major_bare <- "[[:alpha:]][[:digit:]][[:alnum:]]"
#' Fetch RTF for a given year
#'
#' Will return NULL if offline and not available
#' @param year character vector of length one, e.g. "2011"
#' @param offline single logical value
#' @keywords internal datagen
#' @noRd
.dl_icd9cm_rtf_year <- function(year, ...) {
year <- as.character(year)
rtf_dat <- .icd9cm_sources[.icd9cm_sources$f_year == year, ]
fn <- rtf_dat$rtf_filename
url <- rtf_dat$rtf_url
.unzip_to_data_raw(
url = url,
save_name = .get_versioned_raw_file_name(fn, ver = year),
file_name = fn,
...
)
}
#' parse RTF description of entire ICD-9-CM for a specific year
#'
#' Currently only the most recent update is implemented. Note that CMS have
#' published additional ICD-9-CM billable code lists since the last one from the
#' CDC: I think these have been the same every year since 2011, though. The last
#' CDC release is \code{Dtab12.rtf} from 2011.
#'
#' The file itself is 7 bit ASCII, but has its own internal encoding using
#' 'CP1252.' Test 'Meniere's' disease with lines 24821 to 24822 from 2012 RTF
#' @param year from 1996 to 2012 (this is what CDC has published). Only 2012
#' implemented thus far
#' @template save_data
#' @template verbose
#' @template offline
#' @source \url{https://www.cdc.gov/nchs/icd/icd9cm.htm} Navigate to
#' 'Dtab12.zip' in the 2011 data. and similar files run from 1996 to 2011.
#' @keywords internal datagen
#' @noRd
.parse_icd9cm_rtf_year <- function(year = "2014",
...,
save_pkg_data = FALSE,
verbose = FALSE,
offline = .offline()) {
year <- as.character(year)
stopifnot(is.logical(save_pkg_data), length(save_pkg_data) == 1)
stopifnot(is.logical(verbose), length(verbose) == 1)
stopifnot(is.logical(offline), length(offline) == 1)
f_info_rtf <- .dl_icd9cm_rtf_year(year, offline = offline)
if (is.null(f_info_rtf)) {
stop("RTF data for year ", year, " unavailable.")
}
fp <- f_info_rtf$file_path
fp_conn <- file(fp, encoding = "ASCII")
on.exit(close(fp_conn))
rtf_lines <- readLines(fp_conn, warn = FALSE, encoding = "ASCII")
out <- .rtf_parse_lines(rtf_lines,
verbose = verbose,
...,
save_pkg_data = save_pkg_data
)
out <- icd::as.icd9cm(.swap_names_vals(out))
out_df <- data.frame(
code = icd::as.icd9cm(icd::decimal_to_short(unname(out))),
desc = names(out),
stringsAsFactors = FALSE
)
out_df <- out_df[order.icd9(out), ]
invisible(.lookup_icd9_hier(out_df, short_code = TRUE))
}
.lookup_icd9_hier <- function(x,
year = "2014",
short_code = TRUE,
verbose = .verbose()) {
dat <- x
if (!short_code) dat$code <- icd::decimal_to_short(dat$code)
dat$three_digit <- .get_major.icd9(dat$code)
if (verbose) message("Generating sub-chapter lookup for year: ", year)
sc_lookup <- .icd9_generate_subchap_lookup()
# if (x[1, 1] == "418") print(sc_lookup[380:390,])
mismatch_sub_chap <-
dat$three_digit[which(dat$three_digit %nin% sc_lookup$sc_major)]
if (length(mismatch_sub_chap) != 0L) {
stop(
"mismatch: ",
paste(mismatch_sub_chap, collapse = ", ")
)
}
mj_lookup <- data.frame(
three_digit = unname(icd9_majors),
mj_major = names(icd9_majors)
)
dat[["major"]] <-
merge(
x = dat["three_digit"],
y = mj_lookup,
by.x = "three_digit",
by.y = "three_digit",
all.x = TRUE
)[["mj_major"]]
dat[["sub_chapter"]] <-
merge(
x = dat["three_digit"],
y = sc_lookup,
by.x = "three_digit",
by.y = "sc_major",
all.x = TRUE
)[["sc_desc"]]
if (verbose) message("Generating chap lookup for year: ", year)
chap_lookup <- .icd9_generate_chap_lookup()
dat[["chapter"]] <-
merge(dat["three_digit"], chap_lookup,
by.x = "three_digit", by.y = "chap_major",
all.x = TRUE
)[["chap_desc"]]
# levels specified to keep ICD-9 ordering 0-9VE?
dat[["three_digit"]] <- factor(dat[["three_digit"]],
levels = unique(dat[["three_digit"]])
)
class(dat[["three_digit"]]) <- c("icd9cm", "icd9", "factor")
dat
}
# just wrap the new function in the old name
.icd9_get_chapters <- function(x, short_code = TRUE) {
dframe <- data.frame(
code = x,
stringsAsFactors = FALSE
)
.lookup_icd9_hier(dframe,
short_code = short_code
)[-1]
}
.rtf_pre_filter <- function(filtered, ...) {
stopifnot(is.character(filtered))
# merge any line NOT starting with "\\par" on to previous line
non_par_lines <- grep(pattern = "^\\\\par", x = filtered, invert = TRUE, ...)
# in reverse order, put each non-par line on end of previous, then filter out
# all non-par lines
for (i in rev(non_par_lines))
filtered[i - 1] <- paste(filtered[i - 1], filtered[i], sep = "")
filtered <- grep("^\\\\par", filtered, value = TRUE, ...)
filtered <- .rtf_fix_unicode(filtered, ...)
# extremely long terminal line in primary source is junk
longest_lines <- which(nchar(filtered) > 3000L)
filtered <- filtered[-longest_lines]
filtered <- .rtf_strip(filtered)
grep("^[[:space:]]*$", filtered, value = TRUE, invert = TRUE, ...)
}
#' parse lines of RTF
#'
#' parse a character vector containing RTF strings
#'
#' \code{...} might include: \code{perl = TRUE, useBytes = TRUE}
#' @param rtf_lines character vector containing RTF. Encoding?
#' @template verbose
#' @return named character vector, with names being the ICD-9 codes, and the
#' contents being the descriptions from the RTF source. Elsewhere I do this
#' the other way around, but the tests are now wired for this layout. 'Tidy'
#' data would favour having an unnamed two-column data frame.
#' @keywords internal datagen
#' @noRd
.rtf_parse_lines <- function(rtf_lines, verbose = FALSE,
save_pkg_data = FALSE,
...) {
stopifnot(is.character(rtf_lines))
stopifnot(is.logical(verbose), length(verbose) == 1)
stopifnot(is.logical(save_pkg_data), length(save_pkg_data) == 1)
filtered <- .rtf_pre_filter(rtf_lines, ...)
majors <- .rtf_make_majors(filtered, save_pkg_data = save_pkg_data, ...)
if (verbose) message("Have ", length(majors), " majors.")
# this is so ghastly: find rows with sequare brackets containing definition of
# subset of fourth or fifth digit codes. Need to pull code from previous row,
# and create lookup, so we can exclude these when processing the fourth an
# fifth digits
invalid_qual <- .rtf_make_invalid_qual(filtered, ...)
# several occurances of "Requires fifth digit", referring back to the previous
# higher-level definition, without having the parent code in the line itself
re_fifth_range_other <- "fifth +digit +to +identify +stage"
fifth_backref <- grep(re_fifth_range_other, filtered, ...)
# for these, construct a string which will be captured in the next block
# e.g. "Requires fifth digit to identify stage:" becomes
# "Requires fifth digit to identify stage: 634 Spontaneous abortion"
filtered[fifth_backref] <-
paste(filtered[fifth_backref], filtered[fifth_backref - 1])
re_fourth_range <- "fourth-digit.+categor"
fourth_rows <- grep(re_fourth_range, filtered, ...)
lookup_fourth <- .rtf_generate_fourth_lookup(filtered, fourth_rows)
# at least two examples of "Use 0 as fourth digit for category 672"
re_fourth_digit_zero <- "Use 0 as fourth digit for category"
fourth_digit_zero_lines <- grep(re_fourth_digit_zero, filtered, ...)
fourth_digit_zero_categories <- unname(
.str_pair_match(
filtered[fourth_digit_zero_lines],
"(.*category )([[:digit:]]{3})$", ...
)
)
# deal with 657 and 672 (in the default RTF), by appending the elements to the
# end of the input list. argh.
for (categ in fourth_digit_zero_categories) {
parent_row <- grep(paste0("^", categ, " .+"), filtered, value = TRUE, ...)
filtered[length(filtered) + 1] <-
paste0(categ, ".0 ", .str_pair_match(
parent_row,
"([[:digit:]]{3} )(.+)", ...
))
}
lookup_fifth <- .rtf_make_lookup_fifth(filtered, re_fifth_range_other)
filtered <- .rtf_filter_excludes(filtered, ...)
out <- .rtf_main_filter(filtered, ...)
out <- c(out, .rtf_lookup_fourth(out = out, lookup_fourth = lookup_fourth))
out <- c(out, .rtf_lookup_fifth(out, lookup_fifth))
out <- .rtf_fix_duplicates(out, verbose)
out <- out[-which(names(out) %in% invalid_qual)]
.rtf_fix_quirks_2015(out)
}
#' exclude some unwanted rows from filtered RTF
#'
#' Ignore excluded codes, and fix some odd-balls so they don't get dropped
#' "707.02Upper back", "066.40West Nile fever, unspecified", etc
#' @keywords internal datagen
#' @noRd
.rtf_filter_excludes <- function(filtered, ...) {
# drop excludes: lines with bracketed codes (removes chapter headings)
re_bracketed <- paste0(
"\\((", re_icd9_decimal_bare,
")-(", re_icd9_decimal_bare, ")\\)"
)
filtered <- grep(re_bracketed, filtered, value = TRUE, invert = TRUE, ...)
filtered <- grep("Exclude", filtered,
value = TRUE, invert = TRUE, ...
)
sub(
"((70[[:digit:]]\\.[[:digit:]]{2})|066\\.40)([[:alpha:]])", "\\1 \\2",
filtered, ...
)
}
#' filter RTF for actual ICD-9 codes
#'
#' Keep some more information, but we'll just take the primary description for
#' each item, i.e. where a code begins a line. Some codes have ten or so
#' alternative descriptions, e.g. 410.0
#' @keywords internal datagen
#' @noRd
.rtf_main_filter <- function(filtered, ...) {
filtered <- grep(paste0("^[[:space:]]*(", re_icd9_decimal_strict_bare, ") "),
filtered,
value = TRUE, ...
)
# spaces to single
filtered <- gsub("[[:space:]]+", " ", filtered, ...)
# fix a few things, e.g. "040. 1 Rhinoscleroma", "527 .0 Atrophy"
filtered <-
sub(
"^([VvEe]?[[:digit:]]+) ?\\. ?([[:digit:]]) (.*)", "\\1\\.\\2 \\3",
filtered, ...
)
# and high-level headings like "210-229 Benign neoplasms"
filtered <- grep("^[[:space:]]*[[:digit:]]{3}-[[:digit:]]{3}.*", filtered,
value = TRUE, invert = TRUE, ...
)
# "2009 H1 N1 swine influenza virus"
filtered <- grep("^2009", filtered, value = TRUE, invert = TRUE, ...)
# "495.7 \"Ventilation\" pneumonitis"
re_code_desc <- paste0("^(", re_icd9_decimal_bare, ") +([ \"[:graph:]]+)")
# out is the start of the eventual output of code to description pairs. seems
# to be quicker with perl and useBytes both FALSE
.str_pair_match(filtered, re_code_desc, perl = FALSE, useBytes = FALSE)
}
.rtf_make_majors <- function(filtered, ..., save_pkg_data = FALSE) {
use_bytes <- list(...)[["useBytes"]]
major_lines <- grep(paste0("^(", re_icd9_major_strict_bare, ") "),
filtered,
value = TRUE
)
re_major_split <- "([^[:space:]]+)[[:space:]]+(.+)"
icd9_majors <- gsub(
pattern = re_major_split, replacement = "\\1",
x = major_lines, perl = TRUE, useBytes = use_bytes
)
names(icd9_majors) <- gsub(
pattern = re_major_split, replacement = "\\2",
x = major_lines, perl = TRUE, useBytes = use_bytes
)
# this sub-chapter is simply missing from the otherwise consistent RTF way
# 'major' types are reported:
icd9_majors[["Place of occurrence"]] <- "E849"
# There are some duplicates created by the major search, mostly E001 to E030
# which are just listed twice in RTF. Also 199 (with punctuation difference),
# 209 and 239.
icd9_majors <- icd9_majors[!duplicated(icd9_majors)]
.save_in_resource_dir("icd9_majors", x = icd9_majors)
if (save_pkg_data) .save_in_data_dir("icd9_majors")
invisible(icd9_majors)
}
.rtf_make_invalid_qual <- function(filtered, ...) {
re_qual_subset <- "\\[[-, [:digit:]]+\\]"
qual_subset_lines <- grep(re_qual_subset, filtered, ...)
invalid_qual <- c()
for (ql in qual_subset_lines) {
# get prior code
m1 <- .str_match_all(
filtered[ql - 1],
paste0("(", re_icd9_decimal_bare, ") (.*)")
)
code <- unlist(m1)[[2]]
sb <- .rtf_parse_qualifier_subset(filtered[ql])
inv_sb <- setdiff(as.character(0:9), sb)
if (length(inv_sb) == 0) {
next
}
if (grepl("\\.", code, ...)) {
invalid_qual <- c(invalid_qual, paste0(code, inv_sb))
} else {
invalid_qual <- c(invalid_qual, paste0(code, ".", inv_sb))
}
}
invalid_qual
}
#' generate look-up for four digit codes
#'
#' \code{lookup_fourth} will contain vector of suffices, with names being the
#' codes they augment
#' @return named character vector, names are the ICD codes, values are the
#' descriptions
#' @keywords internal datagen
#' @noRd
.rtf_generate_fourth_lookup <- function(filtered, fourth_rows, verbose = FALSE) {
lookup_fourth <- c()
for (f in fourth_rows) {
range <- .rtf_parse_fifth_digit_range(filtered[f])
fourth_suffices <- .str_pair_match(
string = filtered[seq(f + 1, f + 37)],
pattern = "^([[:digit:]])[[:space:]](.*)"
)
re_fourth_defined <-
paste(c("\\.[", names(fourth_suffices), "]$"), collapse = "")
# drop members of range which don't have defined fourth digit
range <- grep(re_fourth_defined, range, value = TRUE)
# now replace value with the suffix, with name of item being the code itself
names(range) <- range
last <- -1
for (fourth in names(fourth_suffices)) {
if (last > as.integer(fourth)) break
re_fourth <- paste0("\\.", fourth, "$")
range[grep(re_fourth, range)] <- fourth_suffices[fourth]
last <- fourth
}
lookup_fourth <- c(lookup_fourth, range)
}
if (verbose) {
message("lookup_fourth has length: ", length(lookup_fourth), ", head: ")
print(lookup_fourth[1:5, ])
}
lookup_fourth
}
#' apply fourth digit qualifiers
#'
#' use the lookup table of fourth digit
#'
#' @keywords internal datagen
#' @noRd
.rtf_lookup_fourth <- function(out, lookup_fourth, verbose = FALSE) {
.rtf_lookup_fourth_alt_env(
out = out,
lookup_fourth = lookup_fourth,
verbose = verbose
)
}
.rtf_lookup_fourth_alt_base <- function(out, lookup_fourth, verbose = FALSE) {
out_fourth <- c()
for (f_num in seq_along(lookup_fourth)) {
lf <- lookup_fourth[f_num]
f <- names(lf)
parent_code <- .get_icd9_major(f)
if (parent_code %in% names(out)) {
pair_fourth <- paste(out[parent_code], lf, sep = ", ")
names(pair_fourth) <- f
out_fourth <- append(out_fourth, pair_fourth)
}
}
if (verbose) {
message("fourth output lines: length = ", length(out_fourth), ", head: ")
print(out_fourth[1:5, ])
}
out_fourth
}
.rtf_lookup_fourth_alt_env <- function(out, lookup_fourth, verbose = FALSE) {
out_fourth <- c()
out_env <- list2env(as.list(out))
for (f_num in seq_along(lookup_fourth)) {
lf <- lookup_fourth[f_num]
f <- names(lf)
parent_code <- .get_icd9_major(f)
if (!is.null(out_env[[parent_code]])) {
pair_fourth <- paste(out[parent_code], lf, sep = ", ")
names(pair_fourth) <- f
out_fourth <- append(out_fourth, pair_fourth)
}
}
if (verbose) {
message("fourth output lines: length = ", length(out_fourth), ", head: ")
print(out_fourth[1:5])
}
rm(out_env)
out_fourth
}
.rtf_make_lookup_fifth <- function(filtered,
re_fifth_range_other,
...,
verbose = FALSE) {
re_fifth_range <- "ifth-digit subclas|fifth-digits are for use with codes"
re_fifth_rows <- paste(re_fifth_range, re_fifth_range_other, sep = "|")
fifth_rows <- grep(pattern = re_fifth_rows, x = filtered, ...)
# lookup_fifth will contain vector of suffices, with names being the codes
# they augment
lookup_fifth <- c()
for (f in fifth_rows) {
if (verbose) message("working on fifth-digit row:", f)
range <- .rtf_parse_fifth_digit_range(filtered[f], verbose = verbose)
f1 <- filtered[seq(f + 1, f + 20)]
f2 <- grep(pattern = "^[[:digit:]][[:space:]].*", f1, value = TRUE, ...)
fifth_suffices <- .str_pair_match(f2, "([[:digit:]])[[:space:]](.*)", ...)
re_fifth_defined <- paste(c("\\.[[:digit:]][", names(fifth_suffices), "]$"),
collapse = ""
)
# drop members of range which don't have defined fifth digit
range <- grep(re_fifth_defined, range, value = TRUE, ...)
# now replace value with the suffix, with name of item being the code itself
names(range) <- range
last <- -1L
for (fifth in names(fifth_suffices)) {
if (last > as.integer(fifth)) break
re_fifth <- paste0("\\.[[:digit:]]", fifth, "$")
range[grep(re_fifth, range)] <- fifth_suffices[fifth]
last <- fifth
}
lookup_fifth <- c(lookup_fifth, range)
}
# V30-39 are a special case because combination of both fourth and fifth
# digits are specified
re_fifth_range_v30v39 <-
"The following two fifths-digits are for use with the fourth-digit \\.0"
re_v30v39_fifth <- "V3[[:digit:]]\\.0[01]$"
lines_v30v39 <- grep(re_fifth_range_v30v39, filtered)
stopifnot(length(lines_v30v39) == 1)
f1 <- filtered[seq(from = lines_v30v39 + 1, to = lines_v30v39 + 3)]
f2 <- grep(f1, pattern = "^[[:digit:]][[:space:]].*", value = TRUE, ...)
suffices_v30v39 <- .str_pair_match(f2, "([[:digit:]])[[:space:]](.*)", ...)
range <- c(
icd::expand_range("V30", "V37",
short_code = FALSE, defined = FALSE
),
icd::children("V39", short_code = FALSE, defined = FALSE)
)
range <- grep(re_v30v39_fifth, range, value = TRUE, ...)
names(range) <- range
for (fifth in names(suffices_v30v39)) {
# only applies to .0x (in 2015 at least), but .1 also exists without 5th
# digit
re_fifth <- paste0("\\.0", fifth, "$")
range[grep(re_fifth, range, ...)] <- suffices_v30v39[fifth]
}
c(lookup_fifth, range)
}
.rtf_lookup_fifth <- function(out,
lookup_fifth,
verbose = FALSE) {
.rtf_lookup_fifth_alt_env(
out = out,
lookup_fifth = lookup_fifth,
verbose = verbose
)
}
.rtf_lookup_fifth_alt_base <- function(out, lookup_fifth, verbose = FALSE) {
out_fifth <- c()
for (f_num in seq_along(lookup_fifth)) {
lf <- lookup_fifth[f_num]
f <- names(lf)
parent_code <- substr(f, 0, nchar(f) - 1)
if (parent_code %in% names(out)) {
pair_fifth <- paste(out[parent_code], lf, sep = ", ")
names(pair_fifth) <- f
out_fifth <- c(out_fifth, pair_fifth)
}
}
if (verbose) {
message("fifth output lines: length = ", length(out_fifth), ", head: ")
print(out_fifth[1:5])
}
out_fifth
}
.rtf_lookup_fifth_alt_env <- function(out, lookup_fifth, verbose = FALSE) {
out_fifth <- character(5000) # 2011 data is 4870 long
n <- 1L
out_env <- list2env(as.list(out))
for (f_num in seq_along(lookup_fifth)) {
lf <- lookup_fifth[f_num]
f <- names(lf)
parent_code <- substr(f, 0, nchar(f) - 1)
if (!is.null(out_env[[parent_code]])) {
out_fifth[n] <- paste(out[parent_code], lf, sep = ", ")
names(out_fifth)[n] <- f
n <- n + 1L
}
}
out_fifth <- out_fifth[1:n - 1]
if (verbose) {
message("fifth output lines: length = ", length(out_fifth), ", head: ")
print(out_fifth[1:5])
}
out_fifth
}
#' Fix Unicode characters in RTF
#'
#' fix ASCII, Code Page 1252 and Unicode horror: some character definitions are
#' split over lines... This needs care in Windows, or course. Maybe Mac, too?
#'
#' First: c cedilla, e grave, e acute Then: n tilde, o umlaut
#' @keywords internal datagen
#' @noRd
.rtf_fix_unicode <- function(filtered, ...) {
filtered <- gsub("\\\\'e7", "\u00e7", filtered, ...) # c cedilla
filtered <- gsub("\\\\'e8", "\u00e8", filtered, ...) # e gravel
filtered <- gsub("\\\\'e9", "\u00e9", filtered, ...) # e acute
filtered <- gsub("\\\\'f1", "\u00f1", filtered, ...) # n tilde
filtered <- gsub("\\\\'f6", "\u00f6", filtered, ...) # o umlaut
enc2utf8(filtered)
}
#' fix duplicates detected in RTF parsing
#'
#' clean up duplicates (about 350 in 2015 data), mostly one very brief
#' description and a correct longer one; or, identical descriptions
#' @keywords internal datagen
#' @noRd
.rtf_fix_duplicates <- function(out, verbose) {
dupes <- out[duplicated(names(out)) | duplicated(names(out), fromLast = TRUE)]
dupes <- unique(names(dupes))
for (d in dupes) {
dupe_rows <- which(names(out) == d)
if (all(out[dupe_rows[1]] == out[dupe_rows[-1]])) {
out <- out[-dupe_rows[-1]]
next
}
desclengths <- nchar(out[dupe_rows])
max_len <- max(desclengths)
if (verbose) {
message("removing differing duplicates: ", paste(out[dupe_rows]))
}
out <- out[-dupe_rows[-which(desclengths != max_len)]]
}
out
}
#' fix quirks for 2015 RTF parsing
#'
#' 2015 quirks (many more are baked into the parsing: try to splinter out the
#' most specific) some may well apply to other years 650-659 ( and probably many
#' others don't use whole subset of fourth or fifth digit qualifiers) going to
#' have to parse these, e.g. [0,1,3], as there are so many...
#' @keywords internal datagen
#' @noRd
.rtf_fix_quirks_2015 <- function(out) {
out <- out[grep("65[12356789]\\.[[:digit:]][24]", names(out), invert = TRUE)]
# 657 just isn't formatted like any other codes
out["657.0"] <- "Polyhydramnios"
out["657.00"] <-
"Polyhydramnios, unspecified as to episode of care or not applicable"
out["657.01"] <-
"Polyhydramnios, delivered, with or without mention of antepartum condition"
out["657.03"] <- "Polyhydramnios, antepartum condition or complication"
out["719.69"] <- "Other symptoms referable to joint, multiple sites"
out["807.19"] <- "Open fracture of multiple ribs, unspecified"
out["E849"] <- "Place of occurence"
out
}
#' parse a row of RTF source data for ranges to apply fifth digit
#'
#' sub-classifications
#' returns all the possible 5 digit codes encompassed by the given
#' definition. This needs to be whittled down to just those matching fifth
#' digits, but we haven't parsed them yet.
#' @template verbose
#' @keywords internal datagen
#' @noRd
.rtf_parse_fifth_digit_range <- function(row_str, verbose = FALSE) {
stopifnot(is.character(row_str))
stopifnot(is.logical(verbose), length(verbose) == 1)
out <- c()
# get numbers and number ranges
vals <- grep(unlist(strsplit(row_str, "[, :;]")),
pattern = "[VvEe]?[0-9]", value = TRUE
)
if (verbose) {
message("vals are:", paste(vals, collapse = ", "))
}
# sometimes we get things like:
# [1] "345.0" ".1" ".4-.9"
grepl(pattern = "^\\.[[:digit:]]+.*", vals) -> decimal_start
if (any(decimal_start)) {
base_code <- vals[1] # assume first is the base
stopifnot(icd::is_valid(base_code, short_code = FALSE))
for (dotmnr in vals[-1]) {
if (verbose) {
message("dotmnr is: ", dotmnr)
}
if (grepl("-", dotmnr)) {
# range of minors
pair <- unlist(strsplit(dotmnr, "-", fixed = TRUE))
first <- paste0(.get_icd9_major(base_code), pair[1])
last <- paste0(.get_icd9_major(base_code), pair[2])
if (verbose) {
message("expanding specified minor range from ", first, " to ", last)
}
out <- c(out, icd::expand_range(first,
last,
short_code = FALSE,
defined = FALSE
))
} else {
single <- paste0(.get_icd9_major(base_code), dotmnr)
out <- c(out, icd::children(single,
short_code = FALSE,
defined = FALSE
))
}
}
vals <- vals[1] # still need to process the base code
}
for (v in vals) {
# take care of ranges
if (grepl("-", v)) {
pair <- unlist(strsplit(v, "-", fixed = TRUE))
# sanity check
stopifnot(all(icd::is_valid(pair, short_code = FALSE)))
if (verbose) {
message("expanding explicit range ", pair[1], " to ", pair[2])
}
# formatting errors can lead to huge range expansions, e.g. "8-679".
# quickly strip off V or E part for comparison
pair_one <- gsub("[^[:digit:]]", "", pair[1])
pair_two <- gsub("[^[:digit:]]", "", pair[2])
if (as.integer(pair_two) - as.integer(pair_one) > 10) {
warning("probable formatting misinterpretation: huge range expansion")
}
out <- c(out, icd::expand_range(pair[1], pair[2],
short_code = FALSE, defined = FALSE
))
} else {
# take care of single values
if (!icd::is_valid(v, short_code = FALSE)) {
stop(paste(
"invalid code is: ",
icd::get_invalid(v, short_code = FALSE)
))
}
out <- c(out, icd::children(v, short_code = FALSE, defined = FALSE))
}
}
out
}
.rtf_parse_qualifier_subset <- function(qual) {
stopifnot(is.character(qual), length(qual) == 1)
out <- c()
s1 <- strsplit(.strip(qual), "[]\\[,]")
s2 <- grep(pattern = "[[:digit:]]", unlist(s1), value = TRUE)
vals <- unlist(strsplit(s2, ","))
for (v in vals) {
if (grepl("-", v)) {
pair <- as.integer(unlist(strsplit(v, "-")))
out <- c(out, seq(pair[1], pair[2]))
next
}
out <- c(out, as.integer(v))
}
as.character(out)
}
#' Strip RTF
#'
#' Take a vector of character strings containing RTF, replace each \\tab with a
#' space and eradicate all other RTF symbols
#'
#' just for \\tab, replace with space, otherwise, drop RTF tags entirely
#' @param x vector of character strings containing RTF
#' @keywords internal datagen
#' @noRd
.rtf_strip <- function(x, ...) {
# nolint start
x <- gsub("\\\\tab ", " ", x, ...)
x <- gsub("\\\\[[:punct:]]", "", x, ...) # control symbols, not control words
x <- gsub("\\\\lsdlocked[ [:alnum:]]*;", "", x, ...) # special case
x <- gsub("\\{\\\\bkmk(start|end).*?\\}", "", x, ...)
# no backslash in this next list, others removed from
# http://www.regular-expressions.info/posixbrackets.html
# punct not quite the same: [!"\#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]
x <- gsub("\\\\[-[:alnum:]]*[ !\"#$%&'()*+,-./:;<=>?@^_`{|}~]?", "", x, ...)
x <- gsub(" *(\\}|\\{)", "", x, ...)
trimws(x)
# nolint end
}
.make_icd9cm_parse_rtf_fun <- function(year, verbose) {
# Must force, so that the values to the arguments are not promises which are
# later evaluated in a different environment.
force(year)
parse_fun <- function() {
if (.verbose()) message("Calling ICD-9-CM RTF parser for year:", year)
.parse_icd9cm_rtf_year(year = year)
}
parse_fun_env <- environment(parse_fun)
parse_fun_env$ver <- as.character(year)
parse_fun
}
.make_icd9cm_rtf_parsers <- function(env = parent.frame(),
verbose = .verbose()) {
for (y in .icd9cm_sources$f_year) {
# TODO: special case for 2011 / v32?
parse_fun_name <- .get_parser_icd9cm_rtf_name(y)
if (verbose) message("Making ICD-9-CM RTF parser: ", parse_fun_name)
parse_fun <- .make_icd9cm_parse_rtf_fun(y, verbose = verbose)
assign(parse_fun_name, parse_fun, envir = env)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.