# quick sanity checks - full tests of x in test-parse.R
.icd9cm_hierarchy_sanity <- function(x) {
stopifnot(
all(
icd::is_valid(x[["code"]], short_code = TRUE)
)
)
if (!any(
vapply(
x,
FUN = is.na,
FUN.VALUE = logical(nrow(x))
)
)) {
return(invisible())
}
print(colSums(vapply(x, is.na, logical(nrow(x)))))
print(x[which(is.na(x$major)), ])
print(x[which(is.na(x$three_digit)), ])
print(sum(is.na(x$sub_chapter)))
print(unique(x[which(is.na(x$sub_chapter)), "three_digit"]))
print(x[which(is.na(x$sub_chapter))[1:10], ]) # just top ten
print(x[which(is.na(x$chapter)), ])
stop("should not have any NA values in the ICD-9-CM flatten hierarchy")
}
#' Get billable codes from all available years
#'
#' For versions 23 to 32, those which are on the CMS web site, get any codes
#' with long or short descriptions. Earlier years only have abbreviated
#' descriptions, not long descriptions.
#' @param save_data single logical value, if \code{TRUE} the source text or CSV
#' file will be saved in the raw data directory, otherwise (the default) the
#' data is simply returned invisibly.
#' @template verbose
#' @template offline
#' @return data frame with \code{icd9}, \code{short_desc} and \code{long_desc}
#' columns. \code{NA} is placed in \code{long_desc} when not available.
#' @source
#' http://www.cms.gov/Medicare/Coding/ICD9ProviderDiagnosticCodes/codes.html
#' @keywords internal datagen
#' @noRd
.icd9cm_parse_leaf_descs <- function(save_data = TRUE,
verbose = FALSE,
...) {
stopifnot(is.logical(save_data), length(save_data) == 1)
stopifnot(is.logical(verbose), length(verbose) == 1)
versions <- .icd9cm_sources$version
if (verbose) {
message(
"Available versions of sources are: ",
paste(versions, collapse = ", ")
)
}
icd9cm_leaf_v32 <- .parse_icd9cm_leaf_year(
ver = "2014",
save_data = TRUE,
verbose = verbose,
...
)
invisible(icd9cm_leaf_v32)
}
.dl_icd9cm_leaf_year <- function(year,
verbose = .verbose(),
...) {
year <- as.character(year)
if (verbose) message("Downloading ICD-9-CM leaf/billable year: ", year)
stopifnot(year %in% .icd9cm_sources$f_year)
dat <- .icd9cm_sources[.icd9cm_sources$f_year == year, ]
fn_short_orig <- dat$short_filename
fn_long_orig <- dat$long_filename
f_info_long <- NA
f_info_short <- .unzip_to_data_raw(
url = dat$url,
file_name = fn_short_orig,
verbose = verbose,
save_name = .get_versioned_raw_file_name(fn_short_orig,
ver = year
),
...
)
if (!is.na(fn_long_orig)) {
f_info_long <- .unzip_to_data_raw(
url = dat$url,
file_name = fn_long_orig,
verbose = verbose,
save_name = .get_versioned_raw_file_name(fn_long_orig,
ver = year
),
...
)
}
list(short = f_info_short, long = f_info_long)
}
#' Read the ICD-9-CM leaf description data as provided by the Center for
#' Medicaid Services (CMS).
#'
#' The full ICD-9 specification is unfortunately only available in an RTF file,
#' but CMS also distributes a space-separated text file with just the
#' definitions for 'leaf' or 'billable' codes. Note that this canonical data
#' doesn't specify non-diagnostic higher-level codes, just the specific
#' diagnostic codes.
#'
#' The RTFs are parsed by this package to produce the icd9cm2011 etc data,
#' formerly called icd9cm_hierarchy.
#' @param version character vector of length one containing the ICD-9 version,
#' e.g. \code{"32"}.
#' @template save_data
#' @param path Absolute path in which to save parsed data
#' @param ... Arguments passed to other functions, e.g., \code{offline} for
#' \code{.unzip_to_data_raw}
#' @return invisibly return the result
#' @keywords internal datagen
#' @noRd
.parse_icd9cm_leaf_year <- function(year,
verbose = .verbose(),
...) {
stopifnot(length(year) == 1L)
year <- as.character(year)
stopifnot(grepl("^[[:digit:]]{4}$", year))
if (verbose) message("Fetching billable codes version: ", year)
# 2009 version 27 is exceptionally badly formatted:
if (year == "2009") {
return(invisible(.parse_leaf_desc_icd9cm_v27(...)))
}
stopifnot(year %in% .icd9cm_sources$f_year)
f_info <- .dl_icd9cm_leaf_year(year = year, verbose = verbose, ...)
# Below, yes, specify encoding twice, once to declare the source format, and
# again to tell R to flag (apparently only where necessary), the destination
# strings: in our case this is about ten accented character in long
# descriptions of disease names
#
# shortlines should always exist
shortlines <- readLines(f_info$short$file_path)
# longlines may not, and may have more complicated encoding
if (!is.na(f_info["long"])) {
longlines <- readLines(f_info$long$file_path, encoding = "latin1")
} else {
longlines <- NA_character_
}
shortlines <- strsplit(shortlines, "[[:space:]]+")
longlines <- strsplit(longlines, "[[:space:]]+")
# no need to trim: we just split on "space', so no extra spaces
short_codes <- vapply(shortlines,
FUN = function(x) x[1],
FUN.VALUE = character(1)
)
short_descs <- vapply(shortlines,
FUN = function(x) paste(x[-1], collapse = " "),
FUN.VALUE = character(1)
)
long_descs <- NA_character_
if (!is.na(longlines[1])) {
long_descs <- trimws(
vapply(longlines,
FUN = function(x) paste(x[-1], collapse = " "),
FUN.VALUE = character(1)
)
)
}
if (verbose) message("codes and descs separated")
out <- data.frame(
code = short_codes,
short_desc = short_descs,
long_desc = long_descs,
stringsAsFactors = FALSE
)
if (verbose) message("now sort so that E is after V")
new_order <- order.icd9(out[["code"]])
stopifnot(!anyNA(out[["code"]]))
stopifnot(!anyNA(new_order))
stopifnot(!any(grepl(out[["code"]], pattern = "[[:space:]]")))
stopifnot(!anyDuplicated(new_order))
stopifnot(all(seq_len(nrow(out)) %in% new_order))
# catches a mistaken zero-indexed reorder result
stopifnot(length(setdiff(seq_len(nrow(out)), new_order)) == 0)
stopifnot(length(setdiff(new_order, seq_len(nrow(out)))) == 0)
out <- out[new_order, ]
oldwarn <- options(warn = ifelse(verbose, 1, -1))
on.exit(options(oldwarn), add = TRUE)
if (!is.na(f_info["long"])) {
encs <- Encoding(out[["long_desc"]])
if (verbose) {
message(
"Found labelled encodings in long_desc: ",
paste(unique(encs), collapse = ", ")
)
message(
"non-ASCII rows of long descriptions are: ",
paste(get_non_ascii(out[["long_desc"]]), collapse = ", ")
)
message(
"Encodings found in long_desc: ",
unique(
Encoding(out[["long_desc"]][is_non_ascii(out[["long_desc"]])])
)
)
}
}
out$short_desc <- enc2utf8(out$short_desc)
out$long_desc <- enc2utf8(out$long_desc)
var_name <- paste0("icd9cm", year, "_leaf")
out[order.icd9(out$code), ]
rownames(out) <- NULL
.save_in_resource_dir(var_name = var_name, x = out)
invisible(out)
}
#' Parse billable codes for ICD-9-CM version 27
#'
#' These have a quirk which needs a different approach
#' @keywords internal datagen
#' @noRd
.parse_leaf_desc_icd9cm_v27 <- function(...) {
message("working on version 27 quirk")
v27_dat <- .icd9cm_sources[.icd9cm_sources$version == "27", ]
fn_orig <- v27_dat$other_filename
url <- v27_dat$url
message("original v27 file name = '", fn_orig, "'. URL = ", url)
f27_info <- .unzip_to_data_raw(
url = url,
file_name = fn_orig,
...
)
f <- file(f27_info$file_path, encoding = "latin1")
icd9cm_billable27 <-
utils::read.csv(f27_info$file_path,
stringsAsFactors = FALSE,
colClasses = "character", encoding = "latin1"
)
close(f)
names(icd9cm_billable27) <- c("code", "long_desc", "short_desc")
icd9cm_billable27 <- icd9cm_billable27[c(1, 3, 2)] # reorder columns
reorder <- order.icd9(icd9cm_billable27[["code"]])
invisible(icd9cm_billable27[reorder, ])
}
#' Fix NA sub-chapters in RTF parsing
#'
#' Fixes a couple of corner cases in parsing the 2011 ICD-9-CM RTF
#' @keywords internal datagen
#' @noRd
.fix_sub_chap_na <- function(x, start, end) {
stop("no longer needed: icd9_sub_chapters.R in data is now correct")
# 280, 740 (CONGENITAL ANOMALIES) are chapters with no sub-chapters defined.
# For consistency, assign the same name to sub-chapters
rng <- icd::expand_range(icd::as.icd9cm(start),
icd::as.icd9cm(end),
short_code = TRUE,
defined = FALSE
)
congenital <- x[["code"]] %in% rng
# assert all the same:
stopifnot(all(x[congenital[1], "chapter"] == x[congenital[-1], "chapter"]))
# insert a new level into the sub-chapter factor in the right place
previous_sub <- .as_char_no_warn(x[(which(congenital) - 1)[1], "sub_chapter"])
previous_sub_pos <- which(levels(x$sub_chapter) == previous_sub)
congenital_title <- .as_char_no_warn(x[which(congenital)[1], "chapter"])
new_subs <- .as_char_no_warn(x$sub_chapter)
new_subs[congenital] <- congenital_title
new_levels <- append(
levels(x$sub_chapter),
congenital_title,
previous_sub_pos
)
x$sub_chapter <- factor(new_subs, new_levels)
x
}
#' Generate ICD-9-CM hierarchy
#'
#' For each row of billing code, give the chapter, sub-chapter, major code and
#' description, and short and long descriptions. Currently this is specifically
#' for the 2011 ICD-9-CM after which there have been minimal changes.
#' Thankfully, ICD-10-CM has machine readable data available.
#' @template save_data
#' @template verbose
#' @template offline
#' @keywords internal datagen
#' @noRd
.icd9cm_gen_chap_hier <- function(save_data = FALSE,
verbose = FALSE,
offline = .offline(),
perl = TRUE,
use_bytes = TRUE) {
# TODO: Someday change 'billable' to 'leaf', and make consistent ICD-9 and
# ICD-10, e.g. icd9cm2011 instead of icd9cm_hierarchy lookup tables
stopifnot(is.logical(save_data), length(save_data) == 1)
stopifnot(is.logical(verbose), length(verbose) == 1)
stopifnot(is.logical(offline), length(offline) == 1)
stopifnot(is.logical(perl), length(perl) == 1)
stopifnot(is.logical(use_bytes), length(use_bytes) == 1)
icd9_rtf <- .parse_icd9cm_rtf_year(
year = "2011",
perl = perl,
useBytes = use_bytes,
save_data = FALSE,
verbose = verbose,
offline = offline
)
chaps <- .icd9_get_chapters(
x = icd9_rtf$code,
short_code = TRUE
)
icd9_order <- order.icd9
chaps <- chaps[icd9_order(.as_char_no_warn(chaps$three_digit)), ]
icd9_rtf <- icd9_rtf[icd9_order(icd9_rtf$code), ]
out <- cbind(
data.frame(
"code" = icd9_rtf$code,
"long_desc" = icd9_rtf$desc,
stringsAsFactors = FALSE
),
# the following can and should be factors:
chaps
)
# insert the short descriptions from the billable codes text file. Where there
# is no short description, e.g. for most Major codes, or intermediate codes,
# just copy the long description over.
# need ICD-9 codes to build this, right now just working off the final published edition.
bill32 <- get_icd9cm2014_leaf()
billable_codes <- bill32$code
billable_rows <- which(out[["code"]] %in% billable_codes)
title_rows <- which(out[["code"]] %nin% billable_codes)
stopifnot(setdiff(
c(billable_rows, title_rows),
seq_along(out$code)
) == integer(0))
out[billable_rows, "short_desc"] <- bill32$short_desc
# for rows without a short description (i.e. titles, non-billable),
# useexisting long desc
out[title_rows, "short_desc"] <- out[title_rows, "long_desc"]
# the billable codes list (where available) currently has better long
# descriptions than the RTF parse. For previous years, there is no long desc
# in billable, so careful when updating this.
out[billable_rows, "long_desc"] <- bill32$long_desc
# now put the short description in the right column position
out <- out[c(
"code", "short_desc", "long_desc", "three_digit",
"major", "sub_chapter", "chapter"
)]
out[["short_desc"]] <- enc2utf8(out[["short_desc"]])
out[["long_desc"]] <- enc2utf8(out[["long_desc"]])
.icd9cm_hierarchy_sanity(out)
billable <- out$code %in% bill32$code
icd9cm_hierarchy <- cbind(out[1], billable, out[-1])
row.names(icd9cm_hierarchy) <- NULL
if (save_data) {
.save_in_data_dir(icd9cm_hierarchy)
}
invisible(icd9cm_hierarchy)
}
.make_icd9cm_parse_leaf_fun <- function(year, verbose) {
# Must force, so that the values to the arguments are not promises which are
# later evaluated in a different environment.
force(year)
parse_fun <- function() {
if (.verbose()) message("Calling ICD-9-CM leaf parser for year:", year)
.parse_icd9cm_leaf_year(year = year)
}
parse_fun_env <- environment(parse_fun)
parse_fun_env$ver <- as.character(year)
parse_fun
}
.make_icd9cm_leaf_parsers <- function(env = parent.frame(),
verbose = .verbose()) {
for (y in .icd9cm_sources$f_year) {
# TODO: special case for 2011 / v32?
parse_fun_name <- .get_parser_icd9cm_leaf_name(y)
if (verbose) message("Making ICD-9-CM leaf parser: ", parse_fun_name)
parse_fun <- .make_icd9cm_parse_leaf_fun(y, verbose = verbose)
assign(parse_fun_name, parse_fun, envir = env)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.