Nothing
# Copyright (C) 2014 - 2015 Jack O. Wasey
#
# This file is part of icd9.
#
# icd9 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# icd9 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with icd9. If not, see <http:#www.gnu.org/licenses/>.
# we don't ever use magrittr in 'live' package use, just when it is using its
# own functions for testing and generating its own data: in those cases magrittr
# will be available, but we don't want CRAN check problems, so:
utils::globalVariables(c(".", "%>%"))
# try parsing the RTF, and therefore get subheadings, as well as billable codes.
# ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD9-CM/2011/
#
# see https://github.com/LucaFoschini/ICD-9_Codes for a completely different
# approach in python
#' @title parse RTF description of entire ICD-9-CM for a specific year
#' @description Currently only the most recent update is implemented. Note that
#' CMS have published additional ICD-9-CM billable code lists since the last
#' one from the CDC: I think these have been the same every year since 2011,
#' though. THe last CDC release is Dtab12.rtf from 2011.
#' @param year from 1996 to 2012 (this is what CDC has published). Only 2012
#' implemented thus far
#' @template verbose
#' @source
#' http://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD9-CM/2011/Dtab12.zip
#' and similar files run from 1996 to 2011.
#' @keywords internal
parseRtfYear <- function(year = "2011", save = FALSE, fromWeb = FALSE, verbose = FALSE) {
assertString(year)
assertFlag(save)
assertFlag(fromWeb)
assertFlag(verbose)
rtf_dat <- data_sources[data_sources$f_year == year, ]
url <- rtf_dat$rtf_url
fn <- rtf_dat$rtf_filename
fp <- file.path("inst", "extdata", fn)
if (!save && !file.exists(fp))
fp <- system.file("extdata", fn, package = "icd9")
if (fromWeb || !file.exists(fp) || save) {
zip_single(url, fn, fp)
lines <- readLines(url, fp)
} else {
fp_conn <- file(fp, encoding = "ASCII")
on.exit(close(fp_conn))
lines <- readLines(fp_conn, warn = FALSE)
}
# the file itself is 7 bit ASCII, but has its own internal encoding using CP1252.
# test meniere's disease with lines 24821 to 24822 from 2012 RTF
parseRtfLines(lines, verbose) %>% swapNamesWithVals %>% icd9SortDecimal -> out
# make Tidy data: don't like using row names to store things
icd9Desc <- data.frame(
icd9 = out %>% unname %>% icd9DecimalToShort,
desc = names(out),
stringsAsFactors = FALSE)
if (save) saveInDataDir("icd9Desc")
invisible(icd9Desc)
}
#' @title parse a character vector containing RTF strings
#' @description parse a character vector containing RTF strings
#' @param lines character vector containing RTF. Encoding?
#' @template verbose
#' @return named character vector, with names being the ICD-9 codes, and the
#' contents being the descriptions from the RTF source. Elsewhere I do this
#' the other way around, but the tests are now wired for this layout. "Tidy"
#' data would favour having an un-named two-column data frame.
#' @keywords internal
parseRtfLines <- function(lines, verbose = FALSE) {
assertCharacter(lines)
assertFlag(verbose)
# filtered <- iconv(lines, from = "ASCII", to = "UTF-8", mark = TRUE) I think
# the first 127 characters of ASCII are the same in Unicode, but we must make
# sure R treats the lines as Unicode.
filtered <- lines
# merge any line NOT starting with "\\par" on to previous line
non_par_lines <- grep("^\\\\par", filtered, invert = TRUE)
# in reverse order, put each non-par line on end of previous, then filter out all non-par lines
for (i in rev(non_par_lines)) {
filtered[i - 1] <- paste(filtered[i - 1], filtered[i], sep = "")
}
filtered <- grep("^\\\\par", filtered, value = TRUE)
# fix ASCII/CP1252/Unicode horror: of course, some char defs are split over lines...
filtered <- gsub("\\\\'e8", "\u00e8", filtered)
filtered <- gsub("\\\\'e9", "\u00e9", filtered)
filtered <- gsub("\\\\'f1", "\u00f1", filtered)
filtered <- gsub("\\\\'f16", "\u00f6", filtered)
# drop stupid long line at end:
longest_lines <- nchar(filtered) > 3000
# if none, then -c() returns no rows, so we have to test first
if (any(longest_lines))
filtered <- filtered[-c(which(longest_lines))]
filtered <- stripRtf(filtered)
filtered <- grep("^[[:space:]]*$", filtered, value = TRUE, invert = TRUE) # empty lines
re_anycode <- "(([Ee]?[[:digit:]]{3})|([Vv][[:digit:]]{2}))(\\.[[:digit:]]{1,2})?"
# this is so ghastly: find rows with sequare brackets containing definition of
# subset of fourth or fifth digit codes. Need to pull code from previous row,
# and create lookup, so we can exclude these when processing the fourth an
# fifth digits
re_qual_subset <- "\\[[-, [:digit:]]+\\]"
qual_subset_lines <- grep(re_qual_subset, filtered)
invalid_qual <- c()
for (ql in qual_subset_lines) {
# get prior code
strMultiMatch(paste0("(", re_anycode, ") (.*)"), filtered[ql - 1]) %>%
unlist %>% magrittr::extract2(1) -> code
sb <- parseRtfQualifierSubset(filtered[ql])
inv_sb <- setdiff(as.character(0:9), sb)
if (length(inv_sb) == 0) next
if (grepl("\\.", code))
invalid_qual <- c(invalid_qual, paste0(code, inv_sb))
else
invalid_qual <- c(invalid_qual, paste0(code, ".", inv_sb))
}
# grab fifth digit ranges now:
re_fifth_range_other <- "fifth +digit +to +identify +stage"
re_fifth_range <- "ifth-digit subclas|fifth-digits are for use with codes"
re_fifth_range_V30V39 <- "The following two fifths-digits are for use with the fourth-digit \\.0"
fifth_rows <- grep(paste(re_fifth_range, re_fifth_range_other, sep = "|"), filtered)
# several occurances of "Requires fifth digit", referring back to the previous
# higher-level definition, without having the parent code in the line itself
fifth_backref <- grep(re_fifth_range_other, filtered)
# for these, construct a string which will be captured in the next block
filtered[fifth_backref] <- paste(filtered[fifth_backref], filtered[fifth_backref - 1], sep = " ")
# fourth-digit qualifiers:
re_fourth_range <- "fourth-digit.+categor"
fourth_rows <- grep(re_fourth_range, filtered)
# lookup_fourth will contain vector of suffices, with names being the codes they augment
lookup_fourth <- c()
for (f in fourth_rows) {
if (verbose) message("working on fourth-digit row:", f)
range <- parseRtfFifthDigitRanges(filtered[f])
filtered[seq(f + 1, f + 37)] %>%
grep("^[[:digit:]][[:space:]].*", ., value = TRUE) %>%
strPairMatch("([[:digit:]])[[:space:]](.*)", .) -> fourth_suffices
re_fourth_defined <- paste(c("\\.[", names(fourth_suffices), "]$"), collapse = "")
# drop members of range which don't have defined fourth digit
range <- grep(re_fourth_defined, range, value = TRUE)
# now replace value with the suffix, with name of item being the code itself
names(range) <- range
last <- -1
for (fourth in names(fourth_suffices)) {
if (last > as.integer(fourth)) break
re_fourth <- paste0("\\.", fourth, "$")
range[grep(re_fourth, range)] <- fourth_suffices[fourth]
last <- fourth
}
lookup_fourth <- c(lookup_fourth, range)
}
# at least two examples of "Use 0 as fourth digit for category 672"
re_fourth_digit_zero <- "Use 0 as fourth digit for category"
fourth_digit_zero_lines <- grep(re_fourth_digit_zero, filtered)
strPairMatch("(.*category )([[:digit:]]{3})$", filtered[fourth_digit_zero_lines]) %>%
unname -> fourth_digit_zero_categories
for (categ in fourth_digit_zero_categories) {
parent_row <- grep(paste0("^", categ, " .+"), filtered, value = TRUE)
filtered[length(filtered) + 1] <- paste0(categ, ".0 ", strPairMatch("([[:digit:]]{3} )(.+)", parent_row))
}
# lookup_fifth will contain vector of suffices, with names being the codes they augment
lookup_fifth <- c()
for (f in fifth_rows) {
if (verbose) message("working on fifth-digit row:", f)
range <- parseRtfFifthDigitRanges(filtered[f], verbose = verbose)
filtered[seq(f + 1, f + 20)] %>%
grep("^[[:digit:]][[:space:]].*", ., value = TRUE) %>%
strPairMatch("([[:digit:]])[[:space:]](.*)", .) -> fifth_suffices
re_fifth_defined <- paste(c("\\.[[:digit:]][", names(fifth_suffices), "]$"), collapse = "")
# drop members of range which don't have defined fifth digit
range <- grep(re_fifth_defined, range, value = TRUE)
# now replace value with the suffix, with name of item being the code itself
names(range) <- range
last <- -1
for (fifth in names(fifth_suffices)) {
if (last > as.integer(fifth)) break
re_fifth <- paste0("\\.[[:digit:]]", fifth, "$")
range[grep(re_fifth, range)] <- fifth_suffices[fifth]
last <- fifth
}
lookup_fifth <- c(lookup_fifth, range)
}
# V30-39 are a special case because combination of both fourth and fifth digits are specified
re_V30V39_fifth <- "V3[[:digit:]]\\.0[01]$"
lines_V30V39 <- grep(re_fifth_range_V30V39, filtered)
stopifnot(length(lines_V30V39) == 1)
filtered[seq(from = lines_V30V39 + 1, to = lines_V30V39 + 3)] %>%
grep("^[[:digit:]][[:space:]].*", ., value = TRUE) %>%
strPairMatch("([[:digit:]])[[:space:]](.*)", .) -> suffices_V30V39
range <- c("V30" %i9da% "V37", icd9ChildrenDecimal("V39"))
range <- grep(re_V30V39_fifth, range, value = TRUE)
names(range) <- range
for (fifth in names(suffices_V30V39)) {
# only applies to .0x (in 2015 at least), but .1 also exists without 5th digit
re_fifth <- paste0("\\.0", fifth, "$")
range[grep(re_fifth, range)] <- suffices_V30V39[fifth]
}
lookup_fifth <- c(lookup_fifth, range)
# now here we could potentially capture chapter headings, but I can drop
# excludes easily by removing lines with bracketed codes
filtered <- grep(paste0("\\((", re_anycode, ")+[-[:digit:]]*\\)"), filtered, value = TRUE, invert = TRUE)
filtered <- grep(paste0("Exclude"), filtered, value = TRUE, invert = TRUE)
# again, we can keep some more information, but we'll just take the primary
# description for each item, i.e. where a code begins a line. Some codes have
# ten or so alternative descriptions, e.g. 410.0
filtered <- grep(paste0("^[[:space:]]*", re_anycode), filtered, value = TRUE)
# spaces to single
filtered <- gsub("[[:space:]]+", " ", filtered)
# fix a few things, e.g. "040. 1 Rhinoscleroma", "527 .0 Atrophy"
filtered <- sub("^([VvEe]?[[:digit:]]+) ?\\. ?([[:digit:]]) (.*)", "\\1\\.\\2 \\3", filtered)
# and high-level headings like "210-229 Benign neoplasms"
filtered <- grep("^[[:space:]]*[[:digit:]]{3}-[[:digit:]]{3}.*", filtered, value = TRUE, invert = TRUE)
# "707.02Upper back"
filtered <- sub("([[:digit:]])([[:alpha:]])", "\\1 \\2", filtered)
# "2009 H1 N1 swine influenza virus"
filtered <- grep("^2009", filtered, value = TRUE, invert = TRUE)
# "495.7 \"Ventilation\" pneumonitis"
re_code_desc <- paste0("^(", re_anycode, ") +([ \"[:graph:]]+)")
out <- strPairMatch(re_code_desc, filtered, pos = c(1, 6))
# apply fourth digit qualifiers
for (f in names(lookup_fourth)) {
if (verbose) message("applying fourth digits to: ", f)
parent_code <- icd9GetMajor(f, isShort = FALSE)
if (parent_code %in% names(out)) {
out <- c(out, lookup_fourth[f])
out[f] <- paste(out[parent_code], lookup_fourth[f], sep = ", ")
}
}
# apply fifth digit qualifiers:
for (f in names(lookup_fifth)) {
if (verbose) message("applying fifth digits to: ", f)
parent_code <- substr(f, 0, nchar(f) - 1)
if (parent_code %in% names(out)) {
# add just the suffix with name being the five digit code
out <- c(out, lookup_fifth[f])
# then update to have the parent code in description
out[f] <- paste(out[parent_code], lookup_fifth[f], sep = ", ")
}
}
# clean up duplicates (about 350 in 2015 data), mostly one very brief
# description and a correct longer one; or, identical descriptions
dupes <- out[duplicated(names(out)) | duplicated(names(out), fromLast = TRUE)] %>% names %>% unique
for (d in dupes) {
dupe_rows <- which(names(out) == d)
if (all(out[dupe_rows[1]] == out[dupe_rows[-1]])) {
out <- out[-dupe_rows[-1]]
next
}
desclengths <- out[dupe_rows] %>% nchar
longestlength <- desclengths %>% max
if (verbose) message("removing differing duplicates: ", paste(out[dupe_rows]))
out <- out[-dupe_rows[-which(desclengths != longestlength)]]
}
# drop all the codes not specified by 5th digits in square brackets, which are
# applied over a range of codes.
out <- out[-which(names(out) %in% invalid_qual)]
# 2015 quirks (many more are baked into the parsing: try to splinter out the most specific)
# some may well apply to other years
# 650-659 ( and probably many others don't use whole subset of fourth or fifth digit qualifiers)
# going to have to parse these, e.g. [0,1,3], as there are so many...
out <- out[grep("65[12356789]\\.[[:digit:]][24]", names(out), invert = TRUE)]
#657 just isn't formatted like any other codes
out["657.0"] <- "Polyhydramnios"
out["657.00"] <- "Polyhydramnios, unspecified as to episode of care or not applicable"
out["657.01"] <- "Polyhydramnios, delivered, with or without mention of antepartum condition"
out["657.03"] <- "Polyhydramnios, antepartum condition or complication"
out["719.69"] <- "Other symptoms referable to joint, multiple sites"
out["807.19"] <- "Open fracture of multiple ribs, unspecified"
out["E849"] <- "Place of occurence"
out
}
#' @title parse a row of RTF source data for ranges to apply fifth digit
#' sub-classifications
#' @description returns all the possible 5 digit codes encompassed by the given
#' definition. This needs to be whittled down to just those matching fifth
#' digits, but we haven't parsed them yet.
#' @template verbose
#' @keywords internal
parseRtfFifthDigitRanges <- function(row_str, verbose = FALSE) {
assertString(row_str)
assertFlag(verbose)
out <- c()
# get numbers and number ranges
strsplit(row_str, "[, :;]") %>% unlist %>% grep("[VvEe]?[0-9]", ., value = TRUE) -> vals
if (verbose) message("vals are:", paste(vals, collapse = ", "))
# sometimes we get things like:
# [1] "345.0" ".1" ".4-.9"
grepl(pattern = "^\\.[[:digit:]]+.*", vals) -> decimal_start
if (any(decimal_start)) {
base_code <- vals[1] # assume first is the base
stopifnot(icd9IsValidDecimal(base_code))
for (dotmnr in vals[-1]) {
if (verbose) message("dotmnr is: ", dotmnr)
if (grepl("-", dotmnr)) {
# range of minors
strsplit(dotmnr, "-", fixed = TRUE) %>% unlist -> pair
first <- paste0(icd9GetMajor(base_code, isShort = FALSE), pair[1])
last <- paste0(icd9GetMajor(base_code, isShort = FALSE), pair[2])
if (verbose) message("expanding specified minor range from ", first, " to ", last)
out <- c(out, first %i9da% last)
} else {
single <- paste0(icd9GetMajor(base_code, isShort = FALSE), dotmnr)
out <- c(out, icd9ChildrenDecimal(single, onlyReal = FALSE))
}
}
vals <- vals[1] # still need to process the base code
}
for (v in vals) {
# take care of ranges
if (grepl("-", v)) {
v %>% strsplit("-", fixed = TRUE) %>% unlist -> pair
# sanity check
stopifnot(all(icd9IsValidDecimal(pair)))
if (verbose) message("expanding explicit range ", pair[1], " to ", pair[2])
# formatting errors can lead to huge range expansions, e.g. "8-679"
# quickly strip off V or E part for comparison
pair_one <- gsub("[^[:digit:]]", "", pair[1])
pair_two <- gsub("[^[:digit:]]", "", pair[2])
if (as.integer(pair_two) - as.integer(pair_one) > 10) {
warning("probable formatting misinterpretation because huge range expansion is requested")
}
out <- c(out, pair[1] %i9da% pair[2])
} else {
# take care of single values
if (!icd9IsValidDecimal(v)) stop(paste("invalid code is: ", icd9GetInvalidDecimal(v)))
out <- c(out, icd9ChildrenDecimal(v))
}
}
out
}
parseRtfQualifierSubset <- function(qual) {
assertString(qual) # one at a time
out <- c()
qual %>% strip %>%
strsplit("[]\\[,]") %>%
unlist %>%
grep("[[:digit:]]", ., value = TRUE) %>%
strsplit(",") %>% unlist -> vals
for (v in vals) {
if (grepl("-", v)) {
strsplit(v, "-") %>% unlist %>% as.integer -> pair
out <- c(out, seq(pair[1], pair[2]))
next
}
out <- c(out, as.integer(v))
}
as.character(out)
}
#' Strip RTF
#'
#' Take a vector of character strings containing RTF, replace each \\tab with a
#' space and eradicate all other RTF symbols
#' @param x vector of character strings containing RTF
#' @keywords internal
stripRtf <- function(x) {
x %>%
# just for \tab, replace with space, otherwise, drop rtf tags entirely
# nolint start
gsub("\\\\tab ", " ", .) %>%
gsub("\\\\[[:punct:]]", "", .) %>% # control symbols only, not control words
gsub("\\\\lsdlocked[ [:alnum:]]*;", "", .) %>% # special case, still needed?
#gsub("\\\\[-[:alnum:]]+[ ;:,.]?", "", .) %>%
gsub("\\{\\\\bkmkstart.*?\\}", "", .) %>%
gsub("\\{\\\\bkmkend.*?\\}", "", .) %>%
#gsub("\\\\[[:alnum:]]*[ [:punct:]]", "", .) %>%
## no backslash in this next list, others removed from http://www.regular-expressions.info/posixbrackets.html
gsub("\\\\[-[:alnum:]]*[ !\"#$%&'()*+,-./:;<=>?@^_`{|}~]?", "", .) %>%
gsub(" *(\\}|\\{)", "", .) %>%
trim
# nolint end
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.