Nothing
#' Parse CLAMP NLP Output
#'
#' Takes files with the raw medication extraction output generated by the CLAMP
#' natural language processing system and converts it into a standardized format.
#'
#' Output from different medication extraction systems is formatted in different ways.
#' In order to be able to process the extracted information, we first need to convert
#' the output from different systems into a standardized format. Extracted expressions
#' for various drug entities (e.g., drug name, strength, frequency, etc.) each receive
#' their own column formatted as "extracted expression::start position::stop position".
#' If multiple expressions are extracted for the same entity, they will be separated by
#' backticks.
#'
#' CLAMP output files anchor extractions to a specific drug name extraction through
#' semantic relations.
#'
#' Blank lines in the input are ignored. Start and stop positions are handled as
#' integers so that large offsets (e.g., 100000) are never rendered in scientific
#' notation and are matched exactly between entities and relations.
#'
#' See EHR Vignette for Extract-Med and Pro-Med-NLP as well as Dose Building Using Example Vanderbilt EHR Data for details.
#'
#' @param filename File name for a single file containing CLAMP output.
#'
#' @return A data.table object with columns for filename, drugname, strength, dose, route,
#' and freq. The filename contains the file name corresponding to the clinical
#' note. Each of the entity columns is of the format
#' "extracted expression::start position::stop position".
#' \code{NULL} is returned when the file is empty or contains no drug name
#' extractions.
#' @export
parseCLAMP <- function(filename) {
  con <- file(filename, 'r', blocking = TRUE)
  # release the connection even if reading fails part-way
  on.exit(close(con), add = TRUE)
  l <- readLines(con)
  if(length(l) == 0) return(NULL)

  # every record is tab-delimited; the first field is the record type
  x <- strsplit(l, '\t')
  type <- vapply(x, `[`, character(1), 1)
  # blank lines have an NA type; which() drops them instead of injecting
  # NULL elements into the subset (which would break the vapply calls below)
  xe <- x[which(type == 'NamedEntity')]
  xr <- x[which(type == 'Relation')]

  if(length(xr) > 0) {
    rel <- do.call(rbind, xr)
    # keep relations anchored to a drug: relation type plus both position pairs
    drel <- data.frame(rel[grep('^drug', rel[,7]), c(7,2,3,5,6), drop = FALSE], stringsAsFactors = FALSE)
    names(drel) <- c('type', 'drugstart', 'drugstop', 'infostart', 'infostop')
    # integer positions compare exactly against entity positions and never
    # print in scientific notation
    poscol <- c('drugstart', 'drugstop', 'infostart', 'infostop')
    drel[poscol] <- lapply(drel[poscol], as.integer)
  } else {
    drel <- NULL
  }

  # restrict to entities whose semantic field refers to a drug or drug attribute
  sem <- vapply(xe, `[`, character(1), 4)
  isdrug <- grepl('=drug', sem)
  if(!any(isdrug)) {
    return(NULL)
  }
  xe1 <- xe[isdrug]
  # entities carrying a CUI have one extra field, so they are bound separately
  xe5 <- vapply(xe1, `[`, character(1), 5)
  hascui <- grepl('^cui', xe5)
  dcui <- do.call(rbind, xe1[hascui])
  doth <- do.call(rbind, xe1[!hascui])
  drug <- data.frame(name = sub('ne=', '', dcui[,6]),
                     start = as.integer(dcui[,2]),
                     stop = as.integer(dcui[,3]), stringsAsFactors = FALSE)
  info <- data.frame(name = sub('ne=', '', doth[,5]),
                     start = as.integer(doth[,2]),
                     stop = as.integer(doth[,3]),
                     type = sub('semantic=', '', doth[,4]),
                     stringsAsFactors = FALSE)
  # drug mentions without a CUI are still drug names
  drug <- rbind(drug, info[info[,'type'] == 'drug', 1:3])
  drug <- drug[order(drug[,'start']),]
  info <- info[info[,'type'] != 'drug',]

  # nothing to report when only attribute entities (no drug names) were found
  l <- nrow(drug)
  if(l == 0) return(NULL)

  if(!is.null(drel)) {
    # attach the extracted expression to each relation via its start position;
    # merge() also orders the attributes by position within the note
    validinfo <- merge(drel, info[,c('name','start')], by.y = 'start', by.x = 'infostart')
    iid <- sort(unique(validinfo[,'drugstart']))
    distr <- cbind(entity = validinfo[,'type'], str = do.call(paste, c(validinfo[,c('name','infostart','infostop')], sep = ':')))
    # one matrix of (entity, str) rows per anchoring drug, in the same order as iid
    druginfo <- split.data.frame(distr, validinfo[,'drugstart'])
  } else {
    iid <- integer(0)
    druginfo <- NULL
  }

  # reshape data: one row per drug mention, one column per entity
  drugname <- paste(drug[,'name'], drug[,'start'], drug[,'stop'], sep = '::')
  cast <- character(l)
  strength <- cast
  dose <- cast
  freq <- cast
  route <- cast
  ix <- match(drug[,'start'], iid)
  # only drugs that anchor at least one relation have attributes to fill in
  for(i in which(!is.na(ix))) {
    di <- druginfo[[ix[i]]]
    attr.s <- di[di[,'entity'] == 'drug::STRENGTH', 'str']
    attr.d <- di[di[,'entity'] == 'drug::DOSEAMT', 'str']
    attr.f <- di[di[,'entity'] == 'drug::FREQ', 'str']
    attr.r <- di[di[,'entity'] == 'drug::RUT', 'str']
    strength[i] <- medxnEntityFormat(attr.s)
    dose[i] <- medxnEntityFormat(attr.d)
    freq[i] <- medxnEntityFormat(attr.f)
    route[i] <- medxnEntityFormat(attr.r)
  }
  x <- data.frame(filename = basename(filename), drugname, strength, dose, route, freq, stringsAsFactors = FALSE)
  data.table::as.data.table(x)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.