# R/parseClamp.R
# In EHR: Electronic Health Record (EHR) Data Processing and Analysis Tool

# Documented in parseCLAMP

#' Parse CLAMP NLP Output
#'
#' Takes files with the raw medication extraction output generated by the CLAMP
#' natural language processing system and converts it into a standardized format.
#'
#' Output from different medication extraction systems is formatted in different ways.
#' In order to be able to process the extracted information, we first need to convert
#' the output from different systems into a standardized format. Extracted expressions
#' for various drug entities (e.g., drug name, strength, frequency, etc.) each receive
#' their own column formatted as "extracted expression::start position::stop position".
#' If multiple expressions are extracted for the same entity, they will be separated by
#' backticks.
#'
#' CLAMP output files anchor extractions to a specific drug name extraction through
#' semantic relations.
#'
#' See EHR Vignette for Extract-Med and Pro-Med-NLP as well as Dose Building Using Example Vanderbilt EHR Data for details.
#'
#' @param filename File name for a single file containing CLAMP output.
#'
#' @return A data.table object with columns for filename, drugname, strength, dose, route,
#' and freq. The filename column contains the file name corresponding to the clinical
#' note. Each of the entity columns is of the format
#' "extracted expression::start position::stop position". Returns \code{NULL} if the
#' file is empty or contains no drug entities.
#' @export

parseCLAMP <- function(filename) {
  # Read the whole file; on.exit guarantees the connection is closed even if
  # readLines fails.
  con <- file(filename, 'r', blocking = TRUE)
  on.exit(close(con), add = TRUE)
  lines <- readLines(con)
  if(length(lines) == 0) return(NULL)

  # Each line is tab-delimited; the first field identifies the record type.
  x <- strsplit(lines, '\t')
  type <- vapply(x, `[`, character(1), 1)
  # Blank lines produce an NA type. `%in%` never returns NA, so such lines are
  # dropped instead of being turned into NULL list elements by NA indexing.
  xe <- x[type %in% 'NamedEntity']

  # Relation records: keep those whose 7th field starts with 'drug'.
  # Field 7 is used as the attribute type, fields 2-3 as the drug span and
  # fields 5-6 as the attribute span.
  if('Relation' %in% type) {
    xr <- x[type %in% 'Relation']
    rel <- do.call(rbind, xr)
    drel <- data.frame(rel[grep('^drug', rel[,7]), c(7,2,3,5,6), drop = FALSE], stringsAsFactors = FALSE)
    names(drel) <- c('type', 'drugstart', 'drugstop', 'infostart', 'infostop')
  } else {
    drel <- NULL
  }

  # Keep only entities whose semantic field (4th) refers to a drug or one of
  # its attributes (e.g. 'semantic=drug::STRENGTH').
  sem <- vapply(xe, `[`, character(1), 4)
  isdrug <- grepl('=drug', sem)
  if(!any(isdrug)) {
    return(NULL)
  }
  xe1 <- xe[isdrug]

  # Entities carrying a 'cui' in field 5 have their text in field 6; all others
  # have their text in field 5. Either group may be empty, in which case
  # do.call(rbind, list()) is NULL and the data.frames below have zero rows.
  xe5 <- vapply(xe1, `[`, character(1), 5)
  hascui <- grepl('^cui', xe5)
  dcui <- do.call(rbind, xe1[hascui])
  doth <- do.call(rbind, xe1[!hascui])

  # Positions are stored as integers: a double such as 100000 is converted to
  # "1e+05" by paste()/match(), which would corrupt the output string and break
  # the lookup against the (character) relation offsets.
  drug <- data.frame(name = sub('ne=', '', dcui[,6]),
                     start = as.integer(dcui[,2]),
                     stop = as.integer(dcui[,3]), stringsAsFactors = FALSE)
  info <- data.frame(name = sub('ne=', '', doth[,5]),
                     start = as.integer(doth[,2]),
                     stop = as.integer(doth[,3]),
                     type = sub('semantic=', '', doth[,4]),
                     stringsAsFactors = FALSE)

  # Drug names without a cui are drug mentions too; everything else in `info`
  # is an attribute (strength, dose, ...).
  drug <- rbind(drug, info[info[,'type'] == 'drug', 1:3])
  # Attribute entities without any drug name: nothing to anchor them to.
  if(nrow(drug) == 0) {
    return(NULL)
  }
  drug <- drug[order(drug[,'start']),]
  info <- info[info[,'type'] != 'drug',]

  if(!is.null(drel)) {
    # Attach the attribute text to each relation via the attribute start offset.
    validinfo <- merge(drel, info[,c('name','start')], by.y = 'start', by.x = 'infostart')
    # `iid` and the split below are both ordered by sorted unique drugstart, so
    # a position in `iid` is also the index into `druginfo`.
    iid <- sort(unique(validinfo[,'drugstart']))
    distr <- cbind(entity = validinfo[,'type'], str = do.call(paste, c(validinfo[,c('name','infostart','infostop')], sep = ':')))
    druginfo <- split.data.frame(distr, validinfo[,'drugstart'])
  } else {
    iid <- numeric(0)
    druginfo <- NULL
  }

  # reshape data: one row per drug mention, one column per entity
  ndrug <- nrow(drug)
  cast <- character(ndrug)
  drugname <- cast
  strength <- cast
  dose <- cast
  freq <- cast
  route <- cast
  ix <- match(drug[,2], iid)
  for(i in seq_len(ndrug)) {
    drugname[i] <- do.call(paste, c(drug[i,], sep='::'))
    if(!is.na(ix[i])) {
      di <- druginfo[[ix[i]]]
      attr.s <- di[di[,'entity'] == 'drug::STRENGTH', 'str']
      attr.d <- di[di[,'entity'] == 'drug::DOSEAMT', 'str']
      attr.f <- di[di[,'entity'] == 'drug::FREQ', 'str']
      attr.r <- di[di[,'entity'] == 'drug::RUT', 'str']
      # medxnEntityFormat is defined elsewhere in the package; presumably it
      # collapses the matches into the documented output format -- confirm.
      strength[i] <- medxnEntityFormat(attr.s)
      dose[i] <- medxnEntityFormat(attr.d)
      freq[i] <- medxnEntityFormat(attr.f)
      route[i] <- medxnEntityFormat(attr.r)
    }
  }
  x <- data.frame(filename = basename(filename), drugname, strength, dose, route, freq, stringsAsFactors = FALSE)
  data.table::as.data.table(x)
}