R/parseMedExtractR.R
In EHR: Electronic Health Record (EHR) Data Processing and Analysis Tool

Documented in parseMedExtractR

#' Parse medExtractR NLP Output
#'
#' Takes files with the raw medication extraction output generated by the medExtractR
#' natural language processing system and converts it into a standardized format.
#'
#' Output from different medication extraction systems is formatted in different ways.
#' In order to be able to process the extracted information, we first need to convert
#' the output from different systems into a standardized format. Extracted expressions
#' for various drug entities (e.g., drug name, strength, frequency, etc.) each receive
#' their own column formatted as "extracted expression::start position::stop position".
#' If multiple expressions are extracted for the same entity, they will be separated by
#' backticks.
#'
#' The medExtractR system returns extractions in a long table format, indicating the
#' entity, extracted expression, and start:stop position of the extraction. To perform
#' this initial parsing, entities are paired with the closest preceding drug name. The
#' one exception to this is the dose change entity, which can occur before the drug name
#' (see Weeks, et al. 2020 for details). 
#'
#' See EHR Vignette for Extract-Med and Pro-Med-NLP as well as Dose Building Using Example Vanderbilt EHR Data for details.
#'
#' @param filename File name for a single file containing medExtractR output.
#'
#' @return A data.table object with columns for filename, drugname, strength, dose, route,
#' freq, dosestr, dosechange and lastdose. The filename contains the file name
#' corresponding to the clinical note. Each of the entity columns are of the format
#' "extracted expression::start position::stop position".
#'
#' @examples
#' mxr_output <- system.file("examples", "lam_mxr.csv", package = "EHR")
#' mxr_parsed <- parseMedExtractR(mxr_output)
#' mxr_parsed
#' @export

parseMedExtractR <- function(filename) {
  ndig <- options(digits = 15)
  on.exit(options(ndig))
  # NSE fix for R CMD CHECK
  pos <- NULL
  expr <- NULL
  entity <- NULL
  # end
  df <- fread(filename, stringsAsFactors = FALSE)
  rm(filename)
  ix <- which(df[['entity']] == 'DrugName')
  bord <- c(ix[-1]-1, nrow(df))
  l <- length(ix)
  init <- character(l)
  filename <- init
  drugname <- init
  strength <- init
  dose <- init
  route <- init
  freq <- init
  # add potential attributes
  lastdose <- init
  dosestr <- init
  dosechange <- init
#   dc.ix <- which(df[['entity']] == 'DoseChange')
  # negative exemption shouldn't be necessary
  dc.ix <- which(df[['entity']] == 'DoseChange' & !grepl('^(-|#)', df[['pos']]))
  # restrict to closest DrugName in same note
  # complicated because note number potentially overflows
  startLoc <- function(x) sprintf("%06d", as.numeric(sub(':.*', '', x)))
  fn <- df[ix,filename]
  fileID <- unclass(as.factor(fn)) * 10
  dn.row <- as.numeric(paste0(fileID, '.', startLoc(df[ix,pos])))
  if(length(dc.ix)) {
    dcFileId <- fileID[match(df[dc.ix,filename], fn)]
    dc.row <- as.numeric(paste0(dcFileId, '.', startLoc(df[dc.ix,pos])))
    dc.match <- ix[vapply(dc.row, function(i) which.min(abs(i - dn.row)), numeric(1))]
  } else {
    dc.match <- NA
  }
  # are there any duplicates?
  for(i in seq(l)) {
    tmp <- df[seq(ix[i], bord[i])]
    filename[i] <- tmp[1, filename]
#     filename[i] <- tmp[1, paste(grid, date, note, sep = '_')]
    drugname[i] <- tmp[1, paste(expr, pos, sep = ':')]
    # if ix[i] is in dc.match, attach DoseChange
    dcfound <- match(ix[i], dc.match)
    if(!is.na(dcfound)) {
      attr.ds <- df[dc.ix[dcfound], paste(expr, pos, sep = ':')]
      dosechange[i] <- medxnColonFormat(attr.ds)
    }
    attr.s <- tmp[entity == 'Strength', paste(expr, pos, sep = ':')]
    attr.d <- tmp[entity == 'DoseAmt', paste(expr, pos, sep = ':')]
    attr.f <- tmp[entity == 'Frequency', paste(expr, pos, sep = ':')]
    attr.t <- tmp[entity == 'IntakeTime', paste(expr, pos, sep = ':')]
    attr.da <- tmp[entity == 'Dose', paste(expr, pos, sep = ':')]
    attr.ld <- tmp[entity == 'LastDose', paste(expr, pos, sep = ':')]
    if(length(attr.t)) {
      attr.f <- c(attr.f, attr.t)
    }
    strength[i] <- medxnEntityFormat(attr.s)
    dose[i] <- medxnEntityFormat(attr.d)
    freq[i] <- medxnEntityFormat(attr.f)
    dosestr[i] <- medxnEntityFormat(attr.da)
    lastdose[i] <- medxnEntityFormat(attr.ld)
  }
  # need double-colon
  drugname <- medxnColonFormat(drugname)
  # if all dosechange or lastdose are missing, drop column?
  x <- data.frame(filename, drugname, strength, dose, route, freq, dosestr, dosechange, lastdose, stringsAsFactors = FALSE)
  data.table::as.data.table(x)
}