Nothing
#' Extract Medication Entities From Phrase
#'
#' This function searches a phrase for medication dosing entities of interest. It
#' is called within \code{\link{medExtractR}} and generally not intended for use outside
#' that function. The \code{phrase} argument containing text to search corresponds to an
#' individual mention of the drug of interest.
#'
#' @param phrase Text to search.
#' @param p_start Start position of phrase within original text.
#' @param p_stop End position of phrase within original text.
#' @param unit Unit of measurement for medication strength, e.g. \sQuote{mg}.
#' @param frequency_fun Function used to extract frequency.
#' @param intaketime_fun Function used to extract intake time.
#' @param duration_fun Function used to extract duration.
#' @param route_fun Function used to extract route.
#' @param strength_sep Delimiter for contiguous medication strengths.
#' @param \dots Parameter settings used in extracting frequency and intake time,
#' including additional arguments to the \code{<entity>_fun} arguments. Use \code{frequency_dict},
#' \code{intaketime_dict}, \code{duration_dict}, and \code{route_dict} to identify
#' custom frequency, intake time, duration, and route dictionaries, respectively.
#'
#' @details Various medication dosing entities are extracted within this function
#' including the following:
#'
#' \emph{strength}: The amount of drug in a given dosage form (i.e., tablet, capsule).\cr
#' \emph{dose amount}: The number of tablets, capsules, etc. taken at a given intake time.\cr
#' \emph{dose strength}: The total amount of drug given intake. This quantity would be
#' equivalent to strength x dose amount, and appears similar to strength when
#' dose amount is absent.\cr
#' \emph{frequency}: The number of times per day a dose is taken, e.g.,
#' \dQuote{once daily} or \sQuote{2x/day}.\cr
#' \emph{intaketime}: The time period of the day during which a dose is taken,
#' e.g., \sQuote{morning}, \sQuote{lunch}, \sQuote{in the pm}.\cr
#' \emph{duration}: How long a patient is on a drug regimen, e.g., \sQuote{2 weeks},
#' \sQuote{mid-April}, \sQuote{another 3 days}.\cr
#' \emph{route}: The administration route of the drug, e.g., \sQuote{by mouth},
#' \sQuote{IV}, \sQuote{topical}.\cr
#'
#' Note that extraction of the entities drug name, dose change, and time of last dose are not
#' handled by the \code{extract_entities} function. Those entities are extracted separately
#' and appended to the \code{extract_entities} output within the main \code{medExtractR} function.
#' Strength, dose amount, and dose strength are primarily numeric quantities, and are identified
#' using a combination of regular expressions and rule-based approaches. Frequency, intake time,
#' route, and duration, on the other hand, use dictionaries for identification.
#'
#' By default and when an argument \code{<entity>_fun} is \code{NULL}, the
#' \code{\link{extract_generic}} function will be used to extract that entity. This function
#' can also inherit user-defined entity dictionaries, supplied as arguments \code{<entity>_dict}
#' to \code{\link{medExtractR}} or \code{\link{medExtractR_tapering}} (see documentation files for main function(s) for details).
#'
#' The \code{stength_sep} argument is \code{NULL} by default, but can be used to
#' identify shorthand for morning and evening doses. For example, consider the
#' phrase \dQuote{Lamotrigine 300-200} (meaning 300 mg in the morning and 200 mg
#' in the evening). The argument \code{strength_sep = '-'} identifies
#' the full expression \emph{300-200} as \emph{dose strength} in this phrase.
#'
#' @return data.frame with entities information. At least one row per entity is returned,
#' using \code{NA} when no expression was found for a given entity.\cr
#' The \dQuote{entity} column of the output contains the formatted label for that entity, according to
#' the following mapping.\cr
#' strength: \dQuote{Strength}\cr
#' dose amount: \dQuote{DoseAmt}\cr
#' dose strength: \dQuote{DoseStrength}\cr
#' frequency: \dQuote{Frequency}\cr
#' intake time: \dQuote{IntakeTime}\cr
#' duration: \dQuote{Duration}\cr
#' route: \dQuote{Route}\cr
#' Sample output for the phrase \dQuote{Lamotrigine 200mg bid} would look like:\cr
#' \tabular{rr}{
#' entity \tab expr\cr
#' IntakeTime \tab <NA>\cr
#' Strength \tab <NA>\cr
#' DoseAmt \tab <NA>\cr
#' Route \tab <NA>\cr
#' Duration \tab <NA>\cr
#' Frequency \tab bid;19:22\cr
#' DoseStrength \tab 200mg;13:18
#' }
#'
#' @export
#'
#' @examples
#' note <- "Lamotrigine 25 mg tablet - 3 tablets oral twice daily"
#' extract_entities(note, 1, nchar(note), "mg")
#' # A user-defined dictionary can be used instead of the default
#' my_dictionary <- data.frame(c("daily", "twice daily"))
#' extract_entities(note, 1, 53, "mg", frequency_dict = my_dictionary)
extract_entities <- function(phrase, p_start, p_stop, unit, frequency_fun = NULL,
intaketime_fun = NULL, duration_fun = NULL, route_fun = NULL,
strength_sep = NULL, ...){
p_start <- p_start-1
# censor date expressions
phrase <- internal_censor_dates(phrase)
# generic extraction
xtra_args <- list(...)
ent_types <- sub('_dict', '', grep('_dict', names(xtra_args), value = TRUE))
oth_types <- setdiff(ent_types, c('duration','frequency','intaketime','route'))
oth_l <- length(oth_types)
oth_ent <- vector('list', oth_l)
if(oth_l) {
for(i in seq(oth_l)) {
ent_type <- oth_types[i]
oth_args <- list(phrase = phrase, type = ent_type, fun = NULL, ...)
# use_fun <- match(sprintf('%s_fun', ent_type), names(xtra_args))
df <- do.call(extract_type, oth_args)
my_ent <- entity_metadata(phrase, p_start, df)
oth_ent[[i]] <- data.frame(entity = ent_type, expr = my_ent)
}
}
### DURATION ####
df <- extract_type(phrase, 'duration', duration_fun, ...)
duration <- entity_metadata(phrase, p_start, df)
pfn <- internal_find_numbers(phrase,
"\\.?\\d+(\\.\\d+)?",
"\\s?(%|dose(s?)|hours|hrs|weeks|wks|days|years|yrs|a(m?)|p(m?))(?![a-zA-Z0-9])",
c("one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten")
)
all_numbers <- pfn$all_numbers
num_positions <- pfn$num_positions
tn_expr <- pfn$tn_expr
tn_pos <- pfn$tn_pos
if(length(all_numbers) == 0) {
strength <- NA;doseamt <- NA;dosestr <- NA
remaining_numbers <- all_numbers
num_pos <- num_positions
} else { # only look for entities if they exist
### STRENGTH ####
unit_len <- nchar(unit)
phrase_lc <- tolower(phrase)
# check if any numbers are followed by unit
strength <- mapply(function(an, np){
# Need narrow window in case same number appears both with and without unit
r <- regexpr(paste0(an, "(\\s+)?", unit), substr(phrase_lc, np, np + nchar(an) + unit_len + 3))
if(r == -1){c(NA)}else{
stp <- np + attributes(r)$match.length
paste(substr(phrase, np, stp-1), paste(np + p_start, stp + p_start, sep = ":"), sep = ";")
}
}, an = all_numbers, np = num_positions, USE.NAMES=FALSE)
# Numeric expressions that still need to be examined
remaining_numbers <- all_numbers[which(is.na(strength))]
num_pos <- num_positions[which(is.na(strength))]
# Keep only non-NA values
strength <- strength[!is.na(strength)]
if(length(strength)==0){strength <- NA}
### DOSEAMT ###
if(length(remaining_numbers) == 0) {
doseamt <- NA
} else {
doseamt <- mapply(function(rn, np){
# substr is used to define narrow search windows
# this helps prevent overlap between adjacent numeric values
da <- regexpr(paste0(rn, "(?=(\\s+)?(\\(\\w*\\)\\s+)?tabs)"),
replace_tab(substr(phrase, np, np+15)), perl=T)
# "take" notation
if(da == -1){da_expr <- stringr::str_extract(substr(phrase, max(1, np-8), np+nchar(rn)),
paste0("(?<=(take|takes|taking)(\\s?))", rn))
if(!is.na(da_expr)){da <- regexpr(rn, substr(phrase, np, np+nchar(rn)))}}
# parenthetical notation
if(da == -1){da <- regexpr(paste0("(?<=[(])", rn, "(?=[)])"),
substr(phrase, max(1, np-2), np+nchar(rn)+1), perl=T)}
# number immediately after strength/dose mention, but not with another number immeiately after
if(da == -1){da_expr <- regexpr(paste0("(?<=", unit, ")\\)?\\s", rn, "\\s(?!(\\d|hours?|hrs?))"),
substr(phrase, max(1, np-8), np+nchar(rn)+7),perl=T)
if(da_expr!=-1){da <- regexpr(rn, substr(phrase, np, np+nchar(rn)))}}
if(da == -1){da <- NA}else{
stp <- np + attributes(da)$match.length
da <- paste(substr(phrase, np, stp-1), paste(np + p_start, stp + p_start, sep = ":"), sep = ";")
}
return(da)
}, rn = remaining_numbers, np = num_pos, USE.NAMES=FALSE)
# Numeric expressions that still need to be examined
remaining_numbers <- remaining_numbers[which(is.na(doseamt))]
num_pos <- num_pos[which(is.na(doseamt))]
# Keep only non-NA values
doseamt <- doseamt[!is.na(doseamt)]
if(length(doseamt)==0){doseamt <- NA}
}
# text numbers for doseamt
if(length(tn_expr) > 0) {
text_doseamt <- mapply(function(tne, tnp){
# tablet notation
da <- regexpr(paste0(tne, "(?=(\\s+)?(\\(\\w*\\)\\s+)?tabs)"),
replace_tab(substr(phrase_lc, tnp, tnp+15)), perl=T)
# "take" notation
if(da == -1){da_expr <- stringr::str_extract(substr(phrase_lc, max(1, tnp-8), tnp+nchar(tne)),
paste0("(?<=(take|takes|taking)(\\s?))", tne))
if(!is.na(da_expr)){da <- regexpr(tne, substr(phrase_lc, tnp, tnp+nchar(tne)))}}
# parenthetical notation
if(da == -1){da <- regexpr(paste0("(?<=[(])", tne, "(?=[)])"),
substr(phrase_lc, max(1, tnp-2), tnp+nchar(tne)+1), perl=T)}
if(da == -1){da <- NA}else{
stp <- tnp + attributes(da)$match.length
da <- paste(substr(phrase_lc, tnp, stp-1), paste(tnp + p_start, stp + p_start, sep = ":"), sep = ";")
}
return(da)
}, tne = tn_expr, tnp = tn_pos)
text_doseamt <- text_doseamt[!is.na(text_doseamt)]
if(length(text_doseamt) > 0){
if(any(is.na(doseamt))){doseamt <- text_doseamt}else{doseamt <- c(doseamt, text_doseamt)}
}
}
## DOSE ##
dosestr <- NA
if(!is.null(strength_sep)) {
if(length(remaining_numbers) > 0) {
# Cases where times of doses are denoted as ##-##
num_end <- num_pos + nchar(remaining_numbers)
# Check if there is only distance of 1 between end of one word and start of the next
maybe_dsc <- which(num_pos[-1]-num_end[-length(num_pos)] == 1)
is_dsc <- sapply(maybe_dsc, function(j){
substr(phrase, num_end[j], num_pos[j+1]-1)
}) %in% strength_sep
dsc_index <- sort(c(maybe_dsc[c(is_dsc)], maybe_dsc[c(is_dsc)]+1))
# Add to dose results
if(length(dsc_index) > 0) {
# Extract full expression - allow for variable number of dose separated by marker (e.g. x/x or x/x/x)
rpt <- which(dsc_index==c(NA,dsc_index[1:(length(dsc_index)-1)]))
dsci <- if(length(rpt)>0){dsc_index[-c(rpt-1,rpt)]}else{dsc_index}
# right now assumes only one dose expression like this in phrase (e.g. wouldn't account for "y/y" in "drug name x/x then y/y")
dsc_split <- remaining_numbers[dsci]
dsc_split_pos <- num_pos[dsci]
dsc <- paste(substr(phrase, dsc_split_pos[1], dsc_split_pos[2]+nchar(dsc_split[2])-1),
paste(dsc_split_pos[1]+ p_start,
dsc_split_pos[2]+nchar(dsc_split[2])+ p_start, sep=":"), sep=";")
num_pos <- num_pos[setdiff(1:length(remaining_numbers), dsc_index)]
remaining_numbers <- remaining_numbers[setdiff(1:length(remaining_numbers), dsc_index)]
dosestr <- dsc
}
}
}
}
# non-numeric entities
### FREQ ####
df <- extract_type(phrase, 'frequency', frequency_fun, ...)
freq <- entity_metadata(phrase, p_start, df)
### INTAKETIME ###
df <- extract_type(phrase, 'intaketime', intaketime_fun, ...)
intaketime <- entity_metadata(phrase, p_start, df)
### ROUTE ####
df <- extract_type(phrase, 'route', route_fun, ...)
route <- entity_metadata(phrase, p_start, df)
## BACK TO DOSE ##
# This is for cases where we have drug_name # freq, and # is dose
if(!exists("dosestr")){dosestr <- NA}
if(!all(is.na(freq)) & length(remaining_numbers) > 0) {
# start position from frequency
freq_sp <- as.numeric(sub('[^;]*;([0-9]+):.*', '\\1', freq))
# if format is drug_name rn freq, classify as dose
dsc <- mapply(function(rn, np){
# get position of this number relative to the whole note
np_note_pos <- c(np + p_start, np + p_start + nchar(rn)) # start, stop
if((np_note_pos[1] - p_stop <= 2) & # Allows for 0-1 separating characters
any(freq_sp - np_note_pos[2] <= 2)){
paste(substr(phrase, np, np+nchar(rn)-1), paste(np + p_start, np+nchar(rn) + p_start, sep = ":"), sep = ";")
}else{NA}
}, rn = remaining_numbers, np = num_pos, USE.NAMES=FALSE)
# If anything found, remove from further examination
if(sum(!is.na(dsc)) > 0){
# Keep the ones with NA (not recognized as Dose)
remaining_numbers <- remaining_numbers[is.na(dsc)]
num_pos <- num_pos[is.na(dsc)]
}
dsc <- dsc[!is.na(dsc)]
# Add to dose results
if(length(dsc) > 0){
if(all(is.na(dosestr))){dosestr <- dsc}else{dosestr <- c(dosestr, dsc)}
}
}
## BACK TO DOSEAMT ##
## Try to classify other numbers found among existing entities ##
if(length(remaining_numbers) > 0){
# Find last position of any found entities
ent_list <- list(freq, intaketime, strength, doseamt, dosestr)
last_pos_byent <- sapply(ent_list, function(x){
y <- gsub(x, pattern = ".+;", replacement = "")
max(as.numeric(gsub(y, pattern = ":.+", replacement = "")))
})
last_pos <- if(all(is.na(last_pos_byent))){NA}else{max(last_pos_byent, na.rm=T)}
# If remaining number is in the range of other found entities, record it
keep_num_id <- which(num_pos < (last_pos - p_start))
if(length(keep_num_id) > 0){
rnums <- remaining_numbers[keep_num_id]
npos <- num_pos[keep_num_id]
# probably not doseamt if over 10
da_ids <- which(as.numeric(rnums) <= 10)
if(length(da_ids)>0){
add_das <- sapply(da_ids, function(i){
stp <- npos[i] + nchar(rnums[i])
paste(substr(phrase, npos[i], stp-1),
paste(npos[i] + p_start, stp + p_start, sep = ":"), sep = ";")
})
# Numeric expressions that still need to be examined
remaining_numbers <- remaining_numbers[!da_ids]
num_pos <- num_pos[!da_ids]
if(all(is.na(doseamt))){doseamt <- add_das}else{doseamt <- c(doseamt, add_das)}
}
}
}
## ONE FINAL LOOK AT DOSE ##
# Reclassify strength as dose when necessary
keep_str <- sapply(strength, function(st){
## strength expression is in parentheses
# expression
expr <- sub(";.+", "", st)
# positions
bp1 <- sub(":.+", "", st)
bp <- as.numeric(sub(".+;", "", bp1)) - p_start
ep <- as.numeric(sub(".+:", "", st)) - p_start
in_paren <- grepl(paste0("(?<=[(])", expr, "((\\s\\w\\s)?tabs)?"),
replace_tab(substr(phrase, bp-1, ep+20)), perl=T)
# doseamt occurs right before strength (allow <=1 in case number is in parentheses)
after_da <- any(abs((bp+p_start-1) - as.numeric(sapply(doseamt, gsub,
pattern = ".+:",
replacement = ""))) <= 1)
# strength has tablet after it, not necessarily in parentheses
before_tab <- grepl(paste0(expr, "(\\s\\w\\s)?tabs"),
replace_tab(substr(phrase, bp-1, ep+20)), perl=T)
return(in_paren | after_da | before_tab)
})
str_holdout <- strength[keep_str]
if(all(is.na(doseamt))){ # If doseamt missing, reclassify strength as dose. works even if strength=NA
if(all(is.na(dosestr))){dosestr <- strength}else{dosestr <- c(dosestr, strength)}
strength <- NA
} else {
if(!all(is.na(strength))) {
# If both are non-missing - get start positions
da_sp <- as.numeric(stringr::str_extract(doseamt, "(?<=;).+(?=:)"))
str_sp <- as.numeric(stringr::str_extract(strength, "(?<=;).+(?=:)"))
df_sp <- data.frame(sp = c(da_sp, str_sp),
ent = c(rep("da", length(da_sp)), rep("str", length(str_sp))))
df_sp <- df_sp[order(df_sp[,'sp']),]
# entity indicators
is_str <- which(df_sp$ent=="str")
is_da <- which(df_sp$ent=="da")
# if doseamt isn't after strength, should be dose
is_dose <- sapply(is_str, function(i){
!any(is_da == i + 1)
})
if(any(is_dose)) {
dosestr <- strength[is_dose]
strength <- ifelse(all(is_dose), NA, strength[!is_dose])
}
}
}
# Things that should not have been changed over
if(length(str_holdout) > 0){
# remove from dose
switch_back <- dosestr %in% str_holdout
dosestr <- dosestr[!switch_back]
if(length(dosestr)==0){dosestr <- NA}
# put back into strength
if(all(is.na(strength))){
strength <- str_holdout
}else{
strength <- c(strength, str_holdout)
}
}
#### Building results ###
# If no strength/dose was found, then set all values to NA (only want when associated dose info is present)
if(all(sapply(list(strength, doseamt, dosestr, duration), function(x) all(is.na(x))))){
return(data.frame("entity" = c("Frequency", "IntakeTime", "Strength", "DoseAmt", "DoseStrength"),
"expr" = rep(NA, 5)))
}
ent_res <- list("Frequency" = freq, "IntakeTime" = intaketime,
"Strength" = strength, "DoseAmt" = doseamt, "DoseStrength" = dosestr,
"Duration" = duration, "Route" = route)
entities <- c("Frequency", "IntakeTime", "Strength", "DoseAmt", "DoseStrength", "Duration", "Route")
lf <- sum(!is.na(freq))
lit <- sum(!is.na(intaketime))
lstr <- sum(!is.na(strength))
lda <- sum(!is.na(doseamt))
lds <- sum(!is.na(dosestr))
ldur <- sum(!is.na(duration))
lrt <- sum(!is.na(route))
not_found <- entities[which(c(lf, lit, lstr, lda, lds, ldur, lrt) == 0)]
found <- setdiff(entities, not_found)
res_f <- NULL
res_nf <- NULL
if(length(not_found) > 0) {
res_nf <- data.frame(entity = not_found, expr = NA_character_)
}
if(length(found) > 0) {
found_res <- ent_res[names(ent_res) %in% found]
fr <- vector('list', length(found_res))
for(i in seq_along(found_res)) {
fr[[i]] <- data.frame(entity = found[i], expr = found_res[[i]])
}
# include other generic entities
res_f <- do.call(rbind, c(fr, oth_ent))
}
rbind.data.frame(res_nf, res_f)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.