Nothing
# Code for running the dose conversion algorithm
# Incorporates the VB 'partdata' functions.
# Instead of partdata, the dosage string is stored in
# an R character vector.
doseconvert <- function(text, textid = seq_along(text), dosage_mg = NULL,
simplify = TRUE, singlewords = NULL, multiwords = NULL,
patterns = NULL, maxifchoice = TRUE, usebuiltinlookups = TRUE,
customlookups = NULL, cores = 1, noisy = FALSE){
# converts a set of dosages. dosestrings must be a character vector
# and ids should be a vector of unique IDs.
# Cannot use this in noisy mode.
# Arguments:
# text = vector of dosestrings to analyse
# textid = vector of identifiers for dosestrings
# dosage_mg = vector of mg strength for dosage units. This is used
# to calculate an additional column in the output 'daily_mg'
# which is the daily dose in mg. If NULL, the 'daily_mg'
# column is omitted from the output.
# simplify = whether to return only a single row per dosestring
# patterns = patterns lookup, loaded from data or imported from
# parent / global environment if not available
# multiwords = multiwords lookup, loaded from data or imported from
# parent / global environment if not available
# singlewords = singlewords lookup, loaded from data or imported from
# parent / global environment if not available
# lookups = lookup table for common dosages to avoid the need to
# interpret
# maxifchoice = whether to report the maximum if choice of dose,
# otherwise report the average
# usebuiltinlookups = whether to use the built-in table of common
# interpreted dosages that can be looked up rather than
# re-interpreted
# cores = whether to use multiple cores (using parallel)
# customlookups = a table of custom lookups
# noisy = whether to print debug information
# For testing:
# textid = NULL; simplify = TRUE; singlewords = NULL; multiwords = NULL;
# patterns = NULL; uselookups = TRUE; lookups = NULL;
# customlookups = NULL; cores = 1
originalorder <- doseid <- dosestrings <- NULL
lookups <- NULL
if (!is.null(customlookups)){
custom <- NULL
try(custom <- as.drugdose_lookups(customlookups))
if (is.null(custom)){
warning('Custom lookup table ignored as it is in incorrect format.')
if (usebuiltinlookups){
# Load lookups
lookups <- loadDict(lookups, 'lookups')
}
} else {
if (usebuiltinlookups){
# Load lookups and append customlookups
lookups <- loadDict(lookups, 'lookups')
lookups <- rbind(lookups, custom)
} else {
# Only custom lookups are available
lookups <- custom
}
setkey(lookups, text)
}
} else {
if (usebuiltinlookups){
lookups <- loadDict(lookups, 'lookups')
}
}
# If no lookups table is in use, lookups remains NULL
singlewords <- loadDict(singlewords, 'singlewords')
multiwords <- loadDict(multiwords, 'multiwords')
patterns <- loadDict(patterns, 'patterns')
# Set up for multiprocessor analysis
if (cores > 1 & 'parallel' %in% loadedNamespaces() &
Sys.info()['sysname'] == 'Linux'){
options(mc.cores = cores)
multicore = TRUE
} else {
multicore = FALSE
}
indata <- data.table(dosestrings = as.character(text), textid = textid)
# Store the original order of the input data so that the output can
# match it
indata[, originalorder := 1:.N]
# doseid is the ID for a unique dose string
indata[, doseid := .GRP, by = dosestrings]
# Generate a unique list of dosestrings to make analysis faster
uniquedoses <- indata[, list(doseid = doseid[1]),
by = dosestrings]
analysis <- function(text){
qty <- freq <- tot <- time <- change <- choice <- duration <- NULL
daily_dose <- NULL
x <- trim(tolower(text))
if (!is.null(lookups)){
result <- lookups[x, .(qty, units, freq, tot, max, time,
change, choice, duration, daily_dose), nomatch = 0]
result[, order := rep(1L, nrow(result))]
} else {
# Zero-row result if no lookup
result <- data.frame(qty = numeric(0), units = character(0),
freq = numeric(0), tot = numeric(0),
max = factor(integer(0), c("max", "average", "exact")),
time = numeric(0), change = factor(integer(0),
c("first", "second", "nochange", "combined")),
choice = factor(integer(0), c("choice", "asneeded",
"nochoice")), duration = numeric(0), order = integer(0),
daily_dose = numeric(0))
}
if (nrow(result) == 0){
# wrap in 'try' to avoid stalling if there is an unexpected error
try(result <- interpret(x, singlewords = singlewords,
multiwords = multiwords, patterns = patterns,
simplify = simplify, maxifchoice = maxifchoice,
noisy = noisy))
}
result
}
# Analyse the dose strings
if (multicore){
results <- parallel::mclapply(uniquedoses$dosestrings, analysis)
} else {
results <- lapply(uniquedoses$dosestrings, analysis)
}
# Combine the results
results <- lapply(seq_along(results), function(x){
cbind(results[[x]], data.frame(doseid =
rep(uniquedoses[x, doseid], nrow(results[[x]]))))
})
results <- as.data.table(do.call('rbind', results))
setkey(indata, doseid)
setkey(results, doseid)
results <- results[indata]
# If the original textid were numeric, make them numeric in the output
if (is.numeric(textid)){
results[, textid := as.numeric(textid)]
}
# Order the output
setkey(results, originalorder, order)
# Remove temporary doseid and other unnecessary variables
results[, c('originalorder', 'doseid') := NULL]
if ('text' %in% colnames(results)) results[, text := NULL]
if ('i.textid' %in% colnames(results)) results[, i.textid := NULL]
setnames(results, 'dosestrings', 'text')
setcolorder(results, c('textid', 'order', 'text', 'qty',
'units', 'freq', 'tot', 'max', 'time', 'change',
'choice', 'duration', 'daily_dose'))
cat('\nAnalysed', length(text), 'dosage texts.\n')
if (!is.null(dosage_mg)){
try(results <- calculate_mg(results, dosage_mg))
}
results[]
}
calculate_mg <- function(results, dosage_mg){
# Arguments:
# results = output results, as from doseconvert
# dosage_mg = numeric vector with the same length as number
# of rows in results, with the strength of each dose unit
# in mg (e.g. if the dose unit is a 500mcg tablet,
# it will be 0.5). It should be missing if not
# meaningful (e.g. for creams)
# Adds a column 'daily_mg' to the output
daily_mg <- daily_dose <- NULL
if (length(dosage_mg) != nrow(results)){
stop(paste0('dosage_mg has ', length(dosage_mg),
'elements but results has ', nrow(results), ' rows'))
}
results[, daily_mg := as.numeric(NA)]
# Add dosage_mg column to results for ease of calculation
results[, dosage_mg := dosage_mg]
# Units that are mg quantities
results[units == 'mg', daily_mg := daily_dose]
results[units == 'gram', daily_mg := daily_dose * 1000]
results[units == 'mcg', daily_mg := daily_dose / 1000]
# Units that are dose units (or blank) e.g. tablets, capsules,
# metered dose sprays / puffs - not drops
results[units %in% c('', 'neb', 'pv', 'pre', 'puff', 'spray',
'suppos', 'sachet', 'blister', 'amp', 'pill', 'cap', 'tab'),
daily_mg := dosage_mg * daily_dose]
results[, dosage_mg := NULL]
results[]
}
interpret <- function(instring, singlewords, multiwords,
patterns, noisy = FALSE, simplify = FALSE, maxifchoice = TRUE,
id = NULL){
# returns a data.frame with one (if simplify=TRUE) or
# potentially many observations if there are multiple time
# intervals with different doses.
# If noisy=TRUE, write debug information to console
# STAGE 0. Initialisation
pd <- c('start', pdInit(instring))
if (noisy) cat('\n====================\nInitialising: ',
paste(pd, collapse = ' '))
# STAGE 1. singlewords checking and replacement
# only up to 1 word will be wordspace'd
pd_numeric <- isNumeric(pd)
replace <- singlewords[pd]
unmatched <- is.na(replace) & !pd_numeric
# Attempt to wordspace the largest unmatched word
getlength <- nchar(pd[unmatched])
if (length(getlength) > 0L) {
# there is at least one unmatched word, so try wordspace
dothis <- which(nchar(pd) == max(nchar(pd[unmatched])))[1]
replacement <- wordspace(pd[dothis], singlewords = singlewords)
if (!all(replacement == '')) {
# do the replacement, and re-check against dictionaries
pd <- c(pd[1L:(dothis - 1L)], replacement,
pd[(dothis + 1L):length(pd)])
pd_numeric <- isNumeric(pd)
replace <- singlewords[pd]
unmatched <- is.na(replace) & !pd_numeric
}
}
remove <- (unmatched | replace == ',' |
(is.na(replace) & !pd_numeric))
remove[is.na(remove)] <- FALSE
# replace words (not numeric entries)
pd[!pd_numeric] <- replace[!pd_numeric]
if (noisy) cat('\nAfter singlewords replacement: ',
paste(pd, collapse=' '))
# STAGE 2. Remove any unnecessary words or words between
# two unnecessary words
if (length(pd) == 3L) {
if (identical(remove, c(TRUE, FALSE, TRUE))) {remove <- rep(TRUE, 3L)}
} else if (length(pd) > 3L) {
# look for sandwiched words
remove <- remove | c(FALSE, remove[1L:(length(remove) - 2L)] &
remove[3L:length(remove)], FALSE)
}
pd <- pd[!remove]
# create a single string for numbers replacement and
# dictionary matching
pds <- ' ' %&% paste(pd, collapse = ' ') %&% ' '
if (noisy) cat('\nAfter singlewords removal: ', pds)
# STAGE 3. Numbers replacement phase one.
# Note that numbersReplace returns whether there was a choice of
# dose. If maxifchoice is TRUE, the maximum dose is returned.
# If maxifchoice is FALSE, the average dose is returned
nrep1 <- numbersReplace(pds, maxifchoice)
pds <- nrep1$pds
if (noisy) cat('\nAfter numbers replace 1: ', pds, " Choice: ",
nrep1$choice)
# STAGE 4. First dictionary
# Dictionaries are now written with standard (Unix) regular
# expressions
# Use gsub to replace all, useBytes=TRUE for faster matching
for (i in 1:nrow(multiwords)){
newpds <- gsub(multiwords[i, 'words'],
multiwords[i, 'replacement'], pds, useBytes=TRUE)
if (pds != newpds) {
if (noisy) cat('\nMultiwords match: ',
multiwords[i, 'words'])
if (noisy) cat('\n... phrase updated to: ', newpds)
pds <- newpds
}
}
# STAGE 5. Numbers replacement phase two.
nrep2 <- numbersReplace(pds, maxifchoice)
pds <- nrep2$pds
choice <- nrep1$choice | nrep2$choice
if (noisy) cat('\nAfter numbers replace 2: ', pds, " Choice: ",
choice, '\n')
# Convert pd string to pd
pd <- strsplit(pds, ' ')[[1]]
pd <- pd[!pd == '']
# STAGE 6.
# Note that doubledose and max are now in the 'patterns' dictionary
# The object is called 'thedose' instead of 'overall2'
# The only modifications made to dose are choice of dose
# (stored as choice)
thedose <- analyseDose(pd, noisy, patterns = patterns)
if (noisy){
cat('\nAfter analyseDose:\n')
print(thedose)
}
if (choice == TRUE & all(thedose$choice %in% c('nochoice', 'asneeded'))){
if (noisy) cat('\nRegistering choice of doses\n')
thedose$choice <- sub('nochoice', 'choice', thedose$choice)
if (maxifchoice){
thedose$max <- rep('max', nrow(thedose))
} else {
thedose$max <- rep('average', nrow(thedose))
}
}
# double dose for 'each ear' or 'each eye'
if (any(thedose$doubledose)) {
if (noisy) cat('\nDoubling dose for both ears / eyes')
thedose$tot <- thedose$tot*2
thedose$qty <- thedose$qty*2
}
# note that the quantity is doubled; this is a change
# from the previous version
if (any(thedose$units == 'fiveml')) {
if (noisy) cat('\nConverting fiveml')
thedose$units <- rep('ml', nrow(thedose))
thedose$tot <- thedose$tot*5
thedose$qty <- thedose$qty*5
}
# Frequency = 0 means that the frequency is not specified
# Increase any time intervals so that minimum frequency is 1
changeme <- (thedose$freq < 1 & thedose$freq > 0)
if (any(changeme)) {
thedose[changeme, 'time'] <-
thedose[changeme, 'time'] / thedose[changeme, 'freq']
thedose[changeme, 'tot'] <-
thedose[changeme, 'tot'] / thedose[changeme, 'freq']
thedose[changeme, 'freq'] <- 1
}
# Simplify into a single dose if required
thedose$id <- id
if (simplify == TRUE & nrow(thedose) > 1) {
if (noisy) cat('\nSimplfying into a single dose.\n')
thedose <- simplifydose(thedose, noisy)
if (!is.null(id)) {
# use ID for row names
row.names(thedose) <- id
}
} else {
# Not simplifying
if (!is.null(id)) {
# use ID.X for row names where X is the row number
row.names(thedose) <- paste(id, 1:nrow(thedose), sep='.')
}
}
# Also add a separate column for dose line order
thedose$order <- 1:nrow(thedose)
# Add daily dose
thedose$daily_dose <- 0
hasdailydose <- !is.na(thedose$tot) & !is.na(thedose$time)
thedose$daily_dose[hasdailydose] <-
thedose$tot[hasdailydose] / thedose$time[hasdailydose]
thedose$daily_dose[thedose$daily_dose == Inf] <- 0
# Convert missing numerical data to zero
thedose$qty[is.na(thedose$qty)] <- 0
thedose$freq[is.na(thedose$freq)] <- 0
thedose$time[is.na(thedose$time)] <- 0
thedose$tot[is.na(thedose$tot)] <- 0
thedose$duration[is.na(thedose$duration)] <- 0
if (noisy) cat('\nFinished analysing dose.\n================\n')
# Remove unnecessary columns
thedose$words <- NULL
thedose$doubledose <- NULL
thedose$link <- NULL
thedose$divpos <- NULL
thedose$priority <- NULL
thedose[]
}
simplifydose <- function(regimens, noisy = FALSE) {
# like combineparts, but combine different dose lines with
# defined durations (e.g. 1 daily for 1 week then 2 daily for 1 week)
# to produce output similar to the initial (VB) version of
# the dose analysis software
while (nrow(regimens) > 1) {
# loop through, combining doses until there is only one dose
if (regimens$link[1] == 'stat') {
# deal with a 'stat' or loading dose
# assume frequency is the same as main part of prescription
if (regimens$time[2] > 0) {
if (regimens$freq[2] > 0) {
# most common situation, hopefully
# frequency is taken from instruction after stat dose
# stat dose assumed to be taken at the same frequency
# as main dose (e.g. stat dose before a three times a
# day dose would be taken 1/3 of a day before)
regimens$duration[1] <-
regimens$time[2] / regimens$freq[2]
regimens$time[1] <- regimens$time[2] / regimens$freq[2]
regimens$freq[1] <- 1
} else {
# second dose does not have frequency
# assume that stat dose section contains frequency
regimens$duration[1] <- regimens$time[2]
regimens$time[1] <- regimens$time[2]
}
} else {
# assume stat dose lasts for 1 day
regimens$freq[1] <- 1
regimens$time[1] <- 1
regimens$duration <- 1
}
if (noisy){
cat('\nAfter preparing stat dose line:\n')
print(regimens[1:2,])
}
}
# combine doses if they both have a duration
# use 2nd dose if one or both have no duration and both are valid
if (regimens$duration[1] > 0 & regimens$duration[2] > 0) {
# 1st dose 2nd dose Combined
# freq, qty freq, qty[same] weighted average freq
# freq, qty freq[same], qty weighted average qty
# freq, qty freq, qty weighted average qty, freq from 1st dose
# freq, qty freq assume 2nd dose qty is the same
# freq, qty qty assume 2nd dose freq is the same
# freq, qty [other] assume 2nd dose is the same as first
# [missing] [any] use freq, qty from 2nd dose
# first fill in missing frequencies, quantities and totals
# (unless both freq and qty are 0, in which case it might be
# intentional)
if (regimens$freq[2] == 0 & regimens$qty[2] > 0){
regimens$freq[2] <- regimens$freq[1]
} else if (regimens$qty[2] == 0 & regimens$freq[2] > 0){
regimens$qty[2] <- regimens$qty[1]
} else if (regimens$freq[1] == 0 & regimens$qty[1] > 0){
regimens$freq[1] <- regimens$freq[2]
} else if (regimens$qty[1] == 0 & regimens$freq[1] > 0){
regimens$qty[1] <- regimens$qty[2]
}
if (noisy){
cat('\nPreparing to combine dose lines:\n')
print(regimens[1:2,])
}
# Check that time is the same between the dose lines
if (regimens$time[1] != regimens$time[2]){
if (regimens$time[2] > 0){
regimens$freq[2] <- regimens$freq[2] *
(regimens$time[1] / regimens$time[2])
regimens$tot[2] <- regimens$tot[2] *
(regimens$time[1] / regimens$time[2])
}
regimens$time[2] <- regimens$time[1]
if (noisy){
cat('\nStandardising time between dose lines:\n')
print(regimens[1:2,])
}
}
if (regimens$qty[1] == 0 & regimens$qty[2] == 0){
# no quantities, only frequencies (e.g. 'apply twice daily')
if (noisy) cat('\nNo quantities, only frequencies\n')
totaldose <- sum(regimens$tot[1:2] * regimens$duration[1:2])
totalfreq <- sum(regimens$freq[1:2] * regimens$duration[1:2])
regimens$tot[1] <- totaldose / sum(regimens$duration[1:2])
regimens$freq[1] <- totalfreq / sum(regimens$duration[1:2])
} else if (regimens$qty[1] == regimens$qty[2] & regimens$freq[1] > 0) {
# equal qty, nonmissing freq --> calculate average freq
if (noisy) cat('\nEqual quantities, calculating average frequency\n')
totalfreq <- sum(regimens$freq[1:2] * regimens$duration[1:2])
regimens$freq[1] <- totalfreq / sum(regimens$duration[1:2])
regimens$tot[1] <- regimens$freq[1] * regimens$qty[1]
} else if (regimens$freq[1] == regimens$freq[2]){
# equal freq (and nonmissing qty) --> calculate average qty
if (noisy) cat('\nEqual frequency, calculating average quantity\n')
totalqty <- sum(regimens$qty[1:2] * regimens$duration[1:2])
regimens$qty[1] <- totalqty / sum(regimens$duration[1:2])
regimens$tot[1] <- regimens$freq[1] * regimens$qty[1]
} else if (regimens$time[2] > 0) {
if (noisy) cat('\nUse frequency and time from first dose; standardise quantity\n')
# at least some data in dose line 2
# use frequency from first dose; scale quantity appropriately
regimens$qty[2] <- regimens$tot[2] / regimens$freq[1]
regimens$freq[2] <- regimens$freq[1]
# now calculate average quantity
totalqty <- sum(regimens$qty[1:2] * regimens$duration[1:2])
regimens$qty[1] <- totalqty / sum(regimens$duration[1:2])
regimens$tot[1] <- regimens$freq[1] * regimens$qty[1]
} else if (regimens$time[2] == 0) {
# no data in dose 2; use qty and freq from dose 1
# (all values should already be in the right place)
if (noisy) cat('\nNo daily dose in dose 2\n')
}
# add durations. 'change' marked as combined doses
regimens$duration[1] <- sum(regimens$duration[1:2])
regimens$change[1] <- 'combined'
if (noisy){
cat('\nAfter combining dose lines:\n')
print(regimens[1,])
}
} else {
# use the first dose if:
# it is a valid dose AND (dose2 not valid OR dose2 is zero)
if (regimens$time[1] > 0 & (
regimens$time[2] == 0 | regimens$tot[2] == 0)) {
# use first dose, as second dose is not complete
regimens$change[1] <- 'first'
if (noisy){
cat('\nKeeping first dose line:\n')
print(regimens[1,])
}
} else {
# otherwise use second dose
if (regimens$link[1] == 'stat') {
if (regimens$qty[1] > 0 & regimens$qty[2] == 0 &
all(regimens$freq[1:2] > 0)) {
regimens$qty[2] <- regimens$qty[1]
regimens$tot[2] <- regimens$freq[2] * regimens$qty[2]
}
}
items <- c('time', 'freq', 'qty', 'tot', 'duration')
regimens[1, items] <- regimens[2, items]
regimens$change[1] <- 'second'
if (noisy){
cat('\nKeeping second dose line:\n')
print(regimens[1,])
}
}
}
# ensure that time is minimum 1 day
if (regimens$time[1] < 1 & regimens$time[1] > 0){
if (noisy) cat('\nEnsuring that time interval is 1 day.\n')
regimens$freq[1] <- regimens$freq[1] / regimens$time[1]
regimens$tot[1] <- regimens$tot[1] / regimens$time[1]
regimens$time[1] <- 1
}
# remove the second row now that it has been
# incorporated into the first row
if (nrow(regimens) > 2) {
regimens <- regimens[c(1, 3:nrow(regimens)),]
} else {regimens <- regimens[1,]}
}
regimens$link <- NULL
regimens$divpos <- NULL
regimens
}
analyseDose <- function(pd, noisy = FALSE, patterns = patterns){
# Analyses the text after pre-processing.
# Splits the dose into sections, analyses each section,
# and combines the results
# divide up dosage instruction into sections divided at:
# and, or, upto, max, then, reducing to, etc.
# then analyse each bit separately
# blank if dosage instruction consists of one word
if (length(pd) < 2) {
if (noisy){
cat('\nSingle word: ', pd, '; cannot interpret.\n')
}
return(singledosage())
} else {
if (noisy){
cat('\nRunning analyseDose\n')
}
}
# define all possible link points
pos_link_word <- pd %in% c('and', 'or', 'upto',
'changeto', 'day', 'morning', 'midday', 'afternoon',
'evening', 'night', 'stat', 'max', 'miss')
# filter link points to select the last of a series
pos_link_word <- pos_link_word &
!c(pos_link_word[2:length(pos_link_word)], FALSE)
# Last word is always a link word to designate end of section
pos_link_word[length(pos_link_word)] <- TRUE
all_sections_have_data <- FALSE
while (all_sections_have_data == FALSE) {
# identify locations of link words
divpos <- which(pos_link_word)
# vector of link words
link <- pd[divpos]
# different time of day sections are all classified
# with the same word
link[link %in% c('morning', 'midday', 'afternoon',
'evening', 'night')] <- 'day'
# check that each section contains some data; if not,
# combine with previous section and re-analyse
trial <- list()
all_sections_have_data <- TRUE
strings <- restring(pd, divpos)
for (b in 1:(length(strings))){
# trial is a list of dosages
trial[[b]] <- partDose(strings[b],
noisy = noisy, patterns = patterns)
if (trial[[b]]$tot == 0 & trial[[b]]$time == 0 &
length(divpos) > 1){
if (b == 1){
if (pos_link_word[divpos[b]] == TRUE) {
pos_link_word[divpos[b]] <- FALSE
# combine with next section
all_sections_have_data <- FALSE
# only set to false if a change has been made
}
}
}
if (noisy & length(trial) > 1){
cat('\nAnalysing dose in', length(trial), 'sections:\n')
for (b in 1:(length(trial))){
cat('\nSection: ', strings[b], '\n')
print(trial[[b]])
}
}
}
# If all_sections_have_data is FALSE, re-run this loop
}
# That was the first section - ensuring that each part of the
# interpreted text has some data. Next stage: combine sections.
# However in this program we will not combine stat and changeto
# because these can bedone separately if desired, by the
# simplifydose function called by interpret.
trial <- do.call('rbind', trial)
trial$link <- link
if (noisy){
cat('\nPart extracted dosage information in multiple parts:\n')
print(trial)
cat('\nNow combining parts\n')
}
trial <- combineParts(trial, "or", noisy)
trial <- combineParts(trial, "and", noisy)
trial <- combineParts(trial, "day", noisy)
trial <- combineParts(trial, "upto", noisy)
trial <- combineParts(trial, "max", noisy)
trial <- combineParts(trial, "miss", noisy)
return(trial)
}
pdInit <- function(instring){
# Quite slow, takes 0.1 seconds per text!
# Initialise text, splitting words if there are no
# spaces between them. Use lower case only.
instring <- tolower(instring) %&% " "
# pre-allocate an output character vector which is
# at least as long as required
outstring <- character(nchar(instring))
# looping through words
i <- 0L
# looping through letters
j <- 0L
while (j < nchar(instring)){
# go to next word
i <- i + 1L
while (j < nchar(instring) & (outstring[i] == '' |
sameWord(outstring[i], substr(instring, j+1, j+1)))) {
# go to next letter
j <- j + 1L
outstring[i] = outstring[i] %&% substr(instring, j, j)
}
}
length(outstring) <- i
# remove extra words and spaces
outstring[!(outstring %in% c('', ' ', ',', '.'))]
}
wordspace <- function(word, maxwd = 8, singlewords) {
# A simpler R-style version of wordspace
# Recursive method to builds a list of potential matches
# Choose the first match with no words repeated and no single
# letter words, or evaluate all and score them.
# if single letter, exit function
if (nchar(word) == 1) return(word)
# remove trailing X (e.g. dailyxxxxxxxxxxxxxxxxxx)
word <- sub('[x]+$', '', word)
# don't bother if word is more than 13 characters
if (nchar(word) > 13) return(word)
# now the main part of the function to try possible splits
maxtrials <- 20
numtried <- 0
found <- FALSE
splits <- matrix(logical(maxtrials * nchar(word)),
nrow = maxtrials, ncol = nchar(word))
scores <- numeric(maxtrials)
pushback <- function(mybool) {
# switches the last set of contiguous TRUE to FALSE
# and switches the immediately preceding item to TRUE
starttrue <- (mybool == TRUE &
c(FALSE, mybool[1:(length(mybool) - 1)]) == FALSE)
mybool[max(which(starttrue)) - 1] <- TRUE
mybool[max(which(starttrue)):length(mybool)] <- FALSE
mybool
}
match <- function(mybool) {
# whether this is a partial (all except last segment)
# or full match against the singlewords dictionary
words <- strsplitPos(word, mybool)
trial <- singlewords[words]
if (length(trial) == 1) {
if (is.na(trial)) return(0) else return(50)
}
if (!any(is.na(trial[1:(length(trial)-1)]))) {
# at least a partial match
if (is.na(trial[length(trial)])) {
return(1)
} else {
# calculate a matchscore according to
# fewer number of words, fewer one-letter words, lack
# of repetition:
return(50 - (length(words) - length(unique(words)) -
sum(nchar(words) == 1)))
}
} else return(0)
}
cursplit <- c(rep(FALSE, nchar(word)-1), TRUE)
# looking for possible split points
while (found == FALSE & any(cursplit) & numtried < maxtrials &
sum(cursplit) < maxwd) {
# continue while a good match has not been found, current
# split suggestion includes at least one split, fewer than
# maximum number of trials have been attempted and total
# number of split points does not exceed limit
matchscore <- match(cursplit)
if (matchscore == 0) {
# no match
cursplit <- pushback(cursplit)
} else if (matchscore > 1) {
# full match; to record this match
numtried <- numtried + 1
splits[numtried,] <- cursplit
scores[numtried] <- matchscore
cursplit <- pushback(cursplit)
if (matchscore == 50) found <- TRUE
} else {
# partial match - so try an additional split
if (cursplit[length(cursplit)] == FALSE) {
cursplit[length(cursplit)] <- TRUE
} else {
cursplit <- pushback(cursplit)
}
}
}
if (found == TRUE) {
return(strsplitPos(word, which(splits[numtried,])))
} else if (numtried > 0) {
# Selects the first match with highest score
return(strsplitPos(word, which(splits[
which(scores == max(scores, na.rm = TRUE))[1],])))
} else {
return('')
}
}
numbersReplace <- function(pds, maxifchoice) {
# requires pds as a character string
# (with one space between words)
# returns a list containing the modified pds string
# and a TRUE/FALSE variable as to whether
# there is a choice of dose
# Arguments: pds = character string
# maxifchoice = Boolean, whether to return maximum dose
# if there is a choice of dose
# default
choice <- FALSE
# 1O ml etc.
pds <- gsub(' ([[:digit:]]+) o (ml|microgram|mcg|gram|mg) ',
' \\10 \\2 ', pds)
# t d s
pds <- gsub(' ([[:lower:]]{1}) ([[:lower:]]{1}) ([[:lower:]]{1}) ',
' \\1\\2\\3 ', pds)
# 1/2
pds <- gsub(' 1 / 2 ', ' 0.5 ', pds)
# 1.5 times 2 ... (not used for 5ml doses, because this is treated as a separate dose units)
if (!grepl(' ([[:digit:].]+) (times|x) ([[:digit:]]+) 5 ml', pds)) {
pds <- gsub(' ([[:digit:].]+) (times|x) ([[:digit:]]+) ',
' \\1*\\3 ', pds)
}
# 1 mg x 2 ... (but not 1 mg x 5 days)
if (!grepl(' ([[:digit:].]+) (ml|mg|gram|mcg|microgram|unit) (times|x) ([[:digit:]]+) (days|month|week) ', pds)) {
pds <- gsub(' ([[:digit:].]+) (ml|mg|gram|mcg|microgram|unit) (times|x) ([[:digit:]]+) ',
' \\1*\\4 \\2 ', pds)
}
# 1 drop or 2 ...
pd2 <- strsplit(gsub('^[[:print:]]*([[:digit:].]+) (tab|drops|cap|ml|puff|fiveml) (to|-|star) ([[:digit:].]+)[[:print:]]*$',
'MATCHED \\1 \\4', pds), ' ')
if ((pd2[1])=='MATCHED') {
# check that upper dose limit is greater than lower, otherwise
# the text may not actually represent a dose range
if (pd2[3] > pd2[1]) {
pds <- gsub(' ([[:digit:].]+) (tab|drops|cap|ml|puff|fiveml) (to|-|star) ([[:digit:].]+) ',
' \\1 \\2 or \\4 ', pds)
# this function does not combine the dose options
choice <- TRUE
} else {
# not a choice, two pieces of information (e.g. '25mg - 2 daily')
pds <- gsub(' ([[:digit:].]+) (tab|drops|cap|ml|puff|fiveml) (to|-|star) ([[:digit:].]+) ',
' \\1 \\2 \\4 ', pds)
}
}
# 1 and 2 ...
pds <- gsub(' ([[:digit:].]+) (and|\\+) ([[:digit:].]+) ',
' (\\1+\\3) ', pds)
# 3 weeks ...
pds <- gsub(' ([[:digit:].]+) (week) ',
' \\1\\*7 days ', pds)
# 3 months ... NB assume 30 days in a month
pds <- gsub(' ([[:digit:].]+) (month) ',
' \\1\\*30 days ', pds)
# day 1 to day 14 ...
pds <- gsub(' day ([[:digit:]]+) (to|-) day ([[:digit:]]+) ',
' for \\3-\\1 days ', pds)
# X times day to X times day
if (maxifchoice){
pds2 <- gsub(' ([[:digit:]]+) (times|x) day (to|or|-|upto|star) ([[:digit:]]+) (times|x) day ',
' max(c(\\1,\\4)) times day ', pds)
} else {
pds2 <- gsub(' ([[:digit:]]+) (times|x) day (to|or|-|upto|star) ([[:digit:]]+) (times|x) day ',
' (\\1+\\4)/2 times day ', pds)
}
if (pds2 != pds) {
choice <- TRUE
pds <- pds2
}
# days 1 to 14 ...
pds <- gsub(' days ([[:digit:]]+) (to|-) ([[:digit:]]+) ',
' for \\3-\\1 days ', pds)
# 1 or 2 ... moved to below 'days X to X' because
# otherwise the day numbers would be averaged
if (maxifchoice){
pds2 <- gsub(' ([[:digit:].]+) (to|or|-|star) ([[:digit:].]+) ',
' max(c(\\1,\\3)) ', pds)
} else {
pds2 <- gsub(' ([[:digit:].]+) (to|or|-|star) ([[:digit:].]+) ',
' (\\1+\\3)/2 ', pds)
}
if (pds2 != pds) {
choice <- TRUE
pds <- pds2
}
# X times or X times ...
if (maxifchoice){
pds2 <- gsub(' ([[:digit:].]+) times (to|or|-|star) ([[:digit:].]+) times ',
' max(c(\\1,\\3)) times ', pds)
} else {
pds2 <- gsub(' ([[:digit:].]+) times (to|or|-|star) ([[:digit:].]+) times ',
' (\\1+\\3)/2 times ', pds)
}
if (pds2 != pds) {
choice <- TRUE
pds <- pds2
}
# x days every x days
pds <- gsub('( for | )([[:digit:]\\.]+) days every ([[:digit:]\\.]+) days ',
' for \\2 days changeto 0 0 times day for \\3-\\2 days ', pds)
# Evaluate expressions
pd <- strsplit(pds, ' ')[[1]]
pd <- pd[!is.na(pd)]
pd <- pd[pd != '' & pd != ' ']
# If pd contains an expression that can be converted to a number
# try to evaluate it. If it contains any letters (except 'max' as
# part of an expression, e.g. max(2, 3) then ignore. If it is just
# a single item of punctuation without numbers also ignore.
dont_evaluate <- (grepl('[a-z]', pd) & !grepl('^max', pd)) |
pd %in% c('(', ')', ':', '/', '&', '-')
for (i in seq_along(pd)){
if (!dont_evaluate[i]){
try(pd[i] <- as.character(eval(parse(text = pd[i]))),
silent = TRUE)
}
}
pds <- paste(' ', paste(pd, collapse = ' '), ' ', sep = ' ')
list(pds = pds, choice = choice)
}
singledosage <- function(words = '', qty = 0, units = '', freq = 0,
tot = 0, priority = 0, max = c('exact', 'max', 'average'), time = 0,
change = c('nochange', 'first', 'second', 'combined'),
choice = c('nochoice', 'choice', 'asneeded'),
doubledose = FALSE, duration = 0){
max <- factor(max, c('max', 'average', 'exact'))[1]
change <- factor(change, c('first', 'second', 'nochange', 'combined'))[1]
choice <- factor(choice, c('choice', 'asneeded', 'nochoice'))[1]
data.frame(words = words, qty = qty, units = units, freq = freq,
tot = tot, priority = priority, max = max, time = time, change = change,
choice = choice, doubledose = doubledose, duration = duration)
}
extractNum <- function(pattern_dose, existing_dose, pd_matched,
noisy = FALSE){
# new_dose is from the patterns dictionary
# existing dose is the trial or existing dose
# pd_matched is a character vector starting from the
# first matched position
# units of dose
if (pattern_dose$units != '' & pattern_dose$priority >=
existing_dose$priority) {
existing_dose$priority <- pattern_dose$priority
existing_dose$units <- sub(pattern_dose$words,
trim(pattern_dose$units), paste(c(' ', pd_matched, ' '),
collapse = ' '))
}
# numerical quantities
for (item in c('tot', 'freq', 'qty', 'time', 'duration')){
if (pattern_dose[,item] == '') {
# blank; ignore
} else {
existing_dose[, item] <- as.numeric(sub(
pattern_dose$words, pattern_dose[, item],
paste(c(' ', pd_matched, ' '), collapse = ' ')))
}
# delete frequency and quantity information if they were based
# on previous patterns and are incorrect
if (item == 'tot' & existing_dose$tot > 0 &
existing_dose$tot != existing_dose$freq*existing_dose$qty &
existing_dose$freq > 0 & existing_dose$qty > 0) {
if (noisy){
cat('\nProvisional total', existing_dose$tot, 'not equal to',
existing_dose$qty, 'x', existing_dose$freq, '.\n')
}
existing_dose$freq <- 0
existing_dose$qty <- 0
}
}
# doubledose
if (pattern_dose$doubledose == TRUE) {existing_dose$doubledose <- TRUE}
# choice of dose
if (pattern_dose$choice == 'asneeded' |
(pattern_dose$choice == 'choice' &
existing_dose$choice != 'asneeded')) {
existing_dose$choice <- pattern_dose$choice
}
# maximum dose
if (pattern_dose$max == 'max') {existing_dose$max <- 'max'}
return(existing_dose)
}
partDose <- function(text, noisy=FALSE, patterns) {
# extracts dosage information for a specified text
# uses the patterns dictionary, starting with a blank dose
# Sanitize text and append 'start' to the front
# ensuring that text has one space at the beginning and end
text <- ' start ' %&% sub('^ +', '', sub(' +$', '', text)) %&% ' '
text <- sub(' start start', ' start', text)
trial <- singledosage()
for (i in 1:nrow(patterns)) {
# find all matches, and extract information from pattern
matches <- gregexpr(patterns$words[i], text)[[1]]
if (matches != -1) {
# match found
for (j in length(matches)) {
# create vector of matched text ignoring initial space
matchtext <- strsplit(substr(text, matches[j] + 1,
matches[j] + attr(matches, 'match.length')[j] - 1),
' ')[[1]]
if (noisy) {
cat('\nPattern "', trim(patterns[i,'words']),
'" matches: "', matchtext, '"\n')
}
trial2 <- extractNum(patterns[i,], trial, matchtext,
noisy = noisy)
if (noisy) {
cat('... Existing dose information:\n')
print(trial)
cat('... New dose information:\n')
print(trial2)
for (item in c('qty', 'units', 'freq', 'max', 'time',
'change', 'choice', 'doubledose', 'duration')){
if (trial[1, item] != trial2[1, item] &
trial2[1, item] != 0) {
cat('... ', item, trial2[1, item], ' extracted.\n')
}
}
if (trial$tot != trial2$tot){
cat('\n... Total quantity per time period: ', trial2$tot, '\n')
}
}
trial <- trial2
}
}
}
# calculate total dose
if (trial$freq > 0 & trial$qty > 0) {
trial$tot <- trial$qty * trial$freq
}
if (trial$tot == 0 & trial$qty == 0) {
trial$tot <- trial$freq
}
if (trial$tot == 0 & trial$freq == 0) {
trial$tot <- trial$qty
}
# clean up
trial$units <- trim(trial$units)
# return the interpreted data
return(trial)
}
combineParts <- function(trial, linkword, noisy = FALSE) {
# combines adjacent parts of the dose instructions according to a
# chosen linkword
# trial is a data.frame with a column 'link' which contains
# the linkword at the end of that section.
if (nrow(trial) == 1){
return(trial)
} else {
nparts <- nrow(trial)
}
# sub-function for combining 2 dosages
combine2parts <- function(a, b, linkword) {
if (a$units == 'fiveml' & b$units == 'ml') {
b$units <- 'fiveml'
b$tot <- b$tot / 5
b$qty <- b$qty / 5
} else if (a$units == 'ml' & b$units == 'fiveml') {
b$units <- 'ml'
b$tot <- b$tot * 5
b$qty <- b$qty * 5
} else if (a$units == '') {
a$units <- b$units
} else if (b$units == '') {
# nothing happens
} else if (a$priority == 5 & b$priority==3) {
# e.g. 20 mg tab mane and 2 tabs nocte
# change the second quantity to a frequency
# to allow correct adding up
if (a$freq > 0 & (b$freq %in% c(0, 1))) {
b$freq <- b$qty
b$qty <- 0
}
} else if (a$units != b$units) {
# if both a and b are valid doses, make b replace a if
# higher priority (e.g. mcg over puffs) otherwise stick with a.
if (a$tot > 0 & b$tot > 0 & a$time > 0 &
b$time > 0 & a$priority < b$priority) {
a$units <- b$units
items <- c('freq', 'qty', 'tot', 'time')
a[,items] <- b[,items]
}
}
# if dose 2 has low quantity but no units,
# convert it to frequency
if (a$qty > 5 * b$qty & b$qty >= 1) {
if (a$freq > 0 & (b$freq %in% c(0,1)) & b$priority < 4) {
b$freq <- b$qty
b$qty <- 0
}
}
# if very high frequency, convert it to quantity
if (a$freq > 10) {
if (a$qty %in% c(0,1)) {
a$qty <- a$freq
a$freq <- 0
}
}
if (b$freq > 10) {
if (b$qty %in% c(0,1)) {
b$qty <- b$freq
b$freq <- 0
}
}
# standardise the time period (but not if first dose is 'STAT')
if (a$time > 0 & b$time > 0 & a$duration != 1) {
if (b$freq > 0) {
b$freq = b$freq * (a$time / b$time)
} else {
b$qty = b$qty * (a$time / b$time)
}
b$tot = b$tot * (a$time / b$time)
b$time <- a$time
}
if (a$change == 'nochange' & b$change == 'first') {
a$change <- 'first'
}
if (a$change == 'nochange' & b$change == 'second') {
a$change <- 'second'
}
if (a$change == 'first' & b$change == 'second') {
a$change <- 'second'
}
if (a$choice == 'nochoice') {
a$choice <- b$choice
}
if (a$priority == 0) {a$priority <- b$priority}
if (a$max == 'exact') {a$max <- b$max}
if (b$qty > 0 & linkword == 'upto') {linkword <- 'or'}
if (linkword == 'miss'){
# subtract the second dose or frequency
if (b$qty == 0 | b$qty == a$qty){
if (b$time == 0){
a$freq <- a$freq - b$freq
} else {
a$freq <- a$freq - b$freq * (a$time / b$time)
a$tot <- a$qty * a$freq
}
}
} else if (linkword %in% c('and', 'day')) {
# upto 10mg every day ...
if (linkword == 'day' & (b$time == 0 | b$max == 'max')) {
# ignore b: 1 ANY, 2 QTY BUT NO TIME: e.g. 1bd and | sixty
} else if (b$freq == 0 & b$qty > 0 & b$time == 0) {
if (a$tot > 0 & a$time > 0) {
# ignore b
} else {
# e.g. 150mg + 250mg
if (a$freq == 0 | a$freq == 1) {
a$qty <- a$qty + b$qty
a$tot <- a$qty
} else if (a$qty > 0) {
a$tot <- a$tot + b$tot
a$freq <- a$tot / a$qty
}
}
} else if (a$freq == 0 & a$qty > 0 & a$time == 0) {
# 1 QTY BUT NO TIME, 2 ANY: one and | two twice a day
a$freq <- b$freq
a$time <- b$time
a$qty <- a$qty + b$qty
a$tot <- a$freq * a$qty
} else if (a$freq > 0 & a$qty == 0 & b$freq > 0 & b$qty > 0) {
# FREQUENCIES ONLY: 2 times a day and | once at night
a$freq <- a$freq + b$freq
a$tot <- a$tot + b$tot
} else if (a$freq %in% c(0,1) & a$qty > 0 &
b$freq %in% c(0,1) & b$qty > 0) {
# QUANTITIES ONLY: 3 tablets and | 4 tablets
a$tot <- a$tot + b$tot
if (a$time == 0) {
a$qty <- a$qty + b$qty
} else {
a$freq <- a$tot / a$qty
}
} else if (a$freq %in% c(0,1) & a$qty > 0 & b$freq > 0 &
b$qty > 0) {
# 1 QUANTITY, 2 FREQUENCY:
# 5 ml before breakfast and | twice a day
a$freq <- 1 + b$freq
a$tot <- a$freq * a$qty
} else if (a$freq > 0 & a$qty > 0 & b$freq > 0 & b$qty > 0) {
# 1 QTY, FREQ; 2 FREQ: 10 ml every morning and | twice a day
a$freq <- a$freq + b$freq
a$tot <- a$freq * a$qty
} else {
# OTHER: 3 after breakfast and | 2 at night
a$tot <- a$tot + b$tot
# stick with temp1.qty; adjust a$freq so totals compute
if (a$freq > 0 & a$qty > 0) {
a$freq <- a$tot / a$qty
}
}
# combine duration e.g. 'one bd or two bd for 7 days'
if (a$duration == 0) {a$duration <- b$duration}
} else if (linkword == 'or') {
if (b$freq == 0 & b$qty > 0 & b$time ==0) {
# 1 ANY, 2 QTY BUT NO TIME: 1bd - | sixty -- ignore b
} else if (a$freq == 0 & a$qty > 0 & a$time == 0) {
# 1 QTY BUT NO TIME, 2 ANY: one - | two twice a day
a$freq <- b$freq
a$time <- b$time
a$qty <- (a$qty + b$qty) / 2
a$tot <- a$freq * a$qty # error corrected 5/6/2024
} else if (a$freq > 0 & a$qty > 0 & b$freq > 0 & b$qty == 0) {
# FREQUENCIES ONLY: 2 times a day or | once at night
a$freq <- (a$freq + b$freq) / 2
a$tot <- (a$tot + b$tot) / 2
} else if (a$freq %in% c(0,1) & a$qty > 0 &
b$freq %in% c(0,1) & b$qty > 0) {
# QUANTITIES ONLY: 3 tablets or | 4 tablets
a$qty <- (a$qty + b$qty) / 2
a$tot <- (a$tot + b$tot) / 2
} else if (a$freq %in% c(0,1) & a$qty > 0 & b$freq > 0 &
b$qty > 0) {
# 1 QUANTITY, 2 FREQUENCY: 5 ml daily or | twice a day
a$freq <- (1 + b$freq) / 2
a$tot <- a$freq * a$qty
} else if (a$freq > 0 & a$qty > 0 & b$freq > 0 & b$qty > 0) {
# 1 QTY, FREQ; 2 FREQ: 10 ml every morning or | twice a day
a$freq <- (a$freq + b$freq) / 2
a$tot <- a$tot * a$qty
} else if (b$tot > 0) {
# OTHER: 3 after breakfast or | 2 after breakfast
a$tot <- (a$tot + b$tot) / 2
# stick with a$qty; adjust a$freq so totals compute
if (a$qty > 0) {a$freq <- a$tot / a$qty}
}
a$max <- 'average'
# combine duration e.g. 'one bd or two bd for 7 days'
if (a$duration == 0) {a$duration <- b$duration}
} else if (linkword %in% c('upto', 'max')) {
# Dose 2 has freq but no qty; dose 1 has qty but no freq:
# combine doses and max=2
# If not, and Dose 2 has duration but nothing else:
# combine doses using duration from 2 (max not affected)
# If not, and total dose 1 > dose 2, or dose 1 incomplete:
# use dose 2
# If not and dose 2 is valid, combine doses
# Otherwise keep dose 1 and max=1
#
# If previous dose is more than maximum or it is an
# invalid dose, use max
if (a$tot == 0) {
# no useful info in first dose (a)
a$max <- 'max'
a[,c('tot', 'freq', 'qty')] <- b[,c('tot', 'freq', 'qty')]
} else if (a$freq == 0 & a$qty > 0 & b$freq > 0 & b$qty == 0) {
# combine doses
a$max <- 'max'
a$freq <- b$freq
a$tot <- a$freq * a$qty
} else if (b$duration > 0 & b$tot == 0) {
a$duration <- b$duration
} else if ((b$tot < a$tot & a$time > 0 & b$time > 0) |
(a$time == 0 & a$tot > 0) | (b$tot > 0 & a$duration == 1) |
(a$tot == 0 & b$tot > 0)) {
# use dose 2
items <- c('freq', 'qty', 'tot', 'time', 'duration')
a[,items] <- b[,items]
a$max <- 'max'
} else {
# retain existing first dose
a$max <- 'average'
}
}
if (a$time == 0) {a$time <- b$time}
a$link <- b$link
return(a)
}
# End of combine2parts sub-function
# Looping through sections, and combining as necessary
pos <- 1
while (pos < nrow(trial)) {
if (trial$link[pos] == linkword) {
trial[pos,] <- combine2parts(trial[pos,], trial[pos+1,], linkword)
if (pos + 2 > nrow(trial)) {
trial <- trial[1:pos,]
} else {
trial <- trial[c(1:pos, (pos+2):nrow(trial)),]
}
}
pos <- pos + 1
}
if (nrow(trial) < nparts & noisy){
cat('\nAfter combining', linkword, ':\n')
print(trial)
cat('\n')
}
return(trial)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.