Nothing
#' Make Dose Data
#'
#' Takes parsed and paired medication data, calculates dose intake and daily dose, and
#' removes redundant information at the note and date level.
#'
#' This function standardizes frequency, route, and duration entities. Dose amount, strength,
#' and frequency entities are converted to numeric. Rows with only drug name and/or route are
#' removed. If there are drug name changes in adjacent rows (e.g., from a generic to brand name),
#' these rows are collapsed into one row if there are no conflicts. Missing strengths, dose
#' amounts, frequencies, and routes are borrowed or imputed using various rules (see McNeer et al.,
#' 2020 for details). Dose given intake and daily dose are calculated. Redundancies are removed at
#' the date and note level. If time of last dose is being used and it is unique within the level of
#' collapsing, it is borrowed across all rows.
#'
#' @param x data.frame containing the output of \code{\link{buildDose}}, or the output
#' of \code{\link{addLastDose}} if last dose information is being incorporated.
#' @param noteMetaData data.frame containing identifying meta data for each
#' note, including patient ID, date of the note, and note ID. Column names
#' should be set to \sQuote{filename}, \sQuote{pid}, \sQuote{date},
#' \sQuote{note}. Date should have format YYYY-MM-DD.
#' @param naFreq Replacing missing frequencies with this value, or by default the most common value across
#' the entire set in \code{x}.
#'
#' @return A list containing two dataframes, one with the note level and one with the date level
#' collapsed data.
#'
#' @examples
#' data(lam_mxr_parsed)
#' data(lam_metadata)
#'
#' lam_build_out <- buildDose(lam_mxr_parsed)
#'
#' lam_collapsed <- makeDose(lam_build_out, lam_metadata)
#' lam_collapsed[[1]] # Note level collapsing
#' lam_collapsed[[2]] # Date level collapsing
#' @export
makeDose <- function(x, noteMetaData, naFreq = 'most') {
s2f <- options(stringsAsFactors = FALSE)
on.exit(options(s2f))
ix <- match(x[,'filename'], noteMetaData[,'filename'])
if(any(is.na(ix))) stop('ensure that all filenames in `x` are present in noteMetaData')
grid <- noteMetaData[ix,'pid']
date <- as.Date(noteMetaData[ix,'date'], format = '%Y-%m-%d')
note <- noteMetaData[ix,'note']
# make keys
x[,'key0'] <- paste(grid, date, note, x[['drugname_start']], sep = '|')
x[,'key1'] <- paste(grid, date, note, sep = '|')
x[,'key2'] <- paste(grid, date, sep = '|')
# ever a reason to preserver order?
x[,'rowOrder'] <- order(x[['key1']], x[['drugname_start']])
reqCols <- c('strength.num','doseamt.num','freq')
xcols <- names(x)
useRte <- 'route' %in% xcols
useDur <- 'duration' %in% xcols
useDC <- 'dosechange' %in% xcols
useDS <- 'dosestr' %in% xcols
useLD <- 'lastdose' %in% xcols
if(useRte) {
reqCols <- c(reqCols, 'route')
rte <- stdzRoute(x[,'route'])
x[,'route'] <- rte
na.route <- most(rte)
if(is.null(na.route)) {
warning('unable to calculate mode for route, naRoute set to "orally"', call. = FALSE)
na.route <- 'orally'
}
}
if(useDur) {
reqCols <- c(reqCols, 'duration')
x[,'duration'] <- stdzDuration(x[,'duration'])
}
if(useDC) {
reqCols <- c(reqCols, 'dosechange')
x[!is.na(x[,'dosechange']) & nchar(x[,'dosechange']) == 0,'dosechange'] <- NA
}
of <- x[,'freq']
cfreq <- unique(of)
cfreq1 <- stdzFreq(cfreq)
ix <- match(of, cfreq)
if(naFreq == 'most') {
# na.freq <- cfreq1[match(most(of), cfreq)]
na.freq <- most(cfreq1[ix])
# possible there isn't a most-used frequency
if(is.null(na.freq)) {
warning('unable to calculate mode for frequency, naFreq set to "bid"', call. = FALSE)
na.freq <- 'bid'
}
} else {
na.freq <- stdzFreq(naFreq)
}
cfreq2 <- freqNum(cfreq1)
# set freq to NA if freqNum failed
cfreq1[is.na(cfreq2)] <- NA
cfreq2[is.na(cfreq2)] <- freqNum(na.freq)
x[,'freq'] <- cfreq1[ix]
# ignore duplicate dash data, by storing rowOrder in vector
ignDupDashDatDS <- numeric(0)
if(useDS) {
reqCols <- c(reqCols, 'dosestr.num')
cstrg <- stdzStrength(x[,'dosestr'], x[,'freq'])
x[,'dosestr.num'] <- c(cstrg)
# add additional rows if necessary
if('addl_data' %in% names(attributes(cstrg))) {
addl <- attr(cstrg, 'addl_data')
ix <- which(lengths(addl) > 0)
added <- vector('list', length(ix))
ignDupDashDatDS <- numeric(length(ix))
for(i in seq_along(ix)) {
impdat <- addl[[ix[i]]]
newdat <- x[rep(ix[i], nrow(impdat)),]
newdat[,'dosestr.num'] <- impdat[,'str']
if('freq' %in% names(impdat)) {
newdat[,'freq'] <- impdat[,'freq']
} else {
ignDupDashDatDS[i] <- x[ix[i],'rowOrder']
}
added[[i]] <- newdat
}
# remove original row and add new rows
x <- rbind(x[-ix,], do.call(rbind, added))
x <- reOrder(x)
ignDupDashDatDS <- ignDupDashDatDS[ignDupDashDatDS > 0]
}
}
if(useLD) {
reqCols <- c(reqCols, 'lastdose')
x[!is.na(x[,'lastdose']) & nchar(x[,'lastdose']) == 0,'lastdose'] <- NA
}
cstrg <- stdzStrength(x[,'strength'], x[,'freq'])
x[,'strength.num'] <- c(cstrg)
# ignore duplicate dash data, by storing rowOrder in vector
ignDupDashDat <- numeric(0)
# add additional rows if necessary
if('addl_data' %in% names(attributes(cstrg))) {
addl <- attr(cstrg, 'addl_data')
ix <- which(lengths(addl) > 0)
added <- vector('list', length(ix))
ignDupDashDat <- numeric(length(ix))
for(i in seq_along(ix)) {
impdat <- addl[[ix[i]]]
newdat <- x[rep(ix[i], nrow(impdat)),]
newdat[,'strength.num'] <- impdat[,'str']
if('freq' %in% names(impdat)) {
newdat[,'freq'] <- impdat[,'freq']
} else {
ignDupDashDat[i] <- x[ix[i],'rowOrder']
}
added[[i]] <- newdat
}
# remove original row and add new rows
x <- rbind(x[-ix,], do.call(rbind, added))
x <- reOrder(x)
ignDupDashDat <- ignDupDashDat[ignDupDashDat > 0]
}
ignDupDashDat <- sort(unique(c(ignDupDashDatDS, ignDupDashDat)))
x[,'doseamt.num'] <- stdzDose(x[,'dose'])
# remove rows with only drugname and/or route
verCols <- setdiff(reqCols, 'route')
nas <- rowSums(is.na(x[,verCols])) == length(verCols)
x <- x[!nas,]
# borrow last dose
# at note level, then date level if unique
if(useLD) {
vkey <- unique(x[!is.na(x[,'lastdose']),'key1'])
chkv <- x[,'key1'] %in% vkey
if(any(chkv)) {
x1 <- x[chkv,]
x2 <- x[!chkv,]
# use first lastdose
ldvals <- tapply(x1[,'lastdose'], x1[,'key1'], function(i) i[!is.na(i)][1])
x1[,'lastdose'] <- ldvals[match(x1[,'key1'], names(ldvals))]
x <- rbind(x1, x2)
ldkey <- unique(x[is.na(x[,'lastdose']),'key2'])
chkld <- x[,'key2'] %in% ldkey
x1 <- x[chkld,]
x2 <- x[!chkld,]
# borrow if unique
x1 <- do.call(qrbind, lapply(split(x1, x1[,'key2']), borrowVal, 'lastdose'))
x <- rbind(x1, x2)
# convert back to POSIXct
x[,'lastdose'] <- as.POSIXct(as.numeric(x[,'lastdose']), origin = '1970-01-01')
}
x <- reOrder(x)
}
# borrow dosestr.num - require unique w/in mention
if(useDS) {
dskey <- unique(x[is.na(x[,'strength.num']) & is.na(x[,'dosestr.num']) & is.na(x[,'doseamt.num']),'key0'])
chkds <- x[,'key0'] %in% dskey
x1 <- x[chkds,]
x2 <- x[!chkds,]
x1 <- do.call(qrbind, lapply(split(x1, x1[,'key0']), function(xs) {
borrowVal(xs, col = 'dosestr.num', elig = is.na(xs[,'strength.num']) & is.na(xs[,'doseamt.num']))
}))
x <- rbind(x1, x2)
x <- reOrder(x)
hasDS <- !is.na(x[,'dosestr.num'])
ds <- x[hasDS,]
x <- x[!hasDS,]
}
# find drugname changes
nr <- nrow(x)
fn <- tolower(x[,'filename'])
dn <- tolower(x[,'drugname'])
verCols <- setdiff(reqCols, c('dosestr.num','lastdose'))
ix <- which(fn[-1] == fn[-nr] & dn[-1] != dn[-nr])
if(length(ix)) {
# first pass-through ignores consecutive merges
ix1 <- ix[which(c(TRUE, diff(ix) != 1))]
if(length(ix1)) {
rowList <- vector('list', length(ix1))
valList <- rowList
for(i in seq_along(ix1)) {
rowix <- seq(ix1[i], length.out=2)
tmp <- mergeAdjacent(x[rowix,], verCols)
if(length(tmp)) {
rowList[[i]] <- rowix
valList[[i]] <- tmp
}
}
rowI <- unlist(rowList)
valI <- do.call(qrbind, valList)
if(length(rowI) > 0L) {
x[rowI,] <- valI
}
}
# second pass-through allows consecutive merges
ix2 <- setdiff(ix, ix1)
for(i in seq_along(ix2)) {
rowix <- seq(ix2[i], length.out=2)
tmp <- mergeAdjacent(x[rowix,], verCols)
if(length(tmp)) {
# this is really slow
x[rowix,] <- tmp
}
}
}
# "bad" strength was tagged for removal
x <- x[is.na(x[,'strength.num']) | x[,'strength.num'] != -999,]
if(useDC) {
hasDC <- !is.na(x[,'dosechange'])
dc <- x[hasDC,]
x <- x[!hasDC,]
# add from dosestr if available
if(useDS) {
hasDC <- !is.na(ds[,'dosechange'])
dc <- rbind(dc, ds[hasDC,])
ds <- ds[!hasDC,]
}
}
# borrow strength - require unique w/in mention
strkey <- unique(x[is.na(x[,'strength.num']),'key0'])
if(length(strkey)) {
chkstr <- x[,'key0'] %in% strkey
x1 <- x[chkstr,]
x2 <- x[!chkstr,]
x1 <- do.call(qrbind, lapply(split(x1, x1[,'key0']), borrowVal, 'strength.num'))
x <- rbind(x1, x2)
# remove missing strength
x <- x[!is.na(x[,'strength.num']),]
x <- reOrder(x)
}
# borrow dose
# first try unique within mention
# otherwise, impute 1
## note this is done before DOSESTR is added back
dosekey <- unique(x[is.na(x[,'doseamt.num']),'key0'])
if(length(dosekey)) {
chkdose <- x[,'key0'] %in% dosekey
x1 <- x[chkdose,]
x2 <- x[!chkdose,]
x1 <- do.call(qrbind, lapply(split(x1, x1[,'key0']), borrowVal, 'doseamt.num'))
x <- rbind(x1, x2)
}
ix <- which(is.na(x[,'doseamt.num']))
x[ix, 'doseamt.num'] <- 1
# re-combine
if(useDC) {
x <- rbind(x, dc)
}
if(useDS) {
x <- rbind(x, ds)
}
x <- reOrder(x)
# borrow freq
# first check for am/noon/pm dose sequence within note
# then mode within mention
# otherwise, impute global mode
# nodupx <- x[!duplicated(x[,'rowOrder']),]
# ignore rows created by dash (STR1-STR2)
nodupx <- x[!(x[,'rowOrder'] %in% ignDupDashDat & duplicated(x[,'rowOrder'])),]
freq.mention.mode <- tapply(nodupx[,'freq'], nodupx[,'key0'], function(y) {
yy <- most(y)
# use global freq if no mode
if(is.null(yy)) na.freq else yy
})
freqkey <- unique(x[is.na(x[,'freq']),'key1'])
chkfreq <- x[,'key1'] %in% freqkey
x1 <- x[chkfreq,]
x2 <- x[!chkfreq,]
x1 <- do.call(qrbind, lapply(split(x1, x1[,'key1']), borrowFreqDoseSeq))
x <- reOrder(rbind(x1, x2))
ix <- which(is.na(x[,'freq']))
if(length(ix)) {
# retrieve mode for each mention
x[ix,'freq'] <- unname(freq.mention.mode[x[ix,'key0']])
}
# numeric freq
x[,'freq.num'] <- cfreq2[match(x[,'freq'], cfreq1)]
# this is unlikely to occur
x[is.na(x[,'freq.num']), 'freq.num'] <- freqNum(na.freq)
# borrow route
if(useRte) {
vkey <- unique(x[is.na(x[,'route']),'key1'])
chkv <- x[,'key1'] %in% vkey
if(any(chkv)) {
x1 <- x[chkv,]
x2 <- x[!chkv,]
# borrow if unique
x1 <- do.call(qrbind, lapply(split(x1, x1[,'key1']), borrowVal, 'route'))
# borrow global mode
x1[is.na(x1[,'route']),'route'] <- na.route
x <- rbind(x1, x2)
}
x <- reOrder(x)
}
# dose intake
x[,'dose.intake'] <- x[,'strength.num'] * x[,'doseamt.num']
if(useDS) {
ix <- which(!is.na(x[,'dosestr.num']) & !is.na(x[,'dose.intake']))
if(length(ix)) {
stop('dose.intake should be missing if dosestr is present')
}
ix <- which(!is.na(x[,'dosestr.num']) & is.na(x[,'dose.intake']))
if(length(ix)) {
x[ix,'dose.intake'] <- x[ix,'dosestr.num']
}
}
# dosechange can be borrowed from empty rows
if(useDC) {
nr <- nrow(x)
fn <- tolower(x[,'filename'])
dn <- tolower(x[,'drugname'])
dnchange <- c(fn[-1] == fn[-nr] & dn[-1] != dn[-nr], FALSE)
ix <- which(dnchange & is.na(x[,'dose.intake']) & !is.na(x[,'dosechange']) & c(is.na(x[-1,'dosechange']), FALSE))
if(length(ix)) {
dckey <- x[ix+1,'key0']
dcval <- x[ix,'dosechange']
dcix <- match(x[,'key0'], dckey)
kix <- which(!is.na(dcix))
x[kix, 'dosechange'] <- dcval[dcix[kix]]
}
}
# at this point, dose.intake is required
x <- x[!is.na(x[,'dose.intake']),]
# exclude dosechange again
if(useDC) {
# daily dose
x <- calcDailyDose(x, useDC = TRUE)
hasDC <- !is.na(x[,'dosechange'])
dc <- x[hasDC,]
x <- x[!hasDC,]
} else {
# daily dose
# creates intaketime|dose.seq|dose.daily
x <- calcDailyDose(x)
}
l1 <- rmDuplicates(x, useRoute = useRte, useDuration = useDur, useLastDose = useLD)
xn <- l1[['note']]
xd <- l1[['date']]
# re-combine
if(useDC) {
l2 <- rmDuplicates(dc, useRoute = useRte, useDuration = useDur, useDoseChange = TRUE, useLastDose = useLD)
xn <- rbind(xn, l2[['note']])
xd <- rbind(xd, l2[['date']])
}
xn <- reOrder(xn)
xd <- reOrder(xd)
xn[,'rowOrder'] <- NULL
xd[,'rowOrder'] <- NULL
list(note = xn, date = xd)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.