#----------------------------------------------------------------------------#
#' @title Generate medication-related features (med data).
#'
#' @description \
#'
#' @export
#' @import data.table
#' @param cohort
#' @param cohort_key_var_merge
#' @param med_file_mod_arg
#' @param leak_med_day_arg
#' @param combine
#' @param med_file_mod_ext
#' @param med_file_mod_ext_ext
#' @param file_date_var
#' @return
#' @examples
med_feature_gen <- function(cohort, cohort_key_var_merge, cohort_key_var, med_file_mod_arg=med_file_mod,
leak_med_day_arg=leak_med_day,
combine=FALSE, med_file_mod_ext=NA, med_file_mod_ext_ext=NA, file_date_var="med_date") {
print("launching med_feature_gen")
##############################################################################
### Load the modified/pre-processed med file for the specified data sample --
### if no such file exists - excute the function_med_class.R code (access/submit as
### batchmode job using machine/function_class_batchmode.txt)
# (a) load the stored code - return error message if file does not exist
tryCatch(med <- readRDS_merge(med_file_mod_arg), warning=function(w)
print("no classified med file available for the data sample"))
# XXX NOTE: RDS file preserves formatting of empi as character
if (combine==TRUE) {
tryCatch(med_ext <- readRDS_merge(med_file_mod_ext), warning=function(w)
print("no classified med file available for the data sample"))
tryCatch(med_ext_ext <- readRDS_merge(med_file_mod_ext_ext), warning=function(w)
print("no classified med file available for the data sample"))
# temp
med[, med_date_1:=med_date]
med_ext[, med_date_1:=med_date]
med_ext_ext[, med_date_1:=med_date]
med_ext_ext[, activation_dt:=NULL]
med_ext_ext[, indication_dt:=NULL]
med <- rbindlist(list(med_ext, med), fill=T, use.names=T)
med <- rbindlist(list(med, med_ext_ext), fill=T, use.names=T)
med[, med_id:=1:nrow(med)]
}
# remove if empi is missing
med <- med[!is.na(empi)]
# subset to smaller sample for testing if so specified in control file
if (test_raw_file==TRUE) {
store_shorten_file("med")
}
##############################################################################
### load the dia_feature code
## if no med_form -- assume administered
if(!("med_form" %in% names(med))) med[,med_form:=""]
med[is.na(med_form)|med_form=="", med_form:="adm"]
## if no chemo cat nci
if(!("chemo_cat_nci" %in% names(med))) med[,chemo_cat_nci:=""]
##############################################################################
### merge med file with cohort (cohort_key_variables) & format dates
# XXX NOTE: foverlaps - ensures that all vital signs max timeframe_long days
# prior to outcome date
med <- med[empi %in% cohort$empi]
invisible(parse_date(med, c("med_date")))
med[, c("med_date_1","med_date_2"):=.(med_date)]
med <-foverlaps(med, cohort[, mget(cohort_key_var_merge)], by.x=c("empi",
"med_date_1", "med_date_2"), nomatch=0)
med[, time_diff:=as.numeric(difftime(pred_date, get(file_date_var),
units="days"))]
### implement leakage control (as specified in control file -
### omit day of outcome/days pre outcome)
if (!is.na(leak_med_day_arg)) {
med <- med[!(pred_date-med_date_1<=leak_med_day_arg)]
}
##############################################################################
### subsetting & dividing into smaller DT based on timeframe (ST/LT) -
### return as list (...timeframe_comb)
invisible(timeframe_split(list("med"), "med_date"))
name_ext_extended <- name_ext_extended[sapply(med_timeframe_comb, nrow)!=0]
name_ext <- name_ext_extended[2:length(name_ext_extended)]
med_timeframe_comb <- med_timeframe_comb[sapply(med_timeframe_comb, nrow)!=0]
time_min <- min(do.call("c", lapply(med_timeframe_comb, function(x) as.Date(min(x[,
med_date]), "%Y-%m-%d"))))
time_max <- max(do.call("c", lapply(med_timeframe_comb, function(x) as.Date(max(x[,
med_date]), "%Y-%m-%d"))))
##############################################################################
### reshaping - create med count vars & impose feature
### categorisation ("med_count_adm.." or "med_count_pres.." "..."_short/_long")
# XXX NOTE: med count vars == sum of med over timeperiod in question (no hierarchy)
# XXX TO-DO: Fix value.name - currently names "med_destin1" automatically even if specify
# med_destin as name
med_snomed_timeframe_comb <- lapply(med_timeframe_comb, function(x)
melt(x, id.vars =c("outcome_id", "empi", "pred_date", "med_form", "time_diff"),
measure=patterns("^snomed_destin"), variable.name="med_destin_name",
value.name="med_destin"))
med_snomed_timeframe_comb <- lapply(med_snomed_timeframe_comb, function(x)
dcast.data.table(x, outcome_id + empi + pred_date ~ med_form + med_destin1,
fun.aggregate=list(length, function(x) min(x, na.rm=T)), value.var = "time_diff",
subset=.(!med_destin1=="" & !is.na(med_destin1))))
med_snomed_timeframe_comb <- feature_var_format(med_snomed_timeframe_comb)
invisible(mapply(function(DT,name_ext) setnames(DT, grep("pres|adm",
names(DT), value=T), paste0("med_med.count_med.snomed_med.", gsub("(.*?)_(.*)", "\\1..\\2",
grep("pres|adm", names(DT), value=T)),name_ext)), DT = med_snomed_timeframe_comb,
name_ext_extended))
##############################################################################
### reshaping - create med count vars & impose feature
### categorisation ("med_count_adm.." or "med_count_pres.." "..."_short/_long")
# XXX NOTE: med count vars == sum of med over timeperiod in question (no hierarchy)
# XXX TO-DO: Fix value.name - currently names "med_destin1" automatically even if specify
# med_destin as name
med_atc_timeframe_comb <- lapply(med_timeframe_comb, function(x)
melt(x, id.vars =c("outcome_id", "empi", "pred_date", "med_form", "time_diff"),
measure=patterns("^atc_cat_"), variable.name="med_destin_name",
value.name="med_destin"))
med_atc_timeframe_comb <- lapply(med_atc_timeframe_comb, function(x)
dcast.data.table(x, outcome_id + empi + pred_date ~ med_form + med_destin1,
fun.aggregate=list(length, function(x) min(x, na.rm=T)), value.var = "time_diff",
subset=.(!med_destin1=="" & !is.na(med_destin1))))
med_atc_timeframe_comb <- feature_var_format(med_atc_timeframe_comb)
invisible(mapply(function(DT,name_ext) setnames(DT, grep("pres|adm",
names(DT), value=T), paste0("med_med.count_med.atc_med.", gsub("(.*?)_(.*)", "\\1..\\2",
grep("pres|adm", names(DT), value=T)),name_ext)), DT = med_atc_timeframe_comb,
name_ext_extended))
#############################################################################
## reshaping - create ed med order count vars (snomed) NO ANTI & impose feature
## categorisation ("ed_ed.med.order.count.."..."_short/_long")
med_snomed_excl_anti_timeframe_comb <- lapply(med_timeframe_comb, function(x)
melt(x, id.vars =c("outcome_id", "empi", "pred_date", "med_form", "snomed_anti_cat", "time_diff"),
measure=patterns("^snomed_destin"), variable.name="med_destin_name",
value.name="med_destin"))
med_snomed_excl_anti_timeframe_comb <- lapply(med_snomed_excl_anti_timeframe_comb, function(x)
dcast.data.table(x, outcome_id + empi + pred_date ~ med_form + med_destin1,
fun.aggregate=list(length, function(x) min(x, na.rm=T)), value.var = "time_diff",
subset=.(!med_destin1=="" & !is.na(med_destin1) & is.na(snomed_anti_cat))))
med_snomed_excl_anti_timeframe_comb <- feature_var_format(med_snomed_excl_anti_timeframe_comb)
invisible(mapply(function(DT,name_ext) setnames(DT, grep("pres|adm",
names(DT), value=T), paste0("med_med.count_med.snomed.excl.anti_excl.anti.med.", gsub("(.*?)_(.*)", "\\1..\\2",
grep("pres|adm", names(DT), value=T)),name_ext)), DT = med_snomed_excl_anti_timeframe_comb,
name_ext_extended))
##############################################################################
### reshaping - create ed med order count vars (snomed) NO CHEMO & impose feature
### categorisation ("ed_ed.med.order.count.."..."_short/_long")
# XXX NOTE: ED Med order count vars == sum of ed cc dummies over timeperiod in question
med_snomed_excl_chemo_timeframe_comb <- lapply(med_timeframe_comb, function(x)
melt(x, id.vars =c("outcome_id", "empi", "pred_date", "med_form", "snomed_chemo_cat", "time_diff"),
measure=patterns("^snomed_destin"), variable.name="med_destin_name",
value.name="med_destin"))
med_snomed_excl_chemo_timeframe_comb <- lapply(med_snomed_excl_chemo_timeframe_comb, function(x)
dcast.data.table(x, outcome_id + empi + pred_date ~ med_form + med_destin1,
fun.aggregate=list(length, function(x) min(x, na.rm=T)), value.var = "time_diff",
subset=.(!med_destin1=="" & !is.na(med_destin1) & is.na(snomed_chemo_cat))))
med_snomed_excl_chemo_timeframe_comb <- feature_var_format(med_snomed_excl_chemo_timeframe_comb)
invisible(mapply(function(DT,name_ext) setnames(DT, grep("pres|adm",
names(DT), value=T), paste0("med_med.count_med.snomed.excl.chemo_excl.chemo.med.", gsub("(.*?)_(.*)", "\\1..\\2",
grep("pres|adm", names(DT), value=T)),name_ext)), DT = med_snomed_excl_chemo_timeframe_comb,
name_ext_extended))
##############################################################################
### reshaping - create ed med order count vars -- ANTIBIOTIC(snomed) & impose feature
### categorisation ("ed_ed.med.order.count.."..."_short/_long")
# XXX NOTE: ED Med order count vars == sum of ed cc dummies over timeperiod in question
med_snomed_anti_timeframe_comb <- lapply(med_timeframe_comb, function(x)
melt(x, id.vars =c("outcome_id", "empi", "pred_date", "med_form", "snomed_anti_cat", "time_diff"),
measure=patterns("^snomed_destin"), variable.name="med_destin_name",
value.name="med_destin"))
med_snomed_anti_timeframe_comb <- lapply(med_snomed_anti_timeframe_comb, function(x)
if(nrow(x[!med_destin1=="" & !is.na(snomed_anti_cat)])>0) { dcast.data.table(x, outcome_id + empi + pred_date ~
med_form + snomed_anti_cat,
fun.aggregate=list(length, function(x) min(x, na.rm=T)), value.var = "time_diff",
subset=.(!med_destin1=="" & !is.na(snomed_anti_cat)))} else {data.table (empi = character(0),
outcome_id = numeric(0), pred_date=numeric(0))})
med_snomed_anti_timeframe_comb <- feature_var_format(med_snomed_anti_timeframe_comb)
invisible(mapply(function(DT,name_ext) if (length(grep("pres|adm",
names(DT), value=T))>0) {setnames(DT, grep("pres|adm",
names(DT), value=T), paste0("med_med.count_med.snomed.anti_anti.med.", gsub("(.*?)_(.*)", "\\1..\\2",
grep("pres|adm", names(DT), value=T)),name_ext))}, DT = med_snomed_anti_timeframe_comb,
name_ext_extended))
med_snomed_anti_timeframe_comb <- lapply(med_snomed_anti_timeframe_comb,
function(x) if(length(grep("med_med.count_med.snomed.anti_anti.med.pres",
names(x), value=T))>0) {x[, med_med.count_med.snomed.anti_anti.med.pres..any:=
rowSums(.SD), .SDcols=grep("med_med.count_med.snomed.anti_anti.med.pres",
names(x), value=T)]} else {x})
med_snomed_anti_timeframe_comb <- lapply(med_snomed_anti_timeframe_comb,
function(x) if(length(grep("med_med.count_med.snomed.anti_anti.med.adm",
names(x), value=T))>0) {x[, med_med.count_med.snomed.anti_anti.med.adm..any:=
rowSums(.SD), .SDcols=grep("med_med.count_med.snomed.anti_anti.med.adm",
names(x), value=T)]} else {x})
invisible(mapply(function(DT,name_ext) if(length(grep("med_med.count_med.snomed.anti_anti.med.*any",
names(DT), value=T))>0) {setnames(DT, grep("med_med.count_med.snomed.anti_anti.med.*any",
names(DT), value=T), paste0(grep("med_med.count_med.snomed.anti_anti.med.*any",
names(DT), value=T),name_ext))}, DT = med_snomed_anti_timeframe_comb, name_ext_extended))
##############################################################################
### reshaping - create ed med order count vars -- ANTIBIOTIC(snomed) & impose feature
### categorisation ("ed_ed.med.order.count.."..."_short/_long")
# XXX NOTE: ED Med order count vars == sum of ed cc dummies over timeperiod in question
med_anti_timeframe_comb <- lapply(med_timeframe_comb, function(x) if(nrow(x[!is.na(snomed_anti_cat)])>0) {
dcast.data.table(x, outcome_id + empi + pred_date ~ paste0("..anti_", snomed_anti_cat),
fun.aggregate=list(length, function(x) min(x, na.rm=T)), value.var = "time_diff",
subset=.(!is.na(snomed_anti_cat)))} else {data.table(empi = character(0),
outcome_id = numeric(0), pred_date=numeric(0))})
med_anti_timeframe_comb <- feature_var_format(med_anti_timeframe_comb)
invisible(mapply(function(DT,name_ext) if(length(grep("..anti_",
names(DT), value=T))>0) {setnames(DT, grep("..anti_",
names(DT), value=T), paste0("med_med.count_med.anti", grep("..anti_", names(DT),
value=T),name_ext))}, DT = med_anti_timeframe_comb, name_ext=name_ext_extended))
##############################################################################
### reshaping - create ed med order count vars -- ANTIBIOTIC(snomed) & impose feature
### categorisation ("ed_ed.med.order.count.."..."_short/_long")
# XXX NOTE: ED Med order count vars == sum of ed cc dummies over timeperiod in question
med_chemo_timeframe_comb <- lapply(med_timeframe_comb, function(x) if(nrow(x[!is.na(snomed_chemo_cat)])>0) {
dcast.data.table(x, outcome_id + empi + pred_date ~ paste0("..chemo_", snomed_chemo_cat),
fun.aggregate=list(length, function(x) min(x, na.rm=T)), value.var = "time_diff",
subset=.(!is.na(snomed_chemo_cat)))} else {data.table (empi = character(0),
outcome_id = numeric(0), pred_date=numeric(0))})
med_chemo_timeframe_comb <- feature_var_format(med_chemo_timeframe_comb)
invisible(mapply(function(DT,name_ext) if(length(grep("..chemo_",
names(DT), value=T))>0) { setnames(DT, grep("..chemo_",
names(DT), value=T), paste0("med_med.count_med.chemo", grep("..chemo_", names(DT),
value=T),name_ext))}, DT = med_chemo_timeframe_comb, name_ext=name_ext_extended))
##############################################################################
### reshaping - create ed med order count vars -- CHEMO(snomed) & impose feature
### categorisation ("ed_ed.med.order.count.."..."_short/_long")
# XXX NOTE: ED Med order count vars == sum of ed cc dummies over timeperiod in question
med_snomed_chemo_timeframe_comb <- lapply(med_timeframe_comb, function(x)
melt(x, id.vars =c("outcome_id", "empi", "pred_date", "med_form", "snomed_chemo_cat", "time_diff"),
measure=patterns("^snomed_destin"), variable.name="med_destin_name",
value.name="med_destin"))
med_snomed_chemo_timeframe_comb <- lapply( med_snomed_chemo_timeframe_comb , function(x)
if (nrow(x[!!med_destin1=="" & !is.na(snomed_chemo_cat)])>0) {dcast.data.table(x, outcome_id + empi + pred_date ~ med_form + snomed_chemo_cat,
fun.aggregate=list(length, function(x) min(x, na.rm=T)), value.var = "time_diff",
subset=.(!med_destin1=="" & !is.na(snomed_chemo_cat)))} else {data.table (empi = character(0),
outcome_id = numeric(0), pred_date=numeric(0))})
med_snomed_chemo_timeframe_comb <- feature_var_format(med_snomed_chemo_timeframe_comb)
invisible(mapply(function(DT,name_ext) if (length(grep("pres|adm",
names(DT), value=T))>0) {setnames(DT, grep("pres|adm",
names(DT), value=T), paste0("med_med.count_med.snomed.chemo_chemo.med.", gsub("(.*?)_(.*)", "\\1..\\2",
grep("pres|adm", names(DT), value=T)),name_ext))}, DT = med_snomed_chemo_timeframe_comb,
name_ext_extended))
med_snomed_chemo_timeframe_comb <- lapply(med_snomed_chemo_timeframe_comb,
function(x) if(length(grep("med_med.count_med.snomed.chemo_chemo.med.pres",
names(x), value=T))>0) {x[, med_med.count_med.snomed.chemo_chemo.med.pres..any:=
rowSums(.SD), .SDcols=grep("med_med.count_med.snomed.chemo_chemo.med.pres",
names(x), value=T)]} else {x})
med_snomed_chemo_timeframe_comb <- lapply(med_snomed_chemo_timeframe_comb,
function(x) if(length(grep("med_med.count_med.snomed.chemo_chemo.med.adm",
names(x), value=T))>0) {x[, med_med.count_med.snomed.chemo_chemo.med.adm..any:=
rowSums(.SD), .SDcols=grep("med_med.count_med.snomed.chemo_chemo.med.adm",
names(x), value=T)]} else {x})
invisible(mapply(function(DT,name_ext) if(length(grep("med_med.count_med.snomed.chemo_chemo.med.*any",
names(DT), value=T))>0) {setnames(DT, grep("med_med.count_med.snomed.chemo_chemo.med.*any",
names(DT), value=T), paste0(grep("med_med.count_med.snomed.chemo_chemo.med.*any",
names(DT), value=T),name_ext))}, DT = med_snomed_chemo_timeframe_comb, name_ext_extended))
##############################################################################
### reshaping - create nci chemo features
if (nrow(med_timeframe_comb[[length(name_ext_extended)]][!(is.na(chemo_cat_nci)|chemo_cat_nci=="null")])>0) {
# XXX NOTE: ED Med order count vars == sum of ed cc dummies over timeperiod in question
med_nci_chemo_timeframe_comb <- lapply(med_timeframe_comb, function(x)
melt(x, id.vars =c("outcome_id", "empi", "pred_date", "time_diff"),
measure=patterns("^chemo_cat_nci"), variable.name="chemo_cat_nci_name",
value.name="chemo_cat_nci"))
med_nci_chemo_timeframe_comb <- lapply(med_nci_chemo_timeframe_comb, function(x)
dcast.data.table(x, outcome_id + empi + pred_date ~ paste0("..chemo_nci_", chemo_cat_nci1),
fun.aggregate=list(length, function(x) min(x, na.rm=T)), value.var = "time_diff",
subset=.(!(is.na(chemo_cat_nci1)|chemo_cat_nci1=="null"))))
med_nci_chemo_timeframe_comb <- feature_var_format(med_nci_chemo_timeframe_comb)
invisible(mapply(function(DT,name_ext) setnames(DT, grep("..chemo_nci",
names(DT), value=T), paste0("med_med.count_med.chemo.nci", grep("..chemo_nci", names(DT),
value=T),name_ext)), DT = med_nci_chemo_timeframe_comb , name_ext=name_ext_extended))
}
##############################################################################
### merge med feature files
med_feature_list <- list("med_snomed_timeframe_comb",
"med_snomed_excl_anti_timeframe_comb",
"med_snomed_excl_chemo_timeframe_comb",
"med_snomed_anti_timeframe_comb",
"med_snomed_chemo_timeframe_comb",
"med_anti_timeframe_comb",
"med_chemo_timeframe_comb",
"med_nci_chemo_timeframe_comb",
"med_atc_timeframe_comb")
med_feature_list <- med_feature_list[which(med_feature_list %in% ls())]
timeframe_combine(med_feature_list)
# lapply(med_feature_list, function(x) get(x)[,outcome_id:=as.character(outcome_id)])
med <- Reduce(mymerge, mget(unlist(med_feature_list)))
##############################################################################
### merge with cohort file - empty records -> 0
#med[,outcome_id:=as.character(outcome_id)]
#cohort[,outcome_id:=as.character(outcome_id)]
med <- med[cohort, mget(names(med)), on=c("outcome_id", "empi", "pred_date")]
non_days_to_last_var <- setdiff(names(med),grep("days_to_last", names(med),value=T))
set_na_zero(med, subset_col=non_days_to_last_var)
##############################################################################
### categorise variables to ensure proper treatment in models -- integer
med_integer <- med[, mget(setdiff(names(med), c("outcome_id", "pred_date", "empi")))]
med_integer[, names(med_integer):=lapply(.SD, function(x) as.integer(x))]
med <- cbind(med[, mget(c("outcome_id", "pred_date", "empi"))], med_integer)
med[, ':='(med_time_min=time_min, med_time_max=time_max)]
med[, grep("med_id$", names(med), value=T):=NULL]
feature_var_format_2(med)
##############################################################################
### return med features & delete key files creted in function
rm(med_integer)
rm(list=unlist(med_feature_list))
return (med)
}
#----------------------------------------------------------------------------#
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.