knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
install.packages("snakecase") install.packages("h2o") install.packages("remotes") remotes::install_github('ML4LHS/gpmodels')
library(tidyr) library(dplyr) library(readr) library(magrittr) library(snakecase) library(gpmodels) library(lubridate) library(ggplot2)
https://data.world/informatics-edu/synthea-synthetic-patient-informationus
https://synthea.mitre.org/downloads
temp <- tempfile() download.file("https://synthetichealth.github.io/synthea-sample-data/downloads/synthea_sample_data_csv_apr2020.zip", temp) unzip(temp)
patients = read_csv("./csv/patients.csv") %>% janitor::clean_names() %>% # mutate(description = to_snake_case(description)) %>% select(patient = id, birthdate, deathdate, marital, race, ethnicity, gender) patients %>% str() conditions = read_csv("./csv/conditions.csv") %>% janitor::clean_names() %>% mutate(description = to_snake_case(description), category = "condition", value = 1) %>% select(patient, encounter, time = start, category, description = description) conditions %>% str() observations = read_csv("./csv/observations.csv") %>% janitor::clean_names() %>% unite(col = "description", c("description", "units")) %>% mutate(description = to_snake_case(description), category = "observation") %>% select(patient, encounter, time = date, category, description, value) observations %>% str() encounters = read_csv("./csv/encounters.csv") %>% janitor::clean_names() %>% filter(encounterclass == "inpatient") %>% mutate(description = to_snake_case(description)) %>% select(patient, encounter = id, start, stop, encounterclass, encounter_descr = description)
A static outcome must be defined and joined with the fixed data
outcome_pne = conditions %>% filter(grepl("pneumonia", description)) %>% group_by(encounter) %>% slice_min(time) %>% ungroup() %>% select(patient, encounter, pneumonia_time = time) outcome_pne %>% str()
fixed_data = patients %>% inner_join(encounters %>% filter(encounterclass == "inpatient") %>% unique() %>% arrange(desc(start))) %>% left_join(outcome_pne %>% select(patient, pneumonia_time)) %>% mutate(pneumonia = if_else(!is.na(pneumonia_time), "PNA", "No_PNA") %>% factor(levels = c("PNA", "No_PNA"))) %>% mutate(pneumonia = if_else(!is.na(pneumonia_time), 1, 0, 0) %>% as.factor()) %>% mutate(pred_start = start, pred_stop = coalesce(deathdate, pneumonia_time, stop)) %>% select(-c(start, stop)) %>% filter(!is.na(pred_start), # prediction start must be present pred_start <= pred_stop) %>% # prediction start should never be greater than prediction stop distinct(across(c(encounter)), .keep_all = TRUE)
temporal_data = bind_rows(conditions, observations) %>% select(-encounter) %>% inner_join(fixed_data %>% select(patient, encounter)) %>% mutate(time = as_datetime(time))
start predictions: admission end predictions: discharge prediction interval: 6 hours
tf = gpmodels::time_frame(fixed_data = fixed_data, temporal_data = temporal_data, # timestamped data fixed_id = 'encounter', fixed_start = 'pred_start', fixed_end = 'pred_stop', temporal_id = 'encounter', temporal_time = 'time', temporal_variable = 'description', temporal_value = ,'value', temporal_category = 'category', step = hours(6), # how often to make predictions max_length = days(7), # maximum time limit to make predictions (e.g. 7 days inpatient) output_folder = './gpmodels_out/', create_folder = TRUE, chunk_size = 2000)
tf$fixed_data %>% janitor::tabyl(pneumonia)
tf = tf %>% pre_dummy_code()
presence = function (x) { if_else(length(x) > 0, 1, 0, 0) }
future::plan('multisession') tf %>% add_baseline_predictors(category = "condition", lookback = months(12), window = months(3), stats = c(presence = presence)) tf %>% add_baseline_predictors(category = "observation", lookback = months(12), # window = months(3), stats = c(min = min, max = max, median = median)) tf %>% add_rolling_predictors(category = "observation", lookback = hours(12), window = hours(6), stats = c(min = min, max = max, median = median, last = dplyr::last))
pneumonia in the next 24 hours
tf %>% add_rolling_outcomes(variables = "pneumonia", lookahead = hours(24), stats = c(presence = presence))
set.seed(1234) folds = tf$fixed_data %>% select(patient) %>% distinct() %>% group_by(patient) %>% mutate(fold = sample.int(6, patient %>% length(), replace = TRUE)) folds %>% write_csv(file.path("folds.csv")) folds %>% janitor::tabyl(fold)
gpm_prepped = tf %>% combine_output() %>% write_csv("prepped_data.csv")
response = "pneumonia" predictors = setdiff(names(gpm_prepped), c(response,"outcome_pneumonia_presence_24", gpm_prepped %>% select(patient:pred_stop) %>% names())) predictors %>% length() # predictors library(h2o) h2o.init(nthreads=-1) # multiprocessing support h2o.removeAll() # Clean slate - just in case the cluster was already running
training = gpm_prepped %>% inner_join(folds %>% filter(fold %in% 1:4)) %>% mutate(pneumonia = pneumonia %>% as.factor()) validation = gpm_prepped %>% inner_join(folds %>% filter(fold == 5)) %>% mutate(pneumonia = pneumonia %>% as.factor()) %>% select(-fold) test = gpm_prepped %>% inner_join(folds %>% filter(fold == 6)) %>% mutate(pneumonia = pneumonia %>% as.factor()) %>% select(-fold)
gbm_pna_static = h2o.gbm(x = predictors, y = response, training_frame = training %>% as.h2o(), fold_column = "fold", validation_frame = validation %>% as.h2o(), ntrees = 200, learn_rate = 0.05, learn_rate_annealing = 0.99, score_tree_interval = 10, stopping_rounds = 5, stopping_tolerance = 1e-5, stopping_metric = "AUC", seed = 1234, keep_cross_validation_predictions = TRUE) gbm_pna_static %>% h2o.performance(newdata = test %>% as.h2o()) %>% h2o.auc() %>% round(3)
gbm_pna_static_importance = h2o.varimp(gbm_pna_static) %>% arrange(desc(relative_importance)) gbm_pna_static_importance %>% as.data.frame() %>% head(20) %>% mutate(relative_importance = relative_importance %>% round(2)) %>% select(variable, relative_importance) %>% arrange(desc(relative_importance)) %>% ggplot(aes(x = reorder(variable, relative_importance), y = relative_importance)) + geom_bar(stat = "identity") + coord_flip() + xlab("")
Jason Walonoski, Mark Kramer, Joseph Nichols, Andre Quina, Chris Moesel, Dylan Hall, Carlton Duffett, Kudakwashe Dube, Thomas Gallagher, Scott McLachlan, Synthea: An approach, method, and software mechanism for generating synthetic patients and the synthetic electronic health care record, Journal of the American Medical Informatics Association, Volume 25, Issue 3, March 2018, Pages 230–238, https://doi.org/10.1093/jamia/ocx079
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.