In ML4LHS/gpmodels: A Grammar of Prediction Models

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

install packages

install.packages("snakecase")
install.packages("h2o")
install.packages("remotes")
remotes::install_github('ML4LHS/gpmodels')

setup

library(tidyr)
library(dplyr)
library(readr)
library(magrittr)
library(snakecase)
library(gpmodels)
library(lubridate)
library(ggplot2)

https://data.world/informatics-edu/synthea-synthetic-patient-informationus

https://synthea.mitre.org/downloads

Import data

temp <- tempfile()
download.file("https://synthetichealth.github.io/synthea-sample-data/downloads/synthea_sample_data_csv_apr2020.zip", temp)
unzip(temp)

Format data

patients = read_csv("./csv/patients.csv") %>% 
  janitor::clean_names() %>% 
  # mutate(description = to_snake_case(description)) %>% 
  select(patient = id, birthdate, deathdate, marital, race, ethnicity, gender)

patients %>% str()

conditions = read_csv("./csv/conditions.csv") %>% 
  janitor::clean_names() %>% 
  mutate(description = to_snake_case(description), category = "condition", value = 1) %>% 
  select(patient, encounter, time = start, category, description = description)

conditions %>% str()

observations = read_csv("./csv/observations.csv") %>% 
  janitor::clean_names() %>% 
  unite(col = "description", c("description", "units")) %>% 
  mutate(description = to_snake_case(description), category = "observation") %>% 
  select(patient, encounter, time = date, category, description, value)

observations %>% str()

encounters = read_csv("./csv/encounters.csv") %>% 
  janitor::clean_names() %>% 
  filter(encounterclass == "inpatient") %>% 
  mutate(description = to_snake_case(description)) %>% 
  select(patient, encounter = id, start, stop, encounterclass, encounter_descr = description)

Define static outcome

A static outcome must be defined and joined with the fixed data

outcome_pne = conditions %>% 
  filter(grepl("pneumonia", description)) %>% 
  group_by(encounter) %>% 
  slice_min(time) %>% 
  ungroup() %>% 
  select(patient, encounter, pneumonia_time = time)

outcome_pne %>% str()

Build cohort and demographic data (fixed data)

fixed_data = patients %>% 
  inner_join(encounters %>% filter(encounterclass == "inpatient") %>% unique() %>% arrange(desc(start))) %>% 
  left_join(outcome_pne %>% select(patient, pneumonia_time)) %>% 
  mutate(pneumonia = if_else(!is.na(pneumonia_time), "PNA", "No_PNA") %>% factor(levels = c("PNA", "No_PNA"))) %>% 
  mutate(pneumonia = if_else(!is.na(pneumonia_time), 1, 0, 0) %>% as.factor()) %>% 
  mutate(pred_start = start, pred_stop = coalesce(deathdate, pneumonia_time, stop)) %>% 
  select(-c(start, stop)) %>% 
  filter(!is.na(pred_start), # prediction start must be present
         pred_start <= pred_stop) %>%  # prediction start should never be greater than prediction stop
  distinct(across(c(encounter)), .keep_all = TRUE)

Temporal data

temporal_data = bind_rows(conditions, observations) %>% 
  select(-encounter) %>% 
  inner_join(fixed_data %>% select(patient, encounter)) %>% 
  mutate(time = as_datetime(time))

Define data model

start predictions: admission end predictions: discharge prediction interval: 6 hours

tf = gpmodels::time_frame(fixed_data = fixed_data,
                          temporal_data = temporal_data, # timestamped data
                          fixed_id = 'encounter', 
                          fixed_start = 'pred_start', 
                          fixed_end = 'pred_stop', 
                          temporal_id = 'encounter', 
                          temporal_time = 'time', 
                          temporal_variable = 'description', 
                          temporal_value = ,'value',
                          temporal_category = 'category', 
                          step = hours(6), # how often to make predictions
                          max_length = days(7), # maximum time limit to make predictions (e.g. 7 days inpatient)
                          output_folder = './gpmodels_out/',
                          create_folder = TRUE,
                          chunk_size = 2000)

tf$fixed_data %>% 
  janitor::tabyl(pneumonia)

tf = tf %>% 
  pre_dummy_code()

Define custom functions

presence = function (x) { if_else(length(x) > 0, 1, 0, 0) }

Define predictors

future::plan('multisession')

tf %>% 
  add_baseline_predictors(category = "condition", 
                          lookback = months(12), 
                          window = months(3), 
                          stats = c(presence = presence))

tf %>% 
  add_baseline_predictors(category = "observation", 
                          lookback = months(12), 
                          # window = months(3), 
                          stats = c(min = min, 
                                    max = max, 
                                    median = median))

tf %>% 
  add_rolling_predictors(category = "observation", 
                          lookback = hours(12), 
                          window = hours(6), 
                          stats = c(min = min, 
                                    max = max, 
                                    median = median, 
                                    last = dplyr::last))

Define rolling outcome

pneumonia in the next 24 hours

tf %>% 
  add_rolling_outcomes(variables = "pneumonia", 
                       lookahead = hours(24), 
                       stats = c(presence = presence))

Randomly split patients into groups for cross-validation

set.seed(1234)
folds = tf$fixed_data %>%
  select(patient) %>%
  distinct() %>%
  group_by(patient) %>%
  mutate(fold = sample.int(6, patient %>% length(), replace = TRUE))

folds %>% write_csv(file.path("folds.csv"))

folds %>% janitor::tabyl(fold)

Combine prepped data

gpm_prepped = tf %>% 
  combine_output() %>% 
  write_csv("prepped_data.csv")

Build model

response = "pneumonia"
predictors = setdiff(names(gpm_prepped), c(response,"outcome_pneumonia_presence_24", gpm_prepped %>% select(patient:pred_stop) %>% names()))
predictors %>% length()
# predictors

library(h2o)
h2o.init(nthreads=-1) # multiprocessing support
h2o.removeAll() # Clean slate - just in case the cluster was already running

Prep modeling datasets

training = gpm_prepped %>% inner_join(folds %>% filter(fold %in% 1:4)) %>% 
  mutate(pneumonia = pneumonia %>% as.factor())

validation = gpm_prepped %>% inner_join(folds %>% filter(fold == 5)) %>% 
  mutate(pneumonia = pneumonia %>% as.factor()) %>% select(-fold)

test = gpm_prepped %>% inner_join(folds %>% filter(fold == 6)) %>% 
  mutate(pneumonia = pneumonia %>% as.factor()) %>% select(-fold)

Build and evaluate model

gbm_pna_static = h2o.gbm(x = predictors,
                         y = response,
                         training_frame = training %>% as.h2o(),
                         fold_column = "fold",
                         validation_frame = validation %>% as.h2o(),
                         ntrees = 200,
                         learn_rate = 0.05,
                         learn_rate_annealing = 0.99,
                         score_tree_interval = 10,
                         stopping_rounds = 5,
                         stopping_tolerance = 1e-5,
                         stopping_metric = "AUC",
                         seed = 1234,
                         keep_cross_validation_predictions = TRUE)

gbm_pna_static %>%
  h2o.performance(newdata = test %>% as.h2o()) %>%
  h2o.auc() %>%
  round(3)

gbm_pna_static_importance = h2o.varimp(gbm_pna_static) %>%
  arrange(desc(relative_importance))

gbm_pna_static_importance %>%
  as.data.frame() %>%
  head(20) %>%
  mutate(relative_importance = relative_importance %>% round(2)) %>%
  select(variable, relative_importance) %>%
  arrange(desc(relative_importance)) %>%
  ggplot(aes(x = reorder(variable, relative_importance), y = relative_importance)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("")

References

Jason Walonoski, Mark Kramer, Joseph Nichols, Andre Quina, Chris Moesel, Dylan Hall, Carlton Duffett, Kudakwashe Dube, Thomas Gallagher, Scott McLachlan, Synthea: An approach, method, and software mechanism for generating synthetic patients and the synthetic electronic health care record, Journal of the American Medical Informatics Association, Volume 25, Issue 3, March 2018, Pages 230–238, https://doi.org/10.1093/jamia/ocx079

ML4LHS/gpmodels documentation built on Feb. 1, 2024, 8:31 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ML4LHS/gpmodels
A Grammar of Prediction Models

In ML4LHS/gpmodels: A Grammar of Prediction Models

install packages

setup

Import data

Format data

Define static outcome

Build cohort and demographic data (fixed data)

Temporal data

Define data model

Define custom functions

Define predictors

Define rolling outcome

Randomly split patients into groups for cross-validation

Combine prepped data

Build model

Prep modeling datasets

Build and evaluate model

References

R Package Documentation

Browse R Packages

We want your feedback!

ML4LHS/gpmodels A Grammar of Prediction Models

In ML4LHS/gpmodels: A Grammar of Prediction Models

install packages

setup

Import data

Format data

Define static outcome

Build cohort and demographic data (fixed data)

Temporal data

Define data model

Define custom functions

Define predictors

Define rolling outcome

Randomly split patients into groups for cross-validation

Combine prepped data

Build model

Prep modeling datasets

Build and evaluate model

References

R Package Documentation

Browse R Packages

We want your feedback!

ML4LHS/gpmodels
A Grammar of Prediction Models