# Global knitr chunk options for this document.
# Text-output options.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
# Figure options: file path prefix and rendered width.
knitr::opts_chunk$set(
  fig.path = "man/figures/README-",
  out.width = "100%"
)

wizard

Windowed Summarization for Autoregressive Data

This package uses windowed summarization to convert time series data into a form that can be modeled by prediction models.

Lifecycle: maturing

Installation

You can install the GitHub version of wizard with:

# Install wizard from the ML4LHS/wizard GitHub repository.
remotes::install_github("ML4LHS/wizard")

How to set up a wiz_frame()

Start by loading the package and defining your wiz_frame(). A wiz_frame is simply a list with the class wiz_frame that contains all the key information needed to describe both your fixed dataset (such as demographics, one row per patient) and your temporal dataset (one row per observation linked to a timestamp).

library(wizard)
library(magrittr)
library(lubridate)

# Select the multisession plan for the future framework.
future::plan("multisession")

# Remove files matching *.* from the wizard output folder under tempdir().
unlink(file.path(tempdir(), "wizard_dir", "*.*"))

# Build the wiz_frame from the fixed data and from the temporal data
# restricted to ids 1 through 100.
wf <- wiz_frame(
  fixed_data = sample_fixed_data,
  temporal_data = dplyr::filter(sample_temporal_data, id %in% 1:100),
  fixed_id = "id",
  fixed_start = "admit_time",
  fixed_end = "dc_time",
  temporal_id = "id",
  temporal_time = "time",
  temporal_variable = "variable",
  temporal_category = "category",
  temporal_value = "value",
  step = hours(6),
  # Optional parameter to limit to the first 7 days of hospitalization.
  max_length = days(7),
  output_folder = file.path(tempdir(), "wizard_dir"),
  create_folder = TRUE
)

Let's look at the automatically generated data dictionaries

# Names of the components stored in the wiz_frame
names(wf)

# Value stored in the step component
wf$step

# Value stored in the step_units component
wf$step_units

# Data dictionary for the fixed data
wf$fixed_data_dict

# Data dictionary for the temporal data
wf$temporal_data_dict

Let's dummy code the temporal categorical variables

# Dummy code the temporal categorical variables and keep the updated wiz_frame.
wf <- wiz_dummy_code(wf)

This affects only the temporal data and not the fixed data.

# Fixed data dictionary after wiz_dummy_code()
wf$fixed_data_dict

# Temporal data dictionary after wiz_dummy_code()
wf$temporal_data_dict

Let's add some predictors and outcomes

The default method writes output to the folder defined in your wiz_frame. When you write your output to file, you are allowed to chain together wiz_add_predictors() and wiz_add_outcomes() functions. This is possible because these functions invisibly return a wiz_frame.

If, however, you set output_file to FALSE, then your actual output is returned (rather than the wiz_frame) so you cannot chain functions.

# Chain predictor and outcome steps on the wiz_frame.
wf %>%
  # Note: You can supply a vector of variables.
  wiz_add_predictors(
    variables = "cr",
    lookback = hours(12),
    window = hours(6),
    stats = c(
      mean = mean,
      min = min,
      max = max,
      median = median,
      length = length
    )
  ) %>%
  # Add baseline creatinine.
  wiz_add_baseline_predictors(
    variables = "cr",
    lookback = days(90),
    offset = hours(10),
    stats = c(min = min)
  ) %>%
  # Cumulative max creatinine since admission.
  wiz_add_growing_predictors(
    variables = "cr",
    stats = c(max = max)
  ) %>%
  # Note: category is always a regular expression.
  wiz_add_predictors(
    category = "med",
    lookback = days(7),
    stats = c(sum = sum)
  ) %>%
  wiz_add_outcomes(
    variables = "cr",
    lookahead = hours(24),
    stats = c(max = max)
  )

Let's combine our output into a single data frame

You can provide wiz_combine() with a set of data frames separated by commas. Alternatively, you can provide a vector of file names using the files argument. If you leave files blank, it will automatically find all the .csv files from the output_folder of your wiz_frame.

This resulting frame is essentially ready for modeling (using tidymodels, for example). Make sure to keep individual patients in the same fold if you divide this dataset into multiple folds.

# Combine the output associated with the wiz_frame into one data frame.
model_data <- wiz_combine(wf)

# Preview the first rows of the combined data.
head(model_data)

Testing wiz_frame without writing output to files

If you want to simply test wiz_frame, you may prefer not to write your output to file. You can accomplish this by setting output_file to FALSE.

# With output_file = FALSE the output itself is piped on to head().
wf %>%
  wiz_add_predictors(
    variables = "cr",
    lookback = hours(12),
    window = hours(6),
    stats = c(
      mean = mean,
      min = min,
      max = max,
      median = median,
      length = length
    ),
    output_file = FALSE
  ) %>%
  head()

You can also supply a vector of variables

# Pass several variables at once as a character vector.
wf %>%
  wiz_add_predictors(
    variables = c("cr", "med_aspirin"),
    lookback = weeks(1),
    stats = c(length = length),
    output_file = FALSE
  ) %>%
  head()

Category accepts regular expressions

# Select by category using a regular expression with two alternatives.
wf %>%
  wiz_add_predictors(
    category = "lab|med",
    lookback = hours(12),
    stats = c(length = length),
    output_file = FALSE
  ) %>%
  head()

Let's benchmark the performance of our package

Running in parallel

# Collect one timing result per execution strategy, keyed by a label.
benchmark_results <- list()

# The multisession plan selected during setup is assumed to still be active;
# uncomment the next line to select it explicitly.
# future::plan('multisession')

# Time a single run of wiz_add_predictors() on the wiz_frame.
# `variables` is spelled out in full rather than relying on partial
# matching of the abbreviated name `variable`.
benchmark_results[["multisession"]] <-
  microbenchmark::microbenchmark(
    wf %>%
      wiz_add_predictors(
        variables = "cr",
        lookback = hours(48),
        window = hours(6),
        stats = c(
          mean = mean,
          min = min,
          max = max,
          median = median,
          length = length
        )
      ),
    times = 1
  )

Running in parallel with a chunk_size of 20

# Copy the wiz_frame and set its chunk_size element to 20.
wf_with_chunks <- wf
wf_with_chunks$chunk_size <- 20

# Time a single run of wiz_add_predictors() on the chunked wiz_frame.
# `variables` is spelled out in full rather than relying on partial
# matching of the abbreviated name `variable`.
benchmark_results[["multisession with chunk_size 20"]] <-
  microbenchmark::microbenchmark(
    wf_with_chunks %>%
      wiz_add_predictors(
        variables = "cr",
        lookback = hours(48),
        window = hours(6),
        stats = c(
          mean = mean,
          min = min,
          max = max,
          median = median,
          length = length
        )
      ),
    times = 1
  )

Running in serial

# Switch the future framework to the sequential plan for the serial run.
future::plan("sequential")

# Time a single run of wiz_add_predictors() under the sequential plan.
# `variables` is spelled out in full rather than relying on partial
# matching of the abbreviated name `variable`.
benchmark_results[["sequential"]] <-
  microbenchmark::microbenchmark(
    wf %>%
      wiz_add_predictors(
        variables = "cr",
        lookback = hours(48),
        window = hours(6),
        stats = c(
          mean = mean,
          min = min,
          max = max,
          median = median,
          length = length
        )
      ),
    times = 1
  )

Benchmark results

# Print the collected benchmark timings.
benchmark_results

# Remove files matching *.* from the wizard output folder under tempdir().
unlink(file.path(tempdir(), "wizard_dir", "*.*"))


ML4LHS/wizard documentation built on May 31, 2021, 1:24 a.m.