Based on this article.
library(survival) library(tidyverse) library(ggfortify)
#surv_data <- readRDS(file = "../data/ed_ex.rds") #surv_data <- readRDS(file = "../../cw-data/cw_ip_spells.rds") #surv_data <- readRDS(file = "../../lgt-data/data-out/spell_table_lgt_20190816_b704e260_ms.rds") surv_data <- readRDS(file = "../../cw-data/spell_table_cw_20190827_38ba2c47.rds") spec_dir_mapping <- readr::read_csv("../data/SPECIALTY_HCPmap.csv") surv_data <- surv_data %>% dplyr::left_join(spec_dir_mapping, by = c("main_specialty_start" = "spec_name")) surv_data <- surv_data %>% filter(admission_method_type == "Emergency Admissions", !(age_band_start %in% c("1-4 yrs", "5-9 yrs", "10-14 yrs")), directorate == "Medical"#, #main_specialty_start %in% c("ACUTE INTERNAL MEDICINE", "GENERAL MEDICINE") ) %>% mutate(start_datetime = initial_ed_end_datetime, end_datetime = spell_end, age_band_start = forcats::fct_drop(age_band_start)) surv_data <- surv_data %>% mutate(los = difftime(end_datetime, start_datetime), adm_wd = weekdays(start_datetime)) %>% filter(!is.na(los)) units(surv_data$los) <- "hours" clean_surv_data <- surv_data %>% filter(!is.na(los) & los >= 0 & end_datetime < lubridate::ymd_hm('2017-09-01 00:00')) obsLen <- 365*24 # length of observation time (censoring time = end of study) clean_surv_data <- clean_surv_data %>% mutate(eventT = los, obsT = pmin(eventT, obsLen), status = eventT <= obsLen, gender = dplyr::if_else(gender == "Male", 1, 0), adm_calmonth = as.factor(format(start_datetime, "%b")), adm_daytime = dplyr::if_else(dplyr::between(lubridate::hour(start_datetime), 8, 16), 1, 0)) # eventT <- clean_surv_data %>% pull(los) # adm_wd <- clean_surv_data %>% pull(adm_wd) # obsT <- pmin(eventT, obsLen) # observed censored event times # status <- eventT <= obsLen # has event occured? dfSurv <- clean_surv_data %>% select(eventT, obsT, status) dfSurv2 <- clean_surv_data %>% select(spell_number, eventT, obsT, status, adm_wd, adm_calmonth, gender, age_band_start, start_datetime, end_datetime, adm_daytime)
ggplot(dfSurv, aes(eventT)) + stat_ecdf(geom = "step") + scale_x_continuous(breaks = scales::pretty_breaks(n=12), limits = c(0, 30L*24L)) + ggtitle("Cumulative time since arrival distribution") + xlab("Time from arrival (t)") + ylab("F(t)") + geom_vline(xintercept = obsLen, colour = "blue") #+ #geom_vline(xintercept = 4, colour = "green", linetype = "longdash")
KM0 <- survfit(Surv(obsT, status) ~ 1, type="kaplan-meier", conf.type="log", data=dfSurv) KM1 <- survfit(Surv(obsT, status) ~ adm_wd, type="kaplan-meier", conf.type="log", data=dfSurv2)
autoplot(KM1) + ggtitle(expression(paste("Kaplan-Meier estimate ", hat(S)(t), " with CI"))) + xlab("Time from arrival (t)") + ylab(expression(paste(hat(S)(t), ": % remaining in hospital"))) + geom_vline(xintercept = obsLen, colour = "blue") + scale_x_continuous(breaks = scales::pretty_breaks(n=12), limits = c(0, 30L*24L))
S_function <- make_survival_function(KM0) cond_S_function <- function(t0, t1, S_function) { pmin(S_function(t1)/S_function(t0),1) } ggplot(data.frame(x=c(0,30L*24L)), aes(x)) + stat_function(fun = S_function, geom = "line", aes(colour = "S(x)")) + stat_function(fun = function(x) cond_S_function(1L*24L, x, S_function), geom = "line", aes(colour = "S(x|t0 = 1 day)")) + stat_function(fun = function(x) cond_S_function(2L*24L, x, S_function), geom = "line", aes(colour = "S(x|t0 = 2 days)")) + stat_function(fun = function(x) cond_S_function(4L*24L, x, S_function), geom = "line", aes(colour = "S(x|t0 = 4 days)")) + stat_function(fun = function(x) cond_S_function(7L*24L, x, S_function), geom = "line", aes(colour = "S(x|t0 = 7 days)")) + stat_function(fun = function(x) cond_S_function(14L*24L, x, S_function), geom = "line", aes(colour = "S(x|t0 = 14 days)")) + ggtitle("Probability of remaining in hospital after time t1, given in hospital after time t0") + xlab("Time from arrival (t) hours") + ylab("Probability still in hospital at time t") + scale_x_continuous(breaks = seq(0, 30L*24L, by = 24L), limits = c(0, 30L*24L)) + geom_vline(xintercept = obsLen, colour = "blue") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
Next we should estimate the variability of this for typical medical population e.g. 180 medical pts. H: with full info can predict e.g. 3day horizon given 2day; but not for e.g. 100days.
So lets say we have 30 (or 300) patients who have been in hospital already for (random exponentially distributed) times (in hours):
times_in_hospital_1 <- floor(rexp(30, 1/51.6)) times_in_hospital_2 <- floor(rexp(300, 1/51.6)) times_in_hospital_1 times_in_hospital_2
Now use the survival function from above to estimate the probability each patient will still be in after a further 48 hours.
probabilities_still_in_48h_1 <- get_discharge_probability(times_in_hospital_1, S_function, 48) probabilities_still_in_48h_1 probabilities_still_in_48h_2 <- get_discharge_probability(times_in_hospital_2, S_function, 48) probabilities_still_in_48h_2
These are the parameters of individual bernoulli trials, the sum of which yields a Poisson Binomial distribution. It is possible to calculate this distribution via the discrete Fourier transform of the following vector:
p1 <- probabilities_still_in_48h_1 x1 <- (1/(length(p1)+1)) * get_poisson_binomial_characteristic_values(p1) p2 <- probabilities_still_in_48h_2 x2 <- (1/(length(p2)+1)) * get_poisson_binomial_characteristic_values(p2)
The Poisson Binomial distribution mass function is now given by (for the sample of 30):
pb_dist1 <- fft(x1) pb_dist1 <- tibble::tibble(prob = Re(pb_dist1)) %>% tibble::rowid_to_column("k") %>% mutate(k = k-1) expected_k1 <- pb_dist1 %>% dplyr::mutate(prod = prob*k) %>% summarise(expectation = sum(prod)) %>% pull(expectation) ggplot(data = pb_dist1, aes(x = k, y = prob)) + geom_bar(stat = "identity") + geom_vline(xintercept = expected_k1, colour = "green") expected_k1
The Poisson Binomial distribution mass function is now given by (for the sample of 300):
pb_dist2 <- fft(x2) pb_dist2 <- tibble::tibble(prob = Re(pb_dist2)) %>% tibble::rowid_to_column("k") %>% mutate(k = k-1) expected_k2 <- pb_dist2 %>% dplyr::mutate(prod = prob*k) %>% summarise(expectation = sum(prod)) %>% pull(expectation) ggplot(data = pb_dist2, aes(x = k, y = prob)) + geom_bar(stat = "identity") + geom_vline(xintercept = expected_k2, colour = "green") expected_k2
Next step is to write a function to run this on patients actually in hospital/ward at a specified time t.
T1 <- strptime("2017-07-01 12:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "Europe/London") ipss1 <- get_inpatient_snapshot(df = surv_data, t = T1) %>% filter(!is.na(stay_duration)) ipss1 <- ipss1 %>% mutate(prob_here_48h = get_discharge_probability(stay_duration, S_function, 48)) x_vect1 <- (1/(nrow(ipss1)+1)) * get_poisson_binomial_characteristic_values(ipss1 %>% pull(prob_here_48h)) pb_dist_T1 <- fft(x_vect1) pb_dist_T1 <- tibble::tibble(prob = Re(pb_dist_T1)) %>% tibble::rowid_to_column("k") %>% mutate(k = k-1) expected_k_T1 <- pb_dist_T1 %>% dplyr::mutate(prod = prob*k) %>% summarise(expectation = sum(prod)) %>% pull(expectation) ggplot(data = pb_dist_T1, aes(x = k, y = prob)) + geom_bar(stat = "identity") + geom_vline(xintercept = expected_k_T1, colour = "green") expected_k_T1
Now let's see how accurate that prediction is
ipss1_48h <- get_inpatient_snapshot(df = surv_data, t = T1 + 48*60*60) %>% mutate(residual = spell_number %in% ipss1$spell_number) residual_T1 <- ipss1_48h %>% filter(residual == TRUE) %>% nrow() paste("Predicted residual occupancy: ", expected_k_T1, ". Actual residual occupancy: ", residual_T1) paste("Error: ", expected_k_T1 - residual_T1, ". Relative error: ", (expected_k_T1 - residual_T1)/residual_T1)
Note this is still using the full survival function from the whole data - in reality this could only be calculated from data prior to T1.
For now let's carry on and hack together a means of getting the prediction for a range of times, to get a better feel for the distribution of error.
predict_residual_occupancy(df = surv_data, t = T1, S_function = S_function, delta_t = 48) get_residual_occupancy(df = surv_data, t = T1, delta_t = 48) dates <- seq(lubridate::ymd_hm('2017-01-01 12:00', tz = "Europe/London"), lubridate::ymd_hm('2017-07-31 12:00', tz = "Europe/London"), by = '1 day') model_output <- run_S_func_model(df = surv_data, S_func = S_function, date_seq = dates) summary(model_output$predictions)
model_output$diagnostics$comp_plot
model_output$diagnostics$err_plot
model_output$diagnostics$rel_err_plot
The errors clearly show (as expected) the effect of weekends on this model - this naive model does not take into account any weekend effect.
Next steps will be to generalise the model to include this weekly effect.
CM1 <- coxph(Surv(obsT, status) ~ adm_wd, data = dfSurv2) KM1S <- survfit(CM1, newdata = data.frame(adm_wd = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) step_funs <- lapply(c(1:7), function(x) {make_survival_function(KM1S[,x])}) names(step_funs) <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
ipss <- get_inpatient_snapshot(df = dfSurv2, t = T1) %>% filter(!is.na(stay_duration)) newdata1 <- ipss %>% select(obsT = stay_duration, adm_wd) %>% mutate(status = TRUE) newdata1d <- newdata1 %>% mutate(obsT = obsT + 48) CM1_prediction_1 <- predict(CM1, newdata = newdata1, type = "expected") CM1_prediction_2 <- predict(CM1, newdata = newdata1d, type = "expected") prob_here_48h_CM1 <- exp(-CM1_prediction_2)/exp(-CM1_prediction_1) predictions_CM1 <- cbind(ipss, CM1_pred_prob = prob_here_48h_CM1) sum(predictions_CM1$CM1_pred_prob) # Now lets embed that into predict_residual_occupancy predict_residual_occupancy(df = dfSurv2, t = T1, delta_t = 48, coxmodel = "Surv(obsT, status) ~ adm_wd") get_residual_occupancy(df = dfSurv2, t = T1, delta_t = 48)
model_output_CM <- run_S_func_model(df = dfSurv2, coxmodel = "Surv(obsT, status) ~ adm_wd", date_seq = dates) summary(model_output_CM$predictions)
model_output_CM$diagnostics$comp_plot
model_output_CM$diagnostics$err_plot
model_output_CM$diagnostics$rel_err_plot
CM2 <- coxph(Surv(obsT, status) ~ adm_wd + adm_daytime + adm_calmonth + gender + age_band_start, data = dfSurv2) newdata2 <- ipss %>% select(obsT = stay_duration, adm_wd, adm_daytime, adm_calmonth, gender, age_band_start) %>% mutate(status = TRUE) newdata2d <- newdata2 %>% mutate(obsT = obsT + 48) CM2_prediction_1 <- predict(CM2, newdata = newdata2, type = "expected") CM2_prediction_2 <- predict(CM2, newdata = newdata2d, type = "expected") prob_here_48h_CM2 <- exp(-CM2_prediction_2)/exp(-CM2_prediction_1) model_output_CM2 <- run_S_func_model(df = dfSurv2, coxmodel = "Surv(obsT, status) ~ adm_wd + adm_daytime + adm_calmonth + gender + age_band_start", date_seq = dates) summary(model_output_CM2$predictions)
model_output_CM2$diagnostics$comp_plot
model_output_CM2$diagnostics$err_plot
model_output_CM2$diagnostics$rel_err_plot
naive_preds <- model_output_CM2$predictions %>% dplyr::select(dates, actual) naive_preds <- naive_preds %>% dplyr::mutate(predicted = dplyr::lag(actual, 2)) %>% dplyr::filter(!is.na(predicted)) naive_preds <- naive_preds %>% dplyr::mutate(error = predicted - actual, rel_error = error/actual, abs_error = abs(error), abs_rel_error = abs(rel_error)) prediction_comparison <- naive_preds %>% select(dates, predicted, actual) %>% gather(key = "type", value = "residual_occ", predicted, actual) comp_plot <- ggplot2::ggplot(data = prediction_comparison, mapping = aes(dates, residual_occ)) + ggplot2::geom_point(aes(group = type, colour = type)) + geom_line(aes(group = type, colour = type)) + ggtitle("Residual occupancy: predictions versus actual") + xlab("Date of prediction") + ylab("Residual occupancy") err_plot <- ggplot2::ggplot(data = naive_preds, mapping = aes(dates, error)) + ggplot2::geom_point() + geom_line() + ggtitle("Residual occupancy: errors in predictions versus actual") + xlab("Date of prediction") + ylab("Error") rel_err_plot <- ggplot2::ggplot(data = naive_preds, mapping = aes(dates, rel_error)) + ggplot2::geom_point() + geom_line() + ggtitle("Residual occupancy: relative errors in predictions versus actual") + xlab("Date of prediction") + ylab("Relative Error") summary(naive_preds)
comp_plot
err_plot
rel_err_plot
AAU - more consistency 7 day Downstream - much less so - could assume no weekend discharges?
General framework above does work to deploy and evaluate these models. However would be better to use the structure of e.g. caret!
This includes: 1. A structure to better represent (conditional) survival functions 2. A function or function template for calculating these survival functions from (a specified, and eventually possibly rolling, "training" subset of) the data 3. A function for applying these survival functions to the data at a given point in time to yield a delta-t horizon prediction for residual occupancy 4. A function or functions to evaluate these predictions across a "test" range
Idea: can I use the newdata argument of survfit to provide the actual covariates of snapshot data, to get a) S(t1 | covs) and b) S(t0 | covs), then compute their ratio to get the probability of still being in? If so this saves a lot of machinery in e.g. predict_residual_occupancy. - This is essentially what the Cox model approach above is doing.
Evaluation by occupancy "zones" - ROC? Time series - esp. for delta on the predictions v v v Busy ED/AAU : ?more admissions Visualise to see the prediction before the effect Time since prev adm- Existence of certain diagnoses previuos: chest pain, asthma, overdose, falls, found on floor, hf copd, dka Prb of overfull given predicted residual occupancy + distrib of expected admits over 48hrs...
?incorporate lou's data?
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.