library(survival) library(tidyverse) library(ggfortify)
#surv_data <- readRDS(file = "../data/ed_ex.rds") #surv_data <- readRDS(file = "../../cw-data/cw_ip_spells.rds") #surv_data <- readRDS(file = "../../lgt-data/data-out/spell_table_lgt_20190816_b704e260_ms.rds") surv_data <- readRDS(file = "../../cw-data/spell_table_cw_20190902_b7fae6f7.rds") surv_data <- add_directorate_variable(surv_data) surv_data <- surv_data %>% filter(admission_method_type == "Emergency Admissions", !(age_band_start %in% c("1-4 yrs", "5-9 yrs", "10-14 yrs")), directorate == "Medical"#, #main_specialty_start %in% c("ACUTE INTERNAL MEDICINE", "GENERAL MEDICINE") ) %>% mutate(start_datetime = if_else(is.na(initial_ed_end_datetime), spell_start, initial_ed_end_datetime), end_datetime = spell_end, age_band_start = forcats::fct_drop(age_band_start)) surv_data <- surv_data %>% mutate(los = difftime(end_datetime, start_datetime), adm_wd = weekdays(start_datetime)) %>% filter(!is.na(los)) %>% mutate(days_since_prev_disch = as.numeric(replace(days_since_prev_disch, days_since_prev_disch <= 0, NA)), disch_in_last_week = if_else(!is.na(days_since_prev_disch) & days_since_prev_disch <=7, TRUE, FALSE), disch_in_last_month = if_else(!is.na(days_since_prev_disch) & days_since_prev_disch <=30, TRUE, FALSE), admitted_before = !is.na(days_since_prev_disch)) units(surv_data$los) <- "hours" clean_surv_data <- surv_data %>% filter(!is.na(los) & los >= 0 & end_datetime < lubridate::ymd_hm('2019-03-01 00:00')) obsLen <- 365*24 # length of observation time (censoring time = end of study) clean_surv_data <- clean_surv_data %>% mutate(eventT = los, obsT = pmin(eventT, obsLen), status = eventT <= obsLen, gender = dplyr::if_else(gender == "Male", 1, 0), adm_calmonth = as.factor(format(start_datetime, "%b")), adm_daytime = dplyr::if_else(dplyr::between(lubridate::hour(start_datetime), 8, 16), 1, 0)) # eventT <- clean_surv_data %>% pull(los) # adm_wd <- clean_surv_data %>% pull(adm_wd) # obsT <- pmin(eventT, obsLen) # observed censored event times # status <- eventT <= obsLen # has event occured? dfSurv <- clean_surv_data %>% select(eventT, obsT, status) dfSurv2 <- clean_surv_data %>% select(spell_number, eventT, obsT, status, adm_wd, adm_calmonth, gender, age_band_start, start_datetime, end_datetime, adm_daytime, days_since_prev_disch, disch_in_last_week, disch_in_last_month, admitted_before)
ggplot(dfSurv, aes(eventT)) + stat_ecdf(geom = "step") + scale_x_continuous(breaks = scales::pretty_breaks(n=12), limits = c(0, 30L*24L)) + ggtitle("Cumulative time since arrival distribution") + xlab("Time from arrival (t)") + ylab("F(t)") + geom_vline(xintercept = obsLen, colour = "blue") #+ #geom_vline(xintercept = 4, colour = "green", linetype = "longdash")
KM0 <- survfit(Surv(obsT, status) ~ 1, type="kaplan-meier", conf.type="log", data=dfSurv) KM1 <- survfit(Surv(obsT, status) ~ adm_wd, type="kaplan-meier", conf.type="log", data=dfSurv2)
autoplot(KM1) + ggtitle(expression(paste("Kaplan-Meier estimate ", hat(S)(t), " with CI"))) + xlab("Time from arrival (t)") + ylab(expression(paste(hat(S)(t), ": % remaining in hospital"))) + geom_vline(xintercept = obsLen, colour = "blue") + scale_x_continuous(breaks = scales::pretty_breaks(n=12), limits = c(0, 30L*24L))
S_function <- make_survival_function(KM0) cond_S_function <- function(t0, t1, S_function) { pmin(S_function(t1)/S_function(t0),1) } ggplot(data.frame(x=c(0,30L*24L)), aes(x)) + stat_function(fun = S_function, geom = "line", aes(colour = "S(x)")) + stat_function(fun = function(x) cond_S_function(1L*24L, x, S_function), geom = "line", aes(colour = "S(x|t0 = 1 day)")) + stat_function(fun = function(x) cond_S_function(2L*24L, x, S_function), geom = "line", aes(colour = "S(x|t0 = 2 days)")) + stat_function(fun = function(x) cond_S_function(4L*24L, x, S_function), geom = "line", aes(colour = "S(x|t0 = 4 days)")) + stat_function(fun = function(x) cond_S_function(7L*24L, x, S_function), geom = "line", aes(colour = "S(x|t0 = 7 days)")) + stat_function(fun = function(x) cond_S_function(14L*24L, x, S_function), geom = "line", aes(colour = "S(x|t0 = 14 days)")) + ggtitle("Probability of remaining in hospital after time t1, given in hospital after time t0") + xlab("Time from arrival (t) hours") + ylab("Probability still in hospital at time t") + scale_x_continuous(breaks = seq(0, 30L*24L, by = 24L), limits = c(0, 30L*24L)) + geom_vline(xintercept = obsLen, colour = "blue") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
T1 <- strptime("2018-09-01 12:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "Europe/London")
predict_residual_occupancy(df = surv_data, t = T1, S_function = S_function, delta_t = 48) get_residual_occupancy(df = surv_data, t = T1, delta_t = 48) dates <- seq(lubridate::ymd_hm('2018-08-01 12:00', tz = "Europe/London"), lubridate::ymd_hm('2019-01-31 12:00', tz = "Europe/London"), by = '1 day') model_output <- run_S_func_model(df = surv_data, S_func = S_function, date_seq = dates) summary(model_output$predictions)
model_output$diagnostics$comp_plot
model_output$diagnostics$err_plot
model_output$diagnostics$rel_err_plot
The errors clearly show (as expected) the effect of weekends on this model - this naive model does not take into account any weekend effect.
Next steps will be to generalise the model to include this weekly effect.
CM1 <- coxph(Surv(obsT, status) ~ adm_wd, data = dfSurv2) KM1S <- survfit(CM1, newdata = data.frame(adm_wd = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) step_funs <- lapply(c(1:7), function(x) {make_survival_function(KM1S[,x])}) names(step_funs) <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
ipss <- get_inpatient_snapshot(df = dfSurv2, t = T1) %>% filter(!is.na(stay_duration)) newdata1 <- ipss %>% select(obsT = stay_duration, adm_wd) %>% mutate(status = TRUE) newdata1d <- newdata1 %>% mutate(obsT = obsT + 48) CM1_prediction_1 <- predict(CM1, newdata = newdata1, type = "expected") CM1_prediction_2 <- predict(CM1, newdata = newdata1d, type = "expected") prob_here_48h_CM1 <- exp(-CM1_prediction_2)/exp(-CM1_prediction_1) predictions_CM1 <- cbind(ipss, CM1_pred_prob = prob_here_48h_CM1) sum(predictions_CM1$CM1_pred_prob) # Now lets embed that into predict_residual_occupancy predict_residual_occupancy(df = dfSurv2, t = T1, delta_t = 48, coxmodel = "Surv(obsT, status) ~ adm_wd") get_residual_occupancy(df = dfSurv2, t = T1, delta_t = 48)
model_output_CM <- run_S_func_model(df = dfSurv2, coxmodel = "Surv(obsT, status) ~ adm_wd", date_seq = dates) summary(model_output_CM$predictions)
model_output_CM$diagnostics$comp_plot
model_output_CM$diagnostics$err_plot
model_output_CM$diagnostics$rel_err_plot
CM2 <- coxph(Surv(obsT, status) ~ adm_wd + adm_daytime + adm_calmonth + gender + age_band_start, data = dfSurv2) newdata2 <- ipss %>% select(obsT = stay_duration, adm_wd, adm_daytime, adm_calmonth, gender, age_band_start) %>% mutate(status = TRUE) newdata2d <- newdata2 %>% mutate(obsT = obsT + 48) CM2_prediction_1 <- predict(CM2, newdata = newdata2, type = "expected") CM2_prediction_2 <- predict(CM2, newdata = newdata2d, type = "expected") prob_here_48h_CM2 <- exp(-CM2_prediction_2)/exp(-CM2_prediction_1) model_output_CM2 <- run_S_func_model(df = dfSurv2, coxmodel = "Surv(obsT, status) ~ adm_wd + adm_daytime + adm_calmonth + gender + age_band_start", date_seq = dates) summary(model_output_CM2$predictions)
model_output_CM2$diagnostics$comp_plot
model_output_CM2$diagnostics$err_plot
model_output_CM2$diagnostics$rel_err_plot
CM3 <- coxph(Surv(obsT, status) ~ adm_wd + adm_daytime + adm_calmonth + gender + age_band_start + disch_in_last_week + disch_in_last_month + admitted_before, data = dfSurv2) newdata3 <- ipss %>% select(obsT = stay_duration, adm_wd, adm_daytime, adm_calmonth, gender, age_band_start, disch_in_last_week, disch_in_last_month, admitted_before) %>% mutate(status = TRUE) newdata3d <- newdata3 %>% mutate(obsT = obsT + 48) CM3_prediction_1 <- predict(CM3, newdata = newdata3, type = "expected") CM3_prediction_2 <- predict(CM3, newdata = newdata3d, type = "expected") prob_here_48h_CM3 <- exp(-CM3_prediction_2)/exp(-CM3_prediction_1) model_output_CM3 <- run_S_func_model(df = dfSurv2, coxmodel = "Surv(obsT, status) ~ adm_wd + adm_daytime + adm_calmonth + gender + age_band_start + disch_in_last_week + disch_in_last_month + admitted_before", date_seq = dates) summary(model_output_CM3$predictions)
model_output_CM3$diagnostics$comp_plot
model_output_CM3$diagnostics$err_plot
model_output_CM3$diagnostics$rel_err_plot
naive_preds <- model_output_CM2$predictions %>% dplyr::select(dates, actual) naive_preds <- naive_preds %>% dplyr::mutate(predicted = dplyr::lag(actual, 2)) %>% dplyr::filter(!is.na(predicted)) naive_preds <- naive_preds %>% dplyr::mutate(error = predicted - actual, rel_error = error/actual, abs_error = abs(error), abs_rel_error = abs(rel_error)) prediction_comparison <- naive_preds %>% select(dates, predicted, actual) %>% gather(key = "type", value = "residual_occ", predicted, actual) comp_plot <- ggplot2::ggplot(data = prediction_comparison, mapping = aes(dates, residual_occ)) + ggplot2::geom_point(aes(group = type, colour = type)) + geom_line(aes(group = type, colour = type)) + ggtitle("Residual occupancy: predictions versus actual") + xlab("Date of prediction") + ylab("Residual occupancy") err_plot <- ggplot2::ggplot(data = naive_preds, mapping = aes(dates, error)) + ggplot2::geom_point() + geom_line() + ggtitle("Residual occupancy: errors in predictions versus actual") + xlab("Date of prediction") + ylab("Error") rel_err_plot <- ggplot2::ggplot(data = naive_preds, mapping = aes(dates, rel_error)) + ggplot2::geom_point() + geom_line() + ggtitle("Residual occupancy: relative errors in predictions versus actual") + xlab("Date of prediction") + ylab("Relative Error") summary(naive_preds)
comp_plot
err_plot
rel_err_plot
AAU - more consistency 7 day Downstream - much less so - could assume no weekend discharges?
General framework above does work to deploy and evaluate these models. However would be better to use the structure of e.g. caret!
This includes: 1. A structure to better represent (conditional) survival functions 2. A function or function template for calculating these survival functions from (a specified, and eventually possibly rolling, "training" subset of) the data 3. A function for applying these survival functions to the data at a given point in time to yield a delta-t horizon prediction for residual occupancy 4. A function or functions to evaluate these predictions across a "test" range
Idea: can I use the newdata argument of survfit to provide the actual covariates of snapshot data, to get a) S(t1 | covs) and b) S(t0 | covs), then compute their ratio to get the probability of still being in? If so this saves a lot of machinery in e.g. predict_residual_occupancy. - This is essentially what the Cox model approach above is doing.
Evaluation by occupancy "zones" - ROC? Time series - esp. for delta on the predictions v v v Busy ED/AAU : ?more admissions Visualise to see the prediction before the effect Time since prev adm- Existence of certain diagnoses previuos: chest pain, asthma, overdose, falls, found on floor, hf copd, dka Prb of overfull given predicted residual occupancy + distrib of expected admits over 48hrs...
?incorporate lou's data?
28th Jan 2020
plot survival curves by age-band
diagnostic prev
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.