Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.align = "center",
dpi = 150,
fig.width = 6,
fig.height = 4.5
)
## ----setup, message=FALSE-----------------------------------------------------
# basic setup
library(smdi)
library(ggplot2)
library(survival)
library(gt)
library(broom)
library(fastDummies)
library(survival)
library(mice)
library(dplyr)
library(gridExtra)
library(tibble)
library(forcats)
## -----------------------------------------------------------------------------
# load complete dataset
smdi_data_complete <- smdi_data_complete %>%
dummy_columns(
select_columns = "ses_cat",
remove_most_frequent_dummy = TRUE,
remove_selected_columns = TRUE
)
# determine missingness pattern
age_col <- which(colnames(smdi_data_complete)=="age_num")
miss_pattern <- rep(1, ncol(smdi_data_complete))
miss_pattern_age <- replace(miss_pattern, age_col, 0)
# weights to compute missingness probability
# covariate itself is only predictor
miss_weights_mnar_v <- rep(0, ncol(smdi_data_complete))
miss_weights_mnar_v <- replace(miss_weights_mnar_v, age_col, 1)
miss_prop_age <- .55
set.seed(42)
smdi_data_mnar_v <- ampute(
data = smdi_data_complete,
prop = miss_prop_age,
mech = "MNAR",
patterns = miss_pattern_age,
weights = miss_weights_mnar_v,
bycases = TRUE,
type = "LEFT"
)$amp
## ----warning=FALSE------------------------------------------------------------
# plot
bind_rows(
smdi_data_complete %>% select(age_num) %>% mutate(dataset = "Complete"),
smdi_data_mnar_v %>% select(age_num) %>% mutate(dataset = "MNAR(value)")
) %>%
ggplot(aes(x = dataset, y = age_num)) +
geom_violin() +
geom_boxplot(width = 0.3) +
stat_summary(
fun = "mean",
geom = "pointrange",
color = "red"
) +
labs(
y = "Age [years]",
x = "Cohort"
) +
theme_bw()
## -----------------------------------------------------------------------------
smdi_diagnose(
data = smdi_data_mnar_v,
covar = "age_num",
model = "cox",
form_lhs = "Surv(eventtime, status)"
) %>%
smdi_style_gt()
## -----------------------------------------------------------------------------
# outcome model (see data generation script)
cox_lhs <- "survival::Surv(eventtime, status)"
covariates <- smdi_data_complete %>%
select(-c(exposure, eventtime, status)) %>%
names()
cox_rhs <- paste(covariates, collapse = " + ")
cox_form <- as.formula(paste(cox_lhs, "~ exposure +", cox_rhs))
cox_form
## -----------------------------------------------------------------------------
# true outcome model
cox_fit_true <- coxph(cox_form, data = smdi_data_complete) %>%
tidy(exponentiate = TRUE, conf.int = TRUE) %>%
filter(term == "exposure") %>%
select(term, estimate, conf.low, conf.high, std.error) %>%
mutate(analysis = "True estimate")
# complete case analysis
cox_fit_cc <- coxph(cox_form, data = smdi_data_mnar_v) %>%
tidy(exponentiate = TRUE, conf.int = TRUE) %>%
filter(term == "exposure") %>%
select(term, estimate, conf.low, conf.high, std.error) %>%
mutate(analysis = "Complete case analysis")
# Multiple imputation (predictive mean matching)
cox_fit_imp <- mice(
data = smdi_data_mnar_v,
seed = 42,
print = FALSE
) %>%
with(
expr = coxph(formula(paste(format(cox_form), collapse = "")))
) %>%
pool() %>%
summary(conf.int = TRUE, exponentiate = TRUE) %>%
filter(term == "exposure") %>%
select(term, estimate, conf.low = `2.5 %`, conf.high = `97.5 %`, std.error) %>%
mutate(analysis = "Multiple imputation")
forest <- bind_rows(cox_fit_true, cox_fit_cc, cox_fit_imp) %>%
mutate(analysis = factor(analysis, levels = c("True estimate", "Complete case analysis", "Multiple imputation"))) %>%
ggplot(aes(y = fct_rev(analysis))) +
geom_point(aes(x = estimate), shape = 15, size = 3) +
geom_errorbar(aes(xmin = conf.low, xmax = conf.high)) +
geom_vline(xintercept = cox_fit_true[["estimate"]], linetype = "dashed") +
labs(x = "Hazard ratio (95% CI)", y= "") +
theme_bw()
table <- bind_rows(cox_fit_true, cox_fit_cc, cox_fit_imp) %>%
select(-term) %>%
relocate(analysis, .before = estimate) %>%
mutate(across(where(is.numeric), ~round(.x, 2)))
grid.arrange(tableGrob(table, rows = NULL), forest)
## ----echo=FALSE, eval=FALSE---------------------------------------------------
# # compute the conditional mean difference in age
# # between dataset with complete observations
# # and dataset with partially observed age covariate
# lm_form <- as.formula(paste("age_num ~ complete_dataset + exposure + ", paste(covariates[-1], collapse = "+")))
#
# data_combined <- rbind(
# smdi_data_complete %>% mutate(complete_dataset = 1),
# smdi_data_mnar_v %>% mutate(complete_dataset = 0)
# )
#
# cond_mean_diff <- lm(lm_form, data = data_combined)$coefficients[["complete_dataset"]]
# cond_mean_diff
## ----fig.width=6--------------------------------------------------------------
# initialize method vector
method_vector <- rep("", ncol(smdi_data_mnar_v))
# columns for narfcs imputation with sensitivity parameter
mnar_imp_method <- which(colnames(smdi_data_mnar_v) == "age_num")
# update method vector
method_vector <- replace(x = method_vector, list = c(mnar_imp_method), values = c("mnar.norm"))
# modeled over a range of deltas
narfcs_modeled <- function(i){
# mnar model specification ('i' is delta parameter)
mnar_blot <- list(age_num = list(ums = paste(i)))
narfcs_imp <- mice(
data = smdi_data_mnar_v,
method = method_vector,
blots = mnar_blot,
seed = 42,
print = FALSE
) %>%
with(
expr = coxph(formula(paste(format(cox_form), collapse = "")))
) %>%
pool() %>%
summary(conf.int = TRUE, exponentiate = TRUE) %>%
filter(term == "exposure") %>%
select(term, estimate, conf.low = `2.5 %`, conf.high = `97.5 %`, std.error) %>%
mutate(delta = i)
}
narfcs_range <- lapply(
X = seq(-25, 25, 1),
FUN = narfcs_modeled
)
narfcs_range_df <- do.call(rbind, narfcs_range)
reference_lines <- tibble(
yintercept = c(cox_fit_true[[2]]),
Reference = c("TRUE HR"),
color = c("darkgreen")
)
narfcs_range_df %>%
ggplot(aes(x = delta, y = estimate)) +
geom_point() +
geom_line() +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.15) +
labs(x = "Sensitivity parameter δ", y = "Hazard ratio (95% CI)") +
scale_x_continuous(breaks = seq(-25, 25, 2), limits = c(-25, 25)) +
scale_y_continuous(breaks = seq(0.6, 1.2, 0.1), limits = c(0.6, 1.2)) +
geom_hline(aes(yintercept = yintercept, color = Reference), reference_lines) +
scale_colour_manual(values = reference_lines$color) +
theme_bw() +
theme(legend.position="top")
## -----------------------------------------------------------------------------
smdi_data %>%
smdi_summarize()
## -----------------------------------------------------------------------------
# we one hot encode the `ses_cat` variable again
# in the smdi_data dataset
smdi_data <- smdi_data %>%
dummy_columns(
select_columns = "ses_cat",
remove_most_frequent_dummy = TRUE,
remove_selected_columns = TRUE
)
# initialize method vector
method_vector <- rep("", ncol(smdi_data))
# specify columns for narfcs and 'normal' imputation
pdl1_mnar_col <- which(colnames(smdi_data) == "pdl1_num")
ecog_mar_col <- which(colnames(smdi_data) == "ecog_cat")
egfr_mnar_col <- which(colnames(smdi_data) == "egfr_cat")
# update method vector
method_vector <- replace(
x = method_vector,
list = c(pdl1_mnar_col, ecog_mar_col, egfr_mnar_col),
values = c("mnar.norm", "logreg", "logreg")
)
# modeled over a range of deltas
narfcs_modeled <- function(i){
# mnar model specification for pdl1_num ('i' is delta parameter)
mnar_blot <- list(pdl1_num = list(ums = paste(i)))
narfcs_imp <- mice(
data = smdi_data,
method = method_vector,
blots = mnar_blot,
seed = 42,
print = FALSE
) %>%
with(
expr = coxph(formula(paste(format(cox_form), collapse = "")))
) %>%
pool() %>%
summary(conf.int = TRUE, exponentiate = TRUE) %>%
filter(term == "exposure") %>%
select(term, estimate, conf.low = `2.5 %`, conf.high = `97.5 %`, std.error) %>%
mutate(delta = i)
}
narfcs_range <- lapply(
X = seq(-25, 25, 1),
FUN = narfcs_modeled
)
narfcs_range_df <- do.call(rbind, narfcs_range)
reference_lines <- tibble(
yintercept = c(cox_fit_true[[2]]),
Reference = c("TRUE HR"),
color = c("darkgreen")
)
narfcs_range_df %>%
ggplot(aes(x = delta, y = estimate)) +
geom_point() +
geom_line() +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.15) +
labs(x = "Sensitivity parameter δ", y = "Hazard ratio (95% CI)") +
scale_x_continuous(breaks = seq(-25, 25, 2), limits = c(-25, 25)) +
scale_y_continuous(breaks = seq(0.5, 1.2, 0.1), limits = c(0.5, 1.2)) +
geom_hline(aes(yintercept = yintercept, color = Reference), reference_lines) +
scale_colour_manual(values = reference_lines$color) +
theme_bw() +
theme(legend.position="top")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.