Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----setup--------------------------------------------------------------------
library(survML)
library(survival)
library(dplyr)
library(ggplot2)
set.seed(72924)
## ----km, fig.width=8, fig.height=6--------------------------------------------
data(cancer)
km_fit <- survfit(Surv(rfstime, status) ~ 1, data = gbsg)
plot(km_fit, xlab = "Time (days)", ylab = "Recurrence-free survival probability")
## ----data_setup---------------------------------------------------------------
### variables of interest
# rfstime - recurrence-free survival
# status - censoring indicator
# hormon - hormonal therapy treatment indicator
# age - in years
# meno - 1 = premenopause, 2 = post
# size - tumor size in mm
# grade - factor 1,2,3
# nodes - number of positive nodes
# pgr - progesterone receptor in fmol
# er - estrogen receptor in fmol
# create dummy variables and clean data
gbsg$tumgrad2 <- ifelse(gbsg$grade == 2, 1, 0)
gbsg$tumgrad3 <- ifelse(gbsg$grade == 3, 1, 0)
gbsg <- gbsg %>% na.omit() %>% select(-c(pid, grade))
time <- gbsg$rfstime
event <- gbsg$status
X <- gbsg %>% select(-c(rfstime, status)) # remove outcome
# find column indices of features/feature groups
X_names <- names(X)
age_index <- paste0(which(X_names == "age"))
meno_index <- paste0(which(X_names == "meno"))
size_index <- paste0(which(X_names == "size"))
nodes_index <- paste0(which(X_names == "nodes"))
pgr_index <- paste0(which(X_names == "pgr"))
er_index <- paste0(which(X_names == "er"))
hormon_index <- paste0(which(X_names == "hormon"))
grade_index <- paste0(which(X_names %in% c("tumgrad2", "tumgrad3")), collapse = ",")
tum_index <- paste0(which(X_names %in% c("size", "nodes", "pgr", "er", "tumgrad2", "tumgrad3")),
collapse = ",")
person_index <- paste0(which(X_names %in% c("age", "meno", "hormon")), collapse = ",")
feature_group_names <- c("age", "meno.", "size", "nodes",
"prog.", "estro.", "hormone",
"grade")
feature_groups <- c(age_index, meno_index, size_index, nodes_index,
pgr_index, er_index, hormon_index, grade_index)
## ----fig.width=8, fig.height=8------------------------------------------------
# landmark times for AUC
landmark_times <- c(1000, 2000)
output <- vim(type = "AUC",
time = time,
event = event,
X = X,
landmark_times = landmark_times,
large_feature_vector = 1:ncol(X),
small_feature_vector = (1:ncol(X))[-as.numeric(age_index)],
conditional_surv_generator_control = list(SL.library = c("SL.mean", "SL.glm"),
V = 2,
bin_size = 0.5),
large_oracle_generator_control = list(SL.library = c("SL.mean", "SL.glm"),
V = 2),
small_oracle_generator_control = list(SL.library = c("SL.mean", "SL.glm"),
V = 2),
cf_fold_num = 2,
sample_split = FALSE,
scale_est = TRUE)
output$result$indx <- rep(age_index, nrow(output$result))
output$result$indx_name <- rep("age", nrow(output$result))
output$result
## -----------------------------------------------------------------------------
# save the objects that we will reuse
saved_conditional_surv_preds <- output$conditional_surv_preds
saved_large_oracle_preds <- output$large_oracle_preds
saved_folds <- output$folds
saved_approx_times <- output$approx_times
pooled_output <- output$result # save the results for age
# iterate over other feature groups
for (i in 2:length(feature_group_names)){
indx_char <- feature_groups[i]
indx_name <- feature_group_names[i]
indx <- as.numeric(strsplit(indx_char, split = ",")[[1]])
output <- vim(type = "AUC",
time = time,
event = event,
X = X,
landmark_times = landmark_times,
approx_times = saved_approx_times,
large_feature_vector = 1:ncol(X),
small_feature_vector = (1:ncol(X))[-indx],
conditional_surv_preds = saved_conditional_surv_preds,
large_oracle_preds = saved_large_oracle_preds,
cf_folds = saved_folds$cf_folds,
ss_folds = saved_folds$ss_folds,
small_oracle_generator_control = list(SL.library = c("SL.mean", "SL.glm"),
V = 2),
sample_split = TRUE,
scale_est = TRUE)
output$result$indx <- rep(indx_char, nrow(output$result))
output$result$indx_name <- rep(indx_name, nrow(output$result))
pooled_output <- rbind(pooled_output, output$result)
}
plot_results <- function(results, plot_title){
# plot results
p_auc <- results %>%
mutate(landmark_time = factor(landmark_time,
levels = c(1000, 2000),
labels = c("1000 days", "2000 days"))) %>%
arrange(landmark_time, est) %>%
mutate(Order = row_number()) %>%
{ggplot(., aes(x = est, y = Order)) +
geom_errorbarh(aes(xmin = cil, xmax = ciu)) +
geom_point() +
theme_bw() +
xlab("Estimated importance") +
ylab("Feature group") +
xlim(c(0,0.5)) +
scale_y_continuous(
breaks = .$Order,
labels = .$indx_name,
) +
facet_wrap(~landmark_time, dir = "v", strip.position = "right", scales = "free_y", ncol = 1) +
ggtitle(plot_title)+
theme(strip.background = element_blank(),
strip.placement = "outside")
}
return(p_auc)
}
p_auc <- plot_results(pooled_output, "AUC variable importance relative to full feature vector")
p_auc
## ----fig.width=8, fig.height=5------------------------------------------------
# consider joint importance of all tumor-level and person-level features
feature_group_names2 <- c("tumor", "person")
feature_groups2 <- c(tum_index, person_index)
# repeat the analysis for feature groups
for (i in 1:length(feature_group_names2)){
indx_char <- feature_groups2[i]
indx_name <- feature_group_names2[i]
indx <- as.numeric(strsplit(indx_char, split = ",")[[1]])
output <- vim(type = "AUC",
time = time,
event = event,
X = X,
landmark_times = landmark_times,
approx_times = saved_approx_times,
large_feature_vector = 1:ncol(X),
small_feature_vector = (1:ncol(X))[-indx],
conditional_surv_preds = saved_conditional_surv_preds,
large_oracle_preds = saved_large_oracle_preds,
small_oracle_generator_control = list(SL.library = c("SL.mean", "SL.glm"),
V = 2),
cf_folds = saved_folds$cf_folds,
ss_folds = saved_folds$ss_folds,
sample_split = TRUE,
scale_est = TRUE)
output$result$indx <- rep(indx_char, nrow(output$result))
output$result$indx_name <- rep(indx_name, nrow(output$result))
if (!(i == 1)){
pooled_output <- rbind(pooled_output, output$result)
} else{
pooled_output <- output$result
}
}
p_auc <- plot_results(pooled_output, "AUC variable importance relative to full feature vector (groups)")
p_auc
## ----fig.width=8, fig.height=7------------------------------------------------
# For importance relative to baseline features, the 'small' model uses only person-level (baseline) features
# The 'large' model uses baseline + feature of interest
size_index <- paste0(c(size_index, person_index), collapse = ",")
nodes_index <- paste0(c(nodes_index, person_index), collapse = ",")
pgr_index <- paste0(c(pgr_index, person_index), collapse = ",")
er_index <- paste0(c(er_index, person_index), collapse = ",")
grade_index <- paste0(c(grade_index, person_index), collapse = ",")
feature_group_names <- c("size", "nodes", "prog.", "estro.", "grade")
feature_groups <- c(size_index, nodes_index,
pgr_index, er_index, grade_index)
person_index_numeric <- as.numeric(strsplit(person_index, split = ",")[[1]])
for (i in 1:length(feature_group_names)){
indx_char <- feature_groups[i]
indx_name <- feature_group_names[i]
indx <- as.numeric(strsplit(indx_char, split = ",")[[1]])
if (i == 1){
output <- vim(type = "AUC",
time = time,
event = event,
X = X,
landmark_times = landmark_times,
approx_times = saved_approx_times,
large_feature_vector = indx,
small_feature_vector = person_index_numeric,
conditional_surv_preds = saved_conditional_surv_preds,
large_oracle_generator_control = list(SL.library = c("SL.mean", "SL.glm"),
V = 2),
small_oracle_generator_control = list(SL.library = c("SL.mean", "SL.glm"),
V = 2),
cf_folds = saved_folds$cf_folds,
ss_folds = saved_folds$ss_folds,
sample_split = TRUE,
scale_est = TRUE)
saved_small_oracle_preds <- output$small_oracle_preds
} else{
output <- vim(type = "AUC",
time = time,
event = event,
X = X,
landmark_times = landmark_times,
approx_times = saved_approx_times,
large_feature_vector = indx,
small_feature_vector = person_index_numeric,
conditional_surv_preds = saved_conditional_surv_preds,
small_oracle_preds = saved_small_oracle_preds,
large_oracle_generator_control = list(SL.library = c("SL.mean", "SL.glm"),
V = 2),
cf_folds = saved_folds$cf_folds,
ss_folds = saved_folds$ss_folds,
sample_split = TRUE,
scale_est = TRUE)
}
output$result$indx <- rep(indx_char, nrow(output$result))
output$result$indx_name <- rep(indx_name, nrow(output$result))
if (!(i == 1)){
pooled_output <- rbind(pooled_output, output$result)
} else{
pooled_output <- output$result
}
}
p_auc <- plot_results(pooled_output, "AUC variable importance relative to person-level features")
p_auc
## ----fig.width=8, fig.height=7------------------------------------------------
size_index <- paste0(c(size_index, person_index), collapse = ",")
nodes_index <- paste0(c(nodes_index, person_index), collapse = ",")
pgr_index <- paste0(c(pgr_index, person_index), collapse = ",")
er_index <- paste0(c(er_index, person_index), collapse = ",")
grade_index <- paste0(c(grade_index, person_index), collapse = ",")
feature_group_names <- c("size", "nodes", "prog.", "estro.", "grade")
feature_groups <- c(size_index, nodes_index,
pgr_index, er_index, grade_index)
for (i in 1:length(feature_group_names)){
indx_char <- feature_groups[i]
indx_name <- feature_group_names[i]
indx <- as.numeric(strsplit(indx_char, split = ",")[[1]])
if (i == 1){
output <- vim(type = "AUC",
time = time,
event = event,
X = X,
landmark_times = landmark_times,
approx_times = saved_approx_times,
large_feature_vector = (1:ncol(X))[-person_index_numeric],
small_feature_vector = (1:ncol(X))[-indx],
conditional_surv_preds = saved_conditional_surv_preds,
large_oracle_generator_control = list(SL.library = c("SL.mean", "SL.glm"),
V = 2),
small_oracle_generator_control = list(SL.library = c("SL.mean", "SL.glm"),
V = 2),
cf_folds = saved_folds$cf_folds,
ss_folds = saved_folds$ss_folds,
sample_split = TRUE,
scale_est = TRUE)
saved_large_oracle_preds <- output$large_oracle_preds
} else{
output <- vim(type = "AUC",
time = time,
event = event,
X = X,
landmark_times = landmark_times,
approx_times = saved_approx_times,
large_feature_vector = (1:ncol(X))[-person_index_numeric],
small_feature_vector = (1:ncol(X))[-indx],
conditional_surv_preds = saved_conditional_surv_preds,
large_oracle_preds = saved_large_oracle_preds,
small_oracle_generator_control = list(SL.library = c("SL.mean", "SL.glm"),
V = 2),
cf_folds = saved_folds$cf_folds,
ss_folds = saved_folds$ss_folds,
sample_split = TRUE,
scale_est = TRUE)
}
output$result$indx <- rep(indx_char, nrow(output$result))
output$result$indx_name <- rep(indx_name, nrow(output$result))
if (!(i == 1)){
pooled_output <- rbind(pooled_output, output$result)
} else{
pooled_output <- output$result
}
}
p_auc <- plot_results(pooled_output, "Adjusted AUC variable importance relative to all tumor-level features")
p_auc
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.