fake_data/fake_data_analysis.R

################################################
#
# R script that runs the analysis of the fake data set. It is the same as used for the real data analysis. Analysis results cannot be exactly reproduced.
# It reproduces:
# Table 3: Estimated AUCs for predicting a rebound in viral load in split sample test set based
# on Cox-PH-weights and Figure 1
# Figure 3: Aalen-Johansen estimates for the cause-specific cumulative
# incidences of failing to maintain HIV RNA undetectable within 2 years of suppression within
# quartiles of the test data set and  ROC curves for the the optimized stacked IPCW bagging.
# 
# We select the tuning parameters as it is described in the appendix of the paper  Gonzalez Ginestet et al. (2019+). "Stacked IPCW Bagging bagging: a case study in the HIV care registry".
# The tuning parameter is done using the function stackBagg::tune_params_ml based on a grid values of each of the hyperparameter considered. 
# We consider the following hyperparameters:
# GAM: degree of freedom (df). we only consider df equal to 3 and 4
# LASSO: lambda
# Random Forest:  num_trees and mtry parameters.
# k-NN:  number of neighbours considered
# SVM: the cost parameter,  the gamma and the kernel. kernel=1 denotes "radial" and kernel=2 denotes "linear".
# Neural Network: number of neurons
# BartMachine:  num_tree parameter,  k parameter and  q parameter.
###############################################

rm(list = ls())
library(dplyr)
library(stackBagg)

#  we load the fake data set which lies in this folder
load("fake_data_train.RData")
load("fake_data_test.RData")

# we subset the train fake data set because the random forest crashes  but one could run all the following 
# lines using the 2335 observations of fake_data_train 

fake_data_train <- fake_data_train[sample(nrow(fake_data_train),1000,replace = FALSE),]

# we define the variables to be included in the analysis
xnam <- names(fake_data_train)[-(1:2)]

# we set the grid of the hyperparameters that we are going to use in the tuning step
grid.hyperparam <- stackBagg::grid_parametersDataHIV(xnam,fake_data_train,tao=730)

# we tune the hyperparameters using the grid 
tuneparams_fakedata<- stackBagg::tune_params_ml(gam_param = grid.hyperparam$gam_param, 
                                                 lasso_param = grid.hyperparam$lasso_param,
                                                 randomforest_param = grid.hyperparam$randomforest_param,
                                                 knn_param = grid.hyperparam$knn_param,
                                                 svm_param = grid.hyperparam$svm_param,
                                                 nn_param = grid.hyperparam$nn_param,
                                                 bart_param = grid.hyperparam$bart_param,
                                                 folds=5,
                                                 xnam=xnam,
                                                 tao=730,
                                                 data=fake_data_train,
                                                 weighting="CoxPH")

# we specify the algorithms
ens.library <-stackBagg::all.algorithms()

# we predict the outcome of interest using the stackBagg with Cox PH weights using all algorithms available in the package
res.stackBagg <- stackBagg::stackBagg(train.data = fake_data_train,
                                test.data = fake_data_test,
                                xnam=xnam,
                                tao=730,
                                weighting = "CoxPH",
                                folds = 5,
                                ens.library = ens.library,
                                tuneparams = tuneparams_fakedata)


###########             Table 2                     ########### 
#Estimated AUCs for predicting a rebound in viral load in split sample test set based
#on Cox-PH-weights.

table3 <- cbind(c(res.stackBagg$auc_ipcwBagg,res.stackBagg$auc_survival[1:2]),c(res.stackBagg$auc_native_weights,rep(NA,7)),c(rep(NA,3),res.stackBagg$auc_survival[3],rep(NA,7)))
table3


######               Figure 3                       ##########
# Panel at right: the ROC curves for the the optimized stacked IPCW bagging
stackBagg::plot_roc(time=fake_data_test$ttilde,delta=fake_data_test$delta, marker=res.stackBagg$prediction_ensBagg[,"Stack"], tao=730, method="ipcw")

#Panel at left: the Aalen-Johansen estimates for the cause-specific cumulative
#incidences of failing to maintain HIV RNA undetectable within 2 years of suppression within
#quartiles of the test data set

library(prodlim)
library(ggplot2)

tao=730

pred.ens <- res.stackBagg$prediction_ensBagg[,"Stack"]
fake_data_test$quartile <- with(fake_data_test,
                           cut(pred.ens, 
                               breaks=quantile(pred.ens, probs=seq(0,1, by=0.2), na.rm=TRUE), 
                               include.lowest=TRUE))
fake_data_test$quartile1 <- as.numeric(data.test$quartile)

cuminc_q=matrix(NA,max(fake_data_test$quartile1),1)
l_ci=matrix(NA,max(fake_data_test$quartile1),1)
u_ci=matrix(NA,max(fake_data_test$quartile1),1)
for(i in 1:max(fake_data_test$quartile1)){
 
  cum_inc_1 <- prodlim(Hist(ttilde,delta)~1,data=fake_data_test[fake_data_test$quartile1==i,],type = "cuminc")
  cuminc_q[i] <- cum_inc_1$cuminc$`1` [cum_inc_1$time>=tao][which.min(cum_inc_1$time>=tao)]
  
  u_ci[i]=cum_inc_1$upper$`1`[cum_inc_1$time>=tao][which.min(cum_inc_1$time>=tao)]
  l_ci[i]=cum_inc_1$lower$`1`[cum_inc_1$time>=tao][which.min(cum_inc_1$time>=tao)]
}

cuminc_all_test=prodlim(Hist(ttilde,delta)~1,data=fake_data_test,type = "cuminc")
ci_all.test=cuminc_all_test$cuminc$`1` [cuminc_all_test$time>=tao][which.min(cuminc_all_test$time>=tao)]
u.all.test=cuminc_all_test$upper$`1` [cuminc_all_test$time>=tao][which.min(cuminc_all_test$time>=tao)]
l.all.test=cuminc_all_test$lower$`1` [cuminc_all_test$time>=tao][which.min(cuminc_all_test$time>=tao)]

#data for the plot
df <- data.frame(x =levels(fake_data_test$quartile),
                 cuminc =cuminc_q,
                 L =l_ci,
                 U =u_ci)

df$x <- factor(df$x, levels = df$x[order(df$cuminc)])

figure1 <- ggplot(df, aes(x = factor(x), y = cuminc)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymax = U, ymin = L), width = 0.25)+
  scale_y_continuous(name="Cumulative Incidence at 2 years \n", breaks=c(0,.1,.2,.27,.3,.4,.5),limits=c(0, .5) )+
  xlab("\n Quartiles of IPCW bagging Super Learner Predictions")+
  theme_bw() + theme(panel.grid.major = element_blank(),plot.margin = margin(10,0,10,0),
                     axis.title.x = element_text( size=16),
                     axis.title.y = element_text( size=16),
                     axis.text.x = element_text( 
                       size=14),
                     axis.text.y = element_text( 
                       size=14)) +  theme(aspect.ratio = 1)+
  geom_hline(yintercept = ci_all.test) +
  geom_hline(yintercept = u.all.test,linetype="dotted")+
  geom_hline(yintercept = l.all.test,linetype="dotted")

figure1
pablogonzalezginestet/stackBagg documentation built on Aug. 27, 2023, 10:21 p.m.