knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

If you have not already done so, a proper installation of keras in R is required. Please follow https://tensorflow.rstudio.com/install/
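As a rough sketch (the linked guide is the authoritative reference, and the exact steps depend on your platform and Python setup), a typical installation looks like the following:

# Install the R interface to Keras, then let it set up a Python backend.
# This is a typical setup only; see the guide above if it fails on your platform.
install.packages("keras")
keras::install_keras()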

This is a modified version of the code used to generate the simulation study in the CBNN manuscript. We hope to highlight its modularity, so that users who wish to implement this approach in alternative languages or libraries may do so. Note that the current functions assume tabular data, but they are modular and may be easily extended. Feel free to reach out on the GitHub page if you have a specific implementation request.

Setup environment

# install.packages("cbnn")
library(cbnn)
library(keras)
##########################################
### Required packages for data simulation
##########################################
library(flexsurv)
library(simsurv)
library(splines)
# splines is also needed here, to permit a flexible baseline hazard
############################################
### Required packages to run linear Cox
############################################
library(survival)
############################################
### Required packages to run linear casebase
############################################
library(casebase)
############################################
### Required package for IPA and AUC_{IPCW}
############################################
library(riskRegression)
############################################
### visualization packages
############################################
library(ggplot2)


##########################
###Hyperparameters
##########################

min_delta <- 0     # minimum change in the monitored loss to count as an improvement
epo <- 2000        # maximum number of training epochs
patience <- 10     # early-stopping patience
iteration <- 100   # number of iterations (retained from the simulation study; not used directly below)
numSamp <- 5000    # number of individuals to simulate


# Note that these are not the best hyperparameters, just a set that works reasonably well.
# Proper hyperparameter optimization should ideally be performed for your specific use case.
lr <- c(0.001)     # learning rate
drpt <- c(0.01)    # dropout rate
layer1 <- c(50)    # units in the first hidden layer
layer2 <- c(10)    # units in the second hidden layer
bsize <- c(100)    # batch size
actv <- c("relu")  # activation function

Simulate data with time-varying interaction and complex baseline hazard

First, we define the log cumulative hazard function that simsurv will use to generate event times.

##############################
###set up complex simulation
##############################

# Should the user have specific scenarios they want to simulate, examples can be found at 
# https://cran.r-project.org/web/packages/simsurv/vignettes/simsurv_usage.html#example-3-simulating-under-a-weibull-model-with-time-dependent-effects

# Define a function returning the log cum hazard at time t
logcumhaz <- function(t, x, betas, knots) {

  # Obtain the basis terms for the spline-based log
  # cumulative hazard (evaluated at time t)
  basis <- flexsurv::basis(knots, log(t))

  # Evaluate the log cumulative hazard under the
  # Royston and Parmar specification
  res <- betas[["gamma0"]] * basis[[1]] +
    betas[["gamma1"]] * basis[[2]] +
    betas[["gamma2"]] * basis[[3]] +
    betas[["gamma3"]] * basis[[4]] +
    betas[["gamma4"]] * basis[[5]] +
    betas[["z1"]] * x[["z1"]] +
    betas[["z2"]] * x[["z2"]] +
    betas[["z3"]] * x[["z3"]] +
    betas[["term1"]] * x[["z1"]] * t +
    betas[["term3"]] * x[["z2"]] * x[["z3"]]

  # Return the log cumulative hazard at time t
  res
}
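For reference, the function above encodes a Royston-Parmar-style model: the log cumulative hazard is a spline in log time plus covariate and interaction effects. As a sketch of the quantity simsurv inverts to draw event times,

$$
\log H(t \mid x) = \sum_{j=0}^{4} \gamma_j \, b_j(\log t)
  + \beta_{z_1} z_1 + \beta_{z_2} z_2 + \beta_{z_3} z_3
  + \beta_{\mathrm{term1}} \, z_1 t
  + \beta_{\mathrm{term3}} \, z_2 z_3,
$$

where $b_0(\log t), \ldots, b_4(\log t)$ are the spline basis terms returned by flexsurv::basis at the knots fitted in the next chunk.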

Then, we use flexsurv to provide a realistic estimate of a complex baseline hazard, fitted to breast cancer survival data.

# get a real fit on breast cancer survival to generate our flexible baseline.
true_mod <- flexsurv::flexsurvspline(Surv(rectime, censrec) ~ 1, data = simsurv::brcancer, k = 3) #from flexsurv

#########################
### Define coefficients
#########################
coefficients <- true_mod$coefficients
coefficients <- c(coefficients, z1 = -5, z2 = -1, z3 = 1, term1 = 0.001, term3 = -1) # term1 (z1 by time) and term3 (z2 by z3) are interaction effects

Finally, we simulate the data, perform some processing for convenience and split the data into train-validate-test sets. NOTE: We have not performed casebase sampling yet.

#########################
###Simulate data
#########################
cov<-data.frame(id=1:numSamp,
                z1=c(rep(1,numSamp/2),rep(0,numSamp/2)),
                z2=c(rnorm(numSamp/2,1,0.5),rnorm(numSamp/2,0,0.5)),
                z3=c(rnorm(numSamp,1,0.5)))

dat <- simsurv(betas = coefficients,      # "true" parameter values
               x = cov,                   # covariate data for {numSamp} individuals
               knots = true_mod$knots,    # knot locations for splines
               logcumhazard = logcumhaz,  # definition of log(cumulative hazard) above
               maxt = 10000,              # want to prevent right censoring.
               interval = c(0,100000000)) # interval for root finding
colnames(dat)<-c("id","time","status")

# Introduce random censoring
samp<-sample(seq(1,nrow(dat),by=1), floor(nrow(dat)*(0.1))) #random censoring
dat$status[samp]<-0

# Merge the simulated event times onto the covariate data frame
data <- merge(cov, dat)
data <- data[, -1] # drop the id column


# split the data into training, validation and test sets
spec <- c(train = 0.70, validate = 0.15, test = 0.15)
g <- sample(cut(
  seq(nrow(data)),
  nrow(data) * cumsum(c(0, spec)),
  labels = names(spec)
))

res <- split(data, g)

Fit models

In this section, we fit a linear model using casebase with cubic splines on time, a linear Cox regression, and finally our CBNN model.

Perform correct normalization and pre-processing

Normalize the data according to the training set

We compute the mean and standard deviation of each variable from the training set, then use them to normalize the validation and test sets.

# normalizing function for convenience
normalizer <- function(data, means, sds, maxTime){
  normalized <- as.data.frame(data)
  for (i in 1:ncol(data)){
    if(length(unique(data[,i])) > 1){
      normalized[,i] <- (data[,i] - means[i]) / sds[i]
    }
  }
  # keep the event indicator unscaled and rescale time by the maximum training time
  normalized$status <- data$status
  normalized$time <- data$time / maxTime
  return(normalized)
}



train<-res$train
val<-res$validate
fullTest<-res$test
#get means, standard deviations (maxtime is kept for plotting results later)
sds<-sapply(train, function(x) (sd(x)))
means<-sapply(train, function(x) (mean(x)))
maxTime<-max(train$time)
timeVar<-'time'
statusVar<-'status'

train<-normalizer(train,means,sds,maxTime)
val<-normalizer(res$validate,means,sds,maxTime)
fullTest<-normalizer(fullTest,means,sds,maxTime)
test <- fullTest[, !(colnames(fullTest) %in% statusVar)] # remove the status column; keep only the features and time



#for the linear models (casebase and Cox), we train on all of the training and validation data.
linModelTrain<-as.data.frame(rbind(train,val))

# we prepare the times of interest
times<-seq(from=min(test$time),
           to=max(test$time),
           length.out = 100
)

times <- head(times, -1) # drop the last time point
times <- tail(times, -1) # drop the first time point

We implement a custom predictRisk method for convenience, as the usual riskRegression interface requires models to implement its predictRisk API. Since our models do not follow that implementation scheme, and we prefer the added flexibility for the neural network, we inject our own function into their API. It simply creates an object that their code recognizes, removing the need for riskRegression to calculate the cumulative incidence curves itself. This arbitrary "tunnel" class will be recognized by their API.

# If you want to generate your own matrices for comparison, simply add the tunnel class to a sample(rows) by time (columns) cumulative incidence matrix
# i.e class(cumulativeIncidenceMatrix)<-c("tunnel",class(cumulativeIncidenceMatrix))
predictRisk.tunnel <- function(object, newdata, times, cause, ...){
  class(object)<-class(object)[-1]
  return(object)
}
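For example, a hypothetical cumulative incidence matrix from some other model (rows corresponding to the individuals in fullTest, columns to the evaluation times) could be made compatible with Score in the same way; here it is filled with random values purely to illustrate the expected shape:

# Hypothetical example: wrap an arbitrary cumulative incidence matrix in the tunnel class
myCI <- matrix(runif(nrow(fullTest) * length(times)),
               nrow = nrow(fullTest), ncol = length(times))
class(myCI) <- c("tunnel", class(myCI))
# myCI can now be added to the named list of models passed to riskRegression::Score()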

Fit optimal model

This model shows the best performance we can achieve with the samples we have. Here, we know which interactions and direct effects exist within our data, and we add them explicitly. If you decide to modify the simulated data, this formula should be adjusted accordingly so that it still acts as the ideal, optimal model.

###########################
###casebase optimal model
###########################
final_mod_cb_glm <- fitSmoothHazard(status~bs(time)-time+z1+z2+z3+z1:time+ z2:z3,#+z1:z3
                                    data = linModelTrain,
                                    time = "time",
                                    event="status",
                                    ratio = 100
)

summary(final_mod_cb_glm)
tglmAbsRisk<-absoluteRisk(final_mod_cb_glm,time=times,newdata=test,type="CI")
tglmAbsRisk<-as.data.frame(tglmAbsRisk)
rownames(tglmAbsRisk)<-tglmAbsRisk$time
trueGlmProper<- t(tglmAbsRisk[-1,-1])
class(trueGlmProper)<-c("tunnel",class(trueGlmProper))

Fit a casebase model without interactions

This model serves as the flexible baseline hazard example, demonstrating whether there is a benefit or loss in predictive performance when the baseline hazard is modeled flexibly but the interactions are omitted.

#########################
###casebase+splines
##########################
mod_cb_glm <- fitSmoothHazard(status~bs(time)+.-time,
                              data = linModelTrain,
                              time = "time",
                              event="status",
                              ratio = 100
)

glmAbsRisk<-as.data.frame(absoluteRisk(mod_cb_glm,time=times,newdata=test,type="CI"))
rownames(glmAbsRisk)<-glmAbsRisk$time
glmProper<- t(glmAbsRisk[-1,-1])
class(glmProper)<-c("tunnel",class(glmProper))

Fit a Cox regression model without the interactions

This model serves as a semi-parametric baseline, to see whether there is a benefit or loss in predictive performance. Note: Cox models are supported directly by riskRegression and do not require the tunnel class.

############################
###coxph
############################
cox<-coxph(Surv(time, status) ~ ., data = linModelTrain,x=T)

Fit the CBNN model

Casebase sampling for training and validation sets

Here we fit a CBNN model. First, we case-base sample the training and validation sets. Then, we use the training offset as the validation offset, to mimic a realistic scenario (we should only know the information contained in the training data). As we are still in the fitting process, the offset should NOT be set to 0.

#############################
###CBNN
#############################
traincb <- sampleCaseBase(data = as.data.frame(train), time = "time", event = "status", ratio = 100)
valcb <- sampleCaseBase(data = as.data.frame(val), time = "time", event = "status", ratio = 100)

# Reshape the validation case-base sample into the structure keras expects:
# list(list(feature matrix, offset), outcome)
valcb <- list(list(as.matrix(valcb[, -c(ncol(valcb) - 1, ncol(valcb))]),
                   as.matrix(valcb[, ncol(valcb), drop = F])),
              as.matrix(valcb[, ncol(valcb) - 1, drop = F]))

# Replace the validation offset with the training offset
valcb[[1]][[2]] <- rep(traincb$offset[1], nrow(valcb[[1]][[1]]))

Prepare CBNN model

Here we prepare all the inputs to our model, namely the input layer and the feed-forward segment to which we wish to add our offset. NOTE: the offset term and the final sigmoid activation function are added inside the prepCbnn function; if you wish to handle them yourself, you must modify that function. The hyperparameters are defined in the setup chunk. They were selected for the original manuscript; however, with package updates and similar changes, we expect some differences in the final results.

covars_input<-keras::layer_input(shape=ncol(train)-1,
                                 name = 'main_input')

covars_output<-covars_input%>%
  layer_dense(units=layer1,use_bias = T,activation = actv)%>%
  layer_dropout(drpt)%>%
  layer_dense(units=layer2,use_bias = T,activation = actv)%>%
  layer_dropout(drpt)%>%
  layer_dense(units=1,use_bias = T)
cbnnPrepared<-prepCbnn(features=colnames(train)[-ncol(train)],
               nnInput = covars_input,
               nnOutput = covars_output,
               data = traincb,
               offset=traincb$offset,
               timeVar = "time",
               eventVar= "status",
               compRisk=FALSE,
               kOptim=optimizer_adam(learning_rate = lr,  weight_decay=10^-7)
)

Fit CBNN model

callBacks<-keras::callback_early_stopping(
  monitor = "val_loss",
  min_delta = min_delta,
  patience = patience,
  verbose = 0,
  mode = c("auto"),
  baseline =NULL,
  restore_best_weights = T
)


fitCBNN<-fitHazard(cbnnPrepared,
               epochs=epo,
               batchSize=bsize,
               earlyStoppingCallbacks = callBacks,
               valData=valcb)

Generate Cumulative Incidence Curves for CBNN

cbnnPredictionsCI<-cuIncCbnn(fitCBNN,
              times=times,
              x_test=test
)


rownames(cbnnPredictionsCI)<-cbnnPredictionsCI[,1]
cbnnPredictionsCI<- t(cbnnPredictionsCI[,-1])
class(cbnnPredictionsCI)<-c("tunnel",class(cbnnPredictionsCI))

Generate metrics for comparison

############################
### Brier Score
############################  
brierFinalResults <- Score(list("Cox_Lin" = cox,'CB_Logi'=glmProper,
                                'CBNN_Poly'=cbnnPredictionsCI,'Optimal'=trueGlmProper),
                           data =fullTest, 
                           formula = Hist(time, status != 0) ~ 1, summary = c("risks","IPA","ibs"), 
                           se.fit = FALSE, metrics = c("auc", "brier"), contrasts = FALSE, times = times)
# brierFinalResults$Brier$score contains the Brier-score-derived metrics, such as IPA.
# brierFinalResults$AUC$score contains the AUC metric.
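If you prefer the raw numbers over the plots below, both score tables can be subset directly. A small sketch (column names may vary slightly across riskRegression versions; the model names are those given in the list passed to Score above):

# Inspect the Brier-score-derived metrics at the last evaluation time point
scoreTab <- as.data.frame(brierFinalResults$Brier$score)
lastTime <- max(scoreTab$times)
scoreTab[scoreTab$times == lastTime, c("model", "times", "Brier", "IPA")]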

Plot Results

ggplot(data=brierFinalResults$Brier$score, aes(x=times*maxTime,y=IPA,col=model))+geom_line()
ggplot(data=brierFinalResults$AUC$score, aes(x=times*maxTime,y=AUC,col=model))+geom_line()

