In WinVector/QSurvival: Quick survival models by data transformation

knitr::opts_chunk$set(fig.width=7)

suppressWarnings(suppressMessages(library("ggplot2")))
suppressWarnings(suppressMessages(library('dplyr')))
suppressWarnings(suppressMessages(library('mgcv')))
set.seed(32535)

params <- list(groupA=list(k=1.8,lambda=50),
               groupB=list(k=0.8,lambda=10))
#plot(dweibull(1:100,shape=params[['groupA']][['k']],
#              scale=params[['groupA']][['lambda']]))
#plot(dweibull(1:100,shape=params[['groupB']][['k']],
#              scale=params[['groupB']][['lambda']]))

censorPoint = 50

mkData <- function(n) {
  d <- data.frame(group = sample(c('groupA', 'groupB'), n, replace = TRUE),
                  stringsAsFactors = FALSE)
  d$unobservedLifetime <- vapply(seq_len(nrow(d)),
                                 function(i) {
                                   max(1, round(rweibull(
                                     1, shape = params[[d[i, 'group']]][['k']],
                                     scale = params[[d[i, 'group']]][['lambda']]
                                   )))
                                 },
                                 numeric(1))
  d$observedLifetime <- pmin(d$unobservedLifetime, censorPoint)
  d$censored <- d$unobservedLifetime > d$observedLifetime
  d
}


dTrain <- mkData(200)
dTest <- mkData(101)
dApplication <- mkData(51)

ggplot(dTrain,aes(x=unobservedLifetime,color=group)) +
  geom_density() + xlim(c(0,150)) + geom_vline(xintercept = censorPoint)
dTrain %>% dplyr::group_by(group) %>%
  dplyr::summarise(meanLifetime=mean(unobservedLifetime))

ggplot(dTrain,aes(x=observedLifetime,color=group)) +
  geom_density() 
dTrain %>% dplyr::group_by(group) %>%
  dplyr::summarise(meanLifetime=mean(observedLifetime))

Logistic regression direct hazard method for survival models similar to: http://data.princeton.edu/wws509/notes/c7.pdf

parallelCluster <- c()
## can't run paralle code in vignettes
#if(requireNamespace("parallel",quietly=TRUE)) {
#     parallelCluster <- parallel::makeCluster(4)
#}

ageLimit <- 45


# Can sub-sample the quasi events, but results are sensitive to this
# so you want targetSize to be large (but may want to set it for safety).
prepTrain <- 
  QSurvival::buildQuasiObsForTraining(dTrain,
                                      dTrain$observedLifetime,
                                      ifelse(dTrain$censored,NA,dTrain$observedLifetime),
                                      'ID','observedAge','deathEvent',
                                      parallelCluster=parallelCluster,
                                      targetSize=10000000)
# allow windowing further out than training without causing new levels.
# basically want time as shown to model (if you are showing time to mode)
# to end will before censoring region to make sure you are not extrapolating
# off a noisy end estimate.
prepTrain$surrogateAge <- pmin(prepTrain$observedAge,ageLimit)

obsWindow <- 200

prepStudy <- 
  QSurvival::buildQuasiObsForComparison(dTrain,
                                        obsWindow,
                                      dTrain$observedLifetime,
                                      ifelse(dTrain$censored,NA,dTrain$observedLifetime),
                                      'ID','observedAge','deathEvent',
                                      parallelCluster=parallelCluster)
prepStudy$surrogateAge <- pmin(prepStudy$observedAge,ageLimit)


prepTest <- 
  QSurvival::buildQuasiObsForComparison(dTest,
                                     obsWindow,
                                     dTest$observedLifetime,
                                     ifelse(dTest$censored,NA,dTest$observedLifetime),
                                     'ID','observedAge','deathEvent',
                                     parallelCluster=parallelCluster)
prepTest$surrogateAge <- pmin(prepTest$observedAge,45)

prepApp <- 
  QSurvival::buildQuasiObsForApplication(dApplication,
                                         obsWindow,
                                         'ID','observedAge',
                                         parallelCluster=parallelCluster)
prepApp$surrogateAge <- pmin(prepApp$observedAge,45)

if(!is.null(parallelCluster)) {
  parallel::stopCluster(parallelCluster)
  parallelCluster <- NULL
}

Reduced complexity of age portion of model, using spline over surrogate age.

# Could use s(observedAge), but using s(surrogateAge) means we don't use
# the GAM spline to extrapolate.
# Use degree 2 spline to approximate typical bathtub hazard shape.
# Use "0+" notation to kick out DC term which might be mixture of groups sensitive.
model <- gam(deathEvent~0+group+s(surrogateAge,k=2),
             data=prepTrain,family=binomial)
print(summary(model))
prepStudy$hazardPrediction <- as.numeric(predict(model,newdata=prepStudy,type='response'))
prepTest$hazardPrediction <- as.numeric(predict(model,newdata=prepTest,type='response'))
prepApp$hazardPrediction <- as.numeric(predict(model,newdata=prepApp,type='response'))

studyD <- QSurvival::summarizeHazard(prepStudy,'ID','observedAge','hazardPrediction',
                           survivalColumnName='survival',
                           deathIntensityColumnName='deathIntensity',
                           parallelCluster=parallelCluster) 
studyRes <- studyD$details
head(studyD$expectedLifetime)


studyRes %>% dplyr::group_by(group,observedAge) %>%
  dplyr::summarise(deathIntensity=mean(deathIntensity)) -> plotFrame

# recovered empirical distribution of lifetimes
ggplot(data=plotFrame,aes(x=observedAge,y=deathIntensity,color=group)) + 
  geom_line() + ggtitle("GAM splined time surrogate age study") + xlim(c(0,150))

studyRes %>% dplyr::group_by(group,observedAge) %>%
  dplyr::summarise(hazardPrediction=mean(hazardPrediction)) -> plotFrame

ggplot(data=plotFrame,aes(x=observedAge,y=hazardPrediction,color=group)) + 
  geom_line() + ggtitle("GAM splined hazard study") + xlim(c(0,150))

studyRes %>% dplyr::group_by(group,observedAge) %>%
  dplyr::summarise(survival=mean(survival)) -> plotFrameS

ggplot(data=plotFrameS,aes(x=observedAge,y=survival,color=group)) + 
  geom_line() + ggtitle("GAM splined survival plot") + xlim(c(0,150))

actualFrame <-
  QSurvival::summarizeActualFrame(dTrain,'group','unobservedLifetime',
                           parallelCluster=parallelCluster)
ggplot(data=actualFrame,aes(x=unobservedLifetime,y=survival,color=group)) +
  geom_line() +  ggtitle("empirical unobserved survival (study)") +
  xlim(c(0,150))

ggplot() +
  geom_line(data=plotFrameS,aes(x=observedAge,
                                y=survival,color=group),linetype=2) +
  geom_line(data=actualFrame,aes(x=unobservedLifetime,
                                 y=survival,color=group)) +
  xlab('age') + 
  ggtitle("study survival curves (solid actual, dashed fit)")

Out of sample work.

testD <- QSurvival::summarizeHazard(prepTest,'ID','observedAge','hazardPrediction',
                           survivalColumnNames='survival',
                           deathIntensityColumnNames='deathIntensity') 
testRes <- testD$details
head(testD$expectedLifetime)


dTestAug <- dTest
dTestAug$ID <- seq_len(nrow(dTest))
colnames(testD$expectedLifetime) <- c('ID','expectedLifetime')
dTestAug %>% left_join(testD$expectedLifetime,by='ID') -> dTestAug

dTestAug %>% group_by(group) %>% summarize(
          expectedLifetime=mean(expectedLifetime),
          unobservedLifetime=mean(unobservedLifetime),
          observedLifetime=mean(observedLifetime))

ggplot(data=dTestAug,
       aes(x=expectedLifetime,y=unobservedLifetime,color=group)) + 
  geom_abline() + geom_point()

Standard survival model.

library('survival')

# In Surv() TRUE means event happened (dead, not censored).
dTrain$surv <- with(dTrain,Surv(observedLifetime,!censored))

# Kaplan-Meier
fit <- survfit(surv~group,
               data=dTrain,
               conf.type = "log-log")
print(fit)
plot(fit,conf=TRUE,mark.time=TRUE)
survdiff(surv~group,dTrain)

# Cox proportional hazard
fit <- coxph(surv~group,
               data=dTrain)
survfit(fit,newdata=dTest[2,'group',drop=FALSE])$surv