
### R code from vignette source 'SurvivalEnsembles.Rnw'

###################################################
### code chunk number 1: setup
###################################################
source("setup.R")
if (!require("TH.data"))
    stop("cannot attach package ", sQuote("TH.data"))
if (!require("rpart"))
    stop("cannot attach package ", sQuote("rpart"))
if (!require("survival"))
    stop("cannot attach package ", sQuote("survival"))
if (!require("partykit"))
    stop("cannot attach package ", sQuote("partykit"))

set.seed(290875)
CEX <- 0.85
options(digits = 3)

### mean difference plots
mdplot <- function(obs, pred, main = "", ...) {

    m <- (obs + pred)/2
    d <- obs - pred
    plot(m, d, xlab = "(Observed + Predicted) / 2",
         ylab = "Observed - Predicted", main =
         main, cex.axis = CEX, cex.main = CEX, cex.lab = CEX, ...)
    abline(h = 0, lty = 3)
}
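
### usage sketch (not part of the vignette): for an unbiased predictor the
### points scatter around the horizontal zero line
## set.seed(1)
## y <- rnorm(100); yhat <- y + rnorm(100, sd = 0.25)
## mdplot(y, yhat, main = "Toy example")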


###################################################
### code chunk number 2: loaddata
###################################################
### load data. See `mboost/inst/readAML_Bullinger.R' for
### how the data were generated from the raw data.
load(file.path(path.package(package = "TH.data"), "rda", "AML_Bullinger.rda"))


###################################################
### code chunk number 3: AML-dpp
###################################################
### compute IPC weights
AMLw <- IPCweights(Surv(clinical$time, clinical$event))
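
### what IPCweights() computes, in outline (a sketch following the standard
### inverse probability of censoring weighting definition, not mboost's
### internal code): an uncensored observation is weighted by 1 / G(t-),
### where G is the Kaplan-Meier estimate of the *censoring* distribution;
### censored observations receive weight zero
ipcw_sketch <- function(time, event) {
    cens_km <- survfit(Surv(time, 1 - event) ~ 1)
    ## left-continuous step function evaluating G(t-)
    Gminus <- stepfun(cens_km$time, c(1, cens_km$surv), right = TRUE)
    ifelse(event == 1, 1 / pmax(Gminus(time), .Machine$double.eps), 0)
}
## up to tie handling this should essentially reproduce AMLw:
## all.equal(ipcw_sketch(clinical$time, clinical$event), AMLw)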

### derive a three-level risk score from the cytogenetic group
risk <- rep(NA_character_, nrow(clinical))
rlev <- levels(clinical[, "Cytogenetic.group"])
risk[clinical[, "Cytogenetic.group"] %in% rlev[c(7, 8, 4)]] <- "low"
risk[clinical[, "Cytogenetic.group"] %in% rlev[c(5, 9)]] <- "intermediate"
risk[clinical[, "Cytogenetic.group"] %in% rlev[-c(4, 5, 7, 8, 9)]] <- "high"
risk <- as.factor(risk)

### set-up learning sample
AMLlearn <- cbind(clinical[, c("time", "Sex", "Age", "LDH", "WBC",
                        "FLT3.aberration.", "MLL.PTD", "Tx.Group.")],
               risk = risk,
               iexpressions[, colnames(iexpressions) %in% selgenes[["Clone.ID"]]])
### keep complete cases with positive IPC weight only
cc <- complete.cases(AMLlearn)
AMLlearn <- AMLlearn[AMLw > 0 & cc, ]
AMLw <- AMLw[AMLw > 0 & cc]
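
### quick consistency checks (not part of the original script)
stopifnot(nrow(AMLlearn) == length(AMLw), all(AMLw > 0))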


###################################################
### code chunk number 4: AML-RF
###################################################
### controls for tree growing
ctrl <- ctree_control(testtype = "Teststatistic",
                      teststat = "max", mincriterion = 0.1, minsplit = 5)
### was: cforest_control(mincriterion = 0.1, mtry = 5, minsplit = 5, ntree = 250)

### fit random forest for censored data (warnings are OK here)
AMLrf <- cforest(log(time) ~ ., data = AMLlearn, control = ctrl,
                 weights = AMLw, mtry = 5, ntree = 250, 
                 perturb = list(replace = TRUE, fraction = 0.632))
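
### permutation variable importance would identify influential predictors
### (not computed in the vignette; can take a while for 250 trees):
## vi <- varimp(AMLrf)
## head(sort(vi, decreasing = TRUE))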


###################################################
### code chunk number 5: AML-boost
###################################################
### componentwise linear L2 boosting of the log survival time,
### started with a generous number of iterations
AMLl2b <- glmboost(I(log(time)) ~ ., data = AMLlearn, weights = AMLw,
                   control = boost_control(mstop = 5000))
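
### an alternative to the AIC-based stopping rule applied below: bootstrap
### cross-validation of the empirical risk via mboost's cvrisk()
### (not run here, fairly expensive for mstop = 5000)
## cvm <- cvrisk(AMLl2b)
## plot(cvm)
## mstop(cvm)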


###################################################
### code chunk number 6: AML-AIC
###################################################
### AIC criterion
plot(aic <- AIC(AMLl2b))


###################################################
### code chunk number 7: AML-fitted
###################################################
### restrict number of boosting iterations and inspect selected variables
AMLl2b <- AMLl2b[mstop(aic)]
cAML <- coef(AMLl2b)
cAML[abs(cAML) > 0]

### fitted values
AMLprf <- predict(AMLrf, newdata = AMLlearn)
AMLpb <- predict(AMLl2b, newdata = AMLlearn)


###################################################
### code chunk number 8: Figure1
###################################################
Mmod <- sum(AMLw * log(AMLlearn$time)) / sum(AMLw)
par(mai = par("mai") * c(0.7, 0.8, 0.7, 0.6))
layout(matrix(1:4, ncol = 2))

mdplot(log(AMLlearn$time), AMLprf, main = "Random Forest",
       cex = AMLw / 4, ylim = c(-4, 4), xlim = c(0, 7))

plot(log(AMLlearn$time), AMLprf, cex = AMLw / 4,
       ylim = range(log(AMLlearn$time)), ylab = "Predicted", xlab = "Observed",
       main = "Random Forest",  cex.axis = CEX, cex.main = CEX, cex.lab = CEX)
abline(h = Mmod, lty = 2)

mdplot(log(AMLlearn$time), AMLpb,
        cex = AMLw / 4, main = "Boosting", ylim = c(-4, 4), xlim = c(0, 7))

plot(log(AMLlearn$time), AMLpb, cex = AMLw / 4,
       ylim = range(log(AMLlearn$time)), ylab = "Predicted", xlab = "Observed",
       main = "Boosting",  cex.axis = CEX, cex.main = CEX, cex.lab = CEX)
abline(h = Mmod, lty = 2)


###################################################
### code chunk number 9: GBSG2-dpp
###################################################
### attach data
data("GBSG2", package = "TH.data")

### IPC weights
GBSG2w <- IPCweights(Surv(GBSG2$time, GBSG2$cens))

### set-up learning sample
GBSG2learn <- cbind(GBSG2[,-which(names(GBSG2) %in% c("time", "cens"))],
               ltime = log(GBSG2$time))
n <- nrow(GBSG2learn)


###################################################
### code chunk number 10: GBSG2-models
###################################################
### linear model
LMmod <- lm(ltime ~ . , data = GBSG2learn, weights = GBSG2w)
LMerisk <- sum((GBSG2learn$ltime - predict(LMmod))^2*GBSG2w) / n
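
### the same weighted empirical risk is computed repeatedly below; a small
### helper (not part of the original script) makes the definition explicit
erisk <- function(obs, pred, w, n = length(obs))
    sum(w * (obs - pred)^2) / n
stopifnot(all.equal(erisk(GBSG2learn$ltime, predict(LMmod), GBSG2w), LMerisk))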

### regression tree
pos <- GBSG2w > 0
TRmod <- rpart(ltime ~ . , data = GBSG2learn, weights = GBSG2w, 
               subset = pos)
TRerisk <- sum((GBSG2learn$ltime[pos] - predict(TRmod))^2*GBSG2w[pos]) / n

### tree controls
ctrl <- ctree_control(testtype = "Teststatistic",
                      teststat = "max", mincriterion = qnorm(0.95),
                      minsplit = 5)
### was: cforest_control(mincriterion = qnorm(0.95), mtry = 5,
###                      minsplit = 5, ntree = 100)


### fit random forest for censored data (warnings are OK here)
RFmod <- cforest(ltime ~ . , data = GBSG2learn, weights = GBSG2w,
                 control = ctrl, mtry = 5, ntree = 100, 
                 perturb = list(replace = TRUE, 
                     fraction = 0.632 * sum(GBSG2w > 0)))

### fit L2 boosting for censored data
L2Bmod <- glmboost(ltime ~ ., data = GBSG2learn, weights = GBSG2w,
                   control = boost_control(mstop = 250))

### with Huber loss function
L2BHubermod <- glmboost(ltime ~ ., data = GBSG2learn, weights = GBSG2w,
                        family = Huber(d = log(2)))
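
### for reference, the Huber loss with threshold d (standard definition,
### on which mboost's Huber() family is based):
###   rho(r) = r^2 / 2           for |r| <= d
###   rho(r) = d * (|r| - d/2)   otherwise
huber_loss <- function(r, d = log(2))
    ifelse(abs(r) <= d, r^2 / 2, d * (abs(r) - d / 2))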


###################################################
### code chunk number 11: GBSG2-AIC
###################################################
plot(aic <- AIC(L2Bmod))


###################################################
### code chunk number 12: GBSG2-fitted
###################################################
GBSG2Hp <- predict(L2BHubermod, newdata = GBSG2learn)
L2Berisk <- sum((GBSG2learn$ltime -
                 predict(L2Bmod, newdata = GBSG2learn))^2 * GBSG2w) / n
RFerisk <- sum((GBSG2learn$ltime -
                predict(RFmod, newdata = GBSG2learn))^2 * GBSG2w) / n
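
### side-by-side comparison of the in-sample weighted risks (not part of
### the original script)
round(c(LM = LMerisk, Tree = TRerisk, RF = RFerisk, L2Boost = L2Berisk), 3)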


###################################################
### code chunk number 13: Figure3
###################################################
lim <- c(4,9)
mylwd <- 0.5
par(mai = par("mai") * c(0.7, 0.8, 0.7, 0.6))

layout(matrix(1:4, ncol = 2))
Mmod <- sum(GBSG2w * GBSG2learn$ltime)/sum(GBSG2w)

mdplot(GBSG2learn$ltime, predict(LMmod), cex = GBSG2w / 4, main = "Linear Model",
       ylim = c(-3, 3), xlim = c(5, 8))
mdplot(GBSG2learn$ltime[pos], predict(TRmod), cex = GBSG2w / 4, main = "Tree",
       ylim = c(-3, 3), xlim = c(5, 8))
mdplot(GBSG2learn$ltime, predict(RFmod, newdata = GBSG2learn), cex = GBSG2w / 4,
       main = "Random Forest", ylim = c(-3, 3), xlim = c(5, 8))
mdplot(GBSG2learn$ltime, predict(L2Bmod, newdata = GBSG2learn), cex = GBSG2w / 4,
       main = "Boosting", ylim = c(-3, 3), xlim = c(5, 8))


###################################################
### code chunk number 14: Figure5
###################################################
RFpr <- predict(RFmod, newdata = GBSG2learn)
L2Bpr <- predict(L2Bmod, newdata = GBSG2learn)
ylim <- range(c(RFpr[GBSG2w > 0], L2Bpr[GBSG2w > 0]))
mydf <- 4
par(mai = par("mai") * c(0.7, 0.8, 0.4, 0.6))
layout(matrix(1:4, ncol = 2))
plot(GBSG2learn$pnodes, RFpr, cex = GBSG2w/4, xlim = c(0, 40), lwd = mylwd,
     xlab = "Nr. positive lymph nodes", ylim = ylim,
     ylab = expression(hat(Y)), cex.axis = CEX, cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$pnodes, RFpr, GBSG2w/4, df = mydf))
plot(GBSG2learn$age, RFpr, cex = GBSG2w/4, xlab = "Age",
     ylab = expression(hat(Y)), ylim = ylim, lwd = mylwd, cex.axis = CEX,
     cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$age, RFpr, GBSG2w/4, df = mydf))
plot(GBSG2learn$estrec, RFpr, cex = GBSG2w/4, xlab = "Estrogen receptor",
     ylab = expression(hat(Y)), ylim = ylim, lwd = mylwd, cex.axis = CEX,
     cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$estrec, RFpr, GBSG2w/4, df = mydf))
indx <- which(GBSG2learn$progrec < 100)
plot(GBSG2learn$progrec[indx], RFpr[indx], cex = GBSG2w[indx]/4,
     xlab = "Progesterone receptor (< 100 fmol / l)",
     ylab = expression(hat(Y)), ylim = ylim, lwd = mylwd, cex.axis = CEX,
     cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$progrec[indx], RFpr[indx], GBSG2w[indx]/4, df = mydf))


###################################################
### code chunk number 15: Figure6
###################################################
par(mai = par("mai") * c(0.7, 0.8, 0.4, 0.6))
layout(matrix(1:4, ncol = 2))
plot(GBSG2learn$pnodes, L2Bpr, cex = GBSG2w/4, xlim = c(0,40),
     ylab = expression(hat(Y)),
     xlab = "Nr. positive lymph nodes", ylim = ylim, lwd = mylwd, cex.axis =
     CEX, cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$pnodes, L2Bpr, GBSG2w/4, df = mydf))
plot(GBSG2learn$age, L2Bpr, cex = GBSG2w/4, xlab = "Age",
     ylab = expression(hat(Y)),
     ylim = ylim, lwd = mylwd, cex.axis = CEX, cex.main = CEX, cex.lab =
     CEX)
lines(smooth.spline(GBSG2learn$age, L2Bpr, GBSG2w/4, df = mydf))
plot(GBSG2learn$estrec, L2Bpr, cex = GBSG2w/4, xlab = "Estrogen receptor",
     ylab = expression(hat(Y)), ylim = ylim, lwd = mylwd, cex.axis = CEX,
     cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$estrec, L2Bpr, GBSG2w/4, df = mydf))
indx <- which(GBSG2learn$progrec < 100)
plot(GBSG2learn$progrec[indx], L2Bpr[indx], cex = GBSG2w[indx]/4,
     xlab = "Progesterone receptor (< 100 fmol / l)",
     ylab = expression(hat(Y)), ylim = ylim, lwd = mylwd, cex.axis = CEX,
     cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$progrec[indx], L2Bpr[indx], GBSG2w[indx]/4, df = mydf))


###################################################
### code chunk number 16: Figure7
###################################################
Mmod <- sum(GBSG2w * GBSG2learn$ltime)/sum(GBSG2w)
par(mai = par("mai") * c(0.7, 0.8, 0.7, 0.6))
layout(matrix(1:4, ncol = 2))

yl <- range(c(GBSG2Hp[GBSG2w > 0], L2Bpr[GBSG2w > 0]))
mdplot(GBSG2learn$ltime, GBSG2Hp, main = "Huber Loss",
       cex = GBSG2w / 4, ylim = c(-3, 3), xlim = c(5, 8))

plot(GBSG2learn$ltime, GBSG2Hp, cex = GBSG2w / 4,
       xlim = range(GBSG2learn$ltime[GBSG2w > 0]), ylim = yl, ylab = "Predicted", xlab = "Observed",
       main = "Huber Loss",  cex.axis = CEX, cex.main = CEX, cex.lab = CEX)

mdplot(GBSG2learn$ltime, L2Bpr,
        cex = GBSG2w / 4, main = "Quadratic Loss", ylim = c(-3, 3), xlim = c(5, 8))

plot(GBSG2learn$ltime, L2Bpr, cex = GBSG2w / 4,
       xlim = range(GBSG2learn$ltime[GBSG2w > 0]), ylim = yl, ylab = "Predicted", xlab = "Observed",
       main = "Quadratic Loss",  cex.axis = CEX, cex.main = CEX, cex.lab = CEX)
