inst/doc/tbm_supplement.R

## ----setup, echo = FALSE, results = "hide", message = FALSE--------------
set.seed(290875)

dir <- system.file("applications", package = "tbm")
datadir <- system.file("rda", package = "TH.data")


sapply(c("tram", "survival", "tbm", "mboost", "lattice", 
         "latticeExtra", "partykit"), library, char = TRUE)

trellis.par.set(list(plot.symbol = list(col=1,pch=20, cex=0.7),
                     box.rectangle = list(col=1),
                     box.umbrella = list(lty=1, col=1),
                     strip.background = list(col = "white")))
ltheme <- canonical.theme(color = FALSE)     ## in-built B&W theme
ltheme$strip.background$col <- "transparent" ## change strip bg
lattice.options(default.theme = ltheme)

knitr::opts_chunk$set(echo = TRUE, results = 'markup', error = FALSE,
                      warning = FALSE, message = FALSE,
                      tidy = FALSE, cache = FALSE, size = "small",
                      fig.width = 6, fig.height = 4, fig.align = "center",
                      out.width = NULL, ###'.6\\linewidth', 
                      out.height = NULL,
                      fig.scap = NA)
knitr::render_sweave()  # use Sweave environments
knitr::set_header(highlight = '')  # do not \usepackage{Sweave}
## R settings
options(prompt = "R> ", continue = "+  ", useFancyQuotes = FALSE)  # JSS style
options(width = 75)
library("colorspace")
col <- diverge_hcl(2, h = c(246, 40), c = 96, l = c(65, 90))
fill <- diverge_hcl(2, h = c(246, 40), c = 96, l = c(65, 90), alpha = .3)

## ----citation, echo = FALSE----------------------------------------------
year <- substr(packageDescription("tbm")$Date, 1, 4)
version <- packageDescription("tbm")$Version

## ----src-appl, results = "hide"------------------------------------------
system.file("applications", package = "tbm")

## ----src-sim, results = "hide"-------------------------------------------
system.file("simulations", package = "tbm")

## ----results, echo = FALSE-----------------------------------------------
ds <- c("Beetles Exctinction Risk",
        "Birth Weight Prediction",
        "Body Fat Mass",
        "CAO/ARO/AIO-04 DFS",
        "Childhood Malnutrition",
        "Head Circumference",
        "PRO-ACT ALSFRS",
        "PRO-ACT OS")
names(ds) <- c("ex_beetles.rda", "ex_fetus.rda", "ex_bodyfat.rda", "ex_CAO.rda", 
        "ex_india.rda", "ex_heads.rda", "ex_ALSFRS.rda", "ex_ALSsurv.rda")
models <- c(expression(paste("L ", bold(vartheta)(bold(x)))),
       expression(paste("L ", beta(bold(x)))),
       expression(paste("N ", bold(vartheta)(bold(x)))),
       expression(paste("N ", beta(bold(x)))),
       expression(paste("T ", bold(vartheta)(bold(x)))),
       expression(paste("T ", beta(bold(x)))),
       expression(paste("Tree ", bold(vartheta)(bold(x)))),
       expression(paste("F ", bold(vartheta)(bold(x)))))
names(models) <- c("glm_ctm", "glm_tram", "gam_ctm", "gam_tram", 
              "tree_ctm", "tree_tram", "trtf_tree", "trtf_forest")
mor <- c("gam_ctm", "glm_ctm", "tree_ctm", "gam_tram", "glm_tram", 
        "tree_tram", "trtf_tree", "trtf_forest")

pit <- function(object, data) {
    cf <- predict(object, newdata = data, coef = TRUE)
    tmp <- object$model
    ret <- numeric(NROW(data))
    for (i in 1:NROW(data)) {
        coef(tmp) <- cf[i,]
        ret[i] <- predict(tmp, newdata = data[i,,drop = FALSE], type = "distribution")
    }
    ret
}

pitplot <- function(object, data, main = "") {
    u <- pit(object, data)
    plot(1:length(u) / length(u), sort(u), xlab = "Theoretical Uniform Quantiles", 
         ylab = "PIT Quantiles", main = main, ylim = c(0, 1), xlim = c(0, 1))
    abline(a = 0, b = 1, col = "red")
} 

plot.ctm <- function(x, which = sort(unique(selected(object))), 
                     obs = TRUE, ...) {

    object <- x
    q <- mkgrid(x$model, n = 100)
    X <- model.matrix(x$model$model, data = q)
    pfun <- function(which) {
        stopifnot(length(which) == 1)
        df <- model.frame(object, which = which)[[1]]
        df <- lapply(df, function(x) seq(from = min(x), 
                                         to = max(x), length = 50))
        df2 <- do.call("expand.grid", c(df, q))
        class(object) <- class(object)[-1]
        tmp <- predict(object, newdata = df, which = which)
        CS <- diag(ncol(tmp[[1]]))
        CS[upper.tri(CS)] <- 1
        cf <- tmp[[1]] %*% CS
        df2$pr <- as.vector(t(X %*% t(cf)))
        ret <- cbind(df2, which = names(df)[1])
        names(ret)[1:2] <- c("x", "y")
        ret
    }

    ret <- do.call("rbind", lapply(which, pfun))
    at <- quantile(ret$pr, prob = 0:10/10)

    pf <- function(x, y, z, subscripts, ...) {
        xname <- as.character(unique(ret[subscripts, "which"]))
        xo <- model.frame(object, which = xname)[[1]][[xname]]
        yo <- object$response
        panel.levelplot.raster(x, y, z, subscripts, ...)
        panel.contourplot(x, y, z, subscripts, contour = TRUE, ...)
        if (obs)
            panel.xyplot(x = xo, y = yo, pch = 20)
    }
    levelplot(pr ~ x + y | which, data = ret, panel = pf,
              scales = list(x = list(relation = "free")), region = TRUE, 
              at = at, col.regions = grey.colors(length(at)), ...)
}

### source("movie.R")

## ----riskplot, eval = FALSE, echo = FALSE--------------------------------
#  x <- risk[,mor] - ll0
#  med <- apply(x, 2, median, na.rm = TRUE)
#  boxplot(x, axes = FALSE, main = ds[file], outline = FALSE, var.width = TRUE,
#          ylab = "Out-of-sample log-likelihood (centered)", col = col[(med == max(med)) + 1],
#          ylim = ylim)
#  abline(v = c(3.5, 6.5), col = "lightgrey")
#  abline(h = 0, col = "lightgrey")
#  abline(h = max(med), col = col[2])
#  axis(1, at = 1:ncol(x), labels = models[colnames(x)])
#  axis(2)
#  box()

## ----beetles-results, echo = FALSE, fig.width = 7, fig.height = 6--------
load(file.path(dir, file <- "ex_beetles.rda"))
ylim <- c(-.5, .5)
x <- risk[,mor] - ll0
med <- apply(x, 2, median, na.rm = TRUE)
boxplot(x, axes = FALSE, main = ds[file], outline = FALSE, var.width = TRUE,
        ylab = "Out-of-sample log-likelihood (centered)", col = col[(med == max(med)) + 1],
        ylim = ylim)
abline(v = c(3.5, 6.5), col = "lightgrey")
abline(h = 0, col = "lightgrey")
abline(h = max(med), col = col[2])
axis(1, at = 1:ncol(x), labels = models[colnames(x)])
axis(2)
box()

## ----beetles-setup, echo = FALSE-----------------------------------------
load(file.path(datadir, "beetles.rda"))

ldata <- bmodel
ldata$TS <- NULL

X <- model.matrix(~ species - 1, data = ldata)
ldata$species <- NULL

nvars <- c("mean_body_size", "mean_elev", "flowers",
          "niche_decay", "niche_diam", "niche_canopy", "distribution")
fvars <- colnames(ldata)[!colnames(ldata) %in% c("RL", nvars)]
xvars <- c(nvars, fvars)

ldata[nvars] <- lapply(nvars, function(x) as.numeric(scale(ldata[[x]])))

fm_gam <- c(
"ctm" = as.formula(paste("RL ~ buser(X, K = Ainv, df = 2) + ",
    paste("bols(", xvars, ", df = 2)", collapse = "+"), "+",
    paste("bbs(", nvars, ", center = TRUE, df = 2)", collapse = "+"))),
"stm" = as.formula(paste("RL ~ buser(X, K = Ainv, df = 1) + ",
    paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"), "+",
    paste("bbs(", nvars, ", center = TRUE, df = 1)", collapse = "+"))))

fm_glm <- c(
"ctm" = as.formula(paste("RL ~ buser(X, K = Ainv, df = 2) +",
    paste("bols(", xvars, ", df = 2)", collapse = "+"))),
"stm" = as.formula(paste("RL ~ buser(X, K = Ainv, df = 1) +",
    paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"))))

fm_tree <- as.formula(paste("RL ~ ", paste(xvars, collapse = "+")))

mf <- model.frame(fm_tree, data = ldata)
xf <- sum(sapply(mf[,-1], is.factor))
N <- dim(mf)[1]
p <- dim(mf)[2] - 1
vars <- names(mf)[-1]

## ----beetles-model0, echo = TRUE, cache = TRUE---------------------------
m_mlt <- Polr(RL ~ 1, data = ldata)
logLik(m_mlt)

## ----beetles-null, echo = FALSE------------------------------------------
q <- mkgrid(m_mlt, 200)[[1]]
p <- c(predict(m_mlt, newdata = data.frame(RL = q), type = "density"))
names(p) <- as.character(q)
barplot(p, xlab = "Red List Status", ylab = "Distribution")

## ----beetles-model, cache = TRUE-----------------------------------------
fm_tree
bm <- stmboost(m_mlt, formula = fm_tree, data = ldata,
               method = quote(mboost::blackboost))[626]

## ----insampleriskplot, echo = FALSE--------------------------------------
plot(-risk(bm), xlab = "Boosting Iteration", ylab = "Log-likelihood", type = "l")

## ----beetles-prop, echo = FALSE------------------------------------------
x <- tabulate(do.call("c", sapply(get("ens", environment(bm$predict)), 
    function(x) nodeapply(x$model, ids = nodeids(x$model, terminal = FALSE), FUN = function(x) split_node(x)$varid - 1))), nbins = length(vars))
names(x) <- vars
x

## ----beetles-plot, echo = FALSE, fig.width = 6, fig.height = 8-----------
idx <- sapply(levels(ldata$RL), function(x) 
    which(ldata$RL == x)[1])
q <- mkgrid(m_mlt, n = 200)[[1]]
d <- predict(bm, newdata = ldata[idx,], type = "density", q = q)
layout(matrix(1:6, nrow = 3, byrow = TRUE))
out <- sapply(1:6, function(x) barplot(d[,x], xlab = "Red List",
              ylab = "Density", main = paste(gsub("_", " ", rownames(ldata)[idx[x]]), " (RL:", ldata$RL[idx[x]], ")", sep = "")))

## ----beetles-mlt, cache = TRUE-------------------------------------------
po <- blackboost(fm_tree, data = ldata, family = PropOdds())[626]
max(abs(risk(bm) - risk(po)))

## ----fetus-results, echo = FALSE, fig.width = 7, fig.height = 6----------
load(file.path(dir, file <- "ex_fetus.rda"))
ylim <- c(0, 1.7)
x <- risk[,mor] - ll0
med <- apply(x, 2, median, na.rm = TRUE)
boxplot(x, axes = FALSE, main = ds[file], outline = FALSE, var.width = TRUE,
        ylab = "Out-of-sample log-likelihood (centered)", col = col[(med == max(med)) + 1],
        ylim = ylim)
abline(v = c(3.5, 6.5), col = "lightgrey")
abline(h = 0, col = "lightgrey")
abline(h = max(med), col = col[2])
axis(1, at = 1:ncol(x), labels = models[colnames(x)])
axis(2)
box()

## ----fetus-setup, echo = FALSE-------------------------------------------
load(file.path(datadir, "fetus.rda"))

fetus$birthweight <- as.double(fetus$birthweight)
ldata <- fetus

xvars <- colnames(ldata)
xvars <- xvars[xvars != "birthweight"]

ldata[xvars] <- lapply(xvars, function(x) as.numeric(scale(ldata[[x]])))


fm_gam <- c("ctm" = as.formula(paste("birthweight ~ ",
                paste("bols(", xvars, ", df = 2)", collapse = "+"), "+",
                paste("bbs(", xvars, ", center = TRUE, df = 2)", collapse = "+"))),
            "stm" = as.formula(paste("birthweight ~ ", 
                paste("bols(", xvars, ", intercept = FALSE)", collapse = "+"), "+",
                paste("bbs(", xvars, ", center = TRUE, df = 1)", collapse = "+"))))
fm_glm <- c("ctm" = as.formula(paste("birthweight ~ ",
                paste("bols(", xvars, ", df = 2)", collapse = "+"))),
            "stm" = as.formula(paste("birthweight ~ ", 
                paste("bols(", xvars, ", intercept = FALSE)", collapse = "+"))))

fm_tree <- birthweight ~ .

mf <- model.frame(fm_tree, data = ldata)
xf <- sum(sapply(mf[,-1], is.factor))
N <- dim(mf)[1]
p <- dim(mf)[2] - 1
vars <- names(mf)[-1]

## ----fetus-model0, cache = TRUE------------------------------------------
m_mlt <- BoxCox(birthweight ~ 1, data = ldata, extrapolate = TRUE)
logLik(m_mlt)

## ----fetus-null, echo = FALSE--------------------------------------------
plot(as.mlt(m_mlt), newdata = data.frame(1), type = "density", K = 200,
     xlab = "Birth Weight (in gram)", ylab = "Density", col = "black")

## ----fetus-model, cache = TRUE-------------------------------------------
fm_gam[["stm"]]
bm <- stmboost(m_mlt, formula = fm_gam[["stm"]], data = ldata,
               method = quote(mboost::mboost))[253]

## ----fetus-risk, echo = FALSE--------------------------------------------
plot(-risk(bm), xlab = "Boosting Iteration", ylab = "Log-likelihood", type = "l")

## ----fetus-pit, echo = FALSE, fig.width = 5, fig.height = 5--------------
pitplot(bm, ldata, main = ds[file])

## ----fetus-prop, echo = FALSE--------------------------------------------
tab <- matrix(tabulate(selected(bm), nbins = length(bm$baselearner)), ncol = 1)
rownames(tab) <- names(bm$baselearner)
colnames(tab) <- ""
tab

## ----fetus-plot, echo = FALSE, fig.width = 6, fig.height = 6-------------
w <- sort(unique(selected(bm)))
mbm <- bm
class(mbm) <- class(mbm)[-1]
nc <- ceiling(length(w) / 2) * 2
layout(matrix(1:nc, ncol = 2))
plot(mbm, which = w, ylim = c(-3, 3))

## ----fetus-base, echo = FALSE--------------------------------------------
tmp <- as.mlt(m_mlt)
coef(tmp) <- nuisance(bm)
plot(tmp, newdata = data.frame(1), K = 200, type = "trafo", xlab = "Birth weight (in gram)", 
     ylab = "Transformation h", col = "black")

## ----fetus-dens, echo = FALSE--------------------------------------------
idx <- order(ldata$birthweight)[floor(c(.1, .25, .5, .75, .9) * nrow(ldata))]
q <- mkgrid(m_mlt, n = 200)[[1]]
matplot(x = q, y = d <- predict(bm, newdata = ldata[idx,], type = "density", q = q), type =
        "l", col = rgb(.1, .1, .1, .6), lty = 1, xlab = "Birth Weight (in gram)", ylab = "Density")
j <- sapply(ldata[idx, "birthweight"], function(y) which.min((q - y)^2))
points(ldata[idx, "birthweight"], sapply(1:5, function(i) d[j[i], i]), pch = 19, col = col[1])

## ----bodyfat-results, echo = FALSE, fig.width = 7, fig.height = 6--------
load(file.path(dir, file <- "ex_bodyfat.rda"))
ylim <- c(0, 1.7)
x <- risk[,mor] - ll0
med <- apply(x, 2, median, na.rm = TRUE)
boxplot(x, axes = FALSE, main = ds[file], outline = FALSE, var.width = TRUE,
        ylab = "Out-of-sample log-likelihood (centered)", col = col[(med == max(med)) + 1],
        ylim = ylim)
abline(v = c(3.5, 6.5), col = "lightgrey")
abline(h = 0, col = "lightgrey")
abline(h = max(med), col = col[2])
axis(1, at = 1:ncol(x), labels = models[colnames(x)])
axis(2)
box()

## ----bodyfat-setup, echo = FALSE-----------------------------------------

data("bodyfat", package = "TH.data")
ldata <- bodyfat

xvars <- colnames(ldata)
xvars <- xvars[xvars != "DEXfat"]

ldata[xvars] <- lapply(xvars, function(x) as.numeric(scale(ldata[[x]])))


fm_gam <- c("ctm" = as.formula(paste("DEXfat ~ ", 
                paste("bbs(", xvars, ")", collapse = "+"))),
            "stm" = as.formula(paste("DEXfat ~ ", 
                paste("bols(", xvars, ", intercept = FALSE)", collapse = "+"), "+",
                paste("bbs(", xvars, ", center = TRUE, df = 1)", collapse = "+"))))
fm_glm <- c("ctm" = as.formula(paste("DEXfat ~ ",
                paste("bols(", xvars, ")", collapse = "+"))),
            "stm" = as.formula(paste("DEXfat ~ ", 
                paste("bols(", xvars, ", intercept = FALSE)", collapse = "+"))))

fm_tree <- DEXfat ~ .

mf <- model.frame(fm_tree, data = ldata)
xf <- sum(sapply(mf[,-1], is.factor))
N <- dim(mf)[1]
p <- dim(mf)[2] - 1
vars <- names(mf)[-1]

## ----bodyfat-model0, cache = TRUE----------------------------------------
m_mlt <- BoxCox(DEXfat ~ 1, data = ldata, prob = c(.1, .99))
logLik(m_mlt)

## ----bodyfat-null, echo = FALSE------------------------------------------
plot(as.mlt(m_mlt), newdata = data.frame(1), type = "density", K = 200,
     xlab = "Body Fat Mass (in kilogram)", ylab = "Density", col = "black")

## ----bodyfat-model, cache = TRUE-----------------------------------------
fm_gam[["ctm"]]
bm <- ctmboost(m_mlt, formula = fm_gam[["ctm"]], data = ldata,
               method = quote(mboost::mboost))[1000]

## ----bodyfat-risk, echo = FALSE------------------------------------------
plot(-risk(bm), xlab = "Boosting Iteration", ylab = "Log-likelihood", type = "l")

## ----bodyfat-pit, echo = FALSE, fig.width = 5, fig.height = 5------------
pitplot(bm, ldata, main = ds[file])

## ----bodyfat-ctmplot, echo = FALSE, fig.width = 6, fig.height = 6, dev = "png"----
plot.ctm(bm, ylab = "Body Fat Mass")

## ----bodyfat-prop, echo = FALSE------------------------------------------
tab <- matrix(tabulate(selected(bm), nbins = length(bm$baselearner)), ncol = 1)
rownames(tab) <- names(bm$baselearner)
colnames(tab) <- ""
tab

## ----bodyfat-dens, echo = FALSE------------------------------------------
idx <- order(ldata$DEXfat)[floor(c(.1, .25, .5, .75, .9) * nrow(ldata))]
q <- mkgrid(m_mlt, n = 200)[[1]]
matplot(x = q, y = d <- predict(bm, newdata = ldata[idx,], type = "density", q = q), type =
        "l", col = rgb(.1, .1, .1, .6), lty = 1, xlab = "Body Fat Mass (in kilogram)", ylab = "Density")
j <- sapply(ldata[idx, "DEXfat"], function(y) which.min((q - y)^2))
points(ldata[idx, "DEXfat"], sapply(1:5, function(i) d[j[i], i]), pch = 19, col = col[1])

## ----CAO-results, echo = FALSE, fig.width = 7, fig.height = 6------------
load(file.path(dir, file <- "ex_CAO.rda"))
ylim <- NULL
x <- risk[,mor] - ll0
med <- apply(x, 2, median, na.rm = TRUE)
boxplot(x, axes = FALSE, main = ds[file], outline = FALSE, var.width = TRUE,
        ylab = "Out-of-sample log-likelihood (centered)", col = col[(med == max(med)) + 1],
        ylim = ylim)
abline(v = c(3.5, 6.5), col = "lightgrey")
abline(h = 0, col = "lightgrey")
abline(h = max(med), col = col[2])
axis(1, at = 1:ncol(x), labels = models[colnames(x)])
axis(2)
box()

## ----CAO-setup, echo = FALSE---------------------------------------------
load(file.path(datadir, "Primary_endpoint_data.rda"))

ldata <- CAOsurv

xvars <- c("strat_t", "strat_n", "randarm",
           "geschlecht", "age", "groesse", "gewicht",
           "ecog_b", "mason",
           "gesamt_t", "gesamt_n", 
           "histo", "grading_b", "ap_anlage",
           "stenose",
           "gesamt_n_col", "UICC", "bentf")

ldata <- ldata[, c("DFS", xvars)]
ldata <- ldata[complete.cases(ldata),]
ldata$UICC <- ldata$UICC[,drop = TRUE]
ldata$ecog_b <- ldata$ecog_b[,drop = TRUE]


nvars <- c("age", "groesse", "gewicht")
fvars <- xvars[!xvars %in% nvars]
xvars <- c(nvars, fvars)

ldata[nvars] <- lapply(nvars, function(x) as.numeric(scale(ldata[[x]])))

fm_gam <- c(
"ctm" = as.formula(paste("DFS ~ ",
    paste("bols(", xvars, ", df = 2)", collapse = "+"), "+",
    paste("bbs(", nvars, ", center = TRUE, df = 2)", collapse = "+"))),
"stm" = as.formula(paste("DFS ~ ",
    paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"), "+",
    paste("bbs(", nvars, ", center = TRUE, df = 1)", collapse = "+"))))

fm_glm <- c(
"ctm" = as.formula(paste("DFS ~ ",
    paste("bols(", xvars, ", df = 2)", collapse = "+"))),
"stm" = as.formula(paste("DFS ~",
    paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"))))

fm_tree <- as.formula(paste("DFS ~ ", paste(xvars, collapse = "+")))

mf <- model.frame(fm_tree, data = ldata)
xf <- sum(sapply(mf[,-1], is.factor))
N <- dim(mf)[1]
p <- dim(mf)[2] - 1
vars <- names(mf)[-1]

## ----CAO-model0, cache = TRUE--------------------------------------------
m_mlt <- Coxph(DFS ~ 1, data = ldata, prob = c(0, .9))
logLik(m_mlt)

## ----CAO-null, echo = FALSE----------------------------------------------
plot(as.mlt(m_mlt), newdata = data.frame(1), type = "survivor", K = 200,
     xlab = "Time (in days)", ylab = "Time", col = "black")

## ----CAO-model, cache = TRUE, eval = FALSE-------------------------------
#  fm_glm[["stm"]]
#  bm <- stmboost(m_mlt, formula = fm_glm[["stm"]], data = ldata,
#                 method = quote(mboost::mboost))[963]

## ----CAO-mlt, cache = TRUE, eval = FALSE---------------------------------
#  bmCoxPH <- mboost(fm_glm[["stm"]], data = ldata, family = CoxPH())[963]

## ----india-results, echo = FALSE, fig.width = 6, fig.height = 6----------
load(file.path(dir, file <- "ex_india.rda"))
ylim <- NULL
x <- risk[,mor] - ll0
med <- apply(x, 2, median, na.rm = TRUE)
boxplot(x, axes = FALSE, main = ds[file], outline = FALSE, var.width = TRUE,
        ylab = "Out-of-sample log-likelihood (centered)", col = col[(med == max(med)) + 1],
        ylim = ylim)
abline(v = c(3.5, 6.5), col = "lightgrey")
abline(h = 0, col = "lightgrey")
abline(h = max(med), col = col[2])
axis(1, at = 1:ncol(x), labels = models[colnames(x)])
axis(2)
box()

## ----india-setup, echo = FALSE-------------------------------------------
load(file.path(datadir, "india.rda"))

xvars <- c("cage", "breastfeeding", "mbmi", "mage", "medu", 
           "edupartner", "csex", "ctwin", "cbirthorder", 
           "munemployed", "mreligion", "mresidence", "deadchildren",
           "electricity", "radio", "television", "refrigerator", 
           "bicycle", "motorcycle", "car")

fvars <- xvars[sapply(xvars, function(x) length(unique(kids[[x]]))) < 6]
nvars <- xvars[!xvars %in% fvars]
kids[fvars] <- lapply(fvars, function(f) factor(kids[[f]]))
kids[nvars] <- lapply(nvars, function(n) scale(kids[[n]]))

fm_gam <- c(
"ctm" = as.formula(paste("stunting ~ ",
    paste("bols(", xvars, ", df = 2)", collapse = "+"), "+",
    paste("bbs(", nvars, ", center = TRUE, df = 2)", collapse = "+"))),
"stm" = as.formula(paste("stunting ~",
    paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"), "+",
    paste("bbs(", nvars, ", center = TRUE, df = 1)", collapse = "+"))))

fm_glm <- c(
"ctm" = as.formula(paste("stunting ~ ",
    paste("bols(", xvars, ", df = 2)", collapse = "+"))),
"stm" = as.formula(paste("stunting ~",
    paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"))))

fm_tree <- as.formula(paste("stunting ~ ", paste(xvars, collapse = "+")))

kids$stunting <- as.double(kids$stunting)
ldata <- kids

mf <- model.frame(fm_tree, data = ldata)
xf <- sum(sapply(mf[,-1], is.factor))
N <- dim(mf)[1]
p <- dim(mf)[2] - 1
vars <- names(mf)[-1]


## ----india-model0, cache = TRUE------------------------------------------
m_mlt <- BoxCox(stunting ~ 1, data = ldata, prob = c(.05, .975), extrapolate = TRUE)
logLik(m_mlt)

## ----india-null, echo = FALSE--------------------------------------------
plot(as.mlt(m_mlt), newdata = data.frame(1), type = "density", K = 200,
     xlab = "Stunting", ylab = "Density", col = "black")

## ----india-model, cache = TRUE, eval = FALSE-----------------------------
#  fm_tree
#  bm <- ctmboost(m_mlt, formula = fm_tree, data = ldata,
#                 method = quote(mboost::blackboost))[515]

## ----heads-results, echo = FALSE, fig.width = 7, fig.height = 6----------
load(file.path(dir, file <- "ex_heads.rda"))
ylim <- NULL
x <- risk[,mor] - ll0
med <- apply(x, 2, median, na.rm = TRUE)
boxplot(x, axes = FALSE, main = ds[file], outline = FALSE, var.width = TRUE,
        ylab = "Out-of-sample log-likelihood (centered)", col = col[(med == max(med)) + 1],
        ylim = ylim)
abline(v = c(3.5, 6.5), col = "lightgrey")
abline(h = 0, col = "lightgrey")
abline(h = max(med), col = col[2])
axis(1, at = 1:ncol(x), labels = models[colnames(x)])
axis(2)
box()

## ----heads-setup, echo = FALSE-------------------------------------------
### Paul Eilers: Using 1/2 is better
data("db", package = "gamlss.data")
db$lage <- with(db, age^(1/3))

ldata <- db

fm_gam <- c("ctm" = head ~ bbs(lage),
            "stm" = head ~ bols(lage, intercept = FALSE) + bbs(lage, center = TRUE, df = 1))
fm_glm <- c("ctm" = head ~ bols(lage),
            "stm" = head ~ bols(lage, intercept = FALSE))

fm_tree <- head ~ .
N <- nrow(ldata)

## ----heads-model0, cache = TRUE------------------------------------------
m_mlt <- BoxCox(head ~ 1, data = ldata)
logLik(m_mlt)

## ----heads-model, cache = TRUE-------------------------------------------
fm_gam[["ctm"]]
bm <- ctmboost(m_mlt, formula = fm_gam[["ctm"]], data = ldata, 
               method = quote(mboost::mboost))[1000]

## ----heads-risk, echo = FALSE, cache = TRUE------------------------------
plot(-risk(bm), xlab = "Boosting Iteration", ylab = "Log-likelihood", type = "l")

## ----heads-pit, echo = FALSE, fig.width = 5, fig.height = 5, cache = TRUE, dev = "png"----
pitplot(bm, ldata, main = ds[file])

## ----heads-plot, fig.width = 6, fig.height = 4, echo = FALSE, dev = "png"----
l <- with(db, seq(from = min(lage), to = max(lage), length = 100))
q <- mkgrid(m_mlt, n = 200)[[1]]
pr <- expand.grid(head = q, lage = l^3)
pr$p <- c(predict(bm, newdata = data.frame(lage = l), q = q, type = "distribution"))
pr$cut <- factor(pr$lage > 2.5)
levels(pr$cut) <- c("Age < 2.5 yrs", "Age > 2.5 yrs")

pfun <- function(x, y, z, subscripts, at, ...) {
    panel.contourplot(x, y, z, subscripts, 
        at = c(0.4, 2, 10, 25, 50, 75, 90, 98, 99.6)/ 100, ...)
    panel.xyplot(x = db$age, y = db$head, pch = 20,
                 col = rgb(.1, .1, .1, .1), ...)
}
print(contourplot(p ~ lage + head | cut, data = pr, panel = pfun, region = FALSE,
            xlab = "Age (years)", ylab = "Head circumference (cm)", 
            scales = list(x = list(relation = "free"))))

## ----ALSFRS-results, echo = FALSE, fig.width = 7, fig.height = 6---------
load(file.path(dir, file <- "ex_ALSFRS.rda"))
ylim <- NULL
x <- risk[,mor] - ll0
med <- apply(x, 2, median, na.rm = TRUE)
boxplot(x, axes = FALSE, main = ds[file], outline = FALSE, var.width = TRUE,
        ylab = "Out-of-sample log-likelihood (centered)", col = col[(med == max(med)) + 1],
        ylim = ylim)
abline(v = c(3.5, 6.5), col = "lightgrey")
abline(h = 0, col = "lightgrey")
abline(h = max(med), col = col[2])
axis(1, at = 1:ncol(x), labels = models[colnames(x)])
axis(2)
box()

## ----ALSFRS-setup, echo = FALSE------------------------------------------
load(file.path(datadir, "ALSFRSdata.rda"))
ldata <- ALSFRSdata[complete.cases(ALSFRSdata),]

xvars <- names(ldata)
xvars <- xvars[xvars != "ALSFRS.halfYearAfter"]
fvars <- xvars[sapply(ldata[xvars], is.factor)]
nvars <- xvars[!xvars %in% fvars]

ldata[nvars] <- lapply(nvars, function(x) as.numeric(scale(ldata[[x]])))

fm_gam <- c(
"ctm" = as.formula(paste("ALSFRS.halfYearAfter ~ ",
    paste("bols(", xvars, ", df = 2)", collapse = "+"), "+",
    paste("bbs(", nvars, ", center = TRUE, df = 2)", collapse = "+"))),
"stm" = as.formula(paste("ALSFRS.halfYearAfter ~",
    paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"), "+",
    paste("bbs(", nvars, ", center = TRUE, df = 1)", collapse = "+"))))

fm_glm <- c(
"ctm" = as.formula(paste("ALSFRS.halfYearAfter ~ ",
    paste("bols(", xvars, ", df = 2)", collapse = "+"))),
"stm" = as.formula(paste("ALSFRS.halfYearAfter ~",
    paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"))))

fm_tree <- as.formula(paste("ALSFRS.halfYearAfter ~ ", paste(xvars, collapse = "+")))

mf <- model.frame(fm_tree, data = ldata)
xf <- sum(sapply(mf[,-1], is.factor))
N <- dim(mf)[1]
p <- dim(mf)[2] - 1
vars <- names(mf)[-1]


## ----ALSFRS-model-0, cache = TRUE----------------------------------------
m_mlt <- Colr(ALSFRS.halfYearAfter ~ 1, data = ldata, prob = c(.05, .9), extrapolate = TRUE)
logLik(m_mlt)

## ----ALSFRS-null, echo = FALSE-------------------------------------------
plot(as.mlt(m_mlt), newdata = data.frame(1), type = "density", K = 200,
     xlab = "ALSFRS at 6 months", ylab = "Density", col = "black")

## ----ALSFRS-model, cache = TRUE------------------------------------------
bm <- ctmboost(m_mlt, formula = fm_glm[["ctm"]], data = ldata, 
               method = quote(mboost::mboost))[613]

## ----ALSFRS-risk, echo = FALSE-------------------------------------------
plot(-risk(bm), xlab = "Boosting Iteration", ylab = "Log-likelihood", type = "l")

## ----ALSFRS-prop, echo = FALSE-------------------------------------------
tab <- matrix(tabulate(selected(bm), nbins = length(bm$baselearner)), ncol = 1)
rownames(tab) <- names(bm$baselearner)
colnames(tab) <- ""
tab[tab > 0,]

## ----ALSFRS-plot, results = "hide", echo = FALSE-------------------------
cf <- mboost:::coef.mboost(bm)
q <- mkgrid(m_mlt, n = 100)[[1]]
X <- model.matrix(m_mlt$model, data = data.frame(ALSFRS.halfYearAfter = q))
vars <- c("time_onset_treatment", "ALSFRS.Start") 
layout(matrix(1:length(vars), ncol = 2))
out <- sapply(vars, function(v) {
    cf <- matrix(cf[[grep(v, names(cf))]], ncol = 2, byrow = TRUE)
    cf <- cumsum(cf[,2])
    plot(q, X %*% cf, main = v, xlab = "ALSFRS at 6 months",
         ylab = "Varying coefficient", type = "l")
})

## ----ALSsurv-results, echo = FALSE, fig.width = 7, fig.height = 6--------
load(file.path(dir, file <- "ex_ALSsurv.rda"))
ylim <- NULL
x <- risk[,mor] - ll0
med <- apply(x, 2, median, na.rm = TRUE)
boxplot(x, axes = FALSE, main = ds[file], outline = FALSE, var.width = TRUE,
        ylab = "Out-of-sample log-likelihood (centered)", col = col[(med == max(med)) + 1],
        ylim = ylim)
abline(v = c(3.5, 6.5), col = "lightgrey")
abline(h = 0, col = "lightgrey")
abline(h = max(med), col = col[2])
axis(1, at = 1:ncol(x), labels = models[colnames(x)])
axis(2)
box()

## ----ALSsurv-setup, echo = FALSE-----------------------------------------
load(file.path(datadir, "ALSsurvdata.rda"))

ldata <- ALSsurvdata[complete.cases(ALSsurvdata),]
ldata$y <- with(ldata, Surv(survival.time, cens))
ldata$survival.time <- NULL
ldata$cens <- NULL

nvars <- c("time_onset_treatment", "age", "height")
fvars <- names(ldata)[!names(ldata) %in% c("y", nvars)]
xvars <- c(nvars, fvars)

ldata[nvars] <- lapply(nvars, function(x) as.numeric(scale(ldata[[x]])))


fm_gam <- c(
"ctm" = as.formula(paste("y ~ ",
    paste("bols(", xvars, ", df = 2)", collapse = "+"), "+",
    paste("bbs(", nvars, ", center = TRUE, df = 2)", collapse = "+"))),
"stm" = as.formula(paste("y ~ ",
    paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"), "+",
    paste("bbs(", nvars, ", center = TRUE, df = 1)", collapse = "+"))))

fm_glm <- c(
"ctm" = as.formula(paste("y ~ ",
    paste("bols(", xvars, ", df = 2)", collapse = "+"))),
"stm" = as.formula(paste("y ~",
    paste("bols(", xvars, ", intercept = FALSE, df = 1)", collapse = "+"))))

fm_tree <- as.formula(paste("y ~ ", paste(xvars, collapse = "+")))

mf <- model.frame(fm_tree, data = ldata)
xf <- sum(sapply(mf[,-1], is.factor))
N <- dim(mf)[1]
p <- dim(mf)[2] - 1
vars <- names(mf)[-1]

## ----ALSsurv-model0, cache = TRUE----------------------------------------
m_mlt <- Coxph(y ~ 1, data = ldata)
logLik(m_mlt)

## ----ALSsurv-null, echo = FALSE------------------------------------------
plot(as.mlt(m_mlt), newdata = data.frame(1), type = "survivor", K = 200,
     xlab = "ALS Overall Survival (in days)", ylab = "Probability", col = "black")

## ----ALSsurv-model, cache = TRUE, eval = FALSE---------------------------
#  fm_tree
#  bm <- stmboost(m_mlt, formula = fm_tree, data = ldata,
#                 control = boost_control(nu = 0.01),
#                 method = quote(mboost::blackboost))[451]

## ----ALSsurv-mlt, cache = TRUE, eval = FALSE, eval = FALSE---------------
#  blackboost(fm_tree, data = ldata, family = CoxPH())

## ----dgp-----------------------------------------------------------------
library("tram")
### set-up RNG
set.seed(27031105)
### order of Bernstein polynomials
order <- 6
### support of distributions
sup <- c(-4, 6)
### bounds (essentially for plots)
bds <- c(-8, 8)
### shift effects: main effect of grp, x, and interaction effect
beta <- c(-2, 2, -1)

## ----dgp-data------------------------------------------------------------
### two groups
grp <- gl(2, 1)
### uniform predictor
x <- seq(from = 0, to = 1, length.out = 1000)
d <- expand.grid(grp = grp, x = x)
### transformation of x: sin(2 pi x) (1 + x)
d$g <- with(d, sin(x * 2 * pi) * (1 + x))
### generate some response (this is NOT the 
### response used for the simulations)
X <- model.matrix(~ grp * x, data = d)[,-1]
d$y <- rnorm(nrow(d), mean = X %*% beta)

## ----dgp-Linear-beta-----------------------------------------------------
### h_Y
(cf0 <- seq(from = sup[1], to = sup[2], length = order + 1) + 
        sin(seq(from = 0, to = 2*pi, length = order + 1)) * 
        (seq(from = 0, to = 2*pi, length = order + 1) <= pi) * 2)
m1 <- BoxCox(y ~ grp * x, data = d, order = order, 
             model_only = TRUE, support = sup, bounds = bds)
cf <- coef(m1)
(cf[] <- c(cf0, beta))
coef(m1) <- cf

## ----dgp-Nonlinear-beta--------------------------------------------------
m2 <- BoxCox(y ~ grp * g, data = d, order = order, 
             model_only = TRUE, support = sup, bounds = bds)
cf <- coef(m2)
cf[] <- c(cf0, beta)
coef(m2) <- cf

## ----dgp-Linear-parm-----------------------------------------------------
m3 <- BoxCox(y | grp * x ~ 1, data = d, order = order, 
             model_only = TRUE, support = sup, bounds = bds)
cf <- coef(m3)

### beta_1
(cf1 <- sin(seq(from = 0, to = pi / 2, length.out = order + 1)) * beta[1])
### beta_2
(cf2 <- sqrt(seq(from = 0, to = 2, length.out = order + 1)) / sqrt(2) * beta[2])
### beta_3
(cf21 <- sin(seq(from = 0, to = pi / 2, length.out = order + 1)) * beta[3])

cf[] <- c(cf0, cf1, cf2, cf21)
coef(m3) <- cf

## ----dgp-Nonlinear-parm--------------------------------------------------
m4 <- BoxCox(y | grp * g ~ 1, data = d, order = order, 
             model_only = TRUE, support = sup, bounds = bds)
cf <- coef(m4)

cf[] <- c(cf0, cf1, cf2, cf21)
coef(m4) <- cf

## ----plot-dgp, echo = FALSE,  fig.width = 6, fig.height = 8, dev = "png"----

xlim <- c(-5, 5)
q <- mkgrid(m1, n = 500)[["y"]]
x <- 0:10 / 10
nd <- expand.grid(grp = grp, x = x)
ndq <- expand.grid(y = q, grp = grp, x = x)
ndq$xconfig <- with(ndq, interaction(grp, factor(x)))
nd$g <- with(nd, sin(x * 2 * pi) * (1 + x))

ndq$d1 <- c(predict(m1, newdata = nd, type = "density", q = q))
ndq$d2 <- c(predict(m2, newdata = nd, type = "density", q = q))
ndq$d3 <- c(predict(m3, newdata = nd, type = "density", q = q))
ndq$d4 <- c(predict(m4, newdata = nd, type = "density", q = q))

ndq$t1 <- c(predict(m1, newdata = nd, type = "trafo", q = q))
ndq$t2 <- c(predict(m2, newdata = nd, type = "trafo", q = q))
ndq$t3 <- c(predict(m3, newdata = nd, type = "trafo", q = q))
ndq$t4 <- c(predict(m4, newdata = nd, type = "trafo", q = q))

colr <- rep(grey.colors(length(x), alpha = .9), each = 2)
xlim <- bds

ret <- do.call("rbind", list(ndq)[rep(1, 4)])
ret$d <- with(ndq, c(d1, d2, d3, d4))
ret$t <- with(ndq, c(t1, t2, t3, t4))
ret$model <- gl(4, nrow(ndq))
levels(ret$model) <- mlev <- c("GLM Tram", "GAM Tram", "DR", "CTM")
ret$model <- factor(as.character(ret$model), levels = rev(mlev), labels = rev(mlev))
levels(ret$grp) <- c("Group 1", "Group 2")

lev <- c(expression(paste("Nonlinear ", bold(vartheta)(bold(x)))),
         expression(paste("Linear ", bold(vartheta)(bold(x)))),
         expression(paste("Nonlinear ", beta(bold(x)))),
         expression(paste("Linear ", beta(bold(x)))))

sc <- function(which.given, ..., factor.levels) {
    if (which.given == 1) {
        strip.default(which.given = which.given, ..., factor.levels)
    } else {
        if (which.given == 2)
            strip.default(which.given = 2, ..., factor.levels = lev)
   }
}


obj <- (xyplot(t ~ y | grp + model, data = ret, type = "l", groups = xconfig, col = colr, strip = sc,
     lwd = 2, lty = 1, xlim = xlim, layout = c(4, 2), ylab = "Transformation h", xlab = "Response Y", 
     scales = list(y = list(relation = "free"))))

obj <- useOuterStrips(obj, strip.left = function(which.given, ..., factor.levels) 
    strip.custom(horizontal = FALSE)(which.given = which.given, ..., factor.levels = lev))

plot(obj <- update(obj, legend = list(right = list(fun = "draw.colorkey", 
                   args = list(list(at = 1:22 / 22, col = colr, title = "x"))))))

trellis.focus("legend", side="right", clipp.off=TRUE, highlight=FALSE)
grid.text(expression(x), 0.5, 1.07, hjust=0.5, vjust=1)
trellis.unfocus()

## ----simulate, eval = FALSE----------------------------------------------
#  y1 <- simulate(m1, newdata = d, n = 100)
#  y2 <- simulate(m2, newdata = d, n = 100)
#  y3 <- simulate(m3, newdata = d, n = 100)
#  y4 <- simulate(m4, newdata = d, n = 100)

## ----dgp-setup, echo = FALSE---------------------------------------------
dir <- system.file("simulations", package = "tbm")
ctm <- list.files(path = dir, pattern = "ctm.*rda", full = TRUE)
tram <- list.files(path = dir, pattern = "tram.*rda", full = TRUE)

out <- c()

for (f in c(ctm, tram)) {

    load(f)

    out <- rbind(out, ret)

}

out$model <- factor(out$model, levels = c("ctm_gam", "ctm_glm", "ctm_tree", "tram_gam", "tram_glm", "tram_tree"),
                    labels = c("gam_ctm", "glm_ctm", "tree_ctm", "gam_tram", "glm_tram", "tree_tram"))

out$PNON <- factor(out$PNON)
nlab <- paste("N =", nlev <- rev(sort(unique(out$NOBS))))
out$NOBS <- factor(out$NOBS, levels = nlev, labels = nlab)
out$m <- factor(out$m)
out$todistr <- factor(out$todistr)
out$order <- factor(out$order)

out$boost_ll[grep("rror", out$boost_ll)] <- NA
out$boost_ll <- as.double(out$boost_ll)
out$true_ll <- as.double(out$true_ll)
out$mlt_ll <- as.double(out$mlt_ll)
out$mstop <- as.double(out$mstop)

out$todistr <- factor(as.character(out$todistr), levels = c("normal", "logistic", "minextrval"), 
                      labels = c("Normal", "Logistic", "MEV"))

lev <- rev(c(expression(paste("Nonlinear ", bold(vartheta)(bold(x)))),
         expression(paste("Linear ", bold(vartheta)(bold(x)))),
         expression(paste("Nonlinear ", beta(bold(x)))),
         expression(paste("Linear ", beta(bold(x))))))

sc <- function(which.given, ..., factor.levels) {
    if (which.given == 2) {
        strip.default(which.given = which.given, ..., factor.levels)
    } else {
        if (which.given == 1)
            strip.default(which.given = 1, ..., factor.levels = lev)
   }
}

  mypanel <- function(x, y, groups, subscripts, ...) {
    med <- tapply(y, x, median, na.rm = TRUE)
    panel.bwplot(x = x, y = y, col = col[(med == max(med)) + 1], 
                 fill = col[(med == max(med)) + 1], pch = "|", ...)
    panel.abline(v = c(3.5), col = "lightgrey")
    panel.abline(h = 0, col = "lightgrey")
    panel.abline(h = max(med), col = col[2])
    panel.bwplot(x = x, y = y, col = col[(med == max(med)) + 1], 
                 fill = col[(med == max(med)) + 1], pch = "|", ...)
#    tapply(1:length(y), groups[subscripts], function(i) {
#        llines(x = 1:nlevels(x), y = y[i][order(x[i])], 
#               col = rgb(0.1, 0.1, 0.1, 0.1))
#    })
  }




## ----dgp-1, results = "asis", echo = FALSE-------------------------------
lto <- levels(out$todistr)
lp <- levels(out$PNON)
lo <- levels(out$order)

ret <- c()

i <- 1
for (l1 in lto) {
    for(l2 in lp) {
       for (l3 in lo) {
           os <- subset(out, todistr == l1 & PNON == l2 & order == l3)
           os <- subset(os, boost_ll - true_ll > -10000)

           txt <- paste("J = ", l2, "M = ", l3)
           if (l1 == "Normal") {
               main <- expression(F[Z] == Phi)
           } else if (l1 == "Logistic") {
               main <- expression(F[Z] == expit)
           } else {
               main <- expression(F[Z] == MEV)
           }
           pdf(fig <- paste("sfig", i, ".pdf", sep = ""), width = 12, height = 8)
           pl <- bwplot(I(boost_ll - true_ll) ~ model |  m + NOBS, data = os, main = main, strip = sc, 
                        ylab = "Out-of-sample log-likelihood (centered)",
                        xlab = txt, panel = mypanel,
                        scales = list(y = list(relation = "free"),
                                      x = list(at = 1:6, label = models[levels(os$model)])))
           plot(pl)
           dev.off()
           ret <- c(ret, "\\begin{sidewaysfigure}",
               "\\begin{center}",
               paste("\\includegraphics{", fig, "}", sep = ""),
               "\\end{center}",
               "\\end{sidewaysfigure}", " ")
           i <- i + 1
        }
    }
}
writeLines(ret)

## ----sessionInfo, echo = FALSE, results = "hide"-------------------------
sessionInfo()

Try the tbm package in your browser

Any scripts or data that you put into this service are public.

tbm documentation built on Jan. 14, 2022, 9:07 a.m.