inst/doc/coefficient-wise-varying-coefficient-regression.R

### R code from vignette source 'coefficient-wise-varying-coefficient-regression.Rnw'

###################################################
### code chunk number 1: header
###################################################

## Date:         2016-07-25
## Authors:      Reto Buergin and Gilbert Ritschard
## Institution:  Swiss National Centre of Competence in
##               Research LIVES (http://www.lives-nccr.ch/)
##
## R code for the article "Coefficient-wise
## tree-based varying coefficient regression with
## R-package 'vcrpart'". The code requires vcrpart (>= 0.4-1),
## mlbench (>= 2.1-1) and Ecdat (>= 0.2-9).
##
## Notes:
## Chunks with '(eval = FALSE)' in the header are
## commented out, and chunks guarded by if / else
## conditions on the 'run' object are skipped by
## default, because they are computationally
## burdensome. Instead, pre-computed results stored
## in supplementary files are used; see the commands
##
## load(file = "RData/vcrpart-applications.RData")
## load(file = "RData/vcrpart-bench.RData")
## load(file = "RData/vcrpart-simulations.RData")
##
## Depending on where you have stored the RData files,
## you may need to adjust these paths accordingly.
##
## The comparison study of Section 4.1 and the simulation
## study of Section 4.2 can be re-run by setting
## 'run <- TRUE' in the corresponding chunks.
##
## We cannot guarantee that the functions work with future
## versions of R and the indicated packages.
##
## Copyright R. Buergin and G. Ritschard, 2016
## distributed under license Creative Commons BY-NC-SA
## http://creativecommons.org/licenses/by-nc-sa/3.0/


###################################################
### code chunk number 2: initialize
###################################################

## remove all objects from the global environment
rm(list = ls())

## create the RData and Figures directories if necessary
suppressWarnings(dir.create("RData/"))
suppressWarnings(dir.create("Figures/"))

## set options
options(width = 76, prompt = "R> ", continue = "+  ", useFancyQuotes = FALSE,
        digits = 3, scipen = 10)

## load required packages
library("vcrpart")
library("mlbench") 
library("partykit")
library("xtable")
library("Ecdat")
#library("RWeka")
library("rpart")

## load the pre-computed results for the applications (see the notes above)
load(file = "RData/vcrpart-applications.RData")


###################################################
### code chunk number 3: ucba-data
###################################################

UCBA <- as.data.frame(UCBAdmissions)
UCBA$Admit <- 1 * (UCBA$Admit == "Admitted")
UCBA$Female <- 1 * (UCBA$Gender == "Female")
head(UCBA, 3)


###################################################
### code chunk number 4: ucba-glmS
###################################################

glmS.UCBA <- glm(formula = Admit ~ Female, data = UCBA,
                 family = binomial(), weights = UCBA$Freq)


###################################################
### code chunk number 5: ucba-glmS-coef
###################################################

summary(glmS.UCBA)$coefficients[, -4]
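
## Optional, illustrative check (commented out): the coefficients above are
## on the log-odds scale; exponentiating gives the corresponding odds ratios.
## exp(coef(glmS.UCBA))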


###################################################
### code chunk number 6: ucba-glmL
###################################################

glmL.UCBA <- glm(formula = Admit ~ -1 + Dept + Dept:Female,
                 data = UCBA, family = binomial(),
                 weights = UCBA$Freq)


###################################################
### code chunk number 7: ucba-glmL-coef
###################################################

summary(glmL.UCBA)$coefficients[, -4]


###################################################
### code chunk number 8: ucba-fitting-vcmL
###################################################

library("vcrpart")
vcmL.UCBA <-
  tvcglm(formula = Admit ~ -1 + vc(Dept) + vc(Dept, by = Female),
         data = UCBA, family = binomial(), weights = UCBA$Freq,
         control = tvcglm_control(minsize = 30, mindev = 0.0,
           cv = FALSE))
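
## Optional, illustrative check (commented out): 'extract' (also used in the
## comparison study of chunk 28) returns components of a fitted 'tvcm'
## object, e.g. the underlying regression model of the full, unpruned tree.
## extract(vcmL.UCBA, "model")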


###################################################
### code chunk number 9: ucba-fig-vcmL
###################################################

grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 2)))
pushViewport(viewport(layout.pos.col = 1))
plot(vcmL.UCBA, part = 1, type = "coef", newpage = FALSE)
upViewport()
pushViewport(viewport(layout.pos.col = 2))
plot(vcmL.UCBA, part = 2, type = "coef", newpage = FALSE)
upViewport()
upViewport()


###################################################
### code chunk number 10: ucba-plot (eval = FALSE)
###################################################
## 
## plot(vcmL.UCBA, type = "coef", part = "A")
## plot(vcmL.UCBA, type = "coef", part = "B")


###################################################
### code chunk number 11: ucba-splitpath
###################################################

splitpath(vcmL.UCBA, steps = 1, details = TRUE)


###################################################
### code chunk number 12: ucba-nomsplits-fitting
###################################################

glmCW.UCBA <-
  glm(formula = Admit ~ 1 + Dept:Female, family = binomial(),
      data = UCBA,  weights = UCBA$Freq)


###################################################
### code chunk number 13: ucba-nomsplits-coef
###################################################

sort(coef(glmCW.UCBA)[-1])


###################################################
### code chunk number 14: ucba-prune-fixed-lambda
###################################################

vcm.UCBA <- prune(vcmL.UCBA, cp = 6)
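
## Optional, illustrative check (commented out): the node-wise varying
## coefficients of the pruned model; as in chunks 46 and 54 below, the
## '$vc' component of coef() holds one entry per vc() term.
## coef(vcm.UCBA)$vc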


###################################################
### code chunk number 15: ucba-prunepath
###################################################

prunepath(vcm.UCBA, steps = 1)


###################################################
### code chunk number 16: ucba-cv (eval = FALSE)
###################################################
## 
## cv.UCBA <- cvloss(vcmL.UCBA,
##                   folds = folds_control(weights = "freq", seed = 13))


###################################################
### code chunk number 17: ucba-fig-cv
###################################################

par(mgp = c(2, 1, 0), mar = c(3,3,0.1,0.1))
plot(cv.UCBA,
     xlab = expression(lambda),
     ylab = "Validation error",
     legend = FALSE)


###################################################
### code chunk number 18: ucba-lambda
###################################################

oobLoss <- colMeans(cv.UCBA$oobloss)
subs <- which.min(oobLoss)
cp.hat <- cv.UCBA$cp.hat
cp.int <- cv.UCBA$grid[subs:(subs+1)]
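
## Optional, illustrative check (commented out): the cross-validated tuning
## parameter and the grid interval around the minimal out-of-bag loss
## computed above.
## cp.hat
## cp.int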


###################################################
### code chunk number 19: ucba-prune-estimated-lambda (eval = FALSE)
###################################################
## 
## vcm.UCBA <- prune(vcmL.UCBA, cp = cv.UCBA$cp.hat)


###################################################
### code chunk number 20: ucba-fig-vcm
###################################################

grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 2, width = c(0.5, 0.5))))
pushViewport(viewport(layout.pos.col = 1))
plot(vcm.UCBA, part = 1, newpage = FALSE, type = "coef")
upViewport()
pushViewport(viewport(layout.pos.col = 2))
plot(vcm.UCBA, part = 2, newpage = FALSE, type = "coef")
upViewport()
upViewport()



###################################################
### code chunk number 21: details-mean-centering
###################################################

set.seed(1)
d <- data.frame(x = runif(100, min = -5, max = 5),
                g = rep(LETTERS[1:2], each = 50))
d$y <- model.matrix(~ -1 + g:x, d) %*% c(-1, 1) + rnorm(100)
d$x <- d$x + 10
d$x.centered <- d$x - mean(d$x)
par(mfrow = c(1, 2), mgp = c(2,1,0), mar = c(3,3,2,3), cex.main = 1.25)
lm <- lm(y ~ x, d)
d$offset <- lm$coefficients[1]
lmA <- lm(y ~ -1 + x, d, subset = d$g == "A", offset = d$offset)
lmB <- lm(y ~ -1 + x, d, subset = d$g == "B", offset = d$offset)
plot(y ~ x, d, pch = as.integer(d$g)^2, xlim = c(-5, 15),
     main = "x not mean-centered")
legend("topleft", c("Global", "Group A", "Group B"), lty = 1:3, pch = c(NA, 1, 4))
abline(lm, lty = 1)
abline(a = coef(lm)[1], b = coef(lmA), lty = 2)
abline(a = coef(lm)[1], b = coef(lmB), lty = 3)
lm <- lm(y ~ x.centered, d)
d$offset <- lm$coefficients[1]
lmA <- lm(y ~ -1 + x.centered, d, subset = d$g == "A", offset = d$offset)
lmB <- lm(y ~ -1 + x.centered, d, subset = d$g == "B", offset = d$offset)
plot(y ~ x.centered, d, pch = as.integer(d$g)^2, xlim = c(-5, 15),
     main = "x mean-centered", xlab = "x")
abline(lm, lty = 1)
abline(a = coef(lm)[1], b = coef(lmA), lty = 2)
abline(a = coef(lm)[1], b = coef(lmB), lty = 3)


###################################################
### code chunk number 22: applications-control
###################################################

control <- tvcglm_control(folds = folds_control(seed = 13))
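## Note: 'control' fixes the cross-validation folds via 'seed = 13' and is
## reused for all application models below (chunks 24, 25, 43 and 51).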


###################################################
### code chunk number 23: pima-data
###################################################

library("mlbench")
data("PimaIndiansDiabetes2")
Pima <- na.omit(PimaIndiansDiabetes2[, -c(4, 5)])
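
## Optional, illustrative check (commented out): overview of the analysis
## data after dropping two predictors (columns 4 and 5) and incomplete rows.
## str(Pima)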


###################################################
### code chunk number 24: pima-fitting-unconstrained (eval = FALSE)
###################################################
## 
## vcm.Pima.1 <-
##   tvcglm(diabetes ~ -1 + vc(pregnant, pressure, mass, pedigree, age) +
##          vc(pregnant, pressure, mass, pedigree, age, by = glucose),
##          data = Pima, family = binomial(), control = control)


###################################################
### code chunk number 25: pima-fitting-additive (eval = FALSE)
###################################################
## 
## vcm.Pima.2 <-
##   tvcglm(diabetes ~ 1 + glucose +
##          vc(pregnant) + vc(pregnant, by = glucose) +
##          vc(pressure) + vc(pressure, by = glucose) +
##          vc(mass) + vc(mass, by = glucose) +
##          vc(pedigree) + vc(pedigree, by = glucose) +
##          vc(age) + vc(age, by = glucose),
##          data = Pima, family = binomial(), control = control)


###################################################
### code chunk number 26: pima-fitting-mob (eval = FALSE)
###################################################
## 
## ## MOB-model fitted from 'glmtree' function
## mob.Pima <-
##     glmtree(diabetes ~ glucose | pregnant + pressure + mass +
##                 pedigree + age, data = Pima, family = binomial())
## 
## ## construct same model with 'tvcglm' function
## mob.Pima <-
##     tvcglm(diabetes ~ -1 + vc(pregnant, pressure, mass, pedigree, age,
##                               by = glucose, intercept = TRUE),
##            data = Pima, family = binomial(),
##            control = tvcglm_control(sctest = TRUE, cv = FALSE, maxnumsplit = 10000))


###################################################
### code chunk number 27: pima-fig-tree
###################################################

## plot the tvcm model
grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 2, width = c(0.5, 0.5))))
pushViewport(viewport(layout.pos.col = 1))
grid.text("TVCM", y = unit(0.95, "npc"), gp = gpar(cex = 1.25))
pushViewport(viewport(
               y = unit(0.45, "npc"),
               width = unit(0.95, "npc"),
               height = unit(0.875, "npc")))
grid.rect()
pushViewport(viewport(height = unit(0.9, "npc"), y = unit(0.5, "npc")))
pushViewport(viewport(layout = grid.layout(1, 2, width = c(0.65, 0.35))))
pushViewport(viewport(layout.pos.col = 1))
plot(vcm.Pima.1, part = 1, type = "coef",
     newpage = FALSE)
grid.lines(unit(c(0.99, 0.99), "npc"),
           unit(c(0.05, 0.95), "npc"),
           gp = gpar(lty = 2, col = "black"))
upViewport(1)
pushViewport(viewport(layout.pos.col = 2))
plot(vcm.Pima.1, part = 2, type = "coef",
     plot_gp = list(width = 1), main = "vc(pregnant, ...,\n by = glucose)",
     newpage = FALSE)
upViewport(6)

## plot the MOB model
pushViewport(viewport(layout.pos.col = 2))
grid.text("MOB", y = unit(0.95, "npc"), gp = gpar(cex = 1.25))
pushViewport(viewport(
               y = unit(0.45, "npc"),
               width = unit(0.95, "npc"),
               height = unit(0.875, "npc")))
grid.rect()
plot(mob.Pima, newpage = FALSE, tnex = 4.5,
     main = "", margins = c(1.5, 0.75, 0, 0.75),
     parm = list("(Intercept)", "glucose"))
upViewport(2)


###################################################
### code chunk number 28: pima-comp-code
###################################################

## Comment:
## The following code re-runs the comparison study if
## 'run' is set to TRUE below or if the R data file
## 'RData/vcrpart-bench.RData' is not available.

run <- FALSE

if (run | !"vcrpart-bench.RData" %in% dir("RData/") ) {

    ## compute the misclassification error
    mce <- function(object, newdata) {
        if ("party" %in% class(object)) {
            mu <- predict(object, newdata = newdata, type = "response")
        } else if ("rpart" %in% class(object)){
            mu <- predict(object, newdata = newdata, type = "class")
        } else {
            mu <- predict(object, newdata = newdata)
        }
        mu <- if (is.factor(mu)) 1 * (mu == "pos") else round(mu)
        y <- 1 * (newdata$diabetes == "pos")
        return(sum(y != round(mu)) / nrow(newdata))
    }

    ## compute the complexity of the TVCM model
    npar.tvcm <- function(object) {
        return(extractAIC(extract(object, "model"))[1] +
                   sum(sapply(object$info$node, width) - 1))
    }

    ## Define formulas and control parameters
    ## --------------------------------------

    ## tvcm algorithm
    f1.tvcm <- f3.tvcm <- f4.tvcm <- f5.tvcm <-
        diabetes ~ -1 +
            vc(pregnant, pressure, mass, pedigree, age) +
            vc(pregnant, pressure, mass, pedigree, age, by = glucose)

    f2.tvcm <-
        diabetes ~ 1 + glucose +
            vc(pregnant) + vc(pregnant, by = glucose) +
            vc(pressure) + vc(pressure, by = glucose) +
            vc(mass) + vc(mass, by = glucose) +
            vc(pedigree) + vc(pedigree, by = glucose) +
            vc(age) + vc(age, by = glucose)

    c1.Pima <- c2.Pima <-
        tvcm_control(papply.args = list(mc.cores = 6),
                     folds = folds_control(seed = 13))
    c3.Pima <-
        tvcm_control(minsize = 50,
                     papply.args = list(mc.cores = 6),
                     folds = folds_control(seed = 13))
    c4.Pima <-
        tvcm_control(mindev = 50,
                     papply.args = list(mc.cores = 6),
                     folds = folds_control(seed = 13))
    c5.Pima <-
        tvcm_control(papply.args = list(mc.cores = 6),
                     folds = folds_control(seed = 13),
                     maxnumsplit = 19)

    ## MOB algorithm
    f.MOB <- diabetes ~ glucose | pregnant + pressure + mass + pedigree + age

    ## rpart and other algorithms
    f.rpart <- diabetes ~ pregnant + glucose + pressure +  mass + pedigree + age


    ## Fit models on all data
    ## ----------------------

    algs <- c("TVCM1", "TVCM2", "TVCM3", "TVCM4", "TVCM5",
              "MOB", "CTree", "CART", "LMT", "J4.8")

    e.is <- c.is <- t.is <- rep(NA, length(algs))
    names(e.is) <- names(c.is) <- names(t.is) <- algs

    cat("\n * fitting the model to all data ... ")

    ## TVCM algorithm
    ptm <- proc.time()
    tvcm.Pima <- tvcm(f1.tvcm, data = Pima, family = binomial(), control = c1.Pima)
    t.is["TVCM1"] <- (proc.time() - ptm)[3]
    e.is["TVCM1"] <- mce(tvcm.Pima, Pima)
    c.is["TVCM1"] <- npar.tvcm(tvcm.Pima)

    ptm <- proc.time()
    tvcm.Pima <- tvcm(f2.tvcm, data = Pima, family = binomial(), control = c1.Pima)
    t.is["TVCM2"] <- (proc.time() - ptm)[3]
    e.is["TVCM2"] <- mce(tvcm.Pima, Pima)
    c.is["TVCM2"] <- npar.tvcm(tvcm.Pima)

    ptm <- proc.time()
    tvcm.Pima <- tvcm(f3.tvcm, data = Pima, family = binomial(), control = c3.Pima)
    t.is["TVCM3"] <- (proc.time() - ptm)[3]
    e.is["TVCM3"] <- mce(tvcm.Pima, Pima)
    c.is["TVCM3"] <- npar.tvcm(tvcm.Pima)

    ptm <- proc.time()
    tvcm.Pima <- tvcm(f4.tvcm, data = Pima, family = binomial(), control = c4.Pima)
    t.is["TVCM4"] <- (proc.time() - ptm)[3]
    e.is["TVCM4"] <- mce(tvcm.Pima, Pima)
    c.is["TVCM4"] <- npar.tvcm(tvcm.Pima)

    ptm <- proc.time()
    tvcm.Pima <- tvcm(f5.tvcm, data = Pima, family = binomial(), control = c5.Pima)
    t.is["TVCM5"] <- (proc.time() - ptm)[3]
    e.is["TVCM5"] <- mce(tvcm.Pima, Pima)
    c.is["TVCM5"] <- npar.tvcm(tvcm.Pima)

    ## MOB algorithm
    ptm <- proc.time()
    MOB.Pima <- glmtree(formula = f.MOB, data = Pima, family = binomial)
    t.is["MOB"] <- (proc.time() - ptm)[3]
    e.is["MOB"] <- mce(MOB.Pima, Pima)
    c.is["MOB"] <- 3 * width(MOB.Pima) - 1

    ## CTree algorithm
    ptm <- proc.time()
    CTree.Pima <- ctree(formula = f.rpart, data = Pima)
    t.is["CTree"] <- (proc.time() - ptm)[3]
    e.is["CTree"] <- mce(CTree.Pima, Pima)
    c.is["CTree"] <- 2 * width(CTree.Pima) - 1

    ## CART algorithm
    ptm <- proc.time()
    rpart.Pima <- rpart(formula = f.rpart, data = Pima)
    cp.rpart <- rpart.Pima$cptable[which.min(rpart.Pima$cptable[,"xerror"]),"CP"]
    rpart.Pima <- prune(rpart.Pima, cp = cp.rpart)
    t.is["CART"] <- (proc.time() - ptm)[3]
    e.is["CART"] <- mce(rpart.Pima, Pima)
    c.is["CART"] <- 2 * length(unique(rpart.Pima$where)) - 1

    require("RWeka")
    ## LMT algorithm
    ptm <- proc.time()
    LMT.Pima <- LMT(formula = f.rpart, data = Pima)
    t.is["LMT"] <- (proc.time() - ptm)[3]
    e.is["LMT"] <- mce(LMT.Pima, Pima)
    c.is["LMT"] <- 2 * width(as.party(LMT.Pima)) - 1

    ## J4.8 algorithm
    ptm <- proc.time()
    J48.Pima <- J48(formula = f.rpart, data = Pima)
    t.is["J4.8"] <- (proc.time() - ptm)[3]
    e.is["J4.8"] <- mce(J48.Pima, Pima)
    c.is["J4.8"] <- 2 * width(as.party(J48.Pima)) - 1

    cat("OK")

    ## Performance test
    ## ----------------

    cat("\n * starting performance test ... ")

    nsim <- 250
    folds <- folds_control(type = "bootstrap", K = nsim, seed = 123)
    folds <- vcrpart:::tvcm_folds(tvcm.Pima, folds)
    dn <- list(1:nsim, algs)
    e.boot <- c.boot <- t.boot <- matrix(NA, nsim, length(algs), dimnames = dn)

    rm(tvcm.Pima, MOB.Pima, CTree.Pima, rpart.Pima, LMT.Pima, J48.Pima)

    for (i in 1:nsim) {
        cat("\n\tfold", i, "... ")

        ## extract training data
        subs <- rep(1:nrow(Pima), folds[, i])
        training <- Pima[subs, ]

        ## extract validation data
        subs <- folds[,i] == 0
        validation <- Pima[subs, ]

        c1.Pima$folds$seed <- i
        c3.Pima$folds$seed <- i
        c4.Pima$folds$seed <- i
        c5.Pima$folds$seed <- i

        ## tvcm algorithm
        ptm <- proc.time()
        tvcmB.Pima <-
            try(tvcm(f1.tvcm, data = training, family = binomial(),
                     control = c1.Pima),
                silent = TRUE)
        if (!inherits(tvcmB.Pima, "try-error")) {
            t.boot[i, "TVCM1"] <- (proc.time() - ptm)[3]
            e.boot[i, "TVCM1"] <- mce(tvcmB.Pima, validation)
            c.boot[i, "TVCM1"] <- npar.tvcm(tvcmB.Pima)
        }

        ptm <- proc.time()
        tvcmB.Pima <-
            try(tvcm(f2.tvcm, data = training, family = binomial(),
                     control = c2.Pima),
                silent = TRUE)
        if (!inherits(tvcmB.Pima, "try-error")) {
            t.boot[i, "TVCM2"] <- (proc.time() - ptm)[3]
            e.boot[i, "TVCM2"] <- mce(tvcmB.Pima, validation)
            c.boot[i, "TVCM2"] <- npar.tvcm(tvcmB.Pima)
        }

        ptm <- proc.time()
        tvcmB.Pima <-
            try(tvcm(f1.tvcm, data = training, family = binomial(),
                     control = c3.Pima),
                silent = TRUE)
        if (!inherits(tvcmB.Pima, "try-error")) {
            t.boot[i, "TVCM3"] <- (proc.time() - ptm)[3]
            e.boot[i, "TVCM3"] <- mce(tvcmB.Pima, validation)
            c.boot[i, "TVCM3"] <- npar.tvcm(tvcmB.Pima)
        }

        ptm <- proc.time()
        tvcmB.Pima <-
            try(tvcm(f1.tvcm, data = training, family = binomial(),
                     control = c4.Pima),
                silent = TRUE)
        if (!inherits(tvcmB.Pima, "try-error")) {
            t.boot[i, "TVCM4"] <- (proc.time() - ptm)[3]
            e.boot[i, "TVCM4"] <- mce(tvcmB.Pima, validation)
            c.boot[i, "TVCM4"] <- npar.tvcm(tvcmB.Pima)
        }

        ptm <- proc.time()
        tvcmB.Pima <-
            try(tvcm(f1.tvcm, data = training, family = binomial(),
                     control = c5.Pima),
                silent = TRUE)
        if (!inherits(tvcmB.Pima, "try-error")) {
            t.boot[i, "TVCM5"] <- (proc.time() - ptm)[3]
            e.boot[i, "TVCM5"] <- mce(tvcmB.Pima, validation)
            c.boot[i, "TVCM5"] <- npar.tvcm(tvcmB.Pima)
        }

        ## MOB algorithm
        ptm <- proc.time()
        MOBB.Pima <- try(glmtree(formula = f.MOB, data = training,
                                 family = binomial),
                         silent = TRUE)
        if (!inherits(MOBB.Pima, "try-error")) {
            t.boot[i, "MOB"] <- (proc.time() - ptm)[3]
            e.boot[i, "MOB"] <- mce(MOBB.Pima, validation)
            c.boot[i, "MOB"] <- 3 * width(MOBB.Pima) - 1
        }

        ## CTree algorithm
        ptm <- proc.time()
        CTreeB.Pima <- try(ctree(formula = f.rpart, data = training), silent = TRUE)
        if (!inherits(CTreeB.Pima, "try-error")) {
            t.boot[i, "CTree"] <- (proc.time() - ptm)[3]
            e.boot[i, "CTree"] <- mce(CTreeB.Pima, validation)
            c.boot[i, "CTree"] <- 2 * width(CTreeB.Pima) - 1
        }

        ## CART algorithm
        ptm <- proc.time()
        rpartB.Pima <- try(rpart(formula = f.rpart, data = training), silent = TRUE)
        if (!inherits(rpartB.Pima, "try-error")) {
            cpB.Pima <-
                rpartB.Pima$cptable[which.min(rpartB.Pima$cptable[,"xerror"]),"CP"]
            rpartB.Pima <- prune(rpartB.Pima, cp = cpB.Pima)
            t.boot[i, "CART"] <- (proc.time() - ptm)[3]
            e.boot[i, "CART"] <- mce(rpartB.Pima, validation)
            c.boot[i, "CART"] <- 2 * length(unique(rpartB.Pima$where)) - 1
        }

        ## LMT algorithm
        ptm <- proc.time()
        LMTB.Pima <- try(LMT(formula = f.rpart, data = training), silent = TRUE)
        if (!inherits(LMTB.Pima, "try-error")) {
            t.boot[i, "LMT"] <- (proc.time() - ptm)[3]
            e.boot[i, "LMT"] <- mce(LMTB.Pima, validation)
            c.boot[i, "LMT"] <- 2 * width(as.party(LMTB.Pima)) - 1
        }

        ## J4.8 algorithm
        ptm <- proc.time()
        J48B.Pima <- try(J48(formula = f.rpart, data = training), silent = TRUE)
        if (!inherits(J48B.Pima, "try-error")) {
            t.boot[i, "J4.8"] <- (proc.time() - ptm)[3]
            e.boot[i, "J4.8"] <- mce(J48B.Pima, validation)
            c.boot[i, "J4.8"] <- 2 * width(as.party(J48B.Pima)) - 1
        }

        rm(tvcmB.Pima, MOBB.Pima, CTreeB.Pima, rpartB.Pima, LMTB.Pima, J48B.Pima)

        gc(verbose = FALSE)

        cat("OK")
    }

    rm(f1.tvcm, f2.tvcm, f.MOB, f.rpart,
       c1.Pima, c2.Pima, c3.Pima, c4.Pima, c5.Pima,
       ptm, dn)

    save(algs, folds, nsim,
         t.is, e.is, c.is,
         t.boot, e.boot, c.boot,
         file = "RData/vcrpart-bench.RData")
} else {

    load(file = "RData/vcrpart-bench.RData")

}



###################################################
### code chunk number 29: pima-MOB-spinogram (eval = FALSE)
###################################################
## 
## ## original plot of MOB model
## f.MOB <- diabetes ~ glucose | pregnant + pressure + mass + pedigree + age
## MOB.Pima <- glmtree(formula = f.MOB, data = Pima, family = binomial)
## plot(MOB.Pima)


###################################################
### code chunk number 30: pima-table-comp
###################################################

subs <- intersect(1:250, which(!is.na(e.boot[,1])))
e.boot <- e.boot[subs, ]
c.boot <- c.boot[subs, ]
t.boot <- t.boot[subs, ]
tab.Pima <- data.frame("Boot" = apply(e.boot, 2, median, na.rm = TRUE),
                       "Boot-Mean" = apply(e.boot, 2, mean, na.rm = TRUE),
                       "Orig" = e.is,
                       "Boot" = apply(c.boot, 2, median, na.rm = TRUE),
                       "Orig" = c.is,
                       "Boot" = apply(t.boot, 2, median, na.rm = TRUE),
                       "Orig" = t.is,
                       check.names = FALSE)
cap <- c("Performances for the \\code{Pima} data. Boot: Medians (and means under Boot-Mean) of results from fits on 250 bootstrap samples. Orig: Results on the original data. Misclassification: Misclassification errors. Complexity: The number of coefficients plus the number of splits. Time: Computation time in seconds.", "Performances for the \\code{Pima} data")
algs2 <- algs
algs2[1:5] <- c("TVCM (default)",
                "TVCM (additive)",
                "TVCM ($N_0 = 50$)",
                "TVCM ($D_{min} = 50$)",
                "TVCM ($N_S = 19$)")
algs2[8L] <- "RPART"
rownames(tab.Pima) <- algs2
rownames(tab.Pima)[1L] <- "TVCM"
xtab.Pima <- xtable(x = tab.Pima,
                    caption = cap,
                    label = "vcrpart-tab:pima-performance",
                    align = "lrrr|rr|rr",
                    digits = c(0, 3, 3, 3, 0, 0, 2, 2),
                    display = c("s", "f", "f", "f", "d", "d", "f", "f"))
addtorow <- list(pos = list(-1),
                 command = "\\hline & \\multicolumn{3}{c}{Misclassification} & \\multicolumn{2}{c}{Complexity} & \\multicolumn{2}{c}{Time} \\\\")
print(x = xtab.Pima,
      table.placement = "tbp",
      caption.placement = "bottom",
      latex.environments = "center",
      sanitize.text.function = function(x) x,
      booktabs = FALSE,
      hline.after = c(0, nrow(xtab.Pima)),
      add.to.row = addtorow, size="\\small")


###################################################
### code chunk number 31: pima-fig-comp
###################################################

nsim <- length(subs)
algs3 <- algs
algs3[2] <- expression(paste("TVCM (additive)", sep = ""))
algs3[3] <- expression(paste("TVCM (", N[0], "= 50)", sep = ""))
algs3[4] <- expression(paste("TVCM (", D[min], "= 50)", sep = ""))
algs3[5] <- expression(paste("TVCM (", N[S], " = 19)", sep = ""))
algs3[8] <- "RPART"
par(mfrow = c(1,2), mgp = c(2.25, 1, 0), mar = c(3.5, 8, 0.5, 2))
e.boot.long <-
  data.frame(MCE = c(e.boot),
             ALG = factor(rep(algs, each = nsim), levels = algs))
mod <- lm(MCE ~ ALG, e.boot.long)
ci <- confint(mod)
plot(coef(mod)[-1], length(coef(mod)[-1]):1, pch = 16,
     xlim = c(-0.01, 0.045), axes = FALSE, ylab = "",
     xlab = "Misclassification difference")
axis(1); axis(2, length(coef(mod)[-1]):1, algs3[-1], las = 2); box();
abline(v = 0, lty = 2)
arrows(ci[-1,1], length(coef(mod)[-1]):1,
       ci[-1,2], length(coef(mod)[-1]):1, angle = 90, code = 3, length = 0.075)
c.boot.long <-
  data.frame(DF = c(c.boot),
             ALG = factor(rep(algs, each = nsim), levels = algs))
mod <- lm(DF ~ ALG, c.boot.long)
ci <- confint(mod)
plot(coef(mod)[-1], length(coef(mod)[-1]):1,
     axes = FALSE, ylab = "", pch = 16,
     xlab = "Complexity difference", xlim = c(-12,80))
axis(1); axis(2, length(coef(mod)[-1]):1, algs3[-1], las = 2); box();
abline(v = 0, lty = 2)
arrows(ci[-1,1], length(coef(mod)[-1]):1, ci[-1,2], length(coef(mod)[-1]):1,
       angle = 90, code = 3, length = 0.075)


###################################################
### code chunk number 32: vcrpart-simulations-code
###################################################

## Comment:
## The following code re-runs the simulation study if
## 'run' is set to TRUE below or if the R data file
## 'RData/vcrpart-simulations.RData' is not available.


run <- FALSE

## function to generate the data
generate.data <- function(N = 200) {
    rval <- data.frame(x = rnorm(N),
                       z0 = rbinom(N, 1, 0.5),
                       z1 = rbinom(N, 1, 0.5),
                       z2 = rbinom(N, 1, 0.5),
                       z3 = rbinom(N, 1, 0.5),
                       z4 = rbinom(N, 1, 0.5),
                       z5 = rbinom(N, 1, 0.5))
    rval$group <- factor(paste(rval$z0,rval$z1, sep = "-"))
    levels(rval$group) <- c("1", "1", "2", "3")
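    ## true model: the intercept varies with z0 and the slope of x varies
    ## with z1; z2 to z5 are pure noise moderators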
    rval$y1 <- model.matrix(~ z0 + x + z1:x, rval) %*% c(-1, 2, 1, -1) +
        rnorm(N)
    return(rval)
}

## function that executes a single simulation run
simFun <- function(N = 100) {
    cat(".")
    rval <- matrix(FALSE, 2, 4)
    z <- c("z0", "z1", "z2", "z3", "z4", "z5")

    ## set control according to sample size
    control.sim <- tvcglm_control(papply.args = list(mc.cores = 6),
                                  mindev = min(N / 200, 2),
                                  minsize = min(N / 20, 30))

    ## generate data
    dat <- generate.data(N = N)

    ## fit the models
    m1 <- tvcglm(y1 ~ -1 + vc(z) + vc(z, by = x),
                 family = gaussian(), data = dat,
                 control = control.sim)
    m2 <- tvcglm(y1 ~ -1 + vc(z, by = x, intercept = TRUE),
                 family = gaussian(), data = dat,
                 control = control.sim)

    n1 <- extract(m1, "nodes")
    n2 <- extract(m2, "nodes")

    ## check whether the algorithms found true structure
    rval[1,1] <-
        (width(n1$A) == 2 &&
             unlist(nodeapply(n1$A, 1, function(x) x$split$varid)) == 1) &
        (width(n1$B) == 2 &&
             unlist(nodeapply(n1$B, 1, function(x) x$split$varid)) == 2)
    rval[2,1] <-
        width(n2$A) == 4 &&
            (all(unlist(nodeapply(n2$A, c(1,2,5),
                                  function(x) x$split$varid)) %in% c(1,2,2)) |
             all(unlist(nodeapply(n2$A, c(1,2,5),
                                  function(x) x$split$varid)) %in% c(2,1,1)))

    ## check whether the fits include the true structure as a nested model
    rval[1,2] <-
        (width(n1$A) > 1 &&
             unlist(nodeapply(n1$A, 1, function(x) x$split$varid)) == 1) &
        (width(n1$B) > 1 &&
             unlist(nodeapply(n1$B, 1, function(x) x$split$varid)) == 2)
    rval[2,2] <-
        width(n2$A) >= 4 &&
            (all(unlist(nodeapply(n2$A, c(1,2,5),
                                  function(x) x$split$varid)) %in% c(1,2,2)) |
             all(unlist(nodeapply(n2$A, c(1,2,5),
                                  function(x) x$split$varid)) %in% c(2,1,1)))

    ## check whether the true effect modifiers were chosen
    rval[1,3] <- "z0" %in% extract(m1, "selected")[[1]] &
        "z1" %in% extract(m1, "selected")[[2]]
    rval[2,3] <- all(c("z0", "z1") %in% extract(m2, "selected")[[1]])

    ## count how many noise variables were selected
    rval[1,4] <- sum(!unique(unlist(extract(m1, "selected"))) %in% c("z0", "z1"))
    rval[2,4] <- sum(!unique(unlist(extract(m2, "selected"))) %in% c("z0", "z1"))

    return(c(rval))
}
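
## Optional, illustrative check (commented out): a single simulation run;
## the 8 returned values are the 2 x 4 matrix 'rval' stacked column-wise,
## with rows corresponding to the coefficient-wise model (m1) and the
## single-partition model (m2).
## set.seed(1)
## simFun(N = 200)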

if (run | !"vcrpart-simulations.RData" %in% dir("RData/")) {
    nsim <- 2000 # the number of simulation runs
    N <- c(50, 100, 150, 200, 250, 300, 350, 400, 450, 500)
    par <- rep(N, each = nsim)
    res <- mclapply(X = par, simFun, mc.cores = 1)
    save(res, par, N, nsim, file = "RData/vcrpart-simulations.RData")
} else {
    load(file = "RData/vcrpart-simulations.RData")
}




###################################################
### code chunk number 33: vcrpart-simulations-arbitrarydata (eval = FALSE)
###################################################
## 
## ## generate an arbitrary simulation data set with 100 observations
## N <- 100
## simdata <- generate.data(N = N)
## simdata$y <- simdata$y1


###################################################
### code chunk number 34: vcrpart-simulations-moderators (eval = FALSE)
###################################################
## 
## z <- c("z0", "z1", "z2", "z3", "z4", "z5")
## control.sim <-
##     tvcglm_control(mindev = min(N / 200, 2), minsize = min(N / 20, 30))


###################################################
### code chunk number 35: vcrpart-simulations-models-1 (eval = FALSE)
###################################################
## 
## vcm.sim.1 <- # coefficient-wise partitions model
##     tvcglm(y ~ -1 + vc(z) + vc(z, by = x), data = simdata,
##            family = gaussian(), control = control.sim)


###################################################
### code chunk number 36: vcrpart-simulations-models-2 (eval = FALSE)
###################################################
## 
## vcm.sim.2 <- # single partition model
##     tvcglm(y ~ -1 + vc(z, by = x, intercept = TRUE),
##            data = simdata, family = gaussian(), control = control.sim)


###################################################
### code chunk number 37: pima-fig-simulations
###################################################

res <- sapply(res, I)
res <- apply(res, 1, function(x) tapply(x, par, function(x) sum(x) / length(x)))

par(mfrow = c(1, 4), mgp = c(2, 1, 0), mar = c(3, 3, 3, 1))
matplot(x = N, y = res[, 1:2], type = "b", col = "black",
        ylab = "Proportion of runs", ylim = c(0, 1),
        main = "Identified\n underlying model",
        pch = 1:2, xlab = "Number of observations")
abline(h = c(0, 1), lty = 3)
matplot(x = N, y = res[, 3:4], type = "b", col = "black",
        ylab = "Proportion of runs",
        ylim = c(0, 1), main = "Nested\n underlying model",
        pch = 1:2, xlab = "Number of observations")
abline(h = c(0, 1), lty = 3)
matplot(x = N, y = res[, 5:6], type = "b", col = "black",
        ylab = "Proportion of runs",
        ylim = c(0, 1), main = "True moderators\n selected",
        pch = 1:2, xlab = "number of observations")
abline(h = c(0, 1), lty = 3)
matplot(x = N, y = res[, 7:8], type = "b", col = "black",
        ylab = "Proportion of runs",
        ylim = c(0, 1), main = "False\n variable selections",
        pch = 1:2, xlab = "Number of observations")



###################################################
### code chunk number 38: school-data
###################################################

library("Ecdat")
data("Schooling")
Schooling <- Schooling[c(19, 21, 7, 28, 9, 14, 17, 18, 20, 23, 2, 4)]
Schooling$black <- 1 * (Schooling$black == "yes")


###################################################
### code chunk number 39: school-IV
###################################################
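## First stage of the instrumental variable approach: 'ed76.IV' holds the
## fitted values from regressing education (ed76) on the instrument nearc4;
## it replaces ed76 in the wage equation fitted in the next chunk.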
Schooling$ed76.IV <- fitted(lm(ed76 ~ nearc4, Schooling))


###################################################
### code chunk number 40: school-glm
###################################################
lm.School <- lm(lwage76 ~ ed76.IV + exp76 + I(exp76^2) + black,
                data = Schooling)


###################################################
### code chunk number 41: school-glm
###################################################
summary(lm.School)$coef[, 1:3]


###################################################
### code chunk number 42: school-formula
###################################################

f.School <- lwage76 ~ -1 + ed76.IV + exp76 + I(exp76^2) +
  vc(age76, momdad14, south66, south76, famed, enroll76, smsa76) +
  vc(ed76.IV, exp76, age76, momdad14, south66,
     south76, famed, enroll76, smsa76, by = black)
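
## The first vc() term lets the intercept vary with the listed moderators;
## the second lets the coefficient of 'black' vary, with 'ed76.IV' and
## 'exp76' added as further candidate moderators.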


###################################################
### code chunk number 43: school-fitting (eval = FALSE)
###################################################
## 
## vcm.School <- tvcglm(formula = f.School, data = Schooling,
##                      family = gaussian(), control = control)


###################################################
### code chunk number 44: school-fig-cv
###################################################

par(mgp = c(2, 1, 0), mar = c(3,3,0.1,0.1))
plot(vcm.School, "cv",
     xlab = expression(lambda),
     ylab = "Validation error",
     legend = FALSE)



###################################################
### code chunk number 45: school-fig-tree
###################################################

grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 2, widths = c(0.72,0.28))))
pushViewport(viewport(layout.pos.col = 1))
plot(vcm.School, type = "coef", part = 1, nobs = FALSE,
     newpage = FALSE, gp = gpar(fontsize = 11))
upViewport()
pushViewport(viewport(layout.pos.col = 2))
plot(vcm.School, type = "coef", part = 2,
     newpage = FALSE)
upViewport()
upViewport()


###################################################
### code chunk number 46: school-nonvarcoef
###################################################

coef(vcm.School)$fe


###################################################
### code chunk number 47: pl-data
###################################################

data("PL")


###################################################
### code chunk number 48: pl-glm
###################################################

glm.PL <- glm(uncj10 ~ july, data = PL, family = binomial)


###################################################
### code chunk number 49: app-glm-coef
###################################################
summary(glm.PL)$coefficients[, 1:3]


###################################################
### code chunk number 50: pl-formula
###################################################

f.PL <- uncj10  ~  1 + july + vc(age) + vc(age, by = july) +
    vc(workExp) + vc(workExp, by = july) +
    vc(unEmpl) + vc(unEmpl, by = july) +
    vc(laborEarnings) + vc(laborEarnings, by = july) +
    vc(whiteCollar) + vc(whiteCollar, by = july) +
    vc(wage) + vc(wage, by = july) +
    vc(industry.SL) + vc(industry.SL, by = july) +
    vc(region.SL) + vc(region.SL, by = july)
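
## As in the additive Pima model of chunk 25, each candidate moderator enters
## with a pair of vc() terms: one for the intercept and one for the
## coefficient of 'july'.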


###################################################
### code chunk number 51: pl-fitting (eval = FALSE)
###################################################
## 
## vcm.PL <- tvcglm(formula = f.PL, family = binomial(),
##                     data = PL, control = control)


###################################################
### code chunk number 52: pl-fig-cv
###################################################

par(mgp = c(2, 1, 0), mar = c(3,3,0.1,0.1))
plot(vcm.PL, "cv",
     xlab = expression(lambda),
     ylab = "Validation error",
     legend = FALSE)


###################################################
### code chunk number 53: pl-fig-tree
###################################################

fac <- 0.9
subs <- which(width(vcm.PL) > 1)
ylim1 <- c(-1, 0.8)
ylim2 <- c(-1.4, 0.5)
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow = 2, ncol = 3,
                        widths = c(1/3, 1/3, 1/3))))
pushViewport(viewport(layout.pos.col = 1, layout.pos.row = 1))
plot(vcm.PL, type = "coef", part = subs[1], plot_gp = list(ylim = ylim1),
     newpage = FALSE)
upViewport()
pushViewport(viewport(layout.pos.col = 1, layout.pos.row = 2))
plot(vcm.PL, type = "coef", part = subs[2], plot_gp = list(ylim = ylim2),
     newpage = FALSE)
upViewport()
pushViewport(viewport(layout.pos.col = 2, layout.pos.row = 1))
plot(vcm.PL, type = "coef", part = subs[3], plot_gp = list(ylim = ylim1),
     newpage = FALSE)
upViewport()
pushViewport(viewport(layout.pos.col = 3, layout.pos.row = 1))
plot(vcm.PL, type = "coef", part = subs[4], plot_gp = list(ylim = ylim1),
     newpage = FALSE)
upViewport()
pushViewport(viewport(layout.pos.col = 2, layout.pos.row = 2))
plot(vcm.PL, type = "coef", part = subs[5], plot_gp = list(ylim = ylim2),
     newpage = FALSE)
upViewport()
upViewport()


###################################################
### code chunk number 54: pl-coef
###################################################

## extract coefficients for text
effGL <- coef(vcm.PL)$fe[2]
effWED1 <- coef(vcm.PL)$vc$C[1]
effWED2 <- coef(vcm.PL)$vc$C[2]
effWEM1 <- coef(vcm.PL)$vc$D[1]
effREM1 <- coef(vcm.PL)$vc$P[1]
effREM3 <- coef(vcm.PL)$vc$P[3]


###################################################
### code chunk number 55: applicatons-save (eval = FALSE)
###################################################
## 
## ## save all fitted models, those being commented out by default
## if (run | !"vcrpart-applications.RData" %in% dir("RData/")) {
##   save(UCBA, cv.UCBA, vcm.UCBA,
##      Pima, vcm.Pima.1, vcm.Pima.2, mob.Pima,
##      Schooling, vcm.School,
##      PL, vcm.PL,
##      file = "RData/vcrpart-applications.RData")
## }
