Nothing
### R code from vignette source 'coefficient-wise-varying-coefficient-regression.Rnw'
###################################################
### code chunk number 1: header
###################################################
## Date: 2016-07-25
## Authors: Reto Buergin and Gilbert Ritschard
## Institution: Swiss National Centre of Competence in
## Research LIVES (http://www.lives-nccr.ch/)
##
## R-codes for the article "Coefficient-wise
## tree-based varying coefficient regression with
## R-package 'vcrpart'". The code requires vcrpart (>= 0.4-1),
## mlbench (>= 2.1-1) and Ecdat (>= 0.2-9).
##
## Notes:
## Chunks with '(eval = FALSE)' in the header or
## with if / else conditions on 'run' objects are
## commented out because they are computationally
## burdensome. Instead, the pre-run results
## stored in supplementary files are used, see the
## commands
##
## load(file = "RData/vcrpart-applications.RData")
## load(file = "RData/vcrpart-bench.RData")
## load(file = "RData/vcrpart-simulations.RData")
##
## Depending where you have stored the RData files,
## you should modify these commands correspondingly.
##
## The comparison study of Section 4.1 or the the simulation
## study of section 4.2 can be processed by setting
## 'run <- TRUE' in the corresponding chunks.
##
## We cannot guarantee that the functions work with future
## versions of R and the indicated packages.
##
## Copyright R. Buergin and G. Ritschard, 2016
## distributed under license Creative Commons BY-NC-SA
## http://creativecommons.org/licenses/by-nc-sa/3.0/
###################################################
### code chunk number 2: initialize
###################################################
## remove several R objects in the global environment
rm(list = ls())
## create RData and Figures directory if necessary
suppressWarnings(dir.create("RData/"))
suppressWarnings(dir.create("Figures/"))
## set options
options(width=76,prompt="R> ", continue="+ ", useFancyQuotes = FALSE,
digits = 3, scipen = 10)
## load required packages
library("vcrpart")
library("mlbench")
library("partykit")
library("xtable")
library("Ecdat")
#library("RWeka")
library("rpart")
## load
load(file = "RData/vcrpart-applications.RData")
###################################################
### code chunk number 3: ucba-data
###################################################
UCBA <- as.data.frame(UCBAdmissions)
UCBA$Admit <- 1 * (UCBA$Admit == "Admitted")
UCBA$Female <- 1 * (UCBA$Gender == "Female")
head(UCBA, 3)
###################################################
### code chunk number 4: ucba-glmS
###################################################
glmS.UCBA <- glm(formula = Admit ~ Female, data = UCBA,
family = binomial(), weights = UCBA$Freq)
###################################################
### code chunk number 5: ucba-glmS-coef
###################################################
summary(glmS.UCBA)$coefficients[, -4]
###################################################
### code chunk number 6: ucba-glmL
###################################################
glmL.UCBA <- glm(formula = Admit ~ -1 + Dept + Dept:Female,
data = UCBA, family = binomial(),
weights = UCBA$Freq)
###################################################
### code chunk number 7: ucba-glmL-coef
###################################################
summary(glmL.UCBA)$coefficients[, -4]
###################################################
### code chunk number 8: ucba-fitting-vcmL
###################################################
library("vcrpart")
vcmL.UCBA <-
tvcglm(formula = Admit ~ -1 + vc(Dept) + vc(Dept, by = Female),
data = UCBA, family = binomial(), weights = UCBA$Freq,
control = tvcglm_control(minsize = 30, mindev = 0.0,
cv = FALSE))
###################################################
### code chunk number 9: ucba-fig-vcmL
###################################################
grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 2)))
pushViewport(viewport(layout.pos.col = 1))
plot(vcmL.UCBA, part = 1, type = "coef", newpage = FALSE)
upViewport()
pushViewport(viewport(layout.pos.col = 2))
plot(vcmL.UCBA, part = 2, type = "coef", newpage = FALSE)
upViewport()
upViewport()
###################################################
### code chunk number 10: ucba-plot (eval = FALSE)
###################################################
##
## plot(vcmL.UCBA, type = "coef", part = "A")
## plot(vcmL.UCBA, type = "coef", part = "B")
###################################################
### code chunk number 11: ucba-splitpath
###################################################
splitpath(vcmL.UCBA, steps = 1, details = TRUE)
###################################################
### code chunk number 12: ucba-nomsplits-fitting
###################################################
glmCW.UCBA <-
glm(formula = Admit ~ 1 + Dept:Female, family = binomial(),
data = UCBA, weights = UCBA$Freq)
###################################################
### code chunk number 13: ucba-nomsplits-coef
###################################################
sort(coef(glmCW.UCBA)[-1])
###################################################
### code chunk number 14: ucba-prune-fixed-lambda
###################################################
vcm.UCBA <- prune(vcmL.UCBA, cp = 6)
###################################################
### code chunk number 15: ucba-prunepath
###################################################
prunepath(vcm.UCBA, steps = 1)
###################################################
### code chunk number 16: ucba-cv (eval = FALSE)
###################################################
##
## cv.UCBA <- cvloss(vcmL.UCBA,
## folds = folds_control(weights = "freq", seed = 13))
###################################################
### code chunk number 17: ucba-fig-cv
###################################################
par(mgp = c(2, 1, 0), mar = c(3,3,0.1,0.1))
plot(cv.UCBA,
xlab = expression(lambda),
ylab = "Validation error",
legend = FALSE)
###################################################
### code chunk number 18: ucba-lambda
###################################################
oobLoss <- colMeans(cv.UCBA$oobloss)
subs <- which.min(oobLoss)
cp.hat <- cv.UCBA$cp.hat
cp.int <- cv.UCBA$grid[subs:(subs+1)]
###################################################
### code chunk number 19: ucba-prune-estimated-lambda (eval = FALSE)
###################################################
##
## vcm.UCBA <- prune(vcmL.UCBA, cp = cv.UCBA$cp.hat)
###################################################
### code chunk number 20: ucba-fig-vcm
###################################################
grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 2, width = c(0.5, 0.5))))
pushViewport(viewport(layout.pos.col = 1))
plot(vcm.UCBA, part = 1, newpage = FALSE, type = "coef")
upViewport()
pushViewport(viewport(layout.pos.col = 2))
plot(vcm.UCBA, part = 2, newpage = FALSE, type = "coef")
upViewport()
upViewport()
###################################################
### code chunk number 21: details-mean-centering
###################################################
set.seed(1)
d <- data.frame(x = runif(100, min = -5, max = 5),
g = rep(LETTERS[1:2], each = 50))
d$y <- model.matrix(~ -1 + g:x, d) %*% c(-1, 1) + rnorm(100)
d$x <- d$x + 10
d$x.centered <- d$x - mean(d$x)
par(mfrow = c(1, 2), mgp = c(2,1,0), mar = c(3,3,2,3), cex.main = 1.25)
lm <- lm(y ~ x, d)
d$offset <- lm$coefficients[1]
lmA <- lm(y ~ -1 + x, d, subset = d$g == "A", offset = d$offset)
lmB <- lm(y ~ -1 + x, d, subset = d$g == "B", offset = d$offset)
plot(y ~ x, d, pch = as.integer(d$g)^2, xlim = c(-5, 15),
main = "x not mean-centered")
legend("topleft", c("Global", "Group A", "Group B"), lty = 1:3, pch = c(NA, 1, 4))
abline(lm, lty = 1)
abline(a = coef(lm)[1], b = coef(lmA), lty = 2)
abline(a = coef(lm)[1], b = coef(lmB), lty = 3)
lm <- lm(y ~ x.centered, d)
d$offset <- lm$coefficients[1]
lmA <- lm(y ~ -1 + x.centered, d, subset = d$g == "A", offset = d$offset)
lmB <- lm(y ~ -1 + x.centered, d, subset = d$g == "B", offset = d$offset)
plot(y ~ x.centered, d, pch = as.integer(d$g)^2, xlim = c(-5, 15),
main = "x mean-centered", xlab = "x")
abline(lm, lty = 1)
abline(a = coef(lm)[1], b = coef(lmA), lty = 2)
abline(a = coef(lm)[1], b = coef(lmB), lty = 3)
###################################################
### code chunk number 22: applications-control
###################################################
control <- tvcglm_control(folds = folds_control(seed = 13))
###################################################
### code chunk number 23: pima-data
###################################################
library("mlbench")
data("PimaIndiansDiabetes2")
Pima <- na.omit(PimaIndiansDiabetes2[, -c(4, 5)])
###################################################
### code chunk number 24: pima-fitting-unconstrained (eval = FALSE)
###################################################
##
## vcm.Pima.1 <-
## tvcglm(diabetes ~ -1 + vc(pregnant, pressure, mass, pedigree, age) +
## vc(pregnant, pressure, mass, pedigree, age, by = glucose),
## data = Pima, family = binomial(), control = control)
###################################################
### code chunk number 25: pima-fitting-additive (eval = FALSE)
###################################################
##
## vcm.Pima.2 <-
## tvcglm(diabetes ~ 1 + glucose +
## vc(pregnant) + vc(pregnant, by = glucose) +
## vc(pressure) + vc(pressure, by = glucose) +
## vc(mass) + vc(mass, by = glucose) +
## vc(pedigree) + vc(pedigree, by = glucose) +
## vc(age) + vc(age, by = glucose),
## data = Pima, family = binomial(), control = control)
###################################################
### code chunk number 26: pima-fitting-mob (eval = FALSE)
###################################################
##
## ## MOB-model fitted from 'glmtree' function
## mob.Pima <-
## glmtree(diabetes ~ glucose | pregnant + pressure + mass +
## pedigree + age, data = Pima, family = binomial())
##
## ## construct same model with 'tvcglm' function
## mob.Pima <-
## tvcglm(diabetes ~ -1 + vc(pregnant, pressure, mass, pedigree, age,
## by = glucose, intercept = TRUE),
## data = Pima, family = binomial(),
## control = tvcglm_control(sctest = TRUE, cv = FALSE, maxnumsplit = 10000))
###################################################
### code chunk number 27: pima-fig-tree
###################################################
## plot the tvcm model
grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 2, width = c(0.5, 0.5))))
pushViewport(viewport(layout.pos.col = 1))
grid.text("TVCM", y = unit(0.95, "npc"), gp = gpar(cex = 1.25))
pushViewport(viewport(
y = unit(0.45, "npc"),
width = unit(0.95, "npc"),
height = unit(0.875, "npc")))
grid.rect()
pushViewport(viewport(height = unit(0.9, "npc"), y = unit(0.5, "npc")))
pushViewport(viewport(layout = grid.layout(1, 2, width = c(0.65, 0.35))))
pushViewport(viewport(layout.pos.col = 1))
plot(vcm.Pima.1, part = 1, type = "coef",
newpage = FALSE)
grid.lines(unit(c(0.99, 0.99), "npc"),
unit(c(0.05, 0.95), "npc"),
gp = gpar(lty = 2, col = "black"))
upViewport(1)
pushViewport(viewport(layout.pos.col = 2))
plot(vcm.Pima.1, part = 2, type = "coef",
plot_gp = list(width = 1), main = "vc(pregnant, ...,\n by = glucose)",
newpage = FALSE)
upViewport(6)
## plot the MOB model
pushViewport(viewport(layout.pos.col = 2))
grid.text("MOB", y = unit(0.95, "npc"), gp = gpar(cex = 1.25))
pushViewport(viewport(
y = unit(0.45, "npc"),
width = unit(0.95, "npc"),
height = unit(0.875, "npc")))
grid.rect()
plot(mob.Pima, newpage = FALSE, tnex = 4.5,
main = "", margins = c(1.5, 0.75, 0, 0.75),
parm = list("(Intercept)", "glucose"))
upViewport(2)
###################################################
### code chunk number 28: pima-comp-code
###################################################
## Comment:
## The following code runs the comparison study if
## 'run = TRUE' or the R data file 'vcrpart-bench' is
## not available.
run <- FALSE
if (run | !"vcrpart-bench.RData" %in% dir("RData/") ) {
## compute the missclassification error
mce <- function(object, newdata) {
if ("party" %in% class(object)) {
mu <- predict(object, newdata = newdata, type = "response")
} else if ("rpart" %in% class(object)){
mu <- predict(object, newdata = newdata, type = "class")
} else {
mu <- predict(object, newdata = newdata)
}
mu <- if (is.factor(mu)) 1 * (mu == "pos") else round(mu)
y <- 1 * (newdata$diabetes == "pos")
return(sum(y != round(mu)) / nrow(newdata))
}
## compute the complexity of the TVCM model
npar.tvcm <- function(object) {
return(extractAIC(extract(object, "model"))[1] +
sum(sapply(object$info$node, width) - 1))
}
## Define formulas and control parameters
## --------------------------------------
## tvcm algorithm
f1.tvcm <- f3.tvcm <- f4.tvcm <- f5.tvcm <-
diabetes ~ -1 +
vc(pregnant, pressure, mass, pedigree, age) +
vc(pregnant, pressure, mass, pedigree, age, by = glucose)
f2.tvcm <-
diabetes ~ 1 + glucose +
vc(pregnant) + vc(pregnant, by = glucose) +
vc(pressure) + vc(pressure, by = glucose) +
vc(mass) + vc(mass, by = glucose) +
vc(pedigree) + vc(pedigree, by = glucose) +
vc(age) + vc(age, by = glucose)
c1.Pima <- c2.Pima <-
tvcm_control(papply.args = list(mc.cores = 6),
folds = folds_control(seed = 13))
c3.Pima <-
tvcm_control(minsize = 50,
papply.args = list(mc.cores = 6),
folds = folds_control(seed = 13))
c4.Pima <-
tvcm_control(mindev = 50,
papply.args = list(mc.cores = 6),
folds = folds_control(seed = 13))
c5.Pima <-
tvcm_control(papply.args = list(mc.cores = 6),
folds = folds_control(seed = 13),
maxnumsplit = 19)
## MOB algorithm
f.MOB <- diabetes ~ glucose | pregnant + pressure + mass + pedigree + age
# rpart and other algorithms
f.rpart <- diabetes ~ pregnant + glucose + pressure + mass + pedigree + age
## Fit models on all data
## ----------------------
algs <- c("TVCM1", "TVCM2", "TVCM3", "TVCM4", "TVCM5",
"MOB", "CTree", "CART", "LMT", "J4.8")
e.is <- c.is <- t.is <- rep(NA, length(algs))
names(e.is) <- names(c.is) <- names(t.is) <- algs
cat("\n * fitting the model to all data ... ")
## TVCM algorithm
ptm <- proc.time()
tvcm.Pima <- tvcm(f1.tvcm, data = Pima, family = binomial(), control = c1.Pima)
t.is["TVCM1"] <- (proc.time() - ptm)[3]
e.is["TVCM1"] <- mce(tvcm.Pima, Pima)
c.is["TVCM1"] <- npar.tvcm(tvcm.Pima)
ptm <- proc.time()
tvcm.Pima <- tvcm(f2.tvcm, data = Pima, family = binomial(), control = c1.Pima)
t.is["TVCM2"] <- (proc.time() - ptm)[3]
e.is["TVCM2"] <- mce(tvcm.Pima, Pima)
c.is["TVCM2"] <- npar.tvcm(tvcm.Pima)
ptm <- proc.time()
tvcm.Pima <- tvcm(f3.tvcm, data = Pima, family = binomial(), control = c3.Pima)
t.is["TVCM3"] <- (proc.time() - ptm)[3]
e.is["TVCM3"] <- mce(tvcm.Pima, Pima)
c.is["TVCM3"] <- npar.tvcm(tvcm.Pima)
ptm <- proc.time()
tvcm.Pima <- tvcm(f4.tvcm, data = Pima, family = binomial(), control = c4.Pima)
t.is["TVCM4"] <- (proc.time() - ptm)[3]
e.is["TVCM4"] <- mce(tvcm.Pima, Pima)
c.is["TVCM4"] <- npar.tvcm(tvcm.Pima)
ptm <- proc.time()
tvcm.Pima <- tvcm(f5.tvcm, data = Pima, family = binomial(), control = c5.Pima)
t.is["TVCM5"] <- (proc.time() - ptm)[3]
e.is["TVCM5"] <- mce(tvcm.Pima, Pima)
c.is["TVCM5"] <- npar.tvcm(tvcm.Pima)
## MOB algorithm
ptm <- proc.time()
MOB.Pima <- glmtree(formula = f.MOB, data = Pima, family = binomial)
t.is["MOB"] <- (proc.time() - ptm)[3]
e.is["MOB"] <- mce(MOB.Pima, Pima)
c.is["MOB"] <- 3 * width(MOB.Pima) - 1
## CTree algorithm
ptm <- proc.time()
CTree.Pima <- ctree(formula = f.rpart, data = Pima)
t.is["CTree"] <- (proc.time() - ptm)[3]
e.is["CTree"] <- mce(CTree.Pima, Pima)
c.is["CTree"] <- 2 * width(CTree.Pima) - 1
## CART algorithm
ptm <- proc.time()
rpart.Pima <- rpart(formula = f.rpart, data = Pima)
cp.rpart <- rpart.Pima$cptable[which.min(rpart.Pima$cptable[,"xerror"]),"CP"]
rpart.Pima <- prune(rpart.Pima, cp = cp.rpart)
t.is["CART"] <- (proc.time() - ptm)[3]
e.is["CART"] <- mce(rpart.Pima, Pima)
c.is["CART"] <- 2 * length(unique(rpart.Pima$where)) - 1
require("RWeka")
## LMT algorithm
ptm <- proc.time()
LMT.Pima <- LMT(formula = f.rpart, data = Pima)
t.is["LMT"] <- (proc.time() - ptm)[3]
e.is["LMT"] <- mce(LMT.Pima, Pima)
c.is["LMT"] <- 2 * width(as.party(LMT.Pima)) - 1
## J4.8 algorithm
ptm <- proc.time()
J48.Pima <- J48(formula = f.rpart, data = Pima)
t.is["J4.8"] <- (proc.time() - ptm)[3]
e.is["J4.8"] <- mce(J48.Pima, Pima)
c.is["J4.8"] <- 2 * width(as.party(J48.Pima)) - 1
cat("OK")
## Performance test
## ----------------
cat("\n * starting performance test ... ")
nsim <- 250
folds <- folds_control(type = "bootstrap", K = nsim, seed = 123)
folds <- vcrpart:::tvcm_folds(tvcm.Pima, folds)
dn <- list(1:nsim, algs)
e.boot <- c.boot <- t.boot <- matrix(, nsim, length(algs), dimnames = dn)
rm(tvcm.Pima, MOB.Pima, CTree.Pima, rpart.Pima, LMT.Pima, J48.Pima)
for (i in 1:nsim) {
cat("\n\tfold", i, "... ")
## extract training data
subs <- rep(1:nrow(Pima), folds[, i])
training <- Pima[subs, ]
## extract validation data
subs <- folds[,i] == 0
validation <- Pima[subs, ]
c1.Pima$folds$seed <- i
c3.Pima$folds$seed <- i
c4.Pima$folds$seed <- i
c5.Pima$folds$seed <- i
## tvcm algorithm
ptm <- proc.time()
tvcmB.Pima <-
try(tvcm(f1.tvcm, data = training, family = binomial(),
control = c1.Pima),
silent = TRUE)
if (!inherits(tvcmB.Pima, "try-error")) {
t.boot[i, "TVCM1"] <- (proc.time() - ptm)[3]
e.boot[i, "TVCM1"] <- mce(tvcmB.Pima, validation)
c.boot[i, "TVCM1"] <- npar.tvcm(tvcmB.Pima)
}
ptm <- proc.time()
tvcmB.Pima <-
try(tvcm(f2.tvcm, data = training, family = binomial(),
control = c2.Pima),
silent = TRUE)
if (!inherits(tvcmB.Pima, "try-error")) {
t.boot[i, "TVCM2"] <- (proc.time() - ptm)[3]
e.boot[i, "TVCM2"] <- mce(tvcmB.Pima, validation)
c.boot[i, "TVCM2"] <- npar.tvcm(tvcmB.Pima)
}
ptm <- proc.time()
tvcmB.Pima <-
try(tvcm(f1.tvcm, data = training, family = binomial(),
control = c3.Pima),
silent = TRUE)
if (!inherits(tvcmB.Pima, "try-error")) {
t.boot[i, "TVCM3"] <- (proc.time() - ptm)[3]
e.boot[i, "TVCM3"] <- mce(tvcmB.Pima, validation)
c.boot[i, "TVCM3"] <- npar.tvcm(tvcmB.Pima)
}
ptm <- proc.time()
tvcmB.Pima <-
try(tvcm(f1.tvcm, data = training, family = binomial(),
control = c4.Pima),
silent = TRUE)
if (!inherits(tvcmB.Pima, "try-error")) {
t.boot[i, "TVCM4"] <- (proc.time() - ptm)[3]
e.boot[i, "TVCM4"] <- mce(tvcmB.Pima, validation)
c.boot[i, "TVCM4"] <- npar.tvcm(tvcmB.Pima)
}
ptm <- proc.time()
tvcmB.Pima <-
try(tvcm(f1.tvcm, data = training, family = binomial(),
control = c5.Pima),
silent = TRUE)
if (!inherits(tvcmB.Pima, "try-error")) {
t.boot[i, "TVCM5"] <- (proc.time() - ptm)[3]
e.boot[i, "TVCM5"] <- mce(tvcmB.Pima, validation)
c.boot[i, "TVCM5"] <- npar.tvcm(tvcmB.Pima)
}
## MOB algorithm
ptm <- proc.time()
MOBB.Pima <- try(glmtree(formula = f.MOB, data = training,
family = binomial),
silent = TRUE)
t.boot[i, "MOB"] <- (proc.time() - ptm)[3]
e.boot[i, "MOB"] <- mce(MOBB.Pima, validation)
c.boot[i, "MOB"] <- 3 * width(MOBB.Pima) - 1
## CTree algorithm
ptm <- proc.time()
CTreeB.Pima <- try(ctree(formula = f.rpart, data = training), silent = TRUE)
if (!inherits(CTreeB.Pima, "try-error")) {
t.boot[i, "CTree"] <- (proc.time() - ptm)[3]
e.boot[i, "CTree"] <- mce(CTreeB.Pima, validation)
c.boot[i, "CTree"] <- 2 * width(CTreeB.Pima) - 1
}
## CART algorithm
ptm <- proc.time()
rpartB.Pima <- try(rpart(formula = f.rpart, data = training), silent = TRUE)
if (!inherits(rpartB.Pima, "try-error")) {
cpB.Pima <-
rpartB.Pima$cptable[which.min(rpartB.Pima$cptable[,"xerror"]),"CP"]
rpartB.Pima <- prune(rpartB.Pima, cp = cpB.Pima)
t.boot[i, "CART"] <- (proc.time() - ptm)[3]
e.boot[i, "CART"] <- mce(rpartB.Pima, validation)
c.boot[i, "CART"] <- 2 * length(unique(rpartB.Pima$where)) - 1
}
## LMT algorithm
ptm <- proc.time()
LMTB.Pima <- try(LMT(formula = f.rpart, data = training), silent = TRUE)
if (!inherits(LMTB.Pima, "try-error")) {
t.boot[i, "LMT"] <- (proc.time() - ptm)[3]
e.boot[i, "LMT"] <- mce(LMTB.Pima, validation)
c.boot[i, "LMT"] <- 2 * width(as.party(LMTB.Pima)) - 1
}
## J4.8 algorithm
ptm <- proc.time()
J48B.Pima <- try(J48(formula = f.rpart, data = training), silent = TRUE)
if (!inherits(J48B.Pima, "try-error")) {
t.boot[i, "J4.8"] <- (proc.time() - ptm)[3]
e.boot[i, "J4.8"] <- mce(J48B.Pima, validation)
c.boot[i, "J4.8"] <- 2 * width(as.party(J48B.Pima)) - 1
}
rm(tvcmB.Pima, MOBB.Pima, CTreeB.Pima, rpartB.Pima, LMTB.Pima, J48B.Pima)
gc(verbose = FALSE)
cat("OK")
}
rm(f1.tvcm, f2.tvcm, f.MOB, f.rpart,
c1.Pima, c2.Pima, c3.Pima, c4.Pima, c5.Pima,
ptm, dn)
save(algs, folds, nsim,
t.is, e.is, c.is,
t.boot, e.boot, c.boot,
file = "RData/vcrpart-bench.RData")
} else {
load(file = "RData/vcrpart-bench.RData")
}
###################################################
### code chunk number 29: pima-MOB-spinogram (eval = FALSE)
###################################################
##
## ## original plot of MOB model
## f.MOB <- diabetes ~ glucose | pregnant + pressure + mass + pedigree + age
## MOB.Pima <- glmtree(formula = f.MOB, data = Pima, family = binomial)
## plot(MOB.Pima)
###################################################
### code chunk number 30: pima-table-comp
###################################################
subs <- intersect(1:250, which(!is.na(e.boot[,1])))
e.boot <- e.boot[subs, ]
c.boot <- c.boot[subs, ]
t.boot <- t.boot[subs, ]
tab.Pima <- data.frame("Boot" = apply(e.boot, 2, median, na.rm = TRUE),
"Boot-Mean" = apply(e.boot, 2, mean, na.rm = TRUE),
"Orig" = e.is,
"Boot" = apply(c.boot, 2, median, na.rm = TRUE),
"Orig" = c.is,
"Boot" = apply(t.boot, 2, median, na.rm = TRUE),
"Orig" = t.is,
check.names = FALSE)
cap <- c("Performances for the \\code{Pima} data. Boot: Medians (and means under Boot-Mean) of results from fits on 250 bootstrap samples. Orig: Results on the original data. Misclassification: Misclassification errors. Complexity: The number of coefficients plus the number of splits. Time: Computation time in seconds.", "Performances for the \\code{Pima} data")
algs2 <- algs
algs2[1:5] <- c("TVCM (default)",
"TVCM (additive)",
"TVCM ($N_0 = 50$)",
"TVCM ($D_{min} = 50$)",
"TVCM ($N_S = 19$)")
algs2[8L] <- "RPART"
rownames(tab.Pima) <- algs2
rownames(tab.Pima)[1L] <- "TVCM"
xtab.Pima <- xtable(x = tab.Pima,
caption = cap,
label = "vcrpart-tab:pima-performance",
align = "lrrr|rr|rr",
digits = c(0, 3, 3, 3, 0, 0, 2, 2),
display = c("s", "f", "f", "f", "d", "d", "f", "f"))
addtorow <- list(pos = list(-1),
command = "\\hline & \\multicolumn{3}{c}{Misclassification} & \\multicolumn{2}{c}{Complexity} & \\multicolumn{2}{c}{Time} \\\\")
print(x = xtab.Pima,
table.placement = "tbp",
caption.placement = "bottom",
latex.environments = "center",
sanitize.text.function = function(x) x,
booktabs = FALSE,
hline.after = c(0, nrow(xtab.Pima)),
add.to.row = addtorow, size="\\small")
###################################################
### code chunk number 31: pima-fig-comp
###################################################
nsim <- length(subs)
algs3 <- algs
algs3[2] <- expression(paste("TVCM (additive)", sep = ""))
algs3[3] <- expression(paste("TVCM (", N[0], "= 50)", sep = ""))
algs3[4] <- expression(paste("TVCM (", D[min], "= 50)", sep = ""))
algs3[5] <- expression(paste("TVCM (", N[S], " = 19)", sep = ""))
algs3[8] <- "RPART"
par(mfrow = c(1,2), mgp = c(2.25, 1, 0), mar = c(3.5, 8, 0.5, 2))
e.boot.long <-
data.frame(MCE = c(e.boot),
ALG = factor(rep(algs, each = nsim), levels = algs))
mod <- lm(MCE ~ ALG, e.boot.long)
ci <- confint(mod)
plot(coef(mod)[-1], length(coef(mod)[-1]):1, pch = 16,
xlim = c(-0.01, 0.045), axes = FALSE, ylab = "",
xlab = "Misclassification difference")
axis(1); axis(2, length(coef(mod)[-1]):1, algs3[-1], las = 2); box();
abline(v = 0, lty = 2)
arrows(ci[-1,1], length(coef(mod)[-1]):1,
ci[-1,2], length(coef(mod)[-1]):1, angle = 90, code = 3, length = 0.075)
c.boot.long <-
data.frame(DF = c(c.boot),
ALG = factor(rep(algs, each = nsim), levels = algs))
mod <- lm(DF ~ ALG, c.boot.long)
ci <- confint(mod)
plot(coef(mod)[-1], length(coef(mod)[-1]):1,
axes = FALSE, ylab = "", pch = 16,
xlab = "Complexity difference", xlim = c(-12,80))
axis(1); axis(2, length(coef(mod)[-1]):1, algs3[-1], las = 2); box();
abline(v = 0, lty = 2)
arrows(ci[-1,1], length(coef(mod)[-1]):1, ci[-1,2], length(coef(mod)[-1]):1,
angle = 90, code = 3, length = 0.075)
###################################################
### code chunk number 32: vcrpart-simulations-code
###################################################
## Comment:
## The following code runs the simulation study if
## 'run = TRUE' or the R data file 'RData/vcrpart-simulations.RData' is
## not available.
run <- FALSE
## function to generate the data
generate.data <- function(N = 200) {
rval <- data.frame(x = rnorm(N),
z0 = rbinom(N, 1, 0.5),
z1 = rbinom(N, 1, 0.5),
z2 = rbinom(N, 1, 0.5),
z3 = rbinom(N, 1, 0.5),
z4 = rbinom(N, 1, 0.5),
z5 = rbinom(N, 1, 0.5))
rval$group <- factor(paste(rval$z0,rval$z1, sep = "-"))
levels(rval$group) <- c("1", "1", "2", "3")
rval$y1 <- model.matrix(~ z0 + x + z1:x, rval) %*% c(-1, 2, 1, -1) +
rnorm(N)
return(rval)
}
## function that executes a single simulation run
simFun <- function(N = 100) {
cat(".")
rval <- matrix(FALSE, 2, 4)
z <- c("z0", "z1", "z2", "z3", "z4", "z5")
## set control according to sample size
control.sim <- tvcglm_control(apply.args = list(mc.cores = 6),
mindev = min(N / 200, 2),
minsize = min(N / 20, 30))
## generate data
dat <- generate.data(N = N)
## fit the models
m1 <- tvcglm(y1 ~ - 1 + vc(z) + vc(z, by = x),
family = gaussian(), data = dat,
control = control.sim)
m2 <- tvcglm(y1 ~ - 1 + vc(z, by = x, intercept = TRUE),
family = gaussian(), data = dat,
control = control.sim)
n1 <- extract(m1, "nodes")
n2 <- extract(m2, "nodes")
## check whether the algorithms found true structure
rval[1,1] <-
(width(n1$A) == 2 &&
unlist(nodeapply(n1$A, 1, function(x) x$split$varid)) == 1) &
(width(n1$B) == 2 &&
unlist(nodeapply(n1$B, 1, function(x) x$split$varid)) == 2)
rval[2,1] <-
width(n2$A) == 4 &&
(all(unlist(nodeapply(n2$A, c(1,2,5),
function(x) x$split$varid)) %in% c(1,2,2)) |
all(unlist(nodeapply(n2$A, c(1,2,5),
function(x) x$split$varid)) %in% c(2,1,1)))
## check whether the fits include the true structure as a nested model
rval[1,2] <-
(width(n1$A) > 1 &&
unlist(nodeapply(n1$A, 1, function(x) x$split$varid)) == 1) &
(width(n1$B) > 1 &&
unlist(nodeapply(n1$B, 1, function(x) x$split$varid)) == 2)
rval[2,2] <-
width(n2$A) >= 4 &&
(all(unlist(nodeapply(n2$A, c(1,2,5),
function(x) x$split$varid)) %in% c(1,2,2)) |
all(unlist(nodeapply(n2$A, c(1,2,5),
function(x) x$split$varid)) %in% c(2,1,1)))
## check whether true effect modifiers were chose
rval[1,3] <- "z0" %in% extract(m1, "selected")[[1]] &
"z1" %in% extract(m1, "selected")[[2]]
rval[2,3] <- all(c("z0", "z1") %in% extract(m2, "selected")[[1]])
## check whether noise variables were chosen
rval[1,4] <- sum(!unique(unlist(extract(m1, "selected"))) %in% c("z0", "z1"))
rval[2,4] <- sum(!unique(unlist(extract(m2, "selected"))) %in% c("z0", "z1"))
return(c(rval))
}
if (run | !"vcrpart-simulations.RData" %in% dir("RData/")) {
nsim <- 2000 # the number of simulation runs
N <- c(50, 100, 150, 200, 250, 300, 350, 400, 450, 500)
par <- rep(N, each = nsim)
res <- mclapply(X = par, simFun, mc.cores = 1)
save(res, par, N, nsim, file = "RData/vcrpart-simulations.RData")
} else {
load(file = "RData/vcrpart-simulations.RData")
}
###################################################
### code chunk number 33: vcrpart-simulations-arbitrarydata (eval = FALSE)
###################################################
##
## ## generate an arbitrary simulation data set with 100 observations
## N <- 100
## simdata <- generate.data(N = N)
## simdata$y <- simdata$y1
###################################################
### code chunk number 34: vcrpart-simulations-moderators (eval = FALSE)
###################################################
##
## z <- c("z0", "z1", "z2", "z3", "z4", "z5")
## control.sim <-
## tvcglm_control(mindev = min(N / 200, 2), minsize = min(N / 20, 30))
###################################################
### code chunk number 35: vcrpart-simulations-models-1 (eval = FALSE)
###################################################
##
## vcm.sim.1 <- # coefficient-wise partitions model
## tvcglm(y ~ -1 + vc(z) + vc(z, by = x), data = simdata,
## family = gaussian(), control = control.sim)
###################################################
### code chunk number 36: vcrpart-simulations-models-2 (eval = FALSE)
###################################################
##
## vcm.sim.2 <- # single partition model
## tvcglm(y ~ -1 + vc(z, by = x, intercept = TRUE),
## data = simdata, family = gaussian(), control = control.sim)
###################################################
### code chunk number 37: pima-fig-simulations
###################################################
res <- sapply(res, I)
res <- apply(res, 1, function(x) tapply(x, par, function(x) sum(x) / length(x)))
par(mfrow = c(1, 4), mgp = c(2, 1, 0), mar = c(3, 3, 3, 1))
matplot(x = N, y = res[, 1:2], type = "b", col = "black",
ylab = "Proportion of runs", ylim = c(0, 1),
main = "Identified\n underlying model",
pch = 1:2, xlab = "Number of observations")
abline(h = c(0, 1), lty = 3)
matplot(x = N, y = res[, 3:4], type = "b", col = "black",
ylab = "Proportion of runs",
ylim = c(0, 1), main = "Nested\n underlying model",
pch = 1:2, xlab = "Number of observations")
abline(h = c(0, 1), lty = 3)
matplot(x = N, y = res[, 5:6], type = "b", col = "black",
ylab = "Proportion of runs",
ylim = c(0, 1), main = "True moderators\n selected",
pch = 1:2, xlab = "number of observations")
abline(h = c(0, 1), lty = 3)
matplot(x = N, y = res[, 7:8], type = "b", col = "black",
ylab = "Proportion of runs",
ylim = c(0, 1), main = "False\n variable selections",
pch = 1:2, xlab = "Number of observations")
###################################################
### code chunk number 38: school-data
###################################################
library("Ecdat")
data("Schooling")
Schooling <- Schooling[c(19, 21, 7, 28, 9, 14, 17, 18, 20, 23, 2, 4)]
Schooling$black <- 1 * (Schooling$black == "yes")
###################################################
### code chunk number 39: school-IV
###################################################
Schooling$ed76.IV <- fitted(lm(ed76 ~ nearc4, Schooling))
###################################################
### code chunk number 40: school-glm
###################################################
lm.School <- lm(lwage76 ~ ed76.IV + exp76 + I(exp76^2) + black,
data = Schooling)
###################################################
### code chunk number 41: school-glm
###################################################
summary(lm.School)$coef[, 1:3]
###################################################
### code chunk number 42: school-formula
###################################################
f.School <- lwage76 ~ -1 + ed76.IV + exp76 + I(exp76^2) +
vc(age76, momdad14, south66, south76, famed, enroll76, smsa76) +
vc(ed76.IV, exp76, age76, momdad14, south66,
south76, famed, enroll76, smsa76, by = black)
###################################################
### code chunk number 43: school-fitting (eval = FALSE)
###################################################
##
## vcm.School <- tvcglm(formula = f.School, data = Schooling,
## family = gaussian(), control = control)
###################################################
### code chunk number 44: school-fig-cv
###################################################
par(mgp = c(2, 1, 0), mar = c(3,3,0.1,0.1))
plot(vcm.School, "cv",
xlab = expression(lambda),
ylab = "Validation error",
legend = FALSE)
###################################################
### code chunk number 45: school-fig-tree
###################################################
grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 2, widths = c(0.72,0.28))))
pushViewport(viewport(layout.pos.col = 1))
plot(vcm.School, type = "coef", part = 1, nobs = FALSE,
newpage = FALSE, gp = gpar(fontsize = 11))
upViewport()
pushViewport(viewport(layout.pos.col = 2))
plot(vcm.School, type = "coef", part = 2,
newpage = FALSE)
upViewport()
upViewport()
###################################################
### code chunk number 46: school-nonvarcoef
###################################################
coef(vcm.School)$fe
###################################################
### code chunk number 47: pl-data
###################################################
data("PL")
###################################################
### code chunk number 48: pl-glm
###################################################
glm.PL <- glm(uncj10 ~ july, data = PL, family = binomial)
###################################################
### code chunk number 49: app-glm-coef
###################################################
summary(glm.PL)$coefficients[, 1:3]
###################################################
### code chunk number 50: pl-formula
###################################################
f.PL <- uncj10 ~ 1 + july + vc(age) + vc(age, by = july) +
vc(workExp) + vc(workExp, by = july) +
vc(unEmpl) + vc(unEmpl, by = july) +
vc(laborEarnings) + vc(laborEarnings, by = july) +
vc(whiteCollar) + vc(whiteCollar, by = july) +
vc(wage) + vc(wage, by = july) +
vc(industry.SL) + vc(industry.SL, by = july) +
vc(region.SL) + vc(region.SL, by = july)
###################################################
### code chunk number 51: pl-fitting (eval = FALSE)
###################################################
##
## vcm.PL <- tvcglm(formula = f.PL, family = binomial(),
## data = PL, control = control)
###################################################
### code chunk number 52: pl-fig-cv
###################################################
par(mgp = c(2, 1, 0), mar = c(3,3,0.1,0.1))
plot(vcm.PL, "cv",
xlab = expression(lambda),
ylab = "Validation error",
legend = FALSE)
###################################################
### code chunk number 53: pl-fig-tree
###################################################
fac <- 0.9
subs <- which(width(vcm.PL) > 1)
ylim1 <- c(-1, 0.8)
ylim2 <- c(-1.4, 0.5)
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow = 2, ncol = 3,
widths = c(1/3, 1/3, 1/3))))
pushViewport(viewport(layout.pos.col = 1, layout.pos.row = 1))
plot(vcm.PL, type = "coef", part = subs[1], plot_gp = list(ylim = ylim1),
newpage = FALSE)
upViewport()
pushViewport(viewport(layout.pos.col = 1, layout.pos.row = 2))
plot(vcm.PL, type = "coef", part = subs[2], plot_gp = list(ylim = ylim2),
newpage = FALSE)
upViewport()
pushViewport(viewport(layout.pos.col = 2, layout.pos.row = 1))
plot(vcm.PL, type = "coef", part = subs[3], plot_gp = list(ylim = ylim1),
newpage = FALSE)
upViewport()
pushViewport(viewport(layout.pos.col = 3, layout.pos.row = 1))
plot(vcm.PL, type = "coef", part = subs[4], plot_gp = list(ylim = ylim1),
newpage = FALSE)
upViewport()
pushViewport(viewport(layout.pos.col = 2, layout.pos.row = 2))
plot(vcm.PL, type = "coef", part = subs[5], plot_gp = list(ylim = ylim2),
newpage = FALSE)
upViewport()
upViewport()
###################################################
### code chunk number 54: pl-coef
###################################################
## extract coefficients for text
effGL <- coef(vcm.PL)$fe[2]
effWED1 <- coef(vcm.PL)$vc$C[1]
effWED2 <- coef(vcm.PL)$vc$C[2]
effWEM1 <- coef(vcm.PL)$vc$D[1]
effREM1 <- coef(vcm.PL)$vc$P[1]
effREM3 <- coef(vcm.PL)$vc$P[3]
###################################################
### code chunk number 55: applicatons-save (eval = FALSE)
###################################################
##
## ## save all fitted models, those being commented out by default
## if (run | !"vcrpart-applications.RData" %in% dir("RData/")) {
## save(UCBA, cv.UCBA, vcm.UCBA,
## Pima, vcm.Pima.1, vcm.Pima.2, mob.Pima,
## Schooling, vcm.School,
## PL, vcm.PL,
## file = "RData/vcrpart-applications.RData")
## }
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.