# inst/doc/resampling.R In gamclass: Functions and Data for a Course on Modern Regression and Classification

```## ----setup, cache=FALSE, echo=FALSE-----------------------------------
library(knitr)
options(replace.assign=FALSE,width=72)
opts_chunk\$set(fig.path='figs/key-', cache.path='cache/key-',
fig.align='center', fig.width=3.5,
fig.height=3.5, fig.show='hold', par=TRUE,
tidy=FALSE,  comment=NA)
knit_hooks\$set(par=function(before, options, envir){
if (before && options\$fig.show!='none') par(mar=c(4,4,2.6,.1),
cex.lab=.95,cex.axis=.9,mgp=c(2,.7,0),tcl=-.3)
}, crop=hook_pdfcrop)
oldopt <- options(digits=4)

## ----cap123-----------------------------------------------------------
cap1 <- "Heart weight versus body weight of 97 male cats.
A regression line has been fitted."
cap2 <- "Q-Q plot ('Normal Probability plot') of residuals from
the regression of heart weight on body weight, for the male cats."
cap3 <- "Scatterplot matrix of bootstrap estimates of `Slope`
versus `Intercept`, with boxplots used to summarize the marginal
distributions."
cap4 <- "Bootstrap resampling result, compared with simulation
resampling."

## ----cats-xy, fig.width=4.25, fig.height=4.25, fig.cap=cap1-----------
library(lattice)
mcats <- subset(MASS::cats, Sex=="M")
xyplot(Hwt ~ Bwt, data=mcats, type=c("p","r"))

## ----mcats-lm---------------------------------------------------------
mcats.lm <- lm(Hwt~Bwt, data=mcats)
signif(coef(summary(mcats.lm)),3)

## ----cats-qq, fig.cap=cap2--------------------------------------------
res <- resid(mcats.lm)
qqnorm(res)

## ----outlier-num}, eval=TRUE, echo=TRUE-------------------------------
nout <- (1:nrow(mcats))[c(which.min(res),which.max(res))]

## ----boot-regcoefs, fig.cap=cap3, fig.width=2.85, fig.height=3--------
library(gamclass, quietly=TRUE)
library(car, quietly=TRUE)
mcats <- subset(MASS::cats, Sex=="M")
bootmat <- bootreg(Hwt ~ Bwt,
data = mcats,
nboot = 1000)
bootdf <- as.data.frame(bootmat)
names(bootdf) <- c("Intercept",
"Slope")
alpha.f=0.25)
scatterplot(Slope ~ Intercept,
col=colr, data=bootdf,
boxplots="xy",
regLine=NA,
smooth=FALSE)

## ----95CI-------------------------------------------------------------
sapply(bootdf, quantile, prob=c(.025, .975))

## ----alt-boot-omitOutlier---------------------------------------------
library(lattice)
bootmat <- bootreg(formula = Hwt ~ Bwt,
data = mcats[-97, ],
nboot = 1000)
bootdf0 <- as.data.frame(bootmat)
names(bootdf0) <- c("Intercept","Slope")
gphA <- xyplot(Slope ~ Intercept, data=bootdf0, alpha=0.25)

## ----alt-sim----------------------------------------------------------
simmat <- simreg(formula = Hwt ~ Bwt,
data=mcats[-97, ], nsim=1000)
simdf <- as.data.frame(simmat)
names(simdf) <- c("Intercept","Slope")
gphB <- xyplot(Slope ~ Intercept, data=simdf, alpha=0.25)

## ----alt-resamp, out.width='47%', fig.show='hold', fig.width=3.85, fig.height=3.85, fig.cap=cap4----
update(gphA, main=grid::textGrob("A: Bootstrap (outlier omitted)",
x=grid::unit(.05, "npc"),
y = grid::unit(.25, "npc"), just="left",
gp=grid::gpar(cex=1)))
update(gphB, main=grid::textGrob("B: Simulation",
x = grid::unit(.05, "npc"),
y = grid::unit(.25, "npc"), just="left",
gp=grid::gpar(cex=1)))

## ----cap5-------------------------------------------------------------
cap5 <- "Regression tree for predicting `Mileage` given
`Weight`. At each node, observations for which the
criterion is satisfied take the branch to the left.  Thus,
at the first node, `tonsWt\$>=\$1.218` chooses the branch to
the left, while `tonsWt\$<\$1.218` chooses the branch to the
right.  Panel B plots `Mileage` versus `tonsWt`, with
fitted values from the `rpart` model shown as horizontal
grey lines."

## ----rpart------------------------------------------------------------
library(rpart)
Car90 <- na.omit(car90[, c("Mileage","Weight")])
## Express weight in metric tonnes
Car90 <- within(Car90, tonsWt <- Weight/2240)
car90.rpart <- rpart(Mileage ~ tonsWt, data=Car90)

## ----deftree, out.width='48%', fig.cap=cap5,fig.width=5.25, fig.height=5.25, fig.show='hold'----
## Code for Panel A
plot(car90.rpart)
text(car90.rpart, xpd=TRUE, digits=3)
mtext(side=3, line=1.25, "A: Regression tree", adj=0, cex=1.25)
## Code for Panel B
plot(Mileage ~ tonsWt, data=Car90, fg='gray')
wt <- with(Car90, tonsWt)
hat <- predict(car90.rpart)
gamclass::addhlines(wt, hat, lwd=2, col="gray")
mtext(side=3, line=1.25, "B: Predicted values from tree", adj=0, cex=1.25)

## ----cap6-------------------------------------------------------------
cap6 <- "Between group sum of squares for `Mileage`, as a
function of the value of `tonsWt` at which the split is
made. The choice \$c = 1.218\$ maximizes the between groups
sum of squares."

## ----bss-plot, fig.width=5, h=4.25, out.width="60%", fig.cap=cap6-----
## Code
BSS <- bssBYcut(tonsWt, Mileage, Car90)
with(BSS, plot(xOrd, bss, xlab="Cutpoint", fg='gray', ylab=""))
mtext(side=2, line=1.5, "Between groups\nsum of squares")
abline(v=1.218, lty=2)

## ----cap7, out.width='80%'--------------------------------------------
cap7 <- "Scatterplot matrix of accuracies for the several models.
The line \$y=x\$ is shown in each panel.
Note that `rfOOB` is out-of-bag accuracy, i.e., calculated from
the set of 95 observations, and that `rfTest` is accuracy on the
test data, again for a random forest model with no preliminary
smoothing. Results from hybrid models are labeled according to
the name of the formula for the smooth.   The final accuracy,
evaluated on the test data, is for a random forest model
fitted to residuals from the smooth"

## ----meuse-setup------------------------------------------------------
data("meuse", package="sp")
meuse <- within(meuse, {levels(soil) <- c("1","2","2")
ffreq <- as.numeric(ffreq)
})

## ----gam-formulae-----------------------------------------------------
form1 <- ~ dist + elev + soil + ffreq
form3 <- ~ s(dist, k=3) + s(elev,k=3) + soil +ffreq
form3x <- ~ s(dist, k=3) + s(elev,k=3) + s(x, k=3) + soil+ffreq
form8x <- ~ s(dist, k=8) + s(elev,k=8) + s(x, k=8) + soil+ffreq
formlist <- list("Hybrid1"=form1, "Hybrid3"=form3, "Hybrid3x"=form3x,
"Hybrid8x"=form8x)

## ----rfgam-setup------------------------------------------------------
rfVars <- c("dist", "elev", "soil", "ffreq", "x", "y")
nrep <- 100
errsmat <- matrix(0, nrep, length(formlist)+2)
dimnames(errsmat)[[2]] <- c(names(formlist), "rfTest", "rfOOB")

## ----rfgam-many-------------------------------------------------------
n <- 95
for(i in 1:nrep){
sub <- sample(1:nrow(meuse), n)
meuseOut <- meuse[-sub,]
meuseIn <- meuse[sub,]
errsmat[i, ] <- gamclass::gamRF(formlist=formlist, yvar="loglead",
rfVars=rfVars, data=meuseIn,
newdata=meuseOut, printit=FALSE)
}

## ----cf-models-rfgam, fig.cap=cap7, fig.width=7.5, fig.height=7.5, out.width='65%'----
## Code for scatterplot of results
ran <- range(errsmat)
at <- round(ran+c(0.02,-0.02)*diff(ran),2)
lis <- list(limits=ran, at=at, labels=format(at, digits=2))
lims=list(lis,lis,lis,lis,lis,lis)
library(lattice)
splom(errsmat,
pscales=lims,
par.settings=simpleTheme(cex=0.75),
panel=function(x,y,...){lpoints(x,y,...)
panel.abline(0,1,col="gray")}
)

## ----cf-accs-rfgam----------------------------------------------------
errdf <- stack(as.data.frame(errsmat[,-6]))
names(errdf) <- c("errs","model")
errdf\$model <- relevel(errdf\$model, ref="rfTest")
errdf\$sample <- factor(rep(1:nrow(errsmat),5))
errors.aov <- aov(errs ~ model+sample, data=errdf)
round(coef(summary.lm(errors.aov))[1:5,], 4)

## ----cf-accs-rfgam-ests-----------------------------------------------
model.tables(errors.aov, type="mean", cterms='model')

## ----cuckoos-caps-----------------------------------------------------
cap8 <- "Length versus breadth, compared between cuckoo eggs laid in
the different host nests. Axes for the scores are overlaid."
cap9 <- "Plot of length versus breadth of cuckoo eggs, now showing
a boundary line for distinguishing `wren` from `not-wren`,
based on the `lda()` analysis."

## ----cuckoos-lda------------------------------------------------------
cuckoos <- within(DAAG::cuckoos,
levels(species) <- abbreviate(levels(species), 8))
cuckoos.lda <- MASS::lda(species ~ length + breadth, data=cuckoos)

## ----cuckoo-basic-gph-------------------------------------------------
gph <- xyplot(length ~ breadth, groups=species, data=cuckoos,
type=c("p"), auto.key=list(space="right"), aspect=1,
scales=list(tck=0.5), par.settings=simpleTheme(pch=16))

## ----cuckoos-sc, fig.cap=cap8,out.width='70%', fig.width=6.25, fig.height=4.75----
## library(latticeExtra)  # This package has the function layer()
LDmat <- cuckoos.lda\$scaling
ld1 <- LDmat[,1]
ld2 <- LDmat[,2]
gm <- sapply(cuckoos[, c("length", "breadth")], mean)
av1 <- gm[1] + ld1[2]/ld1[1]*gm[2]
av2 <- gm[1] + ld2[2]/ld2[1]*gm[2]
addlayer <- latticeExtra::layer(panel.abline(av1, -ld1[2]/ld1[1], lty=1),
panel.abline(av2, -ld2[2]/ld2[1], lty=2))

## ----calc-scores------------------------------------------------------
gm <- apply(cuckoos.lda\$means*cuckoos.lda\$prior,2,sum)
# Grand 'means'
## Sweep out grand 'means'
centered <- sweep(as.matrix(cuckoos[,1:2]), 2, gm)

## ----loo-cv-acc-------------------------------------------------------
## Leave-one-out cross-validation
## Accuracies for linear discriminant analysis
cuckooCV.lda <- MASS::lda(species ~ length + breadth,
data=cuckoos, CV=TRUE)
confusion(cuckoos\$species, cuckooCV.lda\$class,
gpnames=abbreviate(levels(cuckoos\$species), 8))
## Post-multiply by scaling matrix
scores <- centered %*% cuckoos.lda\$scaling

## ----qda-loo-cv-acc---------------------------------------------------
## Accuracies for quadratic discriminant analysis
cuckooCV.qda <- MASS::qda(species ~ length + breadth,
data=cuckoos, CV=TRUE)
acctab <-confusion(cuckoos\$species, cuckooCV.qda\$class,
gpnames=levels(cuckoos\$species),
printit=FALSE)\$confusion
tab <- table(cuckoos\$species)
##
## Overall accuracy
cat("Overall accuracy =",
sum(diag(acctab)*tab)/sum(tab), "\n")
## Confusion matrix
round(acctab, 3)

## ----CVfalse----------------------------------------------------------
cuckoos.lda <- MASS::lda(species ~ length + breadth, data=cuckoos)

## ----calc-discrim-----------------------------------------------------
x <- pretty(cuckoos\$breadth, 20)
y <- pretty(cuckoos\$length, 20)
Xcon <- expand.grid(breadth=x, length=y)
cucklda.pr <- predict(cuckoos.lda, Xcon)\$posterior

## ----cuckoos-contour, fig.cap=cap9,out.width='70%', fig.width=6.25, fig.height=4.75----
m <- match("wren", colnames(cucklda.pr))
ldadiff <- apply(cucklda.pr, 1, function(x)x[m]-max(x[-m]))
at=c(-1,0,1), labels=c("", "lda",""),
label.style="flat", col='gray',
data=Xcon), axes=FALSE)

## ----cf-acc-lda-qda---------------------------------------------------
library(gamclass, quietly=TRUE)
cucklda.pr <- cuckooCV.lda\$posterior
cucklda.pr2 <- MASS::lda(species ~ length + breadth + I(length^2)
data=cuckoos)\$posterior
compareModels(groups=cuckoos\$species,
estprobs=list(lda=cucklda.pr,
"lda plus"=cucklda.pr2),
gpnames=abbreviate(levels(cuckoos\$species),8))

## ----getdata-spam-----------------------------------------------------
data(spam, package='kernlab')
spam[,-58] <- scale(spam[,-58])
nr <- sample(1:nrow(spam))
spam0 <- spam[nr[1:2601],]      ## Training
spam1 <- spam[nr[2602:3601],]   ## Holdout
spam01 <- spam[nr[1:3601],]     ## Use for training,
## if holdout not needed
spam2 <- spam[nr[3602:4601],]   ## Test

## ----lda-rates--------------------------------------------------------
library('MASS')
spam01.lda <- lda(type~., data=spam01)
ldaRates <- ldaErr(train.lda=spam01.lda, train=spam01, test=spam2,
group='type')

## ----rpart-dofun------------------------------------------------------
set.seed(29)   ## Make results precisely reproducible
spam01.rp <- rpart::rpart(type~., data=spam01, cp=0.0001)
rpRates <- rpartErr(train.rp=spam01.rp, train=spam01, test=spam2,
group='type')

## ----rf-dofun---------------------------------------------------------
set.seed(29)   ## Make results precisely reproducible
spam01.rf <- randomForest::randomForest(type ~ ., data=spam01)
rfRates <- rfErr(train.rf=spam01.rf, train=spam01, test=spam2,
group='type')

## ----acctable---------------------------------------------------------
lda_main <- c(round(ldaRates['loo'], 3), round(ldaRates['trainerr'], 3),'-',
round(ldaRates['testerr'], 3))
rpartAcc <- c('-',round(rpRates['cverror'], 3),round(rpRates['trainerror'], 3),
round(rpRates['testerror'], 3))
rfAcc <- c('-',round(rfRates['trainerr'], 3),round(rfRates['OOBerr'], 3),
round(rfRates['testerr'], 3))
accmat <- rbind(lda_main,rpartAcc,rfAcc)
rownames(accmat) <- c('lda (main effects)','rpart','randomForest')
kable(accmat, col.names=c("Leave One Out CV","Training",
"Out of Bag","Test rate"), align=rep('r',4))

## ----lda-x, eval=FALSE------------------------------------------------
#  mm01x <- model.matrix(type ~ 0+.^2, data=spam01)
#  mm2x <- model.matrix(type ~ 0+.^2, data=spam2)
#  spam01x.lda <- lda(x=mm01x, grouping=spam01[,'type'])
#  ldaRatesx <- ldaErr(train.lda=spam01x.lda, train=mm01x, test=mm2x,
#                      group=spam01[,'type'])
#  round(ldaRatesx,3)

## ----errRates, echo=FALSE---------------------------------------------
c(loo=0.291,  trainerr=0.027,  testerr=0.164)

## ----data-Vowel-------------------------------------------------------
data('Vowel', package='mlbench')
Vowel\$V1 <-  factor(Vowel\$V1)

## ----vowel-qda--------------------------------------------------------
vowelcv.qda <- qda(Class ~ ., data=Vowel, CV=TRUE)
accqda <- confusion(Vowel\$Class, vowelcv.qda\$class,
printit=FALSE)\$overall
print(round(accqda,3))
## Compare with CV performance
accspqda <- CVcluster(Class ~ ., id=V1,
data=Vowel, nfold=15, FUN=qda,
printit=FALSE)\$CVaccuracy
print(round(accspqda,3))

## ----vowel-rf---------------------------------------------------------
data('Vowel', package='mlbench')
Vowel\$V1 <-  factor(Vowel\$V1)
vowel.rf <- randomForest::randomForest(Class ~ ., data=Vowel)
print("OOB accuracy estimate")
accOOB <- 1-sum(rep(1/11,11)*vowel.rf\$confusion[,"class.error"])
print(round(accOOB,3))
## Compare with performance based on OOB choice of speakers
accspOOB <- RFcluster(Class ~ ., id=V1,
data=Vowel, ntree=500,
progress = FALSE)\$OOBaccuracy
print(round(accspOOB),3)
```

## Try the gamclass package in your browser

Any scripts or data that you put into this service are public.

gamclass documentation built on Nov. 14, 2020, 1:07 a.m.