ch06.R
In MASS: Support Functions and Datasets for Venables and Ripley's MASS

#-*- R -*-

## Script from Fourth Edition of `Modern Applied Statistics with S'

# Chapter 6   Linear Statistical Models

library(MASS)
library(lattice)
options(width=65, digits=5, height=9999)
pdf(file="ch06.pdf", width=8, height=6, pointsize=9)
options(contrasts = c("contr.helmert", "contr.poly"))


# 6.1  A linear regression example

xyplot(Gas ~ Temp | Insul, whiteside, panel =
  function(x, y, ...) {
    panel.xyplot(x, y, ...)
    panel.lmline(x, y, ...)
  }, xlab = "Average external temperature (deg. C)",
  ylab = "Gas consumption  (1000 cubic feet)", aspect = "xy",
  strip = function(...) strip.default(..., style = 1))


gasB <- lm(Gas ~ Temp, data = whiteside, subset = Insul=="Before")
gasA <- update(gasB, subset = Insul=="After")

summary(gasB)
summary(gasA)

varB <- deviance(gasB)/gasB$df.resid    # direct calculation
varB <- summary(gasB)$sigma^2           # alternative

gasBA <- lm(Gas ~ Insul/Temp - 1, data = whiteside)
summary(gasBA)

gasQ <- lm(Gas ~ Insul/(Temp + I(Temp^2)) - 1, data = whiteside)
summary(gasQ)$coef

# R: options(contrasts = c("contr.helmert", "contr.poly"))
gasPR <- lm(Gas ~ Insul + Temp, data = whiteside)
anova(gasPR, gasBA)

oldcon <- options(contrasts = c("contr.treatment", "contr.poly"))
gasBA1 <- lm(Gas ~ Insul*Temp, data = whiteside)
summary(gasBA1)$coef
options(oldcon)


# 6.2  Model formulae and model matrices

dat <- data.frame(a = factor(rep(1:3, 3)),
                  y = rnorm(9, rep(2:4, 3), 0.1))
obj <- lm(y ~ a, dat)
(alf.star <- coef(obj))
Ca <- contrasts(dat$a)      # contrast matrix for `a'
drop(Ca %*% alf.star[-1])
dummy.coef(obj)


N <- factor(Nlevs <- c(0,1,2,4))
contrasts(N)
contrasts(ordered(N))

N2 <- N
contrasts(N2, 2) <- poly(Nlevs, 2)
N2 <- C(N, poly(Nlevs, 2), 2)       # alternative
contrasts(N2)

fractions(ginv(contr.helmert(n = 4)))

Cp <- diag(-1, 4, 5);  Cp[row(Cp) == col(Cp) - 1] <- 1
Cp
fractions(ginv(Cp))


# 6.3  Regression diagnostics

(hills.lm <- lm(time ~ dist + climb, data = hills))
frame()
par(fig = c(0, 0.6, 0, 0.55))
plot(fitted(hills.lm), studres(hills.lm))
abline(h = 0, lty = 2)
# identify(fitted(hills.lm), studres(hills.lm), row.names(hills))
par(fig = c(0.6, 1, 0, 0.55), pty = "s")
qqnorm(studres(hills.lm))
qqline(studres(hills.lm))
par(pty = "m")
hills.hat <- lm.influence(hills.lm)$hat
cbind(hills, lev = hills.hat)[hills.hat > 3/35, ]
cbind(hills, pred = predict(hills.lm))["Knock Hill", ]
(hills1.lm <- update(hills.lm, subset = -18))
update(hills.lm, subset = -c(7, 18))
summary(hills1.lm)
summary(update(hills1.lm,  weights = 1/dist^2))
lm(time ~ -1 + dist + climb, hills[-18, ], weight = 1/dist^2)

# hills <- hills   # make a local copy (needed in S-PLUS)
hills$ispeed <- hills$time/hills$dist
hills$grad <- hills$climb/hills$dist
(hills2.lm <- lm(ispeed ~ grad, data = hills[-18, ]))
frame()
par(fig = c(0, 0.6, 0, 0.55))
plot(hills$grad[-18], studres(hills2.lm), xlab = "grad")
abline(h = 0, lty = 2)
# identify(hills$grad[-18], studres(hills2.lm), row.names(hills)[-18])
par(fig = c(0.6, 1, 0, 0.55), pty = "s")
qqnorm(studres(hills2.lm))
qqline(studres(hills2.lm))
par(pty = "m")
hills2.hat <- lm.influence(hills2.lm)$hat
cbind(hills[-18,], lev = hills2.hat)[hills2.hat > 1.8*2/34, ]


# 6.4  Safe prediction

quad1 <- lm(Weight ~ Days + I(Days^2), data = wtloss)
quad2 <- lm(Weight ~ poly(Days, 2), data = wtloss)

new.x <- data.frame(Days = seq(250, 300, 10),
                    row.names = seq(250, 300, 10))

predict(quad1, newdata = new.x)
predict(quad2, newdata = new.x)

# predict.gam(quad2, newdata = new.x) # S-PLUS only


# 6.5  Robust and resistant regression

# library(lqs)
phones.lm <- lm(calls ~ year, data = phones)
attach(phones); plot(year, calls); detach()
abline(phones.lm$coef)
abline(rlm(calls ~ year, phones, maxit=50), lty = 2, col = 2)
abline(lqs(calls ~ year, phones), lty =3, col = 3)
# legend(locator(1), lty = 1:3, col = 1:3,
#        legend = c("least squares", "M-estimate", "LTS"))

## cor = FALSE is the default in R
summary(lm(calls ~ year, data = phones))
summary(rlm(calls ~ year, maxit = 50, data = phones))
summary(rlm(calls ~ year, scale.est = "proposal 2", data = phones))
summary(rlm(calls ~ year, data = phones, psi = psi.bisquare))

lqs(calls ~ year, data = phones)
lqs(calls ~ year, data = phones, method = "lms")
lqs(calls ~ year, data = phones, method = "S")

summary(rlm(calls ~ year, data = phones, method = "MM"))

# library(robust) # S-PLUS only
# phones.lmr <- lmRob(calls ~ year, data = phones)
# summary(phones.lmr)
# plot(phones.lmr)

hills.lm
hills1.lm # omitting Knock Hill
rlm(time ~ dist + climb, data = hills)
summary(rlm(time ~ dist + climb, data = hills,
             weights = 1/dist^2, method = "MM"))
lqs(time ~ dist + climb, data = hills, nsamp = "exact")
summary(hills2.lm) # omitting Knock Hill
summary(rlm(ispeed ~ grad, data = hills))
summary(rlm(ispeed ~ grad, data = hills, method="MM"))
# summary(lmRob(ispeed ~ grad, data = hills))

lqs(ispeed ~ grad, data = hills)


# 6.6  Bootstrapping linear models

library(boot)
fit <- lm(calls ~ year, data = phones)
ph <- data.frame(phones, res = resid(fit), fitted = fitted(fit))
ph.fun <- function(data, i) {
  d <- data
  d$calls <- d$fitted + d$res[i]
  coef(update(fit, data=d))
}
(ph.lm.boot <- boot(ph, ph.fun, R = 999))

fit <- rlm(calls ~ year, method = "MM", data = phones)
ph <- data.frame(phones, res = resid(fit), fitted = fitted(fit))
(ph.rlm.boot <- boot(ph, ph.fun, R = 999))


# 6.7  Factorial designs and designed experiments

options(contrasts=c("contr.helmert", "contr.poly"))
(npk.aov <- aov(yield ~ block + N*P*K, data = npk))
summary(npk.aov)

alias(npk.aov)
coef(npk.aov)

options(contrasts=c("contr.treatment", "contr.poly"))
npk.aov1 <- aov(yield ~ block + N + K, data = npk)
summary.lm(npk.aov1)
se.contrast(npk.aov1, list(N == "0", N == "1"), data = npk)
model.tables(npk.aov1, type = "means", se = TRUE)

mp <- c("-", "+")
(NPK <- expand.grid(N = mp, P = mp, K = mp))

if(FALSE) { ## fac.design is part of S-PLUS.
blocks13 <- fac.design(levels = c(2, 2, 2),
    factor= list(N=mp, P=mp, K=mp), rep = 3, fraction = 1/2)

blocks46 <- fac.design(levels = c(2, 2, 2),
   factor = list(N=mp, P=mp, K=mp), rep = 3, fraction = ~ -N:P:K)

NPK <- design(block = factor(rep(1:6, each  = 4)),
             rbind(blocks13, blocks46))
i <- order(runif(6)[NPK$block], runif(24))
NPK <- NPK[i,]  # Randomized

lev <- rep(2, 7)
factors <- list(S=mp, D=mp, H=mp, G=mp, R=mp, B=mp, P=mp)
(Bike <- fac.design(lev, factors,
     fraction = ~ S:D:G + S:H:R + D:H:B + S:D:H:P))
replications(~ .^2, data=Bike)
}

if(require("FrF2")) {
NPK <- FrF2(8, factor.names = c("N", "P", "K"), default.levels = 0:1,
            blocks = 2, replications = 3)
print(NPK)
print(as.data.frame(NPK))

print(Bike <- FrF2(factor.names = c("S", "D", "H", "G", "R", "B", "P"),
                   default.levels = c("+", "-"), resolution = 3))
print(replications(~ .^2, data=Bike))
}

# 6.8  An unbalanced four-way layout

attach(quine)
table(Lrn, Age, Sex, Eth)

Means <- tapply(Days, list(Eth, Sex, Age, Lrn), mean)
Vars  <- tapply(Days, list(Eth, Sex, Age, Lrn), var)
SD <- sqrt(Vars)
par(mfrow = c(1, 2), pty="s")
plot(Means, Vars, xlab = "Cell Means", ylab = "Cell Variances")
plot(Means, SD, xlab = "Cell Means", ylab = "Cell Std Devn.")
detach()

## singular.ok = TRUE is the default in R
boxcox(Days+1 ~ Eth*Sex*Age*Lrn, data = quine, singular.ok = TRUE,
  lambda = seq(-0.05, 0.45, len = 20))

logtrans(Days ~ Age*Sex*Eth*Lrn, data = quine,
    alpha = seq(0.75, 6.5, len = 20), singular.ok = TRUE)

quine.hi <- aov(log(Days + 2.5) ~ .^4, quine)
quine.nxt <- update(quine.hi, . ~ . - Eth:Sex:Age:Lrn)
dropterm(quine.nxt, test = "F")

quine.lo <- aov(log(Days+2.5) ~ 1, quine)
addterm(quine.lo, quine.hi, test = "F")

quine.stp <- stepAIC(quine.nxt,
   scope = list(upper = ~Eth*Sex*Age*Lrn, lower = ~1),
   trace = FALSE)
quine.stp$anova

dropterm(quine.stp, test = "F")
quine.3 <- update(quine.stp, . ~ . - Eth:Age:Lrn)
dropterm(quine.3, test = "F")
quine.4 <- update(quine.3, . ~ . - Eth:Age)
quine.5 <- update(quine.4, . ~ . - Age:Lrn)
dropterm(quine.5, test = "F")


# 6.9  Predicting computer performance

par(mfrow = c(1, 2), pty = "s")
boxcox(perf ~ syct + mmin + mmax + cach + chmin + chmax,
       data = cpus, lambda = seq(0, 1, 0.1))

cpus1 <- cpus
attach(cpus)
for(v in names(cpus)[2:7])
  cpus1[[v]] <- cut(cpus[[v]], unique(quantile(cpus[[v]])),
                    include.lowest = TRUE)
detach()
boxcox(perf ~ syct + mmin + mmax + cach + chmin + chmax,
       data = cpus1, lambda = seq(-0.25, 1, 0.1))
par(mfrow = c(1, 1), pty = "m")

set.seed(123)
cpus2 <- cpus[, 2:8]  # excludes names, authors' predictions
cpus2[, 1:3] <- log10(cpus2[, 1:3])
#cpus.samp <- sample(1:209, 100)
cpus.samp <-
c(3, 5, 6, 7, 8, 10, 11, 16, 20, 21, 22, 23, 24, 25, 29, 33, 39, 41, 44, 45,
46, 49, 57, 58, 62, 63, 65, 66, 68, 69, 73, 74, 75, 76, 78, 83, 86,
88, 98, 99, 100, 103, 107, 110, 112, 113, 115, 118, 119, 120, 122,
124, 125, 126, 127, 132, 136, 141, 144, 146, 147, 148, 149, 150, 151,
152, 154, 156, 157, 158, 159, 160, 161, 163, 166, 167, 169, 170, 173,
174, 175, 176, 177, 183, 184, 187, 188, 189, 194, 195, 196, 197, 198,
199, 202, 204, 205, 206, 208, 209)

cpus.lm <- lm(log10(perf) ~ ., data = cpus2[cpus.samp, ])
test.cpus <- function(fit)
   sqrt(sum((log10(cpus2[-cpus.samp, "perf"]) -
             predict(fit, cpus2[-cpus.samp,]))^2)/109)
test.cpus(cpus.lm)
cpus.lm2 <- stepAIC(cpus.lm, trace=FALSE)
cpus.lm2$anova
test.cpus(cpus.lm2)


# 6.10  Multiple comparisons

immer.aov <- aov((Y1 + Y2)/2 ~ Var + Loc, data = immer)
summary(immer.aov)

model.tables(immer.aov, type = "means", se = TRUE, cterms = "Var")

if(FALSE) {
multicomp(immer.aov, plot = TRUE)

oats1 <- aov(Y ~ N + V + B, data = oats)
summary(oats1)
multicomp(oats1, focus = "V")
multicomp(oats1, focus = "N", comparisons = "mcc", control = 1)
lmat <- matrix(c(0,-1,1,rep(0, 11), 0,0,-1,1, rep(0,10),
                 0,0,0,-1,1,rep(0,9)),,3,
               dimnames = list(NULL,
               c("0.2cwt-0.0cwt", "0.4cwt-0.2cwt", "0.6cwt-0.4cwt")))
multicomp(oats1, lmat = lmat, bounds = "lower", comparisons = "none")
}

(tk <- TukeyHSD(immer.aov, which = "Var"))
plot(tk)

oats1 <- aov(Y ~ N + V + B, data = oats)
(tk <- TukeyHSD(oats1, which = "V"))
plot(tk)

## An alternative under R is to use package multcomp (which requires mvtnorm)
## This code is for multcomp >= 0.991-1
library(multcomp)
## next is slow:
(tk <- confint(glht(immer.aov, linfct = mcp(Var = "Tukey"))))
plot(tk)

confint(glht(oats1, linfct = mcp(V = "Tukey")))
lmat <- matrix(c(0,-1,1,rep(0, 11), 0,0,-1,1, rep(0,10),
                 0,0,0,-1,1,rep(0,9)),,3,
               dimnames = list(NULL,
               c("0.2cwt-0.0cwt", "0.4cwt-0.2cwt", "0.6cwt-0.4cwt")))
confint(glht(oats1, linfct = mcp(N = t(lmat[2:5, ])), alternative = "greater"))
plot(tk)

# End of ch06
Any scripts or data that you put into this service are public.
MASS documentation built on June 22, 2024, 10:42 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
MASS
Support Functions and Datasets for Venables and Ripley's MASS

inst/scripts/ch06.R
In MASS: Support Functions and Datasets for Venables and Ripley's MASS

Try the MASS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MASS Support Functions and Datasets for Venables and Ripley's MASS

inst/scripts/ch06.R In MASS: Support Functions and Datasets for Venables and Ripley's MASS

Try the MASS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MASS
Support Functions and Datasets for Venables and Ripley's MASS

inst/scripts/ch06.R
In MASS: Support Functions and Datasets for Venables and Ripley's MASS