ch3.solutions: Solution code for Chapter 3: Generalized Linear Models
In gamair: Data for 'GAMs: An Introduction with R'

Description Author(s) References See Also Examples

R code for Chapter 3 exercise solutions.

Simon Wood <simon@r-project.org>

Maintainer: Simon Wood <simon@r-project.org>

Wood, S.N. (2017) Generalized Additive Models: An Introduction with R, CRC

mgcv, ch3

library(gamair); library(mgcv)

## Q.2 Residuals

n <- 100; m <- 10
x <- runif(n)
lp <- 3*x-1
mu <- binomial()$linkinv(lp)
y <- rbinom(1:n,m,mu)
par(mfrow=c(2,2))
plot(glm(y/m ~ x,family=binomial,weights=rep(m,n)))

## example glm fit...
b <- glm(y/m ~ x,family=binomial,weights=rep(m,n))
reps <- 200;mu <- fitted(b)
rsd <- matrix(0,reps,n) # array for simulated resids
runs <- rep(0,reps)  # array for simulated run counts
for (i in 1:reps) {  # simulation loop
  ys <- rbinom(1:n,m,mu) # simulate from fitted model
  ## refit model to simulated data
  br <- glm(ys/m ~ x,family=binomial,weights=rep(m,n))
  rs <- residuals(br) # simulated resids (meet assumptions)
  rsd[i,] <- sort(rs) # store sorted residuals
  fv.sort <- sort(fitted(br),index.return=TRUE)
  rs <- rs[fv.sort$ix] # order resids by sorted fit values
  rs <- rs > 0         # check runs of +ve, -ve resids
  runs[i] <- sum(rs[1:(n-1)]!=rs[2:n])
}
# plot original ordered residuals, and simulation envelope
for (i in 1:n) rsd[,i] <- sort(rsd[,i])
par(mfrow=c(1,1))
plot(sort(residuals(b)),(1:n-.5)/n) # original
## plot 95% envelope ....
lines(rsd[5,],(1:n-.5)/n);lines(rsd[reps-5,],(1:n-.5)/n)

# compare original runs to distribution under independence
rs <- residuals(b)
fv.sort <- sort(fitted(b),index.return=TRUE)
rs <- rs[fv.sort$ix]
rs <- rs > 0
obs.runs <- sum(rs[1:(n-1)]!=rs[2:n])
sum(runs>obs.runs)

## Q.3 Death penalty

## read in data...
count <- c(53,414,11,37,0,16,4,139)
death <- factor(c(1,0,1,0,1,0,1,0))
defendant <- factor(c(0,0,1,1,0,0,1,1))
victim <- factor(c(0,0,0,0,1,1,1,1))
levels(death) <- c("no","yes")
levels(defendant) <- c("white","black")
levels(victim) <- c("white","black")

## a)
sum(count[death=="yes"&defendant=="black"])/
    sum(count[defendant=="black"])
sum(count[death=="yes"&defendant=="white"])/
    sum(count[defendant=="white"])

## b)
dm <- glm(count~death*victim+death*defendant+
          victim*defendant,family=poisson(link=log))
summary(dm)
dm0 <- glm(count~death*victim+victim*defendant,
           family=poisson(link=log))
anova(dm0,dm,test="Chisq")

## Q.7 IRLS
y <- c(12,14,33,50,67,74,123,141,165,204,253,246,240)
t <-1:13
X <- cbind(rep(1,13),t,t^2) # model matrix
mu <- y;eta <- log(mu) # initial values
ok <- TRUE
while (ok) {
 ## evaluate pseudodata and weights
 z <- (y-mu)/mu + eta
 w <- as.numeric(mu)
 ## fit weighted working linear model
 z <- sqrt(w)*z; WX <- sqrt(w)*X
 beta <- coef(lm(z~WX-1))
 ## evaluate new eta and mu
 eta.old <- eta
 eta <- X%*%beta
 mu <- exp(eta)
 ## test for convergence...
 if (max(abs(eta-eta.old))<1e-7*max(abs(eta))) ok <- FALSE
}
plot(t,y);lines(t,mu) # plot fit

## Q.8
## b)
data(harrier)
m <- 1
b <- glm(Consumption.Rate~I(1/Grouse.Density^m),
     family=quasi(link=inverse,variance=mu),data=harrier)

## c)
plot(harrier$Grouse.Density,residuals(b))
## clear pattern if $m=1$, and the parameter estimates lead to a rather odd curve.

## d)
## search leads to...
m <- 3.25
b <- glm(Consumption.Rate~I(1/Grouse.Density^m),
     family=quasi(link=inverse,variance=mu),data=harrier)

## e)
pd <- data.frame(Grouse.Density = seq(0,130,length=200))
pr <- predict(b,newdata=pd,se=TRUE)
with(harrier,plot(Grouse.Density,Consumption.Rate))
lines(pd$Grouse.Density,1/pr$fit,col=2)
lines(pd$Grouse.Density,1/(pr$fit-pr$se*2),col=3)
lines(pd$Grouse.Density,1/(pr$fit+pr$se*2),col=3)

## f)
ll <- function(b,cr,d)
## evalates -ve quasi-log likelihood of model
## b is parameters, cr is consumption, d is density
{ ## get expected consumption...
  dm <- d^b[3]
  Ec <- exp(b[1])*dm/(1+exp(b[1])*exp(b[2])*dm)
  ## appropriate quasi-likelihood...
  ind <- cr>0    ## have to deal with cr==0 case
  ql <- cr - Ec
  ql[ind] <- ql[ind] + cr[ind]*log(Ec[ind]/cr[ind])
  -sum(ql)
}
## Now fit model ...
fit <- optim(c(log(.4),log(10),3),ll,method="L-BFGS-B",
             hessian=TRUE,cr=harrier$Consumption.Rate,
             d=harrier$Grouse.Density)
## and plot results ...
b <- fit$par
d <- seq(0,130,length=200); dm <- d^b[3]
Ec <- exp(b[1])*dm/(1+exp(b[1])*exp(b[2])*dm)
with(harrier,plot(Grouse.Density,Consumption.Rate))
lines(d,Ec,col=2)

## Q.9
death <- as.numeric(ldeaths)
month <- rep(1:12,6)
time <- 1:72
ldm <- glm(death ~ sin(month/12*2*pi)+cos(month/12*2*pi),
           family=poisson(link=identity))
plot(time,death,type="l");lines(time,fitted(ldm),col=2)
summary(ldm)
plot(ldm)

## Q.10
y <- c(12,14,33,50,67,74,123,141,165,204,253,246,240)
t <- 1:13
b <- glm(y ~ t + I(t^2),family=poisson)
log.lik <- b1 <- seq(.4,.7,length=100)
for (i in 1:100)
{ log.lik[i] <- logLik(glm(y~offset(b1[i]*t)+I(t^2),
                           family=poisson))
}
plot(b1,log.lik,type="l")
points(coef(b)[2],logLik(b),pch=19)
abline(logLik(b)[1]-qchisq(.95,df=1),0,lty=2)

## Q.11 Soybean
## a)
library(nlme)
attach(Soybean)
lmc <- lmeControl(niterEM=300) ## needed for convergence
m1<-lme(weight~Variety*Time+Variety*I(Time^2)+
        Variety*I(Time^3),Soybean,~Time|Plot,control=lmc)
plot(m1) ## clear increasing variance with mean

## b)
library(MASS)
m2<-glmmPQL(weight~Variety*Time+Variety*I(Time^2)+
    Variety*I(Time^3),data=Soybean,random=~Time|Plot,
    family=Gamma(link=log),control=lmc)
plot(m2) ## much better

## c)
m0<-glmmPQL(weight~Variety*Time+Variety*I(Time^2)+
    Variety*I(Time^3),data=Soybean,random=~1|Plot,
    family=Gamma(link=log),control=lmc) # simpler r.e.'s
m3<-glmmPQL(weight~Variety*Time+Variety*I(Time^2)+
    Variety*I(Time^3),data=Soybean,random=~Time+
    I(Time^2)|Plot,family=Gamma(link=log),control=lmc)
## ... m3 has more complex r.e. structure
## Following not strictly valid, but gives a rough
## guide. Suggests m2 is best...
AIC(m0,m2,m3)
summary(m2) ## drop Variety:Time
m4<-glmmPQL(weight~Variety+Time+Variety*I(Time^2)+
    Variety*I(Time^3),data=Soybean,random=~Time|Plot,
    family=Gamma(link=log),control=lmc)
summary(m4) ## perhaps drop Variety:I(Time^3)?
m5<-glmmPQL(weight~Variety+Time+Variety*I(Time^2)+
    I(Time^3),data=Soybean,random=~Time|Plot,
    family=Gamma(link=log),control=lmc)
summary(m5) ## don't drop any more
AIC(m2,m4,m5) ## supports m4
intervals(m5,which="fixed")

## So m4 or m5 are probably the best models to use, and
## both suggest that variety P has a higher weight on average.

Loading required package: nlme
This is mgcv 1.8-25. For overview type 'help("mgcv-package")'.
[1] 161
[1] 0.07853403
[1] 0.1097308

Call:
glm(formula = count ~ death * victim + death * defendant + victim * 
    defendant, family = poisson(link = log))

Deviance Residuals: 
       1         2         3         4         5         6         7         8  
 0.02505  -0.00895  -0.05463   0.03000  -0.60362   0.04572   0.09251  -0.01545  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 6.02631    0.04913 122.669  < 2e-16 ***
deathyes                   -2.05946    0.14585 -14.121  < 2e-16 ***
victimblack                -3.26517    0.25478 -12.816  < 2e-16 ***
defendantblack             -2.42032    0.17155 -14.109  < 2e-16 ***
deathyes:victimblack       -2.40444    0.60061  -4.003 6.25e-05 ***
deathyes:defendantblack     0.86780    0.36707   2.364   0.0181 *  
victimblack:defendantblack  4.59497    0.31353  14.656  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1225.07955  on 7  degrees of freedom
Residual deviance:    0.37984  on 1  degrees of freedom
AIC: 52.42

Number of Fisher Scoring iterations: 3

Analysis of Deviance Table

Model 1: count ~ death * victim + victim * defendant
Model 2: count ~ death * victim + death * defendant + victim * defendant
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
1         2     5.3940                       
2         1     0.3798  1   5.0142  0.02514 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call:
glm(formula = death ~ sin(month/12 * 2 * pi) + cos(month/12 * 
    2 * pi), family = poisson(link = identity))

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-13.0248   -3.5368    0.1624    2.9021   19.5843  

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)            2056.625      5.345  384.81   <2e-16 ***
sin(month/12 * 2 * pi)  614.992      7.431   82.76   <2e-16 ***
cos(month/12 * 2 * pi)  409.015      7.431   55.04   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 12329.6  on 71  degrees of freedom
Residual deviance:  2226.2  on 69  degrees of freedom
AIC: 2910.9

Number of Fisher Scoring iterations: 4

iteration 1
iteration 2
iteration 3
iteration 4
iteration 1
iteration 2
iteration 3
iteration 4
iteration 1
iteration 2
iteration 3
iteration 4
   df AIC
m0 10  NA
m2 12  NA
m3 15  NA
Linear mixed-effects model fit by maximum likelihood
 Data: Soybean 
  AIC BIC logLik
   NA  NA     NA

Random effects:
 Formula: ~Time | Plot
 Structure: General positive-definite, Log-Cholesky parametrization
            StdDev      Corr  
(Intercept) 0.372420536 (Intr)
Time        0.002915643 -0.994
Residual    0.188632406       

Variance function:
 Structure: fixed weights
 Formula: ~invwt 
Fixed effects: weight ~ Variety * Time + Variety * I(Time^2) + Variety * I(Time^3) 
                       Value  Std.Error  DF   t-value p-value
(Intercept)        -5.364971 0.15662819 358 -34.25291  0.0000
VarietyP            0.397564 0.22107160  46   1.79835  0.0787
Time                0.218397 0.01090543 358  20.02648  0.0000
I(Time^2)          -0.001627 0.00025381 358  -6.40927  0.0000
I(Time^3)           0.000002 0.00000177 358   1.12806  0.2601
VarietyP:Time       0.006519 0.01536237 358   0.42435  0.6716
VarietyP:I(Time^2) -0.000287 0.00035706 358  -0.80307  0.4225
VarietyP:I(Time^3)  0.000002 0.00000249 358   0.93665  0.3496
 Correlation: 
                   (Intr) VartyP Time   I(T^2) I(T^3) VrtP:T VP:I(T^2
VarietyP           -0.708                                            
Time               -0.873  0.618                                     
I(Time^2)           0.807 -0.571 -0.984                              
I(Time^3)          -0.765  0.542  0.956 -0.992                       
VarietyP:Time       0.620 -0.872 -0.710  0.699 -0.678                
VarietyP:I(Time^2) -0.573  0.806  0.700 -0.711  0.705 -0.984         
VarietyP:I(Time^3)  0.544 -0.765 -0.680  0.705 -0.711  0.955 -0.992  

Standardized Within-Group Residuals:
       Min         Q1        Med         Q3        Max 
-2.6981180 -0.6552450 -0.0616646  0.5768139  3.6926753 

Number of Observations: 412
Number of Groups: 48 
iteration 1
iteration 2
iteration 3
iteration 4
Linear mixed-effects model fit by maximum likelihood
 Data: Soybean 
  AIC BIC logLik
   NA  NA     NA

Random effects:
 Formula: ~Time | Plot
 Structure: General positive-definite, Log-Cholesky parametrization
            StdDev      Corr  
(Intercept) 0.371809198 (Intr)
Time        0.002901003 -0.994
Residual    0.188638481       

Variance function:
 Structure: fixed weights
 Formula: ~invwt 
Fixed effects: weight ~ Variety + Time + Variety * I(Time^2) + Variety * I(Time^3) 
                       Value  Std.Error  DF   t-value p-value
(Intercept)        -5.406957 0.12272546 359 -44.05733  0.0000
VarietyP            0.479437 0.10778930  46   4.44791  0.0001
Time                0.221753 0.00767198 359  28.90429  0.0000
I(Time^2)          -0.001704 0.00018136 359  -9.39481  0.0000
I(Time^3)           0.000003 0.00000130 359   1.94091  0.0531
VarietyP:I(Time^2) -0.000137 0.00006302 359  -2.18150  0.0298
VarietyP:I(Time^3)  0.000001 0.00000074 359   1.79830  0.0730
 Correlation: 
                   (Intr) VartyP Time   I(T^2) I(T^3) VP:I(T^2
VarietyP           -0.437                                     
Time               -0.784 -0.003                              
I(Time^2)           0.666  0.109 -0.969                       
I(Time^3)          -0.599 -0.138  0.916 -0.985                
VarietyP:I(Time^2)  0.263 -0.610  0.008 -0.183  0.286         
VarietyP:I(Time^3) -0.206  0.478 -0.007  0.178 -0.291 -0.980  

Standardized Within-Group Residuals:
        Min          Q1         Med          Q3         Max 
-2.68996385 -0.65631664 -0.05972778  0.58812592  3.72020038 

Number of Observations: 412
Number of Groups: 48 
iteration 1
iteration 2
iteration 3
iteration 4
Linear mixed-effects model fit by maximum likelihood
 Data: Soybean 
  AIC BIC logLik
   NA  NA     NA

Random effects:
 Formula: ~Time | Plot
 Structure: General positive-definite, Log-Cholesky parametrization
            StdDev      Corr  
(Intercept) 0.371510720 (Intr)
Time        0.002872102 -0.991
Residual    0.189367267       

Variance function:
 Structure: fixed weights
 Formula: ~invwt 
Fixed effects: weight ~ Variety + Time + Variety * I(Time^2) + I(Time^3) 
                       Value  Std.Error  DF   t-value p-value
(Intercept)        -5.364724 0.12028256 360 -44.60101  0.0000
VarietyP            0.388361 0.09475109  46   4.09875  0.0002
Time                0.222096 0.00769357 360  28.86777  0.0000
I(Time^2)          -0.001769 0.00017899 360  -9.88285  0.0000
I(Time^3)           0.000003 0.00000125 360   2.61307  0.0093
VarietyP:I(Time^2) -0.000027 0.00001255 360  -2.13175  0.0337
 Correlation: 
                   (Intr) VartyP Time   I(T^2) I(T^3)
VarietyP           -0.394                            
Time               -0.803  0.001                     
I(Time^2)           0.731  0.027 -0.984              
I(Time^3)          -0.704  0.001  0.955 -0.991       
VarietyP:I(Time^2)  0.312 -0.800  0.004 -0.040  0.004

Standardized Within-Group Residuals:
        Min          Q1         Med          Q3         Max 
-2.67130867 -0.66143822 -0.05416341  0.57914166  3.65293634 

Number of Observations: 412
Number of Groups: 48 
   df AIC
m2 12  NA
m4 11  NA
m5 10  NA
Approximate 95% confidence intervals

 Fixed effects:
                           lower          est.         upper
(Intercept)        -5.599540e+00 -5.364724e+00 -5.129908e+00
VarietyP            1.990307e-01  3.883609e-01  5.776911e-01
Time                2.070768e-01  2.220963e-01  2.371157e-01
I(Time^2)          -2.118323e-03 -1.768904e-03 -1.419484e-03
I(Time^3)           8.258938e-07  3.265598e-06  5.705303e-06
VarietyP:I(Time^2) -5.127377e-05 -2.676397e-05 -2.254168e-06
attr(,"label")
[1] "Fixed effects:"