ch3.solutions: Solution code for Chapter 3: Generalized Linear Models

Description Author(s) References See Also Examples

Description

R code for Chapter 3 exercise solutions.

Author(s)

Simon Wood <simon@r-project.org>

Maintainer: Simon Wood <simon@r-project.org>

References

Wood, S.N. (2017) Generalized Additive Models: An Introduction with R, CRC

See Also

mgcv, ch3

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
library(gamair); library(mgcv)

## Q.2 Residuals

n <- 100; m <- 10
x <- runif(n)
lp <- 3*x-1
mu <- binomial()$linkinv(lp)
y <- rbinom(1:n,m,mu)
par(mfrow=c(2,2))
plot(glm(y/m ~ x,family=binomial,weights=rep(m,n)))

## example glm fit...
b <- glm(y/m ~ x,family=binomial,weights=rep(m,n))
reps <- 200;mu <- fitted(b)
rsd <- matrix(0,reps,n) # array for simulated resids
runs <- rep(0,reps)  # array for simulated run counts
for (i in 1:reps) {  # simulation loop
  ys <- rbinom(1:n,m,mu) # simulate from fitted model
  ## refit model to simulated data
  br <- glm(ys/m ~ x,family=binomial,weights=rep(m,n))
  rs <- residuals(br) # simulated resids (meet assumptions)
  rsd[i,] <- sort(rs) # store sorted residuals
  fv.sort <- sort(fitted(br),index.return=TRUE)
  rs <- rs[fv.sort$ix] # order resids by sorted fit values
  rs <- rs > 0         # check runs of +ve, -ve resids
  runs[i] <- sum(rs[1:(n-1)]!=rs[2:n])
}
# plot original ordered residuals, and simulation envelope
for (i in 1:n) rsd[,i] <- sort(rsd[,i])
par(mfrow=c(1,1))
plot(sort(residuals(b)),(1:n-.5)/n) # original
## plot 95% envelope ....
lines(rsd[5,],(1:n-.5)/n);lines(rsd[reps-5,],(1:n-.5)/n)

# compare original runs to distribution under independence
rs <- residuals(b)
fv.sort <- sort(fitted(b),index.return=TRUE)
rs <- rs[fv.sort$ix]
rs <- rs > 0
obs.runs <- sum(rs[1:(n-1)]!=rs[2:n])
sum(runs>obs.runs)

## Q.3 Death penalty

## read in data...
count <- c(53,414,11,37,0,16,4,139)
death <- factor(c(1,0,1,0,1,0,1,0))
defendant <- factor(c(0,0,1,1,0,0,1,1))
victim <- factor(c(0,0,0,0,1,1,1,1))
levels(death) <- c("no","yes")
levels(defendant) <- c("white","black")
levels(victim) <- c("white","black")

## a)
sum(count[death=="yes"&defendant=="black"])/
    sum(count[defendant=="black"])
sum(count[death=="yes"&defendant=="white"])/
    sum(count[defendant=="white"])

## b)
dm <- glm(count~death*victim+death*defendant+
          victim*defendant,family=poisson(link=log))
summary(dm)
dm0 <- glm(count~death*victim+victim*defendant,
           family=poisson(link=log))
anova(dm0,dm,test="Chisq")

## Q.7 IRLS
y <- c(12,14,33,50,67,74,123,141,165,204,253,246,240)
t <-1:13
X <- cbind(rep(1,13),t,t^2) # model matrix
mu <- y;eta <- log(mu) # initial values
ok <- TRUE
while (ok) {
 ## evaluate pseudodata and weights
 z <- (y-mu)/mu + eta
 w <- as.numeric(mu)
 ## fit weighted working linear model
 z <- sqrt(w)*z; WX <- sqrt(w)*X
 beta <- coef(lm(z~WX-1))
 ## evaluate new eta and mu
 eta.old <- eta
 eta <- X%*%beta
 mu <- exp(eta)
 ## test for convergence...
 if (max(abs(eta-eta.old))<1e-7*max(abs(eta))) ok <- FALSE
}
plot(t,y);lines(t,mu) # plot fit

## Q.8
## b)
data(harrier)
m <- 1
b <- glm(Consumption.Rate~I(1/Grouse.Density^m),
     family=quasi(link=inverse,variance=mu),data=harrier)

## c)
plot(harrier$Grouse.Density,residuals(b))
## clear pattern if $m=1$, and the parameter estimates lead to a rather odd curve.

## d)
## search leads to...
m <- 3.25
b <- glm(Consumption.Rate~I(1/Grouse.Density^m),
     family=quasi(link=inverse,variance=mu),data=harrier)

## e)
pd <- data.frame(Grouse.Density = seq(0,130,length=200))
pr <- predict(b,newdata=pd,se=TRUE)
with(harrier,plot(Grouse.Density,Consumption.Rate))
lines(pd$Grouse.Density,1/pr$fit,col=2)
lines(pd$Grouse.Density,1/(pr$fit-pr$se*2),col=3)
lines(pd$Grouse.Density,1/(pr$fit+pr$se*2),col=3)

## f)
ll <- function(b,cr,d)
## evalates -ve quasi-log likelihood of model
## b is parameters, cr is consumption, d is density
{ ## get expected consumption...
  dm <- d^b[3]
  Ec <- exp(b[1])*dm/(1+exp(b[1])*exp(b[2])*dm)
  ## appropriate quasi-likelihood...
  ind <- cr>0    ## have to deal with cr==0 case
  ql <- cr - Ec
  ql[ind] <- ql[ind] + cr[ind]*log(Ec[ind]/cr[ind])
  -sum(ql)
}
## Now fit model ...
fit <- optim(c(log(.4),log(10),3),ll,method="L-BFGS-B",
             hessian=TRUE,cr=harrier$Consumption.Rate,
             d=harrier$Grouse.Density)
## and plot results ...
b <- fit$par
d <- seq(0,130,length=200); dm <- d^b[3]
Ec <- exp(b[1])*dm/(1+exp(b[1])*exp(b[2])*dm)
with(harrier,plot(Grouse.Density,Consumption.Rate))
lines(d,Ec,col=2)

## Q.9
death <- as.numeric(ldeaths)
month <- rep(1:12,6)
time <- 1:72
ldm <- glm(death ~ sin(month/12*2*pi)+cos(month/12*2*pi),
           family=poisson(link=identity))
plot(time,death,type="l");lines(time,fitted(ldm),col=2)
summary(ldm)
plot(ldm)

## Q.10
y <- c(12,14,33,50,67,74,123,141,165,204,253,246,240)
t <- 1:13
b <- glm(y ~ t + I(t^2),family=poisson)
log.lik <- b1 <- seq(.4,.7,length=100)
for (i in 1:100)
{ log.lik[i] <- logLik(glm(y~offset(b1[i]*t)+I(t^2),
                           family=poisson))
}
plot(b1,log.lik,type="l")
points(coef(b)[2],logLik(b),pch=19)
abline(logLik(b)[1]-qchisq(.95,df=1),0,lty=2)

## Q.11 Soybean
## a)
library(nlme)
attach(Soybean)
lmc <- lmeControl(niterEM=300) ## needed for convergence
m1<-lme(weight~Variety*Time+Variety*I(Time^2)+
        Variety*I(Time^3),Soybean,~Time|Plot,control=lmc)
plot(m1) ## clear increasing variance with mean

## b)
library(MASS)
m2<-glmmPQL(weight~Variety*Time+Variety*I(Time^2)+
    Variety*I(Time^3),data=Soybean,random=~Time|Plot,
    family=Gamma(link=log),control=lmc)
plot(m2) ## much better

## c)
m0<-glmmPQL(weight~Variety*Time+Variety*I(Time^2)+
    Variety*I(Time^3),data=Soybean,random=~1|Plot,
    family=Gamma(link=log),control=lmc) # simpler r.e.'s
m3<-glmmPQL(weight~Variety*Time+Variety*I(Time^2)+
    Variety*I(Time^3),data=Soybean,random=~Time+
    I(Time^2)|Plot,family=Gamma(link=log),control=lmc)
## ... m3 has more complex r.e. structure
## Following not strictly valid, but gives a rough
## guide. Suggests m2 is best...
AIC(m0,m2,m3)
summary(m2) ## drop Variety:Time
m4<-glmmPQL(weight~Variety+Time+Variety*I(Time^2)+
    Variety*I(Time^3),data=Soybean,random=~Time|Plot,
    family=Gamma(link=log),control=lmc)
summary(m4) ## perhaps drop Variety:I(Time^3)?
m5<-glmmPQL(weight~Variety+Time+Variety*I(Time^2)+
    I(Time^3),data=Soybean,random=~Time|Plot,
    family=Gamma(link=log),control=lmc)
summary(m5) ## don't drop any more
AIC(m2,m4,m5) ## supports m4
intervals(m5,which="fixed")

## So m4 or m5 are probably the best models to use, and
## both suggest that variety P has a higher weight on average.

Example output

Loading required package: nlme
This is mgcv 1.8-25. For overview type 'help("mgcv-package")'.
[1] 161
[1] 0.07853403
[1] 0.1097308

Call:
glm(formula = count ~ death * victim + death * defendant + victim * 
    defendant, family = poisson(link = log))

Deviance Residuals: 
       1         2         3         4         5         6         7         8  
 0.02505  -0.00895  -0.05463   0.03000  -0.60362   0.04572   0.09251  -0.01545  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 6.02631    0.04913 122.669  < 2e-16 ***
deathyes                   -2.05946    0.14585 -14.121  < 2e-16 ***
victimblack                -3.26517    0.25478 -12.816  < 2e-16 ***
defendantblack             -2.42032    0.17155 -14.109  < 2e-16 ***
deathyes:victimblack       -2.40444    0.60061  -4.003 6.25e-05 ***
deathyes:defendantblack     0.86780    0.36707   2.364   0.0181 *  
victimblack:defendantblack  4.59497    0.31353  14.656  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1225.07955  on 7  degrees of freedom
Residual deviance:    0.37984  on 1  degrees of freedom
AIC: 52.42

Number of Fisher Scoring iterations: 3

Analysis of Deviance Table

Model 1: count ~ death * victim + victim * defendant
Model 2: count ~ death * victim + death * defendant + victim * defendant
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
1         2     5.3940                       
2         1     0.3798  1   5.0142  0.02514 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call:
glm(formula = death ~ sin(month/12 * 2 * pi) + cos(month/12 * 
    2 * pi), family = poisson(link = identity))

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-13.0248   -3.5368    0.1624    2.9021   19.5843  

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)            2056.625      5.345  384.81   <2e-16 ***
sin(month/12 * 2 * pi)  614.992      7.431   82.76   <2e-16 ***
cos(month/12 * 2 * pi)  409.015      7.431   55.04   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 12329.6  on 71  degrees of freedom
Residual deviance:  2226.2  on 69  degrees of freedom
AIC: 2910.9

Number of Fisher Scoring iterations: 4

iteration 1
iteration 2
iteration 3
iteration 4
iteration 1
iteration 2
iteration 3
iteration 4
iteration 1
iteration 2
iteration 3
iteration 4
   df AIC
m0 10  NA
m2 12  NA
m3 15  NA
Linear mixed-effects model fit by maximum likelihood
 Data: Soybean 
  AIC BIC logLik
   NA  NA     NA

Random effects:
 Formula: ~Time | Plot
 Structure: General positive-definite, Log-Cholesky parametrization
            StdDev      Corr  
(Intercept) 0.372420536 (Intr)
Time        0.002915643 -0.994
Residual    0.188632406       

Variance function:
 Structure: fixed weights
 Formula: ~invwt 
Fixed effects: weight ~ Variety * Time + Variety * I(Time^2) + Variety * I(Time^3) 
                       Value  Std.Error  DF   t-value p-value
(Intercept)        -5.364971 0.15662819 358 -34.25291  0.0000
VarietyP            0.397564 0.22107160  46   1.79835  0.0787
Time                0.218397 0.01090543 358  20.02648  0.0000
I(Time^2)          -0.001627 0.00025381 358  -6.40927  0.0000
I(Time^3)           0.000002 0.00000177 358   1.12806  0.2601
VarietyP:Time       0.006519 0.01536237 358   0.42435  0.6716
VarietyP:I(Time^2) -0.000287 0.00035706 358  -0.80307  0.4225
VarietyP:I(Time^3)  0.000002 0.00000249 358   0.93665  0.3496
 Correlation: 
                   (Intr) VartyP Time   I(T^2) I(T^3) VrtP:T VP:I(T^2
VarietyP           -0.708                                            
Time               -0.873  0.618                                     
I(Time^2)           0.807 -0.571 -0.984                              
I(Time^3)          -0.765  0.542  0.956 -0.992                       
VarietyP:Time       0.620 -0.872 -0.710  0.699 -0.678                
VarietyP:I(Time^2) -0.573  0.806  0.700 -0.711  0.705 -0.984         
VarietyP:I(Time^3)  0.544 -0.765 -0.680  0.705 -0.711  0.955 -0.992  

Standardized Within-Group Residuals:
       Min         Q1        Med         Q3        Max 
-2.6981180 -0.6552450 -0.0616646  0.5768139  3.6926753 

Number of Observations: 412
Number of Groups: 48 
iteration 1
iteration 2
iteration 3
iteration 4
Linear mixed-effects model fit by maximum likelihood
 Data: Soybean 
  AIC BIC logLik
   NA  NA     NA

Random effects:
 Formula: ~Time | Plot
 Structure: General positive-definite, Log-Cholesky parametrization
            StdDev      Corr  
(Intercept) 0.371809198 (Intr)
Time        0.002901003 -0.994
Residual    0.188638481       

Variance function:
 Structure: fixed weights
 Formula: ~invwt 
Fixed effects: weight ~ Variety + Time + Variety * I(Time^2) + Variety * I(Time^3) 
                       Value  Std.Error  DF   t-value p-value
(Intercept)        -5.406957 0.12272546 359 -44.05733  0.0000
VarietyP            0.479437 0.10778930  46   4.44791  0.0001
Time                0.221753 0.00767198 359  28.90429  0.0000
I(Time^2)          -0.001704 0.00018136 359  -9.39481  0.0000
I(Time^3)           0.000003 0.00000130 359   1.94091  0.0531
VarietyP:I(Time^2) -0.000137 0.00006302 359  -2.18150  0.0298
VarietyP:I(Time^3)  0.000001 0.00000074 359   1.79830  0.0730
 Correlation: 
                   (Intr) VartyP Time   I(T^2) I(T^3) VP:I(T^2
VarietyP           -0.437                                     
Time               -0.784 -0.003                              
I(Time^2)           0.666  0.109 -0.969                       
I(Time^3)          -0.599 -0.138  0.916 -0.985                
VarietyP:I(Time^2)  0.263 -0.610  0.008 -0.183  0.286         
VarietyP:I(Time^3) -0.206  0.478 -0.007  0.178 -0.291 -0.980  

Standardized Within-Group Residuals:
        Min          Q1         Med          Q3         Max 
-2.68996385 -0.65631664 -0.05972778  0.58812592  3.72020038 

Number of Observations: 412
Number of Groups: 48 
iteration 1
iteration 2
iteration 3
iteration 4
Linear mixed-effects model fit by maximum likelihood
 Data: Soybean 
  AIC BIC logLik
   NA  NA     NA

Random effects:
 Formula: ~Time | Plot
 Structure: General positive-definite, Log-Cholesky parametrization
            StdDev      Corr  
(Intercept) 0.371510720 (Intr)
Time        0.002872102 -0.991
Residual    0.189367267       

Variance function:
 Structure: fixed weights
 Formula: ~invwt 
Fixed effects: weight ~ Variety + Time + Variety * I(Time^2) + I(Time^3) 
                       Value  Std.Error  DF   t-value p-value
(Intercept)        -5.364724 0.12028256 360 -44.60101  0.0000
VarietyP            0.388361 0.09475109  46   4.09875  0.0002
Time                0.222096 0.00769357 360  28.86777  0.0000
I(Time^2)          -0.001769 0.00017899 360  -9.88285  0.0000
I(Time^3)           0.000003 0.00000125 360   2.61307  0.0093
VarietyP:I(Time^2) -0.000027 0.00001255 360  -2.13175  0.0337
 Correlation: 
                   (Intr) VartyP Time   I(T^2) I(T^3)
VarietyP           -0.394                            
Time               -0.803  0.001                     
I(Time^2)           0.731  0.027 -0.984              
I(Time^3)          -0.704  0.001  0.955 -0.991       
VarietyP:I(Time^2)  0.312 -0.800  0.004 -0.040  0.004

Standardized Within-Group Residuals:
        Min          Q1         Med          Q3         Max 
-2.67130867 -0.66143822 -0.05416341  0.57914166  3.65293634 

Number of Observations: 412
Number of Groups: 48 
   df AIC
m2 12  NA
m4 11  NA
m5 10  NA
Approximate 95% confidence intervals

 Fixed effects:
                           lower          est.         upper
(Intercept)        -5.599540e+00 -5.364724e+00 -5.129908e+00
VarietyP            1.990307e-01  3.883609e-01  5.776911e-01
Time                2.070768e-01  2.220963e-01  2.371157e-01
I(Time^2)          -2.118323e-03 -1.768904e-03 -1.419484e-03
I(Time^3)           8.258938e-07  3.265598e-06  5.705303e-06
VarietyP:I(Time^2) -5.127377e-05 -2.676397e-05 -2.254168e-06
attr(,"label")
[1] "Fixed effects:"

gamair documentation built on Aug. 23, 2019, 5:03 p.m.