This case study is drawn from the content in Chapter 4.4 of Roback, P. & Legler, J. Beyond Multiple Linear Regression. (2021). https://bookdown.org/roback/bookdown-BeyondMLR/.

Below is the code, with minor alterations, that accompanies Chapter 4.4.

library(tidyverse)
fHH1 <- read_csv(
  "https://raw.githubusercontent.com/proback/BeyondMLR/master/data/fHH1.csv") %>% 
  select(-1)

Data Organization

glimpse(fHH1)

Exploratory Data Analyses

mean(fHH1$total)
var(fHH1$total)

prop.table(table(fHH1$roof))
fHH1  %>% group_by(roof)  %>% 
  summarise(mean=mean(total), sd=sd(total), 
            var=var(total), n=n())
fHH1  %>% group_by(location)  %>% 
  summarise(mean=mean(total), sd=sd(total), 
            var=var(total), n=n())
ggplot(fHH1, aes(total)) + 
  geom_histogram(binwidth = .25, color = "black", 
                 fill = "white") + 
  xlab("Number in the house excluding head of household") +
  ylab("Count of households") +
  theme_minimal()
cuts = cut(fHH1$age,
           breaks=c(15,20,25,30,35,40,45,50,55,60,65,70))
ageGrps <- data.frame(cuts,fHH1)
ggplot(data = ageGrps, aes(x = total)) +
  geom_histogram(binwidth = .25, color = "black", 
                 fill = "white") +
  facet_wrap(cuts) +
  xlab("Household size") +
  theme_minimal()

\newpage

# Mean = Variance ?
ageGrps  %>% 
  group_by(cuts)  %>% 
  summarise(mnNum= mean(total),varNum=var(total),n=n()) %>% 
knitr::kable(
      caption="Compare mean and variance of household size within each age group.",
      col.names = c("Age Groups", "Mean", "Variance", "n"))

\newpage

## Checking linearity assumption: Empirical log of the means plot
sumStats <- fHH1 %>% group_by(age) %>% 
  summarise(mntotal = mean(total),
            logmntotal = log(mntotal), n=n())
ggplot(sumStats, aes(x=age, y=logmntotal)) +
  geom_point()+
  geom_smooth(method = "loess", size = 1.5)+
  xlab("Age of head of the household") +
  ylab("Log of the empirical mean number in the house") +
  theme_minimal()
sumStats2 <- fHH1 %>% group_by(age, location) %>% 
  summarise(mntotal = mean(total),
            logmntotal = log(mntotal), n=n())
ggplot(sumStats2, aes(x=age, y=logmntotal, color=location,
                      linetype = location, shape = location)) +
  geom_point()+
  geom_smooth(method = "loess", se=FALSE)+
  xlab("Age of head of the household") +
  ylab("Log empirical mean household size") +
  theme_minimal()
modela = glm(total ~ age, family = poisson, data = fHH1)
coef(summary(modela))
cat(" Residual deviance = ", summary(modela)$deviance, " on ",
    summary(modela)$df.residual, "df", "\n",
    "Dispersion parameter = ", summary(modela)$dispersion)
# Wald type CI by hand
beta1hat <- summary(modela)$coefficients[2,1]
beta1se <- summary(modela)$coefficients[2,2]
beta1hat - 1.96*beta1se   # lower bound 
beta1hat + 1.96*beta1se   # upper bound 
exp(beta1hat - 1.96*beta1se)
exp(beta1hat + 1.96*beta1se)
# CI for betas using profile likelihood
confint(modela)
exp(confint(modela))
# model0 is the null/reduced model
model0 <- glm(total ~ 1, family = poisson, data = fHH1)
drop_in_dev <- anova(model0, modela, test = "Chisq")
did_print <- data.frame(ResidDF=drop_in_dev$`Resid. Df`,
    ResidDev=drop_in_dev$`Resid. Dev`,
    Deviance=drop_in_dev$Deviance, Df=drop_in_dev$Df,
    pval=drop_in_dev$`Pr(>Chi)`)
row.names(did_print) <- row.names(drop_in_dev)
did_print

Second Order Model

fHH1 <- fHH1 %>% mutate(age2 = age*age)
modela2 = glm(total ~ age + age2, family = poisson, 
              data = fHH1)
coef(summary(modela2))
cat(" Residual deviance = ", summary(modela2)$deviance, " on ",
    summary(modela2)$df.residual, "df", "\n",
    "Dispersion parameter = ", summary(modela2)$dispersion)
drop_in_dev <- anova(modela, modela2, test = "Chisq")
did_print <- data.frame(ResidDF=drop_in_dev$`Resid. Df`,
    ResidDev=drop_in_dev$`Resid. Dev`,
    Deviance=drop_in_dev$Deviance, Df=drop_in_dev$Df,
    pval=drop_in_dev$`Pr(>Chi)`)
row.names(did_print) <- row.names(drop_in_dev)
did_print
# Finding the age where the number in the house is a maximum
coefa2 = modela2$coefficients[3]
coefa = modela2$coefficients[2]
coefi = modela2$coefficients[2]
estLogNumHouse.f <- function(age){
  return(coefa2*(age)^2 + coefa*(age) + coefi)
}
optimize(estLogNumHouse.f, interval=c(20,70), maximum=TRUE)

Adding a Covariate

modela2L = glm(total ~ age + age2 + location, 
               family = poisson, data = fHH1)
coef(summary(modela2L))
cat(" Residual deviance = ", summary(modela2L)$deviance, " on ",
    summary(modela2L)$df.residual, "df", "\n",
    "Dispersion parameter = ", summary(modela2L)$dispersion)
exp(modela2L$coefficients)
drop_in_dev <- anova(modela2, modela2L, test = "Chisq")
did_print <- data.frame(ResidDF=drop_in_dev$`Resid. Df`,
    ResidDev=drop_in_dev$`Resid. Dev`,
    Deviance=drop_in_dev$Deviance, Df=drop_in_dev$Df,
    pval=drop_in_dev$`Pr(>Chi)`)
row.names(did_print) <- row.names(drop_in_dev)
did_print
modela4 <- glm(total ~ age + age2 + location + roof, 
              family = poisson, data = fHH1)
summary(modela4)

Residuals for Poisson Models (optional)

# Residual plot for the first order model
## Log scale
lfitteda = predict(modela) # log scale
lresida = resid(modela)  # linear model
lresid.df = data.frame(lfitteda,lresida)
ggplot(lresid.df,aes(x=lfitteda, y=lresida)) +
  geom_point(alpha = .25)+
  geom_smooth(method = "loess", size = 1.5, linetype = 2)+
  geom_line(y=0, size=1.5, col="red")+
  xlab("Fitted values") +
  ylab("Deviance Residuals") +
  theme_minimal()

Goodness-of-Fit

1-pchisq(modela2$deviance, modela2$df.residual)  # GOF test


sta303-bolton/sta303w8 documentation built on April 11, 2021, 12:44 p.m.