This case study is drawn from the content in Chapter 4.4 of Roback, P. & Legler, J. Beyond Multiple Linear Regression. (2021). https://bookdown.org/roback/bookdown-BeyondMLR/.
Below is the code, with minor alterations, that accompanies Chapter 4.4.
library(tidyverse) fHH1 <- read_csv( "https://raw.githubusercontent.com/proback/BeyondMLR/master/data/fHH1.csv") %>% select(-1)
glimpse(fHH1)
mean(fHH1$total) var(fHH1$total) prop.table(table(fHH1$roof)) fHH1 %>% group_by(roof) %>% summarise(mean=mean(total), sd=sd(total), var=var(total), n=n()) fHH1 %>% group_by(location) %>% summarise(mean=mean(total), sd=sd(total), var=var(total), n=n())
ggplot(fHH1, aes(total)) + geom_histogram(binwidth = .25, color = "black", fill = "white") + xlab("Number in the house excluding head of household") + ylab("Count of households") + theme_minimal()
cuts = cut(fHH1$age, breaks=c(15,20,25,30,35,40,45,50,55,60,65,70)) ageGrps <- data.frame(cuts,fHH1) ggplot(data = ageGrps, aes(x = total)) + geom_histogram(binwidth = .25, color = "black", fill = "white") + facet_wrap(cuts) + xlab("Household size") + theme_minimal()
\newpage
# Mean = Variance ? ageGrps %>% group_by(cuts) %>% summarise(mnNum= mean(total),varNum=var(total),n=n()) %>% knitr::kable( caption="Compare mean and variance of household size within each age group.", col.names = c("Age Groups", "Mean", "Variance", "n"))
\newpage
## Checking linearity assumption: Empirical log of the means plot sumStats <- fHH1 %>% group_by(age) %>% summarise(mntotal = mean(total), logmntotal = log(mntotal), n=n()) ggplot(sumStats, aes(x=age, y=logmntotal)) + geom_point()+ geom_smooth(method = "loess", size = 1.5)+ xlab("Age of head of the household") + ylab("Log of the empirical mean number in the house") + theme_minimal()
sumStats2 <- fHH1 %>% group_by(age, location) %>% summarise(mntotal = mean(total), logmntotal = log(mntotal), n=n()) ggplot(sumStats2, aes(x=age, y=logmntotal, color=location, linetype = location, shape = location)) + geom_point()+ geom_smooth(method = "loess", se=FALSE)+ xlab("Age of head of the household") + ylab("Log empirical mean household size") + theme_minimal()
modela = glm(total ~ age, family = poisson, data = fHH1)
coef(summary(modela)) cat(" Residual deviance = ", summary(modela)$deviance, " on ", summary(modela)$df.residual, "df", "\n", "Dispersion parameter = ", summary(modela)$dispersion)
# Wald type CI by hand beta1hat <- summary(modela)$coefficients[2,1] beta1se <- summary(modela)$coefficients[2,2] beta1hat - 1.96*beta1se # lower bound beta1hat + 1.96*beta1se # upper bound exp(beta1hat - 1.96*beta1se) exp(beta1hat + 1.96*beta1se)
# CI for betas using profile likelihood confint(modela) exp(confint(modela))
# model0 is the null/reduced model model0 <- glm(total ~ 1, family = poisson, data = fHH1) drop_in_dev <- anova(model0, modela, test = "Chisq")
did_print <- data.frame(ResidDF=drop_in_dev$`Resid. Df`, ResidDev=drop_in_dev$`Resid. Dev`, Deviance=drop_in_dev$Deviance, Df=drop_in_dev$Df, pval=drop_in_dev$`Pr(>Chi)`) row.names(did_print) <- row.names(drop_in_dev) did_print
fHH1 <- fHH1 %>% mutate(age2 = age*age) modela2 = glm(total ~ age + age2, family = poisson, data = fHH1)
coef(summary(modela2)) cat(" Residual deviance = ", summary(modela2)$deviance, " on ", summary(modela2)$df.residual, "df", "\n", "Dispersion parameter = ", summary(modela2)$dispersion)
drop_in_dev <- anova(modela, modela2, test = "Chisq")
did_print <- data.frame(ResidDF=drop_in_dev$`Resid. Df`, ResidDev=drop_in_dev$`Resid. Dev`, Deviance=drop_in_dev$Deviance, Df=drop_in_dev$Df, pval=drop_in_dev$`Pr(>Chi)`) row.names(did_print) <- row.names(drop_in_dev) did_print
# Finding the age where the number in the house is a maximum coefa2 = modela2$coefficients[3] coefa = modela2$coefficients[2] coefi = modela2$coefficients[2] estLogNumHouse.f <- function(age){ return(coefa2*(age)^2 + coefa*(age) + coefi) } optimize(estLogNumHouse.f, interval=c(20,70), maximum=TRUE)
modela2L = glm(total ~ age + age2 + location, family = poisson, data = fHH1)
coef(summary(modela2L)) cat(" Residual deviance = ", summary(modela2L)$deviance, " on ", summary(modela2L)$df.residual, "df", "\n", "Dispersion parameter = ", summary(modela2L)$dispersion)
exp(modela2L$coefficients)
drop_in_dev <- anova(modela2, modela2L, test = "Chisq")
did_print <- data.frame(ResidDF=drop_in_dev$`Resid. Df`, ResidDev=drop_in_dev$`Resid. Dev`, Deviance=drop_in_dev$Deviance, Df=drop_in_dev$Df, pval=drop_in_dev$`Pr(>Chi)`) row.names(did_print) <- row.names(drop_in_dev) did_print
modela4 <- glm(total ~ age + age2 + location + roof, family = poisson, data = fHH1) summary(modela4)
# Residual plot for the first order model ## Log scale lfitteda = predict(modela) # log scale lresida = resid(modela) # linear model lresid.df = data.frame(lfitteda,lresida) ggplot(lresid.df,aes(x=lfitteda, y=lresida)) + geom_point(alpha = .25)+ geom_smooth(method = "loess", size = 1.5, linetype = 2)+ geom_line(y=0, size=1.5, col="red")+ xlab("Fitted values") + ylab("Deviance Residuals") + theme_minimal()
1-pchisq(modela2$deviance, modela2$df.residual) # GOF test
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.