library(LM2GLMM) library(doSNOW) ## for load balancing in spaMM options(width = 110) knitr::opts_chunk$set(cache = FALSE, cache.path = "./cache_knitr/GLM_intro/", fig.path = "./fig_knitr/GLM_intro/", fig.width = 5, fig.height = 5, fig.align = "center", error = TRUE) spaMM::spaMM.options(nb_cores = parallel::detectCores() - 1L)
r .emo("goal")
r .emo("info")
$\eta_i = \beta_0 + \beta_1 \times x_{1,i} + \beta_2 \times x_{2,i} + \dots + \beta_{p} \times x_{p,i}$ or in matrix notation $\eta = \text{X}\beta$ ,
with:
We call:
The GLM fit leads to $\text{Y} = g^{-1}(\eta) + \epsilon = g^{-1}(\text{X}\beta) + \epsilon = g^{-1}(\widehat{\eta})+ \varepsilon = g^{-1}(\text{X}\widehat{\beta}) + \varepsilon$
Here the errors can have a distribution other than Gaussian.
r .emo("info")
$\eta_i = \beta_0 + \beta_1 \times x_{1,i} + \beta_2 \times x_{2,i} + \dots + \beta_{p} \times x_{p,i}$ or in matrix notation $\eta = \text{X}\beta$ ,
with:
This is identical to the LM if:
So in GLM, the probability distribution of $\text{Y}$ can be Gaussian.
r .emo("info")
r .emo("nerd")
This includes:
r .emo("info")
The probability distribution and the link function are obtained in R by calling the family
functions:
gaussian(link = "identity") binomial(link = "logit") poisson(link = "log")
Note: the objects of class family
created by these functions also contain other useful information needed for the fitting procedure.
r .emo("alien")
set.seed(1L) Aliens <- data.frame(humans_eaten = round(runif(n = 20, min = 0, max = 15))) Aliens$size <- rnorm( n = 20, mean = 50 + 1.5 * Aliens$humans_eaten, sd = 5) Aliens$eggs <- rpois( n = 20, lambda = exp(-1 + 0.1 * Aliens$humans_eaten)) Aliens$happy <- rbinom(n = 20, size = 1, prob = plogis(-3 + 0.3 * Aliens$humans_eaten)) Aliens$all_eyes <- round(runif(nrow(Aliens), min = 1, max = 12)) Aliens$blue_eyes <- rbinom(n = nrow(Aliens), size = Aliens$all_eyes, prob = plogis(-2 + 0.5 * Aliens$humans_eaten)) Aliens$pink_eyes <- Aliens$all_eyes - Aliens$blue_eyes head(Aliens)
r .emo("alien")
Or, using the {LM2GLMM} function:
set.seed(1L) Aliens <- simulate_Aliens_GLM(N = 20) head(Aliens)
The true parameter values are stored as attributes if you need them r .emo("nerd")
attributes(Aliens)$param.eta$blue_eyes ## to get back the parameters
r .emo("practice")
fit_gauss_lm <- lm(size ~ humans_eaten, data = Aliens) fit_gauss <- glm(size ~ humans_eaten, family = gaussian(), data = Aliens) fit_poiss <- glm(eggs ~ humans_eaten, family = poisson(), data = Aliens) fit_binar <- glm(happy ~ humans_eaten, family = binomial(), data = Aliens) fit_binom <- glm(cbind(blue_eyes, pink_eyes) ~ humans_eaten, family = binomial(), data = Aliens) library(spaMM) fit_gauss_spaMM_ML <- fitme(size ~ humans_eaten, family = gaussian(), data = Aliens) fit_poiss_spaMM_ML <- fitme(eggs ~ humans_eaten, family = poisson(), data = Aliens) fit_binar_spaMM_ML <- fitme(happy ~ humans_eaten, family = binomial(), data = Aliens) fit_binom_spaMM_ML <- fitme(cbind(blue_eyes, pink_eyes) ~ humans_eaten, family = binomial(), data = Aliens) fit_gauss_spaMM_REML <- fitme(size ~ humans_eaten, family = gaussian(), data = Aliens, method = "REML") fit_poiss_spaMM_REML <- fitme(eggs ~ humans_eaten, family = poisson(), data = Aliens, method = "REML") fit_binar_spaMM_REML <- fitme(happy ~ humans_eaten, family = binomial(), data = Aliens, method = "REML") fit_binom_spaMM_REML <- fitme(cbind(blue_eyes, pink_eyes) ~ humans_eaten, family = binomial(), data = Aliens, method = "REML")
stats::glm()
r .emo("practice")
fit_poiss
spaMM::fitme()
r .emo("practice")
fit_poiss_spaMM_ML
r .emo("practice")
#EXAMPLE 1: Wrangle binomial data to be used as count data #Model total number of eyes, not proportion of blue eyes Aliens$CountEyes <- Aliens$blue_eyes + Aliens$pink_eyes glm(CountEyes ~ humans_eaten, data = Aliens, family = poisson(link = "log"))
#EXAMPLE 2: Wrangle count data to be used as binary data #Model whether aliens had 0 or >0 eggs, rather than modelling egg count Aliens$hadeggs <- Aliens$eggs > 0 glm(hadeggs ~ humans_eaten, data = Aliens, family = binomial(link = "logit"))
#EXAMPLE 3: Wrangle quantitative predictor to be used as qualitative predictor #Treat size as three categories #Here 'small' is less than 25th percentile, 'big' is greater than 75th percentile Aliens$size_cat <- ifelse(Aliens$size < 60, "small", ifelse(Aliens$size > 68, "big", "medium")) glm(hadeggs ~ size_cat, data = Aliens, family = binomial(link = "logit"))
r .emo("info")
A lot of functions that will be used by stats::glm()
and friends:
str(binomial(link = "logit"))
Note: we will explore the 2 most useful components.
link
& linkfun
) r .emo("practice")
The link function converts the values from the scale of the response to the one of the linear predictor.
binomial(link = "logit")$linkfun(0.9) ## the default log(0.9/(1 - 0.9))
binomial(link = "probit")$linkfun(0.9) qnorm(0.9)
Note: when things get complex, to convert the values from the scale of the response to the one of the linear predictor, use linkfun()
from the family object instead of trying to do it by hand.
linkinv
) r .emo("practice")
The inverse link function converts the values of the linear predictor back into the scale of the response.
binomial(link = "logit")$linkinv(2.197225) plogis(2.197225) 1/(1 + exp(-2.197225))
Note: when things get complex, to convert the values from the scale of the linear predictor to the one of the response, use linkinv()
from the family object instead of trying to do it by hand.
r .emo("info")
We want to find the $\widehat{\beta}$ maximizing the likelihood.
This is the same as finding the $\widehat{\beta}$ minimizing the residual deviance.
Residual deviance = $-2 \times (\text{logLik}\text{focal fit} - \text{logLik}\text{staturated fit})$
r .emo("info")
my_glm1 <- function(formula, data, family, weights = NULL) { useful.data <- model.frame(formula = formula, data = data) ## keep only useful rows and columns X <- model.matrix(object = formula, data = useful.data) ## build design matrix y <- model.response(useful.data) ## extract response nobs <- NROW(y) ## count the rows of observations (not nrow() as y can be a vector or a matrix) if (is.null(weights)) weights <- rep(1, nobs) ## set the prior.weights if NULL etastart <- start <- mustart <- NULL ## required for family$initialize eval(family$initialize) ## recomputes y and prior weights if binomial + cbind get_resid_deviance <- function(coef) { ## define function computing the unscaled residual deviance eta <- as.numeric(X %*% coef) ## compute the linear predictors mu <- family$linkinv(eta) ## express the linear predictor in the scale of the response sum(family$dev.resids(y, mu, weights)) ## compute the deviance. Here: prior weights are used! } fit <- nlminb(rep(0, ncol(X)), get_resid_deviance) ## optimisation; same as optim() but seems to work better if (fit$convergence != 0) warning("the algorithm did not converge") ## test if convergence has been reached fit$par ## returns estimates }
Note: it should handle all families but it does not consider offsets, convergence issues and maybe other sources of complexity.
r .emo("info")
unlike LM there is no analytic solution to directly obtain maximum likelihood estimates (MLE)!
The function glm.fit()
fits GLM using an iterative algorithm based on weighted least squares to update parameter estimates.
The working weights (not to be mixed up with prior weights) are inversely proportional to the variance of the response, but the variance depends on the mean which, in turn, depends on the parameters.
The algorithm fixes these weights, determines the parameter values that minimize the weighted sum of squared residuals, then updates the weights and repeats the process until the weights stabilize (it usually happens in very few iterations).
This algorithm is thus called IRLS for iteratively re-weighted least squares, it is also sometimes called Fisher's scoring which has been proposed by Fisher (in a context more general than GLM).
r .emo("nerd")
my_glm2 <- function(formula, data, family, weights = NULL) { useful.data <- model.frame(formula = formula, data = data) ## keep only useful rows and columns X <- model.matrix(object = formula, data = useful.data) ## build design matrix y <- model.response(useful.data) ## extract response nobs <- NROW(y) ## count the rows of observations (not nrow() as y can be a vector or a matrix) if (is.null(weights)) weights <- rep(1, nobs) ## set the prior.weights if NULL etastart <- start <- mustart <- NULL ## required for family$initialize eval(family$initialize) ## creates mustart -- the initial values for mu -- and ## recomputes y and weights if binomial + cbind eta <- family$linkfun(mustart) ## compute initial values for eta from mustart for (iter in 1:25L) { mu <- family$linkinv(eta) ## compute the fitted values deriv <- family$mu.eta(eta) ## compute the derivative d mu/d eta w <- (y - mu)/deriv ## compute the working residuals Z <- eta + w ## compute the adjusted response W <- weights*deriv^2/family$variance(mu) ## compute the working weights lm.fit <- lm(Z ~ X - 1, weights = W) ## lm with weights beta <- lm.fit$coef ## extract the estimates eta <- X %*% beta ## compute the linear predictor } beta ## returns estimates }
r .emo("practice")
The algorithm usually converges quickly:
foo <- glm(size ~ humans_eaten, data = Aliens, family = gaussian(), control = list(trace = TRUE)) foo <- glm(eggs ~ humans_eaten, data = Aliens, family = poisson(), control = list(trace = TRUE))
Note: check other options for the iterative procedure in ?glm.control
glm.fit()
mockup r .emo("nerd")
r .emo("nerd")
r .emo("nerd")
my_glm3 <- function(formula, data, family, weights = NULL) { useful.data <- model.frame(formula = formula, data = data) ## keep only useful rows and columns X <- model.matrix(object = formula, data = useful.data) ## build design matrix y <- model.response(useful.data) ## extract response nobs <- NROW(y) ## count the rows of observations (not nrow() as y can be a vector or a matrix) if (is.null(weights)) weights <- rep(1, nobs) ## set the prior.weights if NULL etastart <- start <- mustart <- NULL ## required for family$initialize eval(family$initialize) ## creates mustart -- the initial values for mu -- and ## recomputes y and weights if binomial + cbind eta <- family$linkfun(mustart) ## compute initial values for eta from mustart devold <- Inf ## set old residual deviance | glm.fit does something more clever here for (iter in 1:25L) { mu <- family$linkinv(eta) ## compute the fitted values deriv <- family$mu.eta(eta) ## compute the derivative d mu/d eta w <- (y - mu)/deriv ## compute the working residuals Z <- eta + w ## compute the adjusted response W <- weights*deriv^2/family$variance(mu) ## compute the working weights dev <- sum(family$dev.resids(y, mu, weights)) ## compute residual deviance if (abs(dev - devold)/(0.1 + abs(dev)) < 1e-7) break ## leave loop if it has converged devold <- dev ## update residual deviance QR <- qr(X*sqrt(W)) ## perform the QR decomposition | glm.fit uses C_Cdqrls directly. beta <- qr.coef(QR, Z*sqrt(W)) ## compute coefficients | glm.fit uses C_Cdqrls directly. eta <- as.numeric(X %*% beta) ## compute the predicted values on the scale of the linear predictor } if (iter == 25L) warning("the algorithm did not converge after 25 iterations...") beta ## returns estimates }
r .emo("broken")
test <- data.frame(x = c(8.752, 20.27, 24.71, 32.88, 27.27, 19.09), y = c(5254, 35.92, 84.14, 641.8, 1.21, 47.2)) glm(y ~ x, data = test, family = Gamma(link = "log"), control = list(trace = TRUE))
r .emo("practice")
You may try to play with parameters of the iterative algorithm...
glm(y ~ x, data = test, family = Gamma(link = "log"), start = c(10, -0.1), control = list(epsilon = 1e-3, trace = TRUE))
... but unless you really know what you are doing (& I don't), you risk to make the optimisation algorithm stop prematurely without warning.
r .emo("practice")
You may also try other fitting methods (safer & easier):
library(spaMM) ## you need to load spaMM explicitly for that glm(y ~ x, data = test, family = Gamma(link = "log"), method = "spaMM_glm.fit")
r .emo("practice")
You may also try other fitting functions (which is equivalent to selecting alternative fitting methods):
fitme(y ~ x, data = test, family = Gamma(link = "log"))
r .emo("practice")
Just for fun let's try our toy functions too:
my_glm1(y ~ x, data = test, family = Gamma(link = "log")) ## Yay! It works my_glm2(y ~ x, data = test, family = Gamma(link = "log")) ## Silly results my_glm3(y ~ x, data = test, family = Gamma(link = "log")) ## Crash
r .emo("info")
If the process generating the data is not Gaussian fits will be more difficult...
manyAliens <- simulate_Aliens_GLM(N = 1000) fit_lm <- lm(log(eggs) ~ humans_eaten, data = manyAliens) ## fails due to log(0)
...and parameter estimates may be substantially biased:
fit_lm2 <- lm(log(eggs+1) ~ humans_eaten, data = manyAliens) ## runs since no longer 0s fit_glm <- glm(eggs ~ humans_eaten, family = poisson(), data = manyAliens) rbind(ground_truth = attr(manyAliens, "param.eta")$eggs, GLM_Poisson = fit_glm$coefficients, LM = fit_lm2$coefficients)
r .emo("info")
If the process generating the data is not Gaussian the assumptions for LM will also often not be met:
par(mfrow = c(1, 4)) plot(fit_lm2)
r .emo("info")
set.seed(1L) x <- seq(1, 2, length = 100) simu <- replicate(100, { y <- exp(rnorm(n = length(x), mean = 2 + 1 * x, sd = 2)) ## exp(mean_resp + error) fit_lm <- lm(log(y) ~ x) fit_glm <- glm(y ~ x, family = gaussian(link = "log")) c(coef(fit_lm), coef(fit_glm)) }) round(rbind( ground_truth = c("mean int" = 2, "sd int" = NA, "mean slope" = 1, "sd slope" = NA), LM = c("mean int" = mean(simu[1, ]), "sd int" = sd(simu[1, ]), "mean slope" = mean(simu[2, ]), "sd slope" = sd(simu[2, ])), GLM = c(mean(simu[3, ]), sd(simu[3, ]), mean(simu[4, ]), sd(simu[4, ]))), 3)
Note: in fit_glm
the expected values are log transformed, not the residuals! So the estimates produced by this fit are biased.
r .emo("info")
set.seed(1L) x <- seq(1, 2, length = 100) simu <- replicate(100, { y <- rnorm(n = length(x), mean = exp(2 + 1 * x), sd = 2) ## exp(mean_resp) + error fit_lm <- lm(log(y) ~ x) fit_glm <- glm(y ~ x, family = gaussian(link = "log")) c(coef(fit_lm), coef(fit_glm)) }) round(rbind( ground_truth = c("mean int" = 2, "sd int" = NA, "mean slope" = 1, "sd slope" = NA), LM = c("mean int" = mean(simu[1, ]), "sd int" = sd(simu[1, ]), "mean slope" = mean(simu[2, ]), "sd slope" = sd(simu[2, ])), GLM = c(mean(simu[3, ]), sd(simu[3, ]), mean(simu[4, ]), sd(simu[4, ]))), 3)
Note: here fit_glm
corresponds to the simulation model, so the estimates produced are better than the alternatives.
r .emo("practice")
We can compare deviances (smaller = better), but not directly:
set.seed(1L) x <- seq(1, 2, length = 100) y <- rnorm(n = length(x), mean = exp(2 + 1 * x), sd = 5) fit_glm <- glm(y ~ x, family = gaussian(link = "log")) fit_lm <- lm(log(y) ~ x) rbind("deviance GLM" = deviance(fit_glm), "raw deviance LM" = deviance(fit_lm), ## not comparable as response = log(y) "corrected deviance LM" = sum((y - exp(predict(fit_lm)))^2)) ## comparable deviance for lm
Note: you cannot use AIC in this case since the responses differ!
r .emo("practice")
stats::glm()
), e.g.:predict(fit_binom, newdata = data.frame(humans_eaten = 1:5), type = "link")
Note: the numbers correspond here to logits since it is the link function used in this fit.
predict(fit_binom, newdata = data.frame(humans_eaten = 1:5), type = "response")
Note: the numbers correspond here to probabilities since it is a binomial fit.
r .emo("info")
library(ggplot2) p_link <- predict(fit_binom, newdata = data.frame(humans_eaten = 1:5), type = "link") p_resp <- predict(fit_binom, newdata = data.frame(humans_eaten = 1:5), type = "response") logit_data <- data.frame(x = x <- seq(-4, 4, length = 100), y = plogis(x)) pred_data <- as.data.frame(cbind(humans_eaten = 1:5, link = p_link, response = p_resp)) pred_data_expanded <- data.frame(x = x <- seq(0, 10, length = 100), y = predict(fit_binom, newdata = data.frame(humans_eaten = x), type = "response")) ggplot() + geom_abline(slope = coef(fit_binom)["humans_eaten"][[1]], intercept = coef(fit_binom)["(Intercept)"][[1]], linetype = "dashed") + geom_point(aes(y = link, x = humans_eaten, colour = factor(humans_eaten)), data = pred_data, size = 2) + scale_x_continuous(limits = c(0, 10), breaks = 0:10) + scale_y_continuous(limits = c(-4, 4), breaks = seq(-4, 4, by = 2)) + labs(y = "prediction (link scale)", x = "humans eaten", colour = "humans eaten") + theme_bw() -> p1 ggplot() + geom_segment(aes(y = response, yend = response, xend = -4, x = link), data = pred_data, lty = 1, size = 0.25, arrow = arrow(length = unit(3, "mm"))) + geom_segment(aes(y = response, yend = -Inf, x = link, xend = link), data = pred_data, lty = 1, size = 0.25) + geom_text(aes(y = response + 0.05, x = -2, label = "Inverse\nlink function"), data = pred_data[5, ], size = 3.5) + geom_line(aes(y = y, x = x), data = logit_data, linetype = "dashed") + geom_point(aes(y = response, x = link, colour = factor(humans_eaten)), data = pred_data, size = 2) + labs(y = "prediction (response scale)", x = "prediction (link scale)", colour = "humans eaten") + scale_x_continuous(expand = c(0, 0)) + theme_bw() -> p2 ggplot() + geom_line(aes(y = y, x = x), data = pred_data_expanded, linetype = "dashed") + geom_point(aes(y = response, x = humans_eaten, colour = factor(humans_eaten)), data = pred_data, size = 2) + scale_x_continuous(limits = c(0, 10), breaks = 0:10) + labs(y = "prediction (response scale)", x = "humans eaten", colour = "humans eaten") + scale_y_continuous(limits = c(0, 1)) + theme_bw() -> p3 library(patchwork) p1 + p2 + p3 + plot_layout(guides = "collect")
r .emo("recap")
The predictions on the linear scale work just as we saw it for LMs.
How do we interpret the coefficients on the linear scale?
fit_binom
r .emo("info")
Doing the right thing can be tricky (even when a single predictor is involved)...
Because on the response scale the prediction associated with a mean value is not equal to the mean of predictions associated with each values.
Aliens2 <- data.frame(humans_eaten = rpois(n = 1000, lambda = 4)) ## new data making difference obvious Aliens2$happy <- rbinom(n = 1000, size = 1, prob = plogis(-3 + 0.3 * Aliens2$humans_eaten)) fit_binar2 <- glm(happy ~ humans_eaten, family = binomial(), data = Aliens2)
predict(fit_binar2, newdata = data.frame(humans_eaten = mean(Aliens2$humans_eaten)), type = "response")[[1]] mean(predict(fit_binar2, newdata = data.frame(humans_eaten = Aliens2$humans_eaten), type = "response"))
Note: considering a predictor at its mean no longer gives the mean expectation.
r .emo("practice")
Let's illustrate the issue on a more realistic model fit:
fit_titanic <- glm(survived ~ sex*age*passengerClass, data = TitanicSurvival, family = binomial())
newdata1 <- expand.grid(sex = levels(model.frame(fit_titanic)$sex), passengerClass = levels(model.frame(fit_titanic)$passengerClass), age = mean(model.frame(fit_titanic)$age)) preds1 <- predict(fit_titanic, newdata = newdata1, type = "response") cbind(newdata1, wrong_survival = preds1)
r .emo("practice")
Let's illustrate the issue on a more realistic model fit:
fit_titanic <- glm(survived ~ sex*age*passengerClass, data = TitanicSurvival, family = binomial())
newdata_focal <- expand.grid(sex = levels(model.frame(fit_titanic)$sex), passengerClass = levels(model.frame(fit_titanic)$passengerClass)) newdata_nonfocal <- data.frame(age = model.frame(fit_titanic)$age) newdata2 <- merge(newdata_focal, newdata_nonfocal) preds2 <- cbind(newdata2, survival_pdep = predict(fit_titanic, newdata = newdata2, type = "response")) output_pdep <- aggregate(survival_pdep ~ sex + passengerClass, data = preds2, FUN = mean) output_pdep$wrong_survival <- preds1 output_pdep
r .emo("practice")
Let's illustrate the issue on a more realistic model fit:
fit_titanic <- glm(survived ~ sex*age*passengerClass, data = TitanicSurvival, family = binomial())
newdata3 <- aggregate(age ~ sex + passengerClass, data = model.frame(fit_titanic), FUN = identity) #Mean prediction across predictor values. Correct newdata3 <- tidyr::unnest(newdata3, cols = age) ## unnest the list column storing ages preds3 <- cbind(newdata3, survival_cond = predict(fit_titanic, newdata = newdata3, type = "response")) output_cond <- aggregate(survival_cond ~ sex + passengerClass, data = preds3, FUN = mean) output_cond$survival_pdep <- output_pdep$survival output_cond$wrong_survival <- preds1 output_cond
r .emo("practice")
Things can be easier with {spaMM} provided that:
fit_titanic_spaMM <- fitme(survived ~ sex*age*passengerClass, data = TitanicSurvival, family = binomial())
pdep_effects(fit_titanic_spaMM, focal_var = "passengerClass")
r .emo("practice")
Things can be easier with {spaMM} provided that:
fit_titanic_spaMM <- fitme(survived ~ sex*age*passengerClass, data = TitanicSurvival, family = binomial())
Doing the same by hand:
newdata1 <- newdata2 <- newdata3 <- model.frame(fit_titanic_spaMM) newdata1["passengerClass"] <- "1st" newdata2["passengerClass"] <- "2nd" newdata3["passengerClass"] <- "3rd" newdata <- rbind(newdata1, newdata2, newdata3) preds <- cbind(newdata, survival = predict(fit_titanic_spaMM, newdata = newdata)) aggregate(survival ~ passengerClass, data = preds, FUN = mean)
r .emo("recap")
r .emo("info")
coef(fit_poiss) predict(fit_poiss, newdata = data.frame(humans_eaten = 10)) #Linear predictor scale by default coef(fit_poiss)[1] + 10 * coef(fit_poiss)[2]
Because we used the link function log
, this is a log number of eggs!
r .emo("info")
coef(fit_poiss) predict(fit_poiss, newdata = data.frame(humans_eaten = 10), type = "response") exp(coef(fit_poiss)[1] + 10 * coef(fit_poiss)[2])
This is the average number of eggs predicted for aliens eating 10 humans.
r .emo("info")
The model parameters are linear for $\eta$, not $\mu$!
What happens when different aliens eat one additional human?
(p.eta <- predict(fit_poiss, newdata = data.frame(humans_eaten = 1:5))) #Linear predictor scale by default diff(p.eta)
Note: the effect is constant at the scale of the linear predictor.
r .emo("info")
The model parameters are linear for $\eta$, not $\mu$!
What happens when different aliens eat one additional human?
(p.mu <- predict(fit_poiss, newdata = data.frame(humans_eaten = 1:5), type = "response")) diff(p.mu)
Note: the effect is NOT constant at the scale of the response.
r .emo("practice")
Ratios are the way out:
p.mu[-1] / p.mu[-5] exp(coef(fit_poiss)[2])
Eating one additional human increases the average number of eggs produced by aliens by r exp(fit_poiss$coefficients[2])
times, which is equivalent to r 100*exp(fit_poiss$coefficients[2])-100
%.
Note: if you change the link function, the recipe will be different!
r .emo("info")
(p.mu <- predict(fit_binom, newdata = data.frame(humans_eaten = 1:5), type = "response")) diff(p.mu)
Note: the effect is also not constant at the scale of the response!
r .emo("practice")
Odds ratios are the way out:
odds <- p.mu / (1 - p.mu) odds[-1] / odds[-5]
The effect is constant on the scale of the odds ratios!
And the odds ratios can also be directly deduced from the parameter estimates:
exp(coef(fit_binom)[2])
Eating one additional human increases the odd of aliens having blue eyes by r round(exp(fit_binom$coefficients[2]), 2)
times.
r .emo("practice")
What happens when different aliens eat 10 additional humans?
p.mu <- predict(fit_binom, newdata = data.frame(humans_eaten = c(0, 10, 20, 30, 40)), type = "response") odd <- p.mu / (1 - p.mu) odd[-1] / odd[-5]
It is also constant and predicted from:
exp(fit_binom$coefficients[2] * 10)
Note: as odds-ratios are ratios, a value of 0.1 means a 10 (=1/0.1) fold reduction in odds.
r .emo("info")
summary(fit_gauss)
r .emo("recap")
summary(fit_gauss)$coefficients summary(fit_gauss_lm)$coefficients
r .emo("practice")
Example: Poisson
summary(fit_poiss)$coefficients z <- (coef(fit_poiss) - 0) / sqrt(diag(vcov(fit_poiss))) pvalues <- 2 * pnorm(abs(z), lower.tail = FALSE) cbind(z, pvalues)
summary(fit_poiss_spaMM_ML, details = c(p_value = "Wald"), verbose = FALSE)$beta_table ## same with spaMM
Note: when the dispersion parameter is not estimated, summary
uses $z$, not $t$. Yet, these so-called "Wald tests" are not very reliable and it is better to rely on CI build using likelihood profiling or parametric bootstrap.
r .emo("info")
The parameter estimates are asymptotically normally distributed as in LM, but for non-Gaussian families small sample sizes deviate strongly from the asymptotic case. Example with large N:
new_betas <- t(replicate(1000, update(fit_binar, data = simulate_Aliens_GLM(N = 1000))$coef))
r .emo("warn")
The parameter estimates are asymptotically normally distributed as in LM, but for non-Gaussian families small sample sizes deviate strongly from the asymptotic case. Example with small N:
new_betas <- t(replicate(1000, update(fit_binar, data = simulate_Aliens_GLM(N = 10))$coef))
Note: this is why the so-called Wald tests shown earlier are not great...
r .emo("info")
For gaussian()
(and quasiXXX()
) the F-test is the one you should use (as we have seen it with LM):
car::Anova(fit_gauss, test = "F")
r .emo("info")
r .emo("proof")
What about a F-test?
set.seed(1L) N <- 20 pv <- replicate(1000, { d <- data.frame(eggs = rpois(N, lambda = 0.5), mood = factor(sample(c("depr.", "happy", "upset", "tired", "anxious"), size = N, replace = TRUE))) fit_poiss_1 <- glm(eggs ~ mood, family = poisson(), data = d) c(F = car::Anova(fit_poiss_1, test = "F")["mood", "Pr(>F)"], LR = car::Anova(fit_poiss_1, test = "LR")["mood", "Pr(>Chisq)"]) }) rbind(mean(pv["F", ] <= 0.05), mean(pv["LR", ] <= 0.05))
Note: the F-test performs very badly here, but the LRT is better.
r .emo("practice")
anova(update(fit_poiss, . ~ 1), fit_poiss, test = "LRT") car::Anova(fit_poiss, test = "LR") LR <- -2 * (logLik(update(fit_poiss, . ~ 1)) - logLik(fit_poiss)) pvalue <- pchisq(LR, df = 1, lower.tail = FALSE) cbind(LR, pvalue) ## the same
r .emo("practice")
You can compare model fits this way using {spaMM}:
fit_poiss_spaMM0 <- update(fit_poiss_spaMM_ML, . ~ 1) res <- anova(fit_poiss_spaMM_ML, fit_poiss_spaMM0, boot.repl = 1000)
res
r .emo("practice")
You can also compare model fits this way using {pbkrtest} for stats::glm()
fits:
fit_poiss_0 <- update(fit_poiss, . ~ 1) res <- pbkrtest::PBmodcomp(fit_poiss, fit_poiss_0, nsim = 1000, seed = 123)
res
r .emo("nerd")
Let us compare the 2 versions of the LRT using parametric bootstrap r .emo("slow")
r .emo("slow")
r .emo("slow")
set.seed(1L) N <- 20 pv_boot <- replicate(1000, { d <- data.frame(eggs = rpois(N, lambda = 0.5), mood = factor(sample(c("depr.", "happy", "upset", "tired", "anxious"), size = N, replace = TRUE))) fit_poiss_1 <- fitme(eggs ~ mood, family = poisson(), data = d) fit_poiss_1_null <- fitme(eggs ~ 1, family = poisson(), data = d) boot <- anova(fit_poiss_1, fit_poiss_1_null, boot.repl = 200) c(raw = boot$rawBootLRT$p_value, Bart = boot$BartBootLRT$p_value) })
rbind(mean(pv_boot["Bart", ] <= 0.05), mean(pv_boot["raw", ] <= 0.05))
Note: applying the Bartlett correction is not as accurate (but it should become best if boot.repl
is small).
r .emo("info")
Let us compare the performances of all 4 tests (a perfect test would follow the dashed red line)
par(las = 1) plot(ecdf(pv["F", ]), col = "blue", cex = 0.5, main = "") plot(ecdf(pv["LR", ]), col = "green", add = TRUE, cex = 0.5) plot(ecdf(pv_boot["Bart", ]), col = "orange", add = TRUE, cex = 0.5) plot(ecdf(pv_boot["raw", ]), col = "red", add = TRUE, cex = 0.5) legend("topleft", fill = c("blue", "green", "orange", "red"), legend = c("F", "LR", "Param boot Bartlett", "Param boot raw"), bty = "n") abline(0, 1, col = "black", lty = 2, lwd = 2)
r .emo("info")
Let us compare the performances of all 4 tests (a perfect test would follow the dashed red line)
par(las = 1) plot(ecdf(pv["F", ]), xlim = c(0, 0.1), ylim = c(0, 0.1), col = "blue", cex = 0.5, main = "", xaxt = "n") plot(ecdf(pv["LR", ]), col = "green", add = TRUE, cex = 0.5, xaxt = "n") plot(ecdf(pv_boot["Bart", ]), col = "orange", add = TRUE, cex = 0.5, xaxt = "n") plot(ecdf(pv_boot["raw", ]), col = "red", add = TRUE, cex = 0.5, xaxt = "n") axis(1, at = seq(0, 0.1, 0.025)) abline(v = 0.05, lty = 3, lwd = 0.75) legend("topleft", fill = c("blue", "green", "orange", "red"), legend = c("F", "LR", "Param boot Bartlett", "Param boot raw"), bty = "n") abline(0, 1, col = "black", lty = 2, lwd = 2)
Note: here the F-test and asymptotic LRT are anticonservative r .emo("warn")
r .emo("practice")
For the Gaussian family, do as in LM! (Use the quantiles from the Student's distribution or confint()
).
For families with non-estimated variances (e.g. binomial, poisson...), CI based on the Gaussian or the Student's t distributions do not work well and instead you should build CI by likelihood profiling or parametric bootstrap.
If {MASS} is installed, calling confint()
on a model fitted with stats::glm()
directly does build CI by likelihood profiling:
confint(fit_poiss)
r .emo("practice")
For the Gaussian family, do as in LM! (Use the quantiles from the Student's distribution or confint()
).
For families with non-estimated variances (e.g. binomial, poisson...), CI based on the Gaussian or the Student's t distributions do not work well and instead you should build CI by likelihood profiling or parametric bootstrap.
For model fitted with spaMM::fitme()
, calling confint()
also performs likelihood profiling:
confint(fit_poiss_spaMM_ML, parm = "(Intercept)") confint(fit_poiss_spaMM_ML, parm = "humans_eaten")
r .emo("practice")
For the Gaussian family, do as in LM! (Use the quantiles from the Student's distribution or confint()
).
For families with non-estimated variances (e.g. binomial, poisson...), CI based on the Gaussian or the Student's t distributions do not work well and instead you should build CI by likelihood profiling or parametric bootstrap.
For model fitted with spaMM::fitme()
, calling confint()
can also be used to compute CI using parametric bootstrap if the argument boot_args
is correctly set:
res_boot <- confint(fit_poiss_spaMM_ML, parm = "humans_eaten", boot_args = list(nsim = 1000))
r .emo("practice")
res_boot res_boot$normal ## do not use the other types here!
r .emo("practice")
Which method is best depends on the particular circumstances, but you can (as always) test for yourself using simulations r .emo("slow")
r .emo("slow")
r .emo("slow")
set.seed(1L) N <- 20 coverage_small_list <- replicate(1000, { d <- simulate_Aliens_GLM(N = N) fit_poiss_spaMM_ML <- fitme(eggs ~ humans_eaten, family = poisson(), data = d) ref <- attr(d, "param.eta")$eggs["slope"] ## 0.1 prof <- confint(fit_poiss_spaMM_ML, parm = "humans_eaten") boot <- confint(fit_poiss_spaMM_ML, parm = "humans_eaten", boot_args = list(nsim = 200)) c(prof = sum(findInterval(ref, prof$interval)) == 1, boot_norm = sum(findInterval(ref, boot$normal[2:3])) == 1) }, simplify = FALSE)
apply((do.call("rbind", coverage_small_list)), 2, mean, na.rm = TRUE)
Note: here the construction of CI by likelihood profiling behaves best.
r .emo("practice")
There is no longer a direct method offered by predict()
for models fitted with stats::glm()
:
newdata <- data.frame(humans_eaten = 0:1) pred <- predict(fit_poiss, newdata = newdata, se.fit = TRUE) lwr_eta <- pred$fit + qnorm(0.025) * pred$se.fit upr_eta <- pred$fit + qnorm(0.975) * pred$se.fit cbind(pred = family(fit_poiss)$linkinv(pred$fit), lwr = family(fit_poiss)$linkinv(lwr_eta), upr = family(fit_poiss)$linkinv(upr_eta))
Notes:
don't use type = "response"
in predict()
because transformations need to happen after CI are computed!
for proper predictions involving more predictors, better use pdep_effects()
on a model fitted with {spaMM} r .emo("recap")
r .emo("practice")
CIs are assumed symmetrical (i.e. Gaussian) on the linear predictor scale but not on the response scale.
newdata <- data.frame(humans_eaten = 0:1) pred_wrong <- predict(fit_poiss, newdata = newdata, se.fit = TRUE, type = "response") lwr_eta <- pred_wrong$fit + qnorm(0.025) * pred_wrong$se.fit ## Wrong: for example purposes only upr_eta <- pred_wrong$fit + qnorm(0.975) * pred_wrong$se.fit ## Wrong: for example purposes only cbind(pred = pred_wrong$fit, lwr = lwr_eta, upr = upr_eta)
r .emo("practice")
CIs are assumed symmetrical (i.e. Gaussian) on the linear predictor scale but not on the response scale.
newdata <- data.frame(humans_eaten = 0:1) #Correct method. CIs on linear predictor scale pred <- predict(fit_poiss, newdata = newdata, se.fit = TRUE) lwr_eta <- pred$fit + qnorm(0.025) * pred$se.fit upr_eta <- pred$fit + qnorm(0.975) * pred$se.fit plot_data <- data.frame(humans_eaten = 0:1, eggs = family(fit_poiss)$linkinv(pred$fit), lwr = family(fit_poiss)$linkinv(lwr_eta), upr = family(fit_poiss)$linkinv(upr_eta), method = "link") #Wrong method. CIs on response scale pred_wrong <- predict(fit_poiss, newdata = newdata, se.fit = TRUE, type = "response") lwr_eta <- pred_wrong$fit + qnorm(0.025) * pred_wrong$se.fit upr_eta <- pred_wrong$fit + qnorm(0.975) * pred_wrong$se.fit plot_data <- rbind(plot_data, data.frame(humans_eaten = 0:1, eggs = pred_wrong$fit, lwr = lwr_eta, upr = upr_eta, method = "response")) ggplot(data = plot_data) + geom_hline(yintercept = 0) + geom_errorbar(aes(x = humans_eaten, ymin = lwr, ymax = upr, group = method), width = 0.25, position = position_dodge(width = 0.25)) + geom_point(aes(x = humans_eaten, y = eggs, group = method, shape = method), size = 3, fill = "grey", position = position_dodge(width = 0.25)) + scale_shape_discrete(labels = c("Linear predictor scale", "Response scale"), name = "") + scale_x_continuous(breaks = c(0, 1)) + theme_classic()
r .emo("practice")
It is much easier with {spaMM}:
get_intervals(fit_poiss_spaMM_ML, newdata = newdata, intervals = "fixefVar") pdep_effects(fit_poiss_spaMM_ML, focal_var = "humans_eaten", focal_values = 0:1)
Note: here the 2 methods are equivalent because only one predictor is considered in the model fit.
r .emo("info")
If the response variable does not change but models are not nested (e.g. with different family or link function), you can use:
For actual testing, the only method available is the parametric bootstrap!
Note: using the deviance is not recommended because some family return a scaled deviance while other do not r .emo("broken")
mod_poiss <- glm(happy ~ humans_eaten, family = poisson(link = "log"), data = Aliens) #Wrong family used mod_logit <- glm(happy ~ humans_eaten, family = binomial(link = "logit"), data = Aliens) #Default link function mod_probit <- glm(happy ~ humans_eaten, family = binomial(link = "probit"), data = Aliens) #Alternative link mod_cauchit <- glm(happy ~ humans_eaten, family = binomial(link = "cauchit"), data = Aliens) #Alternative link AIC(mod_poiss, mod_logit, mod_probit, mod_cauchit)
plot(model)
is useless for GLM r .emo("info")
par(mfrow = c(2, 2)) plot(fit_poiss)
Note: what we would expect to see here is no longer the same as for LM.
r .emo("info")
rbind( residuals(fit_poiss, type = "deviance")[1:2], residuals(fit_poiss, type = "pearson")[1:2], residuals(fit_poiss, type = "working")[1:2], residuals(fit_poiss, type = "response")[1:2])
Note: we can compute even more types of residuals, such as the partial residuals...
Unfortunately, none of these residuals are particularly useful to check the model assumptions!
r .emo("nerd")
We can deduce useful residuals from the comparison of observations to values expected under the model assumption...
set.seed(1L) simulated_data <- simulate(fit_poiss, 1000) simulated_data_cont <- simulated_data + runif(nrow(simulated_data)*ncol(simulated_data), min = -0.5, max = 0.5) hist(as.numeric(simulated_data_cont[1, ]), main = "distrib of first fitted value", nclass = 30) abline(v = fit_poiss$y[1] + runif(1, min = -0.5, max = 0.5), col = "red", lwd = 2, lty = 2)
Note: here we compare the first observation (+/- a little noise) to 1000 response values simulated under the model fit.
r .emo("nerd")
The idea of is to consider as residuals the quantile of each observation in its expected (simulated) distribution:
plot(ecdf_simulated_1 <- ecdf(unlist(simulated_data_cont[1, ])), main = "cdf of first fitted value") noise <- runif(1, min = -0.5, max = 0.5) observed_quantile_1_cont <- ecdf_simulated_1(fit_poiss$y[1] + noise) segments(x0 = fit_poiss$y[1] + noise, y0 = 0, y1 = observed_quantile_1_cont, col = "red", lwd = 2) arrows(x0 = fit_poiss$y[1] + noise, x1 = -1, y0 = observed_quantile_1_cont, col = "red", lwd = 2)
r .emo("nerd")
We can compute them all at once using the same process:
simulated_residuals <- rep(NA, nrow(fit_poiss$model)) for (i in 1:nrow(fit_poiss$model)) { ecdf_fn <- ecdf(unlist(simulated_data_cont[i, ])) simulated_residuals[i] <- ecdf_fn(fit_poiss$y[i] + runif(1, min = -0.5, max = 0.5)) } simulated_residuals
r .emo("nerd")
By construction, these residuals should be uniformly distributed.
r .emo("practice")
All of this can be done easily with the package {DHARMa}:
library(DHARMa) fit_poiss_simres <- simulateResiduals(fit_poiss) plot(fit_poiss_simres)
r .emo("info")
r .emo("info")
It is difficult, but we may try to:
r .emo("alien")
set.seed(1L) Aliens <- data.frame(humans_eaten = runif(100, min = 0, max = 15)) Aliens$eggs <- rpois(n = 100, lambda = exp(1 + 0.2 * Aliens$humans_eaten - 0.02 * Aliens$humans_eaten^2)) fit_poiss_bad <- glm(eggs ~ humans_eaten, data = Aliens, family = poisson()) ## mispecified model (fit_poiss_good <- glm(eggs ~ poly(humans_eaten, 2, raw = TRUE), data = Aliens, family = poisson())) ## good model
r .emo("practice")
plot(s_bad <- simulateResiduals(fit_poiss_bad, n = 1000)) ## simulated residuals from the bad fit
r .emo("practice")
plot(s_good <- simulateResiduals(fit_poiss_good, n = 1000)) ## simulated residuals from the good fit
r .emo("practice")
car::crPlots(fit_poiss_bad, terms = ~ humans_eaten) ## partial residuals from the bad fit
r .emo("practice")
car::crPlots(fit_poiss_good, terms = ~ poly(humans_eaten, 2, raw = TRUE)) ## partial residuals from the good fit
r .emo("practice")
library(mgcv) ## package to fit GAMs fit_poissGAM <- gam(eggs ~ s(humans_eaten), data = Aliens, family = poisson()) plot(fit_poissGAM, shade = TRUE, residuals = FALSE, shade.col = "red") ## on the scale of the linear predictor!
r .emo("recap")
r .emo("practice")
You can run the Durbin Watson test on the simulated residuals using {DHARMa}:
testTemporalAutocorrelation(s_good, time = fit_poiss_good$fitted.values, plot = FALSE)
r .emo("practice")
You can run the Durbin Watson test on the simulated residuals using {DHARMa}:
testTemporalAutocorrelation(s_good, time = Aliens$humans_eaten, plot = FALSE)
Note: As for LM it is good practice to test for the lack of serial autocorrelation along all your predictors and not just along the fitted values.
r .emo("info")
A GLM assumes a particular relationship between mu and var(mu):
p <- seq(0, 1, 0.1) lambda <- 0:10 theta <- 0:10 v_b <- binomial()$variance(p) v_p <- poisson()$variance(lambda) v_G <- Gamma()$variance(theta) par(mfrow = c(1, 3), las = 2) plot(v_b ~ p, type = "o"); plot(v_p ~ lambda, type = "o"); plot(v_G ~ theta, type = "o")
r .emo("info")
Usual suspects:
r .emo("alien")
set.seed(1L) popA <- data.frame(humans_eaten = runif(50, min = 0, max = 15)) popA$eggs <- rpois(n = 50, lambda = exp(-1 + 0.05 * popA$humans_eaten)) popA$pop <- "A" popB <- data.frame(humans_eaten = runif(50, min = 0, max = 15)) popB$eggs <- rpois(n = 50, lambda = exp(-3 + 0.4 * popB$humans_eaten)) popB$pop <- "B" AliensMix <- rbind(popA, popB) (fit_poissMix <- glm(eggs ~ humans_eaten, family = poisson(), data = AliensMix))
r .emo("info")
Two measures are traditionally used to quantify dispersion:
fit_poissMix$deviance / fit_poissMix$df.residual sum(residuals(fit_poissMix, type = "pearson")^2) / fit_poissMix$df.residual
r .emo("info")
Asymptotic tests based on these measures do not always behave well r .emo("proof")
Note: this is far from the uniform distribution we aim at...
r .emo("practice")
We can again use {DHARMa} for that:
r <- simulateResiduals(fit_poissMix, n = 1000) testDispersion(r) # stat = squared pearson residuals compared to that obtained by simulation
r .emo("practice")
fit_poissMix2 <- glm(eggs ~ pop*humans_eaten, family = poisson(), data = AliensMix) r2 <- simulateResiduals(fit_poissMix2, n = 1000) testDispersion(r2, plot = FALSE) ## you can change options to test for underdispersion
r .emo("info")
Variance = $k \times \mu$
fit_poissMixQ <- glm(eggs ~ humans_eaten, family = quasipoisson(), data = AliensMix) summary(fit_poissMixQ)$coef car::Anova(fit_poissMixQ, test = "F") ## 'F' not 'LR' because dispersion is estimated!
r .emo("practice")
Variance = $k \times \mu$
It kind of works (if the overdispersion is not too strong), but we cannot do everything with these types of fit since they have no likelihood:
confint(fit_poissMixQ) ## works (using deviance) c(logLik(fit_poissMixQ), AIC(fit_poissMixQ)) ## does not work
r .emo("info")
quasibinomial()
, with the same limit (no likelihood)r .emo("info")
For count data, you have 3 options:
Note: only the Conway-Maxwell-Poisson can handle underdispersion.
r .emo("practice")
fit_poissMix_spaMM <- fitme(eggs ~ humans_eaten, family = poisson(), data = AliensMix) fit_poissMixNB_spaMM <- fitme(eggs ~ humans_eaten, family = spaMM::negbin(), data = AliensMix) ## namespace needed! summary(fit_poissMixNB_spaMM) c(AIC(fit_poissMix_spaMM), AIC(fit_poissMixNB_spaMM))
Note: the negative binomial fit is much better than the Poisson one.
r .emo("info")
It occurs for when there are more zeros than expected under the probability distribution of the data generation process.
It can occur when the response results from a 2 (or more) steps process
Examples:
r .emo("info")
Fit an hurdle model
Fit a zero-inflation model
Note: if you are not sure where the zeros come from, try both!
r .emo("alien")
set.seed(1L) AliensZ <- simulate_Aliens_GLM(N = 1000) AliensZ$eggs <- AliensZ$eggs * AliensZ$happy ## unhappy Aliens loose their eggs :-( barplot(table(AliensZ$eggs))
r .emo("practice")
fit_Zpoiss <- glm(eggs ~ humans_eaten, data = AliensZ, family = poisson()) r <- simulateResiduals(fit_Zpoiss, n = 1000) testZeroInflation(r, plot = FALSE)
Note: we reject the null hypothesis which is that the observed number of zeros is compatible with the one expected if those data would have be produced by a Poisson model.
mean(sum(AliensZ$eggs == 0) / sum(dpois(0, fitted(fit_Zpoiss)))) ## = ratioObsSim
r .emo("practice")
testZeroInflation(r, plot = TRUE) ## plot the number of 0s simulated under the model and observed
r .emo("practice")
You can fit hurdle models with glmmTMB::glmmTMB()
:
library(glmmTMB) fit_Zhurd1 <- glmmTMB(eggs ~ humans_eaten, ziformula = ~humans_eaten, family = truncated_poisson(link = "log"), data = AliensZ) fit_Zhurd2 <- glmmTMB(eggs ~ humans_eaten, ziformula = ~1, family = truncated_poisson(link = "log"), data = AliensZ) lmtest::lrtest(fit_Zhurd1, fit_Zhurd2)
Notes:
humans_eaten
influenced the production of all 0s (since they all come from the binary process).r .emo("practice")
You can also fit zero-inflation models with glmmTMB::glmmTMB()
:
fit_Zzi1 <- glmmTMB(eggs ~ humans_eaten, ziformula = ~humans_eaten, family = poisson(link = "log"), data = AliensZ) fit_Zzi2 <- glmmTMB(eggs ~ humans_eaten, ziformula = ~1, family = poisson(link = "log"), data = AliensZ) lmtest::lrtest(fit_Zzi1, fit_Zzi2)
Note: here we tested whether humans_eaten
influenced the production of some of the 0s (since they come both from the binary process and the Poisson process).
r .emo("practice")
AIC(fit_Zpoiss, fit_Zhurd1, fit_Zzi1)
Let's also compare the model parameter estimates:
tab <- rbind(Poisson = c(fit_Zpoiss$coefficients, NA, NA), Hurdle = unlist(fixef(fit_Zhurd1)), ## NOTE: the binary part predicts the number of zeros ZeroInfl = unlist(fixef(fit_Zzi1)), Truth = c(attr(AliensZ, "param.eta")$eggs, -1 * attr(AliensZ, "param.eta")$happy)) colnames(tab) <- c("Int.count", "Slope.count", "Int.zeros", "Slope.zeros") tab
Note: parameter estimates confirm that both the hurdle and the zero-inflation fits are (here) accurate.
r .emo("info")
Separation occurs when a level (or combination of levels) for categorical predictor, or a particular threshold along a continuous predictor, predicts the outcomes perfectly.
From Wikipedia:
For example, if the predictor X is continuous, and the outcome y = 1 for all observed x > 2. If the outcome values are perfectly determined by the predictor (e.g., y = 0 when x ≤ 2) then the condition "complete separation" is said to occur. If instead there is some overlap (e.g., y = 0 when x < 2, but y has observed values of 0 and 1 when x = 2) then "quasi-complete separation" occurs. A 2 × 2 table with an empty cell is an example of quasi-complete separation.
r .emo("proof")
set.seed(1L) n <- 50 test <- data.frame(happy = rbinom(2*n, prob = c(rep(0, n), rep(0.75, n)), size = 1), sp = factor(c(rep("sp1", n), rep("sp2", n)))) table(test$happy, test$sp)
fit_separated <- glm(happy ~ sp, data = test, family = binomial()) summary(fit_separated)$coefficients
Note: despite the very clear effect, the test does not detect it.
r .emo("practice")
stats::glm()
fitslibrary(safeBinaryRegression) ## overload the glm function fit_separated <- glm(happy ~ sp, data = test, family = binomial()) ## don't mind warnings here detach(package:safeBinaryRegression) ## stop masking original glm()
r .emo("practice")
spaMM::fitme()
fitsfit_separated_spaMM <- fitme(happy ~ sp, data = test, family = binomial())
Note: you may need to install an additional package, but {spaMM} will inform you about it.
r .emo("practice")
test$happy[1] <- 1 table(test$happy, test$sp) fit_separated2 <- glm(happy ~ sp, data = test, family = binomial()) summary(fit_separated2)$coefficients
Note: this little tweak of the data is sufficient to recover significance.
r .emo("practice")
A better solution for the binary case (as well as for binomial in general) is to use a specific fitting method:
test$happy[1] <- 0 ## restore the original data library(brglm2) fit_separated3 <- glm(happy ~ sp, data = test, family = binomial(), method = "brglmFit") summary(fit_separated3)$coefficients
r .emo("goal")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.