knitr::opts_chunk$set(
  dpi = 300,
  out.width = "100%",
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  warning = FALSE,
  message = FALSE
)

pkgs <- c("bayestestR", "ggplot2", "glmmTMB", "performance", "lme4", "ggdag", "dagitty")
successfully_loaded <- vapply(pkgs, requireNamespace, FUN.VALUE = logical(1L), quietly = TRUE)
can_evaluate <- all(successfully_loaded)

if (can_evaluate) {
  knitr::opts_chunk$set(eval = TRUE)
  vapply(pkgs, require, FUN.VALUE = logical(1L), quietly = TRUE, character.only = TRUE)
} else {
  knitr::opts_chunk$set(eval = FALSE)
}
This vignette can be referred to by citing the package:
citation("see")
A crucial aspect when building regression models is to evaluate the quality of model fit. It is important to investigate how well models fit the data and which fit indices to report. Functions to create diagnostic plots or to compute fit measures do exist, but they are mostly spread over different packages, and there is no unique and consistent approach to assess model quality for different kinds of models.
The primary goal of the performance package in the easystats ecosystem is to fill this gap and to provide utilities for computing indices of model quality and goodness of fit. These include measures like r-squared (R2), root mean squared error (RMSE) or the intraclass correlation coefficient (ICC), but also functions to check (mixed) models for overdispersion, zero-inflation, convergence or singularity.
For more, see: https://easystats.github.io/performance/
Let's load the needed libraries first:
library(performance)
library(lme4)
library(see)
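Before turning to the visual checks, a minimal sketch of how the fit indices mentioned above can be computed in one call, using model_performance() on a simple linear model (an illustrative example; the exact set of indices returned depends on the model class):

# fit a simple linear model
m <- lm(mpg ~ wt + hp, data = mtcars)

# one table with indices such as AIC, R2, adjusted R2 and RMSE
model_performance(m)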
(see the binned_residuals() documentation)
An example where the model is not a good fit:
model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial")
result <- binned_residuals(model)
result
plot(result)
An example where the model is a good fit:
model <- suppressWarnings(glm(am ~ mpg + vs + cyl, data = mtcars, family = "binomial"))
result <- binned_residuals(model)
result
plot(result)
(see the check_collinearity() documentation)
m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars)
result <- check_collinearity(m)
result
plot(result)
library(glmmTMB)
data(Salamanders)

# create a highly correlated pseudo-variable
set.seed(1)
Salamanders$cover2 <- Salamanders$cover * runif(n = nrow(Salamanders), min = 0.7, max = 1.5)

# fit a mixed model with zero-inflation
model <- glmmTMB(
  count ~ spp + mined + cover + cover2 + (1 | site),
  ziformula = ~ spp + mined,
  family = truncated_poisson,
  data = Salamanders
)

result <- check_collinearity(model)
result
plot(result)
(see the check_outliers() documentation)
# select mpg, disp and hp (continuous variables)
mt1 <- mtcars[, c(1, 3, 4)]

# create some fake outliers and attach them to the main data
mt2 <- rbind(mt1, data.frame(mpg = c(37, 40), disp = c(300, 400), hp = c(110, 120)))

# fit model with outliers
model <- lm(disp ~ mpg + hp, data = mt2)
result <- check_outliers(model)
result
There are two visualization options, dots and bars:
plot(result, type = "dots")
plot(result, type = "bars")
(see the check_normality() documentation)
model2 <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars)
result2 <- check_normality(model2)
plot(result2, type = "density", data = model2)
plot(result2, type = "density")
plot(result2, type = "qq", data = model2)
plot(result2, type = "qq")
plot(result2, type = "pp", data = model2)
plot(result2, type = "pp")
(see the check_normality() documentation)
model <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
result <- check_normality(model, effects = "random")
plot(result)
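Since this is a mixed model, it is also a convenient place to compute the intraclass correlation coefficient mentioned in the introduction; a minimal sketch using performance's icc():

# proportion of the total variance attributable to the grouping factor (Subject)
icc(model)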
(see the check_heteroscedasticity() documentation)
model <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars)
result <- check_heteroscedasticity(model)
plot(result)
(see the check_homogeneity() documentation)
model <- lm(len ~ supp + dose, data = ToothGrowth)
result <- check_homogeneity(model)
plot(result)
(see the check_predictions() documentation)
model <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars)
check_predictions(model)
To check whether the model properly captures the variation in the data, use check_range = TRUE:
model <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars)
check_predictions(model, check_range = TRUE)
(see the check_model() documentation)
The composition of plots when checking model assumptions depends on the type of the input model. For logistic regression models, for example, a binned residuals plot is used, while for linear models a plot of homogeneity of variance is shown instead. Models for count data include plots to inspect overdispersion.
model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial")
check_model(model)
out <- check_model(model)
plot(out, type = "discrete_both")
model <- glm(
  count ~ spp + mined + cover,
  family = poisson(),
  data = Salamanders
)
check_model(model)
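The overdispersion diagnostic shown for this count model also has a standalone counterpart, and a related check for zero-inflation exists as well; a minimal sketch applying both performance functions to the model just fitted:

# test for overdispersion in the Poisson model
check_overdispersion(model)

# compare the observed and predicted numbers of zeros
check_zeroinflation(model)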
model <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
check_model(model)
check_model(model, panel = FALSE)
Note that not all checks supported in performance will be reported in this unified visual check. For example, for linear models one needs to check the assumption that errors are not autocorrelated, but this check is not part of the visual check.
check_autocorrelation(lm(formula = wt ~ mpg, data = mtcars))
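Other standalone checks work the same way. For instance, the singularity check mentioned in the introduction can be run directly on the mixed model used earlier; a minimal sketch:

# refit the mixed model from above and test for a singular random-effects fit
model <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
check_singularity(model) # TRUE would indicate a singular fit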
(see the compare_performance() documentation)
compare_performance() computes indices of model performance for several models at once and hence allows comparing indices across models. Its plot() method creates a "spiderweb" plot, where the different indices are normalized and larger values indicate better model performance. Hence, points closer to the center indicate worse fit indices.
data(iris)
lm1 <- lm(Sepal.Length ~ Species, data = iris)
lm2 <- lm(Sepal.Length ~ Species + Petal.Length, data = iris)
lm3 <- lm(Sepal.Length ~ Species * Sepal.Width, data = iris)
lm4 <- lm(Sepal.Length ~ Species * Sepal.Width + Petal.Length + Petal.Width, data = iris)

result <- compare_performance(lm1, lm2, lm3, lm4)
result
plot(result)
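Beyond this visual comparison, the models can also be compared with formal tests; a sketch using test_performance() from performance (which tests are reported depends on whether the models are nested):

# statistical comparison of the four models fitted above
test_performance(lm1, lm2, lm3, lm4)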
(see the check_distribution() documentation)
model <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
result <- check_distribution(model)
result
plot(result)
vec <- bayestestR::distribution_poisson(n = 500, lambda = 2.5)
result <- check_distribution(vec)
result
plot(result)
(see the check_dag() documentation)
dag <- check_dag(
  y ~ x + b + c,
  x ~ b,
  outcome = "y",
  exposure = "x",
  coords = list(
    x = c(y = 5, x = 4, b = 3, c = 3),
    y = c(y = 3, x = 3, b = 2, c = 4)
  )
)

# plot comparison between current and required model
plot(dag)

# plot current model specification
plot(dag, which = "current")

# plot required model specification
plot(dag, which = "required")
(see the performance_roc() documentation)
data(iris)
set.seed(123)
iris$y <- rbinom(nrow(iris), size = 1, 0.3)

folds <- sample(nrow(iris), size = nrow(iris) / 8, replace = FALSE)
test_data <- iris[folds, ]
train_data <- iris[-folds, ]

model <- glm(y ~ Sepal.Length + Sepal.Width, data = train_data, family = "binomial")
result <- performance_roc(model, new_data = test_data)
result
plot(result)
You can also compare ROC curves for different models.
set.seed(123)
library(bayestestR)

# create two models
m1 <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial")
m2 <- glm(vs ~ wt + am + mpg, data = mtcars, family = "binomial")

# compare their performance using ROC curves
plot(performance_roc(m1, m2))
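To attach a single number to this comparison, one option is performance_accuracy(), which estimates each model's predictive accuracy via cross-validation; for logistic regression models, this measure should correspond to the area under the ROC curve (an assumption worth verifying in the function's documentation):

set.seed(123)
# cross-validated predictive accuracy for each model
performance_accuracy(m1)
performance_accuracy(m2)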