R/check_zeroinflation.R

Defines functions print.check_zi check_zeroinflation.performance_simres check_zeroinflation.default check_zeroinflation

Documented in check_zeroinflation check_zeroinflation.default check_zeroinflation.performance_simres

#' @title Check for zero-inflation in count models
#' @name check_zeroinflation
#'
#' @description `check_zeroinflation()` checks whether count models are
#' over- or underfitting zeros in the outcome.
#'
#' @param x Fitted model of class `merMod`, `glmmTMB`, `glm`, or `glm.nb`
#' (package **MASS**).
#' @param tolerance The tolerance for the ratio of observed and predicted
#' zeros to considered as over- or underfitting zeros. A ratio
#' between 1 +/- `tolerance` is considered as OK, while a ratio
#' beyond or below this threshold would indicate over- or underfitting.
#' @param alternative A character string specifying the alternative hypothesis.
#' @param ... Arguments passed down to [`simulate_residuals()`]. This only applies
#' for models with zero-inflation component, or for models of class `glmmTMB`
#' from `nbinom1` or `nbinom2` family.
#'
#' @return A list with information about the amount of predicted and observed
#'  zeros in the outcome, as well as the ratio between these two values.
#'
#' @details If the amount of observed zeros is larger than the amount of
#' predicted zeros, the model is underfitting zeros, which indicates a
#' zero-inflation in the data. In such cases, it is recommended to use
#' negative binomial or zero-inflated models.
#'
#' In case of negative binomial models, models with zero-inflation component,
#' or hurdle models, the results from `check_zeroinflation()` are based on
#' [`simulate_residuals()`], i.e. `check_zeroinflation(simulate_residuals(model))`
#' is internally called if necessary.
#'
#' @section Tests based on simulated residuals:
#' For certain models, resp. model from certain families, tests are based on
#' simulated residuals (see [`simulate_residuals()`]). These are usually more
#' accurate for testing such models than the traditionally used Pearson residuals.
#' However, when simulating from more complex models, such as mixed models or
#' models with zero-inflation, there are several important considerations.
#' Arguments specified in `...` are passed to [`simulate_residuals()`], which
#' relies on [`DHARMa::simulateResiduals()`] (and therefore, arguments in `...`
#' are passed further down to _DHARMa_). The defaults in DHARMa are set on the
#' most conservative option that works for all models. However, in many cases,
#' the help advises to use different settings in particular situations or for
#' particular models. It is recommended to read the 'Details' in
#' `?DHARMa::simulateResiduals` closely to understand the implications of the
#' simulation process and which arguments should be modified to get the most
#' accurate results.
#'
#' @family functions to check model assumptions and and assess model quality
#'
#' @examplesIf require("glmmTMB") && require("DHARMa")
#' data(Salamanders, package = "glmmTMB")
#' m <- glm(count ~ spp + mined, family = poisson, data = Salamanders)
#' check_zeroinflation(m)
#'
#' # for models with zero-inflation component, it's better to carry out
#' # the check for zero-inflation using simulated residuals
#' m <- glmmTMB::glmmTMB(
#'   count ~ spp + mined,
#'   ziformula = ~ mined + spp,
#'   family = poisson,
#'   data = Salamanders
#' )
#' res <- simulate_residuals(m)
#' check_zeroinflation(res)
#' @export
check_zeroinflation <- function(x, ...) {
  UseMethod("check_zeroinflation")
}


#' @rdname check_zeroinflation
#' @export
check_zeroinflation.default <- function(x, tolerance = 0.05, ...) {
  # check if we have poisson
  model_info <- insight::model_info(x)
  if (!model_info$is_count) {
    insight::format_error("Model must be from Poisson-family.")
  }

  # get actual zero of response
  obs.zero <- sum(insight::get_response(x, verbose = FALSE) == 0L)

  if (obs.zero == 0) {
    insight::print_color("Model has no observed zeros in the response variable.\n", "red")
    return(NULL)
  }

  # model classes not supported in DHARMa
  not_supported <- c("fixest", "glmx")

  # for models with zero-inflation component or negative binomial families,
  # we use simulate_residuals()
  if (!inherits(x, not_supported) && (model_info$is_zero_inflated || model_info$is_negbin || model_info$family == "genpois")) { # nolint
    if (missing(tolerance)) {
      tolerance <- 0.1
    }
    return(check_zeroinflation(simulate_residuals(x, ...), tolerance = tolerance, ...))
  }

  # get predictions of outcome
  mu <- stats::fitted(x)
  # get predicted zero-counts
  pred.zero <- round(sum(stats::dpois(x = 0, lambda = mu)))

  # proportion
  structure(
    class = "check_zi",
    list(
      predicted.zeros = pred.zero,
      observed.zeros = obs.zero,
      ratio = pred.zero / obs.zero,
      tolerance = tolerance
    )
  )
}


#' @rdname check_zeroinflation
#' @export
check_zeroinflation.performance_simres <- function(x,
                                                   tolerance = 0.1,
                                                   alternative = c("two.sided", "less", "greater"),
                                                   ...) {
  alternative <- match.arg(alternative)

  # compute test results
  result <- .simres_statistics(x, statistic_fun = function(i) sum(i == 0), alternative = alternative)

  structure(
    class = "check_zi",
    list(
      predicted.zeros = round(mean(result$simulated)),
      observed.zeros = result$observed,
      ratio = mean(result$simulated) / result$observed,
      tolerance = tolerance,
      p.value = result$p
    )
  )
}

#' @export
check_zeroinflation.DHARMa <- check_zeroinflation.performance_simres


# methods ------------------

#' @export
print.check_zi <- function(x, ...) {
  insight::print_color("# Check for zero-inflation\n\n", "blue")
  cat(sprintf("   Observed zeros: %i\n", x$observed.zeros))
  cat(sprintf("  Predicted zeros: %i\n", x$predicted.zeros))
  cat(sprintf("            Ratio: %.2f\n\n", x$ratio))

  lower <- 1 - x$tolerance
  upper <- 1 + x$tolerance

  if (is.null(x$p.value)) {
    p_string <- ""
  } else {
    p_string <- paste0(" (", insight::format_p(x$p.value), ")")
  }

  if (x$ratio < lower) {
    message("Model is underfitting zeros (probable zero-inflation)", p_string, ".")
  } else if (x$ratio > upper) {
    message("Model is overfitting zeros", p_string, ".")
  } else {
    insight::format_alert(paste0(
      "Model seems ok, ratio of observed and predicted zeros is within the tolerance range",
      p_string,
      "."
    ))
  }

  invisible(x)
}

Try the performance package in your browser

Any scripts or data that you put into this service are public.

performance documentation built on June 22, 2024, 10:55 a.m.