R/log_scores.R

Defines functions calc_kde_scores density_scores.gam density_scores.lm density_scores.kde density_scores.default density_scores

Documented in density_scores density_scores.default density_scores.gam density_scores.kde density_scores.lm

#' @title Density scores
#' @description Compute density scores or leave-one-out density scores from a
#' model or a kernel density estimate of a data set.
#' The density scores are defined as minus the log of the conditional density,
#' or kernel density estimate, at each observation.
#' The leave-one-out density scores (or LOO density scores) are obtained by
#' estimating the conditional density or kernel density estimate using all
#' other observations.
#' @details If the first argument is a numerical vector or matrix, then
#' a kernel density estimate is computed, using a Gaussian kernel,
#' with default bandwidth given by a robust normal reference rule.
#' Otherwise the model is used to compute the conditional
#' density function at each observation, from which the density scores (or
#' possibly the LOO density scores) are obtained.
#' @param object A model object or a numerical data set.
#' @param loo Should leave-one-out density scores be computed?
#' @author Rob J Hyndman
#' @return A numerical vector containing either the density scores, or the LOO
#' density scores.
#' @seealso
#'  \code{\link{kde_bandwidth}}
#'  \code{\link[ks]{kde}}
#' @export

density_scores <- function(object, loo = FALSE, ...) {
  UseMethod("density_scores")
}

#' @rdname density_scores
#' @param h Bandwidth for univariate kernel density estimate. Default is \code{\link{kde_bandwidth}}.
#' @param H Bandwidth for multivariate kernel density estimate. Default is \code{\link{kde_bandwidth}}.
#' @param ... Other arguments are passed to \code{\link[ks]{kde}}.
#' @examples
#' # Density scores computed from bivariate data set
#' of <- oldfaithful |>
#'   filter(duration < 7000, waiting < 7000) |>
#'   mutate(
#'     fscores = density_scores(cbind(duration, waiting)),
#'     loo_fscores = density_scores(cbind(duration, waiting), loo = TRUE),
#'     lookout_prob = lookout(density_scores = fscores, loo_scores = loo_fscores)
#'   )
#' of |>
#'   ggplot(aes(x = duration, y = waiting, color = lookout_prob < 0.01)) +
#'   geom_point()
#' @export
density_scores.default <- function(
    object, loo = FALSE,
    h = kde_bandwidth(object, method = "double"),
    H = kde_bandwidth(object, method = "double"), ...) {
  object <- as.matrix(object)
  tmp <- calc_kde_scores(object, h, H, ...)
  if (loo) {
    return(tmp$loo_scores)
  } else {
    return(tmp$scores)
  }
}

#' @rdname density_scores
#' @param ... Other arguments are ignored.
#' @examples
#' # Density scores computed from bivariate KDE
#' f_kde <- kde(of[, 2:3], H = kde_bandwidth(of[, 2:3]))
#' of |>
#'   mutate(
#'     fscores = density_scores(f_kde),
#'     loo_fscores = density_scores(f_kde, loo = TRUE)
#'   )
#' @export
density_scores.kde <- function(object, loo = FALSE, ...) {
  n <- NROW(object$x)
  d <- NCOL(object$x)
  # kde on a grid, but we need it at observations, so we will re-estimate
  # interpolation is probably quicker, but less accurate and
  # this works ok.
  output <- calc_kde_scores(object$x, object$h, object$H, ...)
  if (loo) {
    return(output$loo_scores)
  } else {
    return(output$scores)
  }
}

#' @rdname density_scores
#' @examples
#' # Density scores computed from linear model
#' of <- oldfaithful |>
#'   filter(duration < 7200, waiting < 7200)
#' lm_of <- lm(waiting ~ duration, data = of)
#' of |>
#'   mutate(
#'     fscore = density_scores(lm_of),
#'     loo_fscore = density_scores(lm_of, loo = TRUE),
#'     lookout_prob = lookout(density_scores = fscore, loo_scores = loo_fscore)
#'   ) |>
#'   ggplot(aes(x = duration, y = waiting, color = lookout_prob < 0.02)) +
#'   geom_point()
#' @export
density_scores.lm <- function(object, loo = FALSE, ...) {
  e <- stats::residuals(object, type = "response")
  h <- stats::hatvalues(object)
  sigma2 <- sum(e^2, na.rm = TRUE) / object$df.residual
  if (loo) {
    resdf <- object$df.residual
    sigma2 <- (sigma2 * resdf - e^2 / (1 - h)) / (resdf - 1)
  }
  r2 <- e^2 / ((1 - h) * sigma2)
  return(0.5 * (log(2 * pi) + r2))
}


#' @rdname density_scores
#' @examples
#' # Density scores computed from GAM
#' of <- oldfaithful |>
#'   filter(duration > 1, duration < 7200, waiting < 7200)
#' gam_of <- mgcv::gam(waiting ~ s(duration), data = of)
#' of |>
#'   mutate(
#'     fscore = density_scores(gam_of),
#'     lookout_prob = lookout(density_scores = fscore)
#'   ) |>
#'   filter(lookout_prob < 0.02)
#' @importFrom stats approx dbinom density dnorm dpois na.omit
#' @export
density_scores.gam <- function(object, loo = FALSE, ...) {
  if (loo) {
    warning("Leave-one-out log scores unavailable for GAM models. Returning log scores.")
  }
  fit_aug <- broom::augment(object, type.predict = "response")
  if (object$family$family == "gaussian") {
    std.resid <- c(scale(fit_aug$.resid / fit_aug$.se.fit))
    density_scores <- -dnorm(std.resid, log = TRUE)
  } else if (object$family$family == "binomial") {
    density_scores <- -dbinom(
      x = object$y * object$prior.weights,
      size = object$prior.weights, prob = fit_aug$.fitted, log = TRUE
    )
  } else if (object$family$family == "poisson") {
    density_scores <- -dpois(object$y, lambda = fit_aug$.fitted, log = TRUE)
  } else {
    stop("Unsupported family")
  }
  return(density_scores)
}

# Compute value of density at each observation using kde
calc_kde_scores <- function(
    y,
    h = kde_bandwidth(y, method = "double"),
    H = kde_bandwidth(y, method = "double"), ...) {
  n <- NROW(y)
  d <- NCOL(y)
  # Estimate density at each observation
  if (d == 1L) {
    gridsize <- 10001
    K0 <- 1 / (h * sqrt(2 * pi))
    fi <- ks::kde(y,
      h = h, gridsize = gridsize, binned = n > 2000,
      eval.points = y, compute.cont = FALSE, ...
    )$estimate
  } else {
    gridsize <- 101
    K0 <- det(H)^(-1 / 2) * (2 * pi)^(-d / 2)
    fi <- ks::kde(y,
      H = H, gridsize = gridsize, binned = n > 2000,
      eval.points = y, compute.cont = FALSE, ...
    )$estimate
  }
  loo_scores <- -log(pmax(0, (n * fi - K0) / (n - 1)))
  scores <- -log(pmax(0, fi))
  return(list(scores = scores, loo_scores = loo_scores))
}

utils::globalVariables(c(
  ".resid", ".se.fit", ".std.resid", ".resid", ".sigma", ".hat",
  "studentized_residuals"
))

Try the weird package in your browser

Any scripts or data that you put into this service are public.

weird documentation built on May 29, 2024, 1:24 a.m.