R/weighted_hist.R

Defines functions weighted_iqr weighted_var weighted_hist

# Weighted histogram
#
# Author: mjskay
###############################################################################


#' @importFrom rlang as_label enexpr get_expr
weighted_hist = function(
  x, weights = NULL, breaks = "Sturges", align = "none"
) {
  x_label = as_label(enexpr(x))
  weights_label = as_label(enexpr(weights))
  label = if (is.null(weights)) {
    x_label
  } else {
    paste0("[", x_label, ", ", weights_label, "]")
  }

  if (length(x) < 1) cli_abort("{.fun ggdist::density_histogram} requires {.code length(x) >= 1}.")

  weights = weights %||% rep(1, length(x))

  # figure out breaks
  if (is.character(breaks)) {
    breaks = match_function(breaks, prefix = "breaks_")
  }
  if (is.function(breaks)) {
    breaks = breaks(x, weights = weights)
  }
  if (length(breaks) == 1) {
    if (length(x) == 1) {
      breaks = c(x - 0.5, x + 0.5)
    } else {
      breaks = seq.int(min(x), max(x), length.out = breaks)
    }
    bin_width = diff(breaks)
    equidist = TRUE
  } else {
    breaks = sort(unique(breaks))
    bin_width = diff(breaks)
    equidist = diff(range(bin_width)) < 1e-7 * mean(bin_width)
  }

  # apply alignment if bins are equidistant
  if (equidist) {
    if (is.character(align)) {
      align = match_function(align, prefix = "align_")
    }
    if (is.function(align)) {
      align = align(breaks)
    }
    if (align < 0 || align > bin_width[[1]]) {
      cli_abort(c(
        "{.arg align} must be between 0 and the bin width",
        "i" = "See the {.arg align} argument to {.fun ggdist::density_histogram}."
      ))
    }

    # we check for align != 0 even though in theory we could apply a 0 alignment
    # below and the result would be correct. We do this because then if someone
    # manually specifies the breaks and no alignment, exactly those breaks are used.
    if (align != 0) {
      breaks = breaks - align
      max_break = breaks[length(breaks)]

      if (max_break < max(x)) {
        breaks = c(breaks, max_break + bin_width[[1]])
        bin_width = c(bin_width, bin_width[[1]])
      }
      if (length(breaks) > 2 && breaks[[2]] <= min(x)) {
        breaks = breaks[-1]
        bin_width = bin_width[-1]
      }
    }
  }

  # check for invalid binning
  if (min(x) < breaks[1] || max(x) > breaks[length(breaks)]) {
    cli_abort("The {.arg breaks} argument to {.fun ggdist::density_histogram} must cover all values of {.arg x}")
  }

  # bin x values
  bin = findInterval(x, breaks, rightmost.closed = TRUE, left.open = TRUE)

  # sum up weights in each bin
  counts = rep(0, length(breaks) - 1)
  used_bins = unique(bin)
  counts[used_bins] = tapply(weights, factor(bin, used_bins), sum)

  structure(
    list(
      breaks = breaks,
      counts = counts,
      density = counts / bin_width / sum(weights),
      mids = (breaks[-length(breaks)] + breaks[-1])/2,
      xname = label,
      equidist = equidist
    ),
    class = "histogram"
  )
}


# breaks algorithms -------------------------------------------------------

#' Break (bin) selection algorithms for histograms
#'
#' Methods for determining breaks (bins) in histograms, as used in the `breaks`
#' argument to [density_histogram()].
#' Supports [automatic partial function application][automatic-partial-functions].
#'
#' @param x A numeric vector giving a sample.
#' @param weights A numeric vector of `length(x)` giving sample weights.
#' @param width For [breaks_fixed()], the desired bin width.
#' @param digits Number of significant digits to keep when rounding in the Freedman-Diaconis
#'   algorithm ([breaks_FD()]). For an explanation of this parameter, see the documentation
#'   of the corresponding parameter in [grDevices::nclass.FD()].
#'
#' @details
#' These functions take a sample and its weights and return a valuable suitable for
#' the `breaks` argument to [density_histogram()] that will determine the histogram
#' breaks.
#'
#'  - [breaks_fixed()] allows you to manually specify a fixed bin width.
#'  - [breaks_Sturges()], [breaks_Scott()], and [breaks_FD()] implement weighted
#'    versions of the corresponding base functions. See [nclass.Sturges()],
#'    [nclass.scott()], and [nclass.FD()].
#' @returns Either a single number (giving the number of bins) or a vector
#' giving the edges between bins.
#' @seealso [density_histogram()], [align]
#' @examples
#' library(ggplot2)
#'
#' set.seed(1234)
#' x = rnorm(200, 1, 2)
#'
#' # Let's compare the different break-selection algorithms on this data:
#' ggplot(data.frame(x), aes(x)) +
#'   stat_slab(
#'     aes(y = "breaks_fixed(width = 0.5)"),
#'     density = "histogram",
#'     breaks = breaks_fixed(width = 0.5),
#'     outline_bars = TRUE,
#'     color = "black",
#'   ) +
#'   stat_slab(
#'     aes(y = "breaks_Sturges()\nor 'Sturges'"),
#'     density = "histogram",
#'     breaks = "Sturges",
#'     outline_bars = TRUE,
#'     color = "black",
#'   ) +
#'   stat_slab(
#'     aes(y = "breaks_Scott()\nor 'Scott'"),
#'     density = "histogram",
#'     breaks = "Scott",
#'     outline_bars = TRUE,
#'     color = "black",
#'   ) +
#'   stat_slab(
#'     aes(y = "breaks_FD()\nor 'FD'"),
#'     density = "histogram",
#'     breaks = "FD",
#'     outline_bars = TRUE,
#'     color = "black",
#'   ) +
#'   geom_point(aes(y = 0.7), alpha = 0.5) +
#'   labs(
#'     subtitle = "ggdist::stat_slab(density = 'histogram', ...)",
#'     y = "breaks =",
#'     x = NULL
#'   )
#' @name breaks
#' @export
breaks_fixed = auto_partial(name = "breaks_fixed", function(
  x, weights = NULL, width = 1
) {
  if (length(x) == 1) return(c(x - width/2, x + width/2))

  # determine amount we need to expand range by to make it a multiple of width
  x_range = range(x)
  expand = ((-diff(x_range)) %% width) / 2

  seq.int(x_range[[1]] - expand, x_range[[2]] + expand, by = width)
})

#' @rdname breaks
#' @export
breaks_Sturges = auto_partial(name = "breaks_Sturges", function(
  x, weights = NULL
) {
  weights = weights %||% rep(1, length(x))
  n = max(length(x), sum(weights))
  ceiling(log2(n) + 1)
})

#' @rdname breaks
#' @export
breaks_Scott = auto_partial(name = "breaks_Scott", function(
  x, weights = NULL
) {
  weights = weights %||% rep(1, length(x))
  n = max(length(x), sum(weights))
  h = 3.5 * sqrt(weighted_var(x, weights)) * n^(-1/3)
  if (h > 0) {
    max(1, ceiling(diff(range(x))/h))
  } else {
    1L
  }
})

#' @rdname breaks
#' @export
breaks_FD = auto_partial(name = "breaks_FD", function(
  x, weights = NULL, digits = 5
) {
  weights = weights %||% rep(1, length(x))
  h = 2 * weighted_iqr(.x <- signif(x, digits = digits), weights)

  if (h == 0) {
    .x_order = order(.x)
    .x = .x[.x_order]
    .weights = weights[.x_order]
    al = 1/4
    al_min = 1/512

    quantile_fun = weighted_quantile_fun(.x, weights = .weights, n = "sum")
    while (h == 0 && (al <- al/2) >= al_min) {
      h = diff(quantile_fun(c(al, 1 - al))) / (1 - 2 * al)
    }
  }

  if (h == 0) {
    h = 3.5 * sqrt(weighted_var(x, weights))
  }

  n = max(length(x), sum(weights))
  if (h > 0) {
    ceiling(diff(range(x))/h * n^(1/3))
  } else {
    1L
  }
})


# alignment algorithms ----------------------------------------------------

#' Break (bin) alignment methods
#'
#' Methods for aligning breaks (bins) in histograms, as used in the `align`
#' argument to [density_histogram()].
#' Supports [automatic partial function application][automatic-partial-functions].
#'
#' @param breaks A sorted vector of breaks (bin edges).
#' @param at A scalar numeric giving an alignment point.
#'  - For [align_boundary()]: align breaks so that a bin edge lines up with `at`.
#'  - For [align_center()]: align breaks so that the center of a bin lines up with `at`.
#'
#' @details
#' These functions take a sorted vector of equally-spaced `breaks` giving
#' bin edges and return a numeric offset which, if subtracted from `breaks`,
#' will align them as desired:
#'
#'  - [align_none()] performs no alignment (it always returns `0`).
#'  - [align_boundary()] ensures that a bin edge lines up with `at`.
#'  - [align_center()] ensures that a bin center lines up with `at.`
#'
#' For [align_boundary()] (respectively [align_center()]), if no bin edge (or center) in the
#' range of `breaks` would line up with `at`, it ensures that `at` is an integer
#' multiple of the bin width away from a bin edge (or center).
#'
#' @returns A scalar numeric returning an offset to be subtracted from `breaks`.
#' @seealso [density_histogram()], [breaks]
#' @examples
#' library(ggplot2)
#'
#' set.seed(1234)
#' x = rnorm(200, 1, 2)
#'
#' # If we manually specify a bin width using breaks_fixed(), the default
#' # alignment (align_none()) will not align bin edges to any "pretty" numbers.
#' # Here is a comparison of the three alignment methods on such a histogram:
#' ggplot(data.frame(x), aes(x)) +
#'   stat_slab(
#'     aes(y = "align_none()\nor 'none'"),
#'     density = "histogram",
#'     breaks = breaks_fixed(width = 1),
#'     outline_bars = TRUE,
#'     # no need to specify align; align_none() is the default
#'     color = "black",
#'   ) +
#'   stat_slab(
#'     aes(y = "align_center(at = 0)\nor 'center'"),
#'     density = "histogram",
#'     breaks = breaks_fixed(width = 1),
#'     align = align_center(at = 0),   # or align = "center"
#'     outline_bars = TRUE,
#'     color = "black",
#'   ) +
#'   stat_slab(
#'     aes(y = "align_boundary(at = 0)\nor 'boundary'"),
#'     density = "histogram",
#'     breaks = breaks_fixed(width = 1),
#'     align = align_boundary(at = 0), # or align = "boundary"
#'     outline_bars = TRUE,
#'     color = "black",
#'   ) +
#'   geom_point(aes(y = 0.7), alpha = 0.5) +
#'   labs(
#'     subtitle = "ggdist::stat_slab(density = 'histogram', ...)",
#'     y = "align =",
#'     x = NULL
#'   ) +
#'   geom_vline(xintercept = 0, linetype = "22", color = "red")
#' @name align
#' @export
align_none = auto_partial(name = "align_none", function(breaks) {
  0
})

#' @rdname align
#' @export
align_boundary = auto_partial(name = "align_boundary", function(breaks, at = 0) {
  (breaks[[1]] - at) %% diff(breaks[1:2])
})

#' @rdname align
#' @export
align_center = auto_partial(name = "align_center", function(breaks, at = 0) {
  w = diff(breaks[1:2])
  (breaks[[1]] - at + w/2) %% w
})


# helpers -----------------------------------------------------------------

#' @importFrom stats weighted.mean
weighted_var = function(x, weights) {
  sum(weights * (x - weighted.mean(x, weights))^2) / sum(weights)
}

weighted_iqr = function(x, weights) {
  diff(weighted_quantile(as.numeric(x), c(0.25, 0.75), weights = weights, n = "sum"))
}

Try the ggdist package in your browser

Any scripts or data that you put into this service are public.

ggdist documentation built on Nov. 27, 2023, 9:06 a.m.