R/ppplot.R
In qqconf: Creates Simultaneous Testing Bands for QQ-Plots

Documented in pp_conf_plot

#' PP Plot with Simultaneous and Pointwise Testing Bounds.
#'
#' Create a pp-plot with with a shaded simultaneous acceptance region and,
#' optionally, lines for a point-wise region. The observed values are plotted
#' against their expected values had they come from the specified distribution.
#'
#' If any of the points of the pp-plot fall outside the simultaneous acceptance region for the selected
#' level alpha test, that means that we can reject the null hypothesis that the data are i.i.d. draws from the
#' specified distribution. If \code{difference} is set to TRUE, the vertical axis plots the
#' observed probability minus expected probability. If pointwise bounds are used, then on average, alpha * n of the points will fall outside
#' the bounds under the null hypothesis, so the chance that the pp-plot has any points falling outside of the pointwise bounds
#' is typically much higher than alpha under the null hypothesis. For this reason, a simultaneous region is preferred.
#'
#' @param obs The observed data.
#' @param distribution The probability function for the specified distribution. Defaults to \code{pnorm}.
#' Custom distributions are allowed as long as all parameters are supplied in dparams.
#' @param method Method for simultaneous testing bands. Must be either "ell" (equal local levels test), which applies a level \eqn{\eta} pointwise
#' test to each order statistic such that the Type I error of the global test is \code{alpha}, or "ks" to apply a
#' Kolmogorov-Smirnov test. "ell" is recommended.
#' @param alpha Type I error of global test of whether the data come from the reference distribution.
#' @param difference Whether to plot the difference between the observed and
#'   expected values on the vertical axis.
#' @param log10 Whether to plot axes on -log10 scale (e.g. to see small p-values).
#' @param right_tail This argument is only used if \code{log10} is \code{TRUE}. When \code{TRUE},
#' the x-axis is -log10(1 - Expected Probability) and the y-axis is -log10(1 - Observed Probability).
#' When \code{FALSE} (default) the x-axis is -log10(Expected Probability) and the y-axis is
#' -log10(Observed Probability). The argument should be set to \code{TRUE} to make
#' observations in the right tail of the distribution easier to see, and set to false to make the
#' observations in the left tail of the distribution easier to see.
#' @param add Whether to add points to an existing plot.
#' @param dparams List of additional arguments for the probability function of the distribution
#'   (e.g. df=1). Note that if any parameters of the distribution are specified, parameter estimation will not be performed
#'   on the unspecified parameters, and instead they will take on the default values set by the distribution function.
#'   For the uniform distribution, parameter estimation is not performed, and
#'   the default parameters are max = 1 and min = 0.
#'   For other distributions parameters will be estimated if not provided.
#'   For the normal distribution, we estimate the mean as the median and the standard deviation as \eqn{Sn} from the paper by Rousseeuw and Croux 1993
#'   "Alternatives to the Median Absolute Deviation". For all other distributions besides uniform and normal,
#'   the code uses MLE to estimate the parameters. Note that estimation is not implemented for custom distributions, so all
#'   parameters of the distribution must be provided by the user.
#' @param bounds_params List of optional arguments for \code{get_bounds_two_sided}.
#'   (i.e. \code{tol}, \code{max_it}, \code{method}).
#' @param line_params arguments passed to the line function to modify the line that indicates a perfect fit of the
#'   reference distribution.
#' @param plot_pointwise Boolean indicating whether pointwise bounds should be added to the plot
#' @param pointwise_lines_params arguments passed to the \code{lines} function that modifies pointwise bounds when plot_pointwise is
#'   set to TRUE.
#' @param points_params arguments to be passed to the \code{points} function to plot the data.
#' @param polygon_params Arguments to be passed to the polygon function to construct simultaneous confidence region.
#'   By default \code{border} is set to NA and \code{col} is set to grey.
#' @param prob_pts_method (optional) method used to get probability points for
#' plotting. The default value, \code{"uniform"}, results in
#' \code{ppoints(n, a=0)}, which are the expected
#' values of the order statistics of Uniform(0, 1).  When
#'  this argument is set to \code{"median"}, \code{qbeta(.5, c(1:n), c(n:1))},
#'  the medians of the order statistics of Uniform(0, 1) will be used. For a
#'  PP plot, there is no particular theoretical justification for setting this
#'  argument to \code{"normal"}, which results in \code{ppoints(n)}, but it is
#'  an option because it is used in some
#'  other packages. When \code{alpha} is large, \code{"median"} is recommended.
#' @param ... Additional arguments passed to the plot function.
#'
#' @export
#'
#' @return None, PP plot is produced.
#'
#' @examples
#' set.seed(0)
#' smp <- rnorm(100)
#'
#' # Plot PP plot against normal distribution with mean and variance estimated
#' pp_conf_plot(
#'   obs=smp
#' )
#'
#' # Make same plot on -log10 scale to highlight the left tail,
#' # with radius of plot circles also reduced by .5
#' pp_conf_plot(
#'   obs=smp,
#'   log10 = TRUE,
#'   points_params = list(cex = .5)
#' )
#'
#' # Make same plot with difference between observed and expected values on the y-axis
#' pp_conf_plot(
#'   obs=smp,
#'   difference = TRUE
#' )
#'
#' # Make same plot with samples plotted as a blue line, expected value line plotted as a red line,
#' # and pointwise bounds plotted as black lines
#' pp_conf_plot(
#'   obs=smp,
#'   plot_pointwise = TRUE,
#'   points_params = list(col="blue", type="l"),
#'   line_params = list(col="red")
#' )
#'
#' @references
#' Weine, E., McPeek, MS., & Abney, M. (2023).
#' Application of Equal Local Levels to Improve
#' Q-Q Plot Testing Bands with R Package qqconf
#' Journal of Statistical Software, 106(10).
#' https://doi:10.18637/jss.v106.i10
#'
pp_conf_plot <- function(obs,
                         distribution = pnorm,
                         method = c("ell", "ks"),
                         alpha = 0.05,
                         difference = FALSE,
                         log10 = FALSE,
                         right_tail = FALSE,
                         add = FALSE,
                         dparams = list(),
                         bounds_params = list(),
                         line_params = list(),
                         plot_pointwise = FALSE,
                         pointwise_lines_params = list(),
                         points_params = list(),
                         polygon_params = list(border = NA, col = 'gray'),
                         prob_pts_method = c("uniform", "median", "normal"),
                         ...) {

  if (!("q" %in% names(formals(distribution)))) {

    stop("distribution function must take 'q' as an argument.
         Did you mean to make a QQ plot?")

  }

  dots <- list(...)
  method <- match.arg(method)
  if (is.null(dots$ylab)) {
    if (difference && log10 && right_tail) {
      ylab <- expression("-log"[10]*"(1 - Observed probabilities) + log"[10]*"(1 - Expected probabilities)")
    } else if (difference && log10) {
      ylab <- expression("-log"[10]*"(Observed probabilities) + log"[10]*"(Expected probabilities)")
    } else if(difference) {
      ylab <- 'Obseved probabilities - Expected probabilities'
    } else if (log10 && right_tail) {
      ylab <- expression("-log"[10]*"(1 - Observed probabilities)")
    } else if (log10) {
      ylab <- expression("-log"[10]*"(Observed probabilities)")
    } else {
      ylab <- 'Observed probabilities'
    }
  } else {
    ylab <- dots$ylab
    dots <- within(dots, rm(ylab))
  }
  if (is.null(dots$xlab)) {
    if (log10 && right_tail) {
      xlab <- expression("-log"[10]*"(1 - Expected probabilities)")
    } else if (log10) {
      xlab <- expression("-log"[10]*"(Expected probabilities)")
    } else {
      xlab <- 'Expected probabilities'
    }
  } else {
    xlab <- dots$xlab
    dots <- within(dots, rm(xlab))
  }

  samp_size <- length(obs)
  conf_int <- 1 - alpha
  conf <- c(alpha / 2, conf_int + alpha / 2)
  ## The observed and expected probabilities. Expected probabilities are based on the specified
  ## distribution
  # constant for visual expansion of confidence regions
  c <- .5

  dist_name <- as.character(substitute(distribution))
  if(length(dparams) == 0 && dist_name == "punif") {

    dparams['min'] <- 0
    dparams['max'] <- 1

  } else if (length(dparams) == 0) {

    cat("no dparams supplied. Estimating parameters from the data...\n")
    MASS_name <- get_mass_name_from_distr(dist_name, "pp")
    dparams <- estimate_params_from_data(MASS_name, obs)

  }

  prob_pts_method <- match.arg(prob_pts_method)

  qq_distribution <- get_qq_distribution_from_pp_distribution(
    as.character(substitute(distribution))
  )

  global_bounds_qq <- get_qq_band(
    obs = obs,
    alpha = alpha,
    distribution = qq_distribution,
    dparams = dparams,
    ell_params = bounds_params,
    band_method = method,
    prob_pts_method = prob_pts_method
  )

  global_low <- do.call(distribution, c(list(q=global_bounds_qq$lower_bound), dparams))
  global_high <- do.call(distribution, c(list(q=global_bounds_qq$upper_bound), dparams))

  exp_pts <- do.call(distribution, c(list(q=global_bounds_qq$expected_value), dparams))

  obs_pts <- do.call(distribution, c(list(q=sort(obs)), dparams))

  ext_quantile <- get_extended_quantile(prob_pts_method, samp_size)

  if (log10 == TRUE && right_tail == TRUE) {

    exp_pts <- -log10(1 - exp_pts)
    low_exp_pt <- c * -log10(ext_quantile$high_pt) + (1 - c) * exp_pts[1]
    high_exp_pt <- c * -log10(ext_quantile$low_pt) + (1 - c) * exp_pts[samp_size]
    obs_pts <- -log10(1 - obs_pts)

  }
  else if (log10 == TRUE) {

    exp_pts <- -log10(exp_pts)
    low_exp_pt <- c * -log10(ext_quantile$low_pt) + (1 - c) * exp_pts[1]
    high_exp_pt <- c * -log10(ext_quantile$high_pt) + (1 - c) * exp_pts[samp_size]
    obs_pts <- -log10(obs_pts)

  }
  else {

    low_exp_pt <- c * (ext_quantile$low_pt) + (1 - c) * exp_pts[1]
    high_exp_pt <- c * (ext_quantile$high_pt) + (1 - c) * exp_pts[samp_size]

  }
  if (difference) {
    y_pts <- obs_pts - exp_pts
  } else {
    y_pts <- obs_pts
  }

  ## When not adding points to a pp-plot compute pointwise and global confidence bounds.
  if (!add) {
    left <- exp_pts[1]
    right <- exp_pts[samp_size]
    bottom <- min(y_pts) #obs_pts[1]
    top <- max(y_pts) #obs_pts[samp_size]
    do.call(plot, c(list(x=c(left, right), y=c(bottom, top), type='n', xlab=xlab, ylab=ylab), dots))

    pointwise_low <- qbeta(conf[1], 1:samp_size, samp_size:1)
    pointwise_high <- qbeta(conf[2], 1:samp_size, samp_size:1)

    if (log10 == TRUE && right_tail == TRUE) {
      pointwise_low <- -log10(1 - pointwise_low)
      pointwise_high <- -log10(1 - pointwise_high)
      global_low <- -log10(1 - global_low)
      global_high <- -log10(1 - global_high)
    } else if (log10 == TRUE) {
      pointwise_low <- -log10(pointwise_low)
      pointwise_high <- -log10(pointwise_high)
      global_low <- -log10(global_low)
      global_high <- -log10(global_high)
    }

    if ("ylim" %in% names(dots)) {

      bottom <- dots$ylim[1] - 1000
      top <- dots$ylim[2] + 1000

    } else {

      global_low_temp <- global_low[is.finite(global_low)]
      global_high_temp <- global_high[is.finite(global_high)]
      bottom <- min(global_low_temp) - 1000
      top <- max(global_high_temp) + 1000

    }

    if (difference) {

      low_global_diff <- global_low - exp_pts
      low_global_diff <- c(low_global_diff[1], low_global_diff, low_global_diff[samp_size])
      high_global_diff <- global_high - exp_pts
      high_global_diff <- c(high_global_diff[1], high_global_diff, high_global_diff[samp_size])
      low_pointwise_diff <- pointwise_low - exp_pts
      low_pointwise_diff <- c(low_pointwise_diff[1], low_pointwise_diff, low_pointwise_diff[samp_size])
      high_pointwise_diff <- pointwise_high - exp_pts
      high_pointwise_diff <- c(high_pointwise_diff[1], high_pointwise_diff, high_pointwise_diff[samp_size])
      exp_pts <- c(low_exp_pt, exp_pts, high_exp_pt)

      do.call(
        polygon,
        c(list(x = c(exp_pts, rev(exp_pts)),
               y = pmin(pmax(c(low_global_diff, rev(high_global_diff)), bottom), top)),
          polygon_params)
      )
      if (plot_pointwise) {

        do.call(lines, c(list(x = exp_pts, y = low_pointwise_diff), pointwise_lines_params))
        do.call(lines, c(list(x = exp_pts, y = high_pointwise_diff), pointwise_lines_params))

      }

    } else {

      # code to extend region for visibility
      global_low <- c(global_low[1], global_low, global_low[samp_size])
      global_high <- c(global_high[1], global_high, global_high[samp_size])
      pointwise_low <- c(pointwise_low[1], pointwise_low, pointwise_low[samp_size])
      pointwise_high <- c(pointwise_high[1], pointwise_high, pointwise_high[samp_size])
      exp_pts <- c(low_exp_pt, exp_pts, high_exp_pt)

      do.call(
        polygon,
        c(list(x = c(exp_pts, rev(exp_pts)),
               y = pmin(pmax(c(global_low, rev(global_high)), bottom), top)),
          polygon_params)
      )
      if (plot_pointwise) {

        do.call(lines, c(list(x = exp_pts, y = pointwise_low), pointwise_lines_params))
        do.call(lines, c(list(x = exp_pts, y = pointwise_high), pointwise_lines_params))

      }

    }

    do.call(points, c(list(x = exp_pts[2:(samp_size + 1)], y = y_pts), points_params))

  } else {

    do.call(points, c(list(x = exp_pts, y = y_pts), points_params))

  }
  if (difference) {

    do.call(lines, c(list(x = c(min(exp_pts), max(exp_pts)), y = c(0, 0)), line_params))

  } else{

    do.call(lines, c(list(x = c(min(exp_pts), max(exp_pts)), y = c(min(exp_pts), max(exp_pts))), line_params))

  }

}