#' Quantile-quantile confidence bands
#'
#' Draws quantile-quantile confidence bands, with an additional detrend option.
#'
#' @import ggplot2
#' @importFrom dplyr summarize
#' @importFrom dplyr group_by
#' @importFrom MASS fitdistr
#' @importFrom robustbase s_Qn
#' @importFrom robustbase Qn
#'
#' @include stat_qq_line.R
#'
#' @inheritParams ggplot2::layer
#' @inheritParams ggplot2::geom_ribbon
#'
#' @param distribution Character. Theoretical probability distribution function
#' to use. Do not provide the full distribution function name (e.g.,
#' \code{"dnorm"}). Instead, just provide its shortened name (e.g.,
#' \code{"norm"}). If you wish to provide a custom distribution, you may do so
#' by first creating the density, quantile, and random functions following the
#' standard nomenclature from the \code{stats} package (i.e., for
#' \code{"custom"}, create the \code{dcustom}, \code{pcustom},
#' \code{qcustom}, and \code{rcustom} functions).
#' @param dparams List of additional parameters passed on to the previously
#' chosen \code{distribution} function. If an empty list is provided (default)
#' then the distributional parameters are estimated via MLE. MLE for custom
#' distributions is currently not supported, so you must provide the
#' appropriate \code{dparams} in that case.
#' @param detrend Logical. Should the plot objects be detrended? If \code{TRUE},
#' the objects will be detrended according to the reference Q-Q line. This
#' procedure was described by Thode (2002), and may help reducing visual bias
#' caused by the orthogonal distances from Q-Q points to the reference line.
#' @param identity Logical. Should an identity line be used as the reference
#' line used to construct the confidence bands? If \code{TRUE}, the identity
#' line is used. If \code{FALSE} (default), the commonly-used Q-Q line that
#' intercepts two data quantiles specified in \code{qprobs} is used. Please
#' notice that the chosen reference line will also be used for the detrending
#' procedure, if \code{detrend = TRUE}.
#' @param qtype Integer between 1 and 9. Type of the quantile algorithm to be
#' used by the \code{\link[stats]{quantile}} function to construct the Q-Q
#' line.
#' @param qprobs Numeric vector of length two. Represents the quantiles used by
#' the \code{\link[stats]{quantile}} function to construct the Q-Q line.
#' @param bandType Character. Either \code{"pointwise"}, \code{"boot"}, \code{"ks"} or
#' \code{"ts"}, or \code{"ell"}. \code{"pointwise"} constructs pointwise confidence bands based
#' on Normal confidence intervals. \code{"boot"} creates pointwise confidence
#' bands based on a parametric bootstrap; parameters are estimated with MLEs.
#' \code{"ks"} constructs simultaneous confidence bands based on the Kolmogorov-Smirnov
#' test. \code{"ts"} constructs tail-sensitive confidence bands, as
#' described by Aldor-Noiman et al. (2013) (also, see 'Note' for
#' limitations). Finally, \code{"ell"} constructs simultaenous bands using the
#' equal local levels test describe by Weine et al. (2021).
#' @param B Integer. If \code{bandType = "boot"}, then \code{B} is the number of
#' bootstrap replicates. If \code{bandType = "ts"}, then \code{B} is the
#' number of simulated samples.
#' @param conf Numerical. Confidence level of the bands.
#' @param mu Numerical. Only used if \code{bandType = "ts"}. Center
#' distributional parameter used to construct the simulated tail-sensitive
#' confidence bands. If either \code{mu} or \code{sigma} are \code{NULL}, then
#' those parameters are estimated using \code{\link[robustbase]{Qn}} and
#' \code{\link[robustbase]{s_Qn}}, respectively.
#' @param sigma Numerical. Only used if \code{bandType = "ts"}. Scale
#' distributional parameter used to construct the simulated tail-sensitive
#' confidence bands. If either \code{mu} or \code{sigma} are \code{NULL}, then
#' those parameters are estimated using robust estimates from the \pkg{stats}
#' package.
#'
#' @note
#' \itemize{
#' \item{Tail-sensitive confidence bands are only implemented for Normal Q-Q
#' plots. As a future update, we intend to generalize to other distributions.}
#' \item{Bootstrap bands are constructed based on a MLE parametric bootstrap.
#' Hence, it is not possible to construct such bands if the sample and
#' theoretical distributions present mismatching supports.}
#' }
#'
#' @references
#' \itemize{
#' \item{Thode, H. (2002), Testing for Normality. CRC Press, 1st Ed.}
#' \item{\href{https://www.tandfonline.com/doi/abs/10.1080/00031305.2013.847865}{Aldor-Noiman,
#' S. et al. (2013). The Power to See: A New Graphical Test of Normality. The
#' American Statistician. 67:4.}}
#' \item{\href{https://arxiv.org/abs/2111.15082}{Weine, E. et al. (2021).
#' Application of Equal Local Levels to Improve Q-Q Plot Testing Bands with R
#' Package qqconf.}}
#' }
#'
#' @examples
#' # generate random Normal data
#' set.seed(0)
#' smp <- data.frame(norm = rnorm(100))
#'
#' # Normal Q-Q plot of Normal data
#' gg <- ggplot(data = smp, mapping = aes(sample = norm)) +
#' stat_qq_band() +
#' stat_qq_line() +
#' stat_qq_point()
#' gg + labs(x = "Theoretical Quantiles", y = "Sample Quantiles")
#'
#' # Normal Q-Q plot of Normal data with equal local levels (ell) bands
#' bt <- "ell"
#' gg <- ggplot(data = smp, mapping = aes(sample = norm)) +
#' stat_qq_band(bandType = bt) +
#' stat_qq_line() +
#' stat_qq_point() +
#' labs(x = "Theoretical Quantiles", y = "Sample Quantiles")
#' gg
#'
#' # Exponential Q-Q plot of mean ozone levels (airquality dataset)
#' di <- "exp"
#' dp <- list(rate = 1)
#' gg <- ggplot(data = airquality, mapping = aes(sample = Ozone)) +
#' stat_qq_band(distribution = di, dparams = dp) +
#' stat_qq_line(distribution = di, dparams = dp) +
#' stat_qq_point(distribution = di, dparams = dp) +
#' labs(x = "Theoretical Quantiles", y = "Sample Quantiles")
#' gg
#'
#' # Detrended Exponential Q-Q plot of mean ozone levels
#' di <- "exp"
#' dp <- list(rate = 1)
#' de <- TRUE
#' gg <- ggplot(data = airquality, mapping = aes(sample = Ozone)) +
#' stat_qq_band(distribution = di, detrend = de) +
#' stat_qq_line(distribution = di, detrend = de) +
#' stat_qq_point(distribution = di, detrend = de) +
#' labs(x = "Theoretical Quantiles", y = "Sample Quantiles")
#' gg
#'
#' \dontrun{
#' # Normal Q-Q plot of Normal data with bootstrap confidence bands
#' bt <- "boot"
#' gg <- ggplot(data = smp, mapping = aes(sample = norm)) +
#' stat_qq_band(bandType = bt) +
#' stat_qq_line() +
#' stat_qq_point() +
#' labs(x = "Theoretical Quantiles", y = "Sample Quantiles")
#' gg
#'
#' # Normal Q-Q plot of Normal data with tail-sensitive confidence bands
#' bt <- "ts"
#' gg <- ggplot(data = smp, mapping = aes(sample = norm)) +
#' stat_qq_band(bandType = bt) +
#' stat_qq_line() +
#' stat_qq_point() +
#' labs(x = "Theoretical Quantiles", y = "Sample Quantiles")
#' gg
#' }
#'
#'
#' @export
stat_qq_band <- function(
mapping = NULL,
data = NULL,
geom = "qq_band",
position = "identity",
na.rm = TRUE,
show.legend = NA,
inherit.aes = TRUE,
distribution = "norm",
dparams = list(),
detrend = FALSE,
identity = FALSE,
qtype = 7,
qprobs = c(.25, .75),
bandType = "pointwise",
B = 1000,
conf = .95,
mu = NULL,
sigma = NULL,
...
) {
# error handling
if (!(distribution %in% c(
"beta",
"cauchy",
"chisq",
"exp",
"f",
"gamma",
"geom",
"lnorm",
"logis",
"norm",
"nbinom",
"pois",
"t",
"weibull")) &
length(dparams) == 0 &
table(sapply(formals(eval(parse(text = paste0("q", distribution)))), typeof))["symbol"] > 1) {
stop(
"MLE is currently not supported for custom distributions.\n",
"Please provide all the custom distribution parameters to 'dparams'.",
call. = FALSE
)
}
if (qtype < 1 | qtype > 9) {
stop("Please provide a valid quantile type: ",
"'qtype' must be between 1 and 9.",
call. = FALSE)
}
if (conf < 0 | conf > 1) {
stop("Please provide a valid confidence level for the bands: ",
"'conf' must be between 0 and 1.",
call. = FALSE)
}
if (B < 0) {
stop("Please provide a positive value for B.",
call. = FALSE)
}
bandType <- match.arg(bandType, c("pointwise", "boot", "ts", "ks", "ell"))
# vector with common discrete distributions
discreteDist <- c("binom", "geom", "nbinom", "pois")
if (distribution %in% discreteDist) geom <- "errorbar"
ggplot2::layer(
data = data,
mapping = mapping,
stat = StatQqBand,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
distribution = distribution,
dparams = dparams,
detrend = detrend,
identity = identity,
qtype = qtype,
qprobs = qprobs,
bandType = bandType,
B = round(B),
conf = conf,
mu = mu,
sigma = sigma,
discrete = distribution %in% discreteDist,
...
)
)
}
#' StatQqBand
#'
#' @keywords internal
#' @usage NULL
#' @export
#' @importFrom qqconf get_qq_band
StatQqBand <- ggplot2::ggproto(
`_class` = "StatQqBand",
`_inherit` = StatQqLine,
default_aes = ggplot2::aes(
x = ..x..,
ymin = ..lower..,
ymax = ..upper..
),
required_aes = c("sample"),
dropped_aes = c("sample"),
compute_group = {
function(data,
self,
scales,
distribution,
dparams,
detrend,
identity,
qtype,
qprobs,
bandType,
B,
conf,
mu,
sigma,
discrete) {
# distributional functions
dFunc <- eval(parse(text = paste0("d", distribution)))
qFunc <- eval(parse(text = paste0("q", distribution)))
rFunc <- eval(parse(text = paste0("r", distribution)))
smp <- sort(data$sample)
n <- length(smp)
quantiles <- ppoints(n)
# automatically estimate parameters with MLE, only if no parameters are
# provided with dparams and there are at least one distributional parameter
# without a default value
if(length(dparams) == 0) {
# equivalence between base R and MASS::fitdistr distribution names
corresp <- function(distName) {
switch(
distName,
beta = "beta",
cauchy = "cauchy",
chisq = "chi-squared",
exp = "exponential",
f = "f",
gamma = "gamma",
geom = "geometric",
lnorm = "log-normal",
logis = "logistic",
norm = "normal",
nbinom = "negative binomial",
pois = "poisson",
t = "t",
weibull = "weibull",
NULL
)
}
# initial value for some distributions
initVal <- function(distName) {
switch(
distName,
beta = list(shape1 = 1, shape2 = 1),
chisq = list(df = 1),
f = list(df1 = 1, df2 = 2),
t = list(df = 1),
NULL
)
}
suppressWarnings({
if(!is.null(corresp(distribution))) {
if(is.null(initVal(distribution))) {
dparams <- MASS::fitdistr(x = smp, densfun = corresp(distribution))$estimate
} else {
dparams <- MASS::fitdistr(x = smp, densfun = corresp(distribution), start = initVal(distribution))$estimate
}
}
})
}
theoretical <- do.call(qFunc, c(list(p = quantiles), dparams))
# inherit from StatQqLine
xline <- self$super()$compute_group(data = data,
distribution = distribution,
dparams = dparams,
qtype = qtype,
qprobs = qprobs,
identity = identity,
detrend = FALSE)$xline
yline <- self$super()$compute_group(data = data,
distribution = distribution,
dparams = dparams,
qtype = qtype,
qprobs = qprobs,
identity = identity,
detrend = FALSE)$yline
slope <- diff(yline) / diff(xline)
intercept <- yline[1L] - slope * xline[1L]
fittedValues <- (slope * theoretical) + intercept
# pointwise confidence bands based on normal confidence intervals
if (bandType == "pointwise") {
probs <- ppoints(n)
stdErr <- (slope / do.call(dFunc, c(list(x = theoretical), dparams))) * sqrt(probs * (1 - probs) / n)
zCrit <- qnorm(p = (1 - (1 - conf) / 2))
upper <- fittedValues + (stdErr * zCrit)
lower <- fittedValues - (stdErr * zCrit)
}
# parametric bootstrap pointwise confidence intervals
if (bandType == "boot") {
bs <- apply(
X = matrix(do.call(rFunc, c(list(n = n * B), as.list(dparams))), n, B),
MARGIN = 2,
FUN = sort
)
upper <- apply(X = bs, MARGIN = 1, FUN = quantile, probs = (1 + conf) / 2)
lower <- apply(X = bs, MARGIN = 1, FUN = quantile, probs = (1 - conf) / 2)
}
# using the DKW inequality for simultaneous bands
if (bandType == "ks") {
probs <- ppoints(n)
epsilon <- sqrt((1 / (2 * n)) * log(2/(1-conf)))
lp <- pmax(probs - epsilon, rep(0, n))
up <- pmin(probs + epsilon, rep(1, n))
lower <- intercept + slope * do.call(qFunc, c(list(p = lp), dparams))
upper <- intercept + slope * do.call(qFunc, c(list(p = up), dparams))
}
# tail-sensitive confidence bands
if (bandType == "ts") {
if (distribution != "norm") {
warning("Be aware that tail-sensitive confidence bands are _only_ implemented for Normal Q-Q plots.",
call. = F)
}
centerFunc <- function(x) robustbase::s_Qn(x, mu.too = TRUE)[[1]]
scaleFunc <- function(x) robustbase::Qn(x, finite.corr = FALSE)
upperCi <- rep(NA, n)
lowerCi <- rep(NA, n)
pValue <- matrix(NA, nrow = n, ncol = B)
# simulate data
sim <- NULL
if (is.null(mu) | is.null(sigma)) {
for (i in 1:B) sim <- cbind(sim, sort(rnorm(n)))
# center and scale simulated data
center <- apply(sim, 2, centerFunc)
scale <- apply(sim, 2, scaleFunc)
sim <- sweep(sweep(sim, 2, center, FUN = "-"), 2, scale, FUN = "/")
# convert simulated values to probabilities
sim <- t(apply(sim, 1, pnorm))
} else {
for (i in 1:B) sim <- cbind(sim, sort(runif(n)))
}
# widen the CIs to get simultanoues (100 * conf)% CIs
for (i in 1:n) {
tmp <- pbeta(sim[i, ], shape1 = i, shape2 = n + 1 - i)
pValue[i, ] <- apply(cbind(tmp, 1 - tmp), 1, min)
}
critical <- apply(pValue, 2, min)
criticalC <- quantile(critical, prob = 1 - conf)
upperCi <- qbeta(1 - criticalC, shape1 = 1:n, shape2 = n + 1 - (1:n))
lowerCi <- qbeta(criticalC, shape1 = 1:n, shape2 = n + 1 - (1:n))
# translate back to sample quantiles
if (is.null(mu) | is.null(sigma)) {
upper <- qnorm(upperCi) * scaleFunc(smp) + centerFunc(smp)
lower <- qnorm(lowerCi) * scaleFunc(smp) + centerFunc(smp)
} else {
upper <- qnorm(upperCi) * sigma + mu
lower <- qnorm(lowerCi) * sigma + mu
}
}
if (bandType == "ell") {
band <- qqconf::get_qq_band(
n = n,
alpha = 1 - conf,
distribution = qFunc,
dparams = dparams,
prob_pts_method = "normal"
)
probs <- band$expected_value
upper <- band$upper_bound
lower <- band$lower_bound
}
out <- data.frame(
x = theoretical,
upper = upper,
lower = lower,
fill = if (is.null(data$fill)) rgb(.6, .6, .6, .5) else data$fill
)
if (discrete) {
out$colour <- rgb(.5, .5, .5)
# create a data.frame with unique rows
out <- dplyr::summarize(
dplyr::group_by(out, x, fill, colour),
upper = max(upper),
lower = min(lower)
)
out <- as.data.frame(out)
}
# detrend the confidence bands by keeping the same distance from the
# stat_qq_line, which now is a line centered on y = 0
if (detrend) {
out$upper <- out$upper - fittedValues
out$lower <- out$lower - fittedValues
}
out
}
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.