#' @title Generalized Boxplot
#'
#' @author Christian L. Goueguel
#'
#' @description
#' This function implements the generalized boxplot, a robust data visualization
#' technique designed to effectively represent skewed and heavy-tailed distributions,
#' as proposed by Bruffaerts *et al*. (2014).
#'
#' @details
#' This method extends the adjusted boxplot method by leveraging the flexible Tukey's
#' g-and-h parametric distribution to model the underlying data structure,
#' particularly for asymmetric or long-tailed datasets, providing a more nuanced
#' and informative summary of the data's central tendency, spread, and potential
#' outliers.
#'
#' @references
#' - Bruffaerts, C., Verardi, V., Vermandele, C. (2014). A generalized boxplot for
#' skewed and heavy-tailed distributions. Statistics and Probability Letters 95(C):110–117
#'
#' @param x A numeric data frame or tibble.
#' @param alpha A scalar, between 0 and 1 that specifies the desired detection rate of atypical values.
#' @param p A scalar, between 0.5 and 1 that specifies the quantile order for estimating g and h.
#' @param plot Logical value indicating whether to plot the boxplot or return the boxplot statistics.
#' @param xlabels.angle A numeric value specifying the angle (in degrees) for x-axis labels (default is 90).
#' @param xlabels.vjust A numeric value specifying the vertical justification of x-axis labels (default is 1).
#' @param xlabels.hjust A numeric value specifying the horizontal justification of x-axis labels (default is 1).
#' @param box.width A numeric value specifying the width of the boxplot (default is 0.5).
#' @param notch A logical value indicating whether to display a notched boxplot (default is `FALSE`).
#' @param notchwidth A numeric value specifying the width of the notch relative to the body of the boxplot (default is 0.5).
#' @param staplewidth A numeric value specifying the width of staples at the ends of the whiskers.
#'
#' @return
#' - If `plot = TRUE`, returns a `ggplot2` object containing the generalized boxplot.
#' - If `plot = FALSE`, returns a list of tibbles with the generalized boxplot statistics and potantial outliers.
#'
#' @export generalized_boxplot
#'
#' @examples
#' set.seed(123)
#' data <- data.frame(
#' normal = rnorm(100),
#' skewed = rexp(100, rate = 0.5),
#' heavy_tailed = rt(100, df = 3)
#' )
#'
#' # Plot the generalized boxplot
#' generalized_boxplot(data)
#'
#' # Retrieve the generalized boxplot statistics
#' generalized_boxplot(data, plot = FALSE)
#'
generalized_boxplot <- function(x, alpha = 0.05, p = 0.9, plot = TRUE, xlabels.angle = 90, xlabels.vjust = 1, xlabels.hjust = 1, box.width = .5, notch = FALSE, notchwidth = 0.5, staplewidth = 0.5) {
if (missing(x)) {
stop("Missing 'x' argument.")
}
if (!all(x %>% purrr::map_lgl(is.numeric))) {
stop("Input 'x' must be a numeric data frame.")
}
if (!is.numeric(alpha) || alpha < 0 || alpha > 1) {
stop("Argument 'alpha' must be a numeric value between 0 and 1.")
}
if (!is.numeric(p) || p < 0.5 || p > 1) {
stop("Argument 'p' must be a numeric value between 0.5 and 1.")
}
if(!is.logical(plot)) {
stop("Argument 'plot' must be of type boolean (TRUE or FALSE).")
}
if (!is.logical(notch)) {
stop("Argument 'notch' must be of type boolean (TRUE or FALSE).")
}
if (!is.numeric(xlabels.angle) || xlabels.angle < 0 || xlabels.angle > 360) {
stop("Argument 'x_axis_angle' must be a numeric value between 0 and 360.")
}
if (!is.numeric(xlabels.vjust) || xlabels.vjust < 0 || xlabels.vjust > 1) {
stop("Argument 'xlabels.vjust' must be a numeric value between 0 and 1.")
}
if (!is.numeric(xlabels.hjust) || xlabels.hjust < 0 || xlabels.hjust > 1) {
stop("Argument 'xlabels.hjust' must be a numeric value between 0 and 1.")
}
if (!is.numeric(box.width) || box.width <= 0) {
stop("Argument 'box.width' must be a positive numeric value.")
}
if (!is.logical(notch)) {
stop("Argument 'notch' must be of type boolean (TRUE or FALSE).")
}
if (!is.numeric(notchwidth) || notchwidth < 0 || notchwidth > 1) {
stop("Argument 'notchwidth' must be a numeric value between 0 and 1.")
}
if (!is.numeric(staplewidth) || staplewidth < 0) {
stop("Argument 'staplewidth' must be a positive numeric value.")
}
genBoxplot_stats <- x %>%
purrr::map(
function(.x) {
stats_tbl <- genboxStats(.x, alpha, p)
tibble::tibble(
lower = stats_tbl$stats$lower_fence,
q1 = stats_tbl$stats$lower_quantile,
median = stats_tbl$stats$median,
q3 = stats_tbl$stats$upper_quantile,
upper = stats_tbl$stats$upper_fence
)
}
) %>%
dplyr::bind_rows(.id = "variable") %>%
purrr::modify_at("variable", forcats::as_factor)
genBoxplot_out <- x %>%
purrr::map(
function(.x) {
out_tbl <- genboxStats(.x, alpha, p)
tibble::tibble(
lower = out_tbl$outliers$lower,
upper = out_tbl$outliers$upper
)
}
) %>%
dplyr::bind_rows(.id = "variable") %>%
purrr::modify_at("variable", forcats::as_factor) %>%
tidyr::pivot_longer(!variable, names_to = "out", values_to = "value")
variable <- NULL
lower <- NULL
q1 <- NULL
median <- NULL
q3 <- NULL
upper <- NULL
value <- NULL
p <- ggplot2::ggplot() +
ggplot2::geom_boxplot(
data = genBoxplot_stats,
ggplot2::aes(
x = variable,
ymin = lower,
lower = q1,
middle = median,
upper = q3,
ymax = upper,
group = variable,
fill = variable),
stat = "identity",
width = box.width,
colour = "black",
outlier.colour = NA,
outlier.shape = NA,
notch = notch,
notchwidth = notchwidth,
staplewidth = staplewidth) +
ggplot2::geom_point(
data = genBoxplot_out,
ggplot2::aes(
x = variable,
y = value,
fill = variable,
group = variable),
shape = 21,
size = 2,
alpha = 1/3) +
ggplot2::geom_jitter(size = 1.5) +
ggsci::scale_fill_d3(palette = "category20") +
ggplot2::theme_bw() +
ggplot2::theme(
legend.position = "none",
panel.grid = ggplot2::element_blank(),
axis.text.x = ggplot2::element_text(angle = xlabels.angle, vjust = xlabels.vjust, hjust = xlabels.hjust)) +
ggplot2::labs(x = " ", y = " ")
if (plot == TRUE) {
return(p)
} else{
r <- list("stats" = genBoxplot_stats, "outliers" = genBoxplot_out)
return(r)
}
}
genboxStats <- function(x, alpha, p) {
n <- length(x)
order_x <- sort(x)
x_star <- (x - stats::median(x)) / stats::IQR(x)
r <- x_star - min(x_star) + 0.1
r_tilde <- r / (min(r) + max(r))
w <- stats::qnorm((rank(r_tilde, na.last = "keep") - 0.5) / sum(!is.na(r_tilde)))
w_star <- (w - stats::median(w)) / (stats::IQR(w) / 1.3426)
A <- stats::quantile(w_star, p) / stats::quantile(w_star, 1 - p)
B <- (stats::quantile(w_star, p) * stats::quantile(w_star, 1 - p)) / (stats::quantile(w_star, p) + stats::quantile(w_star, 1 - p))
z <- stats::qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
g <- (1 / z) * log(-A)
h <- (2 / z^2) * log(-g * B)
xi_alpha_2 <- tukeyGH(alpha / 2, type = "q", location = 0, scale = 1, g = g, h = h)
xi_1_alpha_2 <- tukeyGH(1 - alpha / 2, type = "q", location = 0, scale = 1, g = g, h = h)
atypical_indices <- which(w_star < xi_alpha_2 | w_star > xi_1_alpha_2)
L_star_minus <- xi_alpha_2
L_star_plus <- xi_1_alpha_2
C_low <- stats::median(w) + (stats::IQR(w) / 1.3426) * L_star_minus
C_up <- stats::median(w) + (stats::IQR(w) / 1.3426) * L_star_plus
f_low <- stats::pnorm(C_low, mean = 0, sd = 1) * (min(r) + max(r)) + min(x_star) - 0.1
f_up <- stats::pnorm(C_up, mean = 0, sd = 1) * (min(r) + max(r)) + min(x_star) - 0.1
q1 <- stats::quantile(x, 0.25)
q3 <- stats::quantile(x, 0.75)
s <- tibble::tibble(
lower_fence = f_low * stats::IQR(x) + stats::median(x),
lower_quantile = q1,
median = stats::median(x),
upper_quantile = q3,
upper_fence = f_up * stats::IQR(x) + stats::median(x)
)
out <- tibble::tibble(
lower = order_x[order_x < s$lower_fence],
upper = order_x[order_x > s$upper_fence]
)
results <- list("stats" = s, "outliers" = out)
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.