#' Compute Weighted Percentiles cross a List of Data Frames (with optional grouping)
#' @description
#' Applies the `compute_weighted_percentiles()` function to each data frame in a named list, using a specified variable and optional weight.
#' Supports optional grouping via a categorical `by` variable, and returns either percentile values or share values, depending on the `share` argument.
#'
#' @param data_list A named list of data frames, (e.g., across countries or years).
#' @param var_name A string specifying the variable name (e.g., "dhi", "pilabour") to compute percentiles or shares for.
#' @param wgt_name An optional string specifying the weight variable to be used. If `NULL`, equal weights are assumed.
#' @param probs A numeric vector of probabilities with values between 0 and 1, defining percentiles (if `share = FALSE`) or the brackets between which shares are computed (if `share = TRUE`).
#' @param type A character string indicating which percentile definition to use. Only used when `share = FALSE`.
#' - `"type_4"`: default, linear interpolation-based of the empirical cdf - continuous sample quantile.
#' - `"type_2"`: used in Stata commands like collapse and _pctile, inverse of empirical distribution function with averaging at discontinuities - discontinuous sample quantile.
#' @param share Logical. If `TRUE`, returns income shares between percentile brackets instead of the percentile values. Default is \code{FALSE}.
#' Note: This **always uses** `definition = "type_4"` (interpolation), regardless of the `definition` parameter. It cannot be combined with `definition = "type_2"`.
#' @param na.rm Logical. If `TRUE`, missing values in `var_name` or `wgt_name` are removed.
#' @param by Optional string giving the name of a categorical variable to split the data within each data frame before computing statistics.
#'
#' @return A named list.
#'
#' - If `by == NULL`, `share == FALSE` and `length(probs) == 1`: each list element is named by country and contains a named numeric vector, where the names are years and values are the computed statistics.
#'
#' - If `by == NULL`, and either `share == TRUE` or `length(probs) > 1`: each list element is named by `ccyy` (country-year) identifiers and contains a named numeric vector, where the names represent the share intervals or the percentiles defined in `probs`.
#'
#' - If `by != NULL`: the list has `ccyy` identifiers as keys. Each element is a sublist, named after the categories of the `by` variable. Each sublist contains a named numeric vector of computed statistics.
#'
#' @details
#' Percentiles are computed using weighted version of **quantile definition 4** from Hyndman and Fan (1996), by default, or **quantile definition 2** if specified.
#' When `share = TRUE`, the function estimates Lorenz ordinates by taking quantiles from the running sum of the ordered outcome variable (divided by the total),
#' according to the same **quantile definition 4 only**.
#'
#' @export
#'
#' @examples
#' \dontrun{
#'
#' # Import data
#' my_data_list <- lissyuse(data = "es", from = 2016)
#'
#' # Retrieve the percentile estimates
#' percentiles_result <- my_data_list %>%
#' run_weighted_percentiles(
#' var_name = "dhi",
#' wgt_name = "hpopwgt",
#' probs = seq(0.1, 0.9, 0.1),
#' type = "type_2",
#' na.rm = TRUE
#' )
#' print(percentiles_result)
#'
#' # Compute the distribution shares
#' shares_result <- run_weighted_percentiles(
#' data_list = my_data_list,
#' var_name = "dhi",
#' wgt_name = "hpopwgt",
#' probs = seq(0, 1, 0.1),
#' share = TRUE,
#' na.rm = TRUE
#' )
#' print(shares_result)
#'
#' # Using the by option
#' by_result_median <- run_weighted_percentiles(
#' data_list = purrr::map(my_data_list[1:2], ~.x %>% filter(emp == 1)),
#' var_name = "pi11",
#' wgt_name = "ppopwgt",
#' probs = 0.5,
#' type = "type_4",
#' na.rm = TRUE,
#' by = "region_c"
#'
#' )
#' print(by_result_median)
#' }
run_weighted_percentiles <- function(data_list, var_name, wgt_name = NULL, probs = seq(0, 1, 0.25), type = c("type_4", "type_2"), share = FALSE, na.rm = TRUE, by = NULL) {
data_list <- lissyrtools::remove_dname_with_missings_in_weights(
data_list,
wgt_name
) # return a list cleaned
# Check that var_name exists
assertthat::assert_that(
var_name %in% names(data_list[[1]]),
msg = glue::glue(
"Variable '{var_name}' could not be found as a column name in the datasets."
)
)
# Check that all variables in `by` exist, if provided
if (!is.null(by)) {
assertthat::assert_that(
by %in% names(data_list[[1]]),
msg = glue::glue(
"Grouping variable '{by}' could not be found as a column name in the datasets."
)
)
}
# Check that wgt_name exists, if provided
if (!is.null(wgt_name)) {
assertthat::assert_that(
wgt_name %in% names(data_list[[1]]),
msg = glue::glue(
"Weight variable '{wgt_name}' could not be found as a column name in the datasets."
)
)
}
lissyrtools::check_input_in_weight_argument(wgt_name)
type <- match.arg(type)
if (share && type != "type_4") {
warning("When `share = TRUE`, `type` is ignored and set to 'type_4'.")
}
if (!is.null(by)) {
allowed_categoricals_in_by <- c(lissyrtools::lis_categorical_variables, lissyrtools::lws_wealth_categorical_variables, "inum")
if (!by %in% allowed_categoricals_in_by) {
stop(sprintf("The `by` variable must be a categorical variable from `lissyrtools::lis_categorical_variables`, `lws_wealth_categorical_variables`, or the variable 'inum'."))
}
df_to_keep <- purrr::map_lgl(data_list, ~!all(is.na(.x[[by]])))
# change the data frames in `data_list` !
if (any(!df_to_keep)) {
warning(sprintf("The `by` variable '%s' contains only NAs in the following frames, which will be dropped: %s",
by, paste(names(df_to_keep)[!df_to_keep], collapse = ", ")))
data_list <- data_list[df_to_keep]
}
}
result <- purrr::imap(data_list, function(df, name) {
if (!is.null(by)) {
df[[by]] <- as.character(haven::as_factor(df[[by]]))
df <- df[!is.na(df[[by]]), ]
split_df <- split(df, df[[by]])
result <- purrr::imap(split_df, function(sub_df, group_name) {
var <- sub_df[[var_name]]
wgt <- if (!is.null(wgt_name)) sub_df[[wgt_name]] else NULL
lissyrtools::compute_weighted_percentiles(
var = var,
wgt = wgt,
probs = probs,
share = share,
type = if (share) "type_4" else type,
na.rm = na.rm
)
})
# Flatten if only one prob value (e.g., just median) and not share
if (length(probs) == 1 && share == FALSE) {
result <- purrr::map(result, ~unname(.x)) %>% unlist()
}
result
} else {
var <- df[[var_name]]
wgt <- if (!is.null(wgt_name)) df[[wgt_name]] else NULL
lissyrtools::compute_weighted_percentiles(
var = var,
wgt = wgt,
probs = probs,
share = share,
type = if (share) "type_4" else type,
na.rm = na.rm
)
}
})
if (length(probs) == 1 && share == FALSE && is.null(by)) {
result <- convert_list_from_ccyy_to_cc_names_yyyy(result)
}
return(result)
}
#' Compute Weighted Percentiles or Share of Distribution
#'
#' @description
#' This function computes weighted percentiles of a numeric vector, or the share of total value within specified percentile intervals.
#'
#'
#' @param var A numeric vector of values (e.g., .x$dhi, .x$pi11).
#' @param wgt A numeric vector of weights (e.g., .x$hpopwgt, .x$pwgt). Must be the same length as \code{x}.
#' @param probs A numeric vector of probabilities between 0 and 1 indicating which percentiles to compute. Default is \code{seq(0, 1, 0.25)}.
#' @param na.rm Logical; if \code{TRUE}, missing values in \code{x} and \code{w} are removed before computation. Default is \code{TRUE}.
#' @param share Logical; if \code{TRUE}, computes the share of total value (e.g., .x$dhi) within each interval defined by \code{probs}. If \code{FALSE}, returns the percentile values. Default is \code{FALSE}. Note: This option always uses \code{type = "type_4"}, and can not be used toghether with \code{type = "type_2"}.
#' @param type A character string indicating which percentile definition to use. Either \code{"type_4"} (default, linear interpolation-based of the empirical cdf - continuous sample quantile) or
#' \code{"type_2"} (used in Stata commands like collapse and _pctile, inverse of empirical distribution function with averaging at discontinuities - discontinuous sample quantile).
#'
#' @return A named numeric vector. If \code{share = FALSE}, returns weighted percentiles with names corresponding to the percentiles (e.g., "25%"). If \code{share = TRUE}, returns the share of the total value in each percentile range (e.g., "0-25%").
#'
#' @keywords internal
#'
#' @examples
#' \dontrun{
#' data <- lissyrtools::lissyuse(data = "es22", vars = c("dhi", "age"))
#' compute_weighted_percentiles(data$es22$dhi, data$es22$ppopwgt)
#' compute_weighted_percentiles(data$es22$dhi, data$es22$ppopwgt, probs = c(0.03, 0.72, 0.48, .01))
#' compute_weighted_percentiles(data$es22$dhi, data$es22$ppopwgt, probs = c(0.03, 0.72, 0.48, .01), share = TRUE)
#' }
compute_weighted_percentiles <- function(var, wgt = NULL, probs = seq(0, 1, 0.25), na.rm = TRUE, share = FALSE, type = c("type_4", "type_2")) {
type <- match.arg(type)
if (is.null(wgt)) {
wgt <- rep(1, length(var))
}
stopifnot(length(var) == length(wgt))
if (any(probs > 1) || any(probs < 0)) {
stop(
glue::glue(
"Values in the argument 'probs' must be between 0 and 1 (inclusive)."
)
)
}
# Remove NAs if needed
if (na.rm) {
keep <- !is.na(var) & !is.na(wgt)
var <- var[keep]
wgt <- wgt[keep]
}
# Sort var and wgt
ord <- order(var, wgt)
var <- var[ord]
wgt <- wgt[ord]
# Sort probs
probs <- probs[order(probs)]
probs <- if (share == TRUE) unique(c(0,probs,1)) else probs
w_total <- sum(wgt)
cw <- cumsum(wgt)
target <- probs * w_total
# Neeeded for the shares
cxw <- cumsum(var * wgt)
# --- type (Stata-like (collapse / _pctile approximation.)) implementation ---
if (type == "type_2" && !share) {
result <- numeric(length(probs))
for (i in seq_along(probs)) {
t <- target[i]
if (t == 0) {
result[i] <- var[1]
} else if (t == w_total) {
result[i] <- var[length(var)]
} else {
idx <- which(cw > t)[1]
if (any((t - cw) < .Machine$double.eps^0.5 & t >= cw)) { # < .Machine$double.eps^0.5 for eveb more precision
# in practise if cw[idx-1] == t
# Interpolate between adjacent values if needed (matches boundary)
result[i] <- (var[idx - 1] + var[idx]) / 2
} else {
result[i] <- var[idx]
}
}
}
names(result) <- paste0(probs * 100, "%")
return(result)
}
# --- END: Definition 2 ---
if (type == "type_4" && !share) {
# Definition 4 (used in percentils in STATA from Philip van Kerm)
result <- numeric(length(probs))
for (i in seq_along(probs)) {
t <- target[i]
# If the target is 0 or total weight, assign the first or last value
if (t == 0) { # t == 0
# Min.
result[i] <- var[1]
} else if (t == w_total) {
# Max
result[i] <- var[length(var)]
} else {
idx <- which(cw[-1] > t & cw[-length(cw)] <= t)[1] + 1
result[i] <- var[idx - 1] + (var[idx] - var[idx - 1]) * ((t - cw[idx - 1]) / wgt[idx])
}
}
# Set the names for the result
names(result) <- paste0(probs * 100, "%")
return(result)
} else {
# SHARE calculation (using only type 4)
result_for_shares <- numeric(length(probs) - 1)
for (i in 1:(length(probs) - 1)) {
if (i == 1) {
t <- probs[2] * sum(wgt)
idx <- which(cw[-1] > t & cw[-length(cw)] <= t)[1] + 1
Yi_minus_1_W <- (cxw[idx - 1] / cxw[length(var)])
Yi_W <- (cxw[idx] / cxw[length(var)])
gamma <- (t - cw[idx - 1]) / (cw[idx] - cw[idx - 1])
result_for_shares[1] <- 100 * ((1 - gamma) * Yi_minus_1_W + gamma * Yi_W)
} else if (i == (length(probs) - 1)) {
t <- probs[i] * sum(wgt)
idx <- which(cw[-1] > t & cw[-length(cw)] <= t)[1] + 1
Yi_minus_1_W <- (cxw[idx - 1] / cxw[length(var)])
Yi_W <- (cxw[idx] / cxw[length(var)])
gamma <- (t - cw[idx - 1]) / (cw[idx] - cw[idx - 1])
result_for_shares[length(result_for_shares)] <- (100 - ((1 - gamma) * Yi_minus_1_W + gamma * Yi_W) * 100)
} else {
# for example probs = 0.5
t_up <- probs[i + 1] * sum(wgt)
idx_up <- which(cw[-1] > t_up & cw[-length(cw)] <= t_up)[1] + 1
Yi_minus_1_W_up <- (cxw[idx_up - 1] / cxw[length(var)])
Yi_W_up <- (cxw[idx_up] / cxw[length(var)])
gamma_up <- (t_up - cw[idx_up - 1]) / (cw[idx_up] - cw[idx_up - 1])
# for example probs = 0.25
t_low <- probs[i] * sum(wgt)
idx_low <- which(cw[-1] > t_low & cw[-length(cw)] <= t_low)[1] + 1
Yi_minus_1_W_low <- (cxw[idx_low - 1] / cxw[length(var)])
Yi_W_low <- (cxw[idx_low] / cxw[length(var)])
gamma_low <- (t_low - cw[idx_low - 1]) / (cw[idx_low] - cw[idx_low - 1])
result_for_shares[i] <- (((1 - gamma_up) * Yi_minus_1_W_up + gamma_up * Yi_W_up) - ((1 - gamma_low) * Yi_minus_1_W_low + gamma_low * Yi_W_low)) * 100
}
}
probs_percent <- round(probs * 100)
interval_labels <- paste0(
probs_percent[-length(probs_percent)],
"-",
probs_percent[-1],
"%"
)
names(result_for_shares) <- interval_labels
return(result_for_shares)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.