#' @title Fit a nonparametric distribution on tabulated data
#'
#' @author Thomas Blanchet, Juliette Fournier, Thomas Piketty
#'
#' @description Estimate a distribution nonparametrically based on tabulations
#' of a distribution which include:
#' \itemize{
#' \item fractiles of the data,
#' \item the corresponding brackets,
#' \item the share of each bracket.
#' }
#'
#' @param p A vector of values in [0, 1].
#' @param threshold The quantiles corresponding to \code{p}.
#' @param average The average over the entire distribution.
#' @param bracketshare The corresponding bracket share.
#' @param topshare The corresponding top share.
#' @param bracketavg The corresponding bracket average.
#' @param topavg The corresponding top average.
#' @param invpareto The inverted Pareto coefficient.
#' @param bottom_model Which model to use at the bottom of the distribution?
#' Only relevant if \code{min(p) > 0}. Either \code{"gpd"} for the generalized
#' Pareto distribution, \code{"hist"} for histogram density, or \code{"dirac"}.
#' Default is \code{"hist"} if \code{min(threshold) > 0}, \code{"dirac"} if
#' \code{min(threshold) == 0} and \code{"gpd"} otherwise.
#' @param lower_bound Lower bound of the distribution. Only relevant if
#' \code{min(p) > 0}. Default is \code{0}.
#' @param binf Asymptotic Pareto coefficient. If \code{NULL} or \code{NA},
#' it is directly estimated from the data. Default is \code{NULL}.
#' @param fast Use a faster but less precise method (split-histogram).
#' Default is \code{FALSE}.
#'
#' @return An object of class \code{gpinter_dist_orig}.
#'
#' @importFrom stats integrate optim
#' @importFrom nloptr nloptr
#'
#' @export
tabulation_fit <- function(p, threshold, average=NULL, bracketshare=NULL, topshare=NULL,
bracketavg=NULL, topavg=NULL, invpareto=NULL,
bottom_model=NULL, lower_bound=0, binf=NULL, fast=FALSE) {
# Check and clean the input
input <- clean_input_tabulation(p, threshold, average, bracketshare, topshare,
bracketavg, topavg, invpareto, bottom_model, lower_bound)
p <- input$p
m <- input$m
n <- input$n
average <- input$average
bottom_model <- input$bottom_model
lower_bound <- input$lower_bound
threshold <- input$threshold
# Log-transform of the data
pk <- p
qk <- threshold
mk <- m
xk <- -log(1 - pk)
yk <- -log(mk)
sk <- (1 - pk)*qk/mk
# Estimate the second derivative at the last point
if (is.null(binf) || is.na(binf)) {
an <- (sk[n] - sk[n - 1])/(xk[n] - xk[n - 1])
} else {
if (binf <= 1) {
stop("the asymptotic Pareto coefficient must be above one")
}
xi <- (binf - 1)/binf
mu <- qk[n]
# Identify sigma using the average in the last threshold
sigma <- ((exp(-yk[n]) - mu + mu*pk[n])*(-1 + xi))/(-1 + pk[n])
# Identify an from sigma
an <- (-sigma + (-1 + pk[n])*(-1 + sk[n])*sk[n] *
exp(2*xk[n] - yk[n]))/((-1 + pk[n])*exp(2*xk[n] - yk[n]))
}
if (fast) {
# Use mean-split histogram all the time
ak <- c((sk[2] - sk[1])/(xk[2] - xk[1]), rep(NA, n - 2), an)
# Enforce monotonicity constraint
ak <- ifelse(ak + sk*(1 - sk) < 0, -sk*(1 - sk), ak)
xk_nc <- xk
yk_nc <- yk
sk_nc <- sk
ak_nc <- ak
use_hist <- NULL
fk_cns <- NULL
pk_cns <- pk[1]
qk_cns <- qk[1]
mk_cns <- mk[1]
xk_cns <- xk[1]
yk_cns <- yk[1]
sk_cns <- sk[1]
ak_cns <- NA
for (i in 1:(n - 1)) {
x0 <- xk[i]
x1 <- xk[i + 1]
y0 <- yk[i]
y1 <- yk[i + 1]
s0 <- sk[i]
s1 <- sk[i + 1]
p0 <- pk[i]
p1 <- pk[i + 1]
q0 <- qk[i]
q1 <- qk[i + 1]
m0 <- mk[i]
m1 <- mk[i + 1]
bracketavg <- (m0 - m1)/(p1 - p0)
use_hist <- c(use_hist, TRUE, TRUE)
hist <- hist_interpol(p0, p1, q0, q1, bracketavg)
fk_cns <- c(fk_cns, hist$f0, hist$f1)
pk_cns <- c(pk_cns, hist$pstar, p1)
qk_cns <- c(qk_cns, hist$qstar, q1)
mk_cns <- c(mk_cns,
hist_lorenz(hist$pstar,
hist$pstar, p1,
hist$qstar, q1,
m1, hist$f1
),
m1
)
xk_cns <- c(xk_cns, -log(1 - hist$pstar), x1)
yk_cns <- c(yk_cns, NA, y1)
sk_cns <- c(sk_cns, NA, s1)
ak_cns <- c(ak_cns, NA, NA)
}
} else {
# Do generalized Pareto interpolation
# Calculate the second derivative
ak <- clamped_quintic_spline(xk, yk, sk, an)
# Keep non-constrained parameter values in memory
xk_nc <- xk
yk_nc <- yk
sk_nc <- sk
ak_nc <- ak
# Enforce monotonicity constraint at the interpolation points
ak <- ifelse(ak + sk*(1 - sk) < 0, -sk*(1 - sk), ak)
# Enforce monotonicity constraint over the entire function
use_hist <- NULL
fk_cns <- NULL
pk_cns <- pk[1]
qk_cns <- qk[1]
mk_cns <- mk[1]
xk_cns <- xk[1]
yk_cns <- yk[1]
sk_cns <- sk[1]
ak_cns <- ak[1]
for (i in 1:(n - 1)) {
x0 <- xk[i]
x1 <- xk[i + 1]
y0 <- yk[i]
y1 <- yk[i + 1]
s0 <- sk[i]
s1 <- sk[i + 1]
a0 <- ak[i]
a1 <- ak[i + 1]
p0 <- pk[i]
p1 <- pk[i + 1]
q0 <- qk[i]
q1 <- qk[i + 1]
m0 <- mk[i]
m1 <- mk[i + 1]
bracketavg <- (m0 - m1)/(p1 - p0)
# Check if constraint is violated
if (!is_increasing(x0, x1, y0, y1, s0, s1, a0, a1)) {
# First, try to enforce the constraint by adding one point
new_pt <- add_one_point(x0, x1, y0, y1, s0, s1, a0, a1)
# Check that the algorithm converged
if (!is.null(new_pt)) {
# Check that the constraint is now satisfied
cond <- is_increasing(
x0, new_pt$x_new,
y0, new_pt$y_new,
s0, new_pt$s_new,
a0, new_pt$a_new
) & is_increasing(
new_pt$x_new, x1,
new_pt$y_new, y1,
new_pt$s_new, s1,
new_pt$a_new, a1
)
if (cond) {
# If so, add the point and move on to the next bracket
use_hist <- c(use_hist, FALSE, FALSE)
fk_cns <- c(fk_cns, NA, NA)
pk_cns <- c(pk_cns, 1 - exp(-new_pt$x_new), p1)
mk_cns <- c(mk_cns, exp(-new_pt$y_new), m1)
qk_cns <- c(qk_cns, new_pt$s_new*exp(new_pt$x_new - new_pt$y_new), q1)
xk_cns <- c(xk_cns, new_pt$x_new, x1)
yk_cns <- c(yk_cns, new_pt$y_new, y1)
sk_cns <- c(sk_cns, new_pt$s_new, s1)
ak_cns <- c(ak_cns, new_pt$a_new, a1)
next
}
}
# If adding one point failed, try adding two points
new_pts <- add_two_points(x0, x1, y0, y1, s0, s1, a0, a1)
# Check that the algorithm converged
if (!is.null(new_pts)) {
# Check that the constraint is now satisfied
cond <- is_increasing(
x0, new_pts$x_new1,
y0, new_pts$y_new1,
s0, new_pts$s_new1,
a0, new_pts$a_new1
) & is_increasing(
new_pts$x_new1, new_pts$x_new2,
new_pts$y_new1, new_pts$y_new2,
new_pts$s_new1, new_pts$s_new2,
new_pts$a_new1, new_pts$a_new2
) & is_increasing(
new_pts$x_new2, x1,
new_pts$y_new2, y1,
new_pts$s_new2, s1,
new_pts$a_new2, a1
)
if (cond) {
# If so, add the two new points and move on to the next bracket
use_hist <- c(use_hist, FALSE, FALSE, FALSE)
fk_cns <- c(fk_cns, NA, NA, NA)
pk_cns <- c(
pk_cns,
1 - exp(-new_pts$x_new1),
1 - exp(-new_pts$x_new2),
p1
)
qk_cns <- c(
qk_cns,
new_pts$s_new1*exp(new_pts$x_new1 - new_pts$y_new1),
new_pts$s_new2*exp(new_pts$x_new2 - new_pts$y_new2),
q1
)
mk_cns <- c(mk_cns, exp(-new_pts$y_new1), exp(-new_pts$y_new2), m1)
xk_cns <- c(xk_cns, new_pts$x_new1, new_pts$x_new2, x1)
yk_cns <- c(yk_cns, new_pts$y_new1, new_pts$y_new2, y1)
sk_cns <- c(sk_cns, new_pts$s_new1, new_pts$s_new2, s1)
ak_cns <- c(ak_cns, new_pts$a_new1, new_pts$a_new2, a1)
next
}
}
# If adding two points also failed, we fall back to an histogram density
use_hist <- c(use_hist, TRUE, TRUE)
hist <- hist_interpol(p0, p1, q0, q1, bracketavg)
fk_cns <- c(fk_cns, hist$f0, hist$f1)
pk_cns <- c(pk_cns, hist$pstar, p1)
qk_cns <- c(qk_cns, hist$qstar, q1)
mk_cns <- c(mk_cns,
hist_lorenz(hist$pstar,
hist$pstar, p1,
hist$qstar, q1,
m1, hist$f1
),
m1
)
xk_cns <- c(xk_cns, -log(1 - hist$pstar), x1)
yk_cns <- c(yk_cns, NA, y1)
sk_cns <- c(sk_cns, NA, s1)
ak_cns <- c(ak_cns, NA, a1)
} else {
# Leave the coefficients untouched if the constraint isn't violated
use_hist <- c(use_hist, FALSE)
fk_cns <- c(fk_cns, NA)
pk_cns <- c(pk_cns, p1)
qk_cns <- c(qk_cns, q1)
mk_cns <- c(mk_cns, m1)
xk_cns <- c(xk_cns, x1)
yk_cns <- c(yk_cns, y1)
sk_cns <- c(sk_cns, s1)
ak_cns <- c(ak_cns, a1)
}
}
}
# Estimate the parameters of the generalized Pareto distribution at the top
param_top <- gpd_top_parameters(xk[n], yk[n], sk[n], ak[n])
# If we get xi >= 1 or sigma <= 0, the GPD is not valid, so use a
# standard Pareto instead (which breaks derivability of the quantile
# function)
if (param_top$sigma <= 0 || param_top$xi >= 1) {
param_top$mu_top <- qk[n]
param_top$xi_top <- 1 - sk[n]
param_top$sigma_top <- param_top$mu_top*param_top$xi_top
}
# Estimate the parameters of the model for the bottom
if (p[1] > 0) {
bracketavg <- (average - mk_cns[1])/pk_cns[1] # Estimate the average in the bottom
if (bracketavg >= threshold[1] && bottom_model != "dirac") {
stop(paste0("The average below the first bracket (", round(bracketavg),
") is above the first threshold (", round(threshold[1]), ")"))
}
if (bottom_model == "dirac" || bracketavg == 0) {
param_bottom <- list(mu=NA, sigma=NA, xi=NA, delta=p[1])
} else if (bottom_model == "gpd" && bracketavg > 0) {
q1 <- qk[1]
param_bottom <- list(
mu_bottom = q1,
sigma_bottom = ((bracketavg - q1)*(lower_bound - q1))/(bracketavg - lower_bound),
xi_bottom = (bracketavg - q1)/(bracketavg - lower_bound),
delta = NA
)
} else if (bottom_model == "gpd" || bracketavg < 0) {
param_bottom <- gpd_bottom_parameters(xk[1], yk[1], sk[1], ak[1], average)
# Same thing as for the top if xi >= 1 or sigma <= 0
if (param_bottom$sigma <= 0 || param_bottom$xi >= 1) {
bottom_model = "hist"
}
param_bottom$delta <- NA
}
if (bottom_model == "hist") {
use_hist <- c(TRUE, TRUE, use_hist)
hist <- hist_interpol(0, pk_cns[1], lower_bound, qk_cns[1], bracketavg)
fk_cns <- c(hist$f0, hist$f1, fk_cns)
pk_cns <- c(0, hist$pstar, pk_cns)
qk_cns <- c(lower_bound, hist$qstar, qk_cns)
mk_cns <- c(
average,
hist_lorenz(hist$pstar,
hist$pstar, hist$p1,
hist$qstar, hist$q1,
mk_cns[1], hist$f1
),
mk_cns
)
xk_cns <- c(0, -log(1 - hist$pstar), xk_cns)
yk_cns <- c(-log(average), NA, yk_cns)
sk_cns <- c(lower_bound/average, NA, sk_cns)
ak_cns <- c(NA, NA, ak_cns)
param_bottom <- list(mu=NA, sigma=NA, xi=NA, delta=NA)
}
} else {
param_bottom <- list(mu=NA, sigma=NA, xi=NA, delta=NA)
}
# Object to return
result <- list()
class(result) <- c("gpinter_dist_orig", "gpinter_dist")
result$use_hist <- use_hist
result$pk <- pk_cns
result$qk <- qk_cns
result$xk <- xk_cns
result$yk <- yk_cns
result$sk <- sk_cns
result$ak <- ak_cns
result$fk <- fk_cns
result$mk <- mk_cns
result$pk_nc <- p
result$xk_nc <- xk_nc
result$yk_nc <- yk_nc
result$sk_nc <- sk_nc
result$ak_nc <- ak_nc
result$qk_nc <- threshold
result$mk_nc <- m
result$bk_nc <- m/((1 - p)*threshold)
result$average <- average
# Estimate the parameters of the generalized Pareto distribution at the top
param_top <- gpd_top_parameters(xk[n], yk[n], sk[n], ak[n])
# If we get xi == 1 or sigma == 0, the GPD is not valid, so use a
# standard Pareto instead (which breaks derivability of the quantile
# function)
if (param_top$sigma == 0 || param_top$xi == 1) {
result$mu_top <- qk[n]
result$xi_top <- 1 - sk[n]
result$sigma_top <- result$mu_top*result$xi_top
} else {
result$mu_top <- param_top$mu
result$sigma_top <- param_top$sigma
result$xi_top <- param_top$xi
}
result$mu_bottom <- param_bottom$mu
result$sigma_bottom <- param_bottom$sigma
result$xi_bottom <- param_bottom$xi
result$delta_bottom <- param_bottom$delta
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.