#' @title Fit a nonparametric distribution on tabulated data without thresholds
#'
#' @author Thomas Blanchet, Juliette Fournier, Thomas Piketty
#'
#' @description Estimate a distribution nonparametrically based on tabulations
#' of a distribution which include:
#' \itemize{
#' \item fractiles of the data,
#' \item the share of each bracket.
#' }
#'
#' @param p A vector of values in [0, 1].
#' @param average The average over the entire distribution.
#' @param bracketshare The corresponding bracket share.
#' @param topshare The corresponding top share.
#' @param bracketavg The corresponding bracket average.
#' @param topavg The corresponding top average.
#' @param first_threshold The value of the first threshold. If \code{NULL}, it
#' is estimated from the data. Default is \code{NULL}.
#' @param last_invpareto The inverted Pareto coefficient at the last threshold.
#' Use \code{NULL} for unknown. (Default is \code{NULL}.)
#' @param bottom_model Which model to use at the bottom of the distribution?
#' Only relevant if \code{min(p) > 0}. Either \code{"gpd"} for the generalized
#' Pareto distribution, or \code{"hist"} for histogram density. Default is
#' \code{"hist"} if \code{min(threshold) > 0}, and \code{"gpd"} otherwise.
#' @param lower_bound Lower bound of the distribution. Only relevant if
#' \code{min(p) > 0}. Default is \code{0}.
#' @param binf Asymptotic Pareto coefficient. If \code{NULL} or \code{NA},
#' it is directly estimated from the data. Default is \code{NULL}.
#' @param top_model Which model to use at the top of the distribution?
#' Either \code{"gpd"} for generalized Pareto distribution, or \code{"pareto"}
#' for standard Pareto distribution. Default is \code{"gpd"}.
#' @param fast Use a faster but less precise method (split-histogram).
#' Default is \code{FALSE}.
#'
#' @return An object of class \code{gpinter_dist_orig}.
#'
#' @importFrom stats integrate optim
#' @importFrom nloptr nloptr
#'
#' @export
shares_fit <- function(p, average=NULL, bracketshare=NULL, topshare=NULL,
bracketavg=NULL, topavg=NULL, first_threshold=NULL,
last_invpareto=NULL, bottom_model=NULL, top_model=NULL,
lower_bound=0, binf=NULL, fast=FALSE) {
input <- clean_input_shares(p, average, bracketshare, topshare, bracketavg,
topavg, first_threshold, bottom_model, lower_bound, top_model)
p <- input$p
m <- input$m
n <- input$n
average <- input$average
bottom_model <- input$bottom_model
top_model <- input$top_model
lower_bound <- input$lower_bound
# Log-transform of the data
pk <- p
mk <- m
xk <- -log(1 - pk)
yk <- -log(mk)
# Estimate the second derivative at the last point
an <- second_derivative(xk[n - 2], xk[n - 1], xk[n], yk[n - 2], yk[n - 1], yk[n])
# Fit the quintic spline
if (pk[1] == 0 & (mk[1] - mk[2]) > 0) {
if (is.null(first_threshold)) {
first_threshold <- 0
}
}
if (is.null(first_threshold)) {
param_spline <- clamped1_quintic_spline_noderiv(xk, yk, an)
ak <- param_spline$ak
sk <- param_spline$sk
} else {
s1 <- exp(yk[1] - xk[1])*first_threshold
param_spline <- clamped2_quintic_spline_noderiv(xk, yk, s1, an)
ak <- param_spline$ak
sk <- param_spline$sk
}
# Check if the estimated function satisfies the constraints: if not, adjust
# the estimated thresholds before passing it to tabulation_fit
constraints_ok <- TRUE
threshold <- exp(xk - yk)*sk
bracketavg <- -diff(c(m, 0))/diff(c(p, 1))
if (threshold[1] >= bracketavg[1]) {
constraints_ok <- FALSE
if (bracketavg[1] > 0) {
# Assume the distribution starts at zero
if (pk[1] == 0) {
threshold[1] <- 0
} else {
bottomavg <- (average - sum(m))/pk[1]
threshold[1] <- 2*bottomavg
}
} else {
threshold[1] <- 2*bracketavg[1]
}
}
for (i in 2:n) {
if (bracketavg[i - 1] >= threshold[i] || bracketavg[i] <= threshold[i]) {
constraints_ok <- FALSE
threshold[i] <- 0.5*(bracketavg[i - 1] + bracketavg[i])
}
if (!is_increasing(xk[i - 1], xk[i], yk[i - 1], yk[i], sk[i - 1], sk[i], ak[i - 1], ak[i])) {
constraints_ok <- FALSE
}
}
# Enforce the constraint for the first threshold
avg_bottom <- (1 - mk[1])/p[1]
if (p[1] > 0 && avg_bottom > threshold[1]) {
threshold[1] <- (avg_bottom + bracketavg[1])/2
}
if (!is.null(last_invpareto) && !is.na(last_invpareto)) {
last_threshold <- bracketavg[length(bracketavg)]/last_invpareto
if (last_invpareto <= 1) {
stop("the inverted Pareto coefficient must be > 1")
} else if (last_threshold < threshold[length(threshold) - 1]) {
max_last_invpareto <- bracketavg[length(bracketavg)]/threshold[length(threshold) - 1]
stop(sprintf("given your data, the last Pareto coefficient must be below %.2f", max_last_invpareto))
}
threshold[length(threshold)] <- last_threshold
}
if ((!is.null(top_model) && top_model == "pareto") && (is.null(binf) || is.na(binf))) {
binf <- bracketavg[length(bracketavg)]/threshold[length(threshold)]
} else if ((!is.null(top_model) && top_model == "pareto") && !(is.null(binf) || is.na(binf))) {
stop("you can either set 'binf' or 'top_model == \"pareto\"'")
}
# Exclude first bracket if it correspond to a Dirac in zero
if (pk[1] == 0 && abs(threshold[1]) <= .Machine$double.eps*2 && abs(bracketavg[1]) <= .Machine$double.eps*2) {
pk <- pk[-1]
threshold <- threshold[-1]
bracketavg <- bracketavg[-1]
}
# Pass the estimated thresholds to tabulation_fit
return(tabulation_fit(p=pk, threshold=threshold, average=average,
bracketavg=bracketavg, bottom_model=bottom_model, lower_bound=lower_bound, binf=binf, fast=fast))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.