Nothing
#' Density estimation from tabulated data with given frequencies and group central moments.
#'
#' @description {Estimation of a density from tabulated summary statistics evaluated within each of the big bins (or classes) partitioning the variable support. These statistics include class frequencies and central moments of orders one up to four. The log-density is modelled using a linear combination of penalized B-splines. The multinomial log-likelihood involving the frequencies adds up to a roughness penalty based on differences of neighboring B-spline coefficients and to the log of a root-n approximation of the sampling density of the observed vector of central moments within each class. The so-obtained penalized log-likelihood is maximized using the EM algorithm to get an estimation of the spline parameters and, hence, of the variable density and related quantities such as quantiles, see Lambert (2021) for details.
#' }
#' @usage degross(degross.data,
#' phi0 = NULL, tau0 = 1000,
#' use.moments = rep(TRUE,4), freq.min = 20, diag.only=FALSE,
#' penalize = TRUE,
#' aa = 2, bb = 1e-06, pen.order = 3, fixed.tau = FALSE,
#' plotting = FALSE, verbose = FALSE, iterlim=20)
#' @inheritParams degross_lpost
#' @importFrom grDevices gray
#' @importFrom graphics hist lines
#' @importFrom stats dt
#' @param degross.data A \link{degrossData.object} generated by \link{degrossData}.
#' @param phi0 Starting value for the \code{K}-vector \eqn{\phi} of B-spline parameters specifying the log-density. Default: NULL.
#' @param tau0 Starting value for the roughness penalty parameter. Default: 1000.
#' @param fixed.tau Logical indicating whether the roughness penalty parameter \code{tau} is fixed. Default: FALSE, implying its estimation.
#' @param plotting Logical indicating whether an histogram of the data with the estimated density should be plotted. Default: FALSE.
#' @param verbose Logical indicating whether details on the estimation progress should be displayed. Default: FALSE.
#' @param iterlim Maximum number of iterations during the M-step. Default: 20.
#'
#' @return An object of class \code{degross} containing several components from the density estimation procedure. Details can be found in \code{\link{degross.object}}. A summary of its content can be printed using \code{\link{print.degross}} or plotted using \code{\link{plot.degross}}.
#'
#' @author Philippe Lambert \email{p.lambert@uliege.be}
#' @references
#' Lambert, P. (2021) Moment-based density and risk estimation from grouped summary statistics. arXiv:2107.03883.
#'
#' @seealso \code{\link{degross.object}}, \code{\link{ddegross}}, \code{\link{pdegross}}, \code{\link{qdegross}}.
#'
#' @examples
#' ## Simulate grouped data
#' sim = simDegrossData(n=3500, plotting=TRUE,choice=2,J=3)
#' print(sim$true.density) ## Display density of the data generating mechanism
#'
#' ## Create a degrossData object
#' obj.data = with(sim, degrossData(Big.bins=Big.bins, freq.j=freq.j, m.j=m.j))
#' print(obj.data)
#'
#' ## Estimate the density underlying the grouped data
#' obj.fit = degross(obj.data)
#'
#' ## Plot the estimated density...
#' plot(obj.fit)
#' ## ... and compare it with the ('target') density used to simulate the data
#' curve(sim$true.density(x),add=TRUE,col="red",lwd=2)
#' legend("topleft",
#' legend=c("Observed freq.","Target density","Estimated density"),
#' col=c("grey85","red","black"), lwd=c(10,2,2),
#' lty=c("solid","solid","dashed"), box.lty=0, inset=.02)
#'
#' @export
degross = function(degross.data,
phi0=NULL,tau0=1000,
use.moments=rep(TRUE,4),freq.min=20,diag.only=FALSE,
penalize=TRUE,aa=2,bb=1e-6,pen.order=3,fixed.tau=FALSE,
plotting=FALSE,verbose=FALSE,iterlim=20){
K = degross.data$K ## Number of spline parameters
##
B.i = degross.data$B.i ## B-spline basis at the small bin midpoints
ui = degross.data$ui ## Midpoints of the small bins
delta = diff(ui[1:2]) ## Width of a small bin
## Penalty matrix
Dd = diff(diag(K),diff=pen.order)
Pd = t(Dd) %*% Dd
##
## Starting values for the spline parameters
if (is.null(phi0)){
ymin = min(degross.data$Big.bins) ; ymax = max(degross.data$Big.bins)
## Starting guess for the density: a Student with 5 d.f. at the midpoint of the support
mu0 = .5*(ymin+ymax) ; s0 = (ymax-ymin)/6
fstart = -log(s0)+dt((ui-mu0)/s0,df=5,log=TRUE)
phi0 = c(solve(t(B.i)%*%B.i+100*Pd)%*%t(B.i)%*%fstart)
}
## Data
freq.j = degross.data$freq.j # Frequencies for big bins
m.tot = sum(freq.j)
J = length(freq.j) # Nbr of big bins
small.to.big = degross.data$small.to.big # Big bin to which each small bin belongs
##
## Iterative estimation
## --------------------
Big.bins = degross.data$Big.bins
Big.mids = .5*(Big.bins[-1] + Big.bins[-length(Big.bins)])
##
if (plotting){
## par(mar=c(4,4,1,1))
temp = hist(rep(Big.mids,freq.j),breaks=Big.bins,plot=FALSE)
hist(rep(Big.mids,freq.j), ylim=c(0,1.2*max(temp$density)),
breaks=Big.bins,freq=FALSE,main="",
col="grey85",border="white",xlab="")
}
## Starting value for n.i
eta.i = c(B.i %*% phi0)
temp = exp(eta.i-max(eta.i)) ## Small bin means
pi.i = temp / sum(temp) ## Small bin probs
logNormCst = log(sum(temp)) + log(delta)
gamma.j = c(rowsum(pi.i,small.to.big)) ## Big bin means
## gamma.j = c(tapply(pi.i,small.to.big,sum)) ## Big bin means
n.i = freq.j[small.to.big]*pi.i/gamma.j[small.to.big] ## Expected number of data within small bins given current density est.
## Starting values for other quantities
OK.E = FALSE
phi.cur = phi0 ; tau.cur = tau0
e.iter = 0
if (verbose){
cat("-- EM algorithm --\n")
cat("Observed log-posterior (niter in M-step)\n")
}
while (!OK.E){
e.iter = e.iter + 1
## obj.cur = degross_lpost(phi.cur,tau.cur,n.i,degross.data,use.moments=use.moments,diag.only=diag.only,penalize=penalize,aa=aa,bb=bb)
## pi.i = obj.cur$pi.i ; gamma.j = obj.cur$gamma.j
##
## (1) ----- E-STEP -------
n.i = freq.j[small.to.big]*pi.i/gamma.j[small.to.big] ## Expected number of data within small bins given current density est.
obj.cur = degross_lpost(phi=phi.cur,tau=tau.cur,n.i=n.i,degross.data=degross.data,
use.moments=use.moments,freq.min=freq.min,
penalize=penalize,aa=aa,bb=bb,pen.order=pen.order)
lcur.e1 = obj.cur$lpost.mj ## Expected log-posterior at the start of the iteration
if ((verbose) & (e.iter == 1)) cat("0: ",lcur.e1,"\n",sep="")
##
## (2) ----- M-STEP: scoring algorithm for phi -------
m.iter = 0
OK.M = FALSE
while(!OK.M){
## if (plotting) lines(ui,pi.i/delta,col="grey")
m.iter = m.iter + 1
## lcur, Score & Fisher info
Score = obj.cur$Score ; Fisher = obj.cur$Fisher
##
## Update
phi.new = phi.cur + c(solve(Fisher+1e-4*diag(K))%*%Score)
phi.new = phi.new - max(phi.new)
obj.new = degross_lpost(phi=phi.new,tau=tau.cur,n.i=n.i,degross.data=degross.data,
use.moments=use.moments,freq.min=freq.min,diag.only=diag.only,
penalize=penalize,aa=aa,bb=bb,pen.order=pen.order)
grad.max = max(abs(obj.new$Score))
##
OK.M = ((grad.max < 1e-2) & (abs(obj.cur$lpost-obj.new$lpost) < 1e-1)) | (m.iter > iterlim)
if (m.iter > 20) cat("!! Scoring algorithm in M-step requires more than 20 iterations !!\n")
obj.cur = obj.new
pi.i = obj.cur$pi.i ; gamma.j = obj.cur$gamma.j
phi.cur = phi.new
}
lcur.e2 = obj.cur$lpost.mj # Observed log-posterior after the M-step for phi
##
## (3) ----- E-step for tau -----
##
if (!fixed.tau){
Fisher = obj.cur$Fisher
quad = sum(phi.cur*c(Pd%*%phi.cur))
edf = sum(diag(solve(Fisher+diag(1e-6+0*phi.cur))%*%(Fisher-tau.cur*Pd)))
tau.min = 1 #1e-2
tau.cur = max(tau.min,(edf-pen.order)/quad) ## Schall method
}
obj.cur = degross_lpost(phi=phi.cur,tau=tau.cur,n.i=n.i,degross.data=degross.data,
use.moments=use.moments,freq.min=freq.min,diag.only=diag.only,
penalize=penalize,aa=aa,bb=bb,pen.order=pen.order)
##
lcur.e3 = obj.cur$lpost.mj # Observed log-posterior at the end of the iteration
if (verbose) cat(e.iter,": ",lcur.e3," (niter=",m.iter,")\n",sep="")
##
## (4) Stopping rule for E-M
OK.E = (abs(lcur.e1-lcur.e3) < 1e-1) | (e.iter > 100)
if (e.iter > 100) cat("!! E-step require more than 100 iterations !!\n")
}
## Reevaluate before leaving
n.i = freq.j[small.to.big]*pi.i/gamma.j[small.to.big] ## Expected number of data within small bins given current density est.
obj.cur = degross_lpost(phi=phi.cur,tau=tau.cur,n.i=n.i,degross.data=degross.data,
use.moments=use.moments,freq.min=freq.min,
penalize=penalize,aa=aa,bb=bb,pen.order=pen.order)
pi.i = obj.cur$pi.i ; gamma.j = obj.cur$gamma.j
##
if (plotting) lines(ui,pi.i/delta,col="black",lwd=2,lty="dashed")
if (verbose){
if (!fixed.tau) cat("Selected tau:",tau.cur,"\n")
else cat("Fixed value for tau = tau0 =",tau.cur,"\n")
}
## Log of the normalizing constant when evaluating the density
eta.i = c(B.i %*% obj.cur$phi)
temp = exp(eta.i) ## Small bin means
## pi.i = temp / sum(temp) ## Small bin probs
logNormCst = -log(sum(temp)) - log(delta)
## Returned material
res = obj.cur
##
eigen.vals = svd(Fisher)$d
log.evidence = c(obj.cur$lpost.mj - .5*sum(log(eigen.vals[eigen.vals > 1e-4]))) # <----- HERE
##
##
llik = with(obj.cur, llik.mj + moments.penalty)
aic = -2*llik + 2*edf
bic = -2*llik + log(m.tot)*edf
##
res$edf = edf ## Effective degrees of freedom
res$aic = aic ; res$bic = bic ## AIC & BIC
res$log.evidence = log.evidence ## log-evidence (i.e. log of the average likelihood)
##
res$degross.data = degross.data ## Data used during estimation
res$use.moments = use.moments ## vector of logicals remembering which sample moments were used
res$diag.only = diag.only ## logical indicating whether the off-diag elements of V(sample central moments) were ignored
res$logNormCst = logNormCst ## Log of the normalizing constant when evaluating the fitted density
##
return(structure(res,class="degross"))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.