R/degross.R
In degross: Density Estimation from GROuped Summary Statistics

Documented in degross

#' Density estimation from tabulated data with given frequencies and group central moments.
#'
#' @description {Estimation of a density from tabulated summary statistics evaluated within each of the big bins (or classes) partitioning the variable support. These statistics include class frequencies and central moments of orders one up to four. The log-density is modelled using a linear combination of penalized B-splines. The multinomial log-likelihood involving the frequencies adds up to a roughness penalty based on differences of neighboring B-spline coefficients and to the log of a root-n approximation of the sampling density of the observed vector of central moments within each class. The so-obtained penalized log-likelihood is maximized using the EM algorithm to get an estimation of the spline parameters and, hence, of the variable density and related quantities such as quantiles, see Lambert (2021) for details.
#' }
#' @usage degross(degross.data,
#'        phi0 = NULL, tau0 = 1000,
#'        use.moments = rep(TRUE,4), freq.min = 20, diag.only=FALSE,
#'        penalize = TRUE,
#'        aa = 2, bb = 1e-06, pen.order = 3, fixed.tau = FALSE,
#'        plotting = FALSE, verbose = FALSE, iterlim=20)
#' @inheritParams degross_lpost
#' @importFrom grDevices gray
#' @importFrom graphics hist lines
#' @importFrom stats dt
#' @param degross.data A \link{degrossData.object} generated by \link{degrossData}.
#' @param phi0 Starting value for the \code{K}-vector \eqn{\phi} of B-spline parameters specifying the log-density. Default: NULL.
#' @param tau0 Starting value for the roughness penalty parameter. Default: 1000.
#' @param fixed.tau Logical indicating whether the roughness penalty parameter \code{tau} is fixed. Default: FALSE, implying its estimation.
#' @param plotting Logical indicating whether an histogram of the data with the estimated density should be plotted. Default: FALSE.
#' @param verbose Logical indicating whether details on the estimation progress should be displayed. Default: FALSE.
#' @param iterlim Maximum number of iterations during the M-step. Default: 20.
#'
#' @return An object of class \code{degross} containing several components from the density estimation procedure. Details can be found in \code{\link{degross.object}}. A summary of its content can be printed using \code{\link{print.degross}} or plotted using \code{\link{plot.degross}}.
#'
#' @author Philippe Lambert \email{p.lambert@uliege.be}
#' @references
#' Lambert, P. (2021) Moment-based density and risk estimation from grouped summary statistics. arXiv:2107.03883.
#'
#' @seealso \code{\link{degross.object}}, \code{\link{ddegross}}, \code{\link{pdegross}}, \code{\link{qdegross}}.
#'
#' @examples
#' ## Simulate grouped data
#' sim = simDegrossData(n=3500, plotting=TRUE,choice=2,J=3)
#' print(sim$true.density) ## Display density of the data generating mechanism
#'
#' ## Create a degrossData object
#' obj.data = with(sim, degrossData(Big.bins=Big.bins, freq.j=freq.j, m.j=m.j))
#' print(obj.data)
#'
#' ## Estimate the density underlying the grouped data
#' obj.fit = degross(obj.data)
#'
#' ## Plot the estimated density...
#' plot(obj.fit)
#' ## ... and compare it with the ('target') density used to simulate the data
#' curve(sim$true.density(x),add=TRUE,col="red",lwd=2)
#' legend("topleft",
#'        legend=c("Observed freq.","Target density","Estimated density"),
#'        col=c("grey85","red","black"), lwd=c(10,2,2),
#'        lty=c("solid","solid","dashed"), box.lty=0, inset=.02)
#'
#' @export
degross = function(degross.data,
                      phi0=NULL,tau0=1000,
                      use.moments=rep(TRUE,4),freq.min=20,diag.only=FALSE,
                      penalize=TRUE,aa=2,bb=1e-6,pen.order=3,fixed.tau=FALSE,
                      plotting=FALSE,verbose=FALSE,iterlim=20){
  K = degross.data$K ## Number of spline parameters
  ##
  B.i = degross.data$B.i ## B-spline basis at the small bin midpoints
  ui = degross.data$ui  ## Midpoints of the small bins
  delta = diff(ui[1:2]) ## Width of a small bin
  ## Penalty matrix
  Dd = diff(diag(K),diff=pen.order)
  Pd = t(Dd) %*% Dd
  ##
  ## Starting values for the spline parameters
  if (is.null(phi0)){
      ymin = min(degross.data$Big.bins) ; ymax = max(degross.data$Big.bins)
      ## Starting guess for the density: a Student with 5 d.f. at the midpoint of the support
      mu0 = .5*(ymin+ymax) ; s0 = (ymax-ymin)/6
      fstart = -log(s0)+dt((ui-mu0)/s0,df=5,log=TRUE)
      phi0 = c(solve(t(B.i)%*%B.i+100*Pd)%*%t(B.i)%*%fstart)
  }
  ## Data
  freq.j = degross.data$freq.j # Frequencies for big bins
  m.tot = sum(freq.j)
  J = length(freq.j) # Nbr of big bins
  small.to.big = degross.data$small.to.big # Big bin to which each small bin belongs
  ##
  ## Iterative estimation
  ## --------------------
  Big.bins = degross.data$Big.bins
  Big.mids = .5*(Big.bins[-1] + Big.bins[-length(Big.bins)])
  ##
  if (plotting){
    ## par(mar=c(4,4,1,1))
    temp = hist(rep(Big.mids,freq.j),breaks=Big.bins,plot=FALSE)
    hist(rep(Big.mids,freq.j), ylim=c(0,1.2*max(temp$density)),
         breaks=Big.bins,freq=FALSE,main="",
         col="grey85",border="white",xlab="")
  }
  ## Starting value for n.i
  eta.i = c(B.i %*% phi0)
  temp = exp(eta.i-max(eta.i)) ## Small bin means
  pi.i = temp / sum(temp) ## Small bin probs
  logNormCst = log(sum(temp)) + log(delta)
  gamma.j = c(rowsum(pi.i,small.to.big)) ## Big bin means
  ## gamma.j = c(tapply(pi.i,small.to.big,sum)) ## Big bin means
  n.i = freq.j[small.to.big]*pi.i/gamma.j[small.to.big] ## Expected number of data within small bins given current density est.
  ## Starting values for other quantities
  OK.E = FALSE
  phi.cur = phi0 ; tau.cur = tau0
  e.iter = 0
  if (verbose){
    cat("-- EM algorithm --\n")
    cat("Observed log-posterior (niter in M-step)\n")
  }
  while (!OK.E){
    e.iter = e.iter + 1
    ## obj.cur = degross_lpost(phi.cur,tau.cur,n.i,degross.data,use.moments=use.moments,diag.only=diag.only,penalize=penalize,aa=aa,bb=bb)
    ## pi.i = obj.cur$pi.i ; gamma.j = obj.cur$gamma.j
    ##
    ## (1) ----- E-STEP -------
    n.i = freq.j[small.to.big]*pi.i/gamma.j[small.to.big] ## Expected number of data within small bins given current density est.
    obj.cur = degross_lpost(phi=phi.cur,tau=tau.cur,n.i=n.i,degross.data=degross.data,
                            use.moments=use.moments,freq.min=freq.min,
                            penalize=penalize,aa=aa,bb=bb,pen.order=pen.order)
    lcur.e1 = obj.cur$lpost.mj ## Expected log-posterior at the start of the iteration
    if ((verbose) & (e.iter == 1)) cat("0: ",lcur.e1,"\n",sep="")
    ##
    ## (2) ----- M-STEP: scoring algorithm for phi -------
    m.iter = 0
    OK.M = FALSE
    while(!OK.M){
        ##      if (plotting) lines(ui,pi.i/delta,col="grey")
        m.iter = m.iter + 1
        ## lcur, Score & Fisher info
        Score = obj.cur$Score ; Fisher = obj.cur$Fisher
        ##
        ## Update
        phi.new = phi.cur  + c(solve(Fisher+1e-4*diag(K))%*%Score)
        phi.new = phi.new - max(phi.new)
        obj.new = degross_lpost(phi=phi.new,tau=tau.cur,n.i=n.i,degross.data=degross.data,
                                use.moments=use.moments,freq.min=freq.min,diag.only=diag.only,
                                penalize=penalize,aa=aa,bb=bb,pen.order=pen.order)
        grad.max = max(abs(obj.new$Score))
        ##
        OK.M = ((grad.max < 1e-2) & (abs(obj.cur$lpost-obj.new$lpost) < 1e-1)) | (m.iter > iterlim)
        if (m.iter > 20) cat("!! Scoring algorithm in M-step requires more than 20 iterations !!\n")
        obj.cur = obj.new
        pi.i = obj.cur$pi.i ; gamma.j = obj.cur$gamma.j
        phi.cur = phi.new
    }
    lcur.e2 = obj.cur$lpost.mj # Observed log-posterior after the M-step for phi
    ##
    ## (3) ----- E-step for tau -----
    ##
    if (!fixed.tau){
      Fisher = obj.cur$Fisher
      quad = sum(phi.cur*c(Pd%*%phi.cur))
      edf = sum(diag(solve(Fisher+diag(1e-6+0*phi.cur))%*%(Fisher-tau.cur*Pd)))
      tau.min = 1 #1e-2
      tau.cur = max(tau.min,(edf-pen.order)/quad) ## Schall method
    }
    obj.cur = degross_lpost(phi=phi.cur,tau=tau.cur,n.i=n.i,degross.data=degross.data,
                            use.moments=use.moments,freq.min=freq.min,diag.only=diag.only,
                            penalize=penalize,aa=aa,bb=bb,pen.order=pen.order)
    ##
    lcur.e3 = obj.cur$lpost.mj # Observed log-posterior at the end of the iteration
    if (verbose) cat(e.iter,": ",lcur.e3," (niter=",m.iter,")\n",sep="")
    ##
    ## (4) Stopping rule for E-M
    OK.E = (abs(lcur.e1-lcur.e3) < 1e-1) | (e.iter > 100)
    if (e.iter > 100) cat("!! E-step require more than 100 iterations !!\n")
  }
  ## Reevaluate before leaving
  n.i = freq.j[small.to.big]*pi.i/gamma.j[small.to.big] ## Expected number of data within small bins given current density est.
  obj.cur = degross_lpost(phi=phi.cur,tau=tau.cur,n.i=n.i,degross.data=degross.data,
                          use.moments=use.moments,freq.min=freq.min,
                          penalize=penalize,aa=aa,bb=bb,pen.order=pen.order)
  pi.i = obj.cur$pi.i ; gamma.j = obj.cur$gamma.j
  ##
  if (plotting) lines(ui,pi.i/delta,col="black",lwd=2,lty="dashed")
  if (verbose){
      if (!fixed.tau) cat("Selected tau:",tau.cur,"\n")
      else cat("Fixed value for tau = tau0 =",tau.cur,"\n")
  }
  ## Log of the normalizing constant when evaluating the density
  eta.i = c(B.i %*% obj.cur$phi)
  temp = exp(eta.i) ## Small bin means
  ## pi.i = temp / sum(temp) ## Small bin probs
  logNormCst = -log(sum(temp)) - log(delta)
  ## Returned material
  res = obj.cur
  ##
  eigen.vals = svd(Fisher)$d
  log.evidence = c(obj.cur$lpost.mj - .5*sum(log(eigen.vals[eigen.vals > 1e-4]))) # <----- HERE
  ##
  ##
  llik = with(obj.cur, llik.mj + moments.penalty)
  aic = -2*llik + 2*edf
  bic = -2*llik + log(m.tot)*edf
  ##
  res$edf = edf  ## Effective degrees of freedom
  res$aic = aic ; res$bic = bic ## AIC & BIC
  res$log.evidence = log.evidence ## log-evidence (i.e. log of the average likelihood)
  ##
  res$degross.data = degross.data ## Data used during estimation
  res$use.moments = use.moments ## vector of logicals remembering which sample moments were used
  res$diag.only = diag.only ## logical indicating whether the off-diag elements of V(sample central moments) were ignored
  res$logNormCst = logNormCst ## Log of the normalizing constant when evaluating the fitted density
  ##
  return(structure(res,class="degross"))
}