# R/hdr.R In robjhyndman/hdrcde: Highest Density Regions and Conditional Density Estimation

#### Documented in hdrhdr.boxplothdr.den

#' Highest Density Regions
#'
#' Calculates highest density regions in one dimension
#'
#' Either \code{x} or \code{den} must be provided. When \code{x} is provided,
#' the density is estimated using kernel density estimation. A Box-Cox
#' transformation is used if \code{lambda!=1}, as described in Wand, Marron and
#' Ruppert (1991). This allows the density estimate to be non-zero only on the
#' positive real line. The default kernel bandwidth \code{h} is selected using
#' the algorithm of Samworth and Wand (2010).
#'
#' Hyndman's (1996) density quantile algorithm is used for calculation.
#'
#' @param x Numeric vector containing data. If \code{x} is missing then
#' \code{den} must be provided, and the HDR is computed from the given density.
#' @param prob Probability coverage required for HDRs
#' @param den Density of data as list with components \code{x} and \code{y}.
#' If omitted, the density is estimated from \code{x} using
#' @param h Optional bandwidth for calculation of density.
#' @param lambda Box-Cox transformation parameter where \code{0 <= lambda <=
#' 1}.
#' @param nn Number of random numbers used in computing f-alpha quantiles.
#' @param all.modes Return all local modes or just the global mode?
#' @return A list of three components: \item{hdr}{The endpoints of each interval
#' in each HDR} \item{mode}{The estimated mode of the density.}
#' \item{falpha}{The value of the density at the boundaries of each HDR.}
#' @author Rob J Hyndman
#' @references Hyndman, R.J. (1996) Computing and graphing highest density
#' regions. \emph{American Statistician}, \bold{50}, 120-126.
#'
#' Samworth, R.J. and Wand, M.P. (2010). Asymptotics and optimal bandwidth
#' selection for highest density region estimation.  \emph{The Annals of
#' Statistics}, \bold{38}, 1767-1792.
#'
#' Wand, M.P., Marron, J S., Ruppert, D. (1991) Transformations in density
#' estimation. \emph{Journal of the American Statistical Association},
#' \bold{86}, 343-353.
#' @keywords smooth distribution
#' @examples
#' # Old faithful eruption duration times
#' hdr(faithful$eruptions) #' @export hdr hdr <- function(x=NULL, prob=c(50,95,99), den=NULL, h=hdrbw(BoxCox(x,lambda),mean(prob)), lambda=1, nn=5000, all.modes=FALSE) { if(!is.null(x)) { r <- diff(range(x)) if(r==0) stop("Insufficient data") } if(is.null(den)) den <- tdensity(x,bw=h,lambda=lambda) alpha <- sort(1-prob/100) falpha <- calc.falpha(x,den,alpha,nn=nn) hdr.store <- matrix(NA,length(alpha),100) for(i in 1:length(alpha)) { junk <- hdr.ends(den,falpha$falpha[i])$hdr if(length(junk) > 100) { junk <- junk[1:100] warning("Too many sub-intervals. Only the first 50 returned.") } hdr.store[i,] <- c(junk,rep(NA,100-length(junk))) } cj <- colSums(is.na(hdr.store)) hdr.store <- matrix(hdr.store[,cj < nrow(hdr.store)],nrow=length(prob)) rownames(hdr.store) <- paste(100*(1-alpha),"%",sep="") if(all.modes) { y <- c(0,den$y,0)
n <- length(y)
idx <- ((y[2:(n-1)] > y[1:(n-2)]) & (y[2:(n-1)] > y[3:n])) | (den$y == max(den$y))
mode <- den$x[idx] } else mode <- falpha$mode
return(list(hdr=hdr.store,mode=mode,falpha=falpha$falpha)) } "calc.falpha" <- function(x=NULL, den, alpha, nn=5000) { # Calculates falpha needed to compute HDR of density den. # Also finds approximate mode. # Input: den = density on grid. # x = independent observations on den # alpha = level of HDR # Called by hdr.box and hdr.conf if(is.null(x)) calc.falpha(x=sample(den$x, nn, replace=TRUE, prob=den$y), den, alpha) else { fx <- approx(den$x,den$y,xout=x,rule=2)$y
falpha <- quantile(sort(fx), alpha)
mode <- den$x[den$y==max(den$y)] return(list(falpha=falpha,mode=mode,fx=fx)) } } "hdr.ends" <- function(den,falpha) { miss <- is.na(den$x) # | is.na(den$y) den$x <- den$x[!miss] den$y <- den$y[!miss] n <- length(den$x)
# falpha is above the density, so the HDR does not exist
if(falpha > max(den$y)) return(list(falpha=falpha,hdr=NA) ) f <- function(x, den, falpha) { approx(den$x, den$y-falpha, xout=x)$y
}
intercept <- all_roots(f, interval=range(den$x), den=den, falpha=falpha) ni <- length(intercept) # No roots -- use the whole line if(ni == 0L) intercept <- c(den$x[1],den$x[n]) else { # Check behaviour outside the smallest and largest intercepts if(f(0.5*(head(intercept,1) + den$x[1]), den, falpha) > 0)
intercept <- c(den$x[1],intercept) if(f(0.5*(tail(intercept,1) + den$x[n]), den, falpha) > 0)
intercept <- c(intercept,den$x[n]) } # Check behaviour -- not sure if we need this now if(length(intercept) %% 2) warning("Some HDRs are incomplete") # intercept <- sort(unique(intercept)) return(list(falpha=falpha,hdr=intercept)) } #' Density plot with Highest Density Regions #' #' Plots univariate density with highest density regions displayed #' #' Either \code{x} or \code{den} must be provided. When \code{x} is provided, #' the density is estimated using kernel density estimation. A Box-Cox #' transformation is used if \code{lambda!=1}, as described in Wand, Marron and #' Ruppert (1991). This allows the density estimate to be non-zero only on the #' positive real line. The default kernel bandwidth \code{h} is selected using #' the algorithm of Samworth and Wand (2010). #' #' Hyndman's (1996) density quantile algorithm is used for calculation. #' #' @param x Numeric vector containing data. If \code{x} is missing then #' \code{den} must be provided, and the HDR is computed from the given density. #' @param prob Probability coverage required for HDRs #' @param den Density of data as list with components \code{x} and \code{y}. #' If omitted, the density is estimated from \code{x} using #' \code{\link[stats]{density}}. #' @param h Optional bandwidth for calculation of density. #' @param lambda Box-Cox transformation parameter where \code{0 <= lambda <= #' 1}. #' @param xlab Label for x-axis. #' @param ylab Label for y-axis. #' @param ylim Limits for y-axis. #' @param plot.lines If \code{TRUE}, will show how the HDRs are determined #' using lines. #' @param col Colours for regions. #' @param bgcol Colours for the background behind the boxes. Default \code{"gray"}, if \code{NULL} no box is drawn. #' @param legend If \code{TRUE} add a legend on the right of the boxes. #' @param \dots Other arguments passed to plot. #' @return a list of three components: \item{hdr}{The endpoints of each interval #' in each HDR} \item{mode}{The estimated mode of the density.} #' \item{falpha}{The value of the density at the boundaries of each HDR.} #' @author Rob J Hyndman #' @seealso \code{\link{hdr}}, \code{\link{hdr.boxplot}} #' @references Hyndman, R.J. (1996) Computing and graphing highest density #' regions. \emph{American Statistician}, \bold{50}, 120-126. #' #' Samworth, R.J. and Wand, M.P. (2010). Asymptotics and optimal bandwidth #' selection for highest density region estimation. \emph{The Annals of #' Statistics}, \bold{38}, 1767-1792. #' #' Wand, M.P., Marron, J S., Ruppert, D. (1991) Transformations in density #' estimation. \emph{Journal of the American Statistical Association}, #' \bold{86}, 343-353. #' @keywords smooth distribution hplot #' @examples #' # Old faithful eruption duration times #' hdr.den(faithful$eruptions)
#'
#' # Simple bimodal example
#' x <- c(rnorm(100,0,1), rnorm(100,5,1))
#' hdr.den(x)
#' @export hdr.den
hdr.den <- function(x, prob=c(50,95,99), den, h=hdrbw(BoxCox(x,lambda),mean(prob)),
lambda=1, xlab=NULL, ylab="Density", ylim=NULL, plot.lines=TRUE, col=2:8,bgcol="gray",legend=FALSE, ...)
{
if(missing(den))
den <- tdensity(x,bw=h,lambda=lambda)
else if(missing(x))
x <- sample(den$x, 500, replace=TRUE, prob=den$y)

maxden <- max(den$y) stepy <- maxden*0.02 if(is.null(ylim)) ylim <- c((1-length(prob))*stepy, maxden) plot(den, type="n", xlab=xlab, ylab=ylab, ylim=ylim, xaxt="n", yaxt="n", ...) rangex <- range(x, na.rm=TRUE) minx <- rangex[1] maxx <- rangex[2] rangex <- maxx-minx if(!is.null(bgcol)) polygon(c(minx-0.5*rangex, maxx+0.5*rangex, maxx+0.5*rangex, minx-0.5*rangex), c(0,0,rep(-length(prob)*stepy*2,2)),col=bgcol,border=FALSE) #abline(h=0, col="gray") lines(den$x,den$y) box() axis(1) axis(2, at=pretty(c(0,maxden))) hd <- hdr(x=x, prob=prob, den=den, h=h) nregions <- nrow(hd$hdr)

leng <- dim(hd$hdr)[[2]] # Colours for HDRs if(nregions > 8) { repcol <- ceiling(length(prob)/8) col <- rep(col, repcol) } if(plot.lines) { for(i in 1:nregions) { abline(h=hd$falpha[i], col=col[i], lty=2)
for(j in 1:length(hd$hdr[i,])) lines(rep(hd$hdr[i,j],2), c(stepy*(i-nregions),hd$falpha[i]),col=col[i], lty=2) } } for(i in 1:nregions) add.hdr(hd$hdr[i,], (i-nregions-0.5)*stepy, stepy, col=col[i], horiz=TRUE)
if(legend){
rightlim=apply(hd$hdr,1,function(i)i[length(i[!is.na(i)])]) text(rightlim, ((1:nregions)-nregions-0.5)*stepy, label=paste0(rownames(hd$hdr)," HDR"),cex=.8,pos=4)
}
return(hd)
}

#' Highest Density Region Boxplots
#'
#' Calculates and plots a univariate highest density regions boxplot.
#'
#' The density is estimated using kernel density estimation. A Box-Cox
#' transformation is used if \code{lambda!=1}, as described in Wand, Marron and
#' Ruppert (1991). This allows the density estimate to be non-zero only on the
#' positive real line. The default kernel bandwidth \code{h} is selected using
#' the algorithm of Samworth and Wand (2010).
#'
#' Hyndman's (1996) density quantile algorithm is used for calculation.
#'
#' @param x Numeric vector containing data or a list containing several vectors.
#' @param prob Probability coverage required for HDRs
#' @param h Optional bandwidth for calculation of density.
#' @param lambda Box-Cox transformation parameter where \code{0 <= lambda <=
#' 1}.
#' @param boxlabels Label for each box plotted.
#' @param col Colours for regions of each box.
#' @param main Overall title for the plot.
#' @param xlab Label for x-axis.
#' @param ylab Label for y-axis.
#' @param pch Plotting character.
#' @param border Width of border of box.
#' @param outline If not <code>TRUE</code>, the outliers are not drawn.
#' @param space The space between each box, between 0 and 0.5.
#' @param \dots Other arguments passed to plot.
#' @return nothing.
#' @author Rob J Hyndman
#' @references Hyndman, R.J. (1996) Computing and graphing highest density
#' regions. \emph{American Statistician}, \bold{50}, 120-126.
#'
#' Samworth, R.J. and Wand, M.P. (2010). Asymptotics and optimal bandwidth
#' selection for highest density region estimation.  \emph{The Annals of
#' Statistics}, \bold{38}, 1767-1792.
#'
#' Wand, M.P., Marron, J S., Ruppert, D. (1991) Transformations in density
#' estimation. \emph{Journal of the American Statistical Association},
#' \bold{86}, 343-353.
#' @keywords smooth distribution hplot
#' @examples
#' # Old faithful eruption duration times
#' hdr.boxplot(faithful$eruptions) #' #' # Simple bimodal example #' x <- c(rnorm(100,0,1), rnorm(100,5,1)) #' par(mfrow=c(1,2)) #' boxplot(x) #' hdr.boxplot(x) #' #' # Highly skewed example #' x <- exp(rnorm(100,0,1)) #' par(mfrow=c(1,2)) #' boxplot(x) #' hdr.boxplot(x,lambda=0) #' #' @export hdr.boxplot hdr.boxplot <- function(x, prob=c(99,50), h=hdrbw(BoxCox(x,lambda),mean(prob)), lambda=1, boxlabels="", col= gray((9:1)/10), main = "", xlab="",ylab="", pch=1, border=1,outline=TRUE,space=.25,...) { if(!is.list(x)) x <- list(x) prob <- -sort(-prob) nplots <- length(x) junk <- unlist(x) ends.list <- list() for(j in 1:nplots) ends.list[[j]] <- hdr.box(x[[j]],prob,h=h,lambda=lambda) plot(c(0.5,nplots+0.5),range(junk,unlist(ends.list)),type="n",xlab=xlab,main=main, ylab=ylab,xaxt="n",...) if(length(boxlabels)==1 & boxlabels[1]=="") { junk <- names(x) if(is.null(junk)) boxlabels <- rep("",nplots) else boxlabels <- junk } axis(1,at=c(1:nplots),labels=boxlabels,tick=FALSE,...) sp <- 0.5 - min(max(space, 0), 0.5) nint <- length(prob) cols <- rep(col,5) for(j in 1:nplots) { xx <- x[[j]] ends <- ends.list[[j]] for(i in 1:nint) { endsi <- ends[[i]] for(k in 1:(length(endsi)/2)) { polygon( c(j-sp,j-sp,j+sp,j+sp,j-sp), c(endsi[k*2-1],endsi[k*2], endsi[k*2],endsi[k*2-1],endsi[k*2-1]), col =cols[i], border=border) } } for(k in 1:length(ends$mode))
lines(c(j-0.35,j+0.35),rep(ends$mode[k],2),lty=1) if(outline) { outliers <- xx[xx<min(ends[[1]]) | xx>max(ends[[1]])] points(rep(j,length(outliers)),outliers,pch=pch) } } invisible() } "hdr.box" <- function(x, prob=c(99,50), h, lambda, ...) { # Does all the calculations for an HDR boxplot of x and returns # the endpoints of the HDR sub-intervals and the mode in a list. # Called by hdr.boxplot(). r <- diff(range(x)) if(r>0) { den <- tdensity(x,bw=h,lambda) info <- calc.falpha(x,den,1-prob/100) } hdrlist <- list() for(i in 1:length(prob)) { if(r>0) hdrlist[[i]] <- hdr.ends(den,info$falpha[i])$hdr else hdrlist[[i]] <- c(x[1],x[1]) } if(r>0) hdrlist$mode <- info$mode else hdrlist$mode <- x[1]
return(hdrlist)
}

# Transformed density estimate

tdensity <- function(x,bw="SJ",lambda=1)
{
if(is.list(x))
x <- x[[1]]
if(lambda==1)
return(density(x,bw=bw,n=1001))
else if(lambda < 0 | lambda > 1)
stop("lambda must be in [0,1]")
# Proceed with a Box-Cox transformed density
y <- BoxCox(x,lambda)
g <- density(y,bw=bw, n=1001)
j <- g$x > 0.1 - 1/lambda # Stay away from the edge g$y <- g$y[j] g$x <- g$x[j] xgrid <- InvBoxCox(g$x,lambda) # x
g$y <- c(0,g$y * xgrid^(lambda-1))
g$x <- c(0,xgrid) return(g) } #' Box Cox Transformation #' #' BoxCox() returns a transformation of the input variable using a Box-Cox #' transformation. InvBoxCox() reverses the transformation. #' #' The Box-Cox transformation is given by \deqn{f_\lambda(x) =\frac{x^\lambda - #' 1}{\lambda}}{f(x;lambda) = (x^lambda - 1)/lambda} if #' \eqn{\lambda\ne0}{lambda is not equal to 0}. For \eqn{\lambda=0}{lambda=0}, #' \deqn{f_0(x) = \log(x).}{f(x;0) = log(x).} #' #' @aliases BoxCox InvBoxCox #' @param x a numeric vector or time series #' @param lambda transformation parameter #' @return a numeric vector of the same length as x. #' @author Rob J Hyndman #' @references Box, G. E. P. and Cox, D. R. (1964) An analysis of #' transformations. \emph{JRSS B} \bold{26} 211--246. #' @keywords math #' @export BoxCox BoxCox <- function (x, lambda) { if(is.list(x)) x <- x[[1]] if (lambda == 0) log(x) else (x^lambda - 1)/lambda } InvBoxCox <- function (x, lambda) { if(is.list(x)) x <- x[[1]] if (lambda == 0) exp(x) else (x * lambda + 1)^(1/lambda) } all_roots <- function (f, interval, lower = min(interval), upper = max(interval), n = 100L, ...) { x <- seq(lower, upper, len = n + 1L) fx <- f(x, ...) roots <- x[which(fx == 0)] fx2 <- fx[seq(n)] * fx[seq(2L,n+1L,by=1L)] index <- which(fx2 < 0) for (i in index) roots <- c(roots, uniroot(f, lower = x[i], upper = x[i+1L], ...)$root)
return(sort(roots))
}

robjhyndman/hdrcde documentation built on Aug. 18, 2022, 6:36 a.m.