# R/ed.R In hafen/ed: ed: a regression approach to density estimation

#### Documented in ed

```#' Fit ed model to data
#'
#' Calculate raw ed density estimates and fit ed model to data using loess
#'
#' @param x data, a vector
#' @param k the gap width (in number of observations) with which to compute the raw estimates
#' @param disjoint should non-overlapping gaps be computed? (default \code{TRUE})
#' @param degree degree of loess fitting  (see \code{\link{loess}})
#' @param span span of loess fitting (see \code{\link{loess}})
#' @param xgrid number of equally-spaced points along the support of \code{x} at which to compute the fit
#' @param bounds bounds at which to fit the density (see details)
#' @param f a function providing a true density or hypothesized density, with which the ed estimate can be compared (optional)
#' @param family loess parameter (\code{\link{loess}})
#' @param normalize should the resulting density be normalized so that it integrates to one?
#' @param delta grid augmentation parameter (experimental).  A value of 0 (default) disables grid augmentation.
#' @param lower grid augmentation parameter (experimental)
#' @param control loess parameter (see \code{\link{loess.control}})
#'
#' @return a list with a lot of things (to be documented...).  For now, look at str(result) to get an idea.
#'
#' @details
#' bounds...
#'
#' @note This function is provided as a convenience, but often you may want to simply compute the raw estimates using \code{\link{ed_raw}} and iteratively figure out how to fit the raw estimates with whatever nonparametric method you like.
#'
#'
#' @export
ed <- function(x, k = 10, disjoint = TRUE, degree = 2, span = 0.3,
xgrid = 500, bounds = c(min(x), max(x)), f = NULL,
family = "gaussian", normalize = FALSE, delta = 0, lower = NULL,
control = loess.control()) {

# TODO: check that arguments are valid...

n <- length(x)
xs <- sort(x)

f_log <- f
u_bounds <- NULL
# add grid augmentation if requested
if (delta > 0) {
x_range <- range(x)
n_neighbors <- ceiling(span * n)
b1 <- lower
if (is.null(b1)) b1 <- x_range[1] - diff(xs[c(1, n_neighbors)])
b2 <- x_range[2] + diff(xs[c(n - n_neighbors + 1, n)])

u_bounds <- c(b1, b2)

n_unif <- floor(delta * n / (1 - delta))
# want to add n_unif points to the grid specified by 'bounds', but
# also want uniform points outside the range for smoothing at boundaries
# n_unif <- ceiling(n_unif1 + (n_unif1/diff(x_range)) * (diff(xs[c(1, n_neighbors)] + diff(xs[c(n - n_neighbors + 1, n)]))))
# outside the range shouldn't count
# span <- span*(n + n_unif1)/(n + n_unif)
u <- seq(u_bounds[1], u_bounds[2], length = n_unif)
x <- c(x, u)
xs <- sort(x)
n <- n + n_unif

# s <- seq(u_bounds[1], u_bounds[2], length = xgrid)
# update delta so the regions where the uniform dominates agree with true density
u_min <- log(k / (n * (u[k + 1] - u[1]))) - log(k) + digamma(k)
delta <- diff(u_bounds) * exp(u_min)

if (!is.null(f)) {
f_log <- function(x)
(1 - delta) * f(x) + delta * dunif(x, u_bounds[1], u_bounds[2])
}
}

if (disjoint) {
m <- floor((n - 1) / k)
ind <- (c(1:m) - 1) * k + 1
} else {
m <- n - k
ind <- c(1:m)
}
ind2 <- ind + k

x_eval <- (xs[ind2] + xs[ind]) / 2
dists <- xs[ind2] - xs[ind]

balloon <- k / (n * dists)

# add points that will get zero weight so we can interpolate with loess
ii <- 1
w <- rep(1, m)
if (bounds[1] < min(x_eval)) {
x_eval <- c(bounds[1], x_eval)
w <- c(0, w)
balloon <- c(0.1234, balloon)
ii <- ii + 1
}
if (bounds[2] > max(x_eval)) {
x_eval <- c(bounds[2], x_eval)
w <- c(0, w)
balloon <- c(0.1234, balloon)
ii <- ii + 1
}

s <- seq(bounds[1], bounds[2], length = xgrid)

raw <- log(balloon) - log(k) + digamma(k)

raw_loess <- loess(raw ~ x_eval, weights = w, degree = degree,
span = span, family = family, control = control)

p_raw_loess <- predict(raw_loess)

ellhat <- predict(raw_loess, newdata = s)
if (delta > 0) {
ellhat[ellhat < u_min] <- u_min
fhat <- (exp(ellhat) - delta * dunif(s, u_bounds[1], u_bounds[2])) / (1 - delta)
# fhat[fhat < 0] <- 0
p_raw_loess[p_raw_loess < u_min] <- u_min
} else {
fhat <- exp(ellhat)
}

c <- sum(fhat) * (s[2] - s[1])
if (normalize) fhat <- fhat / c

no_endpoints <- ii:(m + ii - 1)
# if (delta > 0) {
#   no_endpoints <- which(x_eval < bounds[2] & x_eval > bounds[1])
# }

# "integrated" mse, if f is supplied
if (is.null(f)) {
mse <- NULL
} else {
mse <- sum((f(s) - fhat) ^ 2) * (s[2] - s[1])
}

res <- list(
surface   = data.frame(f = as.numeric(fhat), x = as.numeric(s), logf = ellhat),
dat       = data.frame(
x     = x_eval[no_endpoints],
raw   = raw[no_endpoints],
fhat  = exp(p_raw_loess)[no_endpoints],
resid = (raw - p_raw_loess)[no_endpoints]),
c         = c,
f         = f,
mse       = mse,
params    = list(k = k, span = span, degree = degree, delta = delta),
f_log     = f_log,
bounds    = bounds,
u_bounds  = u_bounds,
n         = n,
lo        = raw_loess,
normalize = normalize
)
class(res) <- c("ed", "list")
return(res)
}

#' @export
print.ed <- function(x, ...) {
message("ed object...\nUse str() to see structure of this object.")
}
```
hafen/ed documentation built on May 17, 2019, 1:32 p.m.