Nothing
#' One-dimensional kernel density estimate
#'
#' @description State-of-the-art gaussian kernel density estimator for
#' one-dimensional data. The estimator does not use the commonly
#' employed 'gaussian rule of thumb'. As a result, it outperforms
#' many plug-in methods on multimodal densities with widely
#' separated modes. This function is the cleaned-up version of
#' the code written and published by Z. I. Botev at:
#' \url{http://web.maths.unsw.edu.au/~zdravkobotev/}
#'
#' @references Z. I. Botev, J. F. Grotowski and D. P. Kroese,
#' "Kernel Density Estimation Via Diffusion",
#' Annals of Statistics, 2010, Volume 38, Number 5, Pages 2916-2957
#'
#' @param data a vector of data from which the density estimate is constructed;
#' @param n the number of mesh points used in the uniform discretization of
#' the interval [MIN, MAX]; n has to be a power of two; if n is not
#' a power of two, then n is rounded up to the next power of two;
#' the default value of n is n=2 ^ 12;
#' @param MIN minimum of the interval [MIN, MAX] on which the density estimate
#' is constructed; default value: MIN = min(data) - Range / 10
#' @param MAX maximum of the interval [MIN, MAX] on which the density estimate
#' is constructed; default value: MAX = max(data) + Range / 10
#'
#' @return A \code{matrix} with two rows of length \code{n}, where the second row contains
#' the density values on the mesh in the first row.
#'
#' @importFrom stats optimize fft
#' @importFrom graphics hist
#' @export
#'
#' @examples set.seed(1)
#' data <- c(rnorm(10 ^ 3), rnorm(10 ^ 3) * 2 + 30)
#' d <- kde(data)
#' plot(d[1,], d[2,], type = 'l', xlab = 'x', ylab = 'density f(x)')
kde <- function(data, n, MIN, MAX) {
nargin <- length(as.list(match.call())) - 1
if (nargin < 2) n <- 2 ^ 14
# round up n to the next power of 2
n <- 2 ^ ceiling(log2(n))
# define the default interval [MIN, MAX]
if (nargin < 4) {
minimum <- min(data)
maximum <- max(data)
Range <- maximum - minimum
MIN <- minimum - Range / 2
MAX <- maximum + Range / 2
}
# set up the grid over which the density estimate is computed
R <- MAX - MIN
dx <- R / n
xmesh <- MIN + seq(0, R, dx)
N <- length(data)
# if data has repeated observations use the N given by
# N = length(as.numeric(names(table(data))))
# then bin the data uniformly using the grid defined above
w <- hist(data, xmesh, plot = FALSE)
initial_data <- (w$counts) / N
initial_data <- initial_data / sum(initial_data)
dct1d <- function(data) {
# computes the discrete cosine transform of the column vector data
n <- length(data)
# Compute weights to multiply DFT coefficients
weight <- c(1, 2 * exp(-1i * (1:(n - 1)) * pi / (2 * n)))
# Re-order the elements of the columns of x
data <- c(data[seq(1, n - 1, 2)], data[seq(n, 2, -2)])
# Multiply FFT by weights:
data <- Re(weight * fft(data))
data
}
# discrete cosine transform of initial data
a <- dct1d(initial_data) #
# now compute the optimal bandwidth ^ 2 using the referenced method
I <- (1:(n - 1)) ^ 2
a2 <- (a[2:n] / 2) ^ 2
# use fzero to solve the equation t=zeta*gamma ^ [5](t)
fixed_point <- function(t, N, I, a2) {
# this implements the function t-zeta*gamma ^ [l](t)
l <- 7
f <- 2 * (pi ^ (2 * l)) * sum((I ^ l) * a2 * exp(-I * (pi ^ 2) * t))
for (s in (l - 1):2) {
K0 <- prod(seq(1, 2 * s - 1, 2)) / sqrt(2 * pi)
const <- (1 + (1 / 2) ^ (s + 1 / 2)) / 3
time <- (2 * const * K0 / N / f) ^ (2 / (3 + 2 * s))
f <- 2 * pi ^ (2 * s) * sum(I ^ s * a2 * exp(-I * pi ^ 2 * time))
}
out <- t - (2 * N * sqrt(pi) * f) ^ (-2 / 5)
out <- abs(out)
}
soln <- optimize(fixed_point, c(0, .1), tol = 10 ^ -22, N = N, I = I, a2 = a2)
t_star <- soln$minimum
# smooth the discrete cosine transform of initial data using t_star
a_t <- a * exp(-(0:(n - 1)) ^ 2 * pi ^ 2 * t_star / 2)
# now apply the inverse discrete cosine transform
idct1d <- function(data) {
# computes the inverse discrete cosine transform
n <- length(data)
# Compute weights
weights <- n * exp(1i * (0:(n - 1)) * pi / (2 * n))
# Compute x tilde using equation (5.93) in Jain
data <- Re(fft(weights * data, inverse = TRUE)) / n
# Re-order elements of each column according to equations (5.93) and (5.94) in Jain
out <- rep(0, n)
out[seq(1, n, 2)] <- data[1:(n / 2)]
out[seq(2, n, 2)] <- data[n:(n / 2 + 1)]
out
}
density <- idct1d(a_t) / R
# take the rescaling of the data into account
bandwidth <- sqrt(t_star) * R
xmesh <- seq(MIN, MAX, R / (n - 1))
out <- matrix(c(xmesh, density), nrow = 2, byrow = TRUE)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.