R/kde.R
In pmpp: Posterior Mean Panel Predictor

Documented in kde

#' One-dimensional kernel density estimate
#'
#' @description State-of-the-art gaussian kernel density estimator for
#'              one-dimensional data. The estimator does not use the commonly
#'              employed 'gaussian rule of thumb'. As a result, it outperforms
#'              many plug-in methods on multimodal densities with widely
#'              separated modes. This function is the cleaned-up version of 
#'              the code written and published by Z. I. Botev at:
#'              \url{http://web.maths.unsw.edu.au/~zdravkobotev/}
#'
#' @references Z. I. Botev, J. F. Grotowski and D. P. Kroese,
#'             "Kernel Density Estimation Via Diffusion",
#'             Annals of Statistics, 2010, Volume 38, Number 5, Pages 2916-2957
#'
#' @param data a vector of data from which the density estimate is constructed;
#' @param n    the number of mesh points used in the uniform discretization of
#'             the interval [MIN, MAX]; n has to be a power of two; if n is not
#'             a power of two, then n is rounded up to the next power of two;
#'             the default value of n is n=2 ^ 12;
#' @param MIN  minimum of the interval [MIN, MAX] on which the density estimate
#'             is constructed; default value: MIN = min(data) - Range / 10
#' @param MAX  maximum of the interval [MIN, MAX] on which the density estimate
#'             is constructed; default value: MAX = max(data) + Range / 10
#'
#' @return A \code{matrix} with two rows of length \code{n}, where the second row contains 
#' the density values on the mesh in the first row.
#'
#' @importFrom stats optimize fft
#' @importFrom graphics hist
#' @export
#'
#' @examples set.seed(1)
#' data <- c(rnorm(10 ^ 3), rnorm(10 ^ 3) * 2 + 30)
#' d <- kde(data)
#' plot(d[1,], d[2,], type = 'l', xlab = 'x', ylab = 'density f(x)')
kde <- function(data, n, MIN, MAX) {
  nargin <- length(as.list(match.call())) - 1
  if (nargin < 2) n <- 2 ^ 14
  # round up n to the next power of 2
  n <- 2 ^ ceiling(log2(n))
  # define the default interval [MIN, MAX]
  if (nargin < 4) {
    minimum <- min(data)
    maximum <- max(data)
    Range <- maximum - minimum
    MIN <- minimum - Range / 2
    MAX <- maximum + Range / 2
  }
  # set up the grid over which the density estimate is computed
  R <- MAX - MIN
  dx <- R / n
  xmesh <- MIN + seq(0, R, dx)
  N <- length(data)
  # if data has repeated observations use the N given by
  # N = length(as.numeric(names(table(data))))
  # then bin the data uniformly using the grid defined above
  w <- hist(data, xmesh, plot = FALSE)
  initial_data <- (w$counts) / N
  initial_data <- initial_data / sum(initial_data)

  dct1d <- function(data) {
    # computes the discrete cosine transform of the column vector data
    n <- length(data)
    # Compute weights to multiply DFT coefficients
    weight <- c(1, 2 * exp(-1i * (1:(n - 1)) * pi / (2 * n)))
    # Re-order the elements of the columns of x
    data <- c(data[seq(1, n - 1, 2)], data[seq(n, 2, -2)])
    # Multiply FFT by weights:
    data <- Re(weight * fft(data))
    data
  }

  # discrete cosine transform of initial data
  a <- dct1d(initial_data) #
  # now compute the optimal bandwidth ^ 2 using the referenced method
  I <- (1:(n - 1)) ^ 2
  a2 <- (a[2:n] / 2) ^ 2
  # use  fzero to solve the equation t=zeta*gamma ^ [5](t)

  fixed_point <- function(t, N, I, a2) {
    # this implements the function t-zeta*gamma ^ [l](t)
    l <- 7
    f <- 2 * (pi ^ (2 * l)) * sum((I ^ l) * a2 * exp(-I * (pi ^ 2) * t))
    for (s in (l - 1):2) {
      K0 <- prod(seq(1, 2 * s - 1, 2)) / sqrt(2 * pi)
      const <- (1 + (1 / 2) ^ (s + 1 / 2)) / 3
      time <- (2 * const * K0 / N / f) ^ (2 / (3 + 2 * s))
      f <- 2 * pi ^ (2 * s) * sum(I ^ s * a2 * exp(-I * pi ^ 2 * time))
    }
    out <- t - (2 * N * sqrt(pi) * f) ^ (-2 / 5)
    out <- abs(out)
  }
  soln <- optimize(fixed_point, c(0, .1), tol = 10 ^ -22, N = N, I = I, a2 = a2)
  t_star <- soln$minimum
  # smooth the discrete cosine transform of initial data using t_star
  a_t <- a * exp(-(0:(n - 1)) ^ 2 * pi ^ 2 * t_star / 2)
  # now apply the inverse discrete cosine transform

  idct1d <- function(data) {
    # computes the inverse discrete cosine transform
    n <- length(data)
    # Compute weights
    weights <- n * exp(1i * (0:(n - 1)) * pi / (2 * n))
    # Compute x tilde using equation (5.93) in Jain
    data <- Re(fft(weights * data, inverse = TRUE)) / n
    # Re-order elements of each column according to equations (5.93) and (5.94) in Jain
    out <- rep(0, n)
    out[seq(1, n, 2)] <- data[1:(n / 2)]
    out[seq(2, n, 2)] <- data[n:(n / 2 + 1)]
    out
  }

  density <- idct1d(a_t) / R
  # take the rescaling of the data into account
  bandwidth <- sqrt(t_star) * R
  xmesh <- seq(MIN, MAX, R / (n - 1))
  out <- matrix(c(xmesh, density), nrow = 2, byrow = TRUE)
}