R/kdensity.R

Defines functions kdensity

Documented in kdensity

#' Parametrically guided kernel density estimation
#'
#' `kdensity` computes a parametrically guided kernel density estimate
#' for univariate data. It supports asymmetric kernels and parametric starts
#' through the `kernel` and `start` arguments.
#'
#' The default values for `bw`, `kernel`, `start`, and
#' `support` are interdependent, and are chosen to make sense. E.g.,
#' the default value for `support` when `start = beta` is
#' `c(0, 1)`.
#'
#' The `start` argument defaults to `uniform`, which corresponds
#' to ordinary kernel density estimation. The typical default value for
#' `kernel` is `gaussian`.
#'
#' @export
#'
#' @param x Numeric vector containing the data.
#'
#' @param bw A bandwidth function. Can be either a string, a custom-made
#' function, or a double. The supported bandwidth functions are documented
#' in [bandwidths()].
#'
#' @param adjust An adjustment constant, so that `h = adjust*bw*sd`, where `sd`
#' varies with the chosen kernel.
#'
#' @param kernel The kernel function. Can be chosen from the list of built-in
#' kernels or be custom-made. See [kernels()] for details.
#'
#' @param start Parametric start. Can be chosen from the list of built-in
#' parametric starts or be custom-made. See [parametric_starts()] for
#' details.
#'
#' @param support The support of the data. Must be compatible with the supplied
#' `x` and the supplied `start` and `kernel`. Is used to find the
#' normalization constant, see `normalized`.
#'
#' @param na.rm Logical; if `TRUE`, `NA`s will be removed from `x`.
#'
#' @param normalized Logical; if `TRUE`, the density is normalized.
#'
#' @param tolerance Numeric; the relative error to tolerate in normalization.
#'
#' @return `kdensity` returns an S3 function object of
#' [base::class()] "kdensity". This is a callable function with the
#' following elements, accessible by '$':
#' \describe{
#'   \item{`x`}{The data supplied in `x`.}
#'   \item{`bw_str, bw, adjust, h`}{The bandwidth function, the resulting
#'               bandwidth, the `adjust` argument, and the adjusted
#'               bandwidth.}
#'   \item{`kernel_str, kernel, start, start_str, support`}{Name of the kernel,
#'               the kernel object, name of the parametric start, the start object,
#'               and the support of the density.}
#'   \item{`data.name, n, range, has.na, na.rm, normalized`}{Name of the data, number of
#'               observations, the range of the data, whether the data
#'               `x` contained `NA` values, whether na.rm is `TRUE`
#'               or not, and whether the density is normalized.}
#'   \item{`call`}{The `call` to `kdensity`.}
#'   \item{`estimates`}{Named numeric vector containing the parameter
#'               estimates from the parametric start.}
#'   \item{`logLik`}{The log-likelihood of the parametric starts. Is `NA`
#'               for the uniform start.}
#'
#' }
#'
#'
#' @details If `normalized` is `FALSE` and `start != "uniform"`, the resulting
#' density will not integrate to 1 in general.
#'
#' @seealso The `stats` package function [stats::density()].
#' @references
#'   Hjort, Nils Lid, and Ingrid K. Glad. "Nonparametric density estimation with a parametric start." The Annals of Statistics (1995): 882-904.
#'
#'   Jones, M. C., and D. A. Henderson. "Miscellanea kernel-type density estimation on the unit interval." Biometrika 94.4 (2007): 977-984.
#'
#'   Chen, Song Xi. "Probability density function estimation using gamma kernels." Annals of the Institute of Statistical Mathematics 52.3 (2000): 471-480.
#'
#'   Silverman, Bernard W. Density estimation for statistics and data analysis. Vol. 26. CRC press, 1986.
#'
#' @examples
#' ## Use gamma kernels to model positive data, the concentration of
#' ## theophylline
#'
#' concentration = Theoph$conc + 0.001
#' plot(kdensity(concentration, start = "gamma", kernel = "gamma", adjust = 1/3),
#'      ylim = c(0, 0.15), lwd = 2, main = "Concentration of theophylline")
#' lines(kdensity(concentration, start = "gamma", kernel = "gaussian"),
#'       lty = 2, col = "grey", lwd = 2)
#' lines(kdensity(concentration, start = "gaussian", kernel = "gaussian"),
#'       lty = 3, col = "blue", lwd = 2)
#' lines(kdensity(concentration, start = "gaussian", kernel = "gamma", adjust = 1/3),
#'       lty = 4, col = "red", lwd = 2)
#' rug(concentration)
#'
#' ## Using a density and and estimator from another package.
#'
#' skew_hyperbolic = list(
#'   density   = SkewHyperbolic::dskewhyp,
#'   estimator = function(x) SkewHyperbolic::skewhypFit(x, printOut = FALSE)$param,
#'   support   = c(-Inf, Inf)
#' )
#'
#' kde = kdensity(diff(LakeHuron), start = skew_hyperbolic)
#' plot(kde, lwd = 2, col = "blue",
#'      main = "Annual differences in water level (ft) of Lake Huron, 1875 - 1972")
#' lines(kde, plot_start = TRUE, lty = 2, lwd = 2) # Plots the skew hyperbolic density.
#' rug(diff(LakeHuron))
#'
#' kde$estimates # Also: coef(kde)
#' # Displays the parameter estimates:
#' #        mu     delta      beta        nu
#' # -1.140713  3.301112  2.551657 26.462469
#'

kdensity = function(x, bw = NULL, adjust = 1, kernel = NULL, start = NULL,
                    support = NULL, na.rm = FALSE, normalized = TRUE,
                    tolerance = 0.01)
 {

  ## These tests are based purely on the data, and should be run no matter what.
  data.name = deparse(substitute(x))
  has.na = anyNA(x)

  assertthat::assert_that(!(has.na & !na.rm),
                          msg = "x contains NAs and na.rm = FALSE.")

  x = x[!is.na(x)] # This line is reached only if (has.na & !na.rm) is FALSE.

  ## 'kernel', 'start' and 'bw' can be custom made: In this case, they must
  ## be added to their environments.

  if(!is.null(start)){
    if(!is.character(start)) {
      start_str = deparse(substitute(start))
      add_start(start_str = start_str, start = start)
      start = start_str
    }
  }

  if(!is.null(kernel)){
    if(!is.character(kernel)) {
      kernel_str = deparse(substitute(kernel))
      add_kernel(kernel_str = kernel_str, kernel = kernel)
      kernel = kernel_str
    }
  }

  ## The case of bw == Inf is special! In this case, we return the parametric
  ## start itself.

  ## Now we handle the combinations of kernel, start and support.
  ## This is fancy defaults management.
  kss_list = get_kernel_start_support(kernel, start, support)

  ## The strings are used for reporting and inside bandwidth functions,
  ## at must be preserved.
  start_str  = kss_list$start_str
  kernel_str = kss_list$kernel_str

  ## We overwrite the kernel, start, and support with what we obtained from kss.
  kernel  = kss_list$kernel
  start   = kss_list$start
  support = kss_list$support

  ## Tests for incompabibilities in the supplied values.
  support_compatible(kernel, start, support)

  ## We continue by checking if the supplied values make sense.
  if(!all(x <= support[2] & x >= support[1])) {
    stop("The supplied data x is not contained in the support: (",
         support[1], ", ", support[2], ").")
  }

  estimates = start$estimator(x)
  parametric_start = start$density

  # Name of the variable where the density is evaluated. Typically x.
  x_name = names(formals(start$density))[1]

  parametric_start_vector = function(x) {
    arguments = list()
    arguments[[1]] = x
    names(arguments)[1] = x_name
    arguments = append(arguments, as.list(estimates))
    do.call(parametric_start, arguments)
  }

  ## Takes care of the bandwidth. Can be either a double, a string, or a
  ## function taking the arguments data, kernel, start support.

  if(is.null(bw)) {
    bw = get_standard_bw(kernel_str, start_str, support)
  }

  if(!is.numeric(bw)) {
    if(is.character(bw)) {
      bw_str = bw
      bw     = get_bw(bw)(x, kernel_str, start_str, support)
    } else {
      bw_str = deparse(substitute(bw))
      bw     = bw(x, kernel_str, start_str, support)
    }
  } else {
    bw_str = "user supplied"
    if(bw == Inf) {
      msg = "bw = Inf does not work with a uniform start."
      assertthat::assert_that(start_str != "uniform", start_str != "constant", msg = msg)
    }
  }

  ## The parameter h is computed. The basic bandwidth is h = bw*adjust for the
  ## normal kernel, and is adjusted for all the other kernels so that the sd
  ## of the kernel equals h.
  sd = ifelse(!is.null(kss_list$kernel$sd), kss_list$kernel$sd, 1)
  h = bw*adjust*sd

  ## The denominator can be computed.
  parametric_start_data = parametric_start_vector(x)

  ## Now we handle normalization.

  normalization = 1

  if(normalized & bw != Inf) {

    integrand = function(y) {
        sapply(y, function(y) mean(1/h*kernel$kernel(y, x, h) /
                                     parametric_start_data) *
                                     parametric_start_vector(y))
    }

    integral = tryCatch(
      stats::integrate(integrand,
                       lower = support[1],
                       upper = support[2]),

      error = function(e) {
        stop(paste0("Normalization error: The function will not integrate.",
                    "Two common causes are: 1.) The kernel is non-smooth, ",
                    "try a smooth kernel if possible. 2.) The supplied ",
                    "support is incorrect."))
      })

    msg = "The normalization constant has too large relative error.
   1.) try to normalize the data, or
   2.) change kernel, or
   3.) increase the 'tolerance' parameter for kdensity"

    assertthat::assert_that(!is.nan(integral$abs.error/integral$value),
                            msg = msg)
    assertthat::assert_that(integral$abs.error/integral$value < tolerance,
              msg = msg)

    normalization = integral$value
  }


  ## This is the main part of the returned object.

  return_function = function(y) {

    if(missing(y)) {
      stop("A kdensity object is a density function. Call it with numerical input.")
    }

    if(h != Inf)  {
      if(length(y) == 1) {
        mean(1/h*kernel$kernel(y, x, h) * parametric_start_vector(y) /
               parametric_start_data)/normalization
      } else {
        sapply(y, function(y) {
          parametric_start_vector_y = parametric_start_vector(y)
          1/h*mean(kernel$kernel(y, x, h) * parametric_start_vector_y /
                     parametric_start_data)/normalization
        })
      }
    } else {
       ## If bw = Inf, the parametric start is returned.
       arguments = list()
       arguments[[1]] = y
       names(arguments)[1] = x_name
       arguments = append(arguments, as.list(estimates))
       do.call(parametric_start, arguments)
    }

  }

  ## 'R6'-ish attributes:
  call = match.call()
  range = c(min(x), max(x))
  n = length(x)
  logLik = ifelse(start_str == "uniform" | start_str == "constant", NA,
                  sum(parametric_start_vector(x)))

  ## S3 attributes:
  class(return_function) = "kdensity"
  attr(return_function, "srcref") = NULL
  return_function
}
JonasMoss/kdensity documentation built on Oct. 5, 2020, 2:30 p.m.