R/kde_2d_mass.R
In MASSExtra: Some 'MASS' Enhancements

Documented in kde_1d plot.kde_1d print.kde_1d

### 2d kde new method
###

#' @importFrom utils methods
NULL

floor_os   <- function(x, o = 0, s = 1) o + s*floor((x - o)/s)
ceiling_os <- function(x, o = 0, s = 1) o + s*ceiling((x - o)/s)

#' One-dimensional Kernel Density Estimate
#'
#' A pure R implementation of an approximate one-dimensional KDE, similar
#' to \code{\link[stats]{density}} but using a different algorithm not involving
#' \code{\link[stats]{fft}}.  Two extra facilities are provided, namely
#' (a) the kernel may be given either as a character string to select one of a
#' number of kernel functions provided, or a user defined R function, and (b) the
#' kde may be fitted beyond the prescribed limits for the result, and folded back
#' to emulate the effect of having known bounds for the distribution.
#'
#' @param x   A numeric vector for which the kde is required or (in methods)
#'            an object of class \code{"kde_1d"}
#' @param bw  The bandwidth or the bandwidth function.
#' @param kernel The kernel function, specified either as a character string or as
#'               an R function. Partial matching of the character string is allowed.
#' @param n  Integer, the number of equally-spaced values in the abscissa of the kde
#' @param limits  numeric vector of length 2.  Prescribed x-range limits for the
#'                x-range of the result.  May be infinite, but infinite values will be
#'                pruned back to an appropriate value as determined by the data.
#' @param cut The number of bandwidths beyond the range of the input x-values to use
#' @param na.rm  Logical value: should any missing values in x be silently removed?
#' @param adjust numeric value: a multiplier to be applied to the computed bandwidth.
#' @param fold  Logical value: should the kde be estimated beyond the prescribed limits
#'              for the result and 'folded back' to emulate the effect of having known
#'              range boundaries for the underlying distribution?
#' @param las,col,xlab,ylab base graphics parameters
#' @param ... currently ignored, except in method functions
#'
#' @return A list of results specifying the result of the kde computation, of class \code{"kde_1d"}
#' @export
#'
#' @examples
#' set.seed(1234)
#' u <- runif(5000)
#' kdeu0 <- kde_1d(u, limits = c(-Inf, Inf))
#' kdeu1 <- kde_1d(u, limits = 0:1, kernel = "epan", fold = TRUE)
#' plot(kdeu0, col = 4)
#' lines(kdeu1, col = "dark green")
#' fun <- function(x) (0 < x & x < 1) + 0
#' curve(fun, add=TRUE, col = "grey", n = 1000)
kde_1d <- function(x, bw = bw.nrd0,
                   kernel = c("gaussian", "biweight", "cosine", "epanechnikov",
                              "logistic", "optCosine", "rectangular", "squaredCosine",
                              "triangular", "tricube", "triweight", "uniform"),
                   n = 512, limits = c(rx[1] - cut*bw, rx[2] + cut*bw),
                   cut = 3, na.rm = FALSE, adjust = 1, fold = FALSE, ...) {
  stopifnot(non_numeric_x = is.numeric(x),
            short_x = length(unique(x)) > 2)
  data_name <- deparse(substitute(x))
  if(is.function(bw)) {
    bw <- bw(x)
  }
  stopifnot(bad_bw = is.numeric(bw) && length(bw) == 1 && bw > 0,
            bad_adjust = is.numeric(adjust) && length(adjust) == 1 && adjust > 0)
  bw <- bw * adjust
  if(is.character(kernel)) {
    kernel <- match.arg(kernel)
    substring(kernel, 0, 1) <- toupper(substring(kernel, 0, 1))
    kernel <- get(paste0("kernel", kernel), mode = "function")
  } else {
    stopifnot(kernel_not_a_function = is.function(kernel),
              bad_arguments = identical(args(kernel), args(kernelGaussian)))
  }
  stopifnot(bad_n = is.numeric(n) && length(n) == 1 && (n %% 1 == 0) && n > 2)
  rx <- range(x, na.rm = na.rm)
  if(fold) {
    limits[1] <- max(limits[1], rx[1] - cut*bw)
    limits[2] <- min(limits[2], rx[2] + cut*bw)
    stopifnot(bad_limits = all(x >= limits[1]) && all(x <= limits[2]))
    dx <- diff(limits)/(n-1)
    lo <-   floor_os(limits[1] - min(diff(limits), 5*bw), limits[1], dx)
    up <- ceiling_os(limits[2] + min(diff(limits), 5*bw), limits[2], dx)
    n_star <- 1 + round((up - lo)/dx)
    xo <- seq(lo, up, by = dx)
    # fr <- tabulate(1 + pmin(floor((x - lo)/dx), n_star - 1), nbins = n_star)
  } else {
    lo <- max(limits[1], rx[1] - 5*bw)
    up <- min(limits[2], rx[2] + 5*bw)
    stopifnot(bad_limits = all(x >= lo) && all(x <= up))
    dx <- (up - lo)/(n-1)
    xo <- seq(lo, up, by = dx)
    # fr <- tabulate(1 + pmin(floor((x - lo)/dx), n - 1), nbins = n)
  }
  fr <- linbin1d(x, xo)
  yo <- outer(xo, xo, function(x, y) kernel(x, y, bw)) %*% fr
  if(fold) {
    if(any(low <- (xo < limits[1]+dx/10))) {
      k <- which(low)
      kl <- k[length(k)]
      k0 <- 2*kl - k
      yo[k0] <- yo[k0] + yo[k]
    }
    if(any(upp <- (limits[2]-dx/10 < xo))) {
      k <- which(upp)
      ku <- k[1]
      k0 <- 2*ku - k
      yo[k0] <- yo[k0] + yo[k]
    }
    keep <- kl:ku
    yo <- yo[keep]
    xo <- xo[keep]
    lo <- limits[1]
    up <- limits[2]
  }
  structure(list(x = xo, y = yo/(sum(yo)*dx), bw = bw, n = length(x),
                 lower = lo, upper = up, data_name = data_name),
            class = "kde_1d")
}

linbin1d <- function(x, gx) {
  dx <- mean(diff(gx))
  i0 <- findInterval(x, gx, all.inside = TRUE)
  i <- unique(i0)
  wt <- crossprod(outer(i0, i, "=="), cbind(gx[i0+1] - x, x - gx[i0]))/dx
  fr <- numeric(length(gx))
  fr[i    ] <- fr[i    ] + wt[, 1]
  fr[i + 1] <- fr[i + 1] + wt[, 2]
  fr
}

#' @rdname kde_1d
#' @export
print.kde_1d <- function(x, ...) {
  cat("A Kernel Density Estimate of class ", class(x), "\n\n")
  labs <- c("Response", "Sample size", "Range", "Bandwidth", "Resolution")
  values <- with(x, c(data_name, n,
                      paste0(signif(lower, 4), " < x < ",
                             signif(upper, 4)),
                      signif(bw, 4),
                      length(x)))
  cat(paste0(format(labs), ": ", values), sep = "\n")
  generics <- attr(utils::methods(class = "kde_1d"), "info")$generic
  cat("\nMethods exist for generics:",
      paste(generics, collapse = ", "), "\n")
  invisible(x)
}

#' @rdname kde_1d
#' @export
plot.kde_1d <- function(x, ..., col = "steel blue", las = 1,
                        xlab = bquote(x == italic(.(x$data_name))),
                        ylab = expression(kde(italic(x)))) {
  with(x, {
    plot(x = c(lower, x, upper), y = c(0, y, 0),
         type = "l", xlab = xlab, ylab = ylab, axes = FALSE,
         col = col, panel.first = grid(), las = las, ...)
    axis(1, las = las)
    axis(2, las = las)
  })
  invisible(x)
}

#' @import grDevices
NULL

#' A Two-dimensional Kernel Density Estimate
#'
#' A pure R implementation of an approximate two-dimensional kde computation, where
#' the approximation depends on the x- and y-resolution being fine, i.e. the number
#' of both x- and y-points should be reasonably large, at least 256.  The coding
#' follows the same idea as used in \code{\link[MASS]{kde2d}}, but scales much better
#' for large data sets.
#'
#' @param x,y Numeric vectors of the same length specified in any way acceptable
#'            to \code{\link[grDevices]{xy.coords}}.  In methods, \code{x} will be
#'            an object of class \code{"kde_2d"}
#' @param bw bandwidths. May be a numeric vector of length 1 or 2, or a function,
#'           or list of two bandwidth computation functions.  Short entities will
#'           be repeated to length 1.  The first relates to the x-coordinate and
#'           the second to the y.
#' @param kernel As for \code{\link{kde_1d}} though 1 or 2 values may be specified
#'               relating to x- and y-coordinates respectively.  Short entities will
#'               be repeated to length 2
#' @param n positive integer vector of length 1 or 2 specifying the resolution required
#'          in the x- and y-coordinates respectively.  Short values will be repeated to
#'          length 2.
#' @param x_limits,y_limits Numeric vectors specifying the limits required for the result
#' @param cut The number of bandwidths beyond the x- and y-range limits for the resuls.
#' @param na.rm  Should missing values be silently removed?
#' @param adjust A factor to adjust both bandwidths to regulate smoothness
#' @param las,col,xlab,ylab base graphics parameters
#' @param ... currently ignored, except in method functions
#'
#' @return A list of results of class \code{"kde_2d"}.  The result may be used directly
#'         in \code{\link[graphics]{image}} or \code{\link[graphics]{contour}}.
#' @export
#'
#' @examples
#' krc <- with(Boston, {
#'   criminality <- log(crim)
#'   spaciousness <- sqrt(rm)
#'   kde_2d(criminality, spaciousness, n = 128, kernel = "biweight")
#' })
#' plot(krc, xlab = expression(italic(Criminality)), ylab = expression(italic(Spaciousness)))
#' levs <- hr_levels(krc)
#' contour(krc, add = TRUE, levels = levs, labels = names(levs))
#'
#' with(krc, persp(x, 10*y, 3*z, border="transparent", col = "powder blue",
#'                 theta = 30, phi = 15, r = 20, scale = FALSE, shade = TRUE,
#'                 xlab = "Criminality", ylab = "Spaciousness", zlab = "density"))
kde_2d <- function (x, y = NULL, bw = list(x = bw.nrd0, y = bw.nrd0),
                    kernel = c("gaussian", "biweight", "cosine", "epanechnikov",
                               "logistic", "optCosine", "rectangular", "squaredCosine",
                               "triangular", "tricube", "triweight", "uniform"),
                    n = 128,
                    x_limits = c(rx[1] - cut * bw["x"], rx[2] + cut * bw["x"]),
                    y_limits = c(ry[1] - cut * bw["y"], ry[2] + cut * bw["y"]),
                    cut = 1, na.rm = FALSE, adjust = 53/45, ...) {
  xy <- grDevices::xy.coords(x, y)
  x <- xy$x
  y <- xy$y
  if (na.rm) {
    if (any(nax <- is.na(x)) | any(nay <- is.na(y))) {
      x <- x[!(nax | nay)]
      y <- y[!(nax | nay)]
    }
  }
  stopifnot(x_has_nas = !any(is.na(x)),
            y_has_nas = !any(is.na(y)),
            x_y_lengths_unequal = length(x) == length(y),
            bad_n = !is.integer(n) || all(n > 2))
  knames <- if (is.null(names(kernel)))
    c("x", "y")
  else names(kernel)
  if (is.character(kernel)) {
    kernel <- match.arg(kernel, several.ok = TRUE)
    kernel <- rep(kernel, length.out = 2)
    substring(kernel, 0, 1) <- toupper(substring(kernel, 0, 1))
    kernel <- paste0("kernel", kernel)
    kernel = list(x = get(kernel[1], mode = "function"),
                  y = get(kernel[2], mode = "function"))
  }
  else if (is.function(kernel)) {
    if (length(kernel) == 1) {
      kernel <- rep(list(kernel), 2)
    }
  }
  kernel <- setNames(kernel, knames)
  if (is.function(bw)) {
    bw <- rep(list(bw), 2)
  }
  if(is.list(bw)) {
    bw <- mapply(FUN = function(x, y) x(y),
                 bw, list(x = x, y = y))
  } else {
    bw <- rep(bw, length.out = 2)
  }
  stopifnot(bad_bw = is.numeric(bw) && all(bw > 0),
            bad_adjust = is.numeric(adjust) && length(adjust) <= 2 && all(adjust > 0))
  bw <- bw * adjust
  if (is.null(names(bw))) {
    bw <- setNames(bw, c("x", "y"))
  }
  nnames <- if (is.null(names(n)))
    c("x", "y")
  else names(n)
  n <- setNames(rep(n, length.out = 2), nnames)
  rx <- range(x)
  ry <- range(y)
  x_limits <- c(max(x_limits[1], rx[1] - 5 * bw["x"]),
                min(x_limits[2], rx[2] + 5 * bw["x"]))
  y_limits <- c(max(y_limits[1], ry[1] - 5 * bw["y"]),
                min(y_limits[2], ry[2] + 5 * bw["y"]))
  dx <- diff(x_limits)/(n["x"] - 1)
  dy <- diff(y_limits)/(n["y"] - 1)
  xo <- seq(x_limits[1], x_limits[2], by = dx)
  yo <- seq(y_limits[1], y_limits[2], by = dy)
  Fr <- linbin2d(x, y, xo, yo)
  Hx <- outer(xo, xo, kernel[["x"]], sd = bw["x"])
  Hy <- outer(yo, yo, kernel[["y"]], sd = bw["y"])
  z <- Hx %*% Fr %*% Hy
  z <- z/(sum(z) * dx * dy)
  structure(list(x = xo, y = yo, z = z, bw = bw, n_sample = length(x),
                 lower = c(x = min(xo), y = min(yo)),
                 upper = c(x = max(xo), y = max(yo)),
                 n_out = n,
                 data_name = with(xy, c(x = ifelse(is.null(xlab), "x", xlab),
                                        y = ifelse(is.null(ylab), "y", ylab)))),
            class = "kde_2d")
}

linbin2d <- function(x, y, gx, gy) {
  dx <- mean(diff(gx))
  dy <- mean(diff(gy))
  i <- findInterval(x, gx, all.inside = TRUE)
  j <- findInterval(y, gy, all.inside = TRUE)
  lx <- (gx[i + 1] - x)
  ux <- (x - gx[i])
  ly <- (gy[j + 1] - y)
  uy <- (y - gy[j])
  ij <- paste(i, j, sep = ":")
  uij <- unique(ij)
  wt <- crossprod(outer(ij, uij, "=="),
                  cbind(lx*ly, lx*uy, ux*ly, ux*uy))/(dx*dy)
  Fr <- matrix(0, length(gx), length(gy))
  i <- as.integer(sub(":.*$", "", uij))
  j <- as.integer(sub("^.*:", "", uij))
  Fr[cbind(i    , j    )] <- Fr[cbind(i    , j    )] + wt[,1]
  Fr[cbind(i    , j + 1)] <- Fr[cbind(i    , j + 1)] + wt[,2]
  Fr[cbind(i + 1, j    )] <- Fr[cbind(i + 1, j    )] + wt[,3]
  Fr[cbind(i + 1, j + 1)] <- Fr[cbind(i + 1, j + 1)] + wt[,4]
  Fr
}

#' @rdname kde_2d
#' @export
print.kde_2d <- function(x, ...) {
  cat("A Two-dimensional Kernel Density Estimate\n\n")
  labs <- c("Responses", "Sample size", "Ranges", "Bandwidths", "Resolution")
  values <- with(x, c(paste(data_name, collapse = ", "),
                      n_sample,
                      paste(paste0(signif(lower["x"], 4), " < x < ",
                                   signif(upper["x"], 4)),
                            paste0(signif(lower["y"], 4), " < y < ",
                                   signif(upper["y"], 4)),
                            sep = ", "),
                      paste(signif(bw, 4), collapse = ", "),
                      paste(length(x), length(y), sep = ", ")))
  cat(paste0(format(labs), ": ", values), sep = "\n")
  generics <- attr(methods(class = "kde_2d"), "info")$generic
  cat("\nMethods exist for generics:",
      paste(generics, collapse = ", "), "\n")
  invisible(x)
}

#' @rdname kde_2d
#' @export
plot.kde_2d <- function(x, ..., las = 1,
                        xlab = bquote(italic(.(x$data_name[["x"]]))),
                        ylab = bquote(italic(.(x$data_name[["y"]]))),
                        col = hcl.colors(50, "YlOrRd", rev = TRUE)) {
  with(x, {
    image(x, y, z, las = las, xlab = xlab, ylab = ylab, col = col, ...)
  })
  invisible(x)
}

#' @import demoKde
NULL

#' #' @rdname kde_1d
#' #' @export
#' kernelBiweight <- function(x, mean = 0, sd = 1) {
#'   h <- sqrt(7)*sd
#'   ifelse((z <- abs(x-mean)) < h, 15/16*(1 - (z/h)^2)^2/h, 0)
#' }
#'
#' #' @rdname kde_1d
#' #' @export
#' kernelCosine <- function(x, mean = 0, sd = 1) {
#'   h <- sqrt(1/(1-8/pi^2))*sd
#'   ifelse((z <- abs(x-mean)) < h, pi/4*cos((pi*z)/(2*h))/h, 0)
#' }
#'
#' #' @rdname kde_1d
#' #' @export
#' kernelEpanechnikov <- function(x, mean = 0, sd = 1) {
#'   h <- sqrt(5)*sd
#'   ifelse((z <- abs(x-mean)) < h, 3/4*(1 - (z/h)^2)/h, 0)
#' }
#'
#' #' @rdname kde_1d
#' #' @export
#' kernelGaussian <- function(x, mean = 0, sd = 1)
#'   dnorm(x, mean = mean, sd = sd)
#'
#' #' @rdname kde_1d
#' #' @export
#' kernelLogistic <- function(x, mean = 0, sd = 1)
#'   stats::dlogis(x, mean, sqrt(3)/pi*sd)
#'
#' #' @rdname kde_1d
#' #' @export
#' kernelOptCosine <- function(x, mean = 0, sd = 1) {
#'   h <- sqrt(1/(1-8/pi^2))*sd
#'   ifelse((z <- abs(x-mean)) < h, pi/4*cos((pi*z)/(2*h))/h, 0)
#' }
#'
#' #' @rdname kde_1d
#' #' @export
#' kernelRectangular <- function(x, mean = 0, sd = 1) {
#'   h <- sqrt(3)*sd
#'   ifelse(abs(x-mean) < h, 1/(2*h), 0)
#' }
#'
#' #' @rdname kde_1d
#' #' @export
#' kernelSquaredCosine <- function(x, mean = 0, sd = 1) {
#'   h <- sqrt(3/(1-6/pi^2))*sd
#'   ifelse((z <- abs(x-mean)) < h, cos(pi*z/(2*h))^2/h, 0)
#' }
#'
#' #' @rdname kde_1d
#' #' @export
#' kernelTriangular <- function(x, mean = 0, sd = 1) {
#'   h <- sqrt(24)*sd/2
#'   ifelse((z <- abs(x-mean)) < h, (1 - z/h)/h, 0)
#' }
#'
#' #' @rdname kde_1d
#' #' @export
#' kernelTricube <- function(x, mean = 0, sd = 1) {
#'   h <- sqrt(243/35)*sd
#'   ifelse((z <- abs(x - mean)) < h, 70/81*(1 - (z/h)^3)^3/h, 0)
#' }
#'
#' #' @rdname kde_1d
#' #' @export
#' kernelTriweight <- function(x, mean = 0, sd = 1) {
#'   h <- sqrt(9)*sd
#'   ifelse((z <- abs(x-mean)) < h, 35/32*(1 - (z/h)^2)^3/h, 0)
#' }
#'
#' #' @rdname kde_1d
#' #' @export
#' kernelUniform <- function(x, mean = 0, sd = 1) {
#'   h <- sqrt(3)*sd
#'   ifelse(abs(x-mean) < h, 1/(2*h), 0)
#' }


#' Home Range levels
#'
#' For an object representing a 2-dimensional kernel density estimate
#' find the level(s) defining a central "home range" region, that is,
#' a region of probability content p for which all density points
#' within the region are higher than any density point outside the region.
#' This makes it a region of probability p with smallest area.
#'
#' @param x an object whose \code{z} component represents the KDE
#' @param p a vector of probability levels
#' @param ... extra arguments (currently not used)
#'
#' @return A vector of density levels defining the home range contours
#' @export
#'
#' @examples
#' krc <- with(Boston, {
#'   criminality <- log(crim)
#'   spaciousness <- sqrt(rm)
#'   kde_2d(criminality, spaciousness)
#' })
#' plot(krc, xlab = expression(italic(Criminality)),
#'           ylab = expression(italic(Spaciousness)))
#' home <- hr_levels(krc, p = 0.5)
#' contour(krc, add = TRUE, levels = home, labels = "50%")
hr_levels <- function(x, ...) {
  UseMethod("hr_levels")
}

#' @rdname hr_levels
#' @export
hr_levels.default <- function(x, p = (1:9)/10, ...) {
  stopifnot(bad_object = is.list(x) && is.numeric(x$z) &&
              is.matrix(x$z) && all(x$z >= 0),
            bad_p_levels = is.numeric(p) && all(0 <= p & p <= 1))
  sz <- sort(x$z)
  cz <- cumsum(sz)/sum(sz)
  k <- sapply(p, function(pr) sum(cz <= (1 - pr)))
  k <- pmax(1, k)
  setNames(sz[k], format(p))
}

#' @rdname hr_levels
#' @export
hr_levels.kde_2d <- function(x, ...) {
  x <- unclass(x)
  NextMethod("hr_levels", x, ...)
}