#' Find "best" smoothing parameter using leave-one-out cross validation.
#'
#' Minimises the leave-one-out estimate of root mean-squared error to find
#' find the "optimal" bandwidth for smoothing.
#'
#' L-BFGS-B optimisation is used to constrain the bandwidths to be greater
#' than the binwidths: if the bandwidth is smaller than the binwidth it's
#' impossible to compute the rmse because no smoothing occurs. The tolerance
#' is set relatively high for numerical optimisation since the precise choice
#' of bandwidth makes little difference visually, and we're unlikely to have
#' sufficient data to make a statistically significant choice anyway.
#'
#' @param x condensed summary to smooth
#' @param h_init initial values of bandwidths to start search out. If not
#' specified defaults to 5 times the binwidth of each variable.
#' @param ... other arguments (like \code{var}) passed on to
#' \code{\link{rmse_cv}}
#' @param tol numerical tolerance, defaults to 1\%.
#' @param control additional control parameters passed on to \code{\link{optim}}
#' The most useful argument is probably trace, which makes it possible to
#' follow the progress of the optimisation.
#' @family bandwidth estimation functions
#' @return a single numeric value representing the bandwidth that minimises
#' the leave-one-out estimate of rmse. Vector has attributes
#' \code{evaluations} giving the number of times the objective function
#' was evaluated. If the optimisation does not converge, or smoothing is not
#' needed (i.e. the estimate is on the lower bounds), a warning is thrown.
#' @export
#' @examples
#' \donttest{
#' x <- rchallenge(1e4)
#' xsum <- condense(bin(x, 1 / 10))
#' h <- best_h(xsum, control = list(trace = 3, REPORT = 1))
#'
#' if (require("ggplot2")) {
#' autoplot(xsum)
#' autoplot(smooth(xsum, h))
#' }
#' }
best_h <- function(x, h_init = NULL, ..., tol = 1e-2, control = list()) {
stopifnot(is.condensed(x))
gvars <- group_vars(x)
widths <- vapply(x[gvars], attr, "width", FUN.VALUE = numeric(1))
h_init <- h_init %||% widths * 5
stopifnot(is.numeric(h_init), length(h_init) == length(gvars))
stopifnot(is.list(control))
control <- modifyList(list(factr = tol / .Machine$double.eps), control)
# Optimise
rmse <- function(h) {
rmse_cv(x, h, ...)
}
res <- optim(h_init, rmse, method = "L-BFGS-B", lower = widths * 1.01,
control = control)
h <- unname(res$par)
# Feedback
if (res$convergence != 0) {
warning("Failed to converge: ", res$message, call. = FALSE)
} else if (rel_dist(h, widths) < 1e-3) {
warning("h close to lower bound: smoothing not needed", call. = FALSE)
}
structure(h, evaluations = res$counts[1], conv = res$convergence)
}
rel_dist <- function(x, y) {
mean(abs(x - y) / abs(x + y))
}
#' Generate grid of plausible bandwidths for condensed summary.
#'
#' By default, the bandwidths start at the bin width, and then continue
#' up 50 (\code{n}) steps until 20 (\code{max}) times the bin width.
#'
#' @param x a condensed summary
#' @param n number of bandwidths to generate (in each dimension)
#' @param max maximum bandwidth to generate, as multiple of binwidth.
#' @family bandwidth estimation functions
#' @export
#' @examples
#' x <- rchallenge(1e4)
#' xsum <- condense(bin(x, 1 / 10))
#' h_grid(xsum)
#'
#' y <- runif(1e4)
#' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))
#' h_grid(xysum, n = 10)
h_grid <- function(x, n = 50, max = 20) {
stopifnot(is.condensed(x))
stopifnot(is.numeric(n), length(n) == 1, n > 0)
stopifnot(is.numeric(max), length(max) == 1, max > 0)
gs <- x[group_vars(x)]
widths <- vapply(gs, attr, "width", FUN.VALUE = numeric(1))
hs <- lapply(widths, function(w) w * seq(2, max, length = n))
expand.grid(hs, KEEP.OUT.ATTRS = FALSE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.