R/rmse.r

Defines functions rmse_cvs rmse_cv

Documented in rmse_cv rmse_cvs

#' Estimate smoothing RMSE using leave-one-out cross-validation.
#'
#' \code{rmse_cv} computes the leave-one-out RMSE for a single vector of
#' bandwidths, \code{rmse_cvs} computes for a multiple vectors of bandwidths,
#' stored as a data frame.
#'
#' @param x condensed summary table
#' @param h,hs for \code{rmse_cv}, a vector of bandwidths; for \code{rmse_cv}
#'    a data frame of bandwidths, as generated by \code{\link{h_grid}}.
#' @param var variable to smooth
#' @param ... other variables passed on to \code{\link{smooth}}
#' @family bandwidth estimation functions
#' @export
#' @examples
#' \donttest{
#' set.seed(1014)
#' # 1d -----------------------------
#' x <- rchallenge(1e4)
#' xsum <- condense(bin(x, 1 / 10))
#' cvs <- rmse_cvs(xsum)
#'
#' if (require("ggplot2")) {
#' autoplot(xsum)
#' qplot(x, err, data = cvs, geom = "line")
#' xsmu <- smooth(xsum, 1.3)
#' autoplot(xsmu)
#' autoplot(peel(xsmu))
#' }
#'
#' # 2d -----------------------------
#' y <- runif(1e4)
#' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))
#' cvs <- rmse_cvs(xysum, h_grid(xysum, 10))
#' if (require("ggplot2")) {
#' qplot(x, y, data = cvs, size = err)
#' }
#' }
rmse_cvs <- function(x, hs = h_grid(x), ...) {
  rmse_1 <- function(i) {
    rmse_cv(x, as.numeric(hs[i, ]), ...)
  }
  err <- vapply(seq_len(nrow(hs)), rmse_1, numeric(1))
  data.frame(hs, err)
}

#' @rdname rmse_cvs
#' @export
rmse_cv <- function(x, h, var = summary_vars(x)[1], ...) {
  # can't smooth missing values, so drop.
  x <- x[complete.cases(x), , drop = FALSE]
  gvars <- group_vars(x)

  pred_error <- function(i) {
    out <- as.matrix(x[i, gvars, drop = FALSE])
    smu <- smooth(x[-i, , drop = FALSE], grid = out, h = h, var = var, ...)
    smu[[var]] - x[[var]][i]
  }
  err <- vapply(seq_len(nrow(x)), pred_error, numeric(1))
  sqrt(mean(err ^ 2, na.rm = TRUE))
}
hadley/bigvis documentation built on May 17, 2019, 9:45 a.m.