#' In-Sample Error
#'
#' @description
#' This function computes training error, in-sample error, and optimism in an MC simulation setting.
#' @param data MC data set generated by \code{\link{mc_data}}.
#' @param ny the number of new responses drawn at each x point. Must be a single number that is at least 1.
#' @param fit True model function with \code{x}-named argument.
#' @param rand Random sample generator function for error term. By default, \link[stats]{rnorm}
#' @param mcname column name of the MC sample. By default, \code{"mc"}.
#' @param xname column name of the data. By default, \code{"x"}.
#' @param yname column name of the response. By default, \code{"y"}.
#' @param fitname column name of the true fit. By default, \code{"fx"}.
#' @param pred_name column name of the predicted values. By default, \code{"pred"}.
#' @param insample_name prefix of the new-response columns used for the in-sample error.
#' The generated columns are named \code{insample_name1}, ..., \code{insample_name<ny>}.
#' @param error Choice of loss function. See \code{\link{loss}}.
#' @param distribution return the error for each MC sample? \code{FALSE} by default. If \code{TRUE}, it gives the \code{data.table}.
#' @param mod Model function.
#' @param formula an object of class \link[stats]{formula}.
#' @param ... Additional arguments for \code{mod}. If you want an argument for \code{rand}, define one.
#' @return
#' Training error, in-sample error, and optimism.
#' If \code{distribution = FALSE}, a one-row \code{data.table} averaged over all MC samples;
#' if \code{distribution = TRUE}, a \code{data.table} with one row per MC sample.
#' @details
#' In-sample error differs from Expected test error in that it is computed in the same predictor values.
#' Instead, it uses new response values at each predictor point.
#' \deqn{Err_{in} = \frac{1}{N} \sum_{i = 1}^N E_{y_0} [L(Y_i^{(0)}, \hat{f}(x_i)) \mid T]}
#' Optimism is the difference between the in-sample error and the training error.
#'
#' The new response columns are added with \code{:=}, so a \code{data.table} passed as
#' \code{data} may be modified by reference.
#' @references Hastie, T., Tibshirani, R., & Friedman, J. (2001). \emph{The Elements of Statistical Learning}. New York, NY, USA: Springer New York Inc.
#' @export
compute_insample <- function(data, ny, fit, rand = stats::rnorm,
                             mcname = "mc", xname = "x", yname = "y", fitname = "fx", pred_name = "pred", insample_name = "y",
                             error = c("squared", "absolute"), distribution = FALSE, mod, formula, ...) {
  error <- match.arg(error)
  # 1:ny would silently count down (1, 0) for ny = 0, so reject it up front.
  stopifnot(is.numeric(ny), length(ny) == 1L, ny >= 1)
  x_sym <- sym(xname)
  y_sym <- sym(yname)
  pred_sym <- sym(pred_name)
  # Generate the training response only when the data does not carry one yet.
  if (!(yname %in% names(data))) {
    data <- gen_y(data, fit, rand, mcname, xname, yname, fitname, fit_col = FALSE)
  }
  # Draw ny fresh responses at the same predictor values.
  cols <- paste0(insample_name, seq_len(ny))
  for (col in cols) {
    data[, (col) := fit(eval(x_sym)) + rand(.N)]
  }
  data <- pred_dt(data = data, mcname = mcname, mod = mod, formula = formula, pred_name = pred_name, ...)
  # Melt only the freshly drawn responses. Without measure.vars every other
  # non-id column (for instance an existing true-fit column) would be treated
  # as a new response and distort the in-sample error.
  long <- melt(
    data,
    id.vars = c(xname, yname, mcname, pred_name),
    measure.vars = cols
  )
  # Loss per MC sample and per new-response column.
  per_draw <- long[
    ,
    .(
      training = loss(eval(y_sym), eval(pred_sym), error),
      insample = loss(value, eval(pred_sym), error)
    ),
    by = c(mcname, "variable")
  ]
  per_draw[, optimism := insample - training]
  # Average over the new-response columns within each MC sample.
  per_mc <- per_draw[
    ,
    lapply(.SD, mean),
    by = mcname,
    .SDcols = -"variable"
  ]
  if (distribution) {
    per_mc
  } else {
    # Average over MC samples, dropping the MC index column.
    per_mc[, lapply(.SD, mean), .SDcols = -mcname]
  }
}
#' Generalization Error
#'
#' @description
#' This function computes expected test error in a Monte Carlo simulation setting.
#' @param data MC data set generated by \code{\link{mc_data}}.
#' @param randx Random sample generator function for \code{x}. It is called as \code{randx(n = testn)}.
#' @param testn Test sample size
#' @param fit True model function with \code{x}-named argument.
#' @param test_set You can provide an independent test set instead of using \code{randx} and \code{testn}.
#' @param randy Random sample generator function for error term.
#' If \code{NULL} (the default), \link[stats]{rnorm} is used.
#' @param mcname column name of the MC sample. By default, \code{"mc"}.
#' @param xname column name of the data. By default, \code{"x"}.
#' @param yname column name of the response. By default, \code{"y"}.
#' @param error Choice of loss function. See \code{\link{loss}}.
#' @param distribution return the error for each MC sample? \code{FALSE} by default. If \code{TRUE}, it gives the \code{data.table}.
#' @param mod Model function.
#' @param formula an object of class \link[stats]{formula}.
#' @param ... Additional arguments for \code{mod}. If you want an argument for \code{randx} or \code{randy}, define one.
#' @return
#' Expected test error.
#' If \code{distribution = FALSE}, a single number (the mean over MC samples);
#' if \code{distribution = TRUE}, a \code{data.table} with one \code{error} value per MC sample.
#' @details
#' Given MC samples, compute test error using independent test set and average.
#'
#' When \code{data} has no response column, one is added with \code{:=}, so a
#' \code{data.table} passed as \code{data} may be modified by reference.
#' @references Hastie, T., Tibshirani, R., & Friedman, J. (2001). \emph{The Elements of Statistical Learning}. New York, NY, USA: Springer New York Inc.
#' @export
compute_epe <- function(data, randx, testn, fit, test_set, randy = NULL,
                        mcname = "mc", xname = "x", yname = "y",
                        error = c("squared", "absolute"), distribution = FALSE, mod, formula, ...) {
  # Keep the validated loss choice under a name that cannot be confused with
  # the `error` result column created below.
  loss_type <- match.arg(error)
  # A NULL generator cannot be called; fall back to standard normal noise.
  if (is.null(randy)) {
    randy <- stats::rnorm
  }
  x_sym <- sym(xname)
  if (missing(test_set)) {
    # Build the table with its x column directly instead of adding the column
    # to a zero-column data.table() with `:=`.
    test_set <- do.call(
      data.table,
      stats::setNames(list(randx(n = testn)), xname)
    )
    test_set[, (yname) := fit(eval(x_sym)) + randy(.N)]
  }
  # Generate the training response per MC sample when it is not present yet.
  if (!(yname %in% names(data))) {
    data <- data[
      ,
      (yname) := fit(eval(x_sym)) + randy(.N),
      by = mcname
    ]
  }
  # Fit the model on each MC sample and predict on the common test set.
  preds <- data[
    ,
    .(pred = predict(mod(formula, data = .SD, ...), test_set)),
    by = mcname
  ]
  # Attach the test responses next to the predictions of every MC sample.
  test_y <- test_set[[yname]]
  preds[, (yname) := test_y, by = mcname]
  # Pass the chosen loss through; previously `error` was validated but never
  # forwarded, so loss() always used its own default.
  result <- preds[
    ,
    .(error = loss(get(yname), pred, loss_type)),
    by = mcname
  ]
  if (distribution) {
    result
  } else {
    mean(result[["error"]])
  }
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.