youngtool: Research tools for academic writing

Documented in compute_insample

#' In-Sample Error
#'
#' @description
#' This function computes the training error, the in-sample error, and the optimism in a Monte Carlo (MC) simulation setting.
#' @param data MC data set generated by \code{\link{mc_data}}. Columns are added with \code{:=}, so \code{data} is expected to be a \code{data.table}; note that the replicate response columns are therefore written into \code{data} by reference.
#' @param ny the number of new responses generated at each x point. A single number greater than or equal to 1.
#' @param fit True model function with \code{x}-named argument.
#' @param rand Random sample generator function for the error term. It is called with the number of rows as its only argument, e.g. \link[stats]{rnorm}. This function does not define a default, so it must be supplied.
#' @param mcname column name of the MC sample. By default, \code{"mc"}.
#' @param xname column name of the data. By default, \code{"x"}.
#' @param yname column name of the response. By default, \code{"y"}. If \code{data} has no such column, it is generated with \code{gen_y}.
#' @param fitname column name of the true fit. By default, \code{"fx"}.
#' @param pred_name column name of the predicted values. By default, \code{"pred"}.
#' @param insample_name prefix of the replicate response columns used when computing the in-sample error. The columns are named \code{paste0(insample_name, seq_len(ny))}.
#' @param error Choice of loss function. See \code{\link{loss}}.
#' @param distribution return the error for each MC sample? \code{FALSE} by default. If \code{TRUE}, it gives the \code{data.table}.
#' @param mod Model function.
#' @param formula an object of class \link[stats]{formula}.
#' @param ... Additional arguments for \code{mod}. If you want an argument for \code{rand}, define one.
#' @return
#' If \code{distribution = FALSE}, a one-row table with the columns \code{training}, \code{insample} and \code{optimism}, averaged over the MC samples.
#' If \code{distribution = TRUE}, the same three columns for each MC sample (one row per value of \code{mcname}).
#' @details
#' In-sample error differs from Expected test error in that it is computed in the same predictor values.
#' Instead, it uses new response values at each predictor point.
#' \deqn{Err_{in} = \frac{1}{N} \sum_{i = 1}^N E_{y_0} [L(Y_i^{(0)}, \hat{f}(x_i)) \mid T]}
#' Optimism is the difference between the insample error and the training error.
#'
#' Only the \code{ny} replicate columns created by this call are treated as new responses;
#' any other column of \code{data} is ignored when the errors are computed.
#' @references Hastie, T., Tibshirani, R., & Friedman, J. (2001). \emph{The Elements of Statistical Learning}. New York, NY, USA: Springer New York Inc..
#' @export
compute_insample <- function(data, ny, fit, rand,
                             mcname = "mc", xname = "x", yname = "y",
                             fitname = "fx", pred_name = "pred",
                             insample_name = "y",
                             error = c("squared", "absolute"),
                             distribution = FALSE, mod, formula, ...) {
  error <- match.arg(error)
  # Fail early: `1:ny` would silently produce c(1, 0) for ny = 0 and create
  # two bogus replicate columns.
  if (!is.numeric(ny) || length(ny) != 1L || is.na(ny) || ny < 1) {
    stop("`ny` must be a single number greater than or equal to 1.",
         call. = FALSE)
  }
  x_sym <- sym(xname)
  y_sym <- sym(yname)
  pred_sym <- sym(pred_name)
  # Generate the training response only when the data does not carry one yet.
  if (!(yname %in% names(data))) {
    data <- gen_y(data, fit, rand, mcname, xname, yname, fitname,
                  fit_col = FALSE)
  }
  # One fresh response column per replicate, drawn at the same x values.
  cols <- paste0(insample_name, seq_len(ny))
  for (col in cols) {
    data[,
         (col) := fit(eval(x_sym)) + rand(.N)]
  }
  data <- pred_dt(data = data, mcname = mcname, mod = mod, formula = formula,
                  pred_name = pred_name, ...)
  data <-
    data %>%
    # measure.vars is explicit so that unrelated columns (a true-fit column,
    # replicate columns left over from an earlier call, ...) are never scored
    # as if they were new responses.
    melt(id.vars = c(xname, yname, mcname, pred_name), measure.vars = cols) %>%
    .[,
      .(
        training = loss(eval(y_sym), eval(pred_sym), error),
        insample = loss(value, eval(pred_sym), error)
      ),
      by = c(mcname, "variable")] %>%
    .[,
      optimism := insample - training] %>%
    # Average over the replicate columns within each MC sample.
    .[,
      lapply(.SD, mean),
      by = mcname,
      .SDcols = -"variable"]
  if (distribution) {
    data
  } else {
    # Average over the MC samples.
    data %>%
      .[,
        lapply(.SD, mean),
        .SDcols = -mcname]
  }
}

#' Generalization Error
#'
#' @description
#' This function computes the expected test error in a Monte Carlo (MC) simulation setting.
#' @param data MC data set generated by \code{\link{mc_data}}. If it has no \code{yname} column, one is added with \code{:=} (i.e. by reference, for a \code{data.table}).
#' @param randx Random sample generator function for \code{x}. It is called as \code{randx(n = testn)}.
#' @param testn Test sample size
#' @param fit True model function with \code{x}-named argument.
#' @param test_set You can provide an independent test set instead of using \code{randx} and \code{testn}. Its \code{yname} column is used to score the predictions.
#' @param randy Random sample generator function for error term. \code{NULL} by default, but a function is required whenever responses have to be generated, i.e. when \code{test_set} is not supplied or \code{data} has no \code{yname} column.
#' @param mcname column name of the MC sample. By default, \code{"mc"}.
#' @param xname column name of the data. By default, \code{"x"}.
#' @param yname column name of the response. By default, \code{"y"}.
#' @param error Choice of loss function. See \code{\link{loss}}.
#' @param distribution return the error for each MC sample? \code{FALSE} by default. If \code{TRUE}, it gives the \code{data.table}.
#' @param mod Model function.
#' @param formula an object of class \link[stats]{formula}.
#' @param ... Additional arguments for \code{mod}. If you want an argument for \code{randx} or \code{randy}, define one.
#' @return
#' Expected test error: if \code{distribution = FALSE}, the mean of the per-MC-sample test errors;
#' if \code{distribution = TRUE}, a table with one row per MC sample and the columns \code{mcname} and \code{error}.
#' @details
#' Given MC samples, compute test error using independent test set and average.
#' The model is fitted separately on each MC sample and every fit is evaluated on the same test set.
#' @references Hastie, T., Tibshirani, R., & Friedman, J. (2001). \emph{The Elements of Statistical Learning}. New York, NY, USA: Springer New York Inc..
#' @export
compute_epe <- function(data, randx, testn, fit, test_set, randy = NULL,
                        mcname = "mc", xname = "x", yname = "y",
                        error = c("squared", "absolute"),
                        distribution = FALSE, mod, formula, ...) {
  error <- match.arg(error)
  # `randy` defaults to NULL but is called whenever responses must be
  # simulated; report that clearly instead of "attempt to apply non-function".
  needs_noise <- missing(test_set) || !(yname %in% names(data))
  if (needs_noise && !is.function(randy)) {
    stop(
      "`randy` must be a function when `test_set` is not supplied ",
      "or `data` has no response column.",
      call. = FALSE
    )
  }
  x_sym <- sym(xname)
  y_sym <- sym(yname)
  # Simulate an independent test set when the caller did not provide one.
  if (missing(test_set)) {
    test_set <- data.table()
    test_set <-
      test_set %>%
      .[,
        (xname) := randx(n = testn)] %>%
      .[,
        (yname) := fit(eval(x_sym)) + randy(.N)]
  }
  # Simulate the training responses within each MC sample if they are absent.
  if (!(yname %in% names(data))) {
    data <-
      data %>%
      .[,
        (yname) := fit(eval(x_sym)) + randy(.N),
        by = mcname]
  }
  data <-
    data %>%
    # Fit on each MC sample, predict on the common test set.
    .[,
      .(pred = mod(formula, data = .SD, ...) %>%
          predict(test_set)),
      by = mcname] %>%
    # Attach the test responses next to each MC sample's predictions.
    .[,
      (yname) := test_set[, eval(y_sym)],
      by = mcname] %>%
    # Pass the chosen loss on to loss(); it used to be dropped, so
    # error = "absolute" had no effect.
    .[,
      .(error = loss(eval(y_sym), pred, error)),
      by = mcname]
  if (distribution) {
    data
  } else {
    data %>%
      .[,
        error] %>%
      mean()
  }
}