R/bagging.R
In isodistrreg: Isotonic Distributional Regression (IDR)

Documented in idrbag

#' Compute IDR predictions with (su)bagging
#'
#' @description Computes IDR predictions with bootstrap aggregating (bagging)
#' or subsample aggregation (subagging).
#' 
#' @usage idrbag(y, X, groups = setNames(rep(1, ncol(X)), colnames(X)), orders =
#'   c("comp" = 1), stoch = "sd", pars = osqpSettings(verbose = FALSE, eps_abs =
#'   1e-5, eps_rel = 1e-5, max_iter = 10000L), progress = TRUE, newdata, 
#'   digits = 3, interpolation = "linear", b, p, replace = FALSE, grid = NULL)
#' 
#' @param newdata \code{data.frame} containing variables with which to
#'   predict. Ordered factor variables are converted to numeric for computation,
#'   so ensure that the factor levels are identical in \code{newdata} and in
#'   \code{X}.
#' @param digits number of decimal places for the predictive CDF.
#' @param interpolation interpolation method for univariate data. Default is 
#'   \code{"linear"}. Any other argument will select midpoint interpolation (see 
#'   'Details' in \code{\link{predict.idrfit}}). Has no effect for multivariate
#'   IDR.
#' @param b number of (su)bagging samples.
#' @param p size of (su)bagging samples relative to training data.
#' @param replace draw samples with (\code{TRUE}, \code{1}) or without
#'     (\code{FALSE}, \code{0}) replacement?
#' @param grid grid on which the predictive CDFs are evaluated. Default are
#'     the unique values of \code{y}.
#' @inheritParams idr
#' 
#' @details 
#' This function draws \code{b} times a random subsample of size
#' \code{ceiling(nrow(X)*p)}) from the training data, fits IDR to each
#' subsample, computes predictions for the new data supplied in \code{newdata},
#' and averages the predictions derived from the \code{b} subsamples. There are
#' no default values for \code{b} and \code{p}.
#' 
#' @return
#' A list of predictions, see \code{\link{predict.idrfit}}.
#' 
#' @export
idrbag <- function(y, X, groups = setNames(rep(1, ncol(X)), colnames(X)),
  orders = c("comp" = 1), stoch = "sd", pars = osqpSettings(verbose = FALSE,
  eps_abs = 1e-5, eps_rel = 1e-5, max_iter = 10000L), progress = TRUE, newdata,
  digits = 3, interpolation = "linear", b, p, 
  replace = FALSE, grid = NULL) {
    
  if (!is.vector(y, mode = "numeric")) 
    stop("'y' must be a numeric vector")
  N <- length(y)
  if (!is.numeric(b) | length(b) != 1 | !(as.integer(b) == b) | b < 1)
    stop("'b' must be a positive integer smaller than length(y)")
  if (!is.numeric(p) | length(p) != 1 | p <= 0 | p >= 1)
    stop("'p' must be a number in (0,1)")
  if (isTRUE(replace == 1)) replace <- TRUE
  if (isTRUE(replace == 0)) replace <- FALSE
  if (isTRUE(progress == 1)) progress <- TRUE
  if (isTRUE(progress == 0)) progress <- FALSE  
  if (!isTRUE(replace) & !isFALSE(replace))
    stop("'replace' must be TRUE/FALSE or 1/0")
  if (!isTRUE(progress) & !isFALSE(progress))
    stop("'progress' must be TRUE/FALSE or 1/0")
  if (is.null(grid)) {
    grid <- sort(unique(y))
  } else {
    if (!is.vector(grid, "numeric"))
      stop("'grid' must be a numeric vector or NULL")
    grid <- sort(grid)
  }
  if (!is.data.frame(newdata))
    stop("'newdata' must be a data.frame")
  
  n <- ceiling(p * N)
  m <- length(grid)
  preds <- matrix(nrow = nrow(newdata), ncol = m, 0)
  if (progress) {
    pb <- utils::txtProgressBar(max = b)
    for (i in seq_len(b)) {
      utils::setTxtProgressBar(pb, i)
      s <- sample(N, n, replace = replace)
      ys <- y[s]
      ysunique <- unique(ys)
      if (length(ysunique) == 1) {
        pos <- findInterval(ysunique, grid)
        if (pos > 0) preds[, 1:pos] <- preds[, 1:pos] + 1
      } else {
      fit <- idr(y = y[s], X = X[s, , drop = FALSE], groups = groups,
        orders = orders, stoch = stoch, pars = pars, progress = FALSE)
      preds <- preds + cdf(predict(object = fit, data = newdata,
        digits = digits, interpolation = interpolation), grid)
      }
    }
    close(pb)
  } else {
    for (i in seq_len(b)) {
      s <- sample(N, n, replace = replace)
      ys <- y[s]
      ysunique <- unique(ys)
      if (length(ysunique) == 1) {
        pos <- findInterval(ysunique, grid)
        if (pos > 0) preds[, 1:pos] <- preds[, 1:pos] + 1
      } else {
      fit <- idr(y = y[s], X = X[s, , drop = FALSE], groups = groups,
        orders = orders, stoch = stoch, pars = pars, progress = FALSE)
      preds <- preds + cdf(predict(object = fit, data = newdata,
        digits = digits, interpolation = interpolation), grid)
      }
    }
  }
  preds <- asplit(round(preds / b, digits), 1)
  preds <- lapply(
    X = preds,
    FUN = function(dat) {
      sel <- c(dat[1] > 0, dat[-1] > dat[-m])
      data.frame(points = grid[sel], cdf = dat[sel])
    }
  )
  structure(preds, class = "idr", incomparables = integer(0))
}