Nothing
#' Prediction intervals with boosted forests
#'
#' Constructs prediction intervals with boosted forests.
#'
#' @param formula Object of class \code{formula} or \code{character} describing
#' the model to fit.
#' @param traindata Training data of class \code{data.frame}.
#' @param testdata Test data of class \code{data.frame}.
#' @param alpha Confidence level. (1 - \code{alpha}) is the desired coverage
#' level. The default is \code{alpha} = 0.05 for the 95% prediction interval.
#' @param calibration Calibration method for finding working level of
#' \code{alpha}, i.e. \eqn{\alpha_w}. Options are \code{"cv"}, \code{"oob"},
#' and \code{FALSE} standing for calibration with cross-validation, OOB
#' calibration, and no calibration, respectively. See below for details. The
#' default is \code{"cv"}.
#' @param coverage_range The allowed target calibration range for coverage level.
#' \eqn{\alpha_w} is selected such that the \code{"cv"} or \code{"oob"}
#' coverage is within \code{coverage_range}.
#' @param numfolds Number of folds for calibration with cross-validation. The
#' default is 5 folds.
#' @param params_ranger List of parameters that should be passed to
#' \code{ranger}. In the default parameter set, \code{num.trees} = 2000,
#' \code{mtry} = \eqn{px/3} (rounded up), \code{min.node.size} = 5,
#' \code{replace} = TRUE. See \code{ranger} for possible parameters.
#' @param oob Should out-of-bag (OOB) predictions and prediction intervals for
#' the training observations be returned?
#'
#' @section Details:
#'
#' \strong{Calibration process}
#'
#' Let (\eqn{1-\alpha}) be the target coverage level. The goal of the
#' calibration is to find the value of \eqn{\alpha_w}, which is the working
#' level of \eqn{\alpha} called by Roy and Larocque (2020), such that the
#' coverage level of the PIs for the training observations is closest to the
#' target coverage level. Two calibration procedures are provided: calibration
#' with cross-validation and out-of-bag (OOB) calibration.
#'
#' \enumerate{
#' \item In calibration with CV, we apply k-fold cross-validation to form
#' prediction intervals for the training observations. In each fold, we split
#' the original training data set into training and testing sets. For the
#' training set, we train a one-step boosted random forest and compute the OOB
#' residuals. Then, for each observation in the testing set, we build a PI.
#' After completing CV, we compute the coverage level with the constructed PIs
#' and if the coverage is not within the acceptable coverage range
#' (\code{coverage_range}), then we apply a grid search to find the
#' \eqn{\alpha_w} such that \eqn{\alpha_w} is the closest to the target
#' \eqn{\alpha} among the set of \eqn{\alpha_w}'s that ensures the target
#' coverage level for the constructed PIs. Once we find the \eqn{\alpha_w}, we
#' use this level to build the PI for the new observations.
#'
#' \item The OOB calibration procedure is proposed by Roy and Larocque (2020)
#' and it is the default calibration procedure of \code{rfpi()}. See details
#' section of \code{rfpi()} for the detailed explanation of this calibration
#' procedure.
#' }
#'
#' In terms of computational time, OOB calibration is faster than calibration
#' with CV. However, empirical results show that OOB calibration may result in
#' conservative prediction intervals. Therefore, the recommended calibration
#' procedure for the PIBF method is calibration with CV.
#'
#'
#' @return A list with the following components:
#'
#' \item{pred_interval}{Prediction intervals for test data. A list containing
#' lower and upper bounds.}
#' \item{test_pred}{Bias-corrected random forest predictions for test data.}
#' \item{alphaw}{Working level of \code{alpha}, i.e. \eqn{\alpha_w}. If
#' \code{calibration = FALSE}, it returns \code{NULL}.}
#' \item{test_response}{If available, test response.}
#' \item{oob_pred_interval}{Out-of-bag (OOB) prediction intervals for train
#' data. Prediction intervals are built with \code{alpha}. If
#' \code{oob = FALSE}, it returns \code{NULL}.}
#' \item{oob_pred}{Bias-corrected out-of-bag (OOB) predictions for train data.
#' If \code{oob = FALSE}, it returns \code{NULL}.}
#' \item{train_response}{Train response.}
#'
#' @references Alakus, C., Larocque, D., & Labbe, A. (2022). RFpredInterval: An
#' R Package for Prediction Intervals with Random Forests and Boosted Forests.
#' R JOURNAL, 14(1), 300-319.
#' @references Roy, M. H., & Larocque, D. (2020). Prediction intervals with
#' random forests. Statistical methods in medical research, 29(1), 205-229.
#' doi:10.1177/0962280219829885.
#'
#' @examples
#' ## load example data
#' data(BostonHousing, package = "RFpredInterval")
#' set.seed(2345)
#'
#' ## define train/test split
#' testindex <- 1:10
#' trainindex <- sample(11:nrow(BostonHousing), size = 100, replace = FALSE)
#' traindata <- BostonHousing[trainindex, ]
#' testdata <- BostonHousing[testindex, ]
#' px <- ncol(BostonHousing) - 1
#'
#' ## construct 95% PI with "cv" calibration using 5-folds
#' out <- pibf(formula = medv ~ ., traindata = traindata,
#' testdata = testdata, calibration = "cv", numfolds = 5,
#' params_ranger = list(num.trees = 40))
#'
#' ## get the PI for the first observation in the testdata
#' c(out$pred_interval$lower[1], out$pred_interval$upper[1])
#'
#' ## get the bias-corrected random forest predictions for testdata
#' out$test_pred
#'
#' ## construct 90% PI with "oob" calibration
#' out2 <- pibf(formula = medv ~ ., traindata = traindata,
#' testdata = testdata, alpha = 0.1, calibration = "oob",
#' coverage_range = c(0.89,91), params_ranger = list(num.trees = 40))
#'
#' ## get the PI for the testdata
#' out2$pred_interval
#'
#' ## get the working level of alpha (alphaw)
#' out2$alphaw
#'
#' @seealso \code{\link{piall}} \code{\link{rfpi}}
#' \code{\link{print.rfpredinterval}}
pibf <- function(formula,
traindata,
testdata,
alpha = 0.05,
calibration = c("cv", "oob", FALSE),
coverage_range = c(1-alpha-0.005, 1-alpha+0.005),
numfolds = 5,
params_ranger = list(num.trees = 2000, mtry = ceiling(px/3),
min.node.size = 5, replace = TRUE),
oob = FALSE)
{
## make formula object
formula <- as.formula(formula)
## initial checks for data sets
if (is.null(traindata)) {stop("'traindata' is missing.")}
if (is.null(testdata)) {stop("'testdata' is missing.")}
if (!is.data.frame(traindata)) {stop("'traindata' must be a data frame.")}
if (!is.data.frame(testdata)) {stop("'testdata' must be a data frame.")}
traindata <- as.data.frame(traindata)
testdata <- as.data.frame(testdata)
## verify key options
calibration <- match.arg(as.character(calibration), c("cv", "oob", FALSE))
oob <- match.arg(as.character(oob), c(FALSE, TRUE))
## check the dimension of coverage_range
if (length(as.numeric(coverage_range)) != 2) {
stop("'coverage_range' should be a numeric vector of length 2.")
}
## sort coverage_range
coverage_range <- sort(coverage_range)
## check if target coverage level is within coverage_range
if ( ((1-alpha) < coverage_range[1]) | ((1-alpha) > coverage_range[2]) ) {
stop("1-alpha is not within the limits of 'coverage_range'.")
}
## get variable names
all.names <- all.vars(formula, max.names = 1e7)
yvar.names <- all.vars(formula(paste(as.character(formula)[2], "~ .")), max.names = 1e7)
yvar.names <- yvar.names[-length(yvar.names)]
py <- length(yvar.names)
if (length(all.names) <= py) {
stop("formula is misspecified: total number of variables does not exceed total number of y-variables")
}
if (all.names[py + 1] == ".") {
if (py == 0) {
xvar.names <- names(traindata)
} else {
xvar.names <- names(traindata)[!is.element(names(traindata), all.names[1:py])]
}
} else {
if(py == 0) {
xvar.names <- all.names
} else {
xvar.names <- all.names[-c(1:py)]
}
not.specified <- !is.element(xvar.names, names(traindata))
if (sum(not.specified) > 0) {
stop("formula is misspecified, object ", xvar.names[not.specified], " not found")
}
}
xvar <- traindata[, xvar.names, drop = FALSE]
yvar <- traindata[, yvar.names, drop = FALSE]
traindata <- cbind(xvar, yvar)
yvar <- yvar[, yvar.names]
## get sample size and px
ntrain <- nrow(traindata)
px <- ncol(xvar)
## set parameters for ranger
if (is.null(params_ranger)) {
params_ranger <- list()
}
param_names <- names(params_ranger)
if (!("mtry" %in% param_names)) {params_ranger[["mtry"]] <- ceiling(px/3)}
if (!("num.trees" %in% param_names)) {params_ranger[["num.trees"]] <- 2000}
if (!("min.node.size" %in% param_names)) {params_ranger[["min.node.size"]] <- 5}
if (!("replace" %in% param_names)) {params_ranger[["replace"]] <- TRUE}
params_ranger[["formula"]] <- formula
## train first RF: mean RF
params_ranger[["data"]] <- traindata
params_ranger[["keep.inbag"]] <- FALSE
rf.mean <- do.call(ranger::ranger, params_ranger)
## get oob mean predictions
mean.oob <- rf.mean$predictions
if (sum(is.nan(mean.oob)) > 0) {
stop("Some of the OOB predictions are NaN. Increase the number of trees, 'num.trees' in params_ranger.")
}
## compute oob residuals
res <- yvar - as.numeric(mean.oob)
## train second RF: residual RF
traindata.res <- traindata
traindata.res[, yvar.names] <- res
params_ranger[["data"]] <- traindata.res
params_ranger[["keep.inbag"]] <- TRUE
rf.res <- do.call(ranger::ranger, params_ranger)
## update the oob mean predictions with oob bias predictions
bias.oob <- rf.res$predictions
mean.oob <- mean.oob + bias.oob
## compute new residuals after bias correction
res <- yvar - as.numeric(mean.oob)
## filter the test data based on the formula
testdata <- testdata[, is.element(names(testdata),
c(yvar.names, xvar.names)), drop = FALSE]
## get bias-corrected predictions for test data
mean.test <- predict(rf.mean, data = testdata)$predictions
bias.test <- predict(rf.res, data = testdata)$predictions
mean.test <- mean.test + bias.test
## build BOP for test data using the second RF
ntree <- rf.res$num.trees
mem.train <- predict(rf.res, data = traindata, type = "terminalNodes")$predictions
mem.test <- predict(rf.res, data = testdata, type = "terminalNodes")$predictions
inbag <- matrix(unlist(rf.res$inbag.counts, use.names = FALSE), ncol = ntree, byrow = FALSE)
BOPtest <- buildtestbop(mem.train = mem.train, mem.test = mem.test, inbag = inbag, residual = res)
## calibration process
if (calibration == FALSE) {
alphaw <- alpha
} else if (calibration == "cv") {
alphaw <- calibrate_cv(coverage_range, alpha, traindata, numfolds, params_ranger, yvar.names)
} else if (calibration == "oob") {
BOPoob <- buildoobbop(mem.train, inbag, residual = res)
if (sum(sapply(BOPoob, is.null)) > 0) {
stop("Some observations have empty BOP. Increase the number of trees, 'num.trees' in params_ranger.")
}
alphaw <- calibrate_oob(coverage_range, alpha, BOPoob, mean.oob, yvar)
}
## PI construction for test data
PI.obj <- formpi(alpha = alphaw,
BOP = BOPtest,
mean = mean.test,
response = NULL)
## store PI information
pred.interval <- list(lower = PI.obj$lower,
upper = PI.obj$upper)
## PI construction for train data with OOB predictions and OOB-BOP
if (oob) {
if (calibration != "oob") {
BOPoob <- buildoobbop(mem.train, inbag, residual = res)
}
PI.obj <- formpi(alpha = alpha,
BOP = BOPoob,
mean = mean.oob,
response = NULL)
oob.pred.interval <- list(lower = PI.obj$lower,
upper = PI.obj$upper)
}
## return list
out <- list(pred_interval = pred.interval,
test_pred = mean.test,
alphaw = if(calibration == FALSE){NULL}else{alphaw},
test_response = if(is.element(yvar.names, names(testdata))){testdata[, yvar.names]}else{NULL},
oob_pred_interval = if(oob){oob.pred.interval}else{NULL},
oob_pred = if(oob){mean.oob}else{NULL},
train_response = yvar)
class(out) <- c("rfpredinterval", "pibf")
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.