Nothing
#' Prediction intervals with random forests
#'
#' Constructs prediction intervals with 15 distinct variations proposed by Roy
#' and Larocque (2020). The variations include two aspects: The method used to
#' build the forest and the method used to build the prediction interval. There
#' are three methods to build the forest, (i) least-squares (LS), (ii) L1 and
#' (iii) shortest prediction interval (SPI) from the CART paradigm. There are
#' five methods for constructing prediction intervals, classical method,
#' shortest prediction interval, quantile method, highest density region, and
#' contiguous HDR.
#'
#' @param formula Object of class \code{formula} or \code{character} describing
#' the model to fit.
#' @param traindata Training data of class \code{data.frame}.
#' @param testdata Test data of class \code{data.frame}.
#' @param alpha Confidence level. (1 - \code{alpha}) is the desired coverage
#' level. The default is \code{alpha} = 0.05 for the 95% prediction interval.
#' @param split_rule Split rule for building a forest. Options are \code{"ls"}
#' for CART with least-squares (LS) splitting rule, \code{"l1"} for CART with
#' L1 splitting rule, \code{"spi"} for CART with shortest prediction interval
#' (SPI) splitting rule. The default is \code{"ls"}.
#' @param pi_method Methods for building a prediction interval. Options are
#' \code{"lm"} for classical method, \code{"spi"} for shortest prediction
#' interval, \code{"quant"} for quantile method, \code{"hdr"} for highest
#' density region, and \code{"chdr"} for contiguous HDR. The default is to use
#' all methods for PI construction. Single method or a subset of methods can
#' be applied.
#' @param calibration Apply OOB calibration for finding working level of
#' \code{alpha}, i.e. \eqn{\alpha_w}. See below for details. The default is
#' \code{TRUE}.
#' @param rf_package Random forest package that can be used for RF training.
#' Options are \code{"rfsrc"} for \code{randomForestSRC} and \code{"ranger"}
#' for \code{ranger} packages. Split rule \code{"ls"} can be used with both
#' packages. However, \code{"l1"} and \code{"spi"} split rules can only be
#' used with \code{"rfsrc"}. The default is \code{"rfsrc"}.
#' @param params_rfsrc List of parameters that should be passed to
#' \code{randomForestSRC}. In the default parameter set, \code{ntree} = 2000,
#' \code{mtry} = \eqn{px/3} (rounded up), \code{nodesize} = 5,
#' \code{samptype} = "swr". See \code{randomForestSRC} for possible
#' parameters.
#' @param params_ranger List of parameters that should be passed to
#' \code{ranger}. In the default parameter set, \code{num.trees} = 2000,
#' \code{mtry} = \eqn{px/3} (rounded up), \code{min.node.size} = 5,
#' \code{replace} = TRUE. See \code{ranger} for possible parameters.
#' @param params_calib List of parameters for calibration procedure.
#' \code{range} is the allowed target calibration range for coverage level.
#' The value that provides a coverage level within the range is chosen as
#' \eqn{\alpha_w}. \code{start} is the initial coverage level to start
#' calibration procedure. \code{step} is the coverage step size for each
#' calibration iteration. \code{refine} is the gradual decrease in \code{step}
#' value when close to target coverage level, the default is \code{TRUE} which
#' allows gradual decrease.
#' @param oob Should out-of-bag (OOB) predictions and prediction intervals for
#' the training observations be returned?
#'
#' @section Details:
#'
#' \strong{Calibration process}
#'
#' The calibration procedure uses the "Bag of Observations for Prediction"
#' (BOP) idea. BOP for a new observation is built with the set inbag
#' observations that are in the same terminal nodes as the new observation.
#' The calibration procedure uses the BOPs constructed for the training
#' observations. BOP for a training observation is built using only the trees
#' where this training observation is out-of-bag (OOB).
#'
#' Let (\eqn{1-\alpha}) be the target coverage level. The goal of the
#' calibration is to find the value of \eqn{\alpha_w}, which is the working
#' level of \eqn{\alpha} called by Roy and Larocque (2020), such that the
#' coverage level of the prediction intervals for the training observations is
#' closest to the target coverage level. The idea is to find the value of
#' \eqn{\alpha_w} using the OOB-BOPs. Once found, (\eqn{1-\alpha_w}) becomes
#' the level used to build the prediction intervals for the new observations.
#'
#'
#' @return A list with the following components:
#'
#' \item{lm_interval}{Prediction intervals for test data with the classical
#' method. A list containing lower and upper bounds.}
#' \item{spi_interval}{Prediction intervals for test data with SPI method. A
#' list containing lower and upper bounds.}
#' \item{hdr_interval}{Prediction intervals for test data with HDR method. A
#' list containing lower and upper bounds of prediction interval for each test
#' observation. There may be multiple PIs for a single observation.}
#' \item{chdr_interval}{Prediction intervals for test data with contiguous HDR
#' method. A list containing lower and upper bounds.}
#' \item{quant_interval}{Prediction intervals for test data with quantiles
#' method. A list containing lower and upper bounds.}
#' \item{test_pred}{Random forest predictions for test data.}
#' \item{test_response}{If available, test response.}
#' \item{alphaw}{Working level of \code{alpha}, i.e. \eqn{\alpha_w}. A numeric
#' array for the PI methods entered with \code{pi_method}. If
#' \code{calibration = FALSE}, it returns \code{NULL}.}
#' \item{split_rule}{Split rule used for building the random forest.}
#' \item{rf_package}{Random forest package that was used for RF training.}
#' \item{oob_pred_interval}{Out-of-bag (OOB) prediction intervals for train
#' data. Prediction intervals are built with \code{alpha}. If
#' \code{oob = FALSE}, it returns \code{NULL}.}
#' \item{oob_pred}{Out-of-bag (OOB) predictions for train data.
#' If \code{oob = FALSE}, it returns \code{NULL}.}
#' \item{train_response}{Train response.}
#'
#' @references Roy, M. H., & Larocque, D. (2020). Prediction intervals with
#' random forests. Statistical methods in medical research, 29(1), 205-229.
#' doi:10.1177/0962280219829885.
#'
#' @examples
#' ## load example data
#' data(BostonHousing, package = "RFpredInterval")
#' set.seed(2345)
#'
#' ## define train/test split
#' testindex <- 1:10
#' trainindex <- sample(11:nrow(BostonHousing), size = 100, replace = FALSE)
#' traindata <- BostonHousing[trainindex, ]
#' testdata <- BostonHousing[testindex, ]
#' px <- ncol(BostonHousing) - 1
#'
#' ## contruct 90% PI with "l1" split rule and "spi" PI method with calibration
#' out <- rfpi(formula = medv ~ ., traindata = traindata,
#' testdata = testdata, alpha = 0.1, calibration = TRUE,
#' split_rule = "l1", pi_method = "spi", params_rfsrc = list(ntree = 50),
#' params_calib = list(range = c(0.89, 0.91), start = 0.9, step = 0.01,
#' refine = TRUE))
#'
#' ## get the PI with "spi" method for first observation in the testdata
#' c(out$spi_interval$lower[1], out$spi_interval$upper[1])
#'
#' ## get the random forest predictions for testdata
#' out$test_pred
#'
#' ## get the working level of alpha (alphaw)
#' out$alphaw
#'
#' ## contruct 95% PI with "ls" split rule, "lm" and "quant" PI methods
#' ## with calibration and use "ranger" package for RF training
#' out2 <- rfpi(formula = medv ~ ., traindata = traindata,
#' testdata = testdata, split_rule = "ls", pi_method = c("lm", "quant"),
#' rf_package = "ranger", params_ranger = list(num.trees = 50))
#'
#' ## get the PI with "quant" method for the testdata
#' cbind(out2$quant_interval$lower, out2$quant_interval$upper)
#'
#' @seealso \code{\link{piall}} \code{\link{pibf}}
#' \code{\link{print.rfpredinterval}}
rfpi <- function(formula,
traindata,
testdata,
alpha = 0.05,
split_rule = c("ls", "l1", "spi"),
pi_method = c("lm", "spi", "quant", "hdr", "chdr"),
calibration = TRUE,
rf_package = c("rfsrc", "ranger"),
params_rfsrc = list(ntree = 2000, mtry = ceiling(px/3),
nodesize = 5, samptype = "swr"),
params_ranger = list(num.trees = 2000, mtry = ceiling(px/3),
min.node.size = 5, replace = TRUE),
params_calib = list(range = c(1-alpha-0.005, 1-alpha+0.005),
start = (1-alpha), step = 0.01, refine = TRUE),
oob = FALSE)
{
## make formula object
formula <- as.formula(formula)
## initial checks for data sets
if (is.null(traindata)) {stop("'traindata' is missing.")}
if (is.null(testdata)) {stop("'testdata' is missing.")}
if (!is.data.frame(traindata)) {stop("'traindata' must be a data frame.")}
if (!is.data.frame(testdata)) {stop("'testdata' must be a data frame.")}
traindata <- as.data.frame(traindata)
testdata <- as.data.frame(testdata)
## verify key options
rf_package <- match.arg(rf_package, c("rfsrc", "ranger"))
split_rule <- match.arg(split_rule, c("ls", "l1", "spi"))
pi_method <- match.arg(pi_method, c("lm", "spi", "quant", "hdr", "chdr"), several.ok = TRUE)
set_pi_method <- pi_method
oob <- match.arg(as.character(oob), c(FALSE, TRUE))
## check for split_rule - package consistency
if ((split_rule == "l1" || split_rule == "spi") & rf_package == "ranger") {
stop(paste0(split_rule," split rule cannot be applied with ranger package. Change rf_package to rfsrc."))
}
## get variable names
all.names <- all.vars(formula, max.names = 1e7)
yvar.names <- all.vars(formula(paste(as.character(formula)[2], "~ .")), max.names = 1e7)
yvar.names <- yvar.names[-length(yvar.names)]
py <- length(yvar.names)
if (length(all.names) <= py) {
stop("formula is misspecified: total number of variables does not exceed total number of y-variables")
}
if (all.names[py + 1] == ".") {
if (py == 0) {
xvar.names <- names(traindata)
} else {
xvar.names <- names(traindata)[!is.element(names(traindata), all.names[1:py])]
}
} else {
if(py == 0) {
xvar.names <- all.names
} else {
xvar.names <- all.names[-c(1:py)]
}
not.specified <- !is.element(xvar.names, names(traindata))
if (sum(not.specified) > 0) {
stop("formula is misspecified, object ", xvar.names[not.specified], " not found")
}
}
xvar <- traindata[, xvar.names, drop = FALSE]
yvar <- traindata[, yvar.names, drop = FALSE]
traindata <- cbind(xvar, yvar)
yvar <- yvar[, yvar.names]
## get sample size and px
ntrain <- nrow(traindata)
px <- ncol(xvar)
## filter the test data based on the formula
testdata <- testdata[, is.element(names(testdata),
c(yvar.names, xvar.names)), drop = FALSE]
if (rf_package == "rfsrc") {
## set parameters for rfsrc
if (is.null(params_rfsrc)) {
params_rfsrc <- list()
}
param_names <- names(params_rfsrc)
if (!("mtry" %in% param_names)) {params_rfsrc[["mtry"]] <- ceiling(px/3)}
if (!("ntree" %in% param_names)){params_rfsrc[["ntree"]] <- 2000}
if (!("nodesize" %in% param_names)) {params_rfsrc[["nodesize"]] <- 5}
if (!("samptype" %in% param_names)) {params_rfsrc[["samptype"]] <- "swr"}
params_rfsrc[["formula"]] <- formula
params_rfsrc[["data"]] <- traindata
params_rfsrc[["membership"]] <- TRUE
if (split_rule == "l1") {
params_rfsrc[["split_rule"]] <- "custom2"
} else if (split_rule == "spi") {
params_rfsrc[["split_rule"]] <- "custom3"
}
## train RF with rfsrc
rf <- do.call(rfsrc, params_rfsrc)
## get oob mean predictions
mean.oob <- rf$predicted.oob
## get test predictions
pred <- predict(rf, newdata = testdata, membership = TRUE)
mean.test <- pred$predicted
## get membership and inbag information
ntree <- rf$ntree
mem.train <- rf$membership
mem.test <- pred$membership
inbag <- rf$inbag
} else if (rf_package == "ranger") {
## set parameters for ranger
if (is.null(params_ranger)) {
params_ranger <- list()
}
param_names <- names(params_ranger)
if (!("mtry" %in% param_names)) {params_ranger[["mtry"]] <- ceiling(px/3)}
if (!("num.trees" %in% param_names)) {params_ranger[["num.trees"]] <- 2000}
if (!("min.node.size" %in% param_names)) {params_ranger[["min.node.size"]] <- 5}
if (!("replace" %in% param_names)) {params_ranger[["replace"]] <- TRUE}
params_ranger[["formula"]] <- formula
params_ranger[["data"]] <- traindata
params_ranger[["keep.inbag"]] <- TRUE
## train RF with ranger
rf <- do.call(ranger::ranger, params_ranger)
## get oob mean predictions
mean.oob <- rf$predictions
## get test predictions
mean.test <- predict(rf, data = testdata)$predictions
## get membership and inbag information
ntree <- rf$num.trees
mem.train <- predict(rf, traindata, type="terminalNodes")$predictions
mem.test <- predict(rf, testdata, type="terminalNodes")$predictions
inbag <- matrix(unlist(rf$inbag.counts, use.names = FALSE), ncol = ntree, byrow = FALSE)
}
## build test-BOP
BOPtest <- buildtestbop_ib(mem.train, mem.test, inbag, yvar)
## initialize storage
PI_list <- list()
oob_PI_list <- list()
alphaw_list <- list()
bw <- NULL
## form PI for each PI method
for (pi_method in set_pi_method) {
## assign the function of pi method
if (pi_method == "lm") {
formpi_fnc <- as.function(formpi_lm)
} else if (pi_method == "spi") {
formpi_fnc <- as.function(formpi_spi)
} else if (pi_method == "quant") {
formpi_fnc <- as.function(formpi_quant)
} else if (pi_method == "hdr") {
formpi_fnc <- as.function(formpi_hdr)
} else if (pi_method == "chdr") {
formpi_fnc <- as.function(formpi_chdr)
}
## calibration process
if (calibration) {
## set parameters for calibration
if (is.null(params_calib)) {
params_calib <- list()
}
param_names <- names(params_calib)
if (!("range" %in% param_names)) {params_calib[["range"]] <- c(1-alpha-0.005, 1-alpha+0.005)}
if (!("start" %in% param_names)) {params_calib[["start"]] <- 1-alpha}
if (!("step" %in% param_names)) {params_calib[["step"]] <- 0.01}
if (!("refine" %in% param_names)) {params_calib[["refine"]] <- TRUE}
## build OOB-BOP
BOPoob <- buildoobbop_ib(mem.train, inbag, yvar)
if (sum(sapply(BOPoob, is.null)) > 0) {
if (rf_package == "ranger") {
stop("Some observations have empty BOP. Increase the number of trees, 'num.trees' in params_ranger.")
} else {
stop("Some observations have empty BOP. Increase the number of trees, 'ntree' in params_rfsrc.")
}
}
## compute optimal bandwidth for hdr and chdr
if (pi_method %in% c("hdr","chdr")) {
if (is.null(bw)) {
bw <- opt_bw(BOP = BOPoob, alpha = alpha, nn = 10)
}
}
## set parameters for form PI functions
params_formpi <- list(BOP = BOPoob, response = yvar)
if (pi_method %in% c("hdr","chdr")) {
params_formpi[["bw"]] <- bw
}
## find alphaw
alphaw <- calibration(params_calib, params_formpi, formpi_fnc)
} else {
## assign alpha to alphaw
alphaw <- alpha
## compute optimal bandwidth for hdr and chdr
if (pi_method %in% c("hdr","chdr")) {
if (is.null(bw)) {
## build OOB-BOP
BOPoob <- buildoobbop_ib(mem.train, inbag, yvar)
if (sum(sapply(BOPoob, is.null)) > 0) {
if (rf_package == "ranger") {
stop("Some observations have empty BOP. Increase the number of trees, 'num.trees' in params_ranger.")
} else {
stop("Some observations have empty BOP. Increase the number of trees, 'ntree' in params_rfsrc.")
}
}
bw <- opt_bw(BOP = BOPoob, alpha = alpha, nn = 10)
}
}
}
## PI construction for test data
if (pi_method %in% c("hdr","chdr")) {
PI.obj <- formpi_fnc(BOP = BOPtest,
alpha = alphaw,
bw = bw,
response = NULL)
} else {
PI.obj <- formpi_fnc(BOP = BOPtest,
alpha = alphaw,
response = NULL)
}
## store PI information
if (pi_method == "hdr") {
pred_interval <- PI.obj$pi
} else {
pred_interval = list(lower = PI.obj$lower, upper = PI.obj$upper)
}
PI_list[[pi_method]] <- pred_interval
alphaw_list[[pi_method]] <- alphaw
## PI construction for train data with OOB predictions and OOB-BOP
if (oob) {
if (pi_method %in% c("hdr","chdr")) {
PI.obj <- formpi_fnc(BOP = BOPoob,
alpha = alpha,
bw = bw,
response = NULL)
} else {
BOPoob <- buildoobbop_ib(mem.train, inbag, yvar)
PI.obj <- formpi_fnc(BOP = BOPoob,
alpha = alpha,
response = NULL)
}
## store PI information for train data
if (pi_method == "hdr") {
oob_pred_interval <- PI.obj$pi
} else {
oob_pred_interval = list(lower = PI.obj$lower, upper = PI.obj$upper)
}
oob_PI_list[[pi_method]] <- oob_pred_interval
}
}## end of PI construction for each PI method
alphaw <- unlist(alphaw_list)
## return list
out <- list(lm_interval = if("lm" %in% set_pi_method){PI_list[["lm"]]}else{NULL},
spi_interval = if("spi" %in% set_pi_method){PI_list[["spi"]]}else{NULL},
hdr_interval = if("hdr" %in% set_pi_method){PI_list[["hdr"]]}else{NULL},
chdr_interval = if("chdr" %in% set_pi_method){PI_list[["chdr"]]}else{NULL},
quant_interval = if("quant" %in% set_pi_method){PI_list[["quant"]]}else{NULL},
test_pred = mean.test,
test_response = if(is.element(yvar.names, names(testdata))){testdata[, yvar.names]}else{NULL},
alphaw = if(calibration){alphaw}else{NULL},
split_rule = split_rule,
rf_package = rf_package,
oob_pred_interval = if(oob){oob_PI_list}else{NULL},
oob_pred = if(oob){mean.oob}else{NULL},
train_response = yvar)
class(out) <- c("rfpredinterval", "rfpi")
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.