Nothing
cv.varpro <- function(formula, data, nvar = 30, ntree = 150,
local.std = TRUE, zcut = seq(0.1, 2, length = 50), nblocks = 10,
split.weight = TRUE, split.weight.method = NULL, sparse = TRUE,
nodesize = NULL, max.rules.tree = 150, max.tree = min(150, ntree),
verbose = FALSE, seed = NULL,
fast = FALSE, crps = FALSE,
...)
{
##--------------------------------------------------------------
##
## to avoid forking issues we run everything in serial in R
##
##--------------------------------------------------------------
papply <- base::lapply
##--------------------------------------------------------------
##
## extract original yvalue names
## re-define the original data in case there are missing values
##
##--------------------------------------------------------------
stump <- get.stump(formula, data)
n <- stump$n
p <- length(stump$xvar.names)
yvar.names <- stump$yvar.names
data <- data.frame(stump$yvar, stump$xvar)
colnames(data)[1:length(yvar.names)] <- yvar.names
family <- stump$family
rm(stump)
##--------------------------------------------------------------
##
## extract additional options specified by user
##
##--------------------------------------------------------------
dots <- list(...)
## set nodesize
nodesize <- set.cv.nodesize(n, p, nodesize)
dots$nodesize.reduce <- set.nodesize(n, p, dots$nodesize.reduce)
dots$nodedepth.reduce <- set.nodedepth.reduce(n, p, dots$nodedepth.reduce)
if (is.null(dots$sampsize)) {
dots$nodesize.external <- set.nodesize(n, p, dots$nodesize.external)
}
else {
if (is.function(dots$sampsize)) {
dots$nodesize.external <- set.nodesize(dots$sampsize(n), p, dots$nodesize.external)
}
else {
dots$nodesize.external <- set.nodesize(dots$sampsize, p, dots$nodesize.external)
}
}
## set rfq parameters for class imbalanced scenario
use.rfq <- (NULL, NULL)$use.rfq
iratio.threshold <- (NULL, NULL)$iratio.threshold
##--------------------------------------------------------------
##
## default settings
##
##--------------------------------------------------------------
trn <- 1:n
newdata <- splitrule <- rfq <- imbalanced.obj <- cens.dist <- NULL
##--------------------------------------------------------------
##
## set the type of sampling, define train/test (fast=TRUE)
##
##--------------------------------------------------------------
## use same inbag/oob members to reduce MC error
if (!fast) {
if (is.null(dots$sampsize)) {##default sample size function used by rfsrc.fast
ssize <- n * .632
}
else {
ssize <- eval(dots$sampsize)
}
if (is.function(ssize)) {##user has specified a function
ssize <- ssize(n)
}
}
## subsampling is in effect when fast = TRUE
else {
## obtain the requested sample size
if (is.null(dots$sampsize)) {##default sample size function used by rfsrc.fast
ssize <- eval(formals(randomForestSRC::rfsrc.fast)$sampsize)
}
else {
ssize <- eval(dots$sampsize)
}
if (is.function(ssize)) {##user has specified a function
ssize <- ssize(n)
}
## now hold out a test data set equal to the tree sample size (if possible)
if (n > (2 * ssize)) {
tst <- sample(1:n, size = ssize, replace = FALSE)
trn <- setdiff(1:n, tst)
newdata <- data[tst,, drop = FALSE]
}
}
## custom sample array
samp <- randomForestSRC:::make.sample(ntree, length(trn), ssize)
## pass the sample size to varpro as a hidden option
dots$sampsize <- ssize
##--------------------------------------------------------------
##
## varpro call
##
##--------------------------------------------------------------
o <- do.call("varpro", c(list(formula = formula, data = data, nvar = nvar, ntree = ntree,
split.weight = split.weight, split.weight.method = split.weight.method, sparse = sparse,
nodesize = nodesize, max.rules.tree = max.rules.tree, max.tree = max.tree,
verbose = verbose, seed = seed), dots))
##--------------------------------------------------------------
##
## extract importance values
## map importance values which are hot-encoded back to original data
##
##--------------------------------------------------------------
##--------------------------------------------------------------
##
## extract importance values
## map importance values which are hot-encoded back to original data
##
##--------------------------------------------------------------
## compute importance once and reuse it (avoid recomputation)
vmp <- importance(o, local.std = local.std)
vorg <- get.orgvimp(o, local.std = local.std, vmp = vmp)
xvar.names <- vorg$variable
imp <- vorg$z
imp[is.na(imp)] <- 0
##--------------------------------------------------------------
##
## remove zcut values that lead to duplicated models
##
##--------------------------------------------------------------
zcut.models <- do.call(rbind, lapply(zcut, function(zz) {
1 * (imp >= zz)
}))
zcut <- zcut[!duplicated(zcut.models)]
##--------------------------------------------------------------
##
## rfq details: only applies to two class imbalanced scenarios
##
##--------------------------------------------------------------
if (family == "class" && length(levels(data[, yvar.names])) == 2 && use.rfq) {
## calculate imblanced ratio
y.frq <- table(data[, yvar.names])
class.labels <- names(y.frq)
iratio <- max(y.frq, na.rm = TRUE) / min(y.frq, na.rm = TRUE)
## check if this is imbalanced using default threshold setting
if (iratio > iratio.threshold) {
rfq <- TRUE
splitrule <- "auc"
imbalanced.obj <- list(perf.type = "gmean",
iratio = iratio,
iratio.threshold = iratio.threshold)
}
}
##--------------------------------------------------------------
##
## censoring distribution: only applies to survival families
##
##--------------------------------------------------------------
if (family == "surv" && crps) {
cens.dist <- get.cens.dist(data[trn, c(yvar.names, xvar.names), drop = FALSE],
ntree, nodesize, ssize)
}
##--------------------------------------------------------------
##
## select zcut using out-of-sample performance
##
##--------------------------------------------------------------
## set the seed
seed <- get.seed(seed)
## loop over zcut sequence and acquire OOB error rate
err <- do.call(rbind, lapply(zcut, function(zz) {
pt <- imp >= zz
if (sum(pt) > 0) {
if (!fast) {
err.zz <- get.sderr(rfsrc(formula,
data = data[trn, c(yvar.names, xvar.names[pt]), drop = FALSE],
nodesize = nodesize,
ntree = ntree,
rfq = rfq,
splitrule = splitrule,
perf.type = "none",
bootstrap = "by.user",
samp = samp,
seed = seed),
nblocks = nblocks,
crps = crps,
papply = papply,
imbalanced.obj = imbalanced.obj,
cens.dist = cens.dist)
}
else {
## nodesize is not deployed because fast subsampling is in play
err.zz <- get.sderr(randomForestSRC::rfsrc.fast(formula,
data = data[trn, c(yvar.names, xvar.names[pt]), drop = FALSE],
ntree = ntree,
rfq = rfq,
splitrule = splitrule,
perf.type = "none",
forest = TRUE,
bootstrap = "by.user",
samp = samp,
seed = seed),
nblocks = nblocks,
crps = crps,
papply = papply,
newdata = newdata,
imbalanced.obj = imbalanced.obj,
cens.dist = cens.dist)
}
}
else {
err.zz <- c(NA, NA)
}
if (verbose) {
cat("zcut value", zz,
"number variables", sum(pt),
"error", err.zz[1],
"sd", err.zz[2], "\n")
}
c(zz, sum(pt), err.zz)
}))
colnames(err) <- c("zcut", "nvar", "err", "sd")
##--------------------------------------------------------------
##
## return the importance values after filtering
##
##--------------------------------------------------------------
## minimum error
vmin <- vorg
zcut.min <- 0
if (!all(is.na(err[, 3]))) {
zcut.min <- zcut[which.min(err[, 3])]
if (verbose) {
cat("optimal cutoff value", zcut.min, "\n")
}
vmin <- vorg[imp >= zcut.min,, drop = FALSE]
}
## 1sd error rule -conservative
v1sd.conserve <- vorg
zcut.1sd <- 0
if (!all(is.na(err[, 3]))) {
idx.opt <- which.min(err[, 3])
serr <- mean(err[, 4], na.rm = TRUE)
idx2.opt <- err[, 3] < 1 & (err[, 3] <= (err[idx.opt, 3] + serr))
idx2.opt[is.na(idx2.opt)] <- FALSE
if (sum(idx2.opt) > 0) {
zcut.1sd <- zcut[max(which(idx2.opt))]
if (verbose) {
cat("optimal 1sd + (conservative) cutoff value", zcut.1sd, "\n")
}
v1sd.conserve <- vorg[imp >= zcut.1sd,, drop = FALSE]
}
else {
v1sd.conserve <- NULL
}
}
## 1sd error rule -liberal
v1sd.liberal <- vorg
zcut.liberal <- 0
if (!all(is.na(err[, 3]))) {
idx.opt <- which.min(err[, 3])
serr <- mean(err[, 4], na.rm = TRUE)
zcut.liberal <- zcut[min(which(err[, 3] <= (err[idx.opt, 3] + serr)), na.rm = TRUE)]
if (verbose) {
cat("optimal 1sd - (liberal) cutoff value", zcut.liberal, "\n")
}
v1sd.liberal <- vorg[imp >= zcut.liberal,, drop = FALSE]
}
rO <- list(imp = vmin,
imp.conserve = v1sd.conserve,
imp.liberal = v1sd.liberal,
err = err,
zcut = zcut.min,
zcut.conserve = zcut.1sd,
zcut.liberal = zcut.liberal)
class(rO) <- "cv.varpro"
## append some useful information as attributes
attr(rO, "imp.org") <- vmp
attr(rO, "xvar.names") <- o$xvar.names
attr(rO, "xvar.org.names") <- o$xvar.org.names
attr(rO, "family") <- o$family
return(rO)
}
## custom print object for cv to make attributes invisible
print.cv.varpro <- function(x, ...) {
attr(x, "class") <- attr(x, "imp.org") <- attr(x, "xvar.names") <-
attr(x, "xvar.org.names") <- attr(x, "family") <- NULL
print(x)
}
print.cv <- print.cv.varpro
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.