#' @title rcRF model selection
#' @description Performs model selection for rcRF model to
#' select the best penalty parameter.
#' @param data data.frame. Data used to construct rcDT model.
#' Must contain efficacy variable (y),
#' risk variable (r),
#' binary treatment indicator coded as 0 / 1 (trt),
#' propensity score (prtx),
#' candidate splitting covariates.
#' @param split.var numeric vector. Columns of spliting variables.
#' @param efficacy char. Efficacy outcome column. Defaults to 'y'.
#' @param risk char. Risk outcome column. Defaults to 'r'.
#' @param col.trt char. Treatment indicator column name. Should be of form 0/1 or -1/+1.
#' @param col.prtx char. Propensity score column name.
#' @param risk.control logical. Should risk be controlled? Defaults to TRUE.
#' @param risk.threshold numeric. Desired level of risk control.
#' @param ntree numeric. Number of trees to construct.
#' @param test data.frame of testing observations. Should be formatted the same as 'data'.
#' @param N0 numeric specifying minimum number of observations required to call a node terminal. Defaults to 20.
#' @param n0 numeric specifying minimum number of treatment/control observations needed in a split to declare a node terminal. Defaults to 5.
#' @param max.depth numeric specifying maximum depth of the tree. Defaults to 15 levels.
#' @param mtry numeric specifying the number of randomly selected splitting variables to be included.
#' Defaults to the greater of 1 and length(split.var)/3.
#' @param stabilize logical indicating if efficacy should be modeled using residuals. Defaults to TRUE.
#' @param stabilize.type character specifying method used for estimating residuals. Current options are 'linear' for linear model (default) and 'rf' for random forest.
#' @param use.other.nodes logical. Should global estimator of objective function be used. Defaults to TRUE.
#' @param ctg numeric vector corresponding to the categorical input columns. Defaults to NULL. Not available yet.
#' @param AIPWE logical. Should AIPWE (TRUE) or IPWE (FALSE) be used. Not available yet.
#' @param extremeRandomized logical. Experimental for randomly selecting cutpoints in a random forest model. Defaults to FALSE and users should change this at their own peril.
#' @param lambda.upper numeric. Upper bound for risk penalty. An attempt at reasonable selection will be performed automatically.
#' @param importance logical. Indicated if variable importance measures should be estimated and returned. Defaults to FALSE.
#' @param order.importances logical. Should importances be ordered (if requested)?
#' @param max.iter numeric. Indicates the maximum number of forest iterations to perform. Defaults to 10.
#' @param risk.tolerance numeric. Two component vector giving the bound on risk that is acceptable (acceptable risk range is calcuated as risk.threshold * risk.tolerance). Defaults to c(0.995, 1.005), i.e. 0.5\% tolerance.
#' @param avoid.nul.tree logical. Should null trees be discarded?
#' @param verbose logical. Give updates about forest progression?
#' @return A summary of the cross validation including optimal penalty parameter and the optimal model.
#' @return \item{best.fit}{optimal rcRF model}
#' @return \item{lambda}{optimal lambda value selected}
#' @return \item{oob.risk}{out-of-bag risk from best model}
#' @return \item{converged}{max number of iterations reached?}
#' @return \item{importances}{importance measures, if requested}
#' @return \item{risks}{vector of risk scores obtained over tuning procedure}
#' @return \item{lambdas}{vector of lambda values tried over tuning procedure}
#' @return \item{time.elapsed}{elapsed time for model tuning}
#' @import randomForest
#' @export
#' @examples
#'
#' # Grow large tree
#' set.seed(123)
#' dat <- generateData()
#' fit <- rcRF.select(data = dat,
#' split.var = 1:10,
#' risk.threshold = 2.75)
#'
rcRF.select <- function(data,
split.var,
test = NULL,
N0 = 20,
n0 = 5,
efficacy = 'y',
risk = 'r',
col.trt = "trt",
col.prtx = "prtx",
ntree = 500,
lambda.upper = NA,
risk.control = TRUE,
risk.threshold = NA,
AIPWE = FALSE,
ctg = NA,
mtry = max(floor(length(split.var)/3), 1),
avoid.nul.tree = FALSE,
max.depth = 15,
stabilize.type = c('linear', 'rf'),
stabilize = TRUE,
verbose = FALSE,
use.other.nodes = TRUE,
extremeRandomized = FALSE,
importance = FALSE,
order.importances = TRUE,
max.iter = 10,
risk.tolerance = c(0.995, 1.005)){
start.time <- Sys.time()
# input checks
if(!is.data.frame(data)) stop("data argument must be dataframe")
if(!is.numeric(split.var)) stop("split.var must be numeric vector")
if(!is.character(efficacy)) stop("efficacy argument must be character")
if(!efficacy %in% colnames(data)) stop("efficacy argument is not in data")
if(!is.character(risk)) stop("risk argument must be character")
if(!risk %in% colnames(data)) stop("risk argument is not in data")
if(!is.numeric(max.iter) | length(max.iter) != 1) stop("max.iter must be numeric of length 1")
if(length(risk.tolerance) != 2 | !is.numeric(risk.tolerance))
stop("risk.tolerance must be two component numeric vector")
if(risk.control & is.na(risk.threshold)) warning("risk.contrl is TRUE, but risk.threshold not specified")
if(!any(stabilize.type %in% c("linear", "rf"))) stop("linear and rf values supported for stabilize.type")
if(mtry > length(split.var)){
warning("mtry is larger than split.var length -- setting mtry to length(split.var)")
mtry <- length(split.var)
}
stabilize.type <- match.arg(stabilize.type)
if(sum(data[,col.trt] %in% c(0,1)) != nrow(data)){
data[,col.trt] <- ifelse(data[,col.trt] == -1, 0, 1)
message("Assuming trt indicator is of form -1/+1 and changed values to 0/1")
}
# retrieve lambda information (if provided)
if(is.na(lambda.upper)){
# define range of risk penalty values
EY.0 <- mean(data[data[,col.trt] == 0,efficacy] - mean(data[,efficacy]))
EY.1 <- mean(data[data[,col.trt] == 1,efficacy] - mean(data[,efficacy]))
ER.0 <- mean(data[data[,col.trt] == 0,risk])
ER.1 <- mean(data[data[,col.trt] == 1,risk])
# define upper bound for lambda
lambda.upper <- 1.25*(EY.1 - EY.0 + ER.0) / (ER.1 - risk.threshold)
} else{
if(!is.numeric(lambda.upper)) stop("lambda upper bound must be numeric")
}
fits <- vector("list")
# scan over lambda values to select final lambda value
lambdas <- 0.95*lambda.upper
iter <- 1
risks <- max(
c(sum(data[,risk] * (data[,col.trt] == 0) / data[,col.prtx]) / sum((data[,col.trt] == 0) / data[,col.prtx]),
sum(data[,risk] * (data[,col.trt] == 1) / data[,col.prtx]) / sum((data[,col.trt] == 1) / data[,col.prtx]))
)
while(((risks[length(risks)] > risk.tolerance[2]*risk.threshold) |
(risks[length(risks)] < risk.tolerance[1]*risk.threshold))&
iter < max.iter + 1 &
lambdas[length(lambdas)] > 0){
fit.lam <- rcRF(data = data,
split.var = split.var,
efficacy = efficacy,
risk = risk,
col.trt = col.trt,
col.prtx = col.prtx,
risk.control = risk.control,
risk.threshold = risk.threshold,
lambda = lambdas[length(lambdas)],
stabilize = stabilize,
stabilize.type = stabilize.type,
test = test,
ctg = ctg,
N0 = N0,
n0 = n0,
max.depth = max.depth,
ntree = ntree,
mtry = mtry,
avoid.nul.tree = avoid.nul.tree,
AIPWE = AIPWE,
verbose = FALSE,
use.other.nodes = use.other.nodes,
extremeRandomized = extremeRandomized,
importance = FALSE,
order.importances = order.importances)
fits <- c(fits, list(fit.lam))
risks <- c(risks, fit.lam$risk.oob[length(fit.lam$risk.oob)])
if(verbose){
cat("Completed Iteration", iter, "; OOB risk =", risks[length(risks)], "\n")
}
iter <- iter + 1
if(length(lambdas) == 1){
if(risks[length(risks)] > risk.threshold){
lambdas <- c(lambdas, 1.5*lambdas[length(lambdas)])
} else{
lambdas <- c(lambdas, 0.5*lambdas[length(lambdas)])
}
} else{
if(risks[length(risks)] > risk.threshold){
# current risk is too high, need to increase lambda
if(length(which(risks[-1] < risks[length(risks)])) > 0){
idx <- which(risks[-1] < risks[length(risks)])
base.risk <- risks[-1][idx][which.min(abs(risks[-1][idx] - risks[length(risks)]))]
idxx <- which(risks == base.risk)
base.lambda <- lambdas[idxx-1]
base.prop <- 1 - abs(risks[length(risks)] - risk.threshold) / abs(base.risk - risks[length(risks)])
lambdas <- c(lambdas, base.lambda - base.prop*abs(base.lambda - lambdas[length(lambdas)]))
} else{
lambdas <- c(lambdas, 1.5 * lambdas[length(lambdas)])
}
} else{
if(length(which(risks[-1] > risks[length(risks)])) > 0){
idx <- which(risks[-1] > risks[length(risks)])
base.risk <- risks[-1][idx][which.min(abs(risks[-1][idx] - risks[length(risks)]))]
idxx <- which(risks == base.risk)
base.lambda <- lambdas[idxx-1]
base.prop <- 1 - abs(risks[length(risks)] - risk.threshold) / abs(base.risk - risks[length(risks)])
lambdas <- c(lambdas, base.lambda + base.prop*abs(base.lambda - lambdas[length(lambdas)]))
} else{
lambdas <- c(lambdas, 0.5 * lambdas[length(lambdas)])
}
}
}
}
efficacies <- do.call(c, lapply(fits, function(ff) ff$value.oob[length(ff$value.oob)]))
risks <- risks[-1]
best.idx1 <- which(risks < risk.threshold * risk.tolerance[2])
best.idx2 <- which.max(efficacies[best.idx1])
best.fit <- fits[best.idx1][[best.idx2]]
best.lambda <- lambdas[best.idx1][best.idx2]
best.risk <- risks[best.idx1][best.idx2]
end.time <- Sys.time()
if(importance){
if(verbose) cat("Calculating Importances")
importances <- Variable.Importance.ITR(rcRF.fit = best.fit, sort = order.importances)
} else{
importances <- NA
}
setNames(list(best.fit,
best.lambda,
best.risk,
as.logical(iter <= max.iter),
importances,
risks[-1],
lambdas[-length(lambdas)],
end.time - start.time),
c("best.fit",
"lambda",
"oob.risk",
"converged",
"importances",
"risks",
"lambdas",
"elapsed.time"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.