#' @title Optimal rcDT model selection
#' @description Performs k-fold cross validation for tuning of risk and tree size
#' paramters to select the optimal rcDT model.
#' @param data data.frame. Data used to construct rcDT model.
#' Must contain efficacy variable (y),
#' risk variable (r),
#' binary treatment indicator coded as 0 / 1 (trt),
#' propensity score (prtx),
#' candidate splitting covariates (split.var).
#' @param split.var numeric vector. Columns of spliting variables.
#' @param efficacy char. Efficacy outcome column. Defaults to 'y'.
#' @param risk char. Risk outcome column. Defaults to 'r'.
#' @param col.trt char. Treatment indicator column name. Should be of form 0/1 or -1/+1.
#' @param col.prtx char. Propensity score column name.
#' @param risk.control logical. Should risk be controlled? Defaults to TRUE.
#' @param risk.threshold numeric. Desired level of risk control.
#' @param lambda.seq numeric vector. Identifies sequence of risk penalty parameters to be considered.
#' Defaults to NA and will attempt to identify reasonable range.
#' @param lambda.length numeric indicating number of risk penalty parameters to use in tuning.
#' Larger values will cause model selection to be slower. Defaults to 50.
#' @param n.folds numeric. Number of folds to use in k-fold cross validation. Defaults to 10.
#' @param test data.frame of testing observations. Should be formatted the same as 'data'.
#' @param N0 numeric specifying minimum number of observations required to call a node terminal. Defaults to 20.
#' @param n0 numeric specifying minimum number of treatment/control observations needed in a split to declare a node terminal. Defaults to 5.
#' @param max.depth numeric specifying maximum depth of the tree. Defaults to 15 levels.
#' @param mtry numeric specifying the number of randomly selected splitting variables to be included. Defaults to number of splitting variables.
#' @param stabilize logical indicating if efficacy should be modeled using residuals. Defaults to TRUE.
#' @param stabilize.type character specifying method used for estimating residuals. Current options are 'linear' for linear model (default) and 'rf' for random forest.
#' @param sort internal use.
#' @param use.other.nodes logical. Should global estimator of objective function be used. Defaults to TRUE.
#' @param use.bootstrap logical. Should a bootstrap resampling be done? Defaults to FALSE.
#' @param ctg numeric vector corresponding to the categorical input columns. Defaults to NULL. Not available yet.
#' @param AIPWE logical. Should AIPWE (TRUE) or IPWE (FALSE) be used. Not available yet.
#' @param extremeRandomized logical. Experimental for randomly selecting cutpoints in a random forest model. Defaults to FALSE and users should change this at their own peril.
#' @param verbose logical. Should tuning progress bar be displayed. Defaults to TRUE.
#' @return A summary of the cross validation including optimal penalty parameter and the optimal model.
#' @return \item{best.tree}{optimal rcDT model}
#' @return \item{alpha}{tree size penalty}
#' @return \item{lambda}{risk penalty}
#' @return \item{full.tree}{unpruned tree}
#' @return \item{pruned.tree}{output from pruning of `full.tree`}
#' @return \item{subtrees}{sequence of optimally pruned subtrees}
#' @return \item{best.tree.summaries}{summary across trees}
#' @return \item{in.train}{training samples from splits}
#' @return \item{in.test}{testing samples from splits}
#' @return \item{elapsed.time}{time elapsed during model tuning}
#' @import randomForest
#' @import pbapply
#' @export
#' @examples
#'
#' # Grow large tree
#' set.seed(123)
#' dat <- generateData()
#' fit <- rcDT.select(data = dat,
#' split.var = 1:10,
#' nfolds = 5,
#' risk.threshold = 2.75)
#'
rcDT.select <- function(data,
split.var,
N0 = 20,
n0 = 5,
efficacy = 'y',
risk = 'r',
col.trt = "trt",
col.prtx = "prtx",
lambda.seq = NA,
lambda.length = 50,
risk.control = TRUE,
risk.threshold = NA,
nfolds = 10,
AIPWE = FALSE,
sort = TRUE,
ctg = NA,
mtry = length(split.var),
max.depth = 15,
stabilize.type = c('linear', 'rf'),
stabilize = TRUE,
use.other.nodes = TRUE,
use.bootstrap = FALSE,
extremeRandomized = FALSE,
verbose = TRUE){
start.time <- Sys.time()
# input checks
if(!is.data.frame(data)) stop("data argument must be dataframe")
if(!is.numeric(split.var)) stop("split.var must be numeric vector")
if(!is.character(efficacy)) stop("efficacy argument must be character")
if(!efficacy %in% colnames(data)) stop("efficacy argument is not in data")
if(!is.character(risk)) stop("risk argument must be character")
if(!risk %in% colnames(data)) stop("risk argument is not in data")
if(risk.control & is.na(risk.threshold)) warning("risk.contrl is TRUE, but risk.threshold not specified")
if(!any(stabilize.type %in% c("linear", "rf"))) stop("linear and rf values supported for stabilize.type")
if(mtry > length(split.var)){
warning("mtry is larger than split.var length -- setting mtry to length(split.var)")
mtry <- length(split.var)
}
stabilize.type <- match.arg(stabilize.type)
if(sum(data$trt %in% c(0,1)) != nrow(data)){
data$trt <- ifelse(data$trt == -1, 0, 1)
message("Assuming trt indicator is of form -1/+1 and changed values to 0/1")
}
# retrieve lambda information (if provided)
if(length(lambda.seq) < 2 & all(is.na(lambda.seq))){
# define range of risk penalty values
EY.0 <- mean(data[data$trt == 0,efficacy] - mean(data[,efficacy]))
EY.1 <- mean(data[data$trt == 1,efficacy] - mean(data[,efficacy]))
ER.0 <- mean(data[data$trt == 0,risk])
ER.1 <- mean(data[data$trt == 1,risk])
# define upper bound for lambda
lambda.upper <- 1.25*(EY.1 - EY.0 + ER.0) / (ER.1 - risk.threshold)
lambdas <- round(seq(0, lambda.upper, length.out = lambda.length+1), 5)[-1]
} else{
if(length(lambda.seq) > 100){
lambdas <- sort(sample(lambda.seq, 100))
} else{
lambdas <- sort(lambda.seq)
}
}
names(lambdas) <- lambdas
# Shuffle data
if(!use.bootstrap){
if(sort) data <- data[sample(1:nrow(data), size = nrow(data)),]
folds <- cut(seq(1,nrow(data)), breaks = nfolds, labels = FALSE)
} else{
if(sort) data <- data[sample(1:nrow(data), size = nrow(data)),]
folds <- lapply(1:nfolds, function(i) unique(sample(1:nrow(data), nrow(data), replace = TRUE)))
}
in.train <- in.test <- trees <- list()
# divide samples into training and testing based on n.folds specified
for(k in 1:nfolds){
if(!use.bootstrap){ # use traditional CV
in.train[[k]] <- data[-which(folds==k,arr.ind=TRUE),]
in.test[[k]] <- data[which(folds==k,arr.ind=TRUE),]
} else{
in.train[[k]] <- data[folds[[k]],]
in.test[[k]] <- data[-folds[[k]],]
}
}
pboptions(type = ifelse(verbose, "timer", "none"))
out.lambdas <- pblapply(lambdas, function(lam){
# Grow initial tree to get sequence of alpha values
tre <- rcDT(data = data,
split.var = split.var,
test = NULL,
min.ndsz = N0,
n0 = n0,
efficacy = efficacy,
risk = risk,
col.trt = col.trt,
col.prtx = col.prtx,
lambda = lam,
risk.control = risk.control,
risk.threshold = risk.threshold,
stabilize = stabilize,
stabilize.type = stabilize.type,
ctg = ctg,
max.depth = max.depth,
AIPWE = AIPWE,
mtry = mtry,
use.other.nodes = use.other.nodes,
extremeRandomized = extremeRandomized)
full.tre.prune <- prune(tre,
a = 0,
ctgs = ctgs,
risk.control = risk.control,
risk.threshold = risk.threshold,
lambda = lam)
alphas <- c(0, do.call(c, lapply(1:(nrow(full.tre.prune$result)-1), function(rtt){
out.alpha <- (as.numeric(full.tre.prune$result$V[rtt]) - as.numeric(full.tre.prune$result$V[rtt+1])) /
(as.numeric(full.tre.prune$result$size.tmnl[rtt]) - as.numeric(full.tre.prune$result$size.tmnl[rtt+1]))
return(out.alpha)})))
names(alphas) <- alphas
# Grow cross validation sample trees
trees <- lapply(1:nfolds, function(n)
rcDT(data = in.train[[n]],
test = in.test[[n]],
split.var = split.var,
risk.control = risk.control,
risk.threshold = risk.threshold,
lambda = lam,
efficacy = efficacy,
risk = risk,
col.trt = col.trt,
col.prtx = col.prtx,
min.ndsz = N0,
n0 = n0,
AIPWE = AIPWE,
ctg = ctg,
mtry = mtry,
stabilize = stabilize,
stabilize.type = stabilize.type,
max.depth = max.depth,
use.other.nodes = use.other.nodes))
pruned.trees.alpha <- lapply(1:nfolds, function(tt){
if(!is.null(dim(trees[[tt]]$tree) & nrow(trees[[tt]]$tree) != 1)){
tmp <- prune(trees[[tt]],
a = 0,
test = trees[[tt]]$test,
AIPWE = FALSE,
ctgs = ctgs,
risk.control = risk.control,
risk.threshold = risk.threshold,
lambda = lam)
pruned.alphas <- do.call(c, lapply(alphas, function(aaa){
idx <- which.max(as.numeric(tmp$result$V) - aaa * (as.numeric(tmp$result$size.tmnl)))
return(as.numeric(tmp$result$V.test[idx]))
}))
return(list(V.test = pruned.alphas,
pruned = tmp))
}
})
best.alpha <- alphas[which.max(apply(do.call(rbind, lapply(pruned.trees.alpha, "[[", "V.test")), 2, mean, na.rm = TRUE))]
best.tree.idx <- which.max(as.numeric(full.tre.prune$result$V) - best.alpha * as.numeric(full.tre.prune$result$size.tmnl))
if(best.tree.idx > length(full.tre.prune$subtrees)){
best.tree <- full.tre.prune$subtrees[[length(full.tre.prune$subtrees)]]
best.tree[1, 6:ncol(best.tree)] <- NA
best.tree <- best.tree[1,,drop = FALSE]
} else{
best.tree <- full.tre.prune$subtrees[[best.tree.idx]]
}
best.tree.summary <- apply(do.call(rbind, lapply(1:nfolds, function(n){
pruned <- pruned.trees.alpha[[n]]$pruned
best.subtre.idx <- which.max(as.numeric(pruned$result$V) - best.alpha * as.numeric(pruned$result$size.tmnl))
if(best.subtre.idx > length(pruned$subtrees)){
return(c(Train.Benefit = sum(trees[[n]]$y / in.train[[n]][,col.prtx]) /
sum(1 / in.train[[n]][,col.prtx]),
Train.Risk = sum(in.train[[n]][,risk] / in.train[[n]][,col.prtx]) /
sum(1 / in.train[[n]][,col.prtx]),
Test.Benefit = sum(trees[[n]]$y.test / in.test[[n]][,col.prtx]) /
sum(1 / in.test[[n]][,col.prtx]),
Test.Risk = sum(in.test[[n]][,risk] / in.test[[n]][,col.prtx]) /
sum(1 / in.test[[n]][,col.prtx])))
} else{
tmp.tre <- pruned$subtrees[[best.subtre.idx]]
pr.train <- predict.ITR(tmp.tre, in.train[[n]], split.var)$trt.pred
pr.test <- predict.ITR(tmp.tre, in.test[[n]], split.var)$trt.pred
return(c(Train.Benefit = sum(trees[[n]]$y * (in.train[[n]][,col.trt] == pr.train) / in.train[[n]][,col.prtx]) /
sum((in.train[[n]][,col.trt] == pr.train) / in.train[[n]][,col.prtx]),
Train.Risk = sum(in.train[[n]][,risk] * (in.train[[n]][,col.trt] == pr.train) / in.train[[n]][,col.prtx]) /
sum((in.train[[n]][,col.trt] == pr.train) / in.train[[n]][,col.prtx]),
Test.Benefit = sum(trees[[n]]$y.test * (in.test[[n]][,col.trt] == pr.test) / in.test[[n]][,col.prtx]) /
sum((in.test[[n]][,col.trt] == pr.test) / in.test[[n]][,col.prtx]),
Test.Risk = sum(in.test[[n]][,risk] * (in.test[[n]][,col.trt] == pr.test) / in.test[[n]][,col.prtx]) /
sum((in.test[[n]][,col.trt] == pr.test) / in.test[[n]][,col.prtx])))
}
})), 2, mean, na.rm = TRUE)
return(list(summaries = c(alpha = best.alpha,
lambda = lam),
best.tree = best.tree,
full.tree = tre,
full.tree.pruned = full.tre.prune,
best.tree.summary = best.tree.summary))
})
tree.summaries <- do.call(rbind, lapply(out.lambdas, "[[", "best.tree.summary"))
idx <- which(tree.summaries[,"Test.Risk"] < 1.01*risk.threshold)
idx2 <- which.max(tree.summaries[idx,"Test.Benefit"])
best.tree <- out.lambdas[idx][[idx2]]$best.tree
best.lambda <- lambdas[idx][idx2]
best.alpha <- out.lambdas[idx][[idx2]]$summaries[1]
end.time <- Sys.time()
setNames(list(best.tree,
best.alpha,
best.lambda,
out.lambdas[idx][[idx2]]$full.tree,
out.lambdas[idx][[idx2]]$full.tree.pruned$result,
out.lambdas[idx][[idx2]]$full.tree.pruned$subtrees,
tree.summaries,
in.train,
in.test,
end.time - start.time),
c("best.tree",
"alpha",
"lambda",
"full.tree",
"pruned.tree",
"subtrees",
"best.tree.summaries",
"in.train",
"in.test",
"elapsed.time"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.