#' @title Constructs an rcDT model
#' @description Constructs a risk controlled decision tree (rcDT) given an
#' efficacy and risk outcome.
#' @param data data.frame. Data used to construct rcDT model.
#' Must contain efficacy variable (y),
#' risk variable (r),
#' binary treatment indicator coded as 0 / 1 (trt),
#' propensity score (prtx),
#' candidate splitting covariates (split.var).
#' @param split.var numeric vector. Columns of spliting variables.
#' @param efficacy char. Efficacy outcome column. Assumes larger values are desirable Defaults to 'y'.
#' @param risk char. Risk outcome column. Assumes smaller values are desirable Defaults to 'r'.
#' @param col.trt char. Treatment indicator column name. Should be of form 0/1 or -1/+1.
#' @param col.prtx char. Propensity score column name.
#' @param risk.control logical. Should risk be controlled? Defaults to TRUE.
#' @param risk.threshold numeric. Desired level of risk control.
#' @param lambda numeric. Penalty parameter for risk scores. Defaults to 0, i.e. no constraint.
#'
#' Optional arguments
#' @param test data.frame of testing observations. Should be formatted the same as 'data'.
#' @param min.ndsz numeric specifying minimum number of observations required to call a node terminal. Defaults to 20.
#' @param n0 numeric specifying minimum number of treatment/control observations needed in a split to declare a node terminal. Defaults to 5.
#' @param max.depth numeric specifying maximum depth of the tree. Defaults to 15 levels.
#' @param mtry numeric specifying the number of randomly selected splitting variables to be included. Defaults to number of splitting variables.
#' @param stabilize logical indicating if efficacy should be modeled using residuals. Defaults to TRUE.
#' @param stabilize.type character specifying method used for estimating residuals.
#' Current options are 'linear' for linear model (default) and 'rf' for random forest.
#' @param use.other.nodes logical. Should global estimator of objective function be used. Defaults to TRUE.
#' @param ctg numeric vector corresponding to the categorical input columns. Defaults to NULL. Not available yet.
#' @param AIPWE logical. Should AIPWE (TRUE) or IPWE (FALSE) be used. Not available yet.
#' @param extremeRandomized logical. Experimental for randomly selecting cutpoints in a random forest model. Defaults to FALSE and users should change this at their own peril.
#' @param print.summary logical. Should a summary of the tree building be printed? Defaults to TRUE for single trees.
#' @return Summary of rcDT model
#' @return \item{tree}{data.frame with the following: Each `node` begins with "0" indicating the root node,
#' followed by a "1" or "2" indicating the less than (or left) child node or greater than (or right) child node.
#' Additionally, the number of observations `size`, number treated `n.1`, number on control `n.0`, and treatment effect `trt.effect`
#' summaries are provided. The splitting information includes the column of the chosen splitting variable `var`, the variable name 'vname',
#' the direction the treatment is sent `cut.1` ("r" for right child node, and "l" for left), the chosen split value `cut.2`,
#' and the estimated value function `score`.}
#' @return \item{y}{efficacy values used in modeling. Will likely differ from original input `y` if stabilization was used}
#' @return \item{risk.threshold}{value of risk control used}
#' @return \item{data}{input dataset}
#' @return \item{fit.y}{fitted model for residuals is `stabilize` was used}
#' @return \item{split.var}{splitting covariates used}
#' @import randomForest
#' @export
#' @examples
#' set.seed(123)
#' dat <- generateData()
#' # Generates tree using simualated EMR data with splitting variables located in columns 1-4.
#' tree <- rcDT(data = dat,
#' split.var = 1:10,
#' risk.threshold = 2.75,
#' lambda = 1)
rcDT <- function(data,
split.var,
test = NULL,
ctg = NULL,
efficacy = "y",
risk = "r",
col.trt = "trt",
col.prtx = "prtx",
risk.control = TRUE,
risk.threshold = NA,
lambda = 0,
min.ndsz = 20,
n0 = 5,
stabilize = TRUE,
stabilize.type = c("linear", "rf"),
use.other.nodes = TRUE,
mtry = length(split.var),
max.depth = 15,
AIPWE = FALSE,
extremeRandomized = FALSE,
print.summary = TRUE)
{
# =========================
# input checks
# =========================
if(!is.data.frame(data)) stop("data argument must be dataframe")
if(!is.numeric(split.var)) stop("split.var must be numeric vector")
if(!is.character(efficacy)) stop("efficacy argument must be character")
if(!efficacy %in% colnames(data)) stop("efficacy argument is not in data")
if(!is.character(risk)) stop("risk argument must be character")
if(!risk %in% colnames(data)) stop("risk argument is not in data")
if(risk.control & is.na(risk.threshold)) warning("risk.contrl is TRUE, but risk.threshold not specified")
if(lambda < 0) stop("lambda must be > 0")
if(!any(stabilize.type %in% c("linear", "rf"))) stop("linear and rf values supported for stabilize.type")
if(mtry > length(split.var)){
warning("mtry is larger than split.var length -- setting mtry to length(split.var)")
mtry <- length(split.var)
}
# =========================
# end input checks
# =========================
if(print.summary)
sprintf("rcDT model risk control specified as %s; Penalty specified as %s", risk.threshold, lambda)
# initialize variables and libraries.
out <- NULL
list.nd <- NULL
list.test <- NULL
temp.list <- NULL
temp.test <- NULL
temp.name <- NULL
name <- "0"
full.set <- data
max.score <- NULL
# Fill in extra variables if not provided (just as placeholders)
# These variables will be utilized later as survival endpoints
# are included in the available analyses
if(is.null(data$KM.cens)) data$KM.cens <- rep(1, nrow(data))
if(is.null(data$status)) data$status <- rep(1, nrow(data))
if(is.null(data$id)) data$id <- 1:nrow(data)
if(is.null(.subset2(data, col.trt))) stop("treatment argument col.trt not found in data")
if(is.null(.subset2(data, col.prtx))) stop("propensity argument col.prtx not found in data")
if(col.trt != "trt") data$trt <- data[,col.trt]
if(col.prtx != "prtx") data$prtx <- data[,col.prtx]
if(sum(data$trt %in% c(0,1)) != nrow(data)){
data$trt <- ifelse(data$trt == -1, 0, 1)
warning("Assuming trt indicator is of form -1/+1 and changed values to 0/1")
}
if(!is.null(test)){
if(is.null(test$KM.cens)) test$KM.cens <- rep(1, nrow(test))
if(is.null(test$status)) test$status <- rep(1, nrow(test))
if(is.null(test$id)) test$id <- 1:nrow(test)
}
# Model residuals if requested
stabilize.type <- match.arg(stabilize.type)
if(stabilize.type == "rf") require(randomForest)
if(stabilize){
dat.tmp <- data.frame(data[ ,split.var],
y = .subset2(data ,efficacy))
if(stabilize.type == "rf"){
fit <- randomForest(y ~ ., data = dat.tmp)
data$y <- fit$y - fit$predicted
} else if(stabilize.type == "linear"){
fit <- lm(y ~ ., data = dat.tmp)
data$y <- fit$residuals
}
rm(dat.tmp)
if(!is.null(test)){
colnames(test) <- gsub(":x", ".x", colnames(test))
test$y <- test[,efficacy] - predict(fit, test)
colnames(test) <- gsub("[.]x", ":x", colnames(test))
}
fit.y <- fit
rm(fit)
} else{
data$y <- data[ ,efficacy]
if(!is.null(test)){
test$y <- test[, efficacy]
}
}
# Define risk variables
data$r <- .subset2(data ,risk)
if(!is.null(test)){
test$r <- .subset2(test, risk)
}
# record total dataset for spliting
list.nd <- list(data)
if (!is.null(test)) list.test <- list(test)
# loop over dataset for spliting
while(length(list.nd) != 0) {
for(ii in sample(1:length(list.nd), size = length(list.nd))){ # select node
if(!is.null(dim(list.nd[[ii]])) && nrow(list.nd[[ii]]) > 1){ # check that dataset is not degenerate
if (!is.null(test)){ # define test set if requested
test0 <- list.test[[ii]]
} else{
test0 <- NULL
}
# define temp tree structure for lower level splits
if(length(list.nd) <= 1) temp.tree <- NULL
# define temporary list of splitting information
tmp.list <- list(y = .subset2(data, 'y'),
prtx = .subset2(data, 'prtx'),
ae = .subset2(data, 'r'),
trt = .subset2(data, 'trt'),
KM.cens = .subset2(data, 'KM.cens'),
maxRisk = risk.threshold,
status = .subset2(data, 'status'),
n0 = n0,
lambda = lambda)
# Get current value of the objective
if(name[ii] == "0"){
max.score <- max(sapply(0:1, function(iii)
estITR(append(list(z = rep(iii, nrow(data))), tmp.list))))
} else{
trt.pred <- predict.ITR(temp.tree, data, split.var)$trt.pred
max.score <- estITR(append(list(z = trt.pred), tmp.list))
dat.rest <- data.frame(data[,colnames(data) != "trt.new"],
trt.new = trt.pred)
dat.rest <- dat.rest[!dat.rest$id %in% list.nd[[ii]]$id,]
}
rm(tmp.list)
# Determine best split across all covariates
if(!risk.control){
split <- risk.partition.ITR(dat = list.nd[[ii]],
test = test0,
name = name[ii],
min.ndsz = min.ndsz,
n0 = n0,
split.var = split.var,
ctg = ctg,
max.depth = max.depth,
mtry = mtry,
dat.rest = dat.rest,
max.score = max.score,
AIPWE = AIPWE,
risk.threshold = risk.threshold,
use.other.nodes = use.other.nodes,
extremeRandomized = extremeRandomized,
lambda = 0)
} else{
if(is.na(risk.control)) stop("Level for risk must be specified numeric")
split <- risk.partition.ITR(dat = list.nd[[ii]],
test = test0,
name = name[ii],
min.ndsz = min.ndsz,
n0 = n0,
split.var = split.var,
ctg = ctg,
max.depth = max.depth,
mtry = mtry,
dat.rest = dat.rest,
max.score = max.score,
AIPWE = AIPWE,
risk.threshold = risk.threshold,
use.other.nodes = use.other.nodes,
extremeRandomized = extremeRandomized,
lambda = lambda)
}
out <- rbind(out, split$info)
if(!is.null(nrow(split$left))&&!is.null(nrow(split$right))){
min.n <- min(nrow(split$left),nrow(split$right))
}
if (!is.null(split$left) && min.n >= min.ndsz && is.null(test)) {
temp.list <- c(temp.list, list(split$left, split$right))
temp.test <- c(temp.list, list(split$left, split$right))
temp.name <- c(temp.name, split$name.l, split$name.r)
} else if (!is.null(split$left) && min.n >= min.ndsz && !is.null(test) && !is.null(split$left.test)) {
temp.list <- c(temp.list, list(split$left, split$right))
temp.name <- c(temp.name, split$name.l, split$name.r)
temp.test <- c(temp.test, list(split$left.test, split$right.test))
}
}
temp.tree <- out
}
list.nd <- temp.list
list.test <- temp.test
name <- temp.name
temp.tree<-out
temp.list <- NULL
temp.test <- NULL
temp.name <- NULL
}
out <- out[order(out$node), ]
out$var[apply(out, 1, function(j) is.na(de(j['node'], out)[1]))] <- NA
out[is.na(out$var),c('var','vname','cut.1','cut.2','score')] <- NA
if(!is.null(test)){
out[is.na(out$var), c('score.test')] <- NA
}
if(!stabilize) fit.y <- NULL
if(!is.null(test)){
out.tmp <- list(tree = out, y = data$y, y.test = test$y)
i.nodes <- out.tmp$tree$node[!is.na(out.tmp$tree$var)]
names(i.nodes) <- i.nodes
test.value <- sapply(i.nodes, function(jjj){
keep <- c(setdiff(out.tmp$tree$node, de(jjj, out.tmp$tree)), paste0(jjj, 1:2))
tmp.tre <- out.tmp$tree[out.tmp$tree$node %in% keep, ]
tmp.tre[tmp.tre$node %in% paste0(jjj, 1:2), c('var','vname','cut.1','cut.2','score', 'score.test')] <- NA
preds <- predict.ITR(tmp.tre, test, split.var, ctgs = ctg)$trt.pred
score.test <- estITR(list(y = .subset2(test, 'y'),
trt = .subset2(test, 'trt'),
ae = .subset2(test, 'r'),
prtx = .subset2(test, 'prtx'),
status = .subset2(test, 'status'),
KM.cens = .subset2(test, 'KM.cens'),
n0 = 5, z = preds,
lambda = lambda, maxRisk = risk.threshold))
})
out$score.test <- test.value[match(out$node, names(test.value))]
setNames(list(out, data$y, test$y, risk.threshold, data, test, fit.y, split.var),
c("tree", "y", "y.test", "risk.threshold", "data", "test", "fit.y", "split.var"))
} else{
setNames(list(out, data$y, risk.threshold, data, fit.y, split.var),
c("tree", "y", "risk.threshold", "data", "fit.y", "split.var"))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.