#' Bayesian Decision Tree Fit
#'
#' Fit a Bayesian Decision Tree with a \code{stats}-like formula interface.
#'
#' @param formuler a \code{stats}-style formula specifying the model, e.g. \code{Y ~ X1 + X2}.
#' @param data the data frame containing the variables in \code{formuler}. Note: the intercept
#' column of the model matrix is dropped internally, so if you want an intercept, you must
#' add it to your data ahead of time.
#' @param d the number of features to subsample at each node. Defaults to \code{NULL}, which tries every feature.
#' @param alpha the prior parameters for the feature probabilities. A \code{[p]} vector. If \code{NULL}, samples uniformly.
#' Defaults to \code{NULL}.
#' @param method whether you want "classification" or "regression". Defaults to \code{"classification"}.
#' @param depth.max the maximum allowed tree depth. Defaults to \code{5L}.
#' @param size the minimum allowed number of samples for an individual node. Defaults to \code{1L}.
#' @param debug whether to save the predictors and responses that are categorized
#' into a particular leaf node. Defaults to \code{FALSE}.
#' @param ... trailing arguments.
#' @return an object of class \code{dec.tree.class} containing the following:
#' \item{\code{tree}}{the decision tree.}
#' \item{\code{X}}{the training predictors.}
#' \item{\code{Y}}{the training responses.}
#' \item{\code{p}}{the number of features.}
#' \item{\code{d}}{the number of features subsampled at each node.}
#' \item{\code{alpha}}{the sampling distribution for the features. A \code{[p]} vector.}
#' \item{\code{depth.max}}{the maximum allowed tree depth.}
#' \item{\code{size}}{the minimum allowed number of samples for an individual node.}
#' \item{\code{debug}}{whether to save the predictors and responses that are categorized into leaf nodes.}
#' \item{\code{formula}}{the formula used to fit the tree.}
#' @importFrom stats model.response model.matrix is.empty.model
#' @author Eric Bridgeford
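#' @examples
#' # A minimal sketch of the formula frontend on the built-in iris data;
#' # the intercept column of the model matrix is dropped internally.
#' \dontrun{
#' fit <- dec_tree.fit(Species ~ ., data=iris, depth.max=3L)
#' fit$tree$feature  # feature index chosen at the root split
#' }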
#' @export
dec_tree.fit <- function(formuler, data=NULL, d=NULL, alpha=NULL, method="classification",
                         depth.max=5L, size=1L, debug=FALSE, ...) {
  # standard stats-style formula handling: build the model frame
  # from the formula and the data
  call <- match.call()
  if (missing(data))
    data <- environment(formuler)
  mf <- match.call(expand.dots = FALSE)
  m <- match(c("formuler", "data"), names(mf), 0L)
  mf <- mf[c(1L, m)]
  mf$drop.unused.levels <- TRUE
  mf[[1L]] <- quote(stats::model.frame)
  mf <- eval(mf, parent.frame())
  mt <- attr(mf, "terms")
  Y <- model.response(mf, "any")
  if (length(dim(Y)) == 1L) {
    nm <- rownames(Y)
    dim(Y) <- NULL
    if (!is.null(nm))
      names(Y) <- nm
  }
  X <- if (!is.empty.model(mt)) {
    model.matrix(mt, mf)
  } else {
    matrix(, NROW(Y), 0L)
  }
  if (method == "classification") {
    # drop the intercept column of the model matrix before fitting
    fit <- do.call(dec.tree.class.fit, list(X[, -1, drop=FALSE], Y, d, alpha, depth.max, size, debug))
    fit$formula <- formuler
  } else {
    stop("Not yet implemented!")
  }
  return(fit)
}
#' Fit a decision tree classifier.
#' @param X the predictors. A \code{[n, p]} matrix.
#' @param Y the responses. A \code{[n]} vector or, optionally, a factor.
#' @param d the number of features to subsample at each node. Defaults to \code{NULL}, which tries every feature.
#' @param alpha the prior parameters for the feature probabilities. A \code{[p]} vector. If \code{NULL}, samples uniformly.
#' Defaults to \code{NULL}.
#' @param depth.max the maximum allowed tree depth. Defaults to \code{5L}.
#' @param size the minimum allowed number of samples for an individual node. Defaults to \code{1L}.
#' @param debug whether to save the predictors and responses that are categorized
#' into a particular leaf node. Defaults to \code{FALSE}.
#' @return an object of class \code{dec.tree.class} containing the following:
#' \item{\code{tree}}{the decision tree.}
#' \item{\code{X}}{the training predictors.}
#' \item{\code{Y}}{the training responses.}
#' \item{\code{p}}{the number of features.}
#' \item{\code{d}}{the number of features subsampled at each node.}
#' \item{\code{alpha}}{the sampling distribution for the features. A \code{[p]} vector.}
#' \item{\code{depth.max}}{the maximum allowed tree depth.}
#' \item{\code{size}}{the minimum allowed number of samples for an individual node.}
#' \item{\code{debug}}{whether to save the predictors and responses that are categorized into leaf nodes.}
#' @author Eric Bridgeford
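#' @examples
#' # A minimal sketch of the matrix interface (assumes the package is loaded):
#' \dontrun{
#' X <- as.matrix(iris[, 1:4]); Y <- iris$Species
#' fit <- dec.tree.class.fit(X, Y, depth.max=4L, size=5L)
#' fit$p  # number of features
#' }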
#' @export
dec.tree.class.fit <- function(X, Y, d=NULL, alpha=NULL, depth.max=5L, size=1L, debug=FALSE) {
  Y <- as.factor(Y)
  n <- length(Y); p <- dim(X)[2]
  # d should be NULL (use every feature) or a positive integer <= p
  if (!(is.null(d) || (is.integer(d) && d > 0 && d <= p))) {
    stop("d should be a positive integer <= p, or NULL to indicate to sample every feature.")
  }
  if (is.null(d)) {
    d <- as.integer(p)
  }
  if (!(is.integer(depth.max) && depth.max > 0)) {
    stop("You have not entered a valid value for depth.max. Should be a positive integer.")
  }
  if (!(is.integer(size) && size > 0)) {
    stop("You have not passed a valid value for size. Should be a positive integer.")
  }
  # if alpha is NULL, sample uniformly
  if (!(is.null(alpha) || (is.numeric(alpha) && all(alpha > 0)))) {
    stop("You have not entered a valid Dirichlet prior. Should be NULL, or a p vector with all values > 0.")
  }
  # recursively grow the tree from the best root split
  tree <- build.tree(get.split(X, Y, d, alpha), d, alpha, depth.max, size, 1, debug)
  return(structure(
    list(tree=tree, X=X, Y=Y, p=p, d=d, alpha=alpha, depth.max=depth.max, size=size, debug=debug),
    class="dec.tree.class"
  ))
}
#' Recursive Approach for Building Decision Tree
#' @param split a particular split node.
#' @param d the number of features to subsample.
#' @param alpha the sampling distribution for the features. A \code{[p]} vector. If \code{NULL}, uses a uniform.
#' @param depth.max the max tree depth.
#' @param size the minimum number of elements at a particular node.
#' @param depth the current depth of the tree.
#' @param debug a boolean indicating whether to save the predictors and responses
#' that are categorized into leaf nodes.
#' @importFrom forcats fct_c
#' @return a layer of a decision tree.
#' @author Eric Bridgeford
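#' @examples
#' # A sketch of growing a stump by hand on toy data (internal use):
#' \dontrun{
#' X <- matrix(rnorm(40), nrow=20); Y <- factor(rep(c("a", "b"), each=10))
#' stump <- build.tree(get.split(X, Y, 2L, NULL), d=2L, alpha=NULL,
#'                     depth.max=2L, size=1L, depth=1)
#' stump$feature; stump$threshold  # the decision rule at the root
#' }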
build.tree <- function(split, d, alpha, depth.max, size, depth, debug=FALSE) {
  # if we have an empty child, create a leaf by merging
  # so that we don't have a totally empty child
  if (length(split$left$Y) == 0 || length(split$right$Y) == 0) {
    if (!debug) {
      return(leaf.node(fct_c(split$left$Y, split$right$Y)))
    } else {
      return(c(leaf.node(fct_c(split$left$Y, split$right$Y)),
               list(X=rbind(split$left$X, split$right$X),
                    Y=fct_c(split$left$Y, split$right$Y))))
    }
  }
  # if we are at the max depth, create a leaf for the left and
  # right children
  if (depth >= depth.max) {
    if (!debug) {
      split$left <- leaf.node(split$left$Y)
    } else {
      split$left <- c(leaf.node(split$left$Y),
                      list(X=split$left$X, Y=split$left$Y))
    }
    if (!debug) {
      split$right <- leaf.node(split$right$Y)
    } else {
      split$right <- c(leaf.node(split$right$Y),
                       list(X=split$right$X, Y=split$right$Y))
    }
    return(split)
  }
  # process right child
  if (length(split$right$Y) > size) {
    # split the node if we can still do better
    split$right <- build.tree(
      get.split(split$right$X, split$right$Y, d, alpha),
      d, alpha, depth.max, size, depth + 1, debug=debug
    )
  } else {
    if (!debug) {
      split$right <- leaf.node(split$right$Y)
    } else {
      split$right <- c(leaf.node(split$right$Y),
                       list(X=split$right$X, Y=split$right$Y))
    }
  }
  # process left child
  if (length(split$left$Y) > size) {
    # split the node if we can still do better
    split$left <- build.tree(
      get.split(split$left$X, split$left$Y, d, alpha),
      d, alpha, depth.max, size, depth + 1, debug=debug
    )
  } else {
    if (!debug) {
      split$left <- leaf.node(split$left$Y)
    } else {
      split$left <- c(leaf.node(split$left$Y),
                      list(X=split$left$X, Y=split$left$Y))
    }
  }
  return(split)
}
#' Create a new split of the data.
#' @param X the predictors.
#' @param Y the responses.
#' @param i the feature index.
#' @param t the threshold for the feature.
#' @return the split, as a list.
#' @author Eric Bridgeford
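#' @examples
#' # A sketch: rows with feature i below the threshold go left (internal use):
#' \dontrun{
#' X <- matrix(1:10, ncol=2); Y <- factor(c("a", "a", "b", "b", "b"))
#' sp <- create.split(X, Y, i=1, t=3)
#' sp$left$Y   # the two samples with X[, 1] < 3
#' }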
create.split <- function(X, Y, i, t) {
  # split based on the feature of interest and threshold
  # of interest; drop=FALSE keeps X a matrix when a child
  # has a single row
  idx <- X[,i] < t
  # return split as a list
  split <- structure(list(left=list(X=X[idx, , drop=FALSE], Y=Y[idx]),
                          right=list(X=X[!idx, , drop=FALSE], Y=Y[!idx]),
                          feature=i, threshold=t, n=length(Y)),
                     class="split.node")
  return(split)
}
#' Find Best split in the data.
#'
#' @param X the predictors.
#' @param Y the responses.
#' @param d the number of features to subsample. Should be an integer 0 < d <= p.
#' @param alpha the sampling distribution for the features. A \code{[p]} vector. If \code{NULL}, uses a uniform.
#' @return the best split.
#' @importFrom MCMCpack rdirichlet
#' @author Eric Bridgeford
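#' @examples
#' # A sketch on linearly separable toy data (internal use); the best
#' # split should recover the separating feature:
#' \dontrun{
#' X <- cbind(c(rnorm(10), rnorm(10, mean=5)), rnorm(20))
#' Y <- factor(rep(c("a", "b"), each=10))
#' sp <- get.split(X, Y, d=2L, alpha=NULL)
#' sp$feature  # likely feature 1
#' }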
get.split <- function(X, Y, d, alpha) {
  n <- length(Y); p <- dim(X)[2]
  # sample the d candidate features, either uniformly or with
  # probabilities drawn from the Dirichlet prior
  if (is.null(alpha)) {
    features <- sample(1:p, d, replace=FALSE)
  } else {
    feat.probs <- rdirichlet(1, alpha)
    features <- sample(1:p, d, replace=FALSE, prob=as.numeric(feat.probs))
  }
  # initialize the best split with a score above the maximum
  # attainable Gini impurity of 1
  best_split <- list(X=X, Y=Y, t=NULL, feature=NULL, score=1.1, split=NULL)
  # loop over subsampled features
  for (feature in features) {
    # check all observed values of the feature of interest as
    # candidate thresholds
    for (t in X[,feature]) {
      # split on feature, t
      sp <- create.split(X, Y, feature, t)
      # get impurity score
      sp$score <- impurity.idx(sp)
      # save the best split as the one that minimizes
      # the impurity score
      if (sp$score < best_split$score) {
        best_split <- sp
      }
    }
  }
  return(best_split)
}
#' A function for computing the Gini impurity of a particular split.
#' @param groups a split, as a list with elements \code{left} and \code{right} containing
#' the samples on each side of the split.
#' @return the Gini impurity index, weighted across the two children.
#' @author Eric Bridgeford
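#' @examples
#' # A worked sketch: a perfectly pure split has Gini impurity 0 (internal use):
#' \dontrun{
#' sp <- create.split(matrix(1:4, ncol=1), factor(c("a", "a", "b", "b")),
#'                    i=1, t=3)
#' impurity.idx(sp)  # 0, since each child contains a single class
#' }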
impurity.idx <- function(groups) {
  groups <- lapply(groups[c("left", "right")], function(group) return(group$Y))
  # the total number of observations across groups
  N <- sum(sapply(groups, function(group) length(group)))
  sum(sapply(groups, function(group) {
    # the total number of observations within group
    n <- length(group)
    # if group is empty, do not factor into gini
    # computation
    if (n == 0) {
      return(0)
    }
    # the unique ys
    ys <- levels(group)
    # compute the squared proportion of each y in this particular group
    sc <- sum(sapply(ys, function(y) {
      mean(group == y)^2
    }))
    return((1 - sc)*n/N)
  }))
}
#' Return Most Probable Class at a Leaf Node
#'
#' @param Y the responses at a leaf node, as a \code{[n]} factor.
#' @return the most probable response at the leaf.
#' @author Eric Bridgeford
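#' @examples
#' # A sketch of the majority vote at a leaf (internal use):
#' \dontrun{
#' leaf.node(factor(c("a", "a", "b")))$vote  # "a", the most frequent class
#' }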
leaf.node <- function(Y) {
  gr.ct <- sapply(levels(Y), function(y) {
    sum(Y == y)
  })
  return(structure(list(vote=factor(levels(Y)[which.max(gr.ct)],
                                    levels=levels(Y))), class="leaf.node"))
}
#' Count Features in a Decision Tree
#'
#' A function to count the feature utilization in a Decision Tree.
#'
#' @param object an object of class \code{dec.tree.class}.
#' @param ... trailing args.
#' @return a vector containing the number of times each feature
#' is used at a split node.
#' @author Eric Bridgeford
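#' @examples
#' # A sketch of counting per-feature split usage after a fit; assumes
#' # dispatch through a count.features generic defined elsewhere in the package:
#' \dontrun{
#' fit <- dec.tree.class.fit(as.matrix(iris[, 1:4]), iris$Species)
#' count.features(fit)  # one count per feature
#' }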
#' @export
count.features.dec.tree.class <- function(object, ...) {
  feat.usage <- table(count.features(object$tree))
  counts <- rep(0, object$p)
  names(counts) <- 1:object$p
  counts[names(feat.usage)] <- as.numeric(feat.usage)
  return(counts)
}
#' Count Features Leaf
#'
#' A default method that returns \code{NULL} to stop the recursion when feature
#' counting in a stump reaches a leaf node.
#'
#' @param object an object of class \code{leaf.node}.
#' @param ... trailing args.
#' @return \code{NULL}, terminating the recursion.
#' @author Eric Bridgeford
count.features.leaf.node <- function(object, ...) {
  return(NULL)
}
#' Count Features Stump
#'
#' A function to count the usage of features at or below a split node (a stump).
#'
#' @param object an object of class \code{split.node}.
#' @param ... trailing args.
#' @return a vector containing the number of times each feature is used
#' at a split node.
#' @author Eric Bridgeford
count.features.split.node <- function(object, ...) {
  return(c(object$feature, count.features(object$left), count.features(object$right)))
}