######################################
### Random Forest Tree Constructor ###
######################################
#' @title RFTree Constructor
#' @name RFTree-class
#' @rdname RFTree-class
#' @description `RFTree` is the unit component of `RF` and is composed of
#' `RFNode` objects. The tree uses recursive partitioning to determine the
#' best `splitFeature` and `splitValue` at each level, and keeps splitting
#' the dataset until the size limit given by `nodesize` is reached.
#' @slot sampleIndex A list of the indices of observations that are used as
#' the averaging dataset. The indices refer to the original dataset `x` and
#' `y` from the forest. Essentially, `x[sampleIndex, ]` generates the dataset
#' used by the tree.
#' @slot root An `RFNode` object which is the root of the tree. If the class
#' is extended, the list may contain the corresponding extended `RFNode`
#' object.
#' @exportClass RFTree
setClass(Class = "RFTree",
slots = list(sampleIndex = "list",
root = "list"))
#' @title RFTree-Constructor
#' @name RFTree-constructor
#' @rdname RFTree-class
#' @description The internal representation of `RFTree` differs from its
#' input: `sampleIndex` and `nodesize` are stored as two-element lists holding
#' the values for the splitting dataset and the averaging dataset. In the
#' original random forest these are the same, but variants such as the honest
#' random forest may set them differently. The constructor can be overridden
#' to reflect such changes.
#' @param x A data frame of all training predictors.
#' @param y A vector of all training responses.
#' @param mtry The number of variables randomly selected at each split point.
#' The default value is one-third of the total number of features in the
#' training data.
#' @param nodesize The minimum number of observations in a terminal node. The
#' default value is 5.
#' @param sampleIndex A list of the indices of observations that are used as
#' the averaging dataset. The indices refer to the original dataset `x` and
#' `y` from the forest. Essentially, `x[sampleIndex, ]` generates the dataset
#' used by the tree.
#' @param splitrule A string to specify how to find the best split among all
#' candidate feature values. The current version only supports `variance`,
#' which minimizes the overall MSE after splitting. The default value is
#' `variance`.
#' @param categoricalFeatureCols A list of the indices of all categorical
#' columns, used by the tree to detect categorical features.
#' @export RFTree
setGeneric(
name = "RFTree",
def = function(x,
y,
mtry,
nodesize,
sampleIndex,
splitrule,
categoricalFeatureCols) {
standardGeneric("RFTree")
}
)
# ------------------------------------------------------------------------------
#' @title RFTree-Constructor
#' @rdname RFTree-RFTree
#' @description The constructor for the RFTree object
#' @param x A data frame of all training predictors.
#' @param y A vector of all training responses.
#' @param se An optional vector of standard errors corresponding to the
#' training responses. When supplied, observations can be weighted by
#' `1 / se^2` in the split criterion; when `NULL`, all observations are
#' weighted equally.
#' @param mtry The number of variables randomly selected at each split point.
#' The default value is one-third of the total number of features in the
#' training data.
#' @param nodesize The minimum number of observations in a terminal node. The
#' default value is 5.
#' @param sampleIndex A list of the indices of observations that are used as
#' the averaging dataset. The indices refer to the original dataset `x` and
#' `y` from the forest. Essentially, `x[sampleIndex, ]` generates the dataset
#' used by the tree.
#' @param splitrule A string to specify how to find the best split among all
#' candidate feature values. The current version only supports `variance`,
#' which minimizes the overall MSE after splitting. The default value is
#' `variance`.
#' @param categoricalFeatureCols A list of the indices of all categorical
#' columns, used by the tree to detect categorical features.
#' @aliases RFTree RFTree-method
## @useDynLib forestry
#' @return An `RFTree` object.
RFTree <- function(x,
y,
se = NULL,
mtry = max(floor(ncol(x) / 3), 1),
nodesize = 5,
sampleIndex = 1:length(y),
splitrule = "variance",
categoricalFeatureCols = list()) {
# Create the list of nodeSize and sampleIndex
aggregateNodeSize <- list("averagingNodeSize" = nodesize,
"splittingNodeSize" = nodesize)
aggregateSampleIndex <- list("averagingSampleIndex" = sampleIndex,
"splittingSampleIndex" = sampleIndex)
  # Create an original random forest tree.
tree <- new("RFTree",
sampleIndex = aggregateSampleIndex,
root = list())
# Grow the tree.
root <- recursivePartition(
x = x,
y = y,
se = se,
mtry = mtry,
nodesize = aggregateNodeSize,
sampleIndex = aggregateSampleIndex,
splitrule = splitrule,
categoricalFeatureCols = categoricalFeatureCols
)
tree@root <- list("node" = root)
return(tree)
}
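# A minimal usage sketch for the constructor above, on hypothetical toy data.
# It assumes `RFNode` and `selectBestFeatureLinear` (reached through
# `recursivePartition`) are available in the package, and is wrapped in
# `if (FALSE)` so it does not run when this file is sourced.
if (FALSE) {
  set.seed(42)
  x_train <- data.frame(x1 = runif(100), x2 = runif(100))
  y_train <- x_train$x1 + rnorm(100, sd = 0.1)
  # Grow a single tree with the default variance split rule
  tree <- RFTree(x = x_train, y = y_train, nodesize = 5)
  # For an ordinary (non-honest) tree both index lists are identical
  str(tree@sampleIndex)
}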
# ------------------------------------------------------------------------------
################################
### selectBestFeature Method ###
################################
#' selectBestFeature
#' @name selectBestFeature
#' @rdname selectBestFeature-RFTree
#' @description Find the best (`splitFeature`, `splitValue`) pair, where
#' `splitFeature` is one of the features specified by `featureList` and the
#' corresponding `splitValue` minimizes the specified `splitrule`. The
#' implementation differs slightly from the original random forest because
#' the tree holds both an averaging and a splitting dataset: the minimum leaf
#' size is checked against both datasets according to `sampleIndex` and
#' `nodesize`.
#' @param x A data frame of all training predictors.
#' @param y A vector of all training responses.
#' @param se A vector of standard errors corresponding to the training
#' responses; observations are weighted by `1 / se^2` in the split criterion.
#' @param featureList A list of candidate variables at the current split.
#' @param sampleIndex A list of the indices of the observations used in this
#' node and its children. `sampleIndex` contains two keys,
#' `averagingSampleIndex` and `splittingSampleIndex`. `averagingSampleIndex`
#' is used to generate the aggregated prediction for the node.
#' `splittingSampleIndex` is used by `honestRF`, which stores the splitting
#' data separately when creating the tree. By default,
#' `splittingSampleIndex` is the same as `averagingSampleIndex`.
#' @param nodesize The minimum number of observations in a terminal node. This
#' parameter is actually a list containing the values for both
#' `splittingNodeSize` and `averagingNodeSize`.
#' @param splitrule A string to specify how to find the best split among all
#' candidate feature values. The current version only supports `variance`,
#' which minimizes the overall MSE after splitting. The default value is
#' `variance`.
#' @param categoricalFeatureCols A list of the indices of all categorical
#' columns, used by the tree to detect categorical features.
#' @export selectBestFeature
setGeneric(
name = "selectBestFeature",
def = function(x,
y,
se,
featureList,
sampleIndex,
nodesize,
splitrule,
categoricalFeatureCols) {
standardGeneric("selectBestFeature")
}
)
#' @rdname selectBestFeature-RFTree
#' @aliases selectBestFeature selectBestFeature-method
#' @return A list of two outputs: `bestSplitFeature`, the best feature to
#' split on in order to minimize the split loss, and `bestSplitValue`, its
#' corresponding split value.
#' @note This function is deprecated; it has been replaced by the C++
#' version in the package. The functionality and parameters are exactly the
#' same.
selectBestFeature <- function(x,
y,
se,
featureList,
sampleIndex = list(
"averagingSampleIndex" = 1:length(y),
"splittingSampleIndex" = 1:length(y)
),
nodesize = list("splittingNodeSize" = 5,
"averagingNodeSize" = 5),
splitrule = "variance",
categoricalFeatureCols = list()) {
  # Get the number of candidate features
mtry <- length(featureList)
  # Initialize the best split criterion value (higher is better) per feature
bestSplitLossAll <- rep(-Inf, mtry)
bestSplitValueAll <- rep(NA, mtry)
bestSplitFeatureAll <- rep(NA, mtry)
bestSplitCountAll <- rep(0, mtry)
  # Iterate over each selected feature
for (i in 1:mtry) {
currentFeature <- featureList[i]
allUniqueValues <-
sort(as.numeric(union(
unique(x[sampleIndex$splittingSampleIndex, currentFeature]),
unique(x[sampleIndex$averagingSampleIndex, currentFeature]))))
    # If all values of this feature are identical, skip to the next feature
    if (length(allUniqueValues) == 1)
      next
# Test if the current feature is categorical
if (currentFeature %in% categoricalFeatureCols) {
# Go through all the values in the selected feature as the splitting
# point
for (featureValue in allUniqueValues) {
        # Make partitions on the current feature and value in both the
        # splitting and averaging datasets. (For a categorical feature, the
        # left partition contains observations exactly equal to the split
        # value rather than those less than it.)
leftPartitionAveraging <-
x[sampleIndex$averagingSampleIndex, currentFeature] == featureValue
leftPartitionSplitting <-
x[sampleIndex$splittingSampleIndex, currentFeature] == featureValue
leftPartitionCountSplitting <- sum(leftPartitionSplitting)
rightPartitionCountSplitting <- sum(!leftPartitionSplitting)
leftPartitionCountAveraging <- sum(leftPartitionAveraging)
rightPartitionCountAveraging <- sum(!leftPartitionAveraging)
        # Check that each leaf contains at least nodesize observations
        if (min(leftPartitionCountSplitting,
                rightPartitionCountSplitting) < nodesize$splittingNodeSize ||
            min(leftPartitionCountAveraging,
                rightPartitionCountAveraging) < nodesize$averagingNodeSize) {
          next
        }
# Calculate sample mean in both splitting partitions
leftPartitionMean <-
mean(y[sampleIndex$splittingSampleIndex][leftPartitionSplitting])
rightPartitionMean <-
mean(y[sampleIndex$splittingSampleIndex][!leftPartitionSplitting])
        # Compute the split criterion: the count-weighted sum of squared
        # partition means. Because the total sum of squares is fixed,
        # maximizing this quantity is equivalent to minimizing the
        # within-partition sum of squared errors (the variance rule).
muBarSquareSum <-
leftPartitionCountSplitting * leftPartitionMean ^ 2 +
rightPartitionCountSplitting * rightPartitionMean ^ 2
        # Update the best split if this candidate achieves a higher
        # criterion value
if (muBarSquareSum > bestSplitLossAll[i]) {
bestSplitLossAll[i] <- muBarSquareSum
bestSplitFeatureAll[i] <- currentFeature
bestSplitValueAll[i] <- featureValue
bestSplitCountAll[i] <- 1
} else {
# If we are as good as the best split
if (muBarSquareSum == bestSplitLossAll[i]) {
bestSplitCountAll[i] <- bestSplitCountAll[i] + 1
            # Only replace the stored split with probability 1/nseen
            # (reservoir sampling), so ties are broken uniformly at random
if (stats::runif(1, 0, bestSplitCountAll[i]) <= 1) {
bestSplitLossAll[i] <- muBarSquareSum
bestSplitFeatureAll[i] <- currentFeature
bestSplitValueAll[i] <- featureValue
}
}
}
}
      next
}
    # Otherwise, treat the feature as continuous
    # Keep track of the previous feature value
oldFeatureValue <- allUniqueValues[1]
# Go through all the values in the selected feature
for (featureValue in allUniqueValues[-1]) {
# Make partitions on the current feature and value in both splitting
# and averaging dataset.
leftPartitionAveraging <-
x[sampleIndex$averagingSampleIndex, currentFeature] < featureValue
leftPartitionSplitting <-
x[sampleIndex$splittingSampleIndex, currentFeature] < featureValue
leftPartitionCountSplitting <- sum(leftPartitionSplitting)
rightPartitionCountSplitting <- sum(!leftPartitionSplitting)
leftPartitionCountAveraging <- sum(leftPartitionAveraging)
rightPartitionCountAveraging <- sum(!leftPartitionAveraging)
      # Check that each leaf contains at least nodesize observations
      if (min(leftPartitionCountSplitting,
              rightPartitionCountSplitting) < nodesize$splittingNodeSize ||
          min(leftPartitionCountAveraging,
              rightPartitionCountAveraging) < nodesize$averagingNodeSize) {
        # Update the old feature value before proceeding
        oldFeatureValue <- featureValue
        next
      }
      # If no standard errors are supplied, weight all observations equally
if (is.null(se)) {
se <- rep(1, length(y))
}
leftPartition_WEIGHTED_CountSplitting <-
sum(
1 /
(se[sampleIndex$splittingSampleIndex][leftPartitionSplitting]) ^ 2
)
rightPartition_WEIGHTED_CountSplitting <-
sum(
1 /
(se[sampleIndex$splittingSampleIndex][!leftPartitionSplitting]) ^ 2
)
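      # Note: these "counts" are sums of precision weights 1 / se^2, so
      # observations with larger standard errors contribute less; when
      # se = 1 they reduce to the plain partition counts used in the
      # categorical branch above.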
      # Calculate the weighted mean of y in each splitting partition
leftPartition_WEIGHTED_Mean <-
1 / leftPartition_WEIGHTED_CountSplitting *
sum( (
y[sampleIndex$splittingSampleIndex] *
1 / (se[sampleIndex$splittingSampleIndex]) ^ 2
)[leftPartitionSplitting])
rightPartition_WEIGHTED_Mean <-
1 / rightPartition_WEIGHTED_CountSplitting *
sum( (
y[sampleIndex$splittingSampleIndex] *
1 / (se[sampleIndex$splittingSampleIndex]) ^ 2
)[!leftPartitionSplitting])
      # Compute the split criterion: the precision-weighted sum of squared
      # partition means (the weighted analogue of the variance rule)
muBarSquareSum <-
leftPartition_WEIGHTED_CountSplitting *
leftPartition_WEIGHTED_Mean ^ 2 +
rightPartition_WEIGHTED_CountSplitting *
rightPartition_WEIGHTED_Mean ^ 2
      # Update the best split if this candidate achieves a higher
      # criterion value
if (muBarSquareSum > bestSplitLossAll[i]) {
bestSplitLossAll[i] <- muBarSquareSum
bestSplitFeatureAll[i] <- currentFeature
bestSplitValueAll[i] <-
stats::runif(1, oldFeatureValue, featureValue)
bestSplitCountAll[i] <- 1
} else {
# If we are as good as the best split
if (muBarSquareSum == bestSplitLossAll[i]) {
bestSplitCountAll[i] <- bestSplitCountAll[i] + 1
          # Only replace the stored split with probability 1/nseen
          # (reservoir sampling), so ties are broken uniformly at random
if (stats::runif(1, 0, bestSplitCountAll[i]) <= 1) {
bestSplitLossAll[i] <- muBarSquareSum
bestSplitFeatureAll[i] <- currentFeature
bestSplitValueAll[i] <-
stats::runif(1, oldFeatureValue, featureValue)
}
}
}
# Update the old feature value
oldFeatureValue <- featureValue
}
}
# Get the best split values among all features
bestSplitLoss <- max(bestSplitLossAll)
bestFeatureIndex <- which(bestSplitLossAll == bestSplitLoss)
# If we found a feasible splitting point
if (sum(bestSplitCountAll) > 0) {
# If there are multiple best features, sample one according to their
    # frequency of occurrence
elements_to_sample <- rep(bestFeatureIndex,
bestSplitCountAll[bestFeatureIndex])
bestFeatureIndex <- ifelse(
length(elements_to_sample) == 1,
elements_to_sample,
sample(elements_to_sample, 1)
)
  } else {
# If none of the features are possible, return NA
return(list(
"bestSplitFeature" = NA,
"bestSplitValue" = NA
))
}
# Return the best splitFeature and splitValue
return(
list(
"bestSplitFeature" = bestSplitFeatureAll[bestFeatureIndex],
"bestSplitValue" = bestSplitValueAll[bestFeatureIndex]
)
)
}
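# ------------------------------------------------------------------------------
# A small, self-contained sketch of the (deprecated) R split search above,
# on hypothetical toy data. With a clean jump in y between x1 = 10 and
# x1 = 11, the chosen split value should land in that interval. Wrapped in
# `if (FALSE)` so it does not run when this file is sourced.
if (FALSE) {
  set.seed(1)
  x_toy <- data.frame(x1 = 1:20, x2 = runif(20))
  y_toy <- c(rep(0, 10), rep(10, 10)) + rnorm(20, sd = 0.1)
  selectBestFeature(
    x = x_toy,
    y = y_toy,
    se = NULL,        # equal weights for all observations
    featureList = 1:2 # consider both features at this split
  )
  # Expected: list(bestSplitFeature = 1, bestSplitValue in (10, 11))
}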
#################################
### recursivePartition Method ###
#################################
#' recursivePartition-RFTree
#' @name recursivePartition-RFTree
#' @rdname recursivePartition-RFTree
#' @description Grow the decision tree by recursively finding the best split
#' feature and value.
#' @param x A data frame of all training predictors.
#' @param y A vector of all training responses.
#' @param se A vector of standard errors corresponding to the training
#' responses.
#' @param mtry The number of variables randomly selected at each split point.
#' The default value is one-third of the total number of features in the
#' training data.
#' @param sampleIndex A list of the indices of the observations used in this
#' node and its children. `sampleIndex` contains two keys,
#' `averagingSampleIndex` and `splittingSampleIndex`. `averagingSampleIndex`
#' is used to generate the aggregated prediction for the node.
#' `splittingSampleIndex` is used by `honestRF`, which stores the splitting
#' data separately when creating the tree. By default,
#' `splittingSampleIndex` is the same as `averagingSampleIndex`.
#' @param nodesize The minimum number of observations in a terminal node. This
#' parameter is actually a list containing the values for both
#' `splittingNodeSize` and `averagingNodeSize`.
#' @param splitrule A string to specify how to find the best split among all
#' candidate feature values. The current version only supports `variance`,
#' which minimizes the overall MSE after splitting. The default value is
#' `variance`.
#' @param categoricalFeatureCols A list of the indices of all categorical
#' columns, used by the tree to detect categorical features.
#' @return The root `RFNode` of the grown tree.
recursivePartition <- function(x,
y,
se = 1,
mtry = max(floor(ncol(x) / 3), 1),
sampleIndex = list(
"averagingSampleIndex" = 1:length(y),
"splittingSampleIndex" = 1:length(y)
),
nodesize = list("splittingNodeSize" = 5,
"averagingNodeSize" = 5),
splitrule = "variance",
categoricalFeatureCols = list()) {
# Sample features for the split
selectedFeatureIndex <- sample(1:ncol(x), mtry)
  # Load bestSplitFeature and bestSplitValue into the current environment.
  # The best-split search is currently delegated to selectBestFeatureLinear;
  # to switch back to the pure-R search, replace it with selectBestFeature.
list2env(
selectBestFeatureLinear(
x = x,
y = y,
featureList_spit = selectedFeatureIndex,
featureList_lin = 1:ncol(x),
      lambda = 1,
sampleIndex = sampleIndex,
nodesize = nodesize,
categoricalFeatureCols = categoricalFeatureCols
),
environment()
)
# Create a leaf node if the current bestSplitValue is NA
if (is.na(bestSplitValue)) {
return(RFNode(sampleIndex = sampleIndex))
} else {
# Create a tree node
# Update sample index for both left and right partitions
# Test if the current feature is categorical
if (bestSplitFeature %in% categoricalFeatureCols) {
leftPartitionSplitting <-
x[sampleIndex$splittingSampleIndex,
bestSplitFeature] == bestSplitValue
leftPartitionAveraging <-
x[sampleIndex$averagingSampleIndex,
bestSplitFeature] == bestSplitValue
    } else { # continuous feature
leftPartitionSplitting <-
x[sampleIndex$splittingSampleIndex,
bestSplitFeature] < bestSplitValue
leftPartitionAveraging <-
x[sampleIndex$averagingSampleIndex,
bestSplitFeature] < bestSplitValue
}
sampleIndex_left <- list(
"splittingSampleIndex" =
sampleIndex$splittingSampleIndex[leftPartitionSplitting],
"averagingSampleIndex" =
sampleIndex$averagingSampleIndex[leftPartitionAveraging]
)
sampleIndex_right <- list(
"splittingSampleIndex" =
sampleIndex$splittingSampleIndex[!leftPartitionSplitting],
"averagingSampleIndex" =
sampleIndex$averagingSampleIndex[!leftPartitionAveraging]
)
# Recursively grow the tree
leftChild <- recursivePartition(
x = x,
y = y,
se = se,
mtry = mtry,
sampleIndex = sampleIndex_left,
nodesize = nodesize,
splitrule = splitrule,
categoricalFeatureCols = categoricalFeatureCols
)
rightChild <- recursivePartition(
x = x,
y = y,
se = se,
mtry = mtry,
sampleIndex = sampleIndex_right,
nodesize = nodesize,
splitrule = splitrule,
categoricalFeatureCols = categoricalFeatureCols
)
    # Create the internal node that connects to both children
node <- RFNode(
splitFeature = bestSplitFeature,
splitValue = bestSplitValue,
child = list("leftChild" = leftChild,
"rightChild" = rightChild)
)
return(node)
}
}
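# ------------------------------------------------------------------------------
# A sketch of how an honest variant might feed disjoint datasets into
# `recursivePartition` through `sampleIndex` (a hypothetical construction,
# not part of this file's API): half of the observations drive the splits
# while the other half supply the leaf predictions.
if (FALSE) {
  n <- 100
  shuffled <- sample(n)
  honestSampleIndex <- list(
    "splittingSampleIndex" = shuffled[1:(n %/% 2)],
    "averagingSampleIndex" = shuffled[(n %/% 2 + 1):n]
  )
}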
######################
### Predict Method ###
######################
#' predict-RFTree
#' @name predict-RFTree
#' @rdname predict-RFTree
#' @description Return the prediction from the current tree.
#' @param object An `RFTree` object.
#' @param feature.new A data frame of testing predictors.
#' @param x A data frame of all training predictors.
#' @param y A vector of all training responses.
#' @param se An optional vector of standard errors corresponding to the
#' training responses, passed through to the node-level predict method.
#' @param avgfunc An averaging function to average observations in the node. The
#' function is used for prediction. The input of this function should be a
#' data frame of predictors `x` and a vector of outcomes `y`, and the output
#' should be a scalar. The default function takes the mean of the vector `y`.
#' @param categoricalFeatureCols A list of the indices of all categorical
#' columns, used by the tree to detect categorical features.
#' @return A vector of predicted responses.
#' @aliases predict,RFTree-method
#' @exportMethod predict
setMethod(
f = "predict",
signature = "RFTree",
definition = function(object,
feature.new,
x,
y,
avgfunc,
categoricalFeatureCols,
se = NULL) {
return(
predict(
object = object@root$node,
feature.new = feature.new,
x = x,
y = y,
se = se,
avgfunc = avgfunc,
categoricalFeatureCols = categoricalFeatureCols
)
)
}
)
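# A hypothetical end-to-end sketch of the predict method above, assuming a
# tree grown by `RFTree` on the toy data below. The `avgfunc` mirrors the
# documented default of averaging the responses in a leaf. Wrapped in
# `if (FALSE)` so it does not run when this file is sourced.
if (FALSE) {
  set.seed(7)
  x_train <- data.frame(x1 = runif(100), x2 = runif(100))
  y_train <- x_train$x1 + rnorm(100, sd = 0.1)
  tree <- RFTree(x = x_train, y = y_train)
  preds <- predict(
    object = tree,
    feature.new = x_train[1:5, ],
    x = x_train,
    y = y_train,
    avgfunc = function(x, y) mean(y),
    categoricalFeatureCols = list()
  )
}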
#######################
### showTree Method ###
#######################
#' showTree-RFTree
#' @name showTree-RFTree
#' @rdname showTree-RFTree
#' @description Print the entire tree starting from the root node
#' @param object An `RFTree` object
#' @return No return value
#' @exportMethod showTree
setGeneric(
name = "showTree",
def = function(object) {
standardGeneric("showTree")
}
)
#' @rdname showTree-RFTree
#' @aliases showTree
setMethod(
f = "showTree",
signature = "RFTree",
definition = function(object) {
printnode(object@root$node)
}
)
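# A companion sketch to the predict example above: given a grown `tree`,
# `showTree` prints it from the root node via `printnode`.
if (FALSE) {
  showTree(tree)
}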