######################
### avgMean Method ###
######################
#' @title avgMean
#' @description Default leaf aggregator: summarises the observations that fall
#' into a tree leaf by the arithmetic mean of their outcomes.
#' @param x A dataframe of observations in the current tree leaf (not used by
#' this aggregator).
#' @param y A vector of outcomes in the current tree leaf
#' @param y_sd A vector of standard deviations corresponding to the outcomes
#' (not used by this aggregator).
#' @return The mean of `y`.
avgMean <- function(x, y, y_sd = NULL) {
  # Only the outcomes matter here; `x` and `y_sd` exist so that every
  # averaging function shares the same three-argument call shape.
  mean(y)
}
#' @title weightedavgMean
#' @description Leaf aggregator that combines the outcomes in a tree leaf with
#' inverse-variance weights, i.e. each `y[i]` is weighted by `1 / y_sd[i]^2`.
#' @param x A dataframe of observations in the current tree leaf (not used by
#' this aggregator).
#' @param y A vector of outcomes in the current tree leaf
#' @param y_sd A vector. The ith position contains the standard deviation of Y_i
#' @return The inverse-variance weighted average of `y`.
weightedavgMean <- function(x, y, y_sd) {
  # Variance of every outcome; its reciprocal is the weight of that outcome.
  variance <- y_sd ^ 2
  # Weighted sum of the outcomes divided by the total weight.
  sum(y / variance) / sum(1 / variance)
}
######################################
### Random Forest Node Constructor ###
######################################
#' @title RFNode Constructor
#' @name RFNode-class
#' @rdname RFNode-class
#' @description `RFNode` is the basic element inside a `RFTree`. A node is
#' either a leaf or a tree node (non-leaf). A leaf stores the indices of the
#' observations assigned to it, so that `avgfunc` can be called on those
#' observations to produce the prediction of the node. A tree node stores a
#' `splitFeature`, a `splitValue` and a `child` list holding `leftChild` and
#' `rightChild`, which are themselves `RFNode`s. Every leaf carries two index
#' sets inside `sampleIndex`: `averagingSampleIndex`, used to generate the
#' aggregated prediction of the node, and `splittingSampleIndex`, used for
#' `honestRF`, which stores the splitting data used when creating the tree. By
#' default both index sets are the same.
#' @slot sampleIndex A list of index of dataset used in this node and its
#' children. `sampleIndex` contains two keys `averagingSampleIndex`
#' and `splittingSampleIndex`. `averagingSampleIndex` is used to generate
#' aggregated prediction for the node. `splittingSampleIndex` is used for
#' `honestRF` which stores the splitting data when creating the tree. In
#' default, `splittingSampleIndex` is the same as `averagingSampleIndex`.
#' @slot splitFeature Numeric identifier of the feature that is used for
#' splitting in this node.
#' @slot splitValue The value that is used for splitting in this node.
#' @slot child If the node is not a leaf node, the `child` will be a list of two
#' `RFNode` objects. If it is a leaf node, the `child` will be an empty list.
#' @slot nSplit Number of observations in the splitting dataset in this node.
#' @slot nAverage Number of observations in the averaging dataset in this node.
#' @exportClass RFNode
setClass(
  "RFNode",
  slots = c(
    sampleIndex = "list",
    splitFeature = "numeric",
    splitValue = "numeric",
    child = "list",
    nSplit = "numeric",
    nAverage = "numeric"
  )
)
#' @title RFNode Constructor
#' @name RFNode
#' @description A constructor to create a RFNode
#' @param sampleIndex A list of index of dataset used in this node and its
#' children. `sampleIndex` contains two keys `averagingSampleIndex`
#' and `splittingSampleIndex`. `averagingSampleIndex` is used to generate
#' aggregated prediction for the node. `splittingSampleIndex` is used for
#' `honestRF` which stores the splitting data when creating the tree. In
#' default, `splittingSampleIndex` is the same as `averagingSampleIndex`.
#' @param splitFeature Numeric identifier of the feature that is used for
#' splitting in this node.
#' @param splitValue The value that is used for splitting in this node.
#' @param child If the node is not a leaf node, the `child` will be a list of
#' two `RFNode` objects. If it is a leaf node, the `child` will be an empty
#' list.
#' @export RFNode
setGeneric(
  "RFNode",
  function(sampleIndex, splitFeature, splitValue, child) {
    standardGeneric("RFNode")
  }
)
#' @title RFNode Constructor
#' @description Builds a `RFNode`. When `child` is empty a leaf is created that
#' keeps `sampleIndex` and records the sizes of its two index sets; in that
#' case `splitFeature` and `splitValue` are ignored. Otherwise a tree node is
#' created that keeps `splitFeature`, `splitValue` and `child`; in that case
#' `sampleIndex` is ignored and the counts are left empty.
#' @aliases RFNode-method
#' @return a `RFNode` object
RFNode <- function(sampleIndex = list("averagingSampleIndex" = vector(),
                                      "splittingSampleIndex" = vector()),
                   splitFeature = numeric(),
                   splitValue = numeric(),
                   child = list()) {
  if (length(child) != 0) {
    # Tree node: only the split rule and the children are stored.
    new(
      "RFNode",
      sampleIndex = list(),
      splitFeature = splitFeature,
      splitValue = splitValue,
      child = child,
      nSplit = numeric(),
      nAverage = numeric()
    )
  } else {
    # Leaf node: no children were supplied, so keep the sample indices and
    # store how many observations each index set contains.
    new(
      "RFNode",
      sampleIndex = sampleIndex,
      splitFeature = numeric(),
      splitValue = numeric(),
      child = list(),
      nSplit = length(sampleIndex$splittingSampleIndex),
      nAverage = length(sampleIndex$averagingSampleIndex)
    )
  }
}
######################
### Predict Method ###
######################
#' predict-RFNode
#' @name predict-RFNode
#' @description Return the prediction from the current node if it is a leaf
#' node, otherwise it will recursively call its children according to the
#' `splitFeature` and `splitValue`. For a categorical split feature the rows
#' whose value equals `splitValue` go to the left child; for any other feature
#' the rows whose value is strictly smaller than `splitValue` go to the left
#' child. All remaining rows go to the right child.
#' @param object A `RFNode` object.
#' @param feature.new A data frame of testing predictors.
#' @param x A data frame of all training predictors.
#' @param y A vector of all training responses.
#' @param se standard errors of y
#' @param avgfunc An averaging function to average observations in the node. The
#' function is used for prediction. It is called with three arguments: the
#' rows of `x` that belong to the leaf, the matching elements of `y`, and the
#' matching elements of `se`. The output is a scalar.
#' @param categoricalFeatureCols A list of index for all categorical data. Used
#' for trees to detect categorical columns.
#' @return A vector of predicted responses, one per row of `feature.new`.
#' @aliases predict,RFNode-method
#' @exportMethod predict
setMethod(
  f = "predict",
  signature = "RFNode",
  definition = function(object,
                        feature.new,
                        x,
                        y,
                        se = NULL,
                        avgfunc,
                        categoricalFeatureCols) {
    # If the node is a leaf, aggregate all its averaging data samples
    if (length(object@child) == 0) {
      avgIndex <- object@sampleIndex$averagingSampleIndex
      # drop = FALSE keeps a one-column `x` as a data frame, so `avgfunc`
      # always receives the tabular input it is documented to take.
      predicted_value <- avgfunc(
        x[avgIndex, , drop = FALSE],
        y[avgIndex],
        se[avgIndex]
      )
      # Every row that reached this leaf gets the same prediction
      return(rep(predicted_value, nrow(feature.new)))
    }

    # Test if the splitting feature is categorical
    if (object@splitFeature %in% categoricalFeatureCols) {
      leftIndicator <-
        feature.new[, object@splitFeature] == object@splitValue
    } else {
      # For regression, split left and right according to the split point
      leftIndicator <-
        feature.new[, object@splitFeature] < object@splitValue
    }

    # Initialize a vector for predictions
    prediction <- rep(NA, nrow(feature.new))

    # Assign predictions from the left child.
    # drop = FALSE is required: without it a one-column `feature.new` would
    # collapse to a bare vector, and the child call would then fail on
    # `nrow(feature.new)` and on the column lookup.
    if (sum(leftIndicator) > 0) {
      prediction[leftIndicator] <- predict(
        object@child$leftChild,
        feature.new[leftIndicator, , drop = FALSE],
        x,
        y,
        se,
        avgfunc,
        categoricalFeatureCols
      )
    }

    # Assign predictions from the right child
    if (sum(!leftIndicator) > 0) {
      prediction[!leftIndicator] <- predict(
        object@child$rightChild,
        feature.new[!leftIndicator, , drop = FALSE],
        x,
        y,
        se,
        avgfunc,
        categoricalFeatureCols
      )
    }

    # Return the prediction
    return(prediction)
  }
)
########################
### printnode Method ###
########################
#' printnode-RFNode
#' @name printnode
#' @rdname printnode-RFNode
#' @description Print the type of the node together with its summary: the
#' observation counts for a leaf node, or the split feature and split value
#' for a tree node. If it is a tree node, it will recursively print all its
#' children.
#' @param object RFNode object
#' @param indentSpace The indentation space of the current node in the standard
#' output. The initial value is 0. Each level of children is printed with the
#' indentation increased by 2.
#' @exportMethod printnode
setGeneric(
  "printnode",
  function(object, indentSpace = 0) {
    standardGeneric("printnode")
  }
)
#' @name printnode-RFNode
#' @description Prints one line describing the node, prefixed by `indentSpace`
#' blanks. A leaf node shows its splitting and averaging observation counts; a
#' tree node shows its split feature and split value and then prints its left
#' and right children with the indentation increased by 2.
#' @aliases printnode,RFNode-method
setMethod(
  "printnode",
  "RFNode",
  function(object, indentSpace = 0) {
    # Leading blanks that visualise the depth of the node
    indent <- paste(rep(" ", indentSpace), collapse = "")
    hasChildren <- length(object@child) != 0

    if (hasChildren) {
      # Tree node: show the split rule, then descend into both children
      label <- paste(
        indent,
        "Tree Node:",
        "Split feature =",
        object@splitFeature,
        ", Split Value =",
        object@splitValue
      )
      print(label)
      printnode(object@child$leftChild, indentSpace + 2)
      printnode(object@child$rightChild, indentSpace + 2)
    } else {
      # Leaf node: show how many observations it holds
      label <- paste(
        indent,
        "Leaf Node:",
        "# of Split =",
        object@nSplit,
        ", # of Average =",
        object@nAverage
      )
      print(label)
    }
  }
)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.