max.subtree.rfsrc <- function(object,
max.order = 2,
sub.order = FALSE,
conservative = FALSE,
...)
{
## Incoming parameter checks. All are fatal.
if (is.null(object)) stop("Object is empty!")
if (sum(inherits(object, c("rfsrc", "grow"), TRUE) == c(1, 2)) != 2 &
sum(inherits(object, c("rfsrc", "forest"), TRUE) == c(1, 2)) != 2)
stop("This function only works for objects of class `(rfsrc, grow)' or '(rfsrc, forest)'")
## Acquire the forest
if (sum(inherits(object, c("rfsrc", "grow"), TRUE) == c(1, 2)) == 2) {
if (is.null(object$forest))
stop("Forest is empty! Re-run grow call with forest set to 'TRUE'.")
object <- object$forest
}
nativeArray <- object$nativeArray
if (is.null(nativeArray)) {
stop("RFSRC nativeArray content is NULL. Please ensure the object is valid.")
}
xvar.names <- object$xvar.names
if (is.null(xvar.names)) {
stop("RFSRC xvar.names content is NULL. Please ensure the object is valid.")
}
if (is.null(object$xvar)) {
stop("RFSRC xvar content is NULL. Please ensure the object is valid.")
}
## verify the max.order option
max.order <- floor(max.order)
if (max.order < 0) {
stop("RFSRC 'max.order' requested for distance order statistic must be an integer greater than zero (0).")
}
## max.order=0 defaults to non-conservative thresholding
if (max.order == 0) {
conservative <- FALSE
}
## Maximum depth monitored for nodes at depth counts
## TBD Increased from 1000 to 10000 but need a better way to set this value. TBD
MAX.DEPTH <- 10000
## Count the number of trees in the forest.
numTree <- length(as.vector(unique(nativeArray$treeID)))
## Count the number of parameters in the data set.
numParm <- length(xvar.names)
## Determine the sample size of the data set.
numSamp <- nrow(object$xvar)
## ------------------------------------------------------------------
## The maximal distance from the root node of a tree or sub-tree is
## zero-based. This means that the root node represents a distance
## of zero (0).
##
## $count - of length [numParm].
## - integer count of the number of maximal subtrees for each
## xvar in each tree.
## - this will be zero (0) if the xvar is not used in
## the forest.
## $order - if max.order > 0, matrix of dim [numParm] x [max.order]
## representing the order statistic for the maximal
## subtree distance for each xvar
## - if max.order == 0, matrix of [numParm] x [numTree]
## representing the first order statistic for the maximal
## subtree distance for each xvar by tree
## - rows represent the distribution for the order
## statistic for a xvar
## - thus [2][1] gives the minimum distance for xvar 2.
## - elements will have the value (maximum + 1), the
## penalty, if the xvar is not used in the tree, or
## the statistic does not exist
## $meanSum - vector of length [numParm]
## - the pre-mean mimimum maximal subtree distance for each
## xvar in each tree.
## - if the xvar is not used in the tree, this will be
## the maximum depth for the tree, the penalty.
## $depth - integer depth (defined as the maximum terminal node
## distance from the root node) possible in the tree
## topology. This will be always greater than zero (0)
## for non-trivial trees
## $terminalDepthSum
## - integer sum of the depths of the terminal
## nodes of the tree
## - used in calculating the average depth of the
## nodes for tree
## $subOrder
## - matrix of [numParm] x [numParm] representing the
## normalized minimum maximal w-subtree distance for
## parameter v's maximal subtree.
## - thus if v=2, w=1, consider a maximal 2-subtree with
## the "root" node of this subtree associated with
## relative depth = 0. Then consider a 1-subtree of the
## 2-subtree. Then, [2][1] gives the normalized minimum
## maximal subtree distance of all 1-subtrees within the
## 2-subtree. If the maximal w-subtree does not exist,
## the distance is set to be one (1). The normalized
## distance is calculated as follows. First, the
## maximum depth of the terminal nodes of the v-subtree
## is determined. Then the relative depth of the
## w-subtree (from the root node) of the v-subtree is
## determined. This latter quantity is divided by the
## prior maximum terminal node depth quantity to give a
## number between (0,1].
## - the diagonal [i][i] is the normalized minimum maximal
## v-subtree distance for the parameter i. If it is not
## split on in the tree, it is assigned the value one
## (1).
## - the resulting matrix is NA-free.
## $subOrderDiag
## - interim vector of length [numParm] representing the
## minimum maximal v-subtree distance for the tree that
## will constitute the diagonal of $subOrder.
## - variables that are not split on are assigned the
## penalty value of the maximum depth.
## $nodesAtDepth
## - number of nodes at each depth, where [1] = 2 for
## a non-trivial tree, [2] = 4 for a balanced tree.
## In general, for a balanced tree, at depth d, the
## count is 2^d.
##
## --------------------------------------------------------------
## Loop through all trees
## This is a LOCAL operation
## Global dependencies: (xvar.names, forest)
##
## It is inefficient to use a do-loop, therefore we use lapply
## or mclapply. Later we parse the list to extract the necessary
## objects
## --------------------------------------------------------------
subtree.obj <- mclapply(1:numTree, function(b) {
## Create the (local) subtree object for recursion. This is NOT the
## output object, but is closely related to it.
subtree <- vector("list", 8)
names(subtree) <- c("count",
"order",
"meanSum",
"depth",
"terminalDepthSum",
"subOrder",
"subOrderDiag",
"nodesAtDepth")
## Create the recursive output object. This would be unnecessary
## if it was possible to declare global variables in a package.
recursiveObject <- list(offset = min(which(nativeArray$treeID == b)),
subtree = subtree,
diagnostic = 0,
diagnostic2 = 0)
## Reset the nodes at depth count.
recursiveObject$subtree$nodesAtDepth <- rep(NA, MAX.DEPTH)
## Reset the mean maximal subtree distance.
## This will be NA if the xvar is not used in this tree.
recursiveObject$subtree$meanSum <- rep(NA, numParm)
## Reset the order statistic matrix representing maximal distance.
recursiveObject$subtree$order <- matrix(NA, nrow=numParm, ncol=max(max.order, 1))
if (sub.order) {
## Reset the sub-order statistic matrix representing the minimum
## maximal w-subtree distance for v != w.
recursiveObject$subtree$subOrder <- matrix(1.0, nrow=numParm, ncol=numParm)
## Reset the subOrder diagonals to NA.
recursiveObject$subtree$subOrderDiag <- rep(NA, numParm)
}
## Reset the maximum maximal subtree distance.
recursiveObject$subtree$depth <- 0
## Reset the terminal node depth sums.
recursiveObject$subtree$terminalDepthSum <- 0
## Reset the count of the maximal subtrees in the tree.
recursiveObject$subtree$count <- rep(0, numParm)
## Identify the root node split parameter.
rootParmID <- nativeArray$parmID[recursiveObject$offset]
## Save the previous value of the offset. This is used in
## determining the number of terminal nodes in the tree.
offsetMark <- recursiveObject$offset
## Set the stump count. Gets updated when the tree is a stump.
stumpCnt <- 0
## ----------------------------
## PRIMARY RECURSION PRIMARY
## ----------------------------
## Recursively parse the tree in the primary protocol.
recursiveObject <- rfsrcParseTree(
recursiveObject,
max(max.order, 1),
sub.order,
nativeArray,
b,
distance=0,
subtreeFlag=rep(FALSE, numParm))
## Check that the current tree is not a stump.
if (rootParmID != 0) {
## Update the forest sum ...
## Determine which xvar _were_not_ split on, in the current tree.
index <- which(recursiveObject$subtree$count == 0)
## Penalize these unused xvar by making their mean distance the worst.
recursiveObject$subtree$meanSum[index] <- recursiveObject$subtree$depth
forestMeanSum <- recursiveObject$subtree$meanSum
## This is tree specific. Here we view the matrix as a vector.
index <- which(is.na(recursiveObject$subtree$order))
## Penalize these order statistics making their values the worst.
recursiveObject$subtree$order[index] <- recursiveObject$subtree$depth
orderTree <- recursiveObject$subtree$order
## Explanation of demominator: The total number of nodes N
## (internal and external) in a tree is given by the number of
## records in a tree. In this case, offset - offsetMark = N,
## due to the nature of the pointer incrementation. The number
## of external (terminal) nodes N(TERM) is given by (N + 1) / 2.
## The maximum number of subtrees for the tree is given by
## N(TERM) / 2 which is the denominator.
## We normalize the actual number of maximal subtrees found in
## the tree by the maximum number maximal subtrees possible.
subtreeCountSum <- (recursiveObject$subtree$count / ((recursiveObject$offset - offsetMark + 1) / 4))
## Get the average depth of the terminal nodes of this tree.
## The denominator is just the number of terminal nodes.
terminalDepth <- recursiveObject$subtree$terminalDepthSum / ((recursiveObject$offset - offsetMark + 1) / 2)
if (sub.order) {
## Determine which xvar _were_ split on, in the current tree.
index <- which(recursiveObject$subtree$count > 0)
## Over-ride those diagonal elements with the minimum depth encountered.
diag(recursiveObject$subtree$subOrder)[index] <- recursiveObject$subtree$subOrderDiag[index]
## Determine which xvar _were_not_ split on, in the
## current tree. This can also be determined by examining the
## diagonal vector, and noting those elements with the value
## NA
index <- which(recursiveObject$subtree$count == 0)
## Over-ride those missing diagonal elements with penalty value.
diag(recursiveObject$subtree$subOrder)[index] <- recursiveObject$subtree$depth
## Normalize the depths.
diag(recursiveObject$subtree$subOrder) <- diag(recursiveObject$subtree$subOrder) / recursiveObject$subtree$depth
## Add the result to the forest sum.
subOrderSum <- recursiveObject$subtree$subOrder
}
else {
subOrderSum <- NULL
}
## Update the nodes at depth count.
nodesAtDepthMatrix <- recursiveObject$subtree$nodesAtDepth
}
else {
## The tree is a stump. It will be excluded.
stumpCnt <- 1
## All other parameters are set to NULL
forestMeanSum <- orderTree <- subtreeCountSum <-
terminalDepth <- subOrderSum <- nodesAtDepthMatrix <- NULL
}
## return the local subtree objects for parsing
return(list(forestMeanSum = forestMeanSum,
orderTree = orderTree,
subtreeCountSum = subtreeCountSum,
terminalDepth = terminalDepth,
subOrderSum = subOrderSum,
stumpCnt = stumpCnt,
nodesAtDepthMatrix = nodesAtDepthMatrix))
})
## ---------------------------------------------------------------------
## Parse the local object
## Pre-mean maximal subtree distance by parameter over the forest.
forestMeanSum <- rep(0, numParm)
## This is by tree over the forest.
## Minimum maximal subree distance by parameter by tree over the forest.
order.tree <- matrix(NA, nrow=numParm, ncol=numTree)
## Order statistics by parameter of maximal subtree distance.
if (max.order > 0) {
orderSum <- matrix(0, nrow=numParm, ncol=max.order)
}
## Normalized maximal subtree count sum over the forest. Linked to $count.
subtreeCountSum <- rep(0, numParm)
## Average depth of each tree. Linked to $terminalDepthSum
terminalDepth <- rep(0, numTree)
## Sub-order interim sum.
subOrderSum <- matrix(0, nrow=numParm, ncol=numParm)
## Final list of count of nodes at depth by tree.
nodesAtDepthMatrix <- matrix(NA, nrow = MAX.DEPTH, ncol = numTree)
## Initialize the stump count
stumpCnt <- 0
## iterate over trees
for (b in 1:numTree) {
local.obj <- subtree.obj[[b]]
## assignments only apply when the tree is not a stump
if (local.obj$stumpCnt == 0) {
forestMeanSum <- forestMeanSum <- forestMeanSum + local.obj$forestMeanSum
if (max.order > 0) {
orderSum <- orderSum + local.obj$orderTree
order.tree[ , b] <- local.obj$orderTree[, 1]
}
else {
order.tree[ , b] <- local.obj$orderTree
}
subtreeCountSum <- subtreeCountSum + local.obj$subtreeCountSum
terminalDepth[b] <- local.obj$terminalDepth
if (sub.order == TRUE) {
subOrderSum <- subOrderSum + local.obj$subOrderSum
}
nodesAtDepthMatrix[, b] <- local.obj$nodesAtDepthMatrix
}
## the tree is a stump
else {
stumpCnt <- stumpCnt + 1
}
}
## ---------------------------------------------------------------------
## Output preparation
##
##
## Prepare the ensemble values for the output object.
##
## $mean - vector of length [numParm]
## - mean minimum maximal subtree distance by parameter over the
## forest.
## $order - if max.order > 0, matrix of dim [numParm] x [max.order]
## representing the order statistic for the maximal
## subtree distance for each xvar over the forest
## - if max.order == 0, matrix of [numParm] x [numTree]
## representing the first order statistic for the maximal
## subtree distance for each xvar by tree
## $count - vector of length [numParm]
## - normalized maximal subtree count by parameter over the
## forest
## - normalization is by the number of topologically possible
## subtrees in each tree
## $sub.order - matrix of [numParm] x [numParm] representing the
## normalized minimum maximal w-subtree distance for
## parameter v's maximal subtree averaged over all trees.
## - see the tree specific description for details.
## - output if sub.order=TRUE
## $terminal - vector of length [numTree]
## - average terminal node depth of each tree.
## $nodes.at.depth - matrix of [MAX.DEPTH] x [numTree]
## - number of nodes at each depth, where [1] = 2 for
## a non-trivial tree, [2] = 4 for a balanced tree.
## In general, for a balanced tree, at depth d, the
## count is [d] = 2^d.
nameVector <- c("mean",
"order",
"count",
"terminal",
"nodes.at.depth",
"sub.order",
"threshold",
"threshold.1se",
"topvars",
"topvars.1se",
"percentile",
"density")
result <- vector("list", length(nameVector))
names(result) <- nameVector
## Precautionary check for all stumps.
if(numTree != stumpCnt) {
## Attach the average terminal node depth to the output object.
result$terminal <- terminalDepth
## Determine the forest average maximal subtree distance.
result$mean <- forestMeanSum / (numTree - stumpCnt)
names(result$mean) <- xvar.names
## Minimal depth by variable by tree
minDepthVarTree <- order.tree
if (max.order > 0) {
## Summarize the order statistics containing the maximal subtree distances.
result$order <- orderSum / (numTree - stumpCnt)
rownames(result$order) <- xvar.names
}
else {
result$order <- minDepthVarTree
rownames(result$order) <- xvar.names
}
## Determine the forest average subtree count.
result$count <- subtreeCountSum / (numTree - stumpCnt)
names(result$count) <- xvar.names
## Copy the matrix for the number of nodes at each depth.
result$nodes.at.depth <- nodesAtDepthMatrix
if (sub.order == TRUE) {
## Calculate the forest average of the minimum maximal w-subtree distances.
result$sub.order <- subOrderSum / (numTree - stumpCnt)
rownames(result$sub.order) <- xvar.names
colnames(result$sub.order) <- xvar.names
}
}
## print (recursiveObject$diagnostic)
## print (recursiveObject$diagnostic2)
## Minimal depth for thresholding weak variables
if (conservative == TRUE) {
parseDepthOrderObj <- parseDepthOrder(result, max.order)
if (!is.null(parseDepthOrderObj)) {
result$threshold <- parseDepthOrderObj$first.moment
result$threshold.1se <- result$threshold + parseDepthOrderObj$sd / sqrt(numSamp)
result$density <- parseDepthOrderObj$prob
result$second.order.threshold <- parseDepthOrderObj$second.order.moment
}
}
else {
parseDepthOrderObj <- parseDepthOrder(result, 0)
if (!is.null(parseDepthOrderObj)) {
threshold.con <- parseDepthOrder(result, 1)$first.moment
threshold.lib <- mean(parseDepthOrderObj$first.moment, na.rm = TRUE)
result$threshold <- ifelse(threshold.lib - 0.5 > threshold.con, threshold.lib - 0.5, threshold.lib)
result$threshold.1se <- result$threshold + mean(parseDepthOrderObj$sd, na.rm = TRUE) / sqrt(numSamp)
result$density <- apply(parseDepthOrderObj$prob, 1, mean, na.rm = TRUE)
result$second.order.threshold <- mean(parseDepthOrderObj$second.order.moment, na.rm = TRUE)
}
}
if (!is.null(result$density)) {
names(result$density) <- paste(0:(length(result$density) - 1))
}
## variable selection
if (max.order > 0) {
minDepthVar <- result$order[, 1]
}
else {
if (!is.null(result$order)) {
minDepthVar <- apply(minDepthVarTree, 1, mean, na.rm = TRUE)
}
else {
minDepthVar <- NULL
}
}
top.pt <- minDepthVar <= result$threshold
top.pt[is.na(top.pt)] <- FALSE
if (sum(top.pt) > 0) {
result$topvars <- xvar.names[top.pt]
}
top.1se.pt <- minDepthVar <= result$threshold.1se
top.1se.pt[is.na(top.1se.pt)] <- FALSE
if (sum(top.1se.pt) > 0) {
result$topvars.1se <- xvar.names[top.1se.pt]
}
## minimal depth percentile
if (!is.null(result$order)) {
cdf <- cumsum(result$density)
if (conservative == TRUE) {
result$percentile <- cdf[pmax(round(minDepthVar), 1)]
}
else {
result$percentile <- apply(rbind(sapply(1:ncol(minDepthVarTree), function(b) {
cdf[pmax(minDepthVarTree[, b], 1)]
})), 1, mean, na.rm = TRUE)
}
names(result$percentile) <- xvar.names
}
## discard values no longer needed
result$terminal <- result$mean <- NULL
## exit; invisible return
invisible(result)
}
## -----------------------------------------------------------------------
##
## PRIMARY RECURSION
##
## -----------------------------------------------------------------------
## Recursive function to determine first order maximal distances.
rfsrcParseTree <- function(recursiveObject,
max.order,
sub.order,
nativeArray,
b,
distance,
subtreeFlag) {
## Diagnostic count of calls.
recursiveObject$diagnostic <- recursiveObject$diagnostic + 1
## Weak consistency check to ensure that the iteration matches the treeID in the nativeArray record.
if(b != nativeArray$treeID[recursiveObject$offset]) {
stop("Invalid nativeArray input record (treeID) at ", recursiveObject$offset, ". Please contact Technical Support.")
}
## Check if we are at the root node. Otherwise increment the node at depth count.
if (distance > 0) {
## We are not at the root node.
if (distance <= length(recursiveObject$subtree$nodesAtDepth)) {
if (is.na(recursiveObject$subtree$nodesAtDepth[distance])) {
## This is the first split at this depth, so initialize the nodes at depth count.
recursiveObject$subtree$nodesAtDepth[distance] <- 1
}
else {
## Increment the count of nodes at this depth.
recursiveObject$subtree$nodesAtDepth[distance] <- recursiveObject$subtree$nodesAtDepth[distance] + 1
}
}
}
## Read the current nativeArray split parameter.
splitParameter <- nativeArray$parmID[recursiveObject$offset]
## Determine whether this is a terminal node.
if (splitParameter == 0) {
terminalFlag <- TRUE
}
else if (splitParameter != 0) {
terminalFlag <- FALSE
}
if (!terminalFlag) {
## Update the maximal subtree information for the parameter if it
## has not already been encountered in the tree.
if (subtreeFlag[splitParameter] == FALSE) {
## Increment the subtree count for the parameter.
recursiveObject$subtree$count[splitParameter] <- recursiveObject$subtree$count[splitParameter] + 1
## Update the (pre) mean distance.
if (is.na(recursiveObject$subtree$meanSum[splitParameter])) {
recursiveObject$subtree$meanSum[splitParameter] <- distance
}
else {
recursiveObject$subtree$meanSum[splitParameter] <- recursiveObject$subtree$meanSum[splitParameter] + distance
}
if (max.order > 0) {
## Update distance encountered for the order statistic.
## The last element, NA, in the temporary vector is a filler to handle which.max() silliness.
orderVector <- c(recursiveObject$subtree$order[splitParameter, ], distance, NA)
## Identify the first NA occurence.
index <- which.max(is.na(orderVector))
## The first (relevant) NA is filled with the current distance.
orderVector[index] <- distance
## The vector is then sorted.
sortedVector <- sort(orderVector[1:index])
## Restore the length of the order vector, if necessary, by apppending NA's.
if (index <= max.order) {
orderVector <- c(sortedVector, rep(NA, max.order-index))
}
else {
orderVector <- sortedVector[1:max.order]
}
## Update the order statistic.
recursiveObject$subtree$order[splitParameter, ] <- orderVector
}
else {
## Update the minimum distance encountered for the parameter.
if (is.na(recursiveObject$subtree$order[splitParameter])) {
recursiveObject$subtree$order[splitParameter] <- distance
}
else {
recursiveObject$subtree$order[splitParameter] <- min(recursiveObject$order[splitParameter], distance, na.rm = TRUE)
}
}
## Indicate that the parameter has been split on.
subtreeFlag[splitParameter] <- TRUE
if (sub.order == TRUE) {
## Update the diagonal element with the minimum distance encountered for the parameter.
if (is.na(recursiveObject$subtree$subOrderDiag[splitParameter])) {
recursiveObject$subtree$subOrderDiag[splitParameter] <- distance
}
else {
recursiveObject$subtree$subOrderDiag[splitParameter] <- min(recursiveObject$subtree$subOrderDiag[splitParameter], distance, na.rm = TRUE)
}
## Create the recursive2 list object. Note that the relative depth starts at zero (0) for the subtree. Also note
## that the object returns a vector containing the unnormalized minimum maximal w-subtree distance.
recursive2Object <- list(offset = recursiveObject$offset,
depth = 0,
minimumVector = rep(NA, dim(recursiveObject$subtree$subOrder)[2]),
diagnostic = recursiveObject$diagnostic2)
## Initialize the split flags for the sub-recursion. Ignore
## the diagonal element by setting the split flag.
subtree2Flag <- rep(FALSE, dim(recursiveObject$subtree$subOrder)[2])
subtree2Flag[splitParameter] <- TRUE
## Do the sub-recursion.
recursive2Object <- rfsrcParse2Tree(recursive2Object,
nativeArray,
b,
distance=0,
subtreeFlag=subtree2Flag)
recursiveObject$diagnostic2 <- recursiveObject$diagnostic2 + recursive2Object$diagnostic
## Set the element corresponding to the diagonal element temporarily to the penalized value, to avoid being manipulated.
recursive2Object$minimumVector[splitParameter] <- recursive2Object$depth
## Determine which w-subtrees in this v-subtree do not exist and set them to the penalized value.
recursive2Object$minimumVector[which(is.na(recursive2Object$minimumVector))] <- recursive2Object$depth
## Normalize the relative distances by the maximum terminal node depth encountered for this v-subtree.
recursive2Object$minimumVector <- recursive2Object$minimumVector / recursive2Object$depth
## Update the minimum w-subtree distance encountered in the tree.
recursiveObject$subtree$subOrder[splitParameter, ] <- pmin(recursiveObject$subtree$subOrder[splitParameter, ], recursive2Object$minimumVector)
} ## if (sub.order == TRUE) ...
} ## if (subtreeFlag[splitParameter] == FALSE) ...
} ## if (terminalFlag and distance > 0) then adjust the node count by - 1
else if (distance > 0) {
recursiveObject$subtree$nodesAtDepth[distance] <- recursiveObject$subtree$nodesAtDepth[distance] - 1
}
## Update the maximum depth encountered for this tree.
recursiveObject$subtree$depth <- max(recursiveObject$subtree$depth, distance)
## Increment the offset.
recursiveObject$offset <- recursiveObject$offset + 1
## Parse left and then right, if this is not a terminal node.
if (terminalFlag == FALSE) {
## Increment the (parsed) tree distance.
distance <- distance + 1
## Parse left:
recursiveObject <- rfsrcParseTree(recursiveObject, max.order, sub.order, nativeArray, b, distance, subtreeFlag)
## Parse right:
recursiveObject <- rfsrcParseTree(recursiveObject, max.order, sub.order, nativeArray, b, distance, subtreeFlag)
}
else {
## Update the terminal node depth sum.
recursiveObject$subtree$terminalDepthSum <- recursiveObject$subtree$terminalDepthSum + distance
}
return(recursiveObject)
}
## -----------------------------------------------------------------------
##
## SECONDARY RECURSION
##
## -----------------------------------------------------------------------
## Determine the second order maximal subtree distance.
## Recursive Object Definition:
##
## recursiveObject =
## list(offset,
## depth = 0,
## minimumVector = rep(NA, numParm),
## diagnostic)
##
## Inputs:
## distance
## - distance (depth) from the root of the (primary)
## v-subtree. Thus, this always starts at zero (0).
##
##
## Outputs:
## recursiveObject$minimumDistance
## - vector of length [numParm]
## - second order unnormalized minimum maximum w-subtree distances
## relative to the subtree defined by the initial root call.
## recursiveObject$depth
## - maximum terminal node depth, relative to the v-subtree.
##
## Algorithm Notes:
##
## We use the terminology (v,w) for a w-maximal subtree within
## (relative to) a v-maximal subtree.
##
## STEP-1:
##
## For a given v, and a given v-max subtree, we search for all w-max
## subtrees within this v-max subtree, where v != w. We single out
## the w-subtree with minimum depth -- that is, the w-split closest to
## the v-split. Thus, when we talk about depths of w, these are
## relative depths, relative to the v-max subtree. So the depth at
## root of the v-max subtree is defined to be zero (0).
##
## For a given v, and a given v-max subtree, we also determine the
## maximum terminal node depth. Let's call this the penalty depth.
## Again, this is relative to the v-max subtree.
##
## If a particular w-max subtree does not exist for a given v and
## v-max subtree, it is assigned the penalty depth.
##
## Now, there are p-1 minimum w-max subtree depths, given by analyzing
## all w != v. These distances are normalized (divided by) by the
## penalty depth for the v-max subtree, so that all p-1 values are in
## (0,1].
##
##
## STEP-2:
## Consider v as fixed. We do STEP-1 for all v-max subtrees. Say
## there are count(v) of these. We then take the MINIMUM normalized depth across the count(v)
## vectors of length p-1 that give the minimum w-max subtree depths by
## dividing by count(v).
##
## STEP-3:
## Currently, if the v-max subtree does not exist, the row
## corresponding to v in the $subOrder matrix, namely [v, ] will be
## one (1) except for the element corresponding to the diagonal [v,v].
## It will be zero (0).
##
## STEP-4:
##
## All of the above are repeated for each tree in the forest such that
## the [p]x[p] matrix for each tree is normalized (divided by) the the
## number of valid (non-stumped) trees in the forest.
##
## There is the rare possibility that a xvar v will never be
## split upon within the forest. The entire row for this xvar,
## given by [v, ], will be NA.
rfsrcParse2Tree <- function(recursiveObject,
nativeArray,
b,
distance,
subtreeFlag) {
## Diagnostic count of calls.
recursiveObject$diagnostic = recursiveObject$diagnostic + 1
## Weak consistency check to ensure that the iteration matches the treeID in the nativeArray record.
if(b != nativeArray$treeID[recursiveObject$offset]) {
stop("Invalid nativeArray input record (treeID) at ", recursiveObject$offset, ". Please contact Technical Support.")
}
## Read the current nativeArray split parameter.
splitParameter = nativeArray$parmID[recursiveObject$offset]
## Determine whether this is a terminal node.
if (splitParameter == 0) {
terminalFlag = TRUE
}
else if (splitParameter != 0) {
terminalFlag = FALSE
}
if (splitParameter != 0) {
## Check that the parameter has not already been encountered along this path.
if (subtreeFlag[splitParameter] == FALSE) {
## Update the minimum maximal subtree distance encountered for the parameter.
if (is.na(recursiveObject$minimumVector[splitParameter])) {
recursiveObject$minimumVector[splitParameter] = distance
}
else {
recursiveObject$minimumVector[splitParameter] = min(recursiveObject$minimumVector[splitParameter], distance, na.rm = TRUE)
}
## Indicate that the parameter has been split on
subtreeFlag[splitParameter] = TRUE
}
}
## Update the (relative) maximum depth encountered for this tree.
recursiveObject$depth = max(recursiveObject$depth, distance)
## Increment the (parsed) tree distance.
distance = distance + 1
## Increment the offset.
recursiveObject$offset = recursiveObject$offset + 1
## Parse left and then right, if this is not a terminal node.
if (terminalFlag == FALSE) {
## Parse left:
recursiveObject = rfsrcParse2Tree(recursiveObject, nativeArray, b, distance, subtreeFlag)
## Parse right:
recursiveObject = rfsrcParse2Tree(recursiveObject, nativeArray, b, distance, subtreeFlag)
}
return(recursiveObject)
}
## -----------------------------------------------------------------------
##
## Supporting Functions
##
## -----------------------------------------------------------------------
## minimal depth density
minDepthProb <- function(p, D, l) {
if (!is.null(l)) Ld <- 0
prob <- rep(0, D+1)
nullObj <- sapply(0:(D-1), function(d) {##trick to avoid a for-loop
if (is.null(l)) {
Ld <- 2^d-1
ld <- 2^d
}
else{
ld <- l[d+1]
if (d > 0) Ld <- Ld + l[d]
}
prob.1 <- Ld * ifelse(p > 1, log(1 - 1 / p), 0)
prob.2 <- ld * ifelse(p > 1, log(1 - 1 / p), 0)
prob[d+1] <<- exp(prob.1) * (1 - exp(prob.2))
NULL
})
prob[D+1] <- 1 - sum(prob[1:D])
if (prob[D+1] < 0) {
prob[D+1] <- 0
prob <- prob / sum(prob)
}
prob
}
## second order depth density
secondOrderDepthProb <- function(p, D, l) {
md.prob <- rep(0, D+1)
Ld <- 0
if (is.null(l)) {
l <- sapply(0:(D-1), function(d) 2^d)
}
nullObj <- sapply(0:(D-1), function(d) {##trick to avoid a for-loop
ld <- l[d+1]
if (d > 0) Ld <- Ld + l[d]
md.prob.1 <- Ld * ifelse(p > 1, log(1 - 1 / p), 0)
md.prob.2 <- ld * ifelse(p > 1, log(1 - 1 / p), 0)
md.prob[d+1] <<- exp(md.prob.1) * (1 - exp(md.prob.2))
NULL
})
md.prob[D+1] <- 1 - sum(md.prob[1:D])
if (md.prob[D+1] < 0) {
md.prob[D+1] <- 0
md.prob <- md.prob / sum(md.prob)
}
## md.prob is the minimal depth density
## we now use it to define the second order depth density
prob <- rep(0, D+1)
if (D >= 2) {
nullObj <- sapply(1:(D-1), function(d) {##trick to avoid a for-loop
prob.d <- l[1:d] * ifelse(p > 1, log(1 - 1 / p), 0)
prob[d+1] <<- md.prob[d+1] * sum(exp(-prob.d) - 1)
NULL
})
}
prob[D+1] <- 1 - sum(prob[1:D])
if (prob[D+1] < 0) {
prob[D+1] <- 0
prob <- prob/sum(prob)
}
prob
}
## calculates minimal depth density under the null
## calculates first/second moments and robust estimate of s.d
## treat stumps as equivalent to trees with tree depth = 1.
minDepthStat <- function(p, D=NULL, l=NULL) {
if (is.null(D) & is.null(l)) stop("set D or l")
if (!is.null(l)) {
D <- length(l)
}
D.support <- c(0:D)
prob <- minDepthProb(p, D=D, l=l)
prob.robust <- prob
prob.robust[length(prob)] <- 0
prob.robust <- prob.robust / sum(prob.robust, na.rm = TRUE)
first.moment <- sum(D.support * prob, na.rm = TRUE)
second.moment <- sum((D.support^2) * prob, na.rm = TRUE)
sd.robust <- sqrt(sum((D.support^2) * prob.robust, na.rm = TRUE)
- (sum(D.support * prob.robust, na.rm = TRUE))^2)
return(list(prob = prob, first.moment = first.moment, second.moment = second.moment, sd.robust = sd.robust))
}
## calculates second order depth density under the null
## returns its first moment
## treat stumps as equivalent to trees with tree depth = 1.
secondOrderDepthStat <- function(p, D=NULL, l=NULL) {
if (is.null(D) & is.null(l)) stop("set D or l")
if (!is.null(l)) {
D <- length(l)
}
D.support <- c(0:D)
prob <- secondOrderDepthProb(p, D=D, l=l)
sum(D.support * prob, na.rm = TRUE)
}
## parses minimal depth and second order depth
## returns the md density
## returns the md first (*threshold*) and second moment
## returns a robust estimate of standard deviation (gets rid of atom in the right tail)
## returns (new) threshold for second order statistic
## v is the max.subtree object
parseDepthOrder <- function(v, max.order) {
## preliminary checks: can this calculation even be done?
if (is.null(v$mean) & max.order > 0) {
return(NULL)
}
if (is.null(v$order) & max.order == 0) {
return(NULL)
}
## root node is assigned a value of 1 nodes at depth
## although not technically correct, it is relatively benign
nodes.at.depth <- rbind(1, v$nodes.at.depth)
## determine the tree height (height = 1 + depth)
## determine the average tree height
## determine the maximum tree height
treeHeight <- apply(nodes.at.depth, 2, function(l) {sum(!is.na(l))})
avgTreeHeight <- mean(treeHeight, na.rm=TRUE)
maxTreeHeight <- max(treeHeight, na.rm=TRUE)
## determine the number of trees in the forest
ntree <- length(v$terminal)
## mean minimal depth
## break this up into two cases
if (max.order > 0) {
## set the dimension
p <- length(v$mean)
## case I: substitute forest averaged values for parameters
nodes.at.depth.avg <- apply(nodes.at.depth, 1, mean, na.rm = TRUE)
l <- nodes.at.depth.avg[1:max(avgTreeHeight - 1, 1)]
minDepthStatObj <- minDepthStat(p, l = l)
first.moment <- minDepthStatObj$first.moment
second.moment <- minDepthStatObj$second.moment
sd.robust <- minDepthStatObj$sd.robust
prob <- minDepthStatObj$prob
second.order.moment <- secondOrderDepthStat(p, l = l)
}
else {
## case II: tree-averaged (by-tree) analysis
## set the dimension
p <- nrow(v$order)
## extract the tree density in a matrix format
## extract each trees depth
prob <- sapply(1:ntree, function(tree) {
l <- nodes.at.depth[, tree][1:sum(nodes.at.depth[, tree] > 0, na.rm = TRUE)]
c(minDepthStat(p, l = l)$prob, rep(0, maxTreeHeight - treeHeight[tree] * (1 + 1 * (treeHeight[tree] == 1))))
})
## extract the tree minimal threshold value
first.moment <- unlist(sapply(1:ntree, function(tree) {
l <- nodes.at.depth[, tree][1:sum(nodes.at.depth[, tree] > 0, na.rm = TRUE)]
minDepthStat(p, l = l)$first.moment
}))
## extract the tree second moment
second.moment <- unlist(sapply(1:ntree, function(tree) {
l <- nodes.at.depth[, tree][1:sum(nodes.at.depth[, tree] > 0, na.rm = TRUE)]
minDepthStat(p, l = l)$second.moment
}))
## extract the tree robust standard deviation
sd.robust <- unlist(sapply(1:ntree, function(tree) {
l <- nodes.at.depth[, tree][1:sum(nodes.at.depth[, tree] > 0, na.rm = TRUE)]
minDepthStat(p, l = l)$sd.robust
}))
## extract the tree second order threshold value
second.order.moment <- unlist(sapply(1:ntree, function(tree) {
l <- nodes.at.depth[, tree][1:sum(nodes.at.depth[, tree] > 0, na.rm = TRUE)]
secondOrderDepthStat(p, l = l)
}))
}
return(list(prob = prob,
first.moment = first.moment,
sd = sqrt(second.moment - first.moment^2),
sd.robust = sd.robust,
second.order.moment = second.order.moment))
}
max.subtree <- max.subtree.rfsrc
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.