#'Execute surrogate minimal depth variable importance
#'
#'This function determines the surrogate minimal depth of variables from a forest that is created by getTreeranger, addLayer and getSurrogates functions.
#'
#' @param forest a list containing allvariables and trees. Allvariables is a vector of all variable names in the original data set (strings). Trees is a list of trees that was generated by getTreeranger, addLayer, and getSurrogates functions.
#' @param s.l Number of average surrogate variables in the respective layers. (use count.surrogate function to get it)
#' @return List with the following components:
#' \itemize{
#' \item depth: mean surrogate minimal depth for each variable
#' \item selected: variables has been selected (1) or not (0),
#' \item threshold: the threshold that is used for the selection
#' }
#' @export
surrmindep = function(forest, s.l){
variables = forest[["allvariables"]]
trees = forest[["trees"]]
ntree = length(trees)
var.num = length(variables)
#prepare matrix for mindepth
mindepth = matrix(NA, nrow = ntree, ncol = var.num)
colnames(mindepth) = variables
MAX.DEPTH = 10000
#maximal Depth of trees and the number of nodes in every layer is saved to calculate treshold in a following step
maxdepth = rep(NA,ntree)
nodesAtDepthMatrix <- matrix(NA, nrow = MAX.DEPTH, ncol = ntree)
# get mindepth for every variable in every tree
for (i in 1:ntree) {
nodesAtDepth <- rep(NA, MAX.DEPTH)
tree = trees[[i]]
# get layer information of the variables and save it in minimal depth file
depth.tree = rep(NA,length(variables))
o = 1
while (anyNA(depth.tree) && o <= length(tree)) {
node = unlist(tree[o])
if (node["status"] == 1) {
if (is.na(depth.tree[node["splitvariable"]])) {
depth.tree[node["splitvariable"]] = node["layer"]
}
if (length(node) > 7) {
for (r in 8:(7 + (length(node) - 7)/2)) {
if (is.na(depth.tree[node[r]])) {
depth.tree[node[r]] = node["layer"]
}
}
}
}
o = o + 1
}
#variables with no split in the tree get maxdepth as minimal depth
depth.tree[which(is.na(depth.tree))] = unlist(tree[length(tree)])["layer"]
#save min and max depth information for every tree
mindepth[i,] = depth.tree
maxdepth[i] = unlist(tree[length(tree)])["layer"]
#find the number of nodes in every layer
laystat = cbind(sapply(tree,"[[","layer"),sapply(tree,"[[","status"))
colnames(laystat) = c("layer","status")
for (u in 1:(maxdepth[i])) {
nodesAtDepth[u] = nrow(subset(laystat, laystat[,"layer"] == u & laystat[,"status"] == 1 ))
}
nodesAtDepthMatrix[,i] = nodesAtDepth
}
# create mean values for the minimal depth of different variables
mean.depth = colMeans(mindepth)
# determine the mean depth of an uninformative variable similarly as in Ishwaran et. al. Journal of the American Statistical
# Accociation 2010
treeHeight = maxdepth
avgTreeHeight = mean(treeHeight, na.rm = TRUE)
maxTreeHeight = max(treeHeight, na.rm = TRUE)
# threshold has to be calculated different when trees are small
if ((avgTreeHeight) < 2) {
s.l.root = s.l[1]
p.root = (s.l.root + 1)/var.num
p.1 = 1 - p.root
threshold = p.root*0 + p.1*1
warning("Trees are very small! Threshold is defined based on trees with only root nodes.")
}
if ((avgTreeHeight) >= 2) {
nodes.at.depth.avg = apply(nodesAtDepthMatrix, 1, mean, na.rm = TRUE)
s.l.noroot = s.l[-1]
s.l.root = s.l[1]
p.root = (s.l.root + 1)/var.num
var.at.depth = nodes.at.depth.avg[1:(avgTreeHeight - 1)]*(s.l.noroot[1:(avgTreeHeight - 1)] + 1) # 1 time for the original tree and s-times for
# the surrogates
#-1 since the last layer will be added later
p.depth = var.at.depth/var.num
p.all = c(p.root,p.depth)
prob.sum = 0
probs.used = NULL
for (u in 1:length(p.all)) {
p.u = p.all[u]
prob.sum = prob.sum + p.u
if (prob.sum >= 1) break
probs.used[u] = p.u
}
prob.last = 1 - sum(probs.used) # the last layer (that was excluded before) is now used as minimal depth when the variable doesnt appear before
probs.used = c(probs.used,prob.last)
layers = c(0:(length(probs.used) - 1))
threshold = sum(layers*probs.used)
}
# Decide if variables are important or unimportant
Importances = rep(NA,var.num)
for (p in 1:var.num) {
if (mean.depth[p] < threshold) {
Importances[p] = 1}
else {
Importances[p] = 0}
}
names(Importances) = variables
results = list(depth = mean.depth,selected = Importances,threshold = threshold)
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.