R/surrmindep.R

Defines functions surrmindep

Documented in surrmindep

#'Execute surrogate minimal depth variable importance
#'
#'This function determines the surrogate minimal depth of variables from a forest that is created by getTreeranger, addLayer and getSurrogates functions.
#'
#' @param forest a list containing allvariables and trees. Allvariables is a vector of all variable names in the original data set (strings). Trees is a list of trees that was generated by getTreeranger, addLayer, and getSurrogates functions.
#' @param s.l Number of average surrogate variables in the respective layers. (use count.surrogate function to get it)
#' @return List with the following components:
#' \itemize{
#' \item depth: mean surrogate minimal depth for each variable
#' \item selected: variables has been selected (1) or not (0),
#' \item threshold: the threshold that is used for the selection
#' }
#' @export

surrmindep = function(forest, s.l){

variables = forest[["allvariables"]]
trees = forest[["trees"]]

ntree = length(trees)
var.num = length(variables)

#prepare matrix for mindepth
mindepth = matrix(NA, nrow = ntree, ncol = var.num)
colnames(mindepth) = variables

MAX.DEPTH = 10000
#maximal Depth of trees and the number of nodes in every layer is saved to calculate treshold in a following step
maxdepth = rep(NA,ntree)
nodesAtDepthMatrix <- matrix(NA, nrow = MAX.DEPTH, ncol = ntree)
# get mindepth for every variable in every tree
for (i in 1:ntree) {
nodesAtDepth <- rep(NA, MAX.DEPTH)
tree = trees[[i]]
# get layer information of the variables and save it in minimal depth file
depth.tree = rep(NA,length(variables))
  o = 1
  while (anyNA(depth.tree) && o <= length(tree)) {
    node = unlist(tree[o])
    if (node["status"] == 1) {
    if (is.na(depth.tree[node["splitvariable"]])) {
      depth.tree[node["splitvariable"]] = node["layer"]
    }
      if (length(node) > 7) {
      for (r in 8:(7 + (length(node) - 7)/2)) {
        if (is.na(depth.tree[node[r]])) {
      depth.tree[node[r]] = node["layer"]
      }
      }
      }
    }
  o = o + 1
  }
  #variables with no split in the tree get maxdepth as minimal depth
  depth.tree[which(is.na(depth.tree))] = unlist(tree[length(tree)])["layer"]
  #save min and max depth information for every tree
  mindepth[i,] = depth.tree
  maxdepth[i] = unlist(tree[length(tree)])["layer"]
  #find the number of nodes in every layer

  laystat = cbind(sapply(tree,"[[","layer"),sapply(tree,"[[","status"))
  colnames(laystat) = c("layer","status")
  for (u in 1:(maxdepth[i])) {

    nodesAtDepth[u] = nrow(subset(laystat, laystat[,"layer"] == u & laystat[,"status"] == 1 ))
  }
  nodesAtDepthMatrix[,i] = nodesAtDepth
}

# create mean values for the minimal depth of different variables
mean.depth = colMeans(mindepth)

# determine the mean depth of an uninformative variable similarly as in Ishwaran et. al. Journal of the American Statistical
# Accociation 2010

treeHeight = maxdepth
avgTreeHeight = mean(treeHeight, na.rm = TRUE)
maxTreeHeight = max(treeHeight, na.rm = TRUE)

# threshold has to be calculated different when trees are small
if ((avgTreeHeight) < 2) {
  s.l.root = s.l[1]
  p.root = (s.l.root + 1)/var.num
  p.1 = 1 - p.root
  threshold = p.root*0 + p.1*1
  warning("Trees are very small! Threshold is defined based on trees with only root nodes.")
}



if ((avgTreeHeight) >= 2) {
nodes.at.depth.avg = apply(nodesAtDepthMatrix, 1, mean, na.rm = TRUE)
s.l.noroot = s.l[-1]
s.l.root = s.l[1]
p.root = (s.l.root + 1)/var.num
var.at.depth = nodes.at.depth.avg[1:(avgTreeHeight - 1)]*(s.l.noroot[1:(avgTreeHeight - 1)] + 1)  # 1 time for the original tree and s-times for
                                                                                                              # the surrogates
                                                                                                              #-1 since the last layer will be added later
p.depth = var.at.depth/var.num
p.all = c(p.root,p.depth)
prob.sum = 0
probs.used = NULL
for (u in 1:length(p.all)) {
  p.u = p.all[u]
  prob.sum = prob.sum + p.u
  if (prob.sum >= 1) break
  probs.used[u] = p.u
}
prob.last = 1 - sum(probs.used)   # the last layer (that was excluded before) is now used as minimal depth when the variable doesnt appear before
probs.used = c(probs.used,prob.last)
layers = c(0:(length(probs.used) - 1))
threshold = sum(layers*probs.used)
}


# Decide if variables are important or unimportant
Importances = rep(NA,var.num)
for (p in 1:var.num) {
if (mean.depth[p] < threshold) {
 Importances[p] = 1}
  else {
    Importances[p] = 0}
}
names(Importances) = variables
results = list(depth = mean.depth,selected = Importances,threshold = threshold)
return(results)
}
StephanSeifert/SurrogateMinimalDepth documentation built on Aug. 7, 2023, 1:59 a.m.