R/get.tree.rfsrc.R
In RFpredInterval: Prediction Intervals with Random Forests and Boosted Forests

get.tree.rfsrc <- function(object,
                           tree.id,
                           target,
                           m.target = NULL,
                           time,
                           surv.type = c("mort", "rel.freq", "surv", "years.lost", "cif", "chf"),
                           class.type = c("prob", "bayes"),
                           oob = TRUE,
                           show.plots = TRUE)
{
  ##----------------------------------------------------------------
  ##
  ## The following two utilities were copied from the BMS CRAN
  ## package.  Thanks to Martin Feldkircher and Stefan Zeugne for
  ## these little quickies.
  ##
  ##----------------------------------------------------------------
  hex2bin <- function (hexcode) 
  {
      if (!is.character(hexcode)) 
          stop("please input a character like '0af34c'")
      hexcode <- paste("0", tolower(hexcode), sep = "")
      hexobj <- .hexcode.binvec.convert(length(hexcode) * 16L)
      return(hexobj$as.binvec(hexcode))
  }
  .hexcode.binvec.convert <- function (length.of.binvec) 
  {
      if (length(length.of.binvec) > 1) 
          length.of.binvec = length(length.of.binvec)
      addpositions = 4 - length.of.binvec%%4
      positionsby4 = (length.of.binvec + addpositions)/4
      hexvec = c(0:9, "a", "b", "c", "d", "e", "f")
      hexcodelist = list(`0` = numeric(4),
                         `1` = c(0, 0, 0, 1), 
                         `2` = c(0, 0, 1, 0),
                         `3` = c(0, 0, 1, 1),
                         `4` = c(0, 1, 0, 0),
                         `5` = c(0, 1, 0, 1),
                         `6` = c(0, 1, 1, 0), 
                         `7` = c(0, 1, 1, 1),
                         `8` = c(1, 0, 0, 0),
                         `9` = c(1, 0, 0, 1),
                         a   = c(1, 0, 1, 0),
                         b   = c(1, 0, 1, 1),
                         c   = c(1, 1, 0, 0),
                         d   = c(1, 1, 0, 1),
                         e   = c(1, 1, 1, 0),
                         f   = c(1, 1, 1, 1))
      return(list(
          as.hexcode = function(binvec) {
              incl = c(numeric(addpositions), binvec)
              dim(incl) = c(4, positionsby4)
              return(paste(hexvec[crossprod(incl, 2L^(3:0)) + 1], collapse = ""))
          },
          as.binvec = function(hexcode) {
              return(unlist(
                  hexcodelist[unlist(strsplit(hexcode, "", fixed = TRUE),
                                     recursive = FALSE, use.names = FALSE)], 
                  recursive = FALSE, use.names = FALSE)[-(1:addpositions)])
          }))
  }
  ##----------------------------------------------------------------
  ##
  ## coherence checks
  ##
  ##----------------------------------------------------------------
  if (sum(inherits(object, c("rfsrc", "grow"), TRUE) == c(1, 2)) != 2) {
    stop("This function only works for objects of class `(rfsrc, grow)'")
  }
  if (is.forest.missing(object)) {
    stop("forest is missing.  Re-run rfsrc (grow call) with forest=TRUE")
  }
  if (inherits(object, "anonymous")) {
    anonymous <- TRUE
  }
  else {
    anonymous <- FALSE
  }
  ##----------------------------------------------------------------
  ##
  ## coherencey checks and flags
  ##
  ##----------------------------------------------------------------
  ## process the object depending on the underlying family
  family <- object$forest$family
  ## coherence for prediction
  predict.flag <- TRUE
  if (family == "unsupv" || family == "surv-TDC") {
    predict.flag <- FALSE
  }
  ## ensure the coherency of multivariate target.
  m.target <- get.univariate.target(object, m.target)
  ##----------------------------------------------------------------
  ##
  ## extract x data
  ## convert the data to numeric mode, apply the na.action protocol
  ## missing data not allowed
  ##
  ##----------------------------------------------------------------
  xvar.names <- object$forest$xvar.names
  xvar.factor <- object$forest$xvar.factor
  if (!anonymous) {
    x.data <- object$forest$xvar
    if (any(is.na(x.data))) {
      stop("missing data not allowed")
    }
    x.data <- finalizeData(xvar.names, x.data, miss.flag = FALSE)
  }
  n <- object$n
  subset <- 1:n
  ##----------------------------------------------------------------
  ##
  ## prediction details for filling out the terminal nodes
  ## family specific
  ##
  ##----------------------------------------------------------------
  if (predict.flag) {
    ## survival and competing risk families.
    if (grepl("surv", family)) {
      ## extract event information
      event.info <- object$event.info
      cens <- event.info$cens
      event.type <- event.info$event.type
      ## assign time if missing
      if (missing(time)) {
        time <- median(event.info$time.interest, na.rm = TRUE)
      }
      ## check for single time point
      if (length(time) > 1) {
        stop("time must be a single value:  ", time)
      }
      ## competing risk
      if (family == "surv-CR") {
        if (missing(target)) {
          target <- 1
        }
        else {
          if (target < 1 || target > max(event.type, na.rm = TRUE)) {
            stop("'target' is specified incorrectly")
          }
        }
        ## set the surv.type
        surv.type <- setdiff(surv.type, c("mort", "rel.freq", "surv"))[1]
        pred.type <- match.arg(surv.type, c("years.lost", "cif", "chf"))
      }
      ## survival
      else {
        target <- 1
        ## set the surv.type
        surv.type <- setdiff(surv.type, c("years.lost", "cif", "chf"))[1]
        pred.type <- match.arg(surv.type, c("rel.freq", "mort", "chf", "surv"))
      }
    }
    ## univariate and multivariate families.
    else {
      ## assign a null time value
      event.info <- time <- NULL
      ## factor outcome
      if(is.factor(coerce.multivariate(object, m.target)$yvar)) {
        object.yvar.levels <- levels(coerce.multivariate(object, m.target)$yvar)
        pred.type <- match.arg(class.type, c("prob", "bayes"))
        if (missing(target)) {
          target <- object.yvar.levels[1]
        }
        if (is.character(target)) {
          target <- match(match.arg(target, object.yvar.levels), object.yvar.levels)
        }
        else {
          if ((target > length(object.yvar.levels)) | (target < 1)) {
            stop("target is specified incorrectly:", target)
          }
        }
      }
      ## regression or unsupervised
      else {
        pred.type <- "y"
        target <- NULL
      }
    }
  }
  ##----------------------------------------------------------------
  ##
  ## get tree data and select tree
  ##
  ##----------------------------------------------------------------
  ## only one tree allowed
  tree.id <- tree.id[1]
  ## must be integer from 1...ntree
  if (tree.id < 1 || tree.id > object$ntree) {
    stop("tree id must be integer from 1 to ntree")
  }
  ## pull the arrays
  native.array <- object$forest$nativeArray
  native.f.array <- object$forest$nativeFactorArray[[1]]
  ## added processing needed for factors
  f.ctr <- 0
  factor.flag <- FALSE
  if (!is.null(native.f.array)) {
    pt.f <- which(native.array$mwcpSZ != 0)
    factPT <- lapply(pt.f, function(j) {
      f.ctr <<- f.ctr + 1
      step <- native.array$mwcpSZ[j] - 1
      mwcpPT <- native.f.array[f.ctr:(f.ctr+step)]
      mwcpPT <- paste0(sapply(mwcpPT, function(mwc) {
        format(as.hexmode(mwcpPT), 8)
      }))      
      mwcpSZ <- hex2bin(mwcpPT)
      paste(mwcpSZ, collapse = "")
    })
    native.array$contPT[pt.f] <- factPT
    factor.flag <- TRUE
  }
  ## define the display tree
  display.tree <- native.array[native.array$treeID == tree.id,, drop = FALSE]
  ## check to see if any factors are left
  ## store relevant informatio for later split-inequality encodings
  if (factor.flag) {
    pt.f <- display.tree$mwcpSZ !=0
    if (sum(pt.f) > 0) {
      f.names <- unique(xvar.names[display.tree$parmID[pt.f]])
    }
    else {
      factor.flag <- FALSE
    }
  }
  ##----------------------------------------------------------------
  ##
  ## for labeling of trees --> acquire the predicted values 
  ##
  ##----------------------------------------------------------------
  ## ensure coherency of multivariate target.
  m.target <- get.univariate.target(object, m.target)
  ## pull yhat (for all but unsupervised families)
  if (predict.flag) {
    yhat <- extract.pred(predict.rfsrc(object, m.target = m.target),
               pred.type,
               subset,
               time,
               m.target,
               target,
               oob = oob)
  }
  ##----------------------------------------------------------------
  ##
  ## prepare the tree to be converted into a network
  ##
  ##----------------------------------------------------------------
  ## conversion
  converted.tree <- display.tree
  vars.id <- data.frame(var = c("<leaf>", xvar.names), parmID = 0:length(xvar.names), stringsAsFactors = FALSE)
  converted.tree$var <- vars.id$var[match(display.tree$parmID, vars.id$parmID)]
  ## special symbol to be used for encoding the counter for variables (see note below)
  special <- "999_999"
  # note: we append a counter to the variables, because the data.tree package has trouble when
  # nodes are not unique.
  var.count <- 1:nrow(converted.tree)
  lapply(unique(converted.tree$var), function(vv) {
    pt <- converted.tree$var == vv
    var.count[which(pt)] <<- 1:sum(pt)
  })
  converted.tree$var_count <- var.count
  converted.tree$var_conc <- paste0(converted.tree$var, special, converted.tree$var_count)
  ##----------------------------------------------------------------
  ##
  ## convert the tree to a network data frame, using the fact that the
  ## nativeArray output is a pre-order traversal 
  ##
  ##----------------------------------------------------------------
  ## preliminary
  from_node <- ""
  network <- data.frame()
  num.children <- data.frame(converted.tree, children = 0)
  num.children <- num.children[num.children$var != "<leaf>",, drop = FALSE]
  num.children <- num.children[!duplicated(num.children$var_conc),, drop = FALSE]
  num_children <- as.list(rep(0, nrow(num.children)))
  names(num_children) <- num.children$var_conc
  ## loop (using lapply)
  lapply(1:nrow(converted.tree), function(i) {
    rowi <- converted.tree[i, ]
    if(i == 1){
      from_node <<- rowi$var_conc
    }
    else{
      ## develop the split encoding
      if(num_children[[from_node]] == 0) {#left split
        side <- "<="
        contPT.pretty <- round(as.numeric(converted.tree$contPT[converted.tree$var_conc == from_node]), 3)
        split_ineq_pretty <- paste0(side, contPT.pretty)
      }
      else {#right split
        side <- ">"
        split_ineq_pretty <- ""
      }
      ## both numeric and factors are encoded as <= > but factors are secretely in hex notation
      split_ineq <- paste0(side, converted.tree$contPT[converted.tree$var_conc == from_node])
      ## update the network
      to_node <- rowi$var_conc
      new_node <- list(from = from_node, to = to_node, split = split_ineq, split.pretty = split_ineq_pretty)
      network <<- data.frame(rbind(network, new_node, stringsAsFactors = FALSE))
      num_children[[from_node]] <<- num_children[[from_node]] + 1
      if(rowi$var != "<leaf>")
        from_node <<- to_node
      else{
        if(i != nrow(converted.tree)){
          while(num_children[[from_node]] == 2){
            from_node <<- network$from[network$to == from_node]
          }
        }
      }
    }
  })
  ##----------------------------------------------------------------
  ##
  ## process network for factors - clean up the split encoding
  ## encode as complementary pair string
  ##
  ##----------------------------------------------------------------
  if (factor.flag) {
    ## identify which splits need to be cleaned up
    from.names <- network$from
    pt.f <- sort(unique(unlist(lapply(f.names, function(ptrn) {
      grep(ptrn, from.names)
    }))))
    ## identify levels of the factor
    ## otherwise we would have huge sets full of levels that aren't used
    fs <- gsub(paste0(special,".*"),"",from.names[pt.f])
    fs.levels <- sapply(fs, function(fsn) {
      #length(levels(x.org.data[, fsn]))
      xvar.factor$nlevels[which(fsn == xvar.names)]
    })
    ## clean the splits up and encode as complementary pair sets
    split.str <- lapply(1:length(pt.f), function(j) {
      str <- network$split[pt.f[j]]
      ## left split
      if (grepl("<=", str)) {
        str <- sub("<=", "", str)
        str <- strsplit(str, "")[[1]]
        cpr <- 1 + length(str) - which(str != "0")
        cpr <- cpr[cpr <= fs.levels[j]]
        paste0("{", paste(cpr, collapse = ","), "}")
      }
      ## right split
      else {
        str <- sub(">", "", str)
        str <- strsplit(str, "")[[1]]
        cpr <- 1 + length(str) - which(str == "0")
        cpr <- cpr[cpr <= fs.levels[j]]
        paste0("{", paste(cpr, collapse = ","), "}")
      }
    })
    ## pretty complementary pairs
    split.str.pretty <- lapply(1:length(pt.f), function(j) {
      str <- network$split[pt.f[j]]
      ## left split
      if (grepl("<=", str)) {
        str <- sub("<=", "", str)
        str <- strsplit(str, "")[[1]]
        cpr <- 1 + length(str) - which(str != "0")
        cpr <- cpr[cpr <= fs.levels[j]]
        paste0("{", paste(cpr, collapse = ","), "}")
      }
      else {
        ""
      }
    })
    ## replace the previous fake encoding with the now correct "set encoding"
    network$split[pt.f] <- split.str
    network$split.pretty[pt.f] <- split.str.pretty
  }
  ##----------------------------------------------------------------
  ##
  ## create a tree object, see the data.tree package
  ##
  ##----------------------------------------------------------------
  data.tree.network <- data.tree::FromDataFrameNetwork(network, "split")
  ## INTERNAL NODES
  ## label the edges with the splits , and relabel the nodes so that the appended counters are not visible
  data.tree.network$Get(function(node) {
    data.tree::SetEdgeStyle(node, color = "black", label = node$split.pretty, fontcolor = "black")
    data.tree::SetNodeStyle(node, color = "black", fontcolor = "black", penwidth = 3,
                            label = strsplit(node$name, special)[[1]][1])
  })
  ## TERMINAL NODES
  ## loop through the leaves of the tree, each time getting the path down to the leaf, and using this path
  ## to establish a filtering condition to obtain all cases that match the splitting on the way to this leaf.
  ## we filter out cases to correspond to the leaf, and then modify the leaf display to include the number
  ## of cases, followed by the user requested predicted value
  data.tree::Do(data.tree.network$leaves, function(node){
    path_list <- node$path
    var_list <- sapply(path_list, function(x){strsplit(x, special)[[1]][1]})
    var_list[length(var_list)] <- ""
    node_iter <- data.tree.network
    ## make boolean string operatore
    call <- lapply(2:(length(path_list)), function(i) {
      node_iter <<- node_iter[[path_list[[i]]]]
      str <- node_iter$split
      ## numeric boolean operator
      if (!any(grepl("\\{", str))) {
        str <- paste0("x.data$", paste0(var_list[i-1], str))
      }
      ## complementary pair boolean operator
      else {
        str <- gsub("\\{", "", str)
        str <- gsub("\\}", "", str)
        str <- strsplit(str, ",")[[1]]
        str <- paste("==", str, sep = "")
        str <- paste0("(",paste(paste0("x.data$", var_list[i-1], str), collapse = "|"),")")
      }
      str
    })
    call <- paste(call, collapse = " & ")
    ## evaluate the boolean operator
    ## this yields the id's for the cases in the node
    if (!anonymous) {
      pt <- which(eval(parse(text=call)))
      n.cases <- length(pt)
    }
    ## set the edgelabel
    edge.label <- node$split.pretty
    ## set the node label
    ## extract the predicted value in the node
    if (predict.flag && !anonymous) {
      if (!is.factor(yhat)) {
        yhat <- round(mean(yhat[pt], na.rm = TRUE), 2)
        node.label <- paste0("n=", n.cases, "\n", yhat)
      }
      else {
        frqtable <- tapply(yhat[pt], yhat[pt], length)
        pred <- names(frqtable)[which.max(frqtable)]
        node.label <- paste0("n=", n.cases, "\n", pred)
      }
    }
    ## unsupervised family --> no predicted value
    else if (!predict.flag && !anonymous) {
      node.label <- paste0("n=", n.cases)
    }
    ## anonymous
    else {
      node.label <- NULL
    }
    ## set styles
    data.tree::SetGraphStyle(node, rankdir = "TB")
    data.tree::SetEdgeStyle(node, arrowhead = "vee", color = "grey35",
                            penwidth = 3, label = edge.label)
    data.tree::SetNodeStyle(node, style = "filled,rounded", shape = "box",
                 fillcolor = "GreenYellow", penwidth = 3, 
                 fontname = "helvetica", tooltip = data.tree::GetDefaultTooltip,
                 label = node.label)
  })
  invisible(data.tree.network)
}
get.tree <- get.tree.rfsrc