R/readForest.R

Defines functions readForest readTree getParent passData aggregateNodeFeature sdNode

Documented in readForest

readForest <- function(rfobj, x, y=NULL,
                       return.node.feature=TRUE,
                       wt.pred.accuracy=FALSE,
                       obs.weights=NULL,
                       n.core=1){

  if (is.null(rfobj$forest))
    stop('No Forest component in the randomForest object')
  if (wt.pred.accuracy & is.null(y))
    stop('y required to evaluate prediction accuracy')

  ntree <- rfobj$ntree
  p <- ncol(x)
  n <- nrow(x)
  out <- list()
  print("readForest")
  # read leaf node data from each tree in the forest
  rd.forest <- mclapply(1:ntree, readTree, rfobj=rfobj, x=x, y=y,
                        return.node.feature=return.node.feature,
                        wt.pred.accuracy=wt.pred.accuracy,
                        obs.weights=obs.weights,
                        mc.cores=n.core)
  print("bind Forest info")

  out$tree.info <- rbindlist(lapply(rd.forest, function(tt) tt$tree.info))
  # aggregate sparse feature matrix across forest
  nf <- lapply(rd.forest, function(tt) tt$node.feature)
  nf <- aggregateNodeFeature(nf)
  out$node.feature <- sparseMatrix(i=nf[,1], j=nf[,2], dims=c(max(nf[,1]), p))
  return(out)

}

readTree <- function(rfobj, k, x, y, return.node.feature, wt.pred.accuracy,
                     obs.weights) {
  n <- nrow(x)
  p <- ncol(x)
  ntree <- rfobj$ntree

  # Read tree level data from RF
  out <- list()
  out$tree.info <- as.data.frame(getTree(rfobj, k))
  out$tree.info$node.idx <- 1:nrow(out$tree.info)
  parents <- getParent(out$tree.info)
  out$tree.info$parent <- as.integer(parents)
  out$tree.info$tree <- as.integer(k)
  out$tree.info$size.node <- 0L

  # replicate each leaf node in node.feature based on specified
  # sampling.
  select.node <- out$tree.info$status == -1
  rep.node <- rep(0, nrow(out$tree.info))
  out$tree.info <- select(out$tree.info, prediction, node.idx, parent, tree, size.node)

  if (is.null(rfobj$obs.nodes)) {
    # if nodes not tracked, pass data through forest to get leaf
    # counts
    # TODO: node weighted sampling here
    fit.data <- passData(rfobj, x, out$tree.info, k)
    leaf.counts <- rowSums(fit.data[select.node,])
    which.leaf <- apply(fit.data[select.node,], MAR=2, which)
    leaf.idx <- as.integer(which(select.node))
    if (wt.pred.accuracy) leaf.sd <- c(by(y, which.leaf, sdNode))
  } else {
    if (is.null(obs.weights)) {
      leaf.counts <- table(rfobj$obs.nodes[,k])
      leaf.idx <- as.integer(names(leaf.counts))
      if (wt.pred.accuracy) leaf.sd <- c(by(y, rfobj$obs.nodes[,k], sdNode))
    } else {
      leaf.counts <- c(by(obs.weights, rfobj$obs.nodes[,k], sum))
      leaf.idx <- as.integer(names(leaf.counts))
      if (wt.pred.accuracy) leaf.sd <- c(by(y, rfobj$obs.nodes[,k], sdNode))
    }

  }

  out$tree.info$size.node[leaf.idx] <- leaf.counts
  if (wt.pred.accuracy) {
    out$tree.info$dec.purity <- 0
    out$tree.info$dec.purity[leaf.idx] <- pmax((sd(y) - leaf.sd) / sd(y), 0)
  }

  out$tree.info <- out$tree.info[select.node,]
  rep.node[select.node] <- 1

  # Extract decision paths from leaf nodes as binary sparse
  # matrix
  if (return.node.feature) {
    row.offset <- 0
    var.nodes <- as.integer(rfobj$forest$bestvar[,k])
    total.rows <- n #sum(rep.node[select.node])
    sparse.idcs <- nodeVars(var.nodes,
                            as.integer(length(select.node)),
                            as.integer(p),
                            as.integer(parents),
                            as.integer(select.node),
                            as.integer(rep.node),
                            as.integer(row.offset),
                            matrix(0L, nrow=(total.rows * p), ncol=2))
    out$node.feature <- sparse.idcs[!sparse.idcs[,1] == 0,]
  }

  return(out)
}

getParent <- function(tree.info) {
  # Generate a vector of parent node indices from output of getTree
  parent <- match(1:nrow(tree.info), c(tree.info[,'left daughter'],
                                       tree.info[,'right daughter']))
  parent <- parent %% nrow(tree.info)
  parent[1] <- 0
  return(parent)
}


passData <- function(rfobj, x, tt, k) {
  # Pass data through rf object
  leaf.id <- tt$status == -1
  n <- nrow(x)
  n.node <- rfobj$forest$ndbigtree[k]
  node.composition <- matrix(FALSE, nrow=n.node, ncol=n)
  node.composition[1,] <- TRUE


  for (i in which(!leaf.id)){
    # determine children and split point for current node
    d.left <- tt$"left daughter"[i]
    d.right <- tt$"right daughter"[i]
    split.var <- tt$"split var"[i]
    split.pt <- tt$"split point"[i]

    parent.id <- node.composition[i,]
    d.left.id <- (x[,split.var] <= split.pt) & parent.id
    d.right.id <- (x[,split.var] > split.pt) & parent.id

    node.composition[d.left,] <- d.left.id
    node.composition[d.right,] <- d.right.id
  }

  return(node.composition)
}

aggregateNodeFeature <- function(nf) {
  # aggregate list of node feature data returned from each tree

  ntree <- length(nf)
  row.offset <- c(0, cumsum(sapply(nf, function(z) max(z[,1])))[-ntree])
  n.rows <- sapply(nf, nrow)
  nf <- do.call(rbind, nf)
  nf[,1] <- nf[,1] + rep(row.offset, times=n.rows)
  return(nf)
}

sdNode <- function(x) {
  sd.node <- ifelse(length(x) == 1, 0, sd(x))
  return(sd.node)
}
Jromero1208/PBDRiRF documentation built on May 4, 2019, 10:56 a.m.