preorder2 <- structure(function# recursive traversal of a tree, assigning the data row names to each node and leaf
### Recursion stops at a leaf, after which the function propagates back up the tree
(
treeRow, ##<< current row of the tree data frame to be processed
bag, ##<< the data reaching the current node
tree, ##<< tree (from randomForest::getTree) to be traversed
verbose=0 ##<< level of verbosity
){
if (tree[treeRow, "status"] == 1){ # interior node: recurse into both daughters
left_daughter = tree[treeRow, 'left daughter']
right_daughter = tree[treeRow, 'right daughter']
daughters = splitBag(treeRow, bag, tree) # returns row names!
tree = preorder2(left_daughter, bag[daughters$left_daughter,], tree, verbose=verbose)
tree = preorder2(right_daughter, bag[daughters$right_daughter,], tree, verbose=verbose)
}
rowNamesOrig = rownames(bag)
if (nrow(bag)>0) {
tree$node[treeRow] <- list(rowNamesOrig)
}
if (verbose) cat(treeRow,":", rowNamesOrig,"\n")
return(tree)
### tree with row names stored in list column 'node'
}, ex = function(){
data("titanic_train", package = "rfVarImpOOB", envir = environment())
set.seed(123)
ranRows=sample(nrow(titanic_train), 300)
RF = randomForest::randomForest(formula = Survived ~ Sex + Pclass + PassengerId,
data=titanic_train[ranRows,],
ntree=5,importance=TRUE,
mtry=3,keep.inbag=TRUE,
nodesize = 1)
k=2
tree = randomForest::getTree(RF, k, labelVar = TRUE)
tree$node=NA
inbag = rep(rownames(RF$inbag), times=RF$inbag[,k])
trainBag = titanic_train[ranRows,][inbag,]
tree=preorder2(1,trainBag,tree)
})
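## After the example above, column 'node' of tree holds, for each node, the row
## names of the inbag observations reaching it; a quick sanity check (a hedged
## sketch, assuming the objects created in the example still exist):
if (0){
length(unlist(tree$node[1])) # the root holds the entire inbag
sum(tree$status == -1) # number of leaves filled in by the recursion
}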
splitBag <- function# splits the data from the parent node into left and right children
### The function correctly splits on factor levels by decoding randomForest's binary encoding of categorical split points
(
treeRow, ##<< current row of the tree data frame to be processed
bag, ##<< the data reaching the current node
tree ##<< tree (from randomForest::getTree)
){
split_var = as.character(tree[treeRow,'split var'])
if (is.factor(bag[,split_var])){
levels = levels(bag[,split_var])
n = length(levels)
if (n<2) n = 0
binCode = rev(binaryLogic::as.binary(tree[treeRow,'split point'], n=n))
split_point = try(as.character(levels[binCode])) #rF encodes male as 10, female as 01
if (inherits(split_point, "try-error")) browser()
mask = bag[,split_var] %in% split_point
left_daughter = rownames(bag)[mask]
right_daughter = rownames(bag)[!mask]
} else{
split_point = tree[treeRow,'split point']
mask = bag[,split_var] <= split_point
left_daughter = rownames(bag)[mask]
right_daughter = rownames(bag)[!mask]
#if (verbose >0) {diagnostics(in_node_df, left_daughter, right_daughter, split_var)}
}
return(list(left_daughter = left_daughter, right_daughter = right_daughter))
### list with elements left_daughter, right_daughter
}
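## Hedged sketch of the categorical decoding used above: randomForest stores a
## factor split point as an integer whose binary expansion flags the levels sent
## to the left daughter (hypothetical two-level factor for illustration):
if (0){
levs = c("female", "male") # levels of a hypothetical split variable
binCode = rev(binaryLogic::as.binary(1, n = 2)) # split point 1 -> bits 01, reversed to 10
levs[binCode] # "female" goes to the left daughter
}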
InfGain <- function# computes the information gain for each parent node in a tree
### information gain for each parent node in a tree
(
tree, ##<< tree (from randomForest::getTree)
total=TRUE, ##<< if TRUE compute the sum instead of the mean
zeroLeaf = TRUE, ##<< if TRUE discard the information gain due to splits resulting in n=1
score=c("PMDI21","MDI","MDA","MIA")[1], ##<< scoring method:MDI=mean decrease impurity (Gini),MDA=mean decrease accuracy (permutation),MIA=mean increase accuracy
verbose=0 ##<< level of verbosity
){
IG_result = MIA_result = rep(NA, nrow(tree))
tree$n_node = 0
for (i in 1:nrow(tree)){
en_node = tree[i, 'gini_index'] # Gini impurity of this node
n_node = tree[i, 'node'] %>% unlist %>% na.omit %>% length # number of elements in the bag at this node
tree[i, 'n_node'] = n_node
ld = tree[i, 'left daughter'] # row number of the left daughter
rd = tree[i, 'right daughter'] # right daughter
if (ld==0|rd==0){ # information gain for terminal node = NA
IG=NA
IG_result[i] = IG
next
}
en_ld = tree[ld, 'gini_index'] # Gini impurity of the left daughter
en_rd = tree[rd, 'gini_index'] # right daughter
nld = tree[ld, 'node'] %>% unlist %>% na.omit %>% length # number of elements in the left daughter
nrd = tree[rd, 'node'] %>% unlist %>% na.omit %>% length # right
dfCorrection = 0 # idea for later
IG = en_node - sum(en_ld*(nld-dfCorrection), en_rd*(nrd-dfCorrection))/(n_node-2*dfCorrection) # information gain formula
if (total) IG = IG*(n_node-2*dfCorrection)
# discount the information gain entirely if one of the children has only one element
if (zeroLeaf) IG = IG*ifelse(pmin(nld,nrd)<2, 0, 1)
IG_result[i] = IG
if (score =="MIA"){
MIA = sum(tree$Accuracy[ld]*nld, tree$Accuracy[rd]*nrd)/(n_node) -tree$Accuracy[i]
MIA=MIA*n_node
MIA_result[i] = MIA
}
}
tree$IG_gini = round(IG_result, 3)
if (score =="MIA") {
tree$IG_acc = round(MIA_result,3)
tree[is.na(tree[,"IG_acc"]),"IG_acc"] = 0
}
tree[is.na(tree[,'IG_gini']),'IG_gini'] = 0
return(tree)
### tree object augmented with information gain at each node
}
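## Worked numeric sketch of the gain formula above (hypothetical values):
## parent Gini 0.48 over n_node = 10; daughters Gini 0.32 (n=5) and 0.18 (n=5)
if (0){
en_node = 0.48; en_ld = 0.32; en_rd = 0.18; nld = nrd = 5
IG = en_node - (en_ld*nld + en_rd*nrd)/(nld + nrd) # 0.48 - 2.5/10 = 0.23
IG * (nld + nrd) # 2.3, the scaling applied when total=TRUE
}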
GiniImportanceForest <- structure(function# computes inbag and OOB Gini importance averaged over all trees in a forest
### workhorse function of this package
(
RF, ##<< object returned by call to randomForest()
data, ##<< data which was used to train the RF. NOTE: assumes setting of inbag=TRUE while training
ylabel = "Survived", ##<< name of dependent variable
zeroLeaf = TRUE, ##<< if TRUE discard the information gain due to splits resulting in n=1
agg = c("mean","median","none")[1], ##<< method of aggregating importance scores across trees. If "none" return the raw arrays (for debugging)
score=c("PMDI21", "MDI","MDA","MIA")[1], ##<< scoring method:MDI=mean decrease impurity (Gini),MDA=mean decrease accuracy (permutation),MIA=mean increase accuracy
Predictor=mean, ##<< function to estimate node prediction, such as Mode or mean or median. Alternatively, pass an array of numbers as replacement for the yHat column of tree
correctBias = FALSE, ##<< if TRUE multiply by n/(n-1) for the sample variance correction
verbose=0 ##<< level of verbosity
){
stopifnot(!is.null(RF$inbag))
vars = dimnames(attr(RF$terms, "factors"))
stopifnot(ylabel == setdiff(vars[[1]], vars[[2]]))
nTrees = ncol(RF$inbag)
VIbench = as.data.frame(randomForest::importance(RF))
vars = rownames(VIbench)
Gini_inbag=Gini_OOB = matrix(0,nrow=length(vars),ncol=nTrees,dimnames=list(vars,1:nTrees))
inTree=outTree=list()
for (k in 1:nTrees){
trackingInfo=c(k=k,bag="in");#print(trackingInfo)
inbag = rep(rownames(RF$inbag), times=RF$inbag[,k]) # inbag row names, keeping duplicates
trainBag = data[inbag,]
rownames(trainBag) = 1:nrow(trainBag)
#cat("k=",k,"\n")
inImp =GiniImportanceTree(trainBag,RF,k,ylabel=ylabel, zeroLeaf=zeroLeaf,score=score,Predictor=Predictor,correctBias=correctBias)
yScore=colnames(inImp)[2]
Gini_inbag[,k] = inImp[vars,yScore]
inTree[[k]] =GiniImportanceTree(trainBag,RF,k,returnTree=TRUE,ylabel=ylabel, zeroLeaf=zeroLeaf,score=score,Predictor=Predictor,correctBias=correctBias,verbose=verbose)
trackingInfo=c(k=k,bag="out");#print(trackingInfo)
outbag = names((RF$inbag[RF$inbag[,k]==0,k]))
OOB = data[outbag,]
rownames(OOB) = 1:nrow(OOB)
if (verbose<0) outTree[[k]] =GiniImportanceTree(OOB, RF, k,returnTree=TRUE,ylabel=ylabel, Predictor=inTree[[k]]$yHat, zeroLeaf=zeroLeaf,score=score,correctBias=correctBias)
outImp =GiniImportanceTree(OOB, RF, k,ylabel=ylabel, Predictor=inTree[[k]]$yHat, zeroLeaf=zeroLeaf,score=score,correctBias=correctBias,verbose=verbose)
#browser()
yScore=colnames(outImp)[2]
Gini_OOB[,k] = outImp[vars,yScore]
#VItrain["PassengerId"] = VItrain["PassengerId"] +tmp["PassengerId","sum_IG_gini"]
#VItrain["Sex"] = VItrain["Sex"] + tmp["Sex","sum_IG_gini"]
#VItrain["Pclass"] = VItrain["Pclass"] + tmp["Pclass","sum_IG_gini"]
}
if (verbose<0) return(list(inTree=inTree, outTree=outTree))
#browser()
if (agg == "none"){
return(list(Gini_OOB=Gini_OOB,Gini_inbag=Gini_inbag))
} else if (agg == "mean"){
VIbench$Gini_OOB = rowMeans(Gini_OOB, na.rm = TRUE)
VIbench$Gini_inbag = rowMeans(Gini_inbag, na.rm = TRUE)
} else if (agg == "median"){
VIbench$Gini_OOB = apply(Gini_OOB,1,median, na.rm = TRUE)
VIbench$Gini_inbag = apply(Gini_inbag,1,median, na.rm = TRUE)
}
VIbench$Gini_OOB_sd = apply(Gini_OOB,1,sd, na.rm = TRUE)/sqrt(ncol(Gini_OOB))
VIbench$Gini_inbag_sd = apply(Gini_inbag, 1,sd, na.rm = TRUE)/sqrt(ncol(Gini_inbag))
return(VIbench)
### data frame with variable importance scores and their standard errors
} , ex = function(){
data("titanic_train", package = "rfVarImpOOB", envir = environment())
set.seed(123)
ranRows=sample(nrow(titanic_train), 300)
data=titanic_train[ranRows,]
RF = randomForest::randomForest(formula = Survived ~ Sex + Pclass + PassengerId,
data=data,
ntree=5,importance=TRUE,
mtry=3,keep.inbag=TRUE,
nodesize = 20)
data$Survived = as.numeric(data$Survived)-1
VI_Titanic = GiniImportanceForest(RF, data, ylabel="Survived")
})
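## The returned data frame extends randomForest's importance measures with the
## columns Gini_inbag, Gini_OOB and their standard errors; a hedged inspection
## sketch (assuming VI_Titanic from the example above):
if (0){
VI_Titanic[, c("Gini_inbag", "Gini_OOB", "Gini_inbag_sd", "Gini_OOB_sd")]
}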
GiniImportanceTree <- structure(function# computes Gini information gain for one tree from randomForest
### computes importance scores for an individual tree.
### These can be based on Gini impurity or Accuracy or logloss
(
bag, ##<< data to compute the Gini gain for
RF, ##<< object returned by call to randomForest()
k, ##<< which tree
ylabel = "Survived", ##<< name of dependent variable
returnTree = FALSE, ##<< if TRUE returns the tree data frame otherwise the aggregated Gini importance grouped by split variables
zeroLeaf = TRUE, ##<< if TRUE discard the information gain due to splits resulting in n=1
score=c("PMDI21", "MDI","MDA","MIA")[1],
### scoring method:PMDI=mean decrease penalized Gini impurity (note:the last digit is the exponent of the penalty!),
### MDI=mean decrease impurity (Gini), MDA=mean decrease accuracy (permutation),
### MIA=mean increase accuracy
Predictor=mean, ##<< function to estimate node prediction, such as Mode or mean or median. Alternatively, pass an array of numbers as replacement for the yHat column of tree
correctBias = FALSE, ##<< if TRUE multiply by n/(n-1) for the sample variance correction
verbose=0 ##<< level of verbosity
){
#dummy bindings to avoid "no visible binding for global variable" NOTEs from R CMD check,
#see https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
IG_acc=IG_gini=`split var`=NULL
stopifnot(all(sort(base::unique(bag[,ylabel])) == c(0,1)))
tree = as.data.frame(randomForest::getTree(RF, k, labelVar = TRUE))
tree$gini_index = NA
tree$IG_gini= NA
if (score =="MIA"){
tree$Accuracy = NA
tree$IG_acc = NA
}
if (is.function(Predictor)) {
tree$yHat = sample(bag[,ylabel], nrow(tree)) # placeholder; overwritten per node in the loop below
} else {
tree$yHat= Predictor
}
tree$node=NA
tree = preorder2(1, bag, tree) # uses unmodified row names (e.g. "6.1")
for (j in 1:nrow(tree)){
y = bag[unlist(tree[j,"node"]), ylabel] # y must be coded as (0,1)!
nNode = length(na.omit(y))
if (is.function(Predictor)) {
tree$yHat[j] = Predictor(na.omit(y))
tree$gini_index[j] = round(gini_index(mean(y,na.rm=TRUE),correctBias=correctBias,nNode=nNode),9)
} else if (grepl("PMDI", score) ) {#penalized Gini for validation data only:
p=as.numeric(substr(score,5,5))#exponent of the penalty
kind=as.numeric(substr(score,6,6))#kind of penalty
if (verbose) cat("exponent of the penalty:",p,",type: ", kind,"\n")
tree$gini_index[j] = round(gini_index(mean(y,na.rm=TRUE),Predictor[j],p,kind,nNode=nNode,correctBias=correctBias),9)
} else {# this should be for validation data only:
kind = 1 # note: 'kind' is only defined in the PMDI branch above; assume penalty kind 1 (as in the default PMDI21) here
tree$gini_index[j] = round(gini_index(mean(y,na.rm=TRUE),Predictor[j],kind=kind,correctBias=correctBias),9)
}
}
if (score =="MIA"){
for (j in 1:nrow(tree)){
y=bag[unlist(tree[j,"node"]),ylabel]
tree$Accuracy[j] = Accuracy(y, yHat = Mode(na.omit(y)))
}
}
if (verbose>1) browser()
tree=InfGain(tree, zeroLeaf=zeroLeaf,score=score)
if (returnTree) return(tree)
if (score =="MIA") {
df = dplyr::group_by(tree[,c('split var', 'IG_acc')], `split var`)
pivot = dplyr::summarise(df,`sum_IG_acc`=sum(`IG_acc`))
} else{
df = dplyr::group_by(tree[,c('split var', 'IG_gini')], `split var`)
pivot = dplyr::summarise(df,`sum_IG_gini`=sum(`IG_gini`))
}
count = dplyr::count(df)
pivot$`count` = count$n
pivot = as.data.frame(pivot)
pivot = pivot[!is.na(pivot$`split var`),]
rownames(pivot) = pivot[,1]
if (any(is.na(pivot))) browser()
return(pivot)
### if returnTree==TRUE returns the tree data frame otherwise the aggregated Gini importance grouped by split variables
} , ex = function(){
rfTit = rfTitanic(nRows = 500,nodesize=10)
rfTit$data$Survived = as.numeric(rfTit$data$Survived)-1
k=1
tmp <- InOutBags(rfTit$RF, rfTit$data, k)
IndivTree = randomForest::getTree(rfTit$RF, k)
InTree = GiniImportanceTree(tmp$inbag,rfTit$RF,k,returnTree=TRUE)
OutTree = GiniImportanceTree(tmp$outbag,rfTit$RF,k,returnTree=TRUE)
})
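## With returnTree=FALSE (the default) the function instead returns the Gini
## gain aggregated by split variable; a hedged sketch reusing the example objects:
if (0){
GiniImportanceTree(tmp$inbag, rfTit$RF, k) # columns: split var, sum_IG_gini (or sum_IG_acc), count
}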
rfTitanic <- structure(function# fit a random forest model on the titanic data
### convenience function to reduce the overhead of repeatedly fitting a RF to the titanic data
(
formel = Survived ~ Sex + Pclass + PassengerId, ##<< formula
nRows=500, ##<< subsample size
ntree=10, ##<< number of trees
mtry=3, ##<< mtry
nodesize = 1 ##<< nodesize
){
titanic_train = NULL # avoid R CMD check NOTE; overwritten by data() below
data("titanic_train", package = "rfVarImpOOB", envir = environment())
set.seed(123)
ranRows=sample(nrow(titanic_train), nRows)
train=titanic_train[ranRows,]
rf = randomForest::randomForest(formula = formel,
data=train,
ntree=ntree,importance=TRUE,
mtry=mtry,keep.inbag=TRUE,
nodesize = nodesize)
return(list(RF=rf,data=train))
} , ex = function(){
rfTit = rfTitanic(nRows = 500,nodesize=10)
})
Mode <- structure(function# computes the mode of a vector
### returns the most frequent value of a vector
(
x ##<< vector to find mode of
) {
ux <- unique(x) # candidate values
ux[which.max(tabulate(match(x, ux)))] # most frequent value; ties broken by first occurrence
}, ex = function(){
Mode(rep(letters[1:3],1:3))
Mode(c(TRUE,TRUE,FALSE))
Mode(c(TRUE,TRUE,FALSE,FALSE))
})
if (0){
library(inlinedocs)
setwd("C:/Users/loecherm/Dropbox/Markus/Research/codeandstats/randomforest_investigation/")
package.skeleton.dx("rfVarImpOOB");system("rm rfVarImpOOB/man/rfVarImpOOB-package.Rd");
system("rm rfVarImpOOB/man/InfGain.Rd");#system("rm rfVarImpOOB/man/InOutBags.Rd")
}