Nothing
#' Clustering the classification trees in a forest based on similarities
#'
#' A function to get insight into a forest of classification trees by clustering the trees in a forest using Partitioning Around Medoids (PAM, Kaufman & Rousseeuw, 2009), based on user provided similarities,
#' or based on similarities calculated by the package using a similarity measure chosen by the user (see Sies & Van Mechelen, 2020).
#'
#' The user should provide the number of clusters that the solution should contain, or a range of numbers that should be explored.
#' In the latter case, the resulting clusterforest object will contain clustering results for each solution.
#' On this clusterforest object, several methods, such as plot, print and summary, can be used.
#'
#' @param observeddata The entire observed dataset
#' @param treedata A list of dataframes on which the trees are based. Not necessary if the data set is included in the tree object already.
#' @param trees A list of trees of class party, classes inheriting from party (e.g., glmtree), classes that can be coerced to party (i.e., rpart, Weka_tree, XMLnode), or a randomForest or ranger object.
#' @param simmatrix A similaritymatrix with the similarities between all trees. Should be square, symmetric and have ones on the diagonal. Default=NULL
#' @param m Similarity measure that should be used to calculate similarities, in the case that no similarity matrix was provided by the user. Default=NULL.
#' m=1 is based on counting common predictors;
#' m=2 is based on counting common predictor-split point combinations;
#' m=3 is based on common ordered sets of predictor-range part combinations (see Shannon & Banks (1999));
#' m=4 is based on the agreement of partitions implied by leaf membership (Chipman, 1998);
#' m=5 is based on the agreement of partitions implied by class labels (Chipman, 1998);
#' m=6 is based on the number of predictor occurrences in definitions of leaves with same class label;
#' m=7 is based on the number of predictor-split point combinations in definitions of leaves with same class label
#' m=8 measures closeness to logical equivalence (applicable in case of binary predictors only)
#' @param tol A vector with for each predictor a number that defines the tolerance zone within which two split points of the predictor in question are assumed equal. For example, if the tolerance for predictor X
#' is 1, then a split on that predictor in tree A will be assumed equal to a split in tree B as long as the splitpoint in tree B is within the splitpoint in tree A + or - 1. Only applicable for m=1 and m=6. Default=NULL
#' @param weight If 1, the number of dissimilar paths in the Shannon and Banks measure (m=2), should be weighted by 1/their length (Otherwise they are weighted equally). Only applicable for m=2. Default=NULL
#' @param fromclus The lowest number of clusters for which the PAM algorithm should be run. Default=1.
#' @param toclus The highest number of clusters for which the PAM algorithm should be run. Default=1.
#' @param treecov A vector/dataframe with the covariate value(s) for each tree in the forest (1 column per covariate) in the case of known
#' sources of variation underlying the forest, that should be linked to the clustering solution.
#' @param sameobs Are the same observations included in every tree data set? For example, in the case of subsamples or bootstrap samples, the answer is no. Default=FALSE
#' @param seed A seed number that should be used for the multi start procedure (based on which initial medoids are assigned). Default=NULL.
#' @param no_cores Number of CPU cores used for computations. Default=detectCores(logical=FALSE)
#' @return The function returns an object of class clusterforest, with attributes:
#' \item{medoids}{the position of the medoid trees in the forest (i.e., which element of the list of partytrees)}
#' \item{medoidtrees}{the medoid trees}
#' \item{clusters}{The cluster to which each tree in the forest is assigned}
#' \item{avgsilwidth}{The average silhouette width for each solution (see Kaufman and Rousseeuw, 2009)}
#' \item{accuracy}{For each solution, the accuracy of the predicted class labels based on the medoids.}
#' \item{agreement}{For each solution, the agreement between the predicted class label for each observation based on the forest as a whole, and those based on the
#' medoids only (see Sies & Van Mechelen,2020)}
#' \item{withinsim}{Within cluster similarity for each solution (see Sies & Van Mechelen, 2020)}
#' \item{treesimilarities}{Similarity matrix on which clustering was based}
#' \item{treecov}{covariate value(s) for each tree in the forest}
#' \item{seed}{seed number that was used for the multi start procedure (based on which initial medoids were assigned)}
#' @export
#' @importFrom cluster pam
#' @importFrom partykit nodeids data_party id_node kids_node varid_split split_node index_split breaks_split right_split node_party
#' @importFrom stats predict
#' @importFrom graphics axis plot
#' @importFrom parallel detectCores makeCluster clusterExport parSapply stopCluster
#' @importFrom igraph graph_from_incidence_matrix max_bipartite_match
#' @importFrom stats complete.cases
#' @importFrom methods is
#' @importFrom ranger treeInfo
#' @importFrom randomForest getTree
#' @importFrom foreach foreach %do% %dopar%
#' @importFrom doParallel registerDoParallel
#' @import MASS
#' @import partykit
#' @import rpart
#'
#' @references \cite{Kaufman, L., & Rousseeuw, P. J. (2009). Finding groups in data: an introduction to cluster analysis (Vol. 344). John Wiley & Sons.}
#' @references \cite{Sies, A. & Van Mechelen I. (2020). C443: An R-package to see a forest for the trees. Journal of Classification.}
#' @references \cite{Shannon, W. D., & Banks, D. (1999). Combining classification trees using MLE. Statistics in medicine, 18(6), 727-740.}
#' @references \cite{Chipman, H. A., George, E. I., & McCulloh, R. E. (1998). Making sense of a forest of trees. Computing Science and Statistics, 84-92.}
#' @examples
#' require(MASS)
#' require(ranger)
#' require(rpart)
#'#Function to draw a bootstrap sample from a dataset
#'DrawBoots <- function(dataset, i){
#'set.seed(2394 + i)
#'Boot <- dataset[sample(1:nrow(dataset), size = nrow(dataset), replace = TRUE),]
#'return(Boot)
#'}
#'
#'#Function to grow a tree using rpart on a dataset
#'GrowTree <- function(x,y,BootsSample, minsplit = 40, minbucket = 20, maxdepth =3){
#' controlrpart <- rpart.control(minsplit = minsplit, minbucket = minbucket, maxdepth = maxdepth,
#' maxsurrogate = 0, maxcompete = 0)
#' tree <- rpart(as.formula(paste(noquote(paste(y, "~")), noquote(paste(x, collapse="+")))),
#' data = BootsSample, control = controlrpart)
#' return(tree)
#'}
#'
#'#Use functions to draw 10 boostrapsamples and grow a tree on each sample
#'Boots<- lapply(1:10, function(k) DrawBoots(Pima.tr ,k))
#'Trees <- lapply(1:10, function (i) GrowTree(x=c("npreg", "glu", "bp", "skin",
#' "bmi", "ped", "age"), y="type", Boots[[i]] ))
#'
#'#Clustering the trees in this forest
#'ClusterForest<- clusterforest(observeddata=Pima.tr,treedata=Boots,trees=Trees,m=1,
#'fromclus=1, toclus=2, sameobs=FALSE, no_cores=2)
#'
#'#Example RandomForest
#'Pima.tr.ranger <- ranger(type ~ ., data = Pima.tr, keep.inbag = TRUE, num.trees=20,
#'max.depth=3)
#'
#'ClusterForest<- clusterforest(observeddata=Pima.tr,trees=Pima.tr.ranger,m=5,
#' fromclus=1, toclus=2, sameobs=FALSE, no_cores=2)
clusterforest <- function (observeddata, treedata=NULL, trees, simmatrix=NULL, m=NULL, tol=NULL, weight=NULL,fromclus=1, toclus=1, treecov=NULL, sameobs=FALSE, seed=NULL, no_cores = detectCores(logical=FALSE)){
############################## Check forest input #####################
### Some checks whether correct forest information is provided by user
if(typeof(trees) != "list" ) {
cat("trees must be a list of party tree objects, a list of trees that can be converted to party objects, or a randomforest or ranger object")
return(NULL)
}
if('ranger' %in% class(trees)){
trees<-sapply(1:trees$num.trees, function (k) ranger2party(observeddata, trees, k))
}else if('randomForest' %in% class(trees)){
trees<-sapply(1:trees$ntree, function (k) randomForest2party(observeddata, trees, k))
}else if(!'party' %in% class(trees[[1]])){
tryCatch(trees<- lapply(1:length(trees), function (i) as.party(trees[[i]])),
error=function(e){cat("trees must be a list of party tree objects or objects that can be coerced to party trees")})
}
if(!is.null(trees[[1]]$data)){
treedt<- lapply(1:length(trees), function(k) trees[[k]]$data)
}
if(is.null(trees[[1]]$data)){
if(is.null(treedata)){
cat("please submit treedata")
return(NULL)
}
if(typeof(treedata) != "list" || !is(treedata[[1]], "data.frame")) {
cat("please submit correct treedata: a list of data frames on which the trees were grown")
return(NULL)
}
if(length(treedata) != length(trees)){
cat("the number of data frames provided must be the same as the number of trees")
return(NULL)
}
treedt=treedata
}
cl <- makeCluster(no_cores)
doParallel::registerDoParallel(cl)
## Turn user provided forest information into object of class forest.
forest <- list(partytrees = trees, treedata = treedt, observeddata=observeddata)
class(forest) <- append(class(forest), "forest")
#########################################################################
########################## Calculate Similarities #######################
# Check whether at least one of the arguments is not null (user provided
# similarity matrix or similarity measure)
if (is.null(simmatrix) & is.null(m)){
cat("Either a similarity matrix (simmatrix) should be provided, or
a similarity measure (m) should be chosen")
return(NULL)
}
# If user provided similarity matrix, check whether it is a square matrix,
# whether it's symmetric and whether it contains ones on the diagonal.
if (!is.null(simmatrix)){
#check square
if(nrow(simmatrix) != ncol(simmatrix)){
cat("The similarity matrix should be a square matrix")
return(NULL)
}
#check symmetric
if(!isSymmetric(simmatrix)){
cat("The similarity matrix should be a symmetric")
return(NULL)
}
#check ones on diagonal
if(!sum(diag(simmatrix) == 1) == nrow(simmatrix)){
cat("The similarity matrix should have ones on the diagonal")
return(NULL)
}
if(nrow(simmatrix) != length(trees)){
cat("The similarity matrix should have the same dimensions as the number of trees in the forest")
return(NULL)
}
#turn into treesimilarities object
treesimilarities <- simmatrix
attr(treesimilarities, "class") <- "treesimilarities"
}
# If user didn't provide similarity matrix -- calculate using chosen measure
if (is.null(simmatrix) & !is.null(m)){
X= unlist(unique(lapply(1:length(forest$partytrees), function (k) attr(forest$partytrees[[k]]$terms, "term.labels"))))
Y= unlist(lapply(1:length(forest$partytrees), function(k) colnames(forest$treedata[[k]])[!colnames(forest$treedata[[k]]) %in% X]))
if (m == 1){
sim <- M1(forest$observeddata, forest$treedata, Y, X, forest$partytrees, tol=NULL)
}
if(m == 2){
if(is.null(tol)){
cat("Please provide a tolerance zone for each predictor")
return(NULL)
} else{
sim <- M1(forest$observeddata, forest$treedata, Y, X, forest$partytrees, tol)
}
}
if (m == 3){
Xclass<- sapply(1:length(X), function(i) is(forest$observeddata[,X[i]], "integer" )|is(forest$observeddata[,X[i]],"numeric" ))
if("FALSE" %in% Xclass){
cat("This measure only works on numerical splitvariables (class integer or numeric)")
return(NULL)
} else{
sim <- M2(forest$observeddata, forest$treedata, Y, forest$partytrees, weight)
}
}
if (m == 4){
sim <- M4(forest$observeddata, forest$treedata, Y, forest$partytrees,sameobs)
}
if (m == 5){
sim <- M3(forest$observeddata, forest$treedata, Y,forest$partytrees, sameobs)
}
if (m == 6){
Xclass<- sapply(1:length(X), function(i) is(forest$observeddata[,X[i]] , "integer")|is(forest$observeddata[,X[i]], "numeric" ) )
if("FALSE" %in% Xclass){
cat("This measure only works on numerical splitvariables (class integer or numeric)")
return(NULL)
}
sim <- M6(forest$observeddata, forest$treedata, Y, X, forest$partytrees, tol=NULL)
}
if(m==7){
Xclass<- sapply(1:length(X), function(i) is(forest$observeddata[,X[i]], "integer" )|is(forest$observeddata[,X[i]], "numeric" ) )
if("FALSE" %in% Xclass){
cat("This measure only works on numerical splitvariables (class integer or numeric)")
return(NULL)
}
if(is.null(tol)){
cat("Please provide a tolerance zone for each predictor")
return(NULL)
} else{
sim <- M6(forest$observeddata, forest$treedata, Y, X, forest$partytrees, tol)
}
}
if (m == 8){
Xclass<- sapply(1:length(X), function(i) is(forest$fulldata[,X[i]], "factor" ) )
if("FALSE" %in% Xclass){
cat("This measure only works on binary splitvariables (class factor)")
return(NULL)
}else{
Xbin<- sapply(1:length(X), function(i) levels(forest$fulldata[,X[i]]) > 2 )
if("FALSE" %in% Xbin){
cat("This measure only works on binary splitvariables (class factor)")
return(NULL)
} else{
sim <- M5(forest$fulldata, forest$treedata, Y, X, forest$partytrees, sameobs)
}
}
}
# Turn similarity matrix into treesimilarities object
treesimilarities <- sim
attr(treesimilarities, "class") <- "treesimilarities"
}
################################# End similarities #######################
############################## Clustering ################################
trees=forest$partytrees
treedata=forest$treedata
observeddata=forest$observeddata
medoids<- list(0)
clusters<- list(0)
mds<- list(0)
sil<- list(0)
agreement<-list(0)
sums<- list(0)
meds<- list(0)
obj<- c(0)
medsseeds<- list(0)
correct<- list(0)
pamtrees <- lapply(1:length(trees), function (i) pamtree(observeddata, treedata[[i]], Y[i], trees[[i]]))
g <- lapply(1:length(trees), function(k) pamtrees[[k]]$predresp)
g_matrix <- t(sapply(1:length(trees), function(k) as.numeric(levels(g[[k]])[g[[k]]])))
forpred <- function(k) {
levels_g1[which.max( c(sum(g_matrix[,k] == levels_g1[1], na.rm=TRUE), sum(g_matrix[,k] == levels_g1[2], na.rm=TRUE)))]
}
levels_g1 = levels(g[[1]])
forestpred <- sapply(1:nrow(observeddata), forpred)
for (i in fromclus:toclus) {
# first do the clustering with BUILD and SWAP phase
clustering <- pam(x = 1 - treesimilarities, k = i, diss = TRUE, pamonce=3)
if(is.null(seed)){
seed <- round(runif(1,0,100000))
}
#Then try 100 random starts + SWAP Phase
for (j in 1:100){
set.seed(seed+j)
initmedoids = sample(1:nrow(treesimilarities),i)
medsseeds[[j]] <- pam(1-treesimilarities,k=i,medoids=initmedoids, diss=TRUE, pamonce=3)
obj[j] <- medsseeds[[j]]$objective[2]
}
# check whether objective function of one of the multistarts is better than the one of the build algorithm
# and if so, continue with the best result
if( round(min(obj), 10) < round(clustering$objective[2], 10) ){
clustering <- medsseeds[[ which.min(round(min(obj), 10))]]
}
medoids[[i]] <- clustering$medoids
clusters[[i]] <- clustering$clustering
meds <- vector(mode = "list", length = i)
for(j in 1:i){
meds[[j]] <- trees[[medoids[[i]][j]]]
}
mds[[i]] <- meds
sil[[i]] <- clustering $ silinfo $ avg.width
gmed <- g[c(medoids[[i]])]
w <- table(clusters[[i]])
#unweighted
#medpred1 <- sapply(1:nrow(observeddata), function (k) levels(gmed[[1]])[which.max( c(sum(unlist(lapply(gmed, `[[`, k)) == levels(unlist(lapply(gmed, `[[`, k)))[1], na.rm=TRUE), sum(unlist(lapply(gmed, `[[`, k)) == levels(unlist(lapply(gmed, `[[`, k)))[2], na.rm=TRUE) ) )] )
#gmed_matrix <- as.numeric(levels(gmed[[1]])[gmed[[1]]])
gmed_matrix <- t(sapply(1:i, function(k) as.numeric(levels(gmed[[k]])[gmed[[k]]])))
levels_gmed1 = levels(gmed[[1]])
medpred_weighted <- function(k) {
levels_gmed1[which.max( c(sum((gmed_matrix[,k] == levels_gmed1[1]) * w , na.rm=TRUE), sum((gmed_matrix[,k] == levels_gmed1[2]) * w, na.rm=TRUE)))]
}
#weighted
medpred <- sapply(1:nrow(observeddata), medpred_weighted)
agreement[[i]] <- mean(forestpred == medpred)
correct[[i]] <- mean(forest$observeddata[,Y][1] == medpred)
sumw<- numeric(0)
for (j in 1:i){
sumw[j] <- sum(treesimilarities[clusters[[i]]==j, medoids[[i]][j]])
}
sums[[i]] <- sum(sumw) / dim(treesimilarities)[1]
}
stopCluster(cl)
correct[[toclus + 1]] <- mean(forest$observeddata[,Y][1]== forestpred)
correct[[toclus + 2]] <- max(table(forest$observeddata[,Y][1]))/sum(table(forest$observeddata[,Y][1]))
value <- list(medoids = medoids, medoidtrees = mds, clusters=clusters, avgsilwidth=sil, agreement=agreement, accuracy=correct, withinsim=sums, treesimilarities=treesimilarities, treecov=treecov, seed=seed)
attr(value, "class") <- "clusterforest"
return(value)
}
################# subfunctions #############################
#Measure 1: number of splitvariables & splitpoints in common/total number of splitvariables largest tree
#If tol = null, only splitvariables taken into account
M1 <- function (observeddata, treedata, Y, X, trees, tol){
#check whether there are any categorical predictors
#if so, this measure with splitvariables can not be used.
Xclass<- sapply(1:length(X), function(i) class(observeddata[,X[i]] ) == "integer"|class(observeddata[,X[i]] ) == "numeric")
if(!is.null(tol)){
if("FALSE" %in% Xclass){
cat("This measure only works on numerical splitvariables (class integer or numeric)")
return(NULL)
}
}
pamtrees <- lapply(1:length(trees), function (i) pamtree(observeddata,treedata[[i]], Y[i],trees[[i]]))
simmatrix <- matrix(c(0), length(trees), length(trees))
splits <- lapply(1:length(trees), function(i) splitv(pamtrees[[i]], tol))
s <- sapply(1:length(trees), function (k) sapply(k:length(trees), function (l) sim(splits1=splits[[k]], splits2=splits[[l]], X=X, tol=tol)))
#replace naNs -- trees without splits should have similarity of 1 with each other
for (i in 1:length(trees)){
si<- s[[i]]
si[is.nan(si)] <- 1
simmatrix[i, c(i:length(trees))] <- si
}
ind <- lower.tri(simmatrix) #make matrix symmetric
simmatrix[ind] <- t(simmatrix)[ind]
return(simmatrix)
}
### Measure 2: Paths
M2<- function(observeddata, treedata, Y, trees, weight){
if (is.null(weight)){weight=0}
pamtrees <- lapply(1:length(trees), function (i) pamtree(observeddata,treedata[[i]], Y[i],trees[[i]]))
n <- length(pamtrees)
simmatrix <- matrix(c(0), n, n)
subs <- sapply (1:n, function (k) returnsubpaths(pamtrees[[k]])) #split up each path into all subpaths
dis <- matrix(c(0), length(pamtrees), length(pamtrees))
d <- sapply(1:n, function (s) sapply(s:n, function (j) dissim(subs[[s]], subs[[j]], weight) ))
for (i in 1:n){
dis[i, c(i:n)] <- d[[i]]
}
ind <- lower.tri(dis) #make matrix symmetric
dis[ind] <- t(dis)[ind]
sim <- 1 - round(dis,4)
return(sim)
}
####### Measure 3: classification labels agreement
M3<- function (observeddata, treedata, Y, trees, sameobs){
pamtrees<- lapply(1:length(trees), function (i) pamtree(observeddata,treedata[[i]], Y[i],trees[[i]]))
n <- length(pamtrees)
sim <- matrix(0, length(pamtrees), length(pamtrees))
# if treedata contains same observations as fulldata, then use the training data to evaluate agreement,
#otherwise use fulldata
if(sameobs==TRUE){
s <- sapply(1:n, function (s) sapply(s:n, function (j) mean(pamtrees[[s]]$predresptrain==pamtrees[[j]]$predresptrain)))
}else{
s <- sapply(1:n, function (s) sapply(s:n, function (j) mean(pamtrees[[s]]$predresp==pamtrees[[j]]$predresp)))
}
for (i in 1:n){
sim[i,c(i:n)] <- s[[i]]
}
ind <- lower.tri(sim) #make matrix symmetric
sim[ind] <- t(sim)[ind]
return(sim)
}
###### Measure 4: Partition metric ################
M4 <- function (observeddata, treedata, Y, trees, sameobs){
pamtrees <- lapply(1:length(trees), function (i) pamtree(observeddata,treedata[[i]], Y[i],trees[[i]]))
n <- length(trees)
if(sameobs==TRUE){
par<- lapply(1:n, function (s) part(treedata[[s]], pamtrees[[s]]$prednodetrain))
}else{
par<- lapply(1:n, function (s) part(treedata[[s]], pamtrees[[s]]$prednode))
}
par<- lapply(1:n, function (s) part(treedata[[s]], pamtrees[[s]]$prednode))
si <- sapply(1:n, function (s) sapply (s:n, function (j) metr(par[[s]], par[[j]], treedata[[s]])))
sim <- matrix(0, length(trees), length(trees))
for (i in 1:n){
sim[i, c(i:n)] <- si[[i]]
}
ind <- lower.tri(sim) #make matrix symmetric
sim[ind] <- t(sim)[ind]
return(sim)
}
### set of splitvariables and splitpoints and the prediction of a leaf
M6 <- function (observeddata, treedata, Y, X, trees, tol){
pamtrees <- lapply(1:length(trees), function (i) pamtree(observeddata,treedata[[i]], Y[i],trees[[i]]))
n <- length(pamtrees)
s <- lapply(1:n, function (k) splitvsets(pamtrees[[k]]$path1))
s0<- lapply(1:n, function (k) splitvsets(pamtrees[[k]]$path0))
for (k in 1:n){
if(is(treedata[[1]][,Y[1]], "factor") ) {
treedata[[k]][,Y[k]] <- as.factor(treedata[[k]][,Y[k]])
}else{
treedata[[k]] = treedata[[k]]
}#check whether why is a factor, if not-- make it a factor
}
best<- lapply(1:n, function(k) (sum(treedata[[k]][,Y[k]] == levels(treedata[[k]][,Y[k]])[2])/nrow(treedata[[k]] ))>0.5)
simmatrix <- foreach (k = 1:n, .combine=cbind) %dopar% {
sim <- numeric(n)
for (l in k:n) {
sim[l] <- simsets(paths1=s[[k]], paths2=s[[l]], paths01=s0[[k]], paths02=s0[[l]], X=X, tol=tol, best1=best[[k]], best2=best[[l]])
}
sim
}
for (k in 1:n) {
for (l in k:n) {
simmatrix[k, l] = simmatrix[l, k]
}
}
simmatrix = matrix(simmatrix, ncol=length(pamtrees), nrow=length(pamtrees), dimnames = NULL)
return(simmatrix)
}
M5 <- function(observeddata, treedata, Y, X, trees, sameobs){
pamtrees <- lapply(1:length(trees), function (i) pamtree(observeddata,treedata[[i]], Y[i],trees[[i]]))
n <-length(pamtrees)
s <- matrix(0, length(pamtrees), length(pamtrees))
di <- lapply(1:length(pamtrees), function (i) disjnorm(pamtrees[[i]], trees[[i]],observeddata, treedata[[i]] ,X, Y[i], sameobs))
if(sameobs==TRUE){
si <- sapply(1:n, function (i) sapply (i:n, function (j) dis(di[[i]], di[[j]], pamtrees[[i]]$predresptrain, pamtrees[[j]]$predresptrain)))
}
else{
si <- sapply(1:n, function (i) sapply (i:n, function (j) dis(di[[i]], di[[j]], pamtrees[[i]]$predresp, pamtrees[[j]]$predresp)))
}
si <- sapply(1:n, function (i) sapply (i:n, function (j) dis(di[[i]], di[[j]], pamtrees[[i]]$predresp, pamtrees[[j]]$predresp)))
for (i in 1:n){
s[i, c(i:n)] <- si[[i]]
}
ind <- lower.tri(s) #make matrix symmetric
s[ind] <- t(s)[ind]
s<- round(s, digits=3)
return(s)
}
## Function to turn each tree into a pam tree, containing the set of rules, the predictions on the full dataset and the nodes
## observeddata is the full dataset, treedata is the dataset for the current tree, Y is a vector with the outcome for each tree and tree are the partytrees
pamtree <- function(observeddata,treedata,Y,tree){
if(length(tree) > 2){ ## Check whether there was a split
paths <- listrulesparty(x=tree) # lists all the paths from root to leave
prednode <- predict(tree, newdata = observeddata, type = "node") #predicts node for every row of full data
if(!is(treedata[,Y], "factor" )) {treedata[,Y] <- as.factor(treedata[,Y])} #check whether y is a factor, if not-- make it a factor
if(!is(observeddata[,Y], "factor") ) {observeddata[,Y] <- as.factor(observeddata[,Y])} #check whether y is a factor, if not-- make it a factor
predresp <- predict(tree, newdata= observeddata, type="response") #predicts response for every row of full data
predresp <- factor(predresp, levels=c(levels(observeddata[,Y])[1], levels(observeddata[,Y])[2]))
predresptrain <- predict(tree, newdata = treedata, type="response") #predicts response for every row of full data
predresptrain <- factor(predresptrain, levels=c(levels(observeddata[,Y])[1], levels(observeddata[,Y])[2]))
prednodetrain <- predict(tree, newdata = treedata, type = "node") #predicts node for every row of tree data
frame <- matrix(c(0), length(unique(prednodetrain)), 2) #create matrix with one row for each node value
frame[, 1] <- sort(unique(prednodetrain))
frame[, 2] <- sapply(sort(unique(prednodetrain)), function(k) levels(predresptrain[prednodetrain == k][1])[predresptrain[prednodetrain == k][1]]) # check the predicted response for every node
#the paths that lead to a response of the second level of y
path1 <- paths[frame[, 2] == levels(observeddata[,Y])[2]]
path0 <- paths[frame[, 2] == levels(observeddata[,Y])[1]]
paths <- sapply(1:length(paths), function (k) strsplit(paths[k], " & ")) #split rules with multiple conditions in substrings
path1 <- sapply(1:length(path1), function (k) strsplit(path1[k], " & "))
path0 <- sapply(1:length(path0), function (k) strsplit(path0[k], " & "))
#if there was no split
}else{
paths<- NULL
path1<-NULL
path0<- NULL
tree <- list(numeric(0), numeric(0))
if(is(treedata[,Y], "factor") ) {treedata[,Y] <- as.factor(treedata[,Y])}
y <- treedata[, Y]
#check what class is most prevalent
if(sum(y == levels(y)[1], na.rm=T) > sum(y == levels(y)[2], na.rm=T)){
g1 <- levels(y)[1]
} else{
g1<- levels(y)[2]
}
predresp <- rep(g1, length(y))
predresp<- factor(predresp, levels=levels(treedata[,Y]))
prednode <- rep(1, length(y))
prednodetrain <- rep(1, length(y))
predresptrain<- rep(g1, length(y))
}
value <- list(pamtree = paths, path0=path0, path1= path1, prednode = prednode, predresp=predresp,prednodetrain=prednodetrain, predresptrain=predresptrain)
attr(value, "class") <- "pamtree"
return(value)
}
#### grow party tree (for turning ranger/randomforest tree into partytree)
grow.party.tree <- function(party.tree, ranger.tree, data, factor.terms.index, currentNodeNumber) {
node <- ranger.tree[ranger.tree$nodeID == currentNodeNumber, ]
factor.terms.index.left <- factor.terms.index
factor.terms.index.right <- factor.terms.index
# Create individual node
if (node$terminal == TRUE) {
newNode <- list(id = node$nodeID)
} else {
data.is.factor <- is.factor(data[[node$splitvarName]])
data.is.ordered <- is.ordered(data[[node$splitvarName]])
if (data.is.factor && !data.is.ordered) {
index <- factor.terms.index[[node$splitvarName]]
index[as.integer(unlist(strsplit(node$splitval, ',')))] = 2L
newNode <- list(id = node$nodeID,
split = partysplit(
varid = as.integer(node$splitvarID + 1),
index = index
),
kids = c(as.integer(node$leftChild), as.integer(node$rightChild)))
factor.terms.index.left[[node$splitvarName]] <- replace(index, index==2L, NA)
factor.terms.index.right[[node$splitvarName]] <- replace(replace(index, index==1L, NA), index==2L, 1L)
} else {
newNode <- list(id = node$nodeID, split = partysplit(varid = as.integer(node$splitvarID + 1), breaks = as.numeric(node$splitval)), kids = c(as.integer(node$leftChild), as.integer(node$rightChild)))
}
}
# Traverse tree recursively
if (node$terminal == FALSE) {
leftChildren <- grow.party.tree(party.tree, ranger.tree, data, factor.terms.index.left, node$leftChild)
rightChildren <- grow.party.tree(party.tree, ranger.tree, data, factor.terms.index.right, node$rightChild)
party.tree <- c(party.tree, leftChildren, rightChildren)
}
# Add newly created node to list of nodes
party.tree <- c(party.tree, list(newNode))
party.tree
}
generic2party <- function(data, generic.tree, inbag, formula, weights) {
response <- all.vars(formula)[1]
terms <- terms(formula, data = data)
factor.terms <- all.vars(terms)[-1]
factor.terms.index <- list()
for(factor.term in factor.terms) {
factor.terms.index[[factor.term]] <- rep(1L, length(levels(data[[factor.term]])))
}
data <- data[complete.cases(data), c(all.vars(terms)[-1], response)]
data <- as.data.frame(lapply(data, rep, inbag))
if (is.null(weights)) {
weights <- rep(1L, nrow(data))
}
nodelist = list()
nodelist <- grow.party.tree(nodelist, generic.tree, data, factor.terms.index, 0)
nodes <- as.partynode(nodelist)
fitted <- fitted_node(nodes, data = data)
tree <- party(nodes,
data = data,
fitted = data.frame("(fitted)" = fitted,
"(response)" = data[[response]],
"(weights)" = weights,
check.names = FALSE),
terms = terms(formula, data = data)
)
as.constparty(tree)
}
ranger2party <- function(data, ranger.forest, treeNumber, weights = NULL) {
if (!exists("inbag.counts", where=ranger.forest)) {
stop("Run ranger with the keep.inbag=T parameter")
}
ranger.tree <- treeInfo(ranger.forest, tree = treeNumber)
formula <- formula(ranger.forest$call[[2]])
inbag <- ranger.forest$inbag.counts[[treeNumber]]
generic2party(data, ranger.tree, inbag, formula, weights)
}
randomForest2party <- function(data, randomForest.forest, treeNumber, weights = NULL) {
if (!exists("inbag", where=randomForest.forest)) {
stop("Run randomForest with the keep.inbag=T parameter")
}
randomForest.tree.without.labels <- data.frame(getTree(randomForest.forest, k = treeNumber, labelVar = F))
randomForest.tree.with.labels <- data.frame(getTree(randomForest.forest, k = treeNumber, labelVar = T))
# Convert randomForest format to Ranger format
colnames(randomForest.tree.with.labels) <- c("leftChild", "rightChild", "splitvarName", "splitval", "terminal", "prediction")
colnames(randomForest.tree.without.labels) <- c("leftChild", "rightChild", "splitvarID", "splitval", "terminal", "prediction")
nodeID <- 0:(nrow(randomForest.tree.with.labels) -1)
leftChild <- randomForest.tree.with.labels$leftChild - 1L
rightChild <- randomForest.tree.with.labels$rightChild - 1L
splitvarID <- randomForest.tree.without.labels$splitvarID - 1L
splitvarName <- as.character(randomForest.tree.with.labels$splitvarName)
splitval <- randomForest.tree.with.labels$splitval
terminal <- ifelse(randomForest.tree.without.labels$terminal == -1, TRUE, FALSE)
is.na(rightChild) <- is.na(splitvarID) <- is.na(leftChild) <- is.na(splitval) <- terminal
prediction <- randomForest.tree.with.labels$prediction
generic.tree <- data.frame(nodeID, leftChild, rightChild, splitvarID, splitvarName, splitval, terminal, prediction)
idx.unordered <- apply(array(splitvarName), 1, function(x) { !("ordered" %in% class(data[[x]]) || "numeric" %in% class(data[[x]]))})
idx.unordered[terminal] <- FALSE
if (any(idx.unordered)) {
if (any(generic.tree$splitval[idx.unordered] > (2^31 - 1))) {
warning("Unordered splitting levels can only be shown for up to 31 levels.")
generic.tree$splitval[idx.unordered] <- NA
} else {
generic.tree$splitval[idx.unordered] <- sapply(generic.tree$splitval[idx.unordered], function(x) {
paste(which(as.logical(intToBits(2^31-1-x))), collapse = ",")
})
}
}
formula <- formula(randomForest.forest$call[[2]])
inbag <- randomForest.forest$inbag[, treeNumber]
generic2party(data, generic.tree, inbag, formula, weights)
}
### partykit:::.list.rules.party
listrulesparty <- function (x, i = NULL, ...)
{
if (is.null(i))
i <- nodeids(x, terminal = TRUE)
if (length(i) > 1) {
ret <- sapply(i, listrulesparty, x = x)
names(ret) <- if (is.character(i))
i
else names(x)[i]
return(ret)
}
if (is.character(i) && !is.null(names(x)))
i <- which(names(x) %in% i)
stopifnot(length(i) == 1 & is.numeric(i))
stopifnot(i <= length(x) & i >= 1)
i <- as.integer(i)
dat <- data_party(x, i)
if (!is.null(x$fitted)) {
findx <- which("(fitted)" == names(dat))[1]
fit <- dat[, findx:ncol(dat), drop = FALSE]
dat <- dat[, -(findx:ncol(dat)), drop = FALSE]
if (ncol(dat) == 0)
dat <- x$data
}
else {
fit <- NULL
dat <- x$data
}
rule <- c()
recFun <- function(node) {
if (id_node(node) == i)
return(NULL)
kid <- sapply(kids_node(node), id_node)
whichkid <- max(which(kid <= i))
split <- split_node(node)
ivar <- varid_split(split)
svar <- names(dat)[ivar]
index <- index_split(split)
if (is.factor(dat[, svar])) {
if (is.null(index))
index <- ((1:nlevels(dat[, svar])) > breaks_split(split)) +
1
slevels <- levels(dat[, svar])[index == whichkid]
srule <- paste(svar, " %in% c(\"", paste(slevels,
collapse = "\", \"", sep = ""), "\")", sep = "")
}
else {
if (is.null(index))
index <- 1:length(kid)
breaks <- cbind(c(-Inf, breaks_split(split)), c(breaks_split(split),
Inf))
sbreak <- breaks[index == whichkid, ]
right <- right_split(split)
srule <- c()
if (is.finite(sbreak[1]))
srule <- c(srule, paste(svar, ifelse(right, ">",
">="), sbreak[1]))
if (is.finite(sbreak[2]))
srule <- c(srule, paste(svar, ifelse(right, "<=",
"<"), sbreak[2]))
srule <- paste(srule, collapse = " & ")
}
rule <<- c(rule, srule)
return(recFun(node[[whichkid]]))
}
node <- recFun(node_party(x))
paste(rule, collapse = " & ")
}
#### Subfunctions M1 ######
#get splitvariables and splitpoints
splitv <- function (tree, tol){
paths <- tree$pamtree
# check whether there was a split
if(length(paths) > 0){
pathsw <- paths
leaves <- length(paths)
l <- sapply(1:leaves, function (k) length(paths[[k]]))
spaths <- lapply(1:leaves, function(k) sub(" <=.", ':', paths[[k]]))
spaths <- lapply(1:leaves, function(k) sub(" <.", ':', spaths[[k]]))
spaths <- lapply(1:leaves, function(k) sub(" >=.", ':', spaths[[k]]))
spaths <- lapply(1:leaves, function(k) sub(" >.", ':', spaths[[k]]))
spaths <- lapply(1:leaves, function(k) sub(" %in%.*", '', spaths[[k]]))
splitvarsa <- unique(unlist(spaths))
splitvars <- sub(":.*", '', splitvarsa)
if(!is.null(tol)){
splitvarsa <- sub(".*:", '', splitvarsa)
splitvarsa <- as.numeric(splitvarsa)
} else{
splitvarsa<- NULL
}
nsplitvar1 <- length(splitvars) # number of splits in tree i
}else{
splitvars <- NULL
splitvarsa <- NULL
nsplitvar1 <- 0
}
return(list(splitvars = splitvars, splitpoints = splitvarsa, nsplits = nsplitvar1))
}
#calculate jaccard index
sim<- function (splits1, splits2, X, tol){
### Only predictors
if(is.null(tol)){
common1 <- length(splits1$splitvars[pmatch(splits1$splitvars, splits2$splitvars, nomatch = 0)]) #pmatch: no doubles
total1 <- splits1$nsplits + splits2$nsplits - common1
Jaccard <- common1 / total1
}
### Also splitpoints
if(!is.null(tol)){
# Create matrix That shows for each variable of the splits1 whether it is equal to each variable in splits2
same <- matrix(c(0), length(splits1[[1]]), length(splits2[[1]]))
if(length(same > 0)){
for (i in 1:length(splits1[[1]])){
for(j in 1:length(splits2[[1]])){
same[i, j] <- splits1[[1]][[i]] == splits2[[1]][[j]]
}
}
# For those variables that are the same in both trees, put splitpoints of tree 2 in the matrix
splitpoints <- matrix(c(rep(splits2$splitpoints, nrow(same))), nrow(same), ncol(same), byrow=T)
s <- c(splitpoints)
sm <- c(same)
s[sm == 0] <- NA
splitpoints <- matrix(s, nrow(same), ncol(same), byrow=F)
# Get the right tolerance for each variables in splits1
tsa <- splits1$splitvars
t <- match(tsa,X)
correct<- matrix(c(0), nrow(same), ncol(same))
# Look whether splitpoint of splits2 is within tolerance zone of splits1
for (i in 1:length(splits1$splitvars)){
d <- as.numeric(tol[t[i]])
correct[i, ] <- findInterval(splitpoints[i, ], c(as.numeric(splits1$splitpoints[i]) - d, as.numeric(splits1[[2]][i]) + d) ) == 1
}
correct[is.na(correct)] <- 0
g <- graph_from_incidence_matrix(correct)
common<- max_bipartite_match(g)$matching_size
total<- splits1[[3]] + splits2[[3]] - common
Jaccard<- common / total
} else{
if(length(splits1[[1]]) == 0 & length(splits2[[1]]) == 0){
Jaccard <- 1
} else{Jaccard <- 0}
}
}
return(Jaccard)
}
# function returns all the subpaths, takes away splitpoints and removes direction of last split of each path.
# then returns only unique subpaths
## subfunction M2
returnsubpaths <- function(tree){
paths <- tree$pamtree
if(length(paths) > 0){
leaves<- length(paths)
l<- sapply(1:leaves, function (k) length(paths[[k]]))
lastpaths <- lapply(1:leaves, function(k) sub("<.*", '', paths[[k]][l[k]])) # remove direction and splitpoint last attribut of path
lastpaths <- lapply(1:leaves, function(k) sub(">.*", '', lastpaths[[k]]))
# place it back in paths
for(j in 1:leaves){
paths[[j]][l[j]]<- lastpaths[j]
}
paths <- lapply(1:leaves, function(k) sub("<.*", '-', paths[[k]])) #replace splitpoints with - or +
paths <- lapply(1:leaves, function(k) sub(">.*", '+', paths[[k]]))
upaths <- unique(paths)
subpaths <- list(0)
for(j in 1:length(upaths)){
d<- list(0)
d[[1]] <- upaths[[j]]
if(length(upaths[[j]]) > 1){ #check whether more than one split (otherwise no subpaths and d has just one element)
for (i in 2:length(upaths[[j]])){
d[[i]] <- d[[i - 1]][- length(d[[i - 1]])] #Split each path up into all subpaths
}
}
subpaths[[j]] <- d
}
subpaths <- unlist(subpaths, recursive=FALSE)
lastsubpaths <- lapply(1:length(subpaths), function(k) gsub("[[:punct:]]", '', subpaths[[k]][length(subpaths[[k]])])) # remove punctuation from last attribute of each subpath
for(j in 1:length(subpaths)){
subpaths[[j]][length(subpaths[[j]])] <- lastsubpaths[j]
}
subpaths<- unique(subpaths)
}else{subpaths<- NULL}
return(subpaths)
}
#calculates number of distinct subpaths in two sets of subpaths
dissim<- function (subs1,subs2,weights){
l <- sapply(1:length(subs1), function (k) length(subs1[[k]])) # length of each subpath of each path
d <- c(0)
l2 <- sapply(1:length(subs2), function (k) length(subs2[[k]])) #length of each subpath of each path
for (k in 1:(max(max(l), max(l2)))){ #iterate until longest subpath of the two trees
a <- as.numeric(subs1[l == k] %in% subs2[l2 == k]) # number of subpaths of tree i in j
b <- as.numeric(subs2[l2 == k] %in% subs1[l == k]) # number of subpaths of tree j in i
if(weights==0){d[k] <- (length(a) + length(b)) - (sum(a) + sum(b))} # if weighs 0 every dissimilar subpath weighted equally
if(weights==1){d[k] <- 1 / k * ((length(a) + length(b)) - (sum(a) + sum(b)))} # if weights 1 every dissimilar subpath weighted by 1/length subpath
}
if(weights == 0){dis <- sum(d) / (length(l[l > 0]) + length(l2[l2 > 0]))} # divide #dissimilar subpaths by maximum dissimilar subpaths
wl <- sum(1 / l)
wl2 <- sum(1 / l2)
wl[is.infinite(wl)] <- 0
wl2[is.infinite(wl2)] <- 0
if(weights == 1){dis <- sum(d) / (wl+wl2)} #divide weighted #dissimilar subpaths by maximum weighted # dissimilar subpaths
dis[is.na(dis)] <- 0
return(dis)
}
### Subfunctions M4
part <- function (data, tree1){
t1 <- matrix(c(0), nrow(data), nrow(data))
for(i in 1:nrow(data) - 1){
t1[i,] <- tree1[i] == tree1 # for each observation with each other observation: Same leaf?
}
return(t1)
}
metr<- function (t1, t2, data){
ind <- upper.tri(t1)
part<- sum(t1[ind] == t2[ind]) / choose(nrow(data), 2)
return(part)
}
#### Subfunctions M6
# function to get back the splitvariables and slit points for each path
splitvsets <- function (paths){
splitvars<- list(length(paths))
splitvarsa<- list(length(paths))
nsplitvar1<- list(length(paths))
#paths <- trees[[1]]
if(length(paths) > 0){
for(i in 1:length(paths)){
pathsw <- paths[[i]]
# splitvarsa1 <- unique(unlist(spaths))
splitvarsa1<- unique(unlist(pathsw))
splitvars1<- sub("\\-.*$", '', splitvarsa1)
splitvars[[i]]<- sub("\\d.*$", '', splitvars1)
splitvarsa1 <- sub(".*=", '', pathsw)
splitvarsa1 <- sub(".*>", '', splitvarsa1)
splitvarsa1 <- sub(".*<", '', splitvarsa1)
splitvarsa[[i]] <- c(as.numeric(splitvarsa1))
}
}else{
splitvars <- NULL
splitvarsa <- NULL
#nsplitvar1 <- 0
}
return(list(splitvars = splitvars, splitvarsa = splitvarsa))
}
simsets<- function (paths1, paths2, paths01, paths02, X, tol,best1,best2){
if(length(paths1[[1]]) > 0 & length(paths2[[1]]) > 0) {
if(is.null(tol)){
if(identical(paths1$splitvars, paths2$splitvars) & identical(paths01$splitvars, paths02$splitvars)){
sim <- 1
} else {
J1 <- JaccardPaths(paths1, paths2, tol,X)
J0 <- JaccardPaths(paths01, paths02,tol,X)
sim <- (J1+J0)/2
}
} else {
J1 <- JaccardPaths(paths1, paths2, tol,X)
J0 <- JaccardPaths(paths01, paths02,tol,X)
sim <- (J1+J0)/2
}
} else{
if(length(paths1[[1]]) == 0 & length(paths2[[1]]) == 0){
if(best1==best2){
sim <- 1
} else{sim <- 0}
} else{sim <- 0}
}
return(sim)
}
JaccardPaths <- function(paths1, paths2, tol,X){
Jaccard<- matrix(c(0),length(paths1[[1]]),length(paths2[[1]]))
## for each path in tree 1 and for each path in tree 2
for (i in 1:length(paths1[[1]])){
for (j in 1:length(paths2[[1]])){
same <- matrix(c(0), length(paths1[[1]][[i]]), length(paths2[[1]][[j]]))
# for the path check whether each variable has a match in the other path
if(length(same > 0)){
for (k in 1:length(paths1[[1]][[i]])){
for(m in 1:length(paths2[[1]][[j]])){
same[k, m] <- paths1[[1]][[i]][[k]] == paths2[[1]][[j]][[m]]
}
}
### In case splitpoints are taken into account
if (!is.null(tol)){
# For those variables that are the same in both trees, put splitpoints in the matrix
splitpoints <- matrix(c(rep(paths2[[2]][[j]], nrow(same))), nrow(same), ncol(same), byrow=T)
s <- c(splitpoints)
sm <- c(same)
s[sm == 0] <- NA
splitpoints <- matrix(s, nrow(same), ncol(same), byrow=F)
tsa<- paths1[[1]][[i]]
tsa<- sub(" <=.", '', tsa)
tsa<- sub(" <.", '', tsa)
tsa<- sub(" >=.", '', tsa)
tsa<- sub(" >.", '', tsa)
t <- match(tsa,X)
correct<- matrix(c(0), nrow(same), ncol(same))
# Look whether splitpoint of splits2 is within tolerance zone of splits1
for (h in 1:length(paths1[[1]][[i]])){
d <- as.numeric(tol[t[h]])
correct[h, ] <- findInterval(splitpoints[h, ], c(as.numeric(paths1[[2]][[i]][h]) - d, as.numeric(paths1[[2]][[i]][h]) + d) ) == 1
}
}else{
correct<- same
}
correct[is.na(correct)] <- 0
g <- graph_from_incidence_matrix(correct)
common<- max_bipartite_match(g)$matching_size
total<- length(paths1[[1]][[i]]) + length(paths2[[2]][[j]]) - common
Jaccard[i,j]<- common / total
}else{
if(length(paths1[[1]][[i]]) == 0 & length(paths2[[1]][[j]]) == 0){
Jaccard[i,j] <- 1
} else{Jaccard[i,j] <- 0}
}
}
}
g <- graph_from_incidence_matrix(Jaccard, weighted=TRUE)
common<- max_bipartite_match(g)$matching_weight
sim<- common/min(length(paths1[[1]]), length(paths2[[2]]))
return(sim)
}
##### sUBFUNCTIONS M5
##############################################################
dis <- function (tree1,tree2, predresp1, predresp2){
if( ! is.null(tree1) & ! is.null(tree2)){
common <- matrix(c(0), length(tree1), length(tree2))
# how many matches in each rule
for(i in 1:length(tree1)){
for(j in 1: length(tree2)){
common[i, j] <- length(unlist(tree1[i])[pmatch(unlist(tree1[i]), unlist(tree2[j]), nomatch = 0)])
}
}
rows <- apply(common, 2, max)
cols <- apply(common, 1, max)
#common rules are those that have all their elementary conjuncts in common
#calculate jaccard
common <- min(sum(rows == length(tree1[[1]])), sum(cols == length(tree1[[1]])))
total <- length(tree1) + length(tree2) - common
s <- common / total
}else if(is.null(tree1) & ! is.null(tree2) | ! is.null(tree1) & is.null(tree2)){
s <- 0
}else{
#check what prediction was made by the trivial rule
if(predresp1[1] == predresp2[1]){
s <- 1
} else{s<-0}
}
return(s)
}
disjnorm <- function (pamtree, tree,observeddata, treedata,X, Y, sameobs){
path <- pamtree$path1 #Paths that lead to leaf with predicted class 1
if(sameobs==TRUE){
predresp<- pamtree$predresptrain
}else{
predresp<- pamtree$predresp
}
if(length(path) > 0){
leaves <- length(path) # number of leaves
l <- sapply(1:leaves, function (k) length(path[[k]])) # for each path number of elements
levels <- matrix(c(0), length(X), 2)
for (i in 1:length(X)){
levels[i, ] <- levels(observeddata[, X[i]])
}
levels <- cbind(X, levels)
npaths <-path
spaths <- lapply(1:leaves, function(k) sapply(1:l[k], function (j) sub(" %in%.*", '', path[[k]][j])))
##
for(i in 1:length(X)){
npaths <- lapply(1:leaves, function(k) sapply(1:l[k], function (j) sub(paste(levels[i,2]), '-', npaths[[k]][j])))
}
for(i in 1:length(X)){
npaths <- lapply(1:leaves, function(k) sapply(1:l[k], function (j) sub(paste(levels[i,3]), '+', npaths[[k]][j])))
}
for(i in 1:length(X)){
signs <- lapply(1:leaves, function (i) gsub("[^-+]+", "", npaths[[i]]))
}
paths <- lapply(1:leaves, function (i) paste(spaths[[i]], signs[[i]]))
b <- sapply (1:length(spaths), function (s) X[pmatch(X, spaths[[s]], nomatch = 0) == 0])
if(length(b) > 0){
if(is.list(b)){
for(i in 1:length(b)){
now <- b[[i]]
nowe <- now[now != ""]
nowe <- sort(nowe)
# Add those variables in plus and minus form to those paths
if(length(nowe > 0)){
a <- matrix(c(0), 2, length(nowe))
a[1, ] <- c(paste(nowe, '+', sep=' '))
a[2, ] <- c(paste(nowe, '-', sep=' '))
grid <- do.call(expand.grid, split(a, col(a)))
for (k in 1:nrow(grid)){
paths <- c(paths, list(unlist(c(paths[i], paste(unlist(grid[k,c(1:ncol(grid))]))))))
}
} else {paths <- paths}
}
}else{
now <- b
nowe <- now[now != ""]
nowe <- sort(nowe)
# Add those variables in plus and minus form to those paths
if(length(nowe > 0)){
a <- matrix(c(0), 2, length(nowe))
a[1, ] <- c(paste(nowe, '+', sep=' '))
a[2, ] <- c(paste(nowe, '-', sep=' '))
grid <- do.call(expand.grid, split(a, col(a)))
paths <- lapply (1:nrow(grid), function (k) c(paths[[1]], paste(unlist(grid[k,c(1:ncol(grid))]))))
} else {
paths <- paths
}
}
} else{paths <- paths}
l <- sapply(1:length(paths), function (k) length(paths[[k]]))
paths <- unique(paths)
paths <- paths[lengths(paths) == max(l)]
}else{ ### if tree only root
if(predresp[1] == levels(observeddata[,Y])[1]){
paths <- NULL
}else{
a <- matrix(c(0), 2, length(X))
a[1, ] <- c(paste(X, '+', sep=' '))
a[2, ] <- c(paste(X, '-', sep=' '))
grid <- do.call(expand.grid, split(a, col(a)))
paths <- lapply (1:nrow(grid), function (k) paste(unlist(grid[k, c(1:ncol(grid))])))
}
}
return(paths)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.