Nothing
#'Recursive partitioning method for the prediction of preference rankings based upon Kemeny distances
#'
#'Recursive partitioning method for the prediction of preference rankings based upon Kemeny distances.
#'
#' @param Y A n by m data matrix, in which there are n judges and m objects to be judged. Each row is a ranking of the objects which are represented by the columns.
#' @param X A dataframe containing the predictor, that must have n rows.
#' @param prunplot prunplot=TRUE returns the plot of the pruning sequence. Default value: FALSE
#' @param control a list of options that control details of the \code{ranktree} algorithm governed by the function
#' \code{ranktreecontrol}. The options govern the minimum size within node to split (the default value is 0.1*n, where n is the total sample size), the bound on the decrease in impurity,
#' (default, 0.01), the algorithm chosen to compute the median ranking (default, "quick"), and other options related
#' to the consrank algorithm, which is called by \code{ranktree}
#' @param \dots arguments passed bypassing \code{ranktreecontrol}
#'
#' @details The user can use any algorithm implemented in the \code{consrank} function from the \pkg{ConsRank} package. All algorithms allow the user to set the option 'full=TRUE'
#' if the median ranking(s) must be searched in the restricted space of permutations instead of in the unconstrained universe of rankings of n items including all possible ties.
#' The output consists in a object of the class "ranktree". It contains:
#' \tabular{llll}{
#' X\tab \tab \tab the predictors: it must be a dataframe\cr
#' Y\tab \tab \tab the response variable: the matrix of the rankings\cr
#' node\tab \tab \tab a list containing teh tree-based structure:\cr
#' \tab number \tab \tab node number \cr
#' \tab terminal \tab \tab logical: TRUE is terminal node\cr
#' \tab father \tab \tab father node number of the current node\cr
#' \tab idfather \tab \tab id of the father node of the current node\cr
#' \tab size \tab \tab sample size within node\cr
#' \tab impur \tab \tab impurity at node\cr
#' \tab wimpur \tab \tab weighted impurity at node\cr
#' \tab idatnode \tab \tab id of the observations within node\cr
#' \tab class \tab \tab median ranking within node in terms of orderings\cr
#' \tab nclass \tab \tab median ranking within node in terms of rankings\cr
#' \tab mclass \tab \tab eventual multiple median rankings\cr
#' \tab tau \tab \tab Tau_x rank correlation coefficient at node\cr
#' \tab wtau \tab \tab weighted Tau_x rank correlation coefficient at node\cr
#' \tab error \tab \tab error at node\cr
#' \tab werror \tab \tab weighted error at node\cr
#' \tab varsplit \tab \tab variables generating split\cr
#' \tab varsplitid \tab \tab id of variables generating split\cr
#' \tab cutspli \tab \tab splitting point\cr
#' \tab children \tab \tab children nodes generated by current node\cr
#' \tab idchildren \tab \tab id of children nodes generated by current node\cr
#' \tab \dots \tab \tab other info about node\cr
#' control \tab \tab \tab parameters used to build the tree\cr
#' numnodes \tab \tab \tab number of nodes of the tree\cr
#' tsynt \tab \tab \tab list containing the synthesis of the tree:\cr
#' \tab children \tab \tab list containing all information about leaves\cr
#' \tab parents \tab \tab list containing all information about parent nodes\cr
#' geneaoly \tab \tab \tab data frame containing information about all nodes\cr
#' idgenealogy \tab \tab \tab data frame containing information about all nodes in terms of nodes id\cr
#' idparents \tab \tab \tab id of the parents of all the nodes\cr
#' goodness \tab \tab \tab goodness -and badness- of fit measures of the tree: Tau_X, error, impurity\cr
#' nomin \tab \tab \tab information about nature of the predictors\cr
#' alpha \tab \tab \tab alpha parameter for pruning sequence\cr
#' pruneinfo \tab \tab \tab list containing information about the pruning sequence:\cr
#' \tab prunelist \tab \tab information about the pruning\cr
#' \tab tau \tab \tab tau_X rank correlation coefficient of each subtree\cr
#' \tab error \tab \tab error of each subtree\cr
#' \tab termnodes \tab \tab number of terminal nodes of each subtree\cr
#' subtrees \tab \tab \tab list of each subtree created with the cost-complexity pruning procedure
#'}
#'
#'
#' @return An object of the class ranktree. See details for detailed information.
#'
#'
#'
#' @examples
#' data("Univranks")
#' tree <- ranktree(Univranks$rankings,Univranks$predictors,num=50)
#'
#' \donttest{
#' data(Irish)
#' #build the tree with default options
#' tree <- ranktree(Irish$rankings,Irish$predictors)
#'
#' #plot the tree
#' plot(tree,dispclass=TRUE)
#'
#' #visualize information
#' summary(tree)
#'
#' #get information about the paths leading to terminal nodes (all the paths)
#' infopaths <- treepaths(tree)
#'
#' #the terminal nodes
#' infopaths$leaves
#'
#' #sample size within each terminal node
#' infopaths$size
#'
#' #visualize the path of the second leave (terminal node number 8)
#' infopaths$paths[[2]]
#'
#' #alternatively
#' nodepath(termnode=8,tree)
#'
#'
#' set.seed(132) #for reproducibility
#' #validation of the tree via v-fold cross-validation (default value of V=5)
#' vtree <- validatetree(tree,method="cv")
#'
#' #extract the "best" tree
#' dtree <- getsubtree(tree,vtree$best_tau)
#'
#' summary(dtree)
#'
#' #plot the validated tree
#' plot(dtree,dispclass=TRUE)
#'
#' #predicted rankings
#' rankfit <- predict(dtree,newx=Irish$predictors)
#'
#' #fit of rankings
#' rankfit$rankings
#'
#' #fit in terms of orderings
#' rankfit$orderings
#'
#' #all info about the fit (id og the leaf, predictor values, and fit)
#' rankfit$orderings
#' }
#'
#'
#' @author Antonio D'Ambrosio \email{antdambr@unina.it}
#'
#' @references D'Ambrosio, A., and Heiser W.J. (2016). A recursive partitioning method for the prediction of preference rankings based upon Kemeny distances. Psychometrika, vol. 81 (3), pp.774-94.
#'
#' @seealso \code{ranktreecontrol}, \code{plot.ranktree}, \code{summary.ranktree}, \code{getsubtree}, \code{validatetree}, \code{treepaths}, \code{nodepath}
#'
#'
#'
#' @keywords Recursive partitioning
#' @keywords Tree-based method
#' @keywords Preference rankings
#'
#' @import ConsRank
#' @importFrom utils combn
#' @importFrom methods is
#'
#'
#' @export
ranktree <- function(Y,X,prunplot=FALSE,
control=ranktreecontrol(...), ...){
if (!is.matrix(Y)) {stop("response variable must be a matrix")}
N = nrow(Y)
C=ncol(Y)
if(is(control$num,"NULL")){control$num <- round(0.1*N)}
namespreds=colnames(X)
#nomin=ifelse(sapply(X,is.numeric),0,1) #0, numerical predictor, 1 nominal predictor
nomin=ifelse(sapply(X,function(i) is(i,"numeric")|is(i,"ordered")) ,0,1)
#aggiunto il 28/9/2021
# for (j in 1:C){
#
# if (is(predictors[,j],"numeric")|is(predictors[,j],"ordered")){
# nomin[j] <- 0
# } else {
# nomin[j] <- 1
# }
#
# }
# #fine aggiunta
#
xID=seq(1,N)
L <- 1 #numbering of the list
tree=list()
tree$X <- X
tree$Y <- Y
#tree$algorithm <- cralgorithm
tree$node$number[[L]] <- 1 #node number according spad
tree$node$checkdecr[[L]] <- TRUE
tree$node$terminal[[L]] <- TRUE
tree$node$father[[L]] <- NULL #number of the father of the node
tree$node$idfather[[L]] <- NA #id of the father node
tree$node$size[[L]] <- N #size within node
tree$node$impur[[L]] <- tree$node$wimpur[[L]] <- rankimpurity(Y)
tree$node$idatnode[[L]] <- xID #id of units in the node
tree$control <- control
#num=tree$control$num
#decrmin=tree$control$decrmin
CR=consrank(Y, algorithm=tree$control$algorithm, ps=tree$control$ps, full=tree$control$full,
itermax=tree$control$itermax, np=tree$control$np, gl=tree$control$gl,
ff=tree$control$ff, cr=tree$control$cr, proc=tree$control$proc)
if( is(CR,"NULL") ){
#if the combined input matrix contains either only zeros or only positive values,
#the "all-tie" solution is returned
CR <- list()
mr <- matrix(1,nrow=1,ncol=C)
colnames(mr) <- colnames(Y)
CR$Consensus <- mr
CR$Tau <- mean(tau_x(mr,Y))
CR$Eltime <- 0
}
tree$node$class[[L]] <- rank2order(CR$Consensus[1,],items=colnames(Y)) #class assignment rule
tree$node$nclass[[L]] <- CR$Consensus[1,] #numeric class
tree$node$mclass[[L]]<- CR$Consensus #class, containing eventually more than one median ranking
tree$node$tau[[L]] <- tree$node$wtau[[L]] <- CR$Tau[1] #goodness of fit
tree$node$werror[[L]] <- tree$node$error[[L]] <- sum(kemenyd(Y,CR$Consensus))/(N*C*(C-1)) #error within node
tree$numnodes <- 1 #number of nodes of the tree
#number <- 1 #node number
gd <- getdata(tree)
ind <- which(gd$term==TRUE & gd$imp==TRUE & gd$nnode >tree$control$num)
while(length(ind)>0){
for (i in 1:length(ind)) {
# print(length(tree$node$idatnode[[L[ind[i]]]]))
# flush.console()
#prova aggiunta il 29/4/2021
# if( length(tree$node$idatnode[[L[ind[i]]]]) <= tree$control$num ){ #if aggiunta
#
# tree$node$terminal[[L[ind[i]]]] <- TRUE
# tree$node$checkdecr[[L[ind[i]]]] <- FALSE
# tree$node$varsplit[[L[ind[i]]]] <- NA
# tree$node$varsplitid[[L[ind[i]]]] <- NA
# tree$node$cutspli[[L[ind[i]]]] <- NA
# tree$node$children[[L[ind[i]]]] <- NA
# tree$node$idchildren[[L[ind[i]]]] <- NA
#
# } else { #else aggiunta
RS <- ranksplit(Y,X,N,nomin,namespreds,tree$node$idatnode[[L[ind[i]]]])
if (RS$DecImp > tree$control$decrmin){
tree$node$terminal[[L[ind[i]]]] <- FALSE
tree <- addnode(tree,RS,L[ind[i]],C,N)
tree$node$checkdecr[[L[ind[i]]]] <- TRUE
#L <- seq(1:tree$numnodes)
} else {
tree$node$terminal[[L[ind[i]]]] <- TRUE
tree$node$checkdecr[[L[ind[i]]]] <- FALSE
tree$node$varsplit[[L[ind[i]]]] <- NA
tree$node$varsplitid[[L[ind[i]]]] <- NA
tree$node$cutspli[[L[ind[i]]]] <- NA
tree$node$children[[L[ind[i]]]] <- NA
tree$node$idchildren[[L[ind[i]]]] <- NA
}
# } #end else aggiunta
#L <- seq(1:tree$numnodes)
} #end for
L <- seq(1:tree$numnodes)
gd <- getdata(tree)
ind <- which(gd$term==TRUE & gd$imp==TRUE & gd$nnode >tree$control$num)
}#end while
tree$tsynth <- synthtree(tree)
tree <- layouttree(tree)
tree$goodness$error <- sum(unlist(tree$node$werror[tree$tsynth$children$measures$idt]))
tree$goodness$tau <- sum(unlist(tree$node$wtau[tree$tsynth$children$measures$idt]))
tree$goodness$impurity <- sum(unlist(tree$node$wimpur[tree$tsynth$children$measures$idt]))
tree$nomin <- nomin
#get pruning information
tree <- prune(tree)
if (isTRUE(prunplot)){
plot.ranktree(tree,plot.type="pruningseq")
}
# if(isTRUE(control$printinfo)){
#
# isleaf <- unlist(tree$node$terminal)
# obj <- "\nRecursive partitioning method for preference rankings: \n"
# cat(obj)
# cat("Number of terminal nodes: ",(tree$numnodes+1)/2,"\n")
# cat("Averaged tau_X rank correlation coefficient: ", tree$goodness$tau, "\n")
# cat("Error (normalized Kemeny distance): ", tree$goodness$error, "\n")
# cat("Variables generating splits: ", paste(unique(unlist(tree$node$varsplit[!isleaf])),collapse=', '), "\n")
#
# }
class(tree) <- c("ranktree","ConsRankClass")
return(tree)
}#end function
#-----------------------
addnode <- function(Tree,RSpl,snod,C,N){
numl <- Tree$numnodes +1 #number of list left node
numr <- Tree$numnodes +2 #number of list right node
numberleft <- 2*Tree$node$number[[snod]] #node number left
numberright <- (2*Tree$node$number[[snod]])+1 #node number right
Tree$numnodes <- numr
Tree$node$varsplit[[snod]] <- RSpl$SplitVar #variable that generates split
Tree$node$varsplitid[[snod]] <- RSpl$idSplitVar
Tree$node$cutspli[[snod]] <- RSpl$custSplit #cutting point
Tree$node$children[[snod]] <- c(numberleft,numberright)
Tree$node$idchildren[[snod]] <- c(numl,numr) #id of children nodes
#manage new nodes
#add the number at nodes
Tree$node$number[[numl]] <- numberleft
Tree$node$number[[numr]] <- numberright
#declare nodes as terminal for a moment
Tree$node$terminal[[numl]] <- TRUE
Tree$node$terminal[[numr]] <- TRUE
#no children at the moment
Tree$node$idchildren[[numl]] <- NA
Tree$node$idchildren[[numr]] <- NA
#add info about father
if (snod==1){
Tree$node$father[[numl]] <- Tree$node$father[[numr]] <- 1
Tree$node$idfather[[numl]] <- Tree$node$idfather[[numr]] <- 1
} else {
Tree$node$father[[numl]] <- Tree$node$father[[numr]] <- Tree$node$number[[snod]]
Tree$node$idfather[[numl]] <- Tree$node$idfather[[numr]] <- snod
}
#sample size within nodes
Tree$node$size[[numl]] <- length(RSpl$indexing$left)
Tree$node$size[[numr]] <- length(RSpl$indexing$right)
#impurity and weighted inpurity within nodes
Tree$node$impur[[numl]] <- RSpl$NodeImpurity$left
Tree$node$impur[[numr]] <- RSpl$NodeImpurity$right
Tree$node$wimpur[[numl]] <- RSpl$NodeImpurity$left*( length(RSpl$indexing$left)/N )
Tree$node$wimpur[[numr]] <- RSpl$NodeImpurity$right*( length(RSpl$indexing$right)/N )
#index of units within nodes
Tree$node$idatnode[[numl]] <- RSpl$indexing$left
Tree$node$idatnode[[numr]] <- RSpl$indexing$right
#check for split
Tree$node$checkdecr[[numl]] <- TRUE
Tree$node$checkdecr[[numr]] <- TRUE
#consensus ranking within node
CRL <- consrank(Tree$Y[RSpl$indexing$left,], algorithm=Tree$control$algorithm, ps=Tree$control$ps, full=Tree$control$full,
itermax=Tree$control$itermax, np=Tree$control$np, gl=Tree$control$gl,
ff=Tree$control$ff, cr=Tree$control$cr, proc=Tree$control$proc)
if( is(CRL,"NULL") ){
#if the combined input matrix contains either only zeros or only positive values,
#the "all-tie" solution is returned
CRL <- list()
mr <- matrix(1,nrow=1,ncol=ncol(Tree$Y))
colnames(mr) <- colnames(Tree$Y)
CRL$Consensus <- mr
CRL$Tau <- mean(tau_x(mr,Tree$Y[RSpl$indexing$left,]))
CRL$Eltime <- 0
}
CRR <- consrank(Tree$Y[RSpl$indexing$right,], algorithm=Tree$control$algorithm, ps=Tree$control$ps, full=Tree$control$full,
itermax=Tree$control$itermax, np=Tree$control$np, gl=Tree$control$gl,
ff=Tree$control$ff, cr=Tree$control$cr, proc=Tree$control$proc)
if( is(CRR,"NULL") ){
#if the combined input matrix contains either only zeros or only positive values,
#the "all-tie" solution is returned
CRR <- list()
mr <- matrix(1,nrow=1,ncol=ncol(Tree$Y))
colnames(mr) <- colnames(Tree$Y)
CRR$Consensus <- mr
CRR$Tau <- mean(tau_x(mr,Tree$Y[RSpl$indexing$right,]))
CRR$Eltime <- 0
}
Tree$node$class[[numl]] <- rank2order(CRL$Consensus[1,],items=colnames(Tree$Y))
Tree$node$class[[numr]] <- rank2order(CRR$Consensus[1,],items=colnames(Tree$Y))
Tree$node$nclass[[numl]] <- CRL$Consensus[1,]
Tree$node$nclass[[numr]] <- CRR$Consensus[1,]
Tree$node$mclass[[numl]] <- CRL$Consensus
Tree$node$mclass[[numr]] <- CRR$Consensus
#tau and weighted tau within node
Tree$node$tau[[numl]] <- CRL$Tau[1]
Tree$node$tau[[numr]] <- CRR$Tau[1]
Tree$node$wtau[[numl]] <- CRL$Tau[1] * (length(RSpl$indexing$left)/N)
Tree$node$wtau[[numr]] <- CRR$Tau[1] * (length(RSpl$indexing$right)/N)
#error and weigthed error within node
Tree$node$error[[numl]] <- sum(kemenyd(Tree$Y[RSpl$indexing$left,],CRL$Consensus[1,]))/( (length(RSpl$indexing$left)) *C*(C-1))
Tree$node$error[[numr]] <- sum(kemenyd(Tree$Y[RSpl$indexing$right,],CRR$Consensus[1,]))/( (length(RSpl$indexing$right))*C*(C-1))
Tree$node$werror[[numl]] <- Tree$node$error[[numl]] * (length(RSpl$indexing$left)/N)
Tree$node$werror[[numr]] <- Tree$node$error[[numr]] * (length(RSpl$indexing$right)/N)
#for a moment declare as NA the splitting variables potentially generated by subsequent splits
Tree$node$varsplit[[numl]] <- Tree$node$varsplit[[numr]] <- NA
Tree$node$varsplitid[[numr]] <- Tree$node$varsplitid[[numr]] <- NA
return(Tree)
} # end code
#--------------------------------------
rankimpurity <- function(Y) {
SS=sum(kemenyd(Y))/nrow(Y) #Relaxing suggestion
return(SS)
} #end function
#--------------------------------------
ranksplit = function(Y,X,N,nomin,namespreds,xID)
{
# r = nrow(X[xID,])
if (is.null(nrow(X[xID,]))){r=length(X[xID,])}
npred=ncol(X[xID,])
if (is.null(ncol(X[xID,]))){npred=1}
ImpFather = rankimpurity(Y[xID,])/N
tablesplit=matrix(0,4,npred)
colnames(tablesplit)=namespreds
#table that informs about
# variable1 | variable 2 |
# __________________________
# max Delta | | |
# cut point | | |
# size left | | |
# size right | | |
listLEFTid=vector(mode="list",length=npred) #list that recall indexing to the left and to the right
listRIGHTid=vector(mode="list",length=npred)
Splitnom=vector(mode="list",length=npred)
for (i in 1:npred) { #start for
if (npred==1){x=X[xID]} else {x=X[xID,i]}
ifelse(nomin[i]==0,FALSE,TRUE)
if (nomin[i]==FALSE)
{ # if ordinal
h=sort(unique(x))
#Delta=matrix(0,1,(length(h)-1))
if (length(h)==1){ #modifica per impedire problemi
tablesplit[1,i]=0
tablesplit[2,i]=NA
tablesplit[3,i]=NA
tablesplit[4,i]=NA
Splitnom[[i]]=NA
listLEFTid[[i]]=NA
listRIGHTid[[i]]=NA
} else {
iDL=lapply(h[-length(h)],FUN=function(h) which(x<=h))
iDR=lapply(h[-length(h)],FUN=function(h) which(x>h))
# print(length(h))
# flush.console()
primp=sapply(1:length(iDL), function(row)
ImpFather-( ( ifelse(length(iDL[[row]])==1,0,rankimpurity(Y[xID[iDL[[row]]],])/N) ) +
(ifelse(length(iDR[[row]])==1,0,rankimpurity(Y[xID[iDR[[row]]],])/N) ) )
)
md=which.max(unlist(primp))
leftid=which(x<=h[md])
rightid=which(x>h[md])
#tablesplit[1,i]=Delta[md]
tablesplit[1,i]=primp[[md]]
tablesplit[2,i]=h[md]
tablesplit[3,i]=length(leftid)
tablesplit[4,i]=length(rightid)
listLEFTid[[i]]=xID[leftid]
listRIGHTid[[i]]=xID[rightid]
}#end modifica
} #
else{
Nspl=nominalsplit(x,Y[xID,],ImpFather,N)
if (Nspl$Decr==0){
tablesplit[1,i]=0
tablesplit[2,i]=NA
tablesplit[3,i]=NA
tablesplit[4,i]=NA
Splitnom[[i]]=NA
listLEFTid[[i]]=NA
listRIGHTid[[i]]=NA
} else {
tablesplit[1,i]=Nspl$Decr
tablesplit[2,i]=NA
tablesplit[3,i]=length(Nspl$indL)
tablesplit[4,i]=length(Nspl$indR)
Splitnom[[i]]=Nspl$Vsplit
listLEFTid[[i]]=xID[Nspl$indL]
listRIGHTid[[i]]=xID[Nspl$indR]
}
} #end if nominal
}# end for
bestsplit=which.max(tablesplit[1,])
DecImp=tablesplit[1,bestsplit]
indexing=list(left=NA,right=NA)
indexing$left=listLEFTid[[bestsplit]]
indexing$right=listRIGHTid[[bestsplit]]
if(is.na(tablesplit[2,bestsplit])){
cutSplit=Splitnom[[bestsplit]]
} else {
cutSplit=tablesplit[2,bestsplit]
}
Nodeimpurity=list(left=NA,right=NA)
Nodeimpurity$left <- ifelse(length(indexing$left)==1,0,rankimpurity(Y[indexing$left,]))
Nodeimpurity$right <- ifelse(length(indexing$right)==1,0,rankimpurity(Y[indexing$right,]))
return( list(DecImp=DecImp, indexing=indexing,
custSplit=cutSplit, NodeImpurity=Nodeimpurity, SplitVar=namespreds[bestsplit], idSplitVar=bestsplit ) )
} #end function
#----------------
splittingvariables = function(X){
#generate all possible binary splits for categorical variables
K = sort(unique(X))
if(length(K)==1){
vsplit=NULL
} else {
if (length(K)>2){
s1=seq((length(K)-1),ceiling(length(K)/2),by=-1)
indici=vector(mode="list",length=(length(K)-1))
for (i in 1:length(s1)){ #for 1
indici[[s1[i]]]$spl=t(combn(K,s1[i]))
if (ncol(indici[[s1[i]]]$spl) == (length(K)/2))
{ #if 1
delrows=(nrow(indici[[s1[i]]]$spl)/2+1):nrow(indici[[s1[i]]]$spl)
indici[[s1[i]]]$spl=indici[[s1[i]]]$spl[-delrows,]
} #end if 1
} #end for 1
Lk=1
kL=1
while (kL < 99998){ #while 1
if (is.null(indici[[kL]]$spl))
{ #if 2
kL=kL+1
Lk=Lk+1 # Lk is the index of the first non-empty struct array "indici"
} else {
kL=99999
}# end if 2
} #end while 1
indvsplit=seq(Lk,(length(K)-1),by=1)
ncolmn=ncol(indici[[length(indici)]]$spl)
vsplit=matrix(0,1,ncolmn)
for (j in 1:length(indvsplit))
{ #for 2
if (ncol(indici[[indvsplit[j]]]$spl) < ncolmn )
{ #if 3
indna=(ncol(indici[[indvsplit[j]]]$spl)+1):ncolmn
indici[[indvsplit[j]]]$spl=cbind(indici[[indvsplit[j]]]$spl,matrix(NA,nrow(indici[[indvsplit[j]]]$spl),length(indna)))
} #end if 3
vsplit=rbind(vsplit,indici[[indvsplit[j]]]$spl)
}#end for 2
vsplit=vsplit[-1,]
} else if (length(K)==2) {
vsplit=K[1]
}
}
return(vsplit)
}#end function
#----------------------------
#' @importFrom methods is
nominalsplit = function(X,Y,ImpFather,N){
x=splittingvariables(X)
if( is(x,"NULL") ){ #if null, exit function
Decr <- 0
GL <- NA
GR <- NA
indleft <- NA
indright <- NA
Vsplit <- NA
} else { #start principal else
if (length(x)==1){ #if 1
indleft <- which(X==x)
indright <- which(X!=x)
GL <- ifelse(length(indleft)==1,0,rankimpurity(Y[indleft,]))
GR <- ifelse(length(indright)==1,0,rankimpurity(Y[indright,]))
Delta = ImpFather - (GL/N + GR/N)
Vsplit = x
Decr=max(Delta)
} else { #end if 1, start else 2
Delta=matrix(0,1,nrow(x))
for (j in 1:nrow(x)){#for 1
if( is(X,"factor") ){
left=which(!is.na(match(X,levels(X)[x[j,]])))
right=which(is.na(match(X,levels(X)[x[j,]])))
} else {
left=which(!is.na(match(X,x[j,])))
right=which(is.na(match(X,x[j,])))
}
GL <- ifelse(length(left)==1,0,rankimpurity(Y[left,]))
GR <- ifelse(length(right)==1,0,rankimpurity(Y[right,]))
Delta[1,j]= ImpFather - (GL/N + GR/N)
}#end for 1
j=which.max(Delta)
Decr=max(Delta)
if( is(X,"factor") ){
indleft=which(!is.na(match(X,levels(X)[x[j,]])))
indright=which(is.na(match(X,levels(X)[x[j,]])))
} else {
indleft=which(!is.na(match(X,x[j,])))
indright=which(is.na(match(X,x[j,])))
}
GL <- ifelse(length(indleft)==1,0,rankimpurity(Y[indleft,]))
GR <- ifelse(length(indright)==1,0,rankimpurity(Y[indright,]))
if (is(X,"factor")){
Vsplit=levels(X)[x[j,]]
} else {
Vsplit=x[j,]
}
if ( sum(is.na(Vsplit)) > 0 ){
Vsplit=Vsplit[-which(is.na(Vsplit))]
}
}#end else 2
} #end principal else
return(list(Decr=Decr, GL=GL, GR=GR, indL=indleft, indR=indright, Vsplit=Vsplit))
} #end function
#--------------------------------------------
getdata <- function(Tree){
term=unlist(Tree$node$terminal)
imp=unlist(Tree$node$checkdecr)
nnode=unlist(Tree$node$size)
return(list(term=term,imp=imp,nnode=nnode))
} #end function
#---------------------------------------
#' @importFrom rlist list.rbind
synthtree <- function(Tree){
N <- Tree$node$size[[1]]
idt <- which(unlist(Tree$node$terminal)==TRUE)
idp <- which(unlist(Tree$node$terminal)==FALSE)
nn <- unlist(Tree$node$number[idt])
if (length(nn)==1){
father <- NA} else
{father <- unlist(Tree$node$father[idt])
}
size <- unlist(Tree$node$size[idt])
impur <- unlist(Tree$node$impur[idt])
error <- unlist(Tree$node$error[idt])
tau <- unlist(Tree$node$tau[idt])
wtau <- unlist(Tree$node$wtau[idt])
class <- list.rbind(Tree$node$class[idt])
classnum <- do.call(rbind,Tree$node$nclass[idt])
prr=c(length(nn),length(father),length(size),length(impur),length(error),length(wtau))
#print(prr)
#flush.console()
children=list()
children$measures <- data.frame(idt,nn,father,size,impur,error,wtau,tau)
children$class <- class
children$classid <- idt
children$nclass <- classnum
if(length(idp)>0){
nn <- unlist(Tree$node$number[idp])
father <- c(0,unlist(Tree$node$father[idp]))
size <- unlist(Tree$node$size[idp])
varp <- unlist(Tree$node$varsplit[idp])
impur <- unlist(Tree$node$impur[idp])
wimpur <- unlist(Tree$node$wimpur[idp])
error <- unlist(Tree$node$error[idp])
tau <- unlist(Tree$node$tau[idp])
wtau <- unlist(Tree$node$wtau[idp])
class <- list.rbind(Tree$node$class[idp])
classnum <- do.call(rbind,Tree$node$nclass[idp])
parents <- list()
parents$measures <- data.frame(idp,nn,father,size,varp,impur,wimpur,error,wtau,tau)
parents$class <- class
parents$classid <- idp
parents$nclass <- classnum
} else { #tree with only the root node
parents <- NA
}
return(list(children=children, parents=parents))
}#end function
#------------------------------------------
prune <- function(Tree){
#optimal pruning sequence of the tree
tsynth <- Tree$tsynth
N <- Tree$numnodes
#get internal nodes
idparent <- tsynth$parents$measures$idp
numparent <- tsynth$parents$measures$nn
#get terminal nodes
idterminal <- tsynth$children$measures$idt
numterminal <- tsynth$children$measures$nn
isleaf <- treatisleaf <- unlist(Tree$node$terminal)
nodecost <- costsum <- unlist(Tree$node$wimpur)
nodeerr <- errsum <- unlist(Tree$node$werror)
nodetau <- tausum <- unlist(Tree$node$wtau)
children <- Tree$idgenealogy[,2:3]
parent <- Tree$idparents
nleaves <- sum(isleaf)
nodecount <- as.numeric(isleaf)
adj <- 1+100*( 2^(-52) )
repeat{
branches <- which(treatisleaf==FALSE)
twig <- cbind(as.numeric(treatisleaf[children[branches,1]]),as.numeric(treatisleaf[children[branches,2]]))
twig <- branches[rowSums(twig)==2]
if (length(twig)==0){break}
kids <- children[twig,]
costsum[twig] <- rowSums(cbind(costsum[kids$leftchild],costsum[kids$rightchild]))
errsum[twig] <- rowSums(cbind(errsum[kids$leftchild],errsum[kids$rightchild]))
tausum[twig] <- rowSums(cbind(tausum[kids$leftchild],tausum[kids$rightchild]))
nodecount[twig] <- rowSums(cbind(nodecount[kids$leftchild],nodecount[kids$rightchild]))
treatisleaf[twig] <- TRUE
}
whenpruned <- rep(0,N)
branches <- which(isleaf==FALSE)
prunestep <- 0
allalpha <- ntermnodes <- rep(0,N)
ntermnodes[1] <- length(idterminal)
while(length(branches)>0){#principal while
#complexity parameter
prunestep <- prunestep + 1
alpha <- pmax(0, (nodecost[branches]-costsum[branches]))/pmax(2^(-52),(nodecount[branches]-1))
bestalpha <- min(alpha)
toprune <- branches[alpha <= bestalpha*adj]
#nodes below here as no longer on tree
wasleaf <- isleaf
kids <- toprune
while(length(kids)>0){
kids <- children[kids,]
kids <- kids[which(rowSums(kids)>0),]
kids$leftchild[(isleaf[kids$leftchild])]<- - Inf
kids$rightchild[(isleaf[kids$rightchild])]<- - Inf
kids <- kids[sign(kids)>0]
isleaf[kids] <- TRUE
}
newleaves <- toprune[isleaf[toprune]==FALSE]
isleaf[toprune] <- TRUE
whenpruned[isleaf!=wasleaf & whenpruned==0] <- prunestep
# this branch is pruned
whenpruned[toprune] <- prunestep
for (i in 1:length(newleaves)){
#these branches are now new terminal nodes
node <- newleaves[i]
#update impurity, error and tau
diffcost <- nodecost[node] - costsum[node]
differr <- nodeerr[node] - errsum[node]
difftau <- nodetau[node] - tausum[node]
#update count
diffcount <- nodecount[node] - 1
#go from leaf to root node
repeat{
nodecount[node] <- nodecount[node] - diffcount
costsum[node] <- costsum[node] + diffcost
errsum[node] <- errsum[node] + differr
tausum[node] <- tausum[node] + difftau
node <- parent[node]
if (node==0) {break}
}
}
allalpha[prunestep+1] <- bestalpha
ntermnodes[prunestep+1] <- nodecount[1]
branches <- which(isleaf == FALSE)
}#end principal while
##here the plot of the pruning sequence?
#end of the plot
ids <- seq(1:max(whenpruned))
subtrees <- vector(mode = "list", length = (length(ids)+1))
taos <- errors <- rep(0,(length(ids)+1))
subtrees[[1]] <- Tree
for (j in 1:length(ids)){
cut <- which(whenpruned>0 & whenpruned <= ids[j])
sub <- prunenodes(Tree,cut)
taos[(j+1)] <- sub$goodness$tau
errors[(j+1)] <- sub$goodness$error
subtrees[[(j+1)]] <- sub #era subtrees[[j]]
}
taos[1] <- Tree$goodness$tau
errors[1] <- Tree$goodness$error
Tree$alpha <- allalpha[1:(prunestep+1)]
Tree$pruneinfo$prunelist <- whenpruned
Tree$pruneinfo$tau <- taos
Tree$pruneinfo$error <- errors
Tree$pruneinfo$termnodes <- ntermnodes[1:(prunestep+1)]
Tree$subtrees <- subtrees
return(Tree)
}#end function
#-----------------------------------
prunenodes <- function(Tree,cut){
N <- Tree$numnodes
children <- Tree$idgenealogy[,2:3]
parents <- cut
tokeep <- rep(1,N)
okids <- vector(mode="numeric",length=0L)
while(sum(tokeep)>0){
newkids <- c(as.matrix(children[parents,]))
newkids <- newkids[newkids>0 & !is.element(newkids,okids)]
if (length(newkids)==0){break}
okids <- c(okids,newkids)
tokeep[newkids] <- 0
parents <- newkids
}
subtree <- getsubtree(Tree,cut,tokeep=tokeep)
return(subtree)
}#end function
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.