R/ranktree.R

Defines functions prunenodes prune synthtree getdata nominalsplit splittingvariables ranksplit rankimpurity addnode ranktree

Documented in ranktree

#'Recursive partitioning method for the prediction of preference rankings based upon Kemeny distances
#'
#'Recursive partitioning method for the prediction of preference rankings based upon Kemeny distances.   
#'
#' @param Y A n by m data matrix, in which there are n judges and m objects to be judged. Each row is a ranking of the objects which are represented by the columns. 
#' @param X A dataframe containing the predictor, that must have n rows.
#' @param prunplot prunplot=TRUE returns the plot of the pruning sequence. Default value: FALSE
#' @param control a list of options that control details of the \code{ranktree} algorithm governed by the function
#' \code{ranktreecontrol}. The options govern the minimum size within node to split (the default value is 0.1*n, where n is the total sample size), the bound on the decrease in impurity,
#' (default, 0.01), the algorithm chosen to compute the median ranking (default, "quick"), and other options related
#' to the consrank algorithm, which is called by \code{ranktree}
#' @param \dots arguments passed bypassing \code{ranktreecontrol}
#' 
#' @details The user can use any algorithm implemented in the \code{consrank} function from the \pkg{ConsRank} package. All algorithms allow the user to set the option 'full=TRUE' 
#' if the median ranking(s) must be searched in the restricted space of permutations instead of in the unconstrained universe of rankings of n items including all possible ties. 
#' The output consists in a object of the class "ranktree". It contains:
#' \tabular{llll}{
#' X\tab \tab \tab the predictors: it must be a dataframe\cr
#' Y\tab \tab  \tab the response variable: the matrix of the rankings\cr
#' node\tab  \tab \tab a list containing teh tree-based structure:\cr
#' \tab number \tab \tab node number \cr
#' \tab terminal \tab \tab logical: TRUE is terminal node\cr
#' \tab father \tab \tab father node number of the current node\cr
#' \tab idfather \tab \tab id of the father node of the current node\cr
#' \tab size \tab \tab sample size within node\cr
#' \tab impur \tab \tab impurity at node\cr
#' \tab wimpur \tab \tab weighted impurity at node\cr
#' \tab idatnode \tab \tab id of the observations within node\cr
#' \tab class \tab \tab median ranking within node in terms of orderings\cr
#' \tab nclass \tab \tab median ranking within node in terms of rankings\cr
#' \tab mclass \tab \tab eventual multiple median rankings\cr
#' \tab tau \tab \tab Tau_x rank correlation coefficient at node\cr
#' \tab wtau \tab \tab weighted Tau_x rank correlation coefficient at node\cr
#' \tab error \tab \tab error at node\cr
#' \tab werror \tab \tab weighted error at node\cr
#' \tab varsplit \tab \tab variables generating split\cr
#' \tab varsplitid \tab \tab id of variables generating split\cr
#' \tab cutspli \tab \tab splitting point\cr
#' \tab children \tab \tab children nodes generated by current node\cr
#' \tab idchildren \tab \tab id of children nodes generated by current node\cr
#' \tab \dots \tab \tab other info about node\cr
#' control \tab \tab \tab parameters used to build the tree\cr
#' numnodes \tab \tab \tab number of nodes of the tree\cr
#' tsynt \tab \tab \tab list containing the synthesis of the tree:\cr
#' \tab children \tab \tab list containing all information about leaves\cr
#' \tab parents \tab \tab list containing all information about parent nodes\cr
#' geneaoly \tab  \tab \tab data frame containing information about all nodes\cr
#' idgenealogy \tab \tab \tab data frame containing information about all nodes in terms of nodes id\cr
#' idparents \tab \tab \tab id of the parents of all the nodes\cr
#' goodness \tab \tab \tab goodness -and badness- of fit measures of the tree: Tau_X, error, impurity\cr
#' nomin \tab \tab \tab information about nature of the predictors\cr 
#' alpha \tab \tab \tab alpha parameter for pruning sequence\cr
#' pruneinfo \tab \tab \tab list containing information about the pruning sequence:\cr
#' \tab prunelist \tab \tab information about the pruning\cr
#' \tab tau \tab \tab tau_X rank correlation coefficient of each subtree\cr
#' \tab error \tab \tab error of each subtree\cr
#' \tab termnodes \tab \tab number of terminal nodes of each subtree\cr
#' subtrees \tab \tab \tab list of each subtree created with the cost-complexity pruning procedure
#'} 
#'   
#'  
#' @return An object of the class ranktree. See details for detailed information.
#' 
#'  
#' 
#' @examples
#' data("Univranks")
#' tree <- ranktree(Univranks$rankings,Univranks$predictors,num=50)
#' 
#' \donttest{
#' data(Irish)
#' #build the tree with default options
#' tree <- ranktree(Irish$rankings,Irish$predictors)
#' 
#' #plot the tree
#' plot(tree,dispclass=TRUE)
#' 
#' #visualize information
#' summary(tree)
#' 
#' #get information about the paths leading to terminal nodes (all the paths)
#' infopaths <- treepaths(tree)
#' 
#' #the terminal nodes
#' infopaths$leaves
#' 
#' #sample size within each terminal node
#' infopaths$size
#' 
#' #visualize the path of the second leave (terminal node number 8)
#' infopaths$paths[[2]]
#' 
#' #alternatively
#' nodepath(termnode=8,tree)
#' 
#' 
#' set.seed(132) #for reproducibility
#' #validation of the tree via v-fold cross-validation (default value of V=5)
#' vtree <- validatetree(tree,method="cv")
#' 
#' #extract the "best" tree
#' dtree <- getsubtree(tree,vtree$best_tau)
#' 
#' summary(dtree)
#' 
#' #plot the validated tree
#' plot(dtree,dispclass=TRUE)
#' 
#' #predicted rankings
#' rankfit <- predict(dtree,newx=Irish$predictors)
#' 
#' #fit of rankings
#' rankfit$rankings
#' 
#' #fit in terms of orderings
#' rankfit$orderings
#' 
#' #all info about the fit (id og the leaf, predictor values, and fit)
#' rankfit$orderings
#' }
#' 
#' 
#' @author Antonio D'Ambrosio \email{antdambr@unina.it}
#' 
#' @references D'Ambrosio, A., and Heiser W.J. (2016). A recursive partitioning method for the prediction of preference rankings based upon Kemeny distances. Psychometrika, vol. 81 (3), pp.774-94.
#' 
#' @seealso  \code{ranktreecontrol}, \code{plot.ranktree}, \code{summary.ranktree}, \code{getsubtree}, \code{validatetree}, \code{treepaths}, \code{nodepath}
#' 
#' 
#'
#' @keywords Recursive partitioning 
#' @keywords Tree-based method
#' @keywords Preference rankings
#' 
#' @import ConsRank
#' @importFrom utils combn
#' @importFrom methods is
#' 
#' 
#' @export 



ranktree <-  function(Y,X,prunplot=FALSE, 
                      control=ranktreecontrol(...), ...){
  
  if (!is.matrix(Y)) {stop("response variable must be a matrix")}
  
  N = nrow(Y)
  C=ncol(Y)
  if(is(control$num,"NULL")){control$num <- round(0.1*N)}
  namespreds=colnames(X)
  #nomin=ifelse(sapply(X,is.numeric),0,1) #0, numerical predictor, 1 nominal predictor
  nomin=ifelse(sapply(X,function(i) is(i,"numeric")|is(i,"ordered")) ,0,1)
  #aggiunto il 28/9/2021
  # for (j in 1:C){
  #   
  #   if (is(predictors[,j],"numeric")|is(predictors[,j],"ordered")){
  #     nomin[j] <- 0
  #   } else {
  #     nomin[j] <- 1
  #   }
  #   
  # }
  # #fine aggiunta
  # 
  
  xID=seq(1,N)
  
  
  L <- 1 #numbering of the list
  tree=list()
  tree$X <- X
  tree$Y <- Y
  #tree$algorithm <- cralgorithm
  tree$node$number[[L]] <- 1  #node number according spad
  tree$node$checkdecr[[L]] <- TRUE
  tree$node$terminal[[L]] <- TRUE
  tree$node$father[[L]] <- NULL #number of the father of the node
  tree$node$idfather[[L]] <- NA #id of the father node
  tree$node$size[[L]] <- N #size within node
  tree$node$impur[[L]] <- tree$node$wimpur[[L]] <- rankimpurity(Y)
  tree$node$idatnode[[L]] <- xID #id of units in the node
  tree$control <- control
  #num=tree$control$num
  #decrmin=tree$control$decrmin
  
  CR=consrank(Y, algorithm=tree$control$algorithm, ps=tree$control$ps, full=tree$control$full,
              itermax=tree$control$itermax, np=tree$control$np, gl=tree$control$gl,
              ff=tree$control$ff, cr=tree$control$cr, proc=tree$control$proc)
  
  if( is(CR,"NULL") ){
    #if the combined input matrix contains either only zeros or only positive values, 
    #the "all-tie" solution is returned
    
    CR <- list()
    mr <- matrix(1,nrow=1,ncol=C)
    colnames(mr) <- colnames(Y) 
    CR$Consensus <- mr
    CR$Tau <- mean(tau_x(mr,Y))
    CR$Eltime <- 0
    
    
  }
  
  tree$node$class[[L]] <- rank2order(CR$Consensus[1,],items=colnames(Y)) #class assignment rule
  tree$node$nclass[[L]] <- CR$Consensus[1,] #numeric class
  tree$node$mclass[[L]]<- CR$Consensus  #class, containing eventually more than one median ranking
  tree$node$tau[[L]] <- tree$node$wtau[[L]] <- CR$Tau[1]   #goodness of fit
  tree$node$werror[[L]] <- tree$node$error[[L]] <- sum(kemenyd(Y,CR$Consensus))/(N*C*(C-1))  #error within node
  tree$numnodes <- 1 #number of nodes of the tree
  
  
  
  #number <- 1 #node number
  
  gd <- getdata(tree)
  ind <- which(gd$term==TRUE & gd$imp==TRUE & gd$nnode >tree$control$num)
  

  
  while(length(ind)>0){
    
    for (i in 1:length(ind)) {
      
      # print(length(tree$node$idatnode[[L[ind[i]]]]))
      # flush.console()
      
      #prova aggiunta il 29/4/2021
      
      # if( length(tree$node$idatnode[[L[ind[i]]]]) <= tree$control$num   ){ #if aggiunta
      #   
      #   tree$node$terminal[[L[ind[i]]]] <- TRUE
      #   tree$node$checkdecr[[L[ind[i]]]] <- FALSE
      #   tree$node$varsplit[[L[ind[i]]]] <- NA
      #   tree$node$varsplitid[[L[ind[i]]]] <- NA
      #   tree$node$cutspli[[L[ind[i]]]] <- NA
      #   tree$node$children[[L[ind[i]]]] <- NA
      #   tree$node$idchildren[[L[ind[i]]]] <- NA
      #   
      # } else { #else aggiunta
        
        RS <- ranksplit(Y,X,N,nomin,namespreds,tree$node$idatnode[[L[ind[i]]]])
        if (RS$DecImp > tree$control$decrmin){
          tree$node$terminal[[L[ind[i]]]] <- FALSE
          tree <- addnode(tree,RS,L[ind[i]],C,N)
          tree$node$checkdecr[[L[ind[i]]]] <- TRUE
          #L <- seq(1:tree$numnodes)
        } else {
        
          tree$node$terminal[[L[ind[i]]]] <- TRUE
          tree$node$checkdecr[[L[ind[i]]]] <- FALSE
          tree$node$varsplit[[L[ind[i]]]] <- NA
          tree$node$varsplitid[[L[ind[i]]]] <- NA
          tree$node$cutspli[[L[ind[i]]]] <- NA
          tree$node$children[[L[ind[i]]]] <- NA
          tree$node$idchildren[[L[ind[i]]]] <- NA
        
        }
        
      # } #end else aggiunta
      
      #L <- seq(1:tree$numnodes)
      
    } #end for
    
    L <- seq(1:tree$numnodes)
    
    gd <- getdata(tree)
    ind <- which(gd$term==TRUE & gd$imp==TRUE & gd$nnode >tree$control$num)
  }#end while
  
  
  tree$tsynth <- synthtree(tree)
  
  tree <- layouttree(tree)
  
  tree$goodness$error <- sum(unlist(tree$node$werror[tree$tsynth$children$measures$idt]))
  tree$goodness$tau <- sum(unlist(tree$node$wtau[tree$tsynth$children$measures$idt]))
  tree$goodness$impurity <- sum(unlist(tree$node$wimpur[tree$tsynth$children$measures$idt]))
  tree$nomin <- nomin
  
  #get pruning information
  
  tree <- prune(tree)
  
  if (isTRUE(prunplot)){
    
    plot.ranktree(tree,plot.type="pruningseq")
    
  }
  
  # if(isTRUE(control$printinfo)){
  #   
  #   isleaf <- unlist(tree$node$terminal)
  #   obj <- "\nRecursive partitioning method for preference rankings: \n"
  #   cat(obj)
  #   cat("Number of terminal nodes: ",(tree$numnodes+1)/2,"\n")
  #   cat("Averaged tau_X rank correlation coefficient: ", tree$goodness$tau, "\n")
  #   cat("Error (normalized Kemeny distance): ", tree$goodness$error, "\n")
  #   cat("Variables generating splits: ", paste(unique(unlist(tree$node$varsplit[!isleaf])),collapse=', '), "\n")
  #   
  # }
  
  class(tree) <- c("ranktree","ConsRankClass")
  
  return(tree)
  
  
}#end function

#-----------------------

addnode <- function(Tree,RSpl,snod,C,N){
  
  numl <- Tree$numnodes +1 #number of list left node
  numr <- Tree$numnodes +2 #number of list right node
  numberleft <- 2*Tree$node$number[[snod]] #node number left
  numberright <- (2*Tree$node$number[[snod]])+1 #node number right
  
  Tree$numnodes <- numr
  
  Tree$node$varsplit[[snod]] <- RSpl$SplitVar  #variable that generates split
  Tree$node$varsplitid[[snod]] <- RSpl$idSplitVar
  Tree$node$cutspli[[snod]] <- RSpl$custSplit  #cutting point
  Tree$node$children[[snod]] <- c(numberleft,numberright)
  Tree$node$idchildren[[snod]] <- c(numl,numr) #id of children nodes
  
  #manage new nodes
  #add the number at nodes
  Tree$node$number[[numl]] <- numberleft
  Tree$node$number[[numr]] <- numberright
  
  #declare nodes as terminal for a moment 
  Tree$node$terminal[[numl]] <- TRUE
  Tree$node$terminal[[numr]] <- TRUE
  
  #no children at the moment
  Tree$node$idchildren[[numl]] <- NA
  Tree$node$idchildren[[numr]] <- NA
  
  #add info about father
  if (snod==1){
    Tree$node$father[[numl]] <- Tree$node$father[[numr]] <- 1
    Tree$node$idfather[[numl]] <- Tree$node$idfather[[numr]] <- 1
    
  } else {
    
    Tree$node$father[[numl]] <- Tree$node$father[[numr]] <- Tree$node$number[[snod]]
    Tree$node$idfather[[numl]] <- Tree$node$idfather[[numr]] <- snod
    
    
  }
  
  #sample size within nodes
  Tree$node$size[[numl]] <- length(RSpl$indexing$left)
  Tree$node$size[[numr]] <- length(RSpl$indexing$right)
  
  #impurity and weighted inpurity within nodes
  
  Tree$node$impur[[numl]] <- RSpl$NodeImpurity$left
  Tree$node$impur[[numr]] <- RSpl$NodeImpurity$right
  
  Tree$node$wimpur[[numl]] <- RSpl$NodeImpurity$left*( length(RSpl$indexing$left)/N   )
  Tree$node$wimpur[[numr]] <- RSpl$NodeImpurity$right*( length(RSpl$indexing$right)/N )
  
  #index of units within nodes
  Tree$node$idatnode[[numl]] <- RSpl$indexing$left
  Tree$node$idatnode[[numr]] <- RSpl$indexing$right
  
  #check for split
  Tree$node$checkdecr[[numl]] <- TRUE
  Tree$node$checkdecr[[numr]] <- TRUE
  
  #consensus ranking within node
  CRL <- consrank(Tree$Y[RSpl$indexing$left,], algorithm=Tree$control$algorithm, ps=Tree$control$ps, full=Tree$control$full,
                  itermax=Tree$control$itermax, np=Tree$control$np, gl=Tree$control$gl,
                  ff=Tree$control$ff, cr=Tree$control$cr, proc=Tree$control$proc)
  
  if( is(CRL,"NULL") ){
    #if the combined input matrix contains either only zeros or only positive values, 
    #the "all-tie" solution is returned
    
    CRL <- list()
    mr <- matrix(1,nrow=1,ncol=ncol(Tree$Y))
    colnames(mr) <- colnames(Tree$Y) 
    CRL$Consensus <- mr
    CRL$Tau <- mean(tau_x(mr,Tree$Y[RSpl$indexing$left,]))
    CRL$Eltime <- 0
    
    
  }
  
  
  
  
  CRR <- consrank(Tree$Y[RSpl$indexing$right,], algorithm=Tree$control$algorithm, ps=Tree$control$ps, full=Tree$control$full,
                  itermax=Tree$control$itermax, np=Tree$control$np, gl=Tree$control$gl,
                  ff=Tree$control$ff, cr=Tree$control$cr, proc=Tree$control$proc)
  
  if( is(CRR,"NULL") ){
    #if the combined input matrix contains either only zeros or only positive values, 
    #the "all-tie" solution is returned
    
    CRR <- list()
    mr <- matrix(1,nrow=1,ncol=ncol(Tree$Y))
    colnames(mr) <- colnames(Tree$Y) 
    CRR$Consensus <- mr
    CRR$Tau <- mean(tau_x(mr,Tree$Y[RSpl$indexing$right,]))
    CRR$Eltime <- 0
    
    
  }
  
  Tree$node$class[[numl]] <- rank2order(CRL$Consensus[1,],items=colnames(Tree$Y))
  Tree$node$class[[numr]] <- rank2order(CRR$Consensus[1,],items=colnames(Tree$Y))
  
  Tree$node$nclass[[numl]] <- CRL$Consensus[1,]
  Tree$node$nclass[[numr]] <- CRR$Consensus[1,]
  
  Tree$node$mclass[[numl]] <- CRL$Consensus
  Tree$node$mclass[[numr]] <- CRR$Consensus
  
  #tau and weighted tau within node
  Tree$node$tau[[numl]] <- CRL$Tau[1]
  Tree$node$tau[[numr]] <- CRR$Tau[1]
  
  Tree$node$wtau[[numl]] <- CRL$Tau[1] * (length(RSpl$indexing$left)/N)
  Tree$node$wtau[[numr]] <- CRR$Tau[1] * (length(RSpl$indexing$right)/N)
  
  #error and weigthed error within node
  Tree$node$error[[numl]] <- sum(kemenyd(Tree$Y[RSpl$indexing$left,],CRL$Consensus[1,]))/( (length(RSpl$indexing$left)) *C*(C-1))
  Tree$node$error[[numr]] <- sum(kemenyd(Tree$Y[RSpl$indexing$right,],CRR$Consensus[1,]))/( (length(RSpl$indexing$right))*C*(C-1))
  
  Tree$node$werror[[numl]] <- Tree$node$error[[numl]] * (length(RSpl$indexing$left)/N)
  Tree$node$werror[[numr]] <- Tree$node$error[[numr]] * (length(RSpl$indexing$right)/N)
  
  #for a moment declare as NA the splitting variables potentially generated by subsequent splits
  
  Tree$node$varsplit[[numl]] <- Tree$node$varsplit[[numr]] <- NA 
  Tree$node$varsplitid[[numr]] <- Tree$node$varsplitid[[numr]] <- NA 
  
  
  return(Tree)
  
} # end code

#--------------------------------------

rankimpurity <- function(Y) {
  SS=sum(kemenyd(Y))/nrow(Y) #Relaxing suggestion
  return(SS)
  
} #end function

#--------------------------------------

ranksplit = function(Y,X,N,nomin,namespreds,xID)
{
  
  
  # r = nrow(X[xID,])
  if (is.null(nrow(X[xID,]))){r=length(X[xID,])}
  npred=ncol(X[xID,])
  if (is.null(ncol(X[xID,]))){npred=1}
  ImpFather = rankimpurity(Y[xID,])/N
  
  tablesplit=matrix(0,4,npred)
  colnames(tablesplit)=namespreds
  
  #table that informs about
  #              variable1 | variable 2 |
  #            __________________________
  # max Delta  |           |            |
  # cut point  |           |            |
  # size left  |           |            |
  # size right |           |            |
  
  listLEFTid=vector(mode="list",length=npred) #list that recall indexing to the left and to the right
  listRIGHTid=vector(mode="list",length=npred)
  Splitnom=vector(mode="list",length=npred)
  
  
  
  
  for (i in 1:npred) { #start for
    
    if (npred==1){x=X[xID]} else {x=X[xID,i]}
    
    ifelse(nomin[i]==0,FALSE,TRUE)
    
    if (nomin[i]==FALSE)
      
    {  # if ordinal
      
      
      h=sort(unique(x))
      #Delta=matrix(0,1,(length(h)-1))
      
      if (length(h)==1){ #modifica per impedire problemi
        
        tablesplit[1,i]=0
        tablesplit[2,i]=NA
        tablesplit[3,i]=NA
        tablesplit[4,i]=NA
        Splitnom[[i]]=NA
        listLEFTid[[i]]=NA
        listRIGHTid[[i]]=NA
        
      } else {
        
        iDL=lapply(h[-length(h)],FUN=function(h) which(x<=h))
        iDR=lapply(h[-length(h)],FUN=function(h) which(x>h))
      
        # print(length(h))
        # flush.console()
      
      
        primp=sapply(1:length(iDL), function(row)
        ImpFather-( ( ifelse(length(iDL[[row]])==1,0,rankimpurity(Y[xID[iDL[[row]]],])/N) ) +
                      (ifelse(length(iDR[[row]])==1,0,rankimpurity(Y[xID[iDR[[row]]],])/N)  ) )
       )


        md=which.max(unlist(primp))
        leftid=which(x<=h[md])
        rightid=which(x>h[md])          
        #tablesplit[1,i]=Delta[md]
        tablesplit[1,i]=primp[[md]]
        tablesplit[2,i]=h[md]
        tablesplit[3,i]=length(leftid)
        tablesplit[4,i]=length(rightid)
      
        listLEFTid[[i]]=xID[leftid]
        listRIGHTid[[i]]=xID[rightid]
        
      }#end modifica
      
      
      
      
    } #
    
    else{
      
      Nspl=nominalsplit(x,Y[xID,],ImpFather,N)
      if (Nspl$Decr==0){
        
        tablesplit[1,i]=0
        tablesplit[2,i]=NA
        tablesplit[3,i]=NA
        tablesplit[4,i]=NA
        Splitnom[[i]]=NA
        listLEFTid[[i]]=NA
        listRIGHTid[[i]]=NA
        
      } else {
        
        tablesplit[1,i]=Nspl$Decr
        tablesplit[2,i]=NA
        tablesplit[3,i]=length(Nspl$indL)
        tablesplit[4,i]=length(Nspl$indR)
        
        
        

        Splitnom[[i]]=Nspl$Vsplit
        listLEFTid[[i]]=xID[Nspl$indL]
        listRIGHTid[[i]]=xID[Nspl$indR]
      }
      
    } #end if nominal
    
  }# end for
  
  bestsplit=which.max(tablesplit[1,])
  DecImp=tablesplit[1,bestsplit]
  indexing=list(left=NA,right=NA)
  indexing$left=listLEFTid[[bestsplit]]
  indexing$right=listRIGHTid[[bestsplit]]
  
  if(is.na(tablesplit[2,bestsplit])){
    cutSplit=Splitnom[[bestsplit]]
  } else {
    cutSplit=tablesplit[2,bestsplit]
  }
  
  Nodeimpurity=list(left=NA,right=NA)
  
  Nodeimpurity$left <- ifelse(length(indexing$left)==1,0,rankimpurity(Y[indexing$left,]))
  Nodeimpurity$right <- ifelse(length(indexing$right)==1,0,rankimpurity(Y[indexing$right,]))
  
  return( list(DecImp=DecImp, indexing=indexing, 
               custSplit=cutSplit, NodeImpurity=Nodeimpurity, SplitVar=namespreds[bestsplit], idSplitVar=bestsplit ) )
  
  
} #end function

#----------------

splittingvariables = function(X){
  #generate all possible binary splits for categorical variables
  
  K = sort(unique(X))
  
  if(length(K)==1){
    
    vsplit=NULL
    
  } else {
    
    if (length(K)>2){
      
      
      s1=seq((length(K)-1),ceiling(length(K)/2),by=-1)
      indici=vector(mode="list",length=(length(K)-1))
      
      for (i in 1:length(s1)){ #for 1
        indici[[s1[i]]]$spl=t(combn(K,s1[i]))
        
        if (ncol(indici[[s1[i]]]$spl) == (length(K)/2))
        { #if 1
          delrows=(nrow(indici[[s1[i]]]$spl)/2+1):nrow(indici[[s1[i]]]$spl)
          indici[[s1[i]]]$spl=indici[[s1[i]]]$spl[-delrows,]
        } #end if 1
        
      } #end for 1
      
      Lk=1
      kL=1
      
      while (kL < 99998){ #while 1
        
        if (is.null(indici[[kL]]$spl))
        { #if 2
          kL=kL+1
          Lk=Lk+1 # Lk is the index of the first non-empty struct array "indici"
        } else {
          kL=99999
          
        }# end if 2
        
        
      } #end while 1
      
      
      indvsplit=seq(Lk,(length(K)-1),by=1)
      
      ncolmn=ncol(indici[[length(indici)]]$spl)
      
      vsplit=matrix(0,1,ncolmn)
      
      for (j in 1:length(indvsplit))
        
      { #for 2
        
        if (ncol(indici[[indvsplit[j]]]$spl) < ncolmn )
          
        { #if 3
          indna=(ncol(indici[[indvsplit[j]]]$spl)+1):ncolmn
          indici[[indvsplit[j]]]$spl=cbind(indici[[indvsplit[j]]]$spl,matrix(NA,nrow(indici[[indvsplit[j]]]$spl),length(indna)))
        } #end if 3
        
        vsplit=rbind(vsplit,indici[[indvsplit[j]]]$spl)
        
      }#end for 2
      
      vsplit=vsplit[-1,]
      
    } else if (length(K)==2) {
      
      vsplit=K[1]
      
    }
    
  }
  
  return(vsplit)
  
}#end function

#----------------------------

#' @importFrom methods is

nominalsplit = function(X,Y,ImpFather,N){

  x=splittingvariables(X)
  
  if( is(x,"NULL")  ){ #if null, exit function
    Decr <- 0
    GL <- NA
    GR <- NA 
    indleft <- NA
    indright <- NA
    Vsplit <- NA
    
  } else { #start principal else
    
    
    
    if (length(x)==1){ #if 1
      
      indleft <- which(X==x)
      indright <- which(X!=x)

      GL <- ifelse(length(indleft)==1,0,rankimpurity(Y[indleft,]))
      GR <- ifelse(length(indright)==1,0,rankimpurity(Y[indright,]))
      
      Delta = ImpFather - (GL/N + GR/N)

      Vsplit = x
      Decr=max(Delta)
      
    } else { #end if 1, start else 2
      
      
      Delta=matrix(0,1,nrow(x))
      
      for (j in 1:nrow(x)){#for 1
        
        if( is(X,"factor") ){
          
          left=which(!is.na(match(X,levels(X)[x[j,]])))
          right=which(is.na(match(X,levels(X)[x[j,]])))
          
        } else {
          
          left=which(!is.na(match(X,x[j,])))
          right=which(is.na(match(X,x[j,])))
        }
        
        GL <- ifelse(length(left)==1,0,rankimpurity(Y[left,]))
        GR <- ifelse(length(right)==1,0,rankimpurity(Y[right,]))
        
        
        Delta[1,j]= ImpFather - (GL/N + GR/N)

        
        
      }#end for 1
      
      j=which.max(Delta)
      Decr=max(Delta)
      
      
      if( is(X,"factor") ){
        
        indleft=which(!is.na(match(X,levels(X)[x[j,]])))
        indright=which(is.na(match(X,levels(X)[x[j,]])))
        
      } else {
        
        indleft=which(!is.na(match(X,x[j,])))
        indright=which(is.na(match(X,x[j,])))
      }
      
      GL <- ifelse(length(indleft)==1,0,rankimpurity(Y[indleft,]))
      GR <- ifelse(length(indright)==1,0,rankimpurity(Y[indright,]))

      
      if (is(X,"factor")){
        
        Vsplit=levels(X)[x[j,]]
        
      } else {
        
        Vsplit=x[j,]
        
      }
      
      if ( sum(is.na(Vsplit)) > 0 ){
        
        Vsplit=Vsplit[-which(is.na(Vsplit))]
        
      }
      
    }#end else 2
    
    
    
  } #end principal else
  
  return(list(Decr=Decr, GL=GL,  GR=GR, indL=indleft, indR=indright, Vsplit=Vsplit))
  
} #end function

#--------------------------------------------

getdata <- function(Tree){
  term=unlist(Tree$node$terminal)
  imp=unlist(Tree$node$checkdecr)
  nnode=unlist(Tree$node$size)
  return(list(term=term,imp=imp,nnode=nnode))
} #end function

#---------------------------------------

#' @importFrom rlist list.rbind

synthtree <- function(Tree){
  
  N <- Tree$node$size[[1]]
  
  idt <- which(unlist(Tree$node$terminal)==TRUE)
  idp <- which(unlist(Tree$node$terminal)==FALSE)
  
  nn <- unlist(Tree$node$number[idt])
  if (length(nn)==1){
    father <- NA} else 
    {father <- unlist(Tree$node$father[idt])
    }
  size <- unlist(Tree$node$size[idt])
  impur <- unlist(Tree$node$impur[idt])
  error <- unlist(Tree$node$error[idt])
  tau <- unlist(Tree$node$tau[idt])
  wtau <- unlist(Tree$node$wtau[idt])
  class <- list.rbind(Tree$node$class[idt])
  classnum <- do.call(rbind,Tree$node$nclass[idt])
  
  prr=c(length(nn),length(father),length(size),length(impur),length(error),length(wtau))
  #print(prr)
  #flush.console()
  
  children=list()
  children$measures <- data.frame(idt,nn,father,size,impur,error,wtau,tau)
  children$class <- class
  children$classid <- idt
  children$nclass <- classnum
  
  
  if(length(idp)>0){
    nn <- unlist(Tree$node$number[idp])
    father <- c(0,unlist(Tree$node$father[idp]))
    size <- unlist(Tree$node$size[idp])
    varp <- unlist(Tree$node$varsplit[idp])
    impur <- unlist(Tree$node$impur[idp])
    wimpur <- unlist(Tree$node$wimpur[idp])
    error <- unlist(Tree$node$error[idp])
    tau <- unlist(Tree$node$tau[idp])
    wtau <- unlist(Tree$node$wtau[idp])
    class <- list.rbind(Tree$node$class[idp])
    classnum <- do.call(rbind,Tree$node$nclass[idp])
    
    parents <- list()
    parents$measures <- data.frame(idp,nn,father,size,varp,impur,wimpur,error,wtau,tau)
    parents$class <- class
    parents$classid <- idp
    parents$nclass <- classnum
  } else { #tree with only the root node
    parents <- NA
  }
  
  return(list(children=children, parents=parents))
  
  
}#end function

#------------------------------------------

prune <- function(Tree){
  
  #optimal pruning sequence of the tree
  tsynth <- Tree$tsynth
  N <- Tree$numnodes
  
  #get internal nodes
  idparent <- tsynth$parents$measures$idp
  numparent <- tsynth$parents$measures$nn
  
  #get terminal nodes
  
  idterminal <- tsynth$children$measures$idt
  numterminal <- tsynth$children$measures$nn  
  
  isleaf <- treatisleaf <- unlist(Tree$node$terminal)
  
  nodecost <- costsum <- unlist(Tree$node$wimpur)
  nodeerr <- errsum <- unlist(Tree$node$werror)
  nodetau <- tausum <- unlist(Tree$node$wtau)
  
  children <- Tree$idgenealogy[,2:3]
  parent <- Tree$idparents
  nleaves <- sum(isleaf)
  nodecount <- as.numeric(isleaf)
  adj <- 1+100*( 2^(-52) )
  
  repeat{
    
    branches <- which(treatisleaf==FALSE)
    twig <- cbind(as.numeric(treatisleaf[children[branches,1]]),as.numeric(treatisleaf[children[branches,2]]))
    twig <- branches[rowSums(twig)==2]
    
    if (length(twig)==0){break}
    
    kids <- children[twig,]
    
    costsum[twig] <- rowSums(cbind(costsum[kids$leftchild],costsum[kids$rightchild]))
    errsum[twig] <- rowSums(cbind(errsum[kids$leftchild],errsum[kids$rightchild]))
    tausum[twig] <- rowSums(cbind(tausum[kids$leftchild],tausum[kids$rightchild]))
    
    
    nodecount[twig] <- rowSums(cbind(nodecount[kids$leftchild],nodecount[kids$rightchild]))
    treatisleaf[twig] <- TRUE
    
  }
  
  
  whenpruned <- rep(0,N)
  branches <- which(isleaf==FALSE)
  prunestep <- 0
  allalpha <- ntermnodes <- rep(0,N)
  ntermnodes[1] <- length(idterminal)
  
  while(length(branches)>0){#principal while
    
    #complexity parameter
    
    prunestep <- prunestep + 1
    
    alpha <- pmax(0, (nodecost[branches]-costsum[branches]))/pmax(2^(-52),(nodecount[branches]-1)) 
    
    bestalpha <- min(alpha)
    
    toprune <- branches[alpha <= bestalpha*adj]
    
    #nodes below here as no longer on tree
    
    wasleaf <- isleaf
    
    kids <- toprune
    
    while(length(kids)>0){
      
      kids <- children[kids,]
      
      kids <- kids[which(rowSums(kids)>0),]
      
      kids$leftchild[(isleaf[kids$leftchild])]<- - Inf
      
      kids$rightchild[(isleaf[kids$rightchild])]<- - Inf
      
      kids <- kids[sign(kids)>0]
      
      isleaf[kids] <- TRUE
      
      
    }
    
    
    newleaves <- toprune[isleaf[toprune]==FALSE]
    
    isleaf[toprune] <- TRUE
    
    whenpruned[isleaf!=wasleaf & whenpruned==0] <- prunestep
    
    # this branch is pruned
    whenpruned[toprune] <- prunestep
    
    
    for (i in 1:length(newleaves)){
      
      #these branches are now new terminal nodes
      node <- newleaves[i]
      #update impurity, error and tau
      diffcost <- nodecost[node] - costsum[node]
      differr <- nodeerr[node] - errsum[node]
      difftau <- nodetau[node] - tausum[node]
      
      #update count
      diffcount <- nodecount[node] - 1
      
      #go from leaf to root node
      
      repeat{
        
        nodecount[node] <- nodecount[node] - diffcount
        
        costsum[node] <- costsum[node] + diffcost
        errsum[node] <- errsum[node] + differr
        tausum[node] <- tausum[node] + difftau
        
        node <- parent[node]
        
        if (node==0) {break}
        
      }
      
    }
    
    
    allalpha[prunestep+1] <- bestalpha
    
    ntermnodes[prunestep+1] <- nodecount[1]
    
    branches <- which(isleaf == FALSE)
    
    
  }#end principal while
  
  
  ##here the plot of the pruning sequence?
  
  #end of the plot
  
  ids <- seq(1:max(whenpruned))
  
  subtrees <- vector(mode = "list", length = (length(ids)+1))
  
  taos <- errors <- rep(0,(length(ids)+1))
  
  subtrees[[1]] <- Tree
  
  for (j in 1:length(ids)){
    
    cut <- which(whenpruned>0 & whenpruned <= ids[j])
    
    sub <- prunenodes(Tree,cut)
    
    taos[(j+1)] <- sub$goodness$tau
    errors[(j+1)] <- sub$goodness$error
    
    subtrees[[(j+1)]] <- sub #era subtrees[[j]]
    
  }
  
  taos[1] <- Tree$goodness$tau
  errors[1] <- Tree$goodness$error
  
  
  Tree$alpha <- allalpha[1:(prunestep+1)]
  Tree$pruneinfo$prunelist <- whenpruned
  Tree$pruneinfo$tau <- taos
  Tree$pruneinfo$error <- errors
  Tree$pruneinfo$termnodes <- ntermnodes[1:(prunestep+1)]
  Tree$subtrees <- subtrees
  
  return(Tree)
  
}#end function

#-----------------------------------

prunenodes <- function(Tree,cut){
  
  N <- Tree$numnodes
  children <- Tree$idgenealogy[,2:3]
  parents <- cut
  tokeep <- rep(1,N)
  
  okids <- vector(mode="numeric",length=0L)
  
  while(sum(tokeep)>0){
    
    newkids <- c(as.matrix(children[parents,]))
    
    newkids <- newkids[newkids>0 & !is.element(newkids,okids)]
    
    if (length(newkids)==0){break}
    
    okids <- c(okids,newkids)
    tokeep[newkids] <- 0
    parents <- newkids
    
  }
  
  subtree <- getsubtree(Tree,cut,tokeep=tokeep)
  
  return(subtree)
  
}#end function  

Try the ConsRankClass package in your browser

Any scripts or data that you put into this service are public.

ConsRankClass documentation built on Sept. 28, 2021, 5:10 p.m.