R/uni.tree.R

Defines functions uni.tree

Documented in uni.tree

#  uni.tree #
#' @title A survival binary tree split by univariate selection and the statistical significant criterion.
#' @description Using a survival dataset to build a tree. The decision of splitting is using score test or log-rank test.
#' And the threshold of stop spltting can fixed through the argument.
#' This function also produce a relative risk rank between the terminal nodes.
#' @param t.vec :Vector of survival times (time to either death or censoring)
#' @param d.vec :Vector of censoring indicators (1=death, 0=censoring)
#' @param X.mat :n by p matrix of covariates, where n is the sample size and p is the number of covariates
#' @param P.value :the threshold of P-value for stop splitting (stopping criterion)
#' @param d0 :A positive constant to stabilize the variance of score statistics (Witten & Tibshirani 2010)
#' @param S.plot :call for plot the KM estimator for each split
#' @param score :TRUE = score test (Emura T et al. 2019); FALSE = log-rank test
#' @return A nesting list of tree, containing inner node and terminal node information
#' @details the concept is generated by a function containing the function itself
#' @details For each split, we record three things, fist is the information of inner node which contains what to choose for splitting and what value the p-value the two sample test is. Second and third are the information of left and right children nodes.
#' If the children node is decided to become a inner node, then it genertae another two nodes;otherwise, it becomes a terminal node and record the information of sample size and the covariate space and the p-value it try to split but is failed
#' @examples
#' data(Lung,package="compound.Cox")
#' train_Lung=Lung[which(Lung[,"train"]==TRUE),] #select training data
#' t.vec=train_Lung[,1]
#' d.vec=train_Lung[,2]
#' x.mat=train_Lung[,-c(1,2,3)]
#' uni.tree(t.vec,d.vec,x.mat,P.value=0.01,d0=0.01,S.plot=FALSE,score=TRUE)
#' @import survival
#' @references
#' Emura T, Matsui S, Chen HY (2019). compound.Cox: Univariate Feature Selection and Compound Covariate for Predicting Survival, Computer Methods and Programs in Biomedicine 168: 21-37.
#' @references
#' Witten DM, Tibshirani R (2010) Survival analysis with high-dimensional covariates. Stat Method Med Res 19:29-51
#' @export
uni.tree=function(t.vec,d.vec,X.mat,P.value=0.01,d0=0.01,
                  S.plot=FALSE,score=TRUE){
  m=max(X.mat)-min(X.mat)
  recursive=function(t.vec,d.vec,X.mat,condition=NULL,Risk.score=0,rank.weight=1){
    n=length(t.vec)
    if(n>1){
      if(score == TRUE){
        p = ncol(X.mat)
        ###############transform the X.mat into w binary matrix#################################
        w.mat = 1*(X.mat[,rep(1:p,c(rep(m,p)))]>
                     matrix(rep(c(1:m),p),1,p*m)[rep(1,n),])
        #import and implement uni.score function
        uni.score.summary = uni.score(t.vec,d.vec,w.mat, d0= d0)
        uni.score_res = uni.score.summary$P
        #decide the cut value
        selected.cut.value = rep(c(1:m),p)[which.min(uni.score_res)]
        selected.cov.names = colnames(X.mat)[floor((which.min(uni.score_res)-1)/m)+1]#prepare the name of selected covariate name
        selected.P.value = min(uni.score_res)
      }else{
        #else = logrank
        uni.logrank.res = uni.logrank(t.vec,d.vec,X.mat)
        selected.cov.names = colnames(uni.logrank.res)[1]
        selected.cut.value = uni.logrank.res["cut_off_point",1]
        selected.P.value = uni.logrank.res["Pvalue",1]
      }
      #stopping criterion
      if(selected.P.value <= P.value ){
        #Separate data by the final decided value
        tmp=X.mat[,selected.cov.names]<=selected.cut.value
        #check high risk and low risk set
        #prepare for subtree
        if(score == TRUE){
          Zvalue = uni.score.summary$Z[which.min(uni.score_res)]
          if(Zvalue <= 0){
            left_t.vec = t.vec[tmp];right_t.vec=t.vec[!tmp]
            left_d.vec = d.vec[tmp];right_d.vec=d.vec[!tmp]
            left_X.mat = X.mat[tmp,];right_X.mat=X.mat[!tmp,]
          }else{
            left_t.vec=t.vec[!tmp];right_t.vec=t.vec[tmp]
            left_d.vec=d.vec[!tmp];right_d.vec=d.vec[tmp]
            left_X.mat=X.mat[!tmp,];right_X.mat=X.mat[tmp,]
          }
        }else{
          x=X.mat[,selected.cov.names]<=selected.cut.value
          lrsummary = survdiff(Surv(t.vec,d.vec)~x)
          Zvalue = (lrsummary$obs - lrsummary$exp)[1]
          if(Zvalue <= 0){
            left_t.vec=t.vec[tmp];right_t.vec=t.vec[!tmp]
            left_d.vec=d.vec[tmp];right_d.vec=d.vec[!tmp]
            left_X.mat=X.mat[tmp,];right_X.mat=X.mat[!tmp,]
          }else{
            left_t.vec=t.vec[!tmp];right_t.vec=t.vec[tmp]
            left_d.vec=d.vec[!tmp];right_d.vec=d.vec[tmp]
            left_X.mat=X.mat[!tmp,];right_X.mat=X.mat[tmp,]
          }
        }
        #prepare for subtree
        #Set condition
        if(is.null(condition)){
          if(Zvalue <= 0){
            condition_left=paste0(selected.cov.names,"<=",selected.cut.value,sep="")
            condition_right=paste0(selected.cov.names,">",selected.cut.value,sep="")
          }else{
            condition_left=paste0(selected.cov.names,">",selected.cut.value,sep="")
            condition_right=paste0(selected.cov.names,"<=",selected.cut.value,sep="")
          }
        }else{
          if(Zvalue <= 0){
            condition_left=paste0(condition," & ",paste0(selected.cov.names,"<=",selected.cut.value))
            condition_right=paste0(condition," & ",paste0(selected.cov.names,">",selected.cut.value))
          }else{
            condition_left=paste0(condition," & ",paste0(selected.cov.names,">",selected.cut.value))
            condition_right=paste0(condition," & ",paste0(selected.cov.names,"<=",selected.cut.value))
          }
        }
        if(S.plot == TRUE){
          KM.split(t.vec,d.vec,X.mat,selected.cov.names,selected.cut.value)
        }
        return(
          list(NODE=data.frame(Information=c("Inner.node",
                                             Risk.score,
                                             selected.P.value,
                                             selected.cov.names,
                                             selected.cut.value,
                                             round(Zvalue,2)
          ),
          row.names = c("node_status:",
                        "Risk score:",
                        "P-value:",
                        "name:",
                        "cut_value:",
                        "Z-value")
          )
          ,Left=recursive(left_t.vec,left_d.vec,left_X.mat,
                          condition=condition_left,Risk.score=Risk.score+rank.weight,rank.weight = 0.1*rank.weight)
          ,Right=recursive(right_t.vec,right_d.vec,right_X.mat,
                           condition=condition_right,Risk.score = Risk.score-rank.weight,rank.weight = 0.1*rank.weight)
          )
        )
      }else{
        return(list(NODE=data.frame(Information=c("terminal node",
                                                  Risk.score,
                                                  selected.P.value,
                                                  n,
                                                  if(is.null(condition)){"NULL"}else{condition}
        ),
        row.names=c("node_status:",
                    "Risk score:",
                    "P-value:",
                    "samplesize:",
                    "condition:")
        )
        )
        )

      }
    }else{
      return(list(NODE=data.frame(Information=c("terminal node",
                                                Risk.score,
                                                n,
                                                if(is.null(condition)){"NULL"}else{condition}
      ),
      row.names=c("node_status:",
                  "Risk score:",
                  "samplesize:",
                  "condition:")
      )
      )
      )
    }
  }
  return(recursive(t.vec,d.vec,X.mat))
}
lichkeam/uni.survival.tree documentation built on Dec. 21, 2021, 10:46 a.m.