R/partition_tree.R

Defines functions build_tree

Documented in build_tree

#' Using non-backtracking strategy for stopping rule and build the tree
#'
#' \code{build_tree} returns a dataframe
#'
#' @param f It is the adjencency matrix
#' @param xi.loc.labels lists of index of each cluster
#' @param ncl number of clusters
#' @param cl.labels a vector of index of individuals
#' @param method Partition method
#' @return Lists of cluster results
#' @examples
#' cluster_result <- build_tree(adj,xi.loc.labels=list(), ncl=0, cl.labels=1:339,n.min=25,D=NULL,method)

build_tree <- function(f,xi.loc.labels, ncl, cl.labels,n.min=25,D=NULL,method){
  nisol = which(rowSums(f) > 0)
  isol = which(rowSums(f) == 0)
  cl.labels.full <- cl.labels
  ## sanity check -- do not start if there are too many isolated nodes
  if((length(nisol)<=8)||(length(isol)>=5*length(nisol))||(length(nisol)<2*n.min)){
    ncl = ncl + 1
    xi.loc.labels[[ncl]] = cl.labels
    tree.path <- c("",as.character(cl.labels))
    mod.path <- c(0,rep(0,length(cl.labels)))
    if(length(isol)>0) tree.path <- c(tree.path,as.character(cl.labels[isol]))
    print('Too few connected nodes, not even started!')
    return(list(xi.loc.labels = xi.loc.labels, ncl = ncl,tree.path=tree.path,mod.path=mod.path))
  }
  #print(paste(length(isol),"isolated nodes",cl.labels[isol]))
  all.deg = rowSums(f)
  f = f[nisol, nisol] ### only focus on non-isolated nodes - isolated nodes with be attached to the smaller child
  cl.labels.full <- cl.labels
  all.deg = all.deg[nisol]
  cl.labels = cl.labels[nisol]
  n1 = dim(f)[1]

  K = 2
  d <- colSums(f)
  n <- nrow(f)
  I <- as(diag(rep(1,n)),"dgCMatrix")
  D <- as(diag(d),"dgCMatrix")
  r <- sqrt(sum(d^2)/sum(d)-1)
  B <- as(matrix(0,nrow=2*n,ncol=2*n),"dgCMatrix")
  B[(n+1):(2*n),1:n] <- -1*I
  B[1:n,(n+1):(2*n)] <- D-I
  B[(n+1):(2*n),(n+1):(2*n)] <- f
  ss <- Re(eigs(B,k=2,which="LM")$values)
  split.flag <- sum(abs(ss)>r)>1
  if(split.flag) {
    #f.adj <- f + 3*mean(all.deg)/nrow(f)
    #f.adj <- f + 0.1/nrow(f)
    g <- graph_from_adjacency_matrix(as.matrix(f), weighted=T, mode="undirected")
    g <- simplify(g)
    adj <- get.adjacency(g,type = "both",attr = "weight")
    if(method == "eigen"){
      adj.adj <- adj+as.numeric(0.1)/nrow(adj)
      eval <- eigs_sym(adj.adj,2,which="LM")
      clustering <- ifelse(eval$vectors[,2]<0,1,2)
      adj.adj <- NULL
    } else if (method == "kmeans2") {
    deg.adj <- diag(strength(g))+diag(rep(0.1,nrow(f)))
    l_matrix <- solve(sqrtm(deg.adj))%*%adj%*%solve(sqrtm(deg.adj))
    adj <- NULL
    deg.adj <- NULL
    eval <- eigs_sym(l_matrix,10,which = "LM")
    eval.adj <- eval$vectors%*%diag(abs(eval$values))
    embed.Y <- data.frame(eval.adj[,1:2])
    #embed.Y <- data.frame(embed.s$X[,1:2])
    row.names(embed.Y) <- V(g)$name
    #clustering <- kmeans(embed.Y,centers=2,iter.max=30,nstart=10)$cluster
    clustering <- pam(embed.Y,k=2)$clustering
    } else if (method == "kmeansweight") {
    deg.adj <- diag(strength(g))+diag(rep(0.1,nrow(f)))
    l_matrix <- solve(sqrtm(deg.adj))%*%adj%*%solve(sqrtm(deg.adj))
    adj <- NULL
    deg.adj <- NULL
    eval <- eigs_sym(l_matrix,10,which = "LM")
    eval.adj <- eval$vectors%*%diag(abs(eval$values))
    embed.slist <- abs(eval$values[1])
    m=1
    while(embed.slist<as.numeric(0.95)*sum(abs(eval$values))){
      m=m+1
      embed.slist <- embed.slist+abs(eval$values[m])
    }
    embed.Y <- data.frame(eval.adj[,1:m])
    #embed.Y <- data.frame(embed.s$X[,1:2])
    row.names(embed.Y) <- V(g)$name
    #clustering <- kmeans(embed.Y,centers=2,iter.max=30,nstart=10)$cluster
    clustering <- pam(embed.Y,k=2)$clustering
    }
  
    clus = list(clustering=clustering)
    xi.f = clus$clustering
    xi.labels = lapply(1:2, function(x){which(xi.f == x)})
    smaller.cluster <- xi.labels[[which.min(sapply(xi.labels,length))]]
    f1 <- f[smaller.cluster,smaller.cluster]
    a1.labels <- cl.labels[smaller.cluster]
    if(length(dim(f1)) > 0) {
      n1 <- dim(f1)[1]
    } else if(length(f1) > 0) { ### case when only 1 node is in this cluster, make it 2, so the later on rank check code still works
      n1 <- 1
    } else {
      n1 = 0
    }

    if(n1 > 2*n.min) { ## only do further clustering on cluster larger  2*n.min
      res = build_tree(f1,xi.loc.labels, ncl, a1.labels,n.min,D=D-1,method)
      xi.loc.labels = res$xi.loc.labels
      ncl = res$ncl
      L.tree.path <- res$tree.path
      if(length(isol)>0){
        xi.loc.labels[[ncl]] = c(xi.loc.labels[[ncl]],cl.labels.full[isol]) ### attached the isolated nodes in this level with the clusters under the smaller split

        path.head <- L.tree.path[length(L.tree.path)]
        path.head <- gsub('[[:digit:]]+', '', path.head)
        iso.path <- paste(path.head,cl.labels.full[isol],sep="")
        L.tree.path <- c(L.tree.path,iso.path)
      }

      L.mod.path <- res$mod.path

    } else {
      ncl = ncl + 1
      xi.loc.labels[[ncl]] = a1.labels
      if(length(isol)>0) xi.loc.labels[[ncl]] = c(cl.labels.full[isol],xi.loc.labels[[ncl]])
      L.tree.path <- as.character(xi.loc.labels[[ncl]])
      L.mod.path <- rep(0,length(xi.loc.labels[[ncl]]))
      #print('Too small left cluster, Branch End')
    }
    f2 = f[-smaller.cluster, -smaller.cluster]

    a2.labels = cl.labels[-smaller.cluster]
    if(length(dim(f2)) > 0) {
      n1 <- nrow(f2)

    } else if(length(f2) > 0) {
      n1 <- 1
    } else {
      n1 <- 0
    }
    if(n1 > 2*n.min) {
      res = build_tree(f2,xi.loc.labels, ncl, a2.labels,n.min,D=D-1,method)
      xi.loc.labels = res$xi.loc.labels
      R.tree.path <- res$tree.path
      R.mod.path <- res$mod.path
      ncl = res$ncl

    } else {
      ncl = ncl + 1
      xi.loc.labels[[ncl]] = a2.labels
      R.tree.path <- as.character(a2.labels)
      R.mod.path <- rep(0,length(a2.labels))
      #print('Too small right cluster, Branch End')
    }
    L.tree.path <- paste("L",L.tree.path,sep="/")
    R.tree.path <- paste("R",R.tree.path,sep="/")
    tree.path <- c("",L.tree.path,R.tree.path)

  } else {
    ncl = ncl + 1
    xi.loc.labels[[ncl]] = cl.labels
    if(length(isol)>0) xi.loc.labels[[ncl]] = c(cl.labels.full[isol],cl.labels)
    tree.path <- c("",as.character(cl.labels))
    if(length(isol)>0) tree.path <- c(tree.path,as.character(cl.labels.full[isol]))
    #print('One cluster, Branch End, not even started!')
  }
  return(list(xi.loc.labels = xi.loc.labels, ncl = ncl,tree.path=tree.path))
}
verasiwei/biHSBM documentation built on Feb. 26, 2020, 12:52 a.m.