R/hclustTree.R

Defines functions hclust_tree

Documented in hclust_tree

#'Build the hierarchical clustering tree.
#'
#'Hierarchical clustering with Fisher's test p-values as distance matrix.
#'Also add feature coverage information for each node in the tree.
#'@param pinmat The incidence table generated by \code{findpins}.
#'@param mat_fdr The FDR matrix generated by \code{fdr_fisherPV}
#'@param mat_dist The dissmilarity based on Fisher's test p-values for hierarchical clustering.
#'@param hc_method Default: average
#'@return A hclust objects with new items added.
#'@export



hclust_tree <- function(pinmat, mat_fdr, mat_dist, hc_method = "average"){




  pinmat <- pinmat[rowSums(pinmat)<ncol(pinmat),,drop = F]


  ## Grow a tree and add multiple items to the standard hclust object
  ####################################################################
  hc <- hclust(as.dist(mat_dist), method = hc_method)

  #Leaf indices for each node, in the order of the original labels
  leafID_list <- vector(mode = "list", length = nrow(hc$merge))

  #Leaf lables for the node
  leaflabel_list <- vector(mode = "list",length = nrow(hc$merge))

  #Maximal pairwise FDR anywhere in the node
  maxfdr <- rep(NA, nrow(hc$merge))

  #Mean FDR for the node
  meanfdr<-rep(NA, nrow(hc$merge))

  #Number of leaves in the node
  nodesize <- rep(NA, nrow(hc$merge))

  #For each node and each feature(pin) determine the fraction of leaves in the node with the feature
  sharing <- matrix(NA, nrow = nrow(pinmat), ncol = nrow(hc$merge))

  #Mean number of features per leaf in a node
  complexity <- rep(NA, nrow(hc$merge))

  #compute the items defined above

  for(i in 1:nrow(hc$merge)){

    if(hc$merge[i, 1] < 0){
      leafID_list[[i]] <- (-hc$merge[i,1])}
    else{
      leafID_list[[i]] <- leafID_list[[hc$merge[i,1]]]}

    if(hc$merge[i, 2] < 0){
      leafID_list[[i]] <- c(leafID_list[[i]], (-hc$merge[i,2]))}
    else{
      leafID_list[[i]] <- c(leafID_list[[i]], leafID_list[[hc$merge[i,2]]])}

    leaflabel_list[[i]] <- hc$labels[leafID_list[[i]]]

    nodesize[i] <- length(leafID_list[[i]])

    maxfdr[i]<- max(mat_fdr[leafID_list[[i]], leafID_list[[i]]][upper.tri(mat_fdr[leafID_list[[i]], leafID_list[[i]]])])

    meanfdr[i]<- mean(mat_fdr[leafID_list[[i]], leafID_list[[i]]][upper.tri(mat_fdr[leafID_list[[i]], leafID_list[[i]]])])

    complexity[i] <- mean(colSums(pinmat[,leaflabel_list[[i]]]))

    sharing[,i] <- rowMeans(pinmat[,leaflabel_list[[i]]])
  }



  hc$maxfdr <- maxfdr
  hc$meanfdr <- meanfdr
  hc$nodesize <- nodesize
  hc$leafID_list <- leafID_list
  hc$leaflabel_list <- leaflabel_list
  hc$sharing <- sharing
  hc$complexity <- complexity


  #return the hclust object which have new features added & distance matrix based on log10(fisherPV)
  return(hc)

}
JunyanSong/SCclust documentation built on April 16, 2022, 8:44 p.m.