R/hclust_tree.R

Defines functions hclust_tree

Documented in hclust_tree

#' Build the hierarchical clustering tree.
#'
#' Hierarchical clustering with Fisher's test p-values as distance matrix.
#' Also add feature coverage information for each node in the tree.
#' @param pinmat The incidence table generated by \code{calc_pinmat}.
#' @param mat_fdr The FDR matrix generated by \code{fisher_fdr}
#' @param mat_dist The dissmilarity based on Fisher's test p-values for 
#'        hierarchical clustering.
#' @param hcmethod Default: average
#' @return A hclust objects with new items added.
#' @export
hclust_tree <- function(pinmat, mat_fdr, mat_dist, hcmethod = "average"){

  # pinmat <- pinmat[rowSums(pinmat)<ncol(pinmat),,drop = F]

  # Grow a tree and add multiple items to the standard hclust object
  hc <- hclust(as.dist(mat_dist), method = hcmethod)

  # Leaf indices for each node, in the order of the original labels
  leaflist <- vector(mode = "list", length = nrow(hc$merge))
  # Leaf lables for the node
  labellist <- vector(mode = "list",length = nrow(hc$merge))
  # Maximal pairwise FDR anywhere in the node
  mergefdr <- rep(NA, nrow(hc$merge))
  # Mean FDR for the node
  meanfdr<-rep(NA, nrow(hc$merge))
  # Number of leaves in the node
  nodesize <- rep(NA, nrow(hc$merge))
  # For each node and each feature(pin) determine the fraction of leaves in the 
  # node with the feature
  sharing <- matrix(NA, nrow = nrow(pinmat), ncol = nrow(hc$merge))
  # Mean number of features per leaf in a node
  complexity <- rep(NA, nrow(hc$merge))

  for(i in 1:nrow(hc$merge)){

    if(hc$merge[i, 1] < 0){
      leaflist[[i]] <- (-hc$merge[i,1])}
    else{
      leaflist[[i]] <- leaflist[[hc$merge[i,1]]]}

    if(hc$merge[i, 2] < 0){
      leaflist[[i]] <- c(leaflist[[i]], (-hc$merge[i,2]))}
    else{
      leaflist[[i]] <- c(leaflist[[i]], leaflist[[hc$merge[i,2]]])}

    labellist[[i]] <- hc$labels[leaflist[[i]]]
    complexity[i] <- mean(colSums(pinmat[,labellist[[i]]]))
    sharing[,i] <- rowMeans(pinmat[,labellist[[i]]])
    nodesize[i] <- length(leaflist[[i]])
    mergefdr[i]<- max(mat_fdr[leaflist[[i]], 
            leaflist[[i]]][upper.tri(mat_fdr[leaflist[[i]], leaflist[[i]]])])
    meanfdr[i]<- mean(mat_fdr[leaflist[[i]], 
            leaflist[[i]]][upper.tri(mat_fdr[leaflist[[i]], leaflist[[i]]])])
  }

  hc$mergefdr <- mergefdr
  hc$meanfdr <- meanfdr
  hc$nodesize <- nodesize
  hc$leaflist <- leaflist
  hc$labellist <- labellist
  hc$sharing <- sharing
  hc$complexity <- complexity


  # return the hclust object which have new features added
  return(hc)
}
seqpipe/SCclust documentation built on May 4, 2021, 11:56 a.m.