R/sc_hc.R

Defines functions SC_hc SC_hc_colorCode

Documented in SC_hc SC_hc_colorCode

#' SC_hc hierachical clustering of single cells
#' @param dataFile a tab delimited txt file of expression data, columns are cells, rows are genes.
#' @param baseName prefix name of resulting files
#' @param cuttree_k number of clusters to be generated by cutting the dendrogram. 
#' @export
SC_hc <- function(dataFile,baseName,cuttree_k=4){
  rpkm <- read.table(dataFile,sep="\t",header=T,row.names=1)
  log2rpkm <- apply(rpkm,c(1,2),function(x) if(x>1) log2(x) else 0) 
  rowmax <- apply(log2rpkm,1,max)
  log2rpkm <- log2rpkm[which(rowmax>0),]
  
  hc <- hclust(dist(t(log2rpkm)),method="ward")
  #pdf(paste(baseName,"_hc.pdf",sep=""),width=22)
  plot(hc)
  #dev.off()
  
  memb <- cutree(hc, k = cuttree_k)  
  write.table(memb,paste(baseName,"_hc_cell_cluster.txt",sep=""),sep="\t")
}

#' SC_hc_colorCode hierarchical clustering of single cells, leaves color-coded by cell type information.
#' @param dataFile a tab delimited txt file of expression data, columns are cells, rows are genes.
#' @param baseName prefix name of resulting files
#' @param cuttree_k number of clusters to be generated by cutting the dendrogram. cuttree_k can be an integer or NULL. If NULL, no clusters will be generated.
#' @param sampleFile a tab delimited txt file of cell annotation, the first column is cell ID, the second column is group ID.
#' @param width width of the resulting figure
#' @param height height of the resulting figure
#' @param iflog2 a boolean value indiciating if the expression data will be log2 transformed.
#' @param colorPalette if NULL, brewer.pal(n,"Set2") will be used, or colorPalette can be c("red","green","blue").
#' @param distMethod distMethod can be one of "pearson", "kendall", "spearman" or "Euclidean"
#' @import RColorBrewer
#' @export
#' @examples
#' \dontrun{
#' dataFile = "TPM_GSE60783_noOutlier_geneQC0.05anyGroup_CD4vsCD8DEG.txt";
#' SC_hc_colorCode(dataFile = dataFile,
#'                cuttree_k = 11,
#'                sampleFile= "sample_GSE60783_noOutlier.txt",
#'                width = 22, height = 10, iflog2 = TRUE,
#'                colorPalette = c("red","green","blue"))
#' }
SC_hc_colorCode <- function(dataFile,baseName=NULL,cuttree_k=NULL,sampleFile=NULL,width=22,height=10,iflog2=TRUE,colorPalette=NULL,distMethod="Euclidean"){
   if(is.null(baseName)){
      baseName <- sub(".txt","",dataFile)
   }
   
   rpkm <- read.table(dataFile,sep="\t",header=T,row.names=1)
   if(iflog2){
      log2rpkm <- apply(rpkm,c(1,2),function(x) if(x>1) log2(x) else 0)
   }else{
      log2rpkm <- rpkm
   }
   rowmax <- apply(log2rpkm,1,max)
   log2rpkm <- log2rpkm[which(rowmax>0),]

   if(distMethod=="Euclidean"){
      hc <- hclust(dist(t(log2rpkm)),method="ward")
   }else{
      data.cor<-cor(log2rpkm,method=distMethod);
      data.cor<-as.dist(1-data.cor); #since the algorithm wants a distance measure, the resulting correlation is trasformed in a distance
      hc <- hclust(data.cor,method="ward")
   }

   if(!is.null(cuttree_k)){
      memb <- cutree(hc, k = cuttree_k)
      write.table(data.frame(SampleID=names(memb),GroupID=memb),paste(baseName,"_hc_cell_cluster.txt",sep=""),sep="\t",row.names=FALSE)
   }

   d.den <- as.dendrogram(hc)
   
   if(is.null(sampleFile)){
   }else{
      sample <- read.table(sampleFile,header=T,sep="\t",check.names=F,comment.char="",row.names=1)
      if(is.null(colorPalette)){
        colorPalette <- brewer.pal(length(unique(sample[,1])),"Set2")
      }
      sample$color <- colorPalette[sample[,1]]
      if(!is.null(cuttree_k)){
        if(cuttree_k<10){
           colorPalette_cutree <- brewer.pal(cuttree_k,"Set1")
        }else{
           colorPalette_cutree <- c(brewer.pal(9,"Set1"),brewer.pal(cuttree_k-9,"Set2"))
        }
        sample$color_cutree <- colorPalette_cutree[memb[row.names(sample)]]
      }
      sample <- sample[as.character(colnames(log2rpkm)),]
     
      colorLabel <- function(n){
         if(is.leaf(n)){
            a <- attributes(n)
            attr(n, "nodePar") <- c(a$nodePar, list(lab.col=as.character(sample[a$label,"color"]))) 
            attr(n, "nodePar") <- c(a$nodePar, list(pch=15,col=as.character(sample[a$label,"color"]))) 
            if(!is.null(cuttree_k)){
               attr(n, "edgePar") <- c(a$edgePar, list(col=as.character(sample[a$label,"color_cutree"])))             
            }
         }
         n
      }
      d.den <- dendrapply(d.den, colorLabel)
   }   


   #pdf(paste(baseName,"_hc.pdf",sep=""),width=width,height=height)
   par(oma = c(1, 1, 1, 5))
   plot(d.den)
   if(!is.null(sampleFile)){
      legend("topright",legend=unique(sample[,1]),col=as.character(unique(sample[,"color"])),pch=15,bty="n")
      if(!is.null(cuttree_k)){
        legend("bottomright",legend=unique(memb),col=as.character(unique(sample[,"color_cutree"])),pch=20,bty="n")
      }
   }
   #dev.off()
}
Zouter/MPath documentation built on May 4, 2019, 2:31 p.m.