R/rsgcc.R

Defines functions onegcc gcc.corfinal cor.pair cor.matrix gcc.heatmap .check.matrix .check.variable .cor_all adjacencymatrix

Documented in adjacencymatrix cor.matrix cor.pair gcc.corfinal gcc.heatmap onegcc

###################################################################################
##Note: this script contains function for correlation calcuation with snowfall 
##      package in R for parallell computing.
##Author: Chuang Ma
##Date: 2012-02-16
##################################################################################


if( !require(biwt)) install.packages("biwt")
require(biwt)

if( !require(parmigene)) install.package("parmigene")
library(parmigene)




#########################################################################
##compute Gini correlation by using the rank informaiton of x and the 
## value informaiton of y
########################################################################
onegcc <- function(x, y) {
   
  #rank function, get y[m] and y(m) with the rank information of x (first line)
  getrank <- function(datamatrix) {
    if( dim(datamatrix)[1] != 2 ) {
      stop("Error: the row num of datamatrix must be 2")
    }
    
    #use the order information of X
    OrderIndex <- order( datamatrix[1,], decreasing = FALSE )
    SortGenePair <- datamatrix[, OrderIndex]
    Sort2By1 <- SortGenePair[2,]
    return( list(Sort2By1 = Sort2By1, 
                 Sort2By2 = sort(datamatrix[2,], decreasing = FALSE ) 
                 ) )
  }#end getrank
  
  ##compute gcc with weight vector and y[m] and y(m)
  gcc <- function( weightvec, vectsort, vectselfsort) {
    Sum1 <- sum(weightvec*vectsort)
    Sum2 <- sum(weightvec*vectselfsort)
    if( Sum2 == 0 ) {
      cat("\n", x, "\n", y, "\n")
      cat("Warning: the Denominator is ZRRO, the value of one variable is consistent.")
      gcccor <- 0
    }else {
      gcccor <- Sum1/Sum2
    }
    return(gcccor)
  }
 
  ######################################################
  ##for gini correlation   
  ##generate weight vector
  Length <- length(x)
  Wt <- t(2*seq(1, Length, by = 1) - Length - 1)

  ##for gcc.rankx
  GenePairXY <- t(matrix( c(x,y), ncol = 2))
  SortYlist <- getrank(GenePairXY)
  gcc.rankx <- gcc(Wt, SortYlist$Sort2By1, SortYlist$Sort2By2)
    
  return(gcc.rankx)
  
}




##########################################################################
##get final correlation and p-value for GCC
##input gcccor is the output of allcor function for GCC
##########################################################################
gcc.corfinal <- function( gcccor ) {
  
  ##if gcccor has p-value informaiton, select correlation with p-value
  if( is.numeric(gcccor$gcc.rankx.pvalue) & is.numeric(gcccor$gcc.ranky.pvalue) ) {
    fpvalue <- gcccor$gcc.rankx.pvalue
    fgcc <- gcccor$gcc.rankx
    if( gcccor$gcc.ranky.pvalue < fpvalue ) {
      fpvalue <- gcccor$gcc.ranky.pvalue
      fgcc <- gcccor$gcc.ranky
    }
  }else {  ##no p-value, we selected the gcc with absolute max vlaue
    fpvalue <- NA
    x <- c(gcccor$gcc.rankx, gcccor$gcc.ranky)
    fgcc <- x[ which( abs(x) == max(abs(x)) ) ]
    if( length(fgcc) > 1 ) { #check length
#       cat("Warnning: the correlation coefficients generated by GCC method are the same after abs() operation.", 
#           gcccor$gcc.rankx, gcccor$gcc.ranky,
#           "In current version of Rgcc, only the first one is selected...")
      fgcc <- fgcc[1]
    }
  }
    
    return( list(gcc.fcor = fgcc, gcc.fpvalue = fpvalue))
}


cor.pair <- function( idxvec,
                      GEMatrix,
                      rowORcol = c("row", "col"),  
                      cormethod = c("GCC", "PCC", "SCC", "KCC", "BiWt"), 
                      pernum = 0, 
                      sigmethod = c("two.sided", "one.sided") ) 
{


#########################################################################
##compute Gini correlation by using the rank informaiton of x and the 
## value informaiton of y
########################################################################
onegcc <- function(x, y) {
   
  #rank function, get y[m] and y(m) with the rank information of x (first line)
  getrank <- function(datamatrix) {
    if( dim(datamatrix)[1] != 2 ) {
      stop("Error: the row num of datamatrix must be 2")
    }
    
    #use the order information of X
    OrderIndex <- order( datamatrix[1,], decreasing = FALSE )
    SortGenePair <- datamatrix[, OrderIndex]
    Sort2By1 <- SortGenePair[2,]
    return( list(Sort2By1 = Sort2By1, 
                 Sort2By2 = sort(datamatrix[2,], decreasing = FALSE ) 
                 ) )
  }#end getrank
  
  ##compute gcc with weight vector and y[m] and y(m)
  gcc <- function( weightvec, vectsort, vectselfsort) {
    Sum1 <- sum(weightvec*vectsort)
    Sum2 <- sum(weightvec*vectselfsort)
    if( Sum2 == 0 ) {
      cat("\n", x, "\n", y, "\n")
     cat("Warning: the Denominator is ZRRO, the value of one variable is consistent.")
     gcccor <- 0
    }else {
      gcccor <- Sum1/Sum2
    }
    return(gcccor)
  }
 
  ######################################################
  ##for gini correlation   
  ##generate weight vector
  Length <- length(x)
  Wt <- t(2*seq(1, Length, by = 1) - Length - 1)

  ##for gcc.rankx
  GenePairXY <- t(matrix( c(x,y), ncol = 2))
  SortYlist <- getrank(GenePairXY)
  gcc.rankx <- gcc(Wt, SortYlist$Sort2By1, SortYlist$Sort2By2)
    
  return(gcc.rankx)
  
}



  if(!is.vector(idxvec) | length(idxvec) != 2) {
    stop("Error: idxvec must be a vector with two elements indicating the indexs(rows) in GEMatrix")
  }
  if( class(GEMatrix) != "matrix" ) {
    stop("Error: GEMatrix should be a numeric data matrix")
  }

  if( rowORcol == "row" ) {
    x1 <- GEMatrix[idxvec[1],]
    y1 <- GEMatrix[idxvec[2],]
  }else if(rowORcol == "col") {
    x1 <- GEMatrix[,idxvec[1]]
    y1 <- GEMatrix[,idxvec[2]]
  }else {
    stop("Error: rowORcol must be \"row\" or \"col\"")
  }
  
  ##function for all considered correlation methods
  getcor <- function(g1, g2, cormethod ) {
    if( cormethod == "PCC") { return(cor.test(g1, g2, method="pearson")$estimate) }
    else if( cormethod == "SCC" ) { return(cor.test(g1, g2, method="spearman")$estimate) }
    else if( cormethod == "KCC" ) { return(cor.test(g1, g2, method="kendall")$estimate) }
    else if( cormethod == "BiWt" ){ return(biwt.cor(t(matrix(c(g1,g2), ncol=2)), output="vector")[1]) }
    else if( cormethod == "GCC" ){
      return(list( gcc.rankx = onegcc(g1,g2), gcc.ranky = onegcc(g2,g1) ) )
    }
  }#end getcor 
  
  
  ##get pvalue for permutation test
  getpvalue <- function( percorvec, pernum, realcor, sigmethod ) {
    pvalue <- length(which(percorvec >= realcor))/pernum
    if( pvalue == 0 ) pvalue <- 1.0/pernum 
    if( pvalue > 0.5 ) pvalue <- 1.0 - pvalue
    if( sigmethod == "two.sided" ) {
      pvalue <- 2*pvalue
    }
    return(pvalue)
  }#end getpvalue
  
  
  #check cormethod
  if( length(cormethod) > 1 ) {
    stop("Error: length of cormethod must be of length 1")
  }
  
  #check vector
  if( !is.vector(x1) || !is.vector(y1) ) {
    stop("Error: input two vectors for gcc.corpair")
  }
  
   #check numeric
  if( !is.numeric(x1) || !is.numeric(y1) ) {
    stop("Error: x should be numeric")
  }
  
   #check NA
  if( length(which(is.na(x1) == TRUE)) > 0 || length(which(is.na(y1) == TRUE)) > 0 ){
    stop("Error: There are Na(s) in x")
  }

  #check length
  if( length(x1) != length(y1) ) {
     stop("Error: the lengths of each row in x are different.\n")
  }
  
  realcor <- getcor(x1, y1, cormethod)
  
  ##check pernum for permutation or output
  if( pernum <= 0 ) {
    
    if( cormethod == "GCC") {
      return( list(gcc.rankx=realcor$gcc.rankx, gcc.ranky=realcor$gcc.ranky, gcc.rankx.pvalue = NA, gcc.ranky.pvalue = NA) )
    }else {
      return( list(cor = realcor, pvalue = NA))
    }
  }else {
    ##generate matrix for permuted correlation
    pGCCMatrix <- matrix(0, nrow = pernum, ncol = 2)
    colnames(pGCCMatrix) <- c("gcc.rankx", "gcc.ranky")
    rownames(pGCCMatrix) <- paste("permut", seq(1,pernum, by=1), sep="")
    
    GenePairXY <- t(matrix( c(x1, y1), ncol = 2))
    pGenePairXY <- GenePairXY
    Length <- length(x1)
 
     for( i in 1:pernum ) {

       ##get system time for seed and then generate random index
       curtime <- format(Sys.time(), "%H:%M:%OS4")
       XXX <- unlist(strsplit(curtime, ":"))
       curtimeidx <- (as.numeric(XXX[1])*3600 + as.numeric(XXX[2])*60 + as.numeric(XXX[3]))*10000
       set.seed( curtimeidx )
       TT = sort(runif(Length),index.return=TRUE)$ix
       pGenePairXY[1,] <- GenePairXY[1,TT]
       if( cormethod == "GCC" ) { 
         cortmp <- getcor( pGenePairXY[1,], pGenePairXY[2,], cormethod )
         pGCCMatrix[i,1] <- cortmp$gcc.rankx
         pGCCMatrix[i,2] <- cortmp$gcc.ranky
        }else {
         pGCCMatrix[i,1] <- getcor( pGenePairXY[1,], pGenePairXY[2,], cormethod )
        }
     }#end for i
     
    #compute p-value
    if( cormethod == "GCC" ) {
      return( list( gcc.rankx = realcor$gcc.rankx, 
                    gcc.ranky = realcor$gcc.ranky, 
                    gcc.rankx.pvalue = getpvalue(pGCCMatrix[,1], pernum, realcor$gcc.rankx, sigmethod), 
                    gcc.ranky.pvalue = getpvalue(pGCCMatrix[,2], pernum, realcor$gcc.ranky, sigmethod)) )
    }else {
      return( list (cor = realcor,
                    pvalue = getpvalue(pGCCMatrix[,1], pernum, realcor, sigmethod)) )
      
    }
  }

}#end allcor



##############################################################################
##Calcluate correlations for variables in a matrix with parallel computing
##x: numeric matrix
##asym: If ture, output gccx and gccy; or else, output max gcc(or gcc with low p value)
##style: "all.pairs", "pairs.between", "adjacent.pairs", "one.pair"
##var1.id
##var2.id
##pernum
##sigmethod
##Date: 2012-02-16
##############################################################################
cor.matrix <- function( GEMatrix, 
                        cpus = 1,
                        cormethod = c("GCC", "PCC", "SCC", "KCC", "BiWt"),
                        style = c("all.pairs", "pairs.between", "adjacent.pairs", "one.pair"),
                        var1.id = NA,
                        var2.id = NA,
                        pernum = 0, 
                        sigmethod = c("two.sided", "one.sided"),
                        output = c("matrix", "paired")     
                           ) {
  
  if( cpus > 1 ) {
    require(snowfall)
  }
  
  #check cormethod
  if( length( cormethod ) > 1 ) {
    cat("Warning: one correlation method should be specified. Default:GCC")
  }
  
  #check style
  if( length(style) > 1 ) {
    cat( "Warning: one style should be specified. Default: all.pairs")
  }
  
  #check sigmethod 
  if( pernum > 0 & length( sigmethod ) > 1 ) {
    sigmethod = "two.sided"
  }
  if( pernum == 0 ) {
    sigmethod <- "two.sided"
  }
    
  #check whether matrix
  if( !is.matrix( GEMatrix) ) {
    stop("Error: GEMatrix in cor.matrix function is not matrix")
  }
  
  if( !is.numeric( GEMatrix) ){
    stop("Error: GEMatrix is not numeric")
  }
  
  if( length(rownames(GEMatrix)) == 0 ) { #no rownames
    rownames(GEMatrix) <- seq(1,dim(GEMatrix)[1], by=1)
  }
  
  VariableNum <- nrow(GEMatrix)
  SampleSize <- ncol(GEMatrix)
  if( VariableNum <= 1 || SampleSize <= 1 ){
    stop("Error:the number of variable is less than 2, or the number of observation is less than 2 ")
  }
  
  
  if( style == "one.pair" ) {
    if( length(var1.id) != 1 || length(var2.id) != 1 || is.na(var1.id) == TRUE || is.na(var2.id) == TRUE) {
      stop("Error: Not define the var1.id or var1.id")
    }
  }
  
  if( style == "adjacent.pairs") {
    var1.id <- seq(1, (VariableNum-1), by=1)
    var2.id <- var1.id + 1  
  }
  
  #for task matrix
  if( style == "one.pair" || style == "adjacent.pairs") {
    if( length(var1.id) != length(var2.id) ) {
      stop("Error: var1.id and var2.id should be vectors with the same length")
    }
    taskmatrix <- matrix(c(var1.id, var2.id), ncol = 2)
  }#end if style
    
  if( style == "pairs.between" ) {
    if( length( which(is.na(var1.id) == TRUE) ) > 0 | length( which(is.na(var1.id) == TRUE) ) > 0 ){
      stop("Error: no variable IDs are given")
    }
    if( length( which((var1.id != var2.id) == TRUE ) ) > 0 )  {
      stop("Error: var1.id should be the same with var2.id for the pairs.between style")
    }
    if( length(which(is.numeric(var1.id) == FALSE)) > 0 | length(which(is.numeric(var2.id) == FALSE)) > 0 ){
      stop("Error:var1.id and var2.id should be numeric vector")
    }
  }
  if( style == "all.pairs" ) {
    var1.id <- seq(1, dim(GEMatrix)[1], by=1)
    var2.id <- var1.id
  }
    
  if( style == "pairs.between" || style == "all.pairs") {
    
    CurLen <- length(var1.id)
    taskmatrix <- matrix(0, nrow = length(var1.id)*(length(var1.id)+1)/2, ncol = 2) 
    kk = 0
    for( i in 1:length(var1.id)) {
      j <- 1
      while( j <= i ) {
        kk <- kk + 1
        taskmatrix[kk,] <- c(i,j)
        j <- j + 1
      }#end while j
    }#end for i
  }#end style

  ##implement apply  
  if( cpus == 1 | cormethod == "BiWt") {
      results <- apply(taskmatrix, 1, cor.pair, GEMatrix = GEMatrix, rowORcol = "row", cormethod = cormethod, pernum = pernum, sigmethod = sigmethod)
  }else {
      sfInit(parallel=TRUE, cpus=cpus)
      print(sprintf('%s cpus to be used', sfCpus()))
    #  if( cormethod == "GCC") sfExport('onegcc')
    #  if( cormethod == "BiWt") sfExport('biwt.cor')
      results <- sfApply(taskmatrix, 1, cor.pair, GEMatrix = GEMatrix, rowORcol = "row", cormethod = cormethod, pernum = pernum, sigmethod = sigmethod)
      sfStop()
  } 

 ##get final results
 if( output == "paired") {
   kk <- 0
   corpvalueMatrix <- matrix(NA, nrow = dim(taskmatrix)[1], ncol = 4)  
   for( i in 1:dim(taskmatrix)[1] ) {
     if( taskmatrix[i,1] == taskmatrix[i,2] ) next
     kk <- kk + 1
     corpvalueMatrix[kk,1:2] <- rownames(GEMatrix)[taskmatrix[i,]]
     if( cormethod == "GCC" ) {
        fGCC <- gcc.corfinal(results[i][[1]])
        corpvalueMatrix[kk,3] <- fGCC$gcc.fcor
        corpvalueMatrix[kk,4] <- fGCC$gcc.fpvalue
      }else {
        corpvalueMatrix[kk,3] <- results[i][[1]]$cor
        corpvalueMatrix[kk,4] <- results[i][[1]]$pvalue
      }
   }#end for i
   return( corpvalueMatrix[1:kk,] )
 }else {
     ##get final results
    UniqueRow <- sort(unique(taskmatrix[,1]))
    UniqueCol <- sort(unique(taskmatrix[,2]))
    corMatrix <- matrix(0, nrow = length(UniqueRow), ncol = length(UniqueCol)) 
    rownames(corMatrix) <- rownames(GEMatrix)[UniqueRow]
    colnames(corMatrix) <- rownames(GEMatrix)[UniqueCol]
    pvalueMatrix <- corMatrix
    pvalueMatrix[] <- NA

    for( i in 1:dim(taskmatrix)[1] ) {
       rowidx <- which(UniqueRow == taskmatrix[i,1])
       colidx <- which(UniqueCol == taskmatrix[i,2])
       
       if( cormethod == "GCC" ) {
          fGCC <- gcc.corfinal(results[i][[1]])
          corMatrix[rowidx,colidx] <- fGCC$gcc.fcor
          pvalueMatrix[rowidx,colidx] <- fGCC$gcc.fpvalue  
        }else {
          corMatrix[rowidx,colidx] <- results[i][[1]]$cor
          pvalueMatrix[rowidx,colidx] <- results[i][[1]]$pvalue
        }
       
       if( style == "pairs.between" | style == "all.pairs" ) {
          corMatrix[colidx, rowidx] <- corMatrix[rowidx,colidx]
          pvalueMatrix[colidx,rowidx] <- pvalueMatrix[rowidx,colidx]
       }
    }#end for i

    return( list(corMatrix = corMatrix, pvalueMatrix = pvalueMatrix) )
  }#end else

}#end function



#############################################################################
##This function plots HeatMap with different similarity measures (1-CorCoef) 
##Here CorCoef could be GCC (Gini correlation coefficient), 
##PCC (Pearson product-moment correlation coefficient), 
##SCC (Spearman's rank correlation coefficient), 
##KCC (Kendall tau correlation coefficient) 
##and BiWt(correlation estimates based on Tukey's biweight M-estimator)
##To set other parameters, please ref HeatMap.2 function in gplots package.
#############################################################################

 
gcc.heatmap <- function(x,
                           
                        cpus = 1,
                        
                        ## correlation method
                        method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"),
                        
                        ## similarity method
                        distancemethod = c("Raw", "Abs", "Sqr"),
                           
                        #hclustfun = hclust,
                        clustermethod = c("complete", "average", "median", "centroid", "mcquitty", "single", "ward"),
                        
                        rowhcdata = NULL,
                        colhcdata = NULL,
 
                           
                        keynote = "FPKM",
                        
                        ## dendrogram control
                        symm = FALSE,
                        Rowv = TRUE,
                        Colv= if(symm)"Rowv" else TRUE,
                        dendrogram = c("both","row","column","none"),
                      
                        ## data scaling
                        scale = c("none","row", "column"),
                        na.rm=TRUE,

                        ## image plot
                        revC = identical(Colv, "Rowv"),
                        add.expr,

                        ## mapping data to colors
                        breaks = 16,
                        quanbreaks = TRUE,
                        symbreaks=min(x < 0, na.rm=TRUE) || scale!="none",

                       ## colors
                       colrange= c("green", "black", "red"),

                       ## block sepration
                       colsep,
                       rowsep,
                       sepcolor="white",
                       sepwidth=c(0.05,0.05),

                       ## cell labeling
                       cellnote,
                       notecex=1.0,
                       notecol="cyan",
                       na.color=par("bg"),

                       ## level trace
                       trace=c("none", "column","row","both"),
                       tracecol="cyan",
                       hline=median(breaks),
                       vline=median(breaks),
                       linecol=tracecol,

                       ## Row/Column Labeling
                       margins = c(5, 5),
                       ColSideColors,
                       RowSideColors,
                       cexRow = 0.2 + 1/log10(dim(x)[1]),
                       cexCol = 0.2 + 1/log10(dim(x)[2]),
                       labRow = NULL,
                       labCol = NULL,

                       ## color key + density info
                       key = TRUE,
                       keysize = 0.65,
                       density.info=c("none","histogram","density"),
                       denscol=tracecol,
                       symkey = min(x < 0, na.rm=TRUE) || symbreaks,
                       densadj = 0.25,

                       ## plot labels
                       main = NULL,
                       xlab = NULL,
                       ylab = NULL,

                       ## plot layout
                       lmat = NULL,
                       lhei = NULL,
                       lwid = NULL,

                       ## extras
                       ...
                      )
{
  
  cat("GE matrix start to be clustered:", dim(x), "\n")
              
  scale01 <- function(x, low = min(x), high = max(x)) {  x <- (x - low)/(high - low); x }
  
  retval <- list()
  scale <- if(symm & missing(scale)) {  "none" }
  else { match.arg(scale) }
  
  dendrogram <- match.arg(dendrogram)
  trace <- match.arg(trace)
  density.info <- match.arg(density.info)
  
  if (length(colrange) == 1 & is.character(colrange)) {
      colrange <- get(colrange, mode = "function")
      col <- colrange
  }
  if (!missing(breaks) & (scale != "none")) 
      warning("Using scale=\"row\" or scale=\"column\" when breaks are", 
           "specified can produce unpredictable results.", "Please consider using only one or the other.")
  if (is.null(Rowv) | is.na(Rowv)) { Rowv <- FALSE }
  
  
  if (is.null(Colv) | is.na(Colv)) { Colv <- FALSE
  }else if (Colv == "Rowv" & !isTRUE(Rowv)) {   Colv <- FALSE   }
    
  
  if (length(di <- dim(x)) != 2 || !is.numeric(x)) {stop("`x' must be a numeric matrix") }
  nr <- di[1]
  nc <- di[2]
  if(nr <= 1 | nc <= 1) {stop("`x' must have at least 2 rows and 2 columns")}
  cat("nr = ", nr, "\n")
  cat("nc = ", nc, "\n")
  
  if(!is.numeric(margins) | length(margins) != 2) {stop("`margins' must be a numeric vector of length 2")}
  if(missing(cellnote)) { cellnote <- matrix("", ncol = ncol(x), nrow = nrow(x)) }
  if(!inherits(Rowv, "dendrogram")) {
      if(((!isTRUE(Rowv)) | (is.null(Rowv))) & (dendrogram %in% c("both", "row"))) {
            if (is.logical(Colv) & (Colv)) { dendrogram <- "column" }
            else { dedrogram <- "none" }
            warning("Discrepancy: Rowv is FALSE, while dendrogram is `", dendrogram, "'. Omitting row dendogram.")
        }
    }
    if(!inherits(Colv, "dendrogram")) {
        if(((!isTRUE(Colv)) | (is.null(Colv))) & (dendrogram %in% c("both", "column"))) {
            if (is.logical(Rowv) & (Rowv)) {dendrogram <- "row" }
            else { dendrogram <- "none" }
            warning("Discrepancy: Colv is FALSE, while dendrogram is `", dendrogram, "'. Omitting column dendogram.")
        }
    }
  
    if (inherits(Rowv, "dendrogram")) {
        ddr <- Rowv
        rowInd <- order.dendrogram(ddr)
    }
    else if (is.integer(Rowv)) {
      if( missing(rowhcdata) | is.null(rowhcdata) ) {
        hcr <- gcc.hclust( x, cpus = cpus, method = method, distancemethod = distancemethod, clustermethod = clustermethod)
      }else {
        hcr <- rowhcdata
      }
      ddr <- as.dendrogram(hcr$hc)
      ddr <- reorder(ddr, Rowv)
      rowInd <- order.dendrogram(ddr)
      if (nr != length(rowInd)) {stop("row dendrogram ordering gave index of wrong length") }
    }
    else if (isTRUE(Rowv)) {
        Rowv <- rowMeans(x, na.rm = na.rm)
        if( missing(rowhcdata) | is.null(rowhcdata) ) {
          hcr <- gcc.hclust( x, cpus = cpus, method = method, distancemethod = distancemethod, clustermethod = clustermethod)
        }else {
          hcr <- rowhcdata
        }
        ddr <- as.dendrogram(hcr$hc)
        ddr <- reorder(ddr, Rowv)
        rowInd <- order.dendrogram(ddr)
        if (nr != length(rowInd)) {stop("row dendrogram ordering gave index of wrong length") }
    }
    else {
        rowInd <- nr:1
    }
  
  
    if (inherits(Colv, "dendrogram")) {
        ddc <- Colv
        colInd <- order.dendrogram(ddc)
    }else if (identical(Colv, "Rowv")) {
        if (nr != nc) { stop("Colv = \"Rowv\" but nrow(x) != ncol(x)") }
        if (exists("ddr")) {
            ddc <- ddr
            colInd <- order.dendrogram(ddc)
        }
        else colInd <- rowInd
    }
    else if (is.integer(Colv)) {
      
      aa <- x
      if( !symm ) aa <- t(x)
      if( missing(colhcdata) | is.null(colhcdata) ) {
        hcc <- gcc.hclust( aa, cpus = cpus, method = method, distancemethod = distancemethod, clustermethod = clustermethod)
      }else {
        hcc <- colhcdata
      }
      ddc <- as.dendrogram(hcc$hc)
      ddc <- reorder(ddc, Colv)
      colInd <- order.dendrogram(ddc)
      if (nc != length(colInd)) { stop("column dendrogram ordering gave index of wrong length") }
    }
    else if (isTRUE(Colv)) {
        Colv <- colMeans(x, na.rm = na.rm)
        aa <- x
        if( !symm ) aa <- t(x)
        if( missing(colhcdata) | is.null(colhcdata) ) {
          hcc <- gcc.hclust( aa, cpus = cpus, method = method, distancemethod = distancemethod, clustermethod = clustermethod)
        }else {
          hcc <- colhcdata
        }
 
        ddc <- as.dendrogram(hcc$hc)
        ddc <- reorder(ddc, Colv)
        colInd <- order.dendrogram(ddc)
        if (nc != length(colInd)) {stop("column dendrogram ordering gave index of wrong length") }
    }
    else {
        colInd <- 1:nc
    }
  
    retval$rowInd <- rowInd
    retval$colInd <- colInd
    retval$call <- match.call()
    x <- x[rowInd, colInd]
    x.unscaled <- x
    cellnote <- cellnote[rowInd, colInd]
    if (is.null(labRow)) { 
        labRow <- if (is.null(rownames(x))) 
            (1:nr)[rowInd]
        else rownames(x)
    }else { labRow <- labRow[rowInd] }

    if (is.null(labCol)) {
        labCol <- if (is.null(colnames(x))) 
            (1:nc)[colInd]
        else colnames(x)
    }else { labCol <- labCol[colInd] }
  
  
    if (scale == "row") {
        retval$rowMeans <- rm <- rowMeans(x, na.rm = na.rm)
        x <- sweep(x, 1, rm)
        retval$rowSDs <- sx <- apply(x, 1, sd, na.rm = na.rm)
        x <- sweep(x, 1, sx, "/")
    }
    else if (scale == "column") {
        retval$colMeans <- rm <- colMeans(x, na.rm = na.rm)
        x <- sweep(x, 2, rm)
        retval$colSDs <- sx <- apply(x, 2, sd, na.rm = na.rm)
        x <- sweep(x, 2, sx, "/")
    }
  
  ##set breaks for colorkey and heat map
  if (missing(breaks) | is.null(breaks) | length(breaks) < 1) {
      if (missing(colrange) || is.function(colrange)) 
        breaks <- 20
      else if( is.vector(colrange) == TRUE & length(colrange) > 3) {  #pre-define colors
        breaks <- length(col) + 1
      }else {
        breaks <- 20
      }
  }

  if (length(breaks) == 1) {  #if already 
    if( quanbreaks == TRUE ) {
      breaks <- quantile( unique(c(x)), probs = seq(0, 1, length = breaks), na.rm = TRUE)
    }else {
     if (!symbreaks) 
            breaks <- seq(min(x, na.rm = na.rm), max(x, na.rm = na.rm), length = breaks)
        else {
            extreme <- max(abs(x), na.rm = TRUE)
            breaks <- seq(-extreme, extreme, length = breaks)
        }
    }
  }
  
  nbr <- length(breaks)
  ncol <- length(breaks) - 1
  if (class(colrange) == "function") {
    col <- colrange(ncol)
  }else if( is.vector(colrange) == TRUE & length(colrange) >= 2 ) {
    col <- colorRampPalette(colrange)(nbr - 1) #for three col
  }else{
      col <- colorRampPalette(c("green", "black", "red"))(nbr - 1) #for three col
  }
  
  min.breaks <- min(breaks)
  max.breaks <- max(breaks)
  x[x < min.breaks] <- min.breaks
  x[x > max.breaks] <- max.breaks
  if (missing(lhei) | is.null(lhei)) 
    lhei <- c(keysize, 4)
  if (missing(lwid) | is.null(lwid)) 
    lwid <- c(keysize, 4)
  if (missing(lmat) | is.null(lmat)) {
    lmat <- rbind(4:3, 2:1)
    if (!missing(ColSideColors)) {
        if (!is.character(ColSideColors) | length(ColSideColors) != nc) { 
           stop("'ColSideColors' must be a character vector of length ncol(x)") 
        }
        lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + 1)
        lhei <- c(lhei[1], 0.2, lhei[2])
    }
    if (!missing(RowSideColors)) {
       if (!is.character(RowSideColors) | length(RowSideColors) != nr) {
                stop("'RowSideColors' must be a character vector of length nrow(x)")
       }
       lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - 1), 1), lmat[, 2] + 1)
       lwid <- c(lwid[1], 0.2, lwid[2])
    }
    lmat[is.na(lmat)] <- 0
  }
  
  if (length(lhei) != nrow(lmat)) 
    stop("lhei must have length = nrow(lmat) = ", nrow(lmat))
  if (length(lwid) != ncol(lmat)) 
    stop("lwid must have length = ncol(lmat) =", ncol(lmat))
  
  
  
  op <- par(no.readonly = TRUE)
  on.exit(par(op))
  layout(lmat, widths = lwid, heights = lhei, respect = FALSE)
  if (!missing(RowSideColors)) {
    par(mar = c(margins[1], 1, 1, 0.5))
    image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE)
  }
  if (!missing(ColSideColors)) {
    par(mar = c(0.5, 0, 0, margins[2]))
    image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE)
  }
  par(mar = c(margins[1], 0, 0, margins[2]))
  x <- t(x)
  cellnote <- t(cellnote)
  if (revC) {
     iy <- nr:1
     if (exists("ddr")) {  ddr <- rev(ddr) }
     x <- x[, iy]
     cellnote <- cellnote[, iy]
  }
  else { iy <- 1:nr }
  
  image(1:nc, 1:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 + c(0, nr), 
        axes = FALSE, xlab = "" , ylab = "", col = col, breaks = breaks, ...)
  
  retval$carpet <- x
  if (exists("ddr")) 
     retval$rowDendrogram <- ddr
  if (exists("ddc")) 
     retval$colDendrogram <- ddc
  retval$breaks <- breaks
  retval$col <- col
  if (!invalid(na.color) & any(is.na(x))) {
        mmat <- ifelse(is.na(x), 1, NA)
        image(1:nc, 1:nr, mmat, axes = FALSE, xlab = "", ylab = "", col = na.color, add = TRUE)
 }
 axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0, cex.axis = cexCol)
 if (!is.null(xlab)) 
    mtext(xlab, side = 1, line = margins[1] - 1.25)
    axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0, cex.axis = cexRow)
    if (!is.null(ylab)) 
        mtext(ylab, side = 4, line = margins[2] - 1.25)
    if (!missing(add.expr)) 
        eval(substitute(add.expr))
    if (!missing(colsep)) 
        for (csep in colsep) rect(xleft = csep + 0.5, ybottom = rep(0, length(csep)), 
                                  xright = csep + 0.5 + sepwidth[1], ytop = rep(ncol(x) + 1, csep), lty = 1, lwd = 1, 
                                  col = sepcolor, border = sepcolor)
    if (!missing(rowsep)) 
        for (rsep in rowsep) rect(xleft = 0, ybottom = (ncol(x) + 1 - rsep) - 0.5, 
                                  xright = nrow(x) + 1, ytop = (ncol(x) + 1 - rsep) - 0.5 - sepwidth[2], lty = 1, lwd = 1, 
                                  col = sepcolor, border = sepcolor)
    min.scale <- min(breaks)
    max.scale <- max(breaks)
    x.scaled <- scale01(t(x), min.scale, max.scale)
    if (trace %in% c("both", "column")) {
        retval$vline <- vline
        vline.vals <- scale01(vline, min.scale, max.scale)
        for (i in colInd) {
            if (!is.null(vline)) {
                abline(v = i - 0.5 + vline.vals, col = linecol, 
                  lty = 2)
            }
            xv <- rep(i, nrow(x.scaled)) + x.scaled[, i] - 0.5
            xv <- c(xv[1], xv)
            yv <- 1:length(xv) - 0.5
            lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s")
        }
    }
    if (trace %in% c("both", "row")) {
        retval$hline <- hline
        hline.vals <- scale01(hline, min.scale, max.scale)
        for (i in rowInd) {
            if (!is.null(hline)) {
                abline(h = i + hline, col = linecol, lty = 2)
            }
            yv <- rep(i, ncol(x.scaled)) + x.scaled[i, ] - 0.5
            yv <- rev(c(yv[1], yv))
            xv <- length(yv):1 - 0.5
            lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s")
        }
    }
    if (!missing(cellnote)) 
        text(x = c(row(cellnote)), y = c(col(cellnote)), labels = c(cellnote), col = notecol, cex = notecex)
    par(mar = c(margins[1], 0, 0, 0))
    if (dendrogram %in% c("both", "row")) {
        plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none")
    }
    else plot.new()
    par(mar = c(0, 0, if (!is.null(main)) 5 else 0, margins[2]))
    if (dendrogram %in% c("both", "column")) {
        plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none")
    }
    else plot.new()
    if (!is.null(main)) 
        title(main, cex.main = 1.5 * op[["cex.main"]])
  
  
  #for key
    if (key) {
      #  par(mar = c(5, 4, 2, 1), cex = 0.75)
        tmpbreaks <- breaks
        if (symkey) {
            max.raw <- max(abs(c(x, breaks)), na.rm = TRUE)
            min.raw <- -max.raw
            tmpbreaks[1] <- -max(abs(x), na.rm = TRUE)
            tmpbreaks[length(tmpbreaks)] <- max(abs(x), na.rm = TRUE)
        }
        else {
            min.raw <- min(x, na.rm = TRUE)
            max.raw <- max(x, na.rm = TRUE)
        }
                
        z <- seq(min.raw, max.raw, length = length(col))
       # image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks, xaxt = "n", yaxt = "n")
        image(z = matrix(z, ncol = 1), col = col, xaxt = "n", yaxt = "n", xlab = "", ylab = "")
        par(usr = c(0, 1, 0, 1))
        lv <- pretty(breaks)
        xv <- scale01(as.numeric(lv), min.raw, max.raw)
        axis(1, at = xv, labels = lv)
        if(scale == "row") { mtext(side = 1, paste(keynote, "(Row Z-Score)", sep=""), line = 2) 
        }else if(scale == "column") {mtext(side = 1, paste(keynote, "(Column Z-Score)", sep=""), line = 2) 
        }else { mtext(side = 1, keynote, line = 2) }
        
        if (density.info == "density") {
            dens <- density(x, adjust = densadj, na.rm = TRUE)
            omit <- dens$x < min(breaks) | dens$x > max(breaks)
            dens$x <- dens$x[-omit]
            dens$y <- dens$y[-omit]
            dens$x <- scale01(dens$x, min.raw, max.raw)
            lines(dens$x, dens$y/max(dens$y) * 0.95, col = denscol, 
                lwd = 1)
            axis(2, at = pretty(dens$y)/max(dens$y) * 0.95, pretty(dens$y))
           # title("Color Key\nand Density Plot")
            par(cex = 0.5)
            mtext(side = 2, "Density", line = 2)
        }
        else if (density.info == "histogram") {
            h <- hist(x, plot = FALSE, breaks = breaks)
            hx <- scale01(breaks, min.raw, max.raw)
            hy <- c(h$counts, h$counts[length(h$counts)])
            lines(hx, hy/max(hy) * 0.95, lwd = 1, type = "s", 
                col = denscol)
            axis(2, at = pretty(hy)/max(hy) * 0.95, pretty(hy))
          #  title("Color Key\nand Histogram")
            par(cex = 0.5)
            mtext(side = 2, "Count", line = 2)
        }
        else title("")
    }
    else plot.new()
   retval$colorTable <- data.frame(low = retval$breaks[-length(retval$breaks)], 
        high = retval$breaks[-1], color = retval$col)
    invisible(retval)
  
  return( list(retval = retval, hcr = hcr, hcc = hcc) )
}
 

##check whether x is a data matrix and k for KNN-based MI esimator.
.check.matrix <- function(x, k, name) {
  if ((!is.matrix(x) && !is.data.frame(x)) || nrow(x) < 2) {
    stop(paste(name, "must be a multi-row matrix or a data.frame"))
  } else if (ncol(x) <= k) {
    stop(paste(name, " has too few columns (must be > ", k, ")", sep=""))
  }
}



##check the variable with multiple elements.
.check.variable <- function(var, name, candidates) {
   
   if( is.vector(var) ){
     vartmp = var[1]
    }

   if( length(vartmp) == 0 | is.null(var) == TRUE ) {
     stop( paste(name, "must have a value with the candicate parameters."))
   }
   
   if( length( which( candidates == vartmp ) ) == 0 ){
      stop( paste(name, "must have a value with the candicate parameters.") )
   }

   vartmp
}



#correlations for gene pairs of all genes
.cor_all <- function(xs, rowidx = NULL, colidx = NULL, corMethod = "GCC", cpus = 1, saveType = "matrix", backingpath = NULL, backingfile = "adj_mat", descriptorfile = "adj_desc" ) {
  h <- nrow(xs)
  w <- ncol(xs)
  k <- 3
  noise <- 0.0
  
  subrownum <- 0
  subcolnum <- 0
  if( (length(rowidx) != h) | (length(colidx) != w) ) {
    subrownum <- length(rowidx)
    subcolnum <- length(colidx)
  }
 
  corIndex <- c(1,2,3,4, 5)
  names(corIndex) <- c("GCC", "PCC", "SCC", "KCC", "ED")
  curcorIndex <- corIndex[corMethod]


  res <- NULL
  xsix <- NULL
  Rownames <- NULL
  Colnames <- NULL

  #all gene paris considered 
  if( (length(rowidx) == h) & (length(colidx) == h)  ) {  
    Rownames <- rownames(xs)
    Colnames <- Rownames
    if( corMethod == "GCC" || corMethod == "SCC"  ) {
        xsix <- apply( xs, 1, function(x) sort( sort(x,index.return=TRUE)$ix, index.return=TRUE)$ix )
        res <- .C("c_cor_all", as.integer(curcorIndex), as.double(t(xs)), as.integer(xsix), as.integer(h), as.integer(w), as.integer(k), as.double(noise), as.integer(cpus), res = double(h*h), PACKAGE = "rsgcc", DUP = TRUE)$res
    }else if( corMethod == "PCC" || corMethod == "KCC" || corMethod == "ED" ) {
       res <- .C("c_cor_all", as.integer(curcorIndex), as.double(t(xs)), as.integer(xsix), as.integer(h), as.integer(w), as.integer(k), as.double(noise), as.integer(cpus),  res = double(h*h), PACKAGE = "rsgcc", DUP = TRUE)$res
    }else{
     stop("Error: undefined cor method).\n")
    }
  }else {  #part gene paris considered 
 
     Rownames <- rownames(xs)[rowidx]
     Colnames <- rownames(xs)[colidx]
     if( corMethod == "GCC" || corMethod == "SCC"  ) {
        xsix <- apply( xs, 1, function(x) sort( sort(x,index.return=TRUE)$ix, index.return=TRUE)$ix )
        res <- .C("c_cor_subset", as.integer(curcorIndex), as.double(t(xs)), as.integer(xsix), as.integer(h), as.integer(w), as.integer(k), as.double(noise), as.integer(cpus), as.integer(rowidx), as.integer(colidx), as.integer(subrownum), as.integer(subcolnum), res = double(subrownum*subcolnum), PACKAGE = "rsgcc", DUP = TRUE)$res
      }else if( corMethod == "PCC" || corMethod == "KCC" || corMethod == "ED" ) {
        res <- .C("c_cor_subset", as.integer(curcorIndex), as.double(t(xs)), as.integer(xsix), as.integer(h), as.integer(w), as.integer(k), as.double(noise), as.integer(cpus), as.integer(rowidx), as.integer(colidx), as.integer(subrownum), as.integer(subcolnum), res = double(subrownum*subcolnum), PACKAGE = "rsgcc", DUP = TRUE)$res
      }else{
        stop("Error: undefined cor method).\n")
      }
   }



  m <- t( matrix(res, nrow=length(Colnames)) )
  rownames(m) <- Rownames
  colnames(m) <- Colnames
  if( saveType == "bigmatrix" ) {
     options(bigmemory.allow.dimnames=TRUE)
     if( is.null(backingpath) )
       backingpath <- getwd()
     m <- as.big.matrix(m, type = "double", separated=FALSE, backingfile= backingfile, backingpath= backingpath, descriptorfile = descriptorfile, shared = TRUE)
  } 
 
  m
  
}



#############################################################################################
#generate adjacency matrix from a gene expression data
#Modified on 2012-11-28. two parameters (genes.row, genes.col) were added for generating a correlation matrix with length(genes.row) X length(genes.col).
adjacencymatrix <- function(mat, genes.row = NULL, genes.col = NULL, method = c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED"), k = 3, cpus = 1, saveType = "matrix", backingpath = NULL, backingfile = "adj_mat", descriptorfile = "adj_desc", ... ) {

  call <- match.call()

  .check.matrix(mat, k, "xs")
  method <- .check.variable(method, "method", c("GCC", "PCC", "SCC", "KCC", "BiWt", "MI", "MINE", "ED") )
  
  if (method == "MI" && k < 2)
    stop("k must be >= 2.")

  geneNames <- rownames(mat)
  if( is.null(geneNames) ) 
    geneNames <- c(1:nrow(mat)) 

  ##for BiWt
  if( method == "BiWt" ) {
    m <- cor.matrix(mat, cpus = cpus, cormethod= method, style= "all.pairs", pernum= 0, sigmethod= "two.sided", output = "matrix")$corMatrix
    rownames(m) <- rownames(mat)
    colnames(m) <- rownames(mat)
    if( saveType == "bigmatrix" ) {
      options(bigmemory.allow.dimnames=TRUE)
      if( is.null(backingpath) )
         backingpath <- getwd()
       m <- as.big.matrix(m, type = "double", separated=FALSE, backingfile= backingfile, backingpath= backingpath, descriptorfile = descriptorfile, shared = TRUE)
    }#end if bigmatrix
    return(m)
  }
  
  ##for mine
  if( method == "MINE" ) {
    m <- mine(t(mat))$MIC
    rownames(m) <- rownames(mat)
    colnames(m) <- rownames(mat)
    if( saveType == "bigmatrix" ) {
      options(bigmemory.allow.dimnames=TRUE)
      if( is.null(backingpath) )
         backingpath <- getwd()
       m <- as.big.matrix(m, type = "double", separated=FALSE, backingfile= backingfile, backingpath= backingpath, descriptorfile = descriptorfile, shared = TRUE)
    }#end if bigmatrix
    return(m)
  }

  #for other methods
  Rownames <- geneNames
  Colnames <- geneNames
  row.idx <- c(1:nrow(mat))
  col.idx <- c(1:nrow(mat))
  if( !is.null(genes.row) ) {
     row.idx <- match( genes.row, geneNames)
     if( length( which(is.na(row.idx)) ) > 0 ) {
       stop("Error: some gene names in genes.row are not found in rownames(mat).\n")
     }
     Rownames <- rownames(mat)[row.idx]
   }
   if( !is.null(genes.col) ) {
    col.idx <- match( genes.col, geneNames)
    if( length( which(is.na(col.idx)) ) > 0 ) {
      stop("Error: some gene names in genes.col are not found in rownames(mat).\n")
    }
    Colnames <- rownames(mat)[col.idx]
  }

  
  

  #for knnmi
  m <- NULL
  if( method == "MI" ){
    if( is.null(genes.row) | is.null(genes.col) ) {
      m <- knnmi.cross( mat[row.idx,], mat[col.idx,], k, noise=1e-12 )
    }else {
      m <- knnmi.all(mat, k, noise=1e-12)
    }
    rownames(m) <- Rownames
    colnames(m) <- Colnames
  }else{
    m <-.cor_all(xs = mat, rowidx = row.idx, colidx = col.idx, corMethod = method, cpus = cpus, saveType = saveType, backingpath = backingpath, backingfile = backingfile, descriptorfile = descriptorfile)
  }
  m
}

Try the rsgcc package in your browser

Any scripts or data that you put into this service are public.

rsgcc documentation built on May 2, 2019, 9:25 a.m.