
# ID StatTools.R
# Copyright (C) 2017-2020 INRAE
# Authors: D. Jacob
#
trim<-function(x) gsub("^\\s+|\\s+$", "", x)

colors <- c("red", "cornflowerblue", "darkgreen", "blueviolet", "orange", 
             "magenta", "darkred", "coral", "mediumvioletred", "yellow4",
             "seagreen2", "lightskyblue", "darkcyan", "yellowgreen", "limegreen",
             "wheat2", "yellow4",  "violetred1", "darkorange", "cyan4")

# Package-level placeholders initialized to NULL (presumably to avoid
# R CMD check notes about unbound global variables)
matrix <- NULL
factors <- NULL
association <- NULL

# Suppress warnings and widen the console output
options(warn = -1)
options(width=1024)

# Replace zero values with small random positive values drawn relative to the
# smallest non-zero value (avoids -Inf values after a log transformation)
replace_zero <- function(data)
{
    nbzero <- length(data[data==0])
    minval <- min(data[data>0])
    if (nbzero>0) data[data==0]<- abs( stats::rnorm( nbzero, 0.1*minval, 0.01*minval ) )
    data
}
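
## Illustrative sketch (not executed):
##   replace_zero(c(0, 0.5, 2))   # the zero becomes a small positive random value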

########################################################################################
## Scaling                                                                            ##
########################################################################################
## Aim: Data scaling
##
## Settings:
##      - matrix: a matrix with the variables in columns and the samples in rows.
##          The first column contains the sample names
##      - the "log" value: depending on the data, a logarithmic transformation can be applied.
##          If log=0:  no logarithmic transformation
##          If log=2:  data are transformed with log2
##          If log=10: data are transformed with log10
##          If log=11: data are transformed with log10 after adding 1 (handles zero values)
##      - Scaling methods: several methods can be applied to a data set. In this application, we put forward:
##          Centering
##          Zscore
##          Pareto
##          Vast
##          Range
##          Level
##          L1: each row is divided by its sum, after column centering
##          L2: each row is divided by its Euclidean norm, after column centering
##          NULL
##
########################################################################################
.getScaling <- function(matrix, log, methods=c("Centering","Zscore","Pareto","Vast","Range","Level","L1","L2","NULL"))
{
## LOG Transformation
   if (log==2) {
       matrix <- replace_zero(matrix)
       matrix<-log2((matrix))
       cat('Transformation to log2\n')
   }
   if (log==10) {
       matrix <- replace_zero(matrix)
       matrix<-log10(matrix)
       cat('Transformation to log10\n')
   }
   if (log==11) {
       matrix<-log10(matrix + 1)
       cat('Transformation to log10 after incrementing (+1)\n')
   }

## Column-wise
   if ("Centering" %in% methods)   { matrix<-scale(matrix,center=T,scale=F) }
   if ("Zscore" %in% methods)      { matrix<-scale(matrix,center=T,scale=T) }
   if ("Pareto" %in% methods)      { matrix<-apply(t(matrix), 1,function(x){(x-mean(x))/sqrt(stats::sd(x))}) }
   if ("Vast" %in% methods)        { matrix<-apply(t(matrix), 1,function(x){(x-mean(x))*mean(x)/(stats::sd(x)*stats::sd(x))}) }
   if ("Range" %in% methods)       { matrix<-apply(t(matrix), 1,function(x){(x-mean(x))/(max(x)-min(x))}) }
   if ("Level" %in% methods)       { matrix<-apply(t(matrix), 1,function(x){(x-mean(x))/stats::median(x)}) }

## Row-wise
   if ("L1" %in% methods)          { matrix<-scale(matrix,center=T,scale=F)
                                     matrix<-t(apply(matrix, 1,function(x){x/sum(x)})) }
   if ("L2" %in% methods)          { matrix<-scale(matrix,center=T,scale=F)
                                     matrix<-t(apply(matrix, 1,function(x){x/sqrt(sum(x*x))}))  }

   return(matrix)
}
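
## Illustrative sketch (not executed): Pareto scaling of a small positive
## matrix after a log10 transformation; the matrix 'm' below is made up for
## the illustration:
##   m <- matrix(abs(stats::rnorm(20)) + 1, nrow=4,
##               dimnames=list(paste0("S", 1:4), paste0("B1_", 1:5)))
##   .getScaling(m, log=10, methods="Pareto")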


#====================================================================#
# Bucket Clustering
#====================================================================#

#' getClusters
#'
#' From the data matrix generated from the integration of all bucket zones (columns) for each 
#' spectrum (rows), we can take advantage of the concentration variability of each compound in 
#' a series of samples by performing a clustering based on significant correlations that link 
#' these buckets together into clusters. Bucket clustering relies on either a lower threshold 
#' applied on correlations or a cutting value applied on a hierarchical tree of the variables 
#' (buckets) generated by a Hierarchical Clustering Analysis (HCA).
#'
#' If intelligent bucketing was chosen at the bucketing step (see above), each bucket exactly 
#' matches one resonance peak. Thanks to this, the buckets have a strong chemical meaning, 
#' since the resonance peaks are the fingerprints of chemical compounds. However, to assign 
#' a chemical compound, several resonance peaks are generally required in 1D 1H-NMR 
#' metabolic profiling. To generate relevant clusters (i.e. clusters possibly matching 
#' chemical compounds), two approaches have been implemented:
#' \itemize{
#'   \item Bucket Clustering based on a lower threshold  applied on correlations
#'   \itemize{
#'       \item In this approach an appropriate correlation threshold is applied on the correlation 
#' matrix before its cluster decomposition. Moreover, an improvement can be obtained by searching for 
#' a trade-off within a tolerance interval of the correlation threshold: from a fixed correlation 
#' threshold (cval), the clustering is calculated for the three values (cval-dC, cval, cval+dC), 
#' where dC is the tolerance interval of the correlation threshold. From these three sets of 
#' clusters, we establish a merger according to the following rules: 1) if a large cluster is 
#' broken, we keep the two resulting clusters; 2) if a small cluster disappears, the initial 
#' cluster is conserved. Generally, a tolerance interval between 0.002 and 0.01 gives a good 
#' trade-off.
#'   }
#'   \item Bucket Clustering based on a hierarchical tree of the variables (buckets) generated 
#' by an Hierarchical Clustering Analysis (HCA)
#'   \itemize{
#'       \item In this approach a Hierarchical Clustering Analysis (HCA, \code{\link[stats]{hclust}}) 
#' is applied on the data after calculating a distance matrix ("euclidean" by default). Then, a cut 
#' is applied on the tree (\code{\link[stats]{cutree}}) resulting from \code{\link[stats]{hclust}}, 
#' splitting it into several groups by specifying the cut height(s). To find the best cut value, the 
#' cut height is chosen i) by testing several values equally spaced within a given range of the cut 
#' height, then ii) by keeping the one that gives the most clusters while including the most bucket 
#' variables. Otherwise, a cut value has to be specified by the user (vcutusr).
#'   }
#' }
#' 
#'
#' @param data the matrix including the integrations of the areas defined by the buckets (columns) 
#' on each spectrum (rows)
#' @param method Clustering method of the buckets. Either 'corr' for 'correlation' or 'hca' for 
#' 'hierarchical clustering analysis'. 
#' @param ... Depending on the chosen method:
#' \itemize{
#'   \item \code{corr} : cval, dC, ncpu
#'   \item \code{hca} :  vcutusr
#' }
#' @return
#' \code{getClusters} returns a list containing the following components:
#' \itemize{
#'   \item \code{vstats} Statistics that served to find the best value of the criterion (matrix)
#'   \item \code{clusters} List of the ppm values corresponding to each cluster; the length of the list equals the number of clusters
#'   \item \code{clustertab} the associations matrix that gives for each cluster (column 2) the corresponding buckets (column 1)
#'   \item \code{params} List of parameters related to the chosen method for which the clustering was performed.
#'   \item \code{vcrit} Value of the (best/user) criterion, i.e. the correlation threshold for the 'corr' method or the cut value for the 'hca' method.
#'   \item \code{indxopt} Index value within the vstats matrix corresponding to the criterion value (vcrit)
#' }
#' 
#' @references{
#'   Jacob D., Deborde C. and Moing A. (2013) An efficient spectra processing method for metabolite identification from 1H-NMR metabolomics data.
#'   Analytical and Bioanalytical Chemistry 405(15) 5049-5061 doi: 10.1007/s00216-013-6852-y
#'  }
#'
#' @examples
#'  \donttest{
#'   data_dir <- system.file("extra", package = "Rnmr1D")
#'   cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#'   samplefile <- file.path(data_dir, "Samples.txt")
#'   out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile, 
#'                                 samplefile=samplefile, ncpu=2)
#'   outMat <- getBucketsDataset(out, norm_meth='CSN')
#'   clustcorr <- getClusters(outMat, method='corr', cval=0, dC=0.003, ncpu=2)
#'   clusthca <- getClusters(outMat, method='hca', vcutusr=0)
#'  }
getClusters <- function(data, method='hca', ... )
{
    out <- NULL
    if (method=='hca')  out <- get_Clusters_hca(data, ... )
    if (method=='corr') out <- get_Clusters_corr(data, ... )
    out
}
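
## Illustrative sketch (not executed): inspecting the object returned by
## getClusters, assuming 'clustcorr' was obtained as in the example above:
##   clustcorr$vcrit                 # correlation threshold actually used
##   length(clustcorr$clusters)      # number of clusters
##   clustcorr$clusters[["C1"]]      # ppm values of the buckets in cluster C1
##   head(clustcorr$clustertab)      # bucket <=> cluster association table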

# Bucket Clustering - Find the optimal Cutting Tree Value
get_Clusters_hca <- function(data, vcutusr=0, scalemeth='Zscore', log=0, distmeth='euclidean', hcmeth='complete', maxcsize=40, mincsize=2, bucchar='B')
{
   params <- list (
     method='hca',

   # Scaling params
     scalemeth = scalemeth, 
     log = log,

   # Bucket variables
     BUCCHAR = bucchar,
   
   # Clustering
     DISTMETH = distmeth,
     HCMETH = hcmeth,
     MAXCSIZE = maxcsize,
     MINCSIZE = mincsize,

   # the range of 'Vcut' for getting some statistics
     Vcut_min=0.1,
     Vcut_max=0.9,
     Vstep=0.005,
     vcutusr=vcutusr

   )

   # Get the association table: Variables <=> ClusterID,
   # by applying the cutting tree threshold 'vcut' on the Hclust 'hc'
   get_VarSet <- function(hc, vcut)
   {
        hcut <- vcut*( max(hc$height) + min(hc$height) )
        CT <- stats::cutree(hc, h=hcut)
        i <- 1
        V <- NULL
        for (k in 1:length(unique(CT))) {
            if (length(CT[CT==k])<params$MINCSIZE) next
            CTK <- CT[CT==k]
            CTK[] <- i
            V <- c(V, CTK)
            i <- i + 1
        }
        VarSet <- cbind( names(V), simplify2array(lapply( as.vector(V), function(x) { paste('C',x,sep=''); })) )
        return(VarSet)
   }

   matrix <- .getScaling(data,log=params$log, methods=params$scalemeth)

   VC <- 1
   CC <- 2
   Vcut_min=params$Vcut_min
   Vcut_max=params$Vcut_max
   Vstep=params$Vstep

# Variables Classification
   x <- as.matrix(matrix)

# Compute the distance matrix (see ?dist) and the Hierarchical clustering (see ?hclust)
   dist.c <- stats::dist(t(x), method=params$DISTMETH)
   hc.c <- stats::hclust(dist.c, method=params$HCMETH)

# Init
   Vstats <- NULL
   Vcut <- seq(from=Vcut_min, to=Vcut_max, by=Vstep)

# Loop on the range
   for( k in 1:length(Vcut) ) {

     # Init
     Cluster_Count <- 0
     Cluster_VarCount <- 0
     Cluster_MatchVarCount <- 0
     Cluster_MaxSize <- 0
     Cluster_Matching <- 0
     VarSet <- get_VarSet(hc.c, Vcut[k])
     Clusters <- as.character(unique(VarSet[,2]))

     # For each cluster ...
     for (i in 1:length(Clusters)) {
        CL <- Clusters[i]
        Cluster_Count <- Cluster_Count + 1

        # get the peaklist
        ppmlst  <- VarSet[VarSet[,CC] == CL, VC]
        Cluster_VarCount <- Cluster_VarCount + length(ppmlst)
        if (Cluster_MaxSize < length(ppmlst) ) Cluster_MaxSize <- length(ppmlst)
     }
     Vstats <- rbind( Vstats, c(Vcut[k], Cluster_Count, Cluster_VarCount, Cluster_MaxSize ) )

   }
   colnames(Vstats) <- c("Vcut", "Nb Clusters", "Nb Vars", "MaxSize")

# Compute the optimal value for Vcut
# -- find the set of Vcut values such that the cluster count is max
   Vsub <- Vstats[ Vstats[,4]<params$MAXCSIZE,  ]
   V <- Vsub[Vsub[,2] == max(Vsub[,2]),1]
# -- keep the Vcut value such that the variable count is max
   VcutOpt <- V[which(Vstats[Vstats[,1] %in% V,3]==max(Vstats[Vstats[,1] %in% V,3]))][1]
   Vcut <- ifelse(params$vcutusr==0, VcutOpt, params$vcutusr)
   indx <- round(Vcut/Vstep) - round(Vcut_min/Vstep) + 1

# Get the association table: Variables <=> ClusterID
   association <- get_VarSet(hc.c, Vcut)
   association <- cbind( association, gsub(params$BUCCHAR, "", gsub("_", ".", association[,VC])) )

   lclust <- unique(sort(association[,2]))
   clusters <- list()
   for(i in 1:length(lclust))  {
       CL <- lclust[i]
       clusters[[CL]] <-  as.numeric(association[association[,2]==CL, ][,3])
   }

   cat("#-- Clustering --\n")
   cat('#  Distance Method:',params$DISTMETH,"\n")
   cat('#  Agglomeration Method:',params$HCMETH,"\n")
   cat('#  Cutting Tree threshold:',Vcut,"\n")
   cat('#  Nb Clusters:',length(lclust),"\n")
   cat("#\n")

   return( list(vstats=Vstats, clusters=clusters, clustertab=association, params=params, vcrit=Vcut, indxopt=indx) )

}
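
## Illustrative sketch (not executed): calling the internal HCA clustering
## directly with a different linkage method, assuming 'outMat' was obtained
## as in the getClusters example:
##   clust <- get_Clusters_hca(outMat, vcutusr=0, distmeth='euclidean', hcmeth='ward.D2')
##   plotCriterion(clust)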

# Bucket Clustering based on a lower threshold applied on correlations
get_Clusters_corr <- function(data, scalemeth='Zscore', log=0, cmeth='pearson', cval=0, dC=0.01, maxcsize=40, mincsize=2, bucchar='B', stats_comp=T, ncpu=1)
{
   params <- list (
     method='corr',

   # Scaling params
     scalemeth = scalemeth, 
     log = log,

   # Bucket variables
     BUCCHAR = bucchar,

   # Clustering
     CMETH = cmeth,
     CVAL = cval,
     dC = dC,
     MINCSIZE = mincsize,
     MAXCSIZE = maxcsize,

   # get some statistics
     VSTATS_CAL=stats_comp,
     NCPU=ncpu,

   # the range of 'Cval' for getting some statistics
     CVAL_MIN=0.9,
     CVAL_MAX=0.999,
     CVAL_STEP=0.001

   )

   matrix <- .getScaling(data,log=params$log, methods=params$scalemeth)

#---- CLUSTERING -----

 # Method in ("pearson", "kendall", "spearman")
   cor_mat <- stats::cor(matrix,method=params$CMETH)
   cor_mat[ lower.tri(cor_mat, diag=TRUE) ] <- 0

   vstats <- NULL
   indx <- 0
   if (params$VSTATS_CAL || params$CVAL==0) {
       cvals <- seq(from=params$CVAL_MIN, to=params$CVAL_MAX, by=params$CVAL_STEP)
       vstats <- estime_cval(cor_mat, cvals, params$NCPU)
       if (params$CVAL==0) {
           sub1 <- which( vstats[,4]<params$MAXCSIZE )
           sub2 <- which ( vstats[ sub1, 2 ]==max(vstats[ sub1, 2 ] ) ) + sub1[1] - 1
           indx <- sub2[ which(vstats[sub2,4]==min(vstats[sub2,4]))[1] ]
           params$CVAL <- vstats[indx, 1]
       } else {
           indx <- round(params$CVAL/params$CVAL_STEP) - round(params$CVAL_MIN/params$CVAL_STEP) + 1
       }
   }

   cl <- parallel::makeCluster(min(params$NCPU,3))
   doParallel::registerDoParallel(cl)

   NVARS <- length(colnames(matrix))
   cvalset <- c(params$CVAL-params$dC,params$CVAL,params$CVAL+params$dC)
   k<-0
   MT <- foreach::foreach( k=1:length(cvalset), .combine=cbind, .packages = c("igraph", "doParallel")) %dopar% {
      cval <- cvalset[k]
      cor_mat_cval <- cor_mat
      cor_mat_cval[ cor_mat_cval < cval] <- 0
      M<-cbind(colnames(matrix),c(1:NVARS)*0)
      graph <- graph.adjacency(cor_mat_cval>cval, weighted=TRUE, mode="upper")
      E(graph)$weight<-t(cor_mat_cval)[t(cor_mat_cval)>cval]
      V(graph)$label<- V(graph)$name
      cliques <- sapply(decompose.graph(graph), vcount)
      ordcli <- order(cliques,decreasing = T)
      g<-0
      for (i in 1:length(ordcli)) {
          ind<-ordcli[i]
          if (cliques[ind]>=params$MINCSIZE) {
             g <- g + 1
             subg <- decompose.graph(graph)[[ind]]
             VARS<-V(subg)$name
             M[M[,1] %in% VARS,2] <- sprintf("C%d",g)
          }
      }
      M[,2]
   }
   
   parallel::stopCluster(cl)
   
   M <- cbind(colnames(matrix),MT)
   NBCLUST <- apply( apply( gsub('C','', MT), 2 , as.numeric), 2, max )
   
   MC <- NULL
   # Keep only the buckets assigned to a cluster for at least one of the three
   # thresholds: the 'C<n>' labels coerce to NA, flagging such an assignment
   for (i in 1:NVARS) {
      if (is.na(sum(as.numeric(M[i,-1])))) MC <- rbind(MC,M[i,])
   }
   varnames <- as.vector(lapply(cvalset, function(x) sprintf('V_%s',x)))
   varnames <- c('ppm',varnames)
   colnames(MC) <- varnames

   # Synthesis: merge the three clusterings (cval-dC, cval, cval+dC) according
   # to the rules described in the getClusters documentation
   for (s in c(1,2)) {
      for (k in 1:NBCLUST[s]) {
         V <- MC[MC[,1+s]==sprintf("C%d",k),c(1,1+s,2+s),drop=FALSE]
         i<-1
         while(i<dim(V)[1]) {
            if (V[i,3] == "0") {
               n<-1
               while(i<dim(V)[1] && V[i+1,2]==V[i,2] && V[i+1,3] == "0") { i<-i+1; n<-n+1; }
               # Rule 1
               if (n==1) {
                  if (i>1 && V[i-1,2]==V[i,2]) V[i,3] <- V[i-1,3]
                  else if (i<dim(V)[1] && V[i+1,2]==V[i,2]) V[i,3] <- V[i+1,3];
               }
               # Rule 2
               if (n>1) {
                  NBCLUST[1+s] <- NBCLUST[1+s] + 1
                  for (j in 1:n) V[i-n+j,3] <- sprintf("C%d",NBCLUST[1+s])
               }
            }
            i<-i+1
            if (i==dim(V)[1] && V[i,3] == "0" && V[i-1,2]==V[i,2]) V[i,3] <- V[i-1,3] # a special case
         }
         MC[MC[,1] %in% V[,1],2+s] <- V[,3]
      }
   }
   #association <- MC[,c(1,4)]
   
   # Filters out non-significant clusters
   CLUST <- unique(MC[,4])
   association <- NULL
   for (cl in 1:length(CLUST)) {
       VARS <- MC[MC[,4] == CLUST[cl],1]
       M <- matrix[ , VARS ]
       cor_mat <- stats::cor(M,method=params$CMETH)
       cor_mat[ lower.tri(cor_mat, diag=TRUE) ]<- 0
       if ( stats::median(cor_mat[ cor_mat!=0 ]) > (params$CVAL-params$dC) )
            association <- rbind(association, MC[MC[,4] == CLUST[cl], c(1,4), drop=FALSE])
   }

   association <- association[association[,2] != "0",]
   association <- cbind( association, gsub(params$BUCCHAR, "", gsub("_", ".", association[,1])) )
   colnames(association) <- c("VAR","CLID", "PPM")

   lclust <- unique(sort(association[,2]))
   clusters <- list()
   for(i in 1:length(lclust))  {
       CL <- lclust[i]
       clusters[[CL]] <-  as.numeric(association[association[,2]==CL, ][,3])
   }

   cat("#-- Clustering --\n")
   cat('#  Correlation Method:',params$CMETH,"\n")
   cat('#  Correlation Threshold :',params$CVAL,"\n")
   cat('#  Correlation Tolerance:',params$dC,"\n")
   cat('#  Nb Clusters:',length(lclust),"\n")
   cat("#\n")

   return( list( vstats=vstats, clusters=clusters, clustertab=association, params=params, vcrit=params$CVAL, indxopt=indx ) )
}

estime_cval <- function(cor_mat, cvals, ncpu)
{
   compute_crit <- function(cor_mat, cval) {
      cor_mati <- cor_mat
      cor_mati[ cor_mati < cval] <- 0
   
      graph <- graph.adjacency(cor_mati>cval, weighted=TRUE, mode="upper")
      E(graph)$weight<-t(cor_mati)[t(cor_mati)>cval]
      V(graph)$label<- V(graph)$name
      cliques <- sapply(decompose.graph(graph), vcount)
      ordcli <- order(cliques,decreasing = T)
      nb_clusters<-0
      nb_clusters_2<-0
      nb_buckets<-0
      for (i in 1:length(ordcli)) {
         if (cliques[ordcli[i]]>=2) {
           nb_clusters <- nb_clusters + 1
           nb_buckets <- nb_buckets + cliques[ordcli[i]]
         }
         if (cliques[ordcli[i]]==2)
           nb_clusters_2 <- nb_clusters_2 + 1
      }
      size_max <- cliques[ordcli[1]]
      CRIT<-size_max/nb_clusters
      c(cval,nb_clusters,nb_buckets,size_max,nb_clusters_2,-20*log10(CRIT))
   }
   
   cl <- parallel::makeCluster(ncpu)
   doParallel::registerDoParallel(cl)

   i<-0
   vstats <- foreach::foreach(i=1:length(cvals), .combine=rbind, .packages = c("igraph", "doParallel")) %dopar% {  compute_crit(cor_mat, cvals[i]); }

   parallel::stopCluster(cl)

   colnames(vstats) <- c("Cval","Nb Clusters","Nb Vars","Max Size","Nb Clusters 2","Criterion")
   vstats
}
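
## Illustrative sketch (not executed): criterion statistics on a made-up
## correlation matrix, mirroring the preparation done in get_Clusters_corr:
##   m <- matrix(stats::rnorm(200), nrow=10,
##               dimnames=list(NULL, paste0("B", 1:20)))
##   cm <- stats::cor(m); cm[lower.tri(cm, diag=TRUE)] <- 0
##   vstats <- estime_cval(cm, seq(0.9, 0.99, by=0.01), ncpu=1)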

#' getMergedDataset
#'
#' Merges the variables (buckets) of each cluster, based on their average
#'
#' @param data the matrix including the integrations of the areas defined by the buckets (columns)
#' on each spectrum (rows)
#' @param clustObj a list generated by the \code{getClusters} function
#' @param onlycluster boolean - specifies if the merged data matrix at output must only contain 
#' the merged clusters (TRUE) or if it must also contain the buckets that are not included within a 
#' cluster (FALSE)
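#' @examples
#'  \donttest{
#'   # A sketch reusing the processing steps shown in the getClusters example:
#'   data_dir <- system.file("extra", package = "Rnmr1D")
#'   cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#'   samplefile <- file.path(data_dir, "Samples.txt")
#'   out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile, 
#'                                 samplefile=samplefile, ncpu=2)
#'   outMat <- getBucketsDataset(out, norm_meth='CSN')
#'   clustcorr <- getClusters(outMat, method='corr', cval=0, dC=0.003, ncpu=2)
#'   outMatMerged <- getMergedDataset(outMat, clustcorr, onlycluster=FALSE)
#'  }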
getMergedDataset <- function(data, clustObj, onlycluster=FALSE)
{
   association <- clustObj$clustertab
   matrix <- data
   if ("matrix" %in% class(association) && dim(association)[1]>1) {
      Clusters <- sort(unique(association[,2]))
      M <- simplify2array(lapply( 1:length(Clusters), function(x) { apply( matrix[, colnames(matrix) %in% association[association[,2] == Clusters[x],1] ], 1, mean ) } ))
      colnames(M) <- Clusters
      if (!onlycluster)
         M <- cbind(M, matrix[, !(colnames(matrix) %in% association[,1])])
      matrix <- M
   }
   matrix
}

## ---- Graphic Functions ----

##########################################################################
## plot.Criterion
##########################################################################

#' plotCriterion
#'
#' Plots the curves that show the number of clusters, the number of clustered buckets and the 
#' size of the biggest cluster versus the criterion, namely the correlation threshold for the 
#' 'corr' method or the cutting value for the 'hca' method.
#'
#' @param clustObj a list generated by the \code{getClusters} function
#' @param reverse  Boolean - indicates if the x-axis must be reversed (TRUE) or not (FALSE)
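#' @examples
#'  \donttest{
#'   # A sketch reusing the processing steps shown in the getClusters example:
#'   data_dir <- system.file("extra", package = "Rnmr1D")
#'   cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#'   samplefile <- file.path(data_dir, "Samples.txt")
#'   out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile, 
#'                                 samplefile=samplefile, ncpu=2)
#'   outMat <- getBucketsDataset(out, norm_meth='CSN')
#'   clustcorr <- getClusters(outMat, method='corr', cval=0, dC=0.003, ncpu=2)
#'   plotCriterion(clustcorr, reverse=TRUE)
#'  }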
plotCriterion <- function(clustObj, reverse=FALSE)
{
    Vstats <- clustObj$vstats
    Vcrit <- clustObj$vcrit
    n <- clustObj$indxopt
    params <- clustObj$params
    MaxSize <- params$MAXCSIZE
    method <- params$method
    association <- clustObj$clustertab
    lclust <- unique(sort(association[,2]))

   # Vstats plots
     legNames <- c( colnames(Vstats)[3], colnames(Vstats)[2], colnames(Vstats)[4])
     graphics::par(mar = c(5, 4, 4, 4) + 0.3)  # Leave space for z axis
     if( reverse==TRUE ) {
         xlim <- c(max(Vstats[,1]), min(Vstats[,1]))
     } else {
         xlim <- c(min(Vstats[,1]), max(Vstats[,1]))
     }
     main <- sprintf("%s method, Criterion = %4.3f,  Nb Clust = %d",toupper(method),Vcrit, length(lclust))
     xlab <- "Criterion"
     ylab <- "Number of variables / Clust Max Size"
     graphics::plot(Vstats[,1],Vstats[,3], xlim=xlim, ylim=c(0,1.2*max(Vstats[,3])), type="l", col=colors[1], xlab=xlab, ylab=ylab, main=main)
     if( method=='corr' ) {
         cval_lim <- c( max(params$CVAL-params$dC,0.9), min(params$CVAL+params$dC,1) )
         graphics::rect(cval_lim[1], -100, cval_lim[2], 1.5*max(Vstats[,3]), border = NA, col="azure2" ) # darkslategray2
         graphics::par(new=TRUE)
         graphics::plot(Vstats[,1],Vstats[,3], xlim=xlim, ylim=c(0,1.2*max(Vstats[,3])), type="l", col=colors[1], xlab=xlab, ylab=ylab, main=main)
     }
     graphics::lines(Vstats[,1],Vstats[,4], col=colors[3])
     graphics::abline(v=Vcrit, col="red")
     graphics::abline(h=MaxSize, col="green")
     if( method=='hca' ) {
          slines <- seq(from=params$Vcut_min, to=params$Vcut_max, by=0.025)
     } else { 
          slines <- seq(from=params$CVAL_MIN, to=params$CVAL_MAX, by=0.0025)
     }
     graphics::abline(v = slines, col = "lightgray", lty = 3)
     graphics::legend("top", legNames, col=colors[c(1:length(legNames))], lwd=2, lty=1, cex=0.6, horiz=TRUE)
     graphics::par(new=TRUE)
     graphics::plot(Vstats[,1],Vstats[,2], xlim=xlim, type="l", col=colors[2], xaxt="n", yaxt="n", xlab="",ylab="" )
     graphics::axis(side=4, at = pretty(range(Vstats[,2])))
     graphics::mtext("Number of Clusters", side=4, line=3)
}

##########################################################################
## plot.Clusters
##########################################################################

#' plotClusters
#'
#' Plots the boxplot of all clusters, giving insight into the distribution of each cluster
#' 
#' @param data the matrix including the integrations of the areas defined by the buckets (columns)
#' on each spectrum (rows)
#' @param clustObj a list generated by the \code{getClusters} function
#' @param horiz Boolean - Indicates if the plot is horizontal (TRUE) or vertical (FALSE)
#' @param main Main title of the plot
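#' @examples
#'  \donttest{
#'   # A sketch reusing the processing steps shown in the getClusters example:
#'   data_dir <- system.file("extra", package = "Rnmr1D")
#'   cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#'   samplefile <- file.path(data_dir, "Samples.txt")
#'   out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile, 
#'                                 samplefile=samplefile, ncpu=2)
#'   outMat <- getBucketsDataset(out, norm_meth='CSN')
#'   clustcorr <- getClusters(outMat, method='corr', cval=0, dC=0.003, ncpu=2)
#'   plotClusters(outMat, clustcorr, horiz=TRUE)
#'  }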
plotClusters <- function(data, clustObj, horiz=TRUE, main="Boxplot by clusters (log10 transformed)")
{
     X <- getMergedDataset(data, clustObj, onlycluster=T)
     CLUST <- colnames(X)
     X[X<=0]<-0.00001*max(X)   # floor non-positive values before the log10 transform
     cent <- log10(t(X))

     Rank <- simplify2array(lapply(c(1:length(CLUST)), function(x) { round(mean(cent[x, ], na.rm=T))  }))
     cols <- c('red', 'orange', 'darkgreen', 'blue', 'purple')
     Rank <- Rank  - min(Rank) + 1
     nbrank <- max(Rank) - min(Rank) + 1

     # Draw the boxplot once to set up the axes, overlay light gridlines,
     # then redraw the boxplot on top of them
     graphics::boxplot(t(cent),cex.axis=0.5, main=main,
          ylab='\n\n\n\n\nClusters',xlab='Intensity (log10)', outline=F, horizontal=horiz, border=cols[Rank], las=2, cex.axis=0.5)
     graphics::abline(h=c(1:length(CLUST)), col="gray95")
     graphics::par(new=TRUE)
     graphics::boxplot(t(cent),cex.axis=0.5, main=main,
          ylab='\n\n\n\n\nClusters',xlab='Intensity (log10)', outline=F, horizontal=horiz, border=cols[Rank], las=2, cex.axis=0.5)
}

##########################################################################
## plot.Scores
##########################################################################
## Define the 'plotScores' function for plotting PCA scores.
##  Input : data [N,PC] - N samples (rows), PC (columns)
##          samples - matrix of samples generated by the Rnmr1D function
##          factor - name (string) of the factor ; must be present as a column within the samples matrix
##          level - confidence level for plotting the corresponding ellipse
##########################################################################

#' plotScores
#'
#' Plots the two components defined by pc1, pc2 of the matrix of scores coming from a 
#' multivariable analysis, typically a Principal Component Analysis (PCA).
#'
#' @param data the matrix of scores coming from a multivariable analysis, typically a Principal Component Analysis (PCA)
#' @param pc1 the first component of the matrix of scores to be plotted.
#' @param pc2 the second component of the matrix of scores to be plotted.
#' @param samples the samples matrix with the correspondence of the raw spectra, 
#' as well as the levels of the experimental factors if specified in the input. 
#' See \code{doProcessing} or \code{generateMetadata}
#' @param factor if not null, the name of one of the columns defining the factorial groups in the samples matrix at input
#' @param cexlabel number indicating the amount by which plotting text and symbols should be scaled relative to the default. 
#' @param level confidence level for plotting the corresponding ellipse
#' @param xlim gives the limit to be plotted of the first component
#' @param ylim gives the limit to be plotted of the second component
#' @param col colors vector for ellipses - automatically defined by default
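#' @examples
#'  \donttest{
#'   # A sketch reusing the processing steps shown in the getClusters example;
#'   # computing the PCA with stats::prcomp is an assumption here, not the
#'   # only possibility. Set 'factor' to a column name of out$samples to draw
#'   # the group ellipses.
#'   data_dir <- system.file("extra", package = "Rnmr1D")
#'   cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#'   samplefile <- file.path(data_dir, "Samples.txt")
#'   out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile, 
#'                                 samplefile=samplefile, ncpu=2)
#'   outMat <- getBucketsDataset(out, norm_meth='CSN')
#'   pca <- stats::prcomp(outMat, scale.=TRUE)
#'   plotScores(pca$x, 1, 2, out$samples, factor=NULL)
#'  }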
plotScores <- function(data, pc1, pc2, samples, factor=NULL, cexlabel=1.2, level=0.95, xlim=NULL, ylim=NULL, col=NULL) {
    if (is.null(xlim)) xlim=1.5*c(min(data[,pc1]),max(data[,pc1]))
    if (is.null(ylim)) ylim=1.5*c(min(data[,pc2]),max(data[,pc2]))
    # Plot only the two requested components (plotting the whole matrix would
    # always show the first two columns, whatever pc1/pc2 are)
    graphics::plot(data[,c(pc1,pc2)], pch=20, adj=0, lwd=2, main="", xlim=xlim, ylim=ylim )
    graphics::text( adj=0, data[,pc1], data[,pc2], rownames(data), pos=4, cex=cexlabel )
    if (! is.null(factor)) {
       G <- unique(samples[ , factor ])
       if (is.null(col)) {
           grcols=grDevices::rainbow(length(G), s=0.9, v=0.8)
       } else {
           grcols=col
       }
       for(i in 1:length(G)) {
           XY <- data[rownames(data) %in% samples$Samplecode[samples[ , factor ]==G[i]], ]
           plotEllipse( XY[,pc1], XY[,pc2], level=level, col=grcols[i], lty=graphics::par()$lty, lwd=graphics::par()$lwd )
       }
       graphics::title(factor)
    }
}

##########################################################################
## plot.loadings
##########################################################################
## Loadings plot : Input : data [N,PC] N variables (rows), PC (columns)
## 
## use with associations file: plot of ellipses corresponding to each cluster
## 
## 
##########################################################################

#' plotLoadings
#'
#' Plots the two components defined by pc1, pc2 of the matrix of variable loadings coming from a 
#' multivariable analysis, typically a Principal Component Analysis (PCA).
#' It can also plot the ellipses corresponding to each cluster defined by the associations matrix 
#' if not null (in fact, this is the main interest of this function).
#'
#' @param data the matrix of variable loadings coming from a multivariable analysis, typically a Principal Component Analysis (PCA)
#' @param pc1 the first component of the matrix of variable loadings to be plotted.
#' @param pc2 the second component of the matrix of variable loadings to be plotted.
#' @param associations the associations matrix that gives for each cluster (column 2) the corresponding buckets (column 1) 
#' @param main Change the default plot title on the right corner
#' @param xlimu gives the limit to be plotted of the first component
#' @param ylimu gives the limit to be plotted of the second component
#' @param cexlabel number indicating the amount by which plotting text and symbols should be scaled relative to the default. 
#' @param pch Plotting Symbols
#' @param ellipse boolean - specifies if ellipses are plot or not for each cluster
#' @param level confidence level for plotting the ellipses
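#' @examples
#'  \donttest{
#'   # A sketch reusing the objects built in the plotScores and getClusters
#'   # examples ('pca', 'clustcorr'); the ellipses are drawn from the first
#'   # two columns of the cluster association table:
#'   data_dir <- system.file("extra", package = "Rnmr1D")
#'   cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#'   samplefile <- file.path(data_dir, "Samples.txt")
#'   out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile, 
#'                                 samplefile=samplefile, ncpu=2)
#'   outMat <- getBucketsDataset(out, norm_meth='CSN')
#'   clustcorr <- getClusters(outMat, method='corr', cval=0, dC=0.003, ncpu=2)
#'   pca <- stats::prcomp(outMat, scale.=TRUE)
#'   plotLoadings(pca$rotation, 1, 2, associations=clustcorr$clustertab[, c(1,2)])
#'  }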
plotLoadings <- function (data, pc1, pc2, associations=NULL, main="Loadings", 
                    xlimu=c(min(data[,pc1]),max(data[,pc1])), ylimu=c(min(data[,pc2]),max(data[,pc2])), cexlabel=1, pch=20, ellipse=TRUE, level=0.8)
{
   V <- t(data[,c(pc1,pc2)])
   colnames(V) <- rownames(data)
   graphics::plot(t(V),pch=20, adj=1, lwd=1, main=main,
            xlab= colnames(data)[pc1], ylab= colnames(data)[pc2], 
            xlim=xlimu, ylim=ylimu, col='red')
   graphics::abline(h=0,v=0)
   #plotEllipse( data[, pc1], data[, pc2], center=c(0,0), level=0.666, col="red", lty=3, lwd=0.1, type="l")
   # Use Associations (assoc1=T)
   if ("matrix" %in% class(associations) && dim(associations)[1]>1) {
       graphics::points(t(V), pch=pch, col="grey")
       graphics::text( adj=0, t(V)[,1], t(V)[,2], col="lightgrey", colnames(V), cex=cexlabel )
       colnames(V) <- as.vector(sapply(rownames(data),function(x) { ifelse(sum( associations[,1] %in% x), associations[,2][associations[,1] %in%  x ], x) }))
       P <- V[,colnames(V) %in% associations[,2] ]
       cols <- "red"
       Clusters <- unique(associations[order(associations[,1],decreasing=T),2])
       if (length(Clusters)<length(colnames(P))) {
          clcols=grDevices::rainbow(length(Clusters), s=0.9, v=0.8)
          for (i in 1:length(Clusters)) {
             XY <- t(P[,colnames(P)==Clusters[i]])
             M<-c(mean(XY[,1]),mean(XY[,2]))
             graphics::points(XY, pch=pch, col=clcols[i])
             if (dim(XY)[1]>1 && ellipse) plotEllipse( XY[,1], XY[,2], center=M, level= level, col=clcols[i], lty=graphics::par()$lty, lwd=graphics::par()$lwd )
             graphics::text(adj=0, M[1], M[2], Clusters[i], col="black", cex=cexlabel, font=2)
          }
       }
       else {
           clcols=grDevices::rainbow(length(Clusters), s=0.9, v=0.8)
          cols <- NULL; for (i in 1:length(colnames(P))) cols <- c(cols,  clcols[Clusters == colnames(P)[i]])
          graphics::points(t(P), pch=pch, col="red")
          graphics::text( adj=0, t(P)[,1], t(P)[,2], col=cols, colnames(P), cex=cexlabel )
       }
   }
   else {
       graphics::text( adj=0, t(V)[,1], t(V)[,2], col="cornflowerblue", colnames(V), cex=cexlabel )
   }
}

##########################################################################
## plot.ellipse
##########################################################################
## Compute confidence ellipses.
##  x and y variables for drawing.
##  level confidence level used to construct the ellipses. By default, 0.95.
##  npoint number of points used to draw the ellipses.
##  bary logical value. If TRUE, the coordinates of the ellipse around the barycentre of individuals are calculated.
## See https://github.com/kassambara/ggpubr/blob/master/R/stat_conf_ellipse.R
##
## Note : the size of the ellipses is related to the level of confidence chosen p: 
## p = 1 - exp(-cellipse^2 / 2) => cellipse <- sqrt(qchisq(p, 2))
## With a confidence level at 66.66% (2/3), we have cellipse = 1.5
## See http://www.visiondummy.com/2014/04/draw-error-ellipse-representing-covariance-matrix/
##     http://www.earth-time.org/projects/upb/public_docs/ErrorEllipses.pdf
##########################################################################

plotEllipse <- function ( x, y, center=NULL, level = 0.95, npoint = 100, bary = FALSE, ... )
{
  .ellipse <- function(x, scale = c(1, 1), centre = c(0, 0), t = 1.5, which = c(1,2), npoints = 100) {
    names <- c("x", "y")
    if (is.matrix(x)) {
      xind <- which[1]
      yind <- which[2]
      r <- x[xind, yind]
      if (missing(scale)) {
        scale <- sqrt(c(x[xind, xind], x[yind, yind]))
        if (scale[1] > 0)
          r <- r/scale[1]
        if (scale[2] > 0)
          r <- r/scale[2]
      }
      if (!is.null(dimnames(x)[[1]]))
        names <- dimnames(x)[[1]][c(xind, yind)]
    }
    else r <- x
    r <- min(max(r, -1), 1)
    d <- acos(r)
    a <- seq(0, 2 * pi, len = npoints)
    matrix(c(t*scale[1]*cos(a + d/2) + centre[1], t*scale[2]*cos(a - d/2) + centre[2]), npoints, 2, dimnames = list(NULL, names))
  }


  if (is.null(center)) center <- c(mean(x, na.rm = TRUE), mean(y, na.rm = TRUE))
  if (length(as.vector(x))>2) {
     tab <- data.frame(x = x, y = y)
     mat.cov <- stats::cov(tab)
     t = sqrt(stats::qchisq(level, 2))
     if (bary) mat.cov = mat.cov/nrow(tab)
     res <- .ellipse(mat.cov, centre = center, t=t, npoints = npoint)
     graphics::lines(res, ... )
  } else {
     graphics::lines(cbind(x, y), ... )
  }
}
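
## Illustrative sketch (not executed): a 95% confidence ellipse around a
## made-up bivariate cloud of points:
##   x <- stats::rnorm(50); y <- 0.5*x + stats::rnorm(50, sd=0.5)
##   graphics::plot(x, y, pch=20)
##   plotEllipse(x, y, level=0.95, col="red")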