# ID StatTools.R
# Copyright (C) 2017-2020 INRAE
# Authors: D. Jacob
#
trim<-function(x) gsub("^\\s+|\\s+$", "", x)
colors <- c("red", "cornflowerblue", "darkgreen", "blueviolet", "orange",
"magenta", "darkred", "coral", "mediumvioletred", "yellow4",
"seagreen2", "lightskyblue", "darkcyan", "yellowgreen", "limegreen",
"wheat2", "yellow4", "violetred1", "darkorange", "cyan4")
# Global variables initialized to NULL (presumably to avoid R CMD check notes
# about undefined global variables)
matrix <- NULL
factors <- NULL
association <- NULL
options(warn = -1)
options(width=1024)
# Replace zero values with small positive random values so that subsequent
# log transformations are defined
replace_zero <- function(data)
{
nbzero <- length(data[data==0])
minval <- min(data[data>0])
if (nbzero>0) data[data==0]<- abs( stats::rnorm( nbzero, 0.1*minval, 0.01*minval ) )
data
}
########################################################################################
## Scaling ##
########################################################################################
## Aim: Data scaling
##
## Settings:
## - matrix: a matrix with the variables in columns and the samples in rows.
## The first column contains the sample names
## - the "log" value: depending on the data, a logarithmic transformation can be applied.
## If log=0: no logarithmic transformation
## If log=2: data are transformed with log2
## If log=10: data are transformed with log10
## If log=11: data are transformed with log10 after incrementing (+1)
## - Scaling methods: several methods can be applied to a data set. In this application, we put forward:
## Centering
## Zscore
## Pareto
## Vast
## Range
## Level
## L1: each row normalized by its sum (L1 norm)
## L2: each row normalized by its Euclidean norm (L2 norm)
## NULL: no scaling
##
########################################################################################
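##
## Example (not run) -- a minimal sketch, assuming 'M' is a numeric matrix
## (hypothetical object) with samples in rows and buckets in columns:
##    Ms <- .getScaling(M, log=2, methods="Zscore")
##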
.getScaling <- function(matrix, log, methods=c("Centering","Zscore","Pareto","Vast","Range","Level","L1","L2","NULL"))
{
## LOG Transformation
if (log==2) {
matrix <- replace_zero(matrix)
matrix<-log2((matrix))
cat('transformation to log2\n')
}
if (log==10) {
matrix <- replace_zero(matrix)
matrix<-log10(matrix)
cat('transformation to log10\n')
}
if (log==11) {
matrix<-log10(matrix + 1)
cat('Transformation to log10 after incrementing (+1)\n')
}
## Column-wise
if ("Centering" %in% methods) { matrix<-scale(matrix,center=T,scale=F) }
if ("Zscore" %in% methods) { matrix<-scale(matrix,center=T,scale=T) }
if ("Pareto" %in% methods) { matrix<-apply(t(matrix), 1,function(x){(x-mean(x))/sqrt(stats::sd(x))}) }
if ("Vast" %in% methods) { matrix<-apply(t(matrix), 1,function(x){(x-mean(x))*mean(x)/(stats::sd(x)*stats::sd(x))}) }
if ("Range" %in% methods) { matrix<-apply(t(matrix), 1,function(x){(x-mean(x))/(max(x)-min(x))}) }
if ("Level" %in% methods) { matrix<-apply(t(matrix), 1,function(x){(x-mean(x))/stats::median(x)}) }
## Row-wise
if ("L1" %in% methods) { matrix<-scale(matrix,center=T,scale=F)
matrix<-t(apply(matrix, 1,function(x){x/sum(x)})) }
if ("L2" %in% methods) { matrix<-scale(matrix,center=T,scale=F)
matrix<-t(apply(matrix, 1,function(x){x/sqrt(sum(x*x))})) }
return(matrix)
}
#====================================================================#
# Bucket Clustering
#====================================================================#
#' getClusters
#'
#' From the data matrix generated by integrating all bucket zones (columns) for each
#' spectrum (rows), we can take advantage of the concentration variability of each compound
#' across a series of samples by clustering the buckets based on the significant correlations
#' that link them together. The clustering relies either on a lower threshold applied to the
#' correlations, or on a cutting value applied to the hierarchical tree of the variables
#' (buckets) generated by a Hierarchical Clustering Analysis (HCA).
#'
#' If intelligent bucketing was chosen at the bucketing step (see above), each bucket
#' exactly matches one resonance peak. Thanks to this, the buckets have a strong chemical
#' meaning, since the resonance peaks are the fingerprints of chemical compounds. However,
#' several resonance peaks are generally required to assign a chemical compound in
#' 1D 1H-NMR metabolic profiling. To generate relevant clusters (i.e. clusters possibly
#' matching chemical compounds), two approaches have been implemented:
#' \itemize{
#' \item Bucket Clustering based on a lower threshold applied on correlations
#' \itemize{
#' \item In this approach an appropriate correlation threshold is applied on the correlation
#' matrix before its cluster decomposition. Moreover, an improvement can be obtained by searching
#' for a trade-off within a tolerance interval of the correlation threshold: from a fixed
#' correlation threshold (cval), the clustering is calculated for the three values (cval-dC, cval,
#' cval+dC), where dC is the tolerance interval of the correlation threshold. From these three
#' sets of clusters, a merger is established according to the following rules: 1) if a large
#' cluster is broken, the two resulting clusters are kept; 2) if a small cluster disappears, the
#' initial cluster is conserved. Generally, a correlation threshold interval between
#' 0.002 and 0.01 gives a good trade-off.
#' }
#' \item Bucket Clustering based on a hierarchical tree of the variables (buckets) generated
#' by an Hierarchical Clustering Analysis (HCA)
#' \itemize{
#' \item In this approach a Hierarchical Clustering Analysis (HCA, \code{\link[stats]{hclust}})
#' is applied on the data after computing a distance matrix ("euclidean" by default). Then, the
#' tree resulting from \code{\link[stats]{hclust}} is cut (\code{\link[stats]{cutree}})
#' into several groups by specifying the cut height(s). To find the best cut value, the cut height
#' is chosen i) by testing several values equally spaced within a given range of the cut height,
#' then ii) by keeping the one that yields the most clusters while including the most bucket
#' variables. Otherwise, a cut value has to be specified by the user (vcutusr).
#' }
#' }
#'
#'
#' @param data the matrix including the integrations of the areas defined by the buckets (columns)
#' on each spectrum (rows)
#' @param method Clustering method of the buckets. Either 'corr' for 'correlation' or 'hca' for
#' 'hierarchical clustering analysis'.
#' @param ... Depending on the chosen method:
#' \itemize{
#' \item \code{corr} : cval, dC, ncpu
#' \item \code{hca} : vcutusr
#' }
#' @return
#' \code{getClusters} returns a list containing the following components:
#' \itemize{
#' \item \code{vstats} Statistics that served to find the best value of the criterion (matrix)
#' \item \code{clusters} List of the ppm values corresponding to each cluster; the length of the list is equal to the number of clusters
#' \item \code{clustertab} the associations matrix that gives for each cluster (column 2) the corresponding buckets (column 1)
#' \item \code{params} List of parameters related to the chosen method for which the clustering was performed.
#' \item \code{vcrit} Value of the (best/user) criterion, i.e. the correlation threshold for the 'corr' method or the cut value for the 'hca' method.
#' \item \code{indxopt} Index value within the vstats matrix corresponding to the criterion value (vcrit)
#' }
#'
#' @references{
#' Jacob D., Deborde C. and Moing A. (2013) An efficient spectra processing method for metabolite identification from 1H-NMR metabolomics data.
#' Analytical and Bioanalytical Chemistry 405(15) 5049-5061 doi: 10.1007/s00216-013-6852-y
#' }
#'
#' @examples
#' \donttest{
#' data_dir <- system.file("extra", package = "Rnmr1D")
#' cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#' samplefile <- file.path(data_dir, "Samples.txt")
#' out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile,
#' samplefile=samplefile, ncpu=2)
#' outMat <- getBucketsDataset(out, norm_meth='CSN')
#' clustcorr <- getClusters(outMat, method='corr', cval=0, dC=0.003, ncpu=2)
#' clusthca <- getClusters(outMat, method='hca', vcutusr=0)
#' }
getClusters <- function(data, method='hca', ... )
{
out <- NULL
if (method=='hca') out <- get_Clusters_hca(data, ... )
if (method=='corr') out <- get_Clusters_corr(data, ... )
out
}
# Bucket Clustering - Find the optimal Cutting Tree Value
get_Clusters_hca <- function(data, vcutusr=0, scalemeth='Zscore', log=0, distmeth='euclidean', hcmeth='complete', maxcsize=40, mincsize=2, bucchar='B')
{
params <- list (
method='hca',
# Scaling params
scalemeth = scalemeth,
log = log,
# Bucket variables
BUCCHAR = bucchar,
# Clustering
DISTMETH = distmeth,
HCMETH = hcmeth,
MAXCSIZE = maxcsize,
MINCSIZE = mincsize,
# the range of 'Vcut' for getting some statistics
Vcut_min=0.1,
Vcut_max=0.9,
Vstep=0.005,
vcutusr=vcutusr
)
# Get the association table: Variables <=> ClusterID,
# by applying the cutting tree threshold 'vcut' on the hclust object 'hc'
get_VarSet <- function(hc, vcut)
{
hcut <- vcut*( max(hc$height) + min(hc$height) )
CT <- stats::cutree(hc, h=hcut)
i <- 1
V <- NULL
for (k in 1:length(unique(CT))) {
if (length(CT[CT==k])<params$MINCSIZE) next
CTK <- CT[CT==k]
CTK[] <- i
V <- c(V, CTK)
i <- i + 1
}
VarSet <- cbind( names(V), simplify2array(lapply( as.vector(V), function(x) { paste('C',x,sep=''); })) )
return(VarSet)
}
matrix <- .getScaling(data,log=params$log, methods=params$scalemeth)
VC <- 1
CC <- 2
Vcut_min=params$Vcut_min
Vcut_max=params$Vcut_max
Vstep=params$Vstep
# Variables Classification
x <- as.matrix(matrix)
# Compute the distance matrix (see ?dist) and the Hierarchical clustering (see ?hclust)
dist.c <- stats::dist(t(x), method=params$DISTMETH)
hc.c <- stats::hclust(dist.c, method=params$HCMETH)
# Init
Vstats <- NULL
Vcut <- seq(from=Vcut_min, to=Vcut_max, by=Vstep)
# Loop on the range
for( k in 1:length(Vcut) ) {
# Init
Cluster_Count <- 0
Cluster_VarCount <- 0
Cluster_MatchVarCount <- 0
Cluster_MaxSize <- 0
Cluster_Matching <- 0
VarSet <- get_VarSet(hc.c, Vcut[k])
Clusters <- as.character(unique(VarSet[,2]))
# For each cluster ...
for (i in 1:length(Clusters)) {
CL <- Clusters[i]
Cluster_Count <- Cluster_Count + 1
# get the peaklist
ppmlst <- VarSet[VarSet[,CC] == CL, VC]
Cluster_VarCount <- Cluster_VarCount + length(ppmlst)
if (Cluster_MaxSize < length(ppmlst) ) Cluster_MaxSize <- length(ppmlst)
}
Vstats <- rbind( Vstats, c(Vcut[k], Cluster_Count, Cluster_VarCount, Cluster_MaxSize ) )
}
colnames(Vstats) <- c("Vcut", "Nb Clusters", "Nb Vars", "MaxSize")
# Compute the optimal value for Vcut
# -- find the set of Vcut values such as the cluster count is max
Vsub <- Vstats[ Vstats[,4]<params$MAXCSIZE, ]
V <- Vsub[Vsub[,2] == max(Vsub[,2]),1]
# -- Keep the Vcut value such as the variable count is max
VcutOpt <- V[which(Vstats[Vstats[,1] %in% V,3]==max(Vstats[Vstats[,1] %in% V,3]))][1]
Vcut <- ifelse(params$vcutusr==0, VcutOpt, params$vcutusr)
indx <- round(Vcut/Vstep) - round(Vcut_min/Vstep) + 1
# Get the association table: Variables <=> ClusterID
association <- get_VarSet(hc.c, Vcut)
association <- cbind( association, gsub(params$BUCCHAR, "", gsub("_", ".", association[,VC])) )
lclust <- unique(sort(association[,2]))
clusters <- list()
for(i in 1:length(lclust)) {
CL <- lclust[i]
clusters[[CL]] <- as.numeric(association[association[,2]==CL, ][,3])
}
cat("#-- Clustering --\n")
cat('# Distance Method:',params$DISTMETH,"\n")
cat('# Agglomeration Method:',params$HCMETH,"\n")
cat('# Cutting Tree threshold:',Vcut,"\n")
cat('# Nb Clusters:',length(lclust),"\n")
cat("#\n")
return( list(vstats=Vstats, clusters=clusters, clustertab=association, params=params, vcrit=Vcut, indxopt=indx) )
}
# Bucket Clustering based on a lower threshold applied on correlations
get_Clusters_corr <- function(data, scalemeth='Zscore', log=0, cmeth='pearson', cval=0, dC=0.01, maxcsize=40, mincsize=2, bucchar='B', stats_comp=T, ncpu=1)
{
params <- list (
method='corr',
# Scaling params
scalemeth = scalemeth,
log = log,
# Bucket variables
BUCCHAR = bucchar,
# Clustering
CMETH = cmeth,
CVAL = cval,
dC = dC,
MINCSIZE = mincsize,
MAXCSIZE = maxcsize,
# get some statistics
VSTATS_CAL=stats_comp,
NCPU=ncpu,
# the range of 'Cval' for getting some statistics
CVAL_MIN=0.9,
CVAL_MAX=0.999,
CVAL_STEP=0.001
)
matrix <- .getScaling(data,log=params$log, methods=params$scalemeth)
#---- CLUSTERING -----
# Method in ("pearson", "kendall", "spearman")
cor_mat <- stats::cor(matrix,method=params$CMETH)
cor_mat[ lower.tri(cor_mat, diag=TRUE) ] <- 0
vstats <- NULL
indx <- 0
if (params$VSTATS_CAL || params$CVAL==0) {
cvals <- seq(from=params$CVAL_MIN, to=params$CVAL_MAX, by=params$CVAL_STEP)
vstats <- estime_cval(cor_mat, cvals, params$NCPU)
if (params$CVAL==0) {
sub1 <- which( vstats[,4]<params$MAXCSIZE )
sub2 <- which ( vstats[ sub1, 2 ]==max(vstats[ sub1, 2 ] ) ) + sub1[1] - 1
indx <- sub2[ which(vstats[sub2,4]==min(vstats[sub2,4]))[1] ]
params$CVAL <- vstats[indx, 1]
} else {
indx <- round(params$CVAL/params$CVAL_STEP) - round(params$CVAL_MIN/params$CVAL_STEP) + 1
}
}
cl <- parallel::makeCluster(min(params$NCPU,3))
doParallel::registerDoParallel(cl)
NVARS <- length(colnames(matrix))
cvalset <- c(params$CVAL-params$dC,params$CVAL,params$CVAL+params$dC)
k<-0
MT <- foreach::foreach( k=1:length(cvalset), .combine=cbind, .packages = c("igraph", "doParallel")) %dopar% {
cval <- cvalset[k]
cor_mat_cval <- cor_mat
cor_mat_cval[ cor_mat_cval < cval] <- 0
M<-cbind(colnames(matrix),c(1:NVARS)*0)
graph <- graph.adjacency(cor_mat_cval>cval, weighted=TRUE, mode="upper")
E(graph)$weight<-t(cor_mat_cval)[t(cor_mat_cval)>cval]
V(graph)$label<- V(graph)$name
# sizes of the connected components of the correlation graph
cliques <- sapply(decompose.graph(graph), vcount)
ordcli <- order(cliques,decreasing = T)
g<-0
for (i in 1:length(ordcli)) {
ind<-ordcli[i]
if (cliques[ind]>=params$MINCSIZE) {
g <- g + 1
subg <- decompose.graph(graph)[[ind]]
VARS<-V(subg)$name
M[M[,1] %in% VARS,2] <- sprintf("C%d",g)
}
}
M[,2]
}
parallel::stopCluster(cl)
M <- cbind(colnames(matrix),MT)
NBCLUST <- apply( apply( gsub('C','', MT), 2 , as.numeric), 2, max )
MC <- NULL
for (i in 1:NVARS) {
if (is.na(sum(as.numeric(M[i,-1])))) MC <- rbind(MC,M[i,])
}
varnames <- as.vector(lapply(cvalset, function(x) sprintf('V_%s',x)))
varnames <- c('ppm',varnames)
colnames(MC) <- varnames
# Synthesis
for (s in c(1,2)) {
for (k in 1:NBCLUST[s]) {
V <- MC[MC[,1+s]==sprintf("C%d",k),c(1,1+s,2+s)]
i<-1
while(i<dim(V)[1]) {
if (V[i,3] == "0") {
n<-1
while(i<dim(V)[1] && V[i+1,2]==V[i,2] && V[i+1,3] == "0") { i<-i+1; n<-n+1; }
# Rule 1
if (n==1) {
if (i>1 && V[i-1,2]==V[i,2]) V[i,3] <- V[i-1,3]
else if (i<dim(V)[1] && V[i+1,2]==V[i,2]) V[i,3] <- V[i+1,3];
}
# Rule 2
if (n>1) {
NBCLUST[1+s] <- NBCLUST[1+s] + 1
for (j in 1:n) V[i-n+j,3] <- sprintf("C%d",NBCLUST[1+s])
}
}
i<-i+1
if (i==dim(V)[1] && V[i,3] == "0" && V[i-1,2]==V[i,2]) V[i,3] <- V[i-1,3] # a special case
}
MC[MC[,1] %in% V[,1],2+s] <- V[,3]
}
}
#association <- MC[,c(1,4)]
# Filters out non-significant clusters
CLUST <- unique(MC[,4])
association <- NULL
for (cl in 1:length(CLUST)) {
VARS <- MC[MC[,4] == CLUST[cl],1]
M <- matrix[ , VARS ]
cor_mat <- stats::cor(M,method=params$CMETH)
cor_mat[ lower.tri(cor_mat, diag=TRUE) ]<- 0
if ( stats::median(cor_mat[ cor_mat!=0 ]) > (params$CVAL-params$dC) )
association <- rbind(association, as.matrix(MC[MC[,4] == CLUST[cl], c(1,4)], ncol=2, byrow=T))
}
association <- association[association[,2] != "0",]
association <- cbind( association, gsub(params$BUCCHAR, "", gsub("_", ".", association[,1])) )
colnames(association) <- c("VAR","CLID", "PPM")
lclust <- unique(sort(association[,2]))
clusters <- list()
for(i in 1:length(lclust)) {
CL <- lclust[i]
clusters[[CL]] <- as.numeric(association[association[,2]==CL, ][,3])
}
cat("#-- Clustering --\n")
cat('# Correlation Method:',params$CMETH,"\n")
cat('# Correlation Threshold:',params$CVAL,"\n")
cat('# Correlation Tolerance:',params$dC,"\n")
cat('# Nb Clusters:',length(lclust),"\n")
cat("#\n")
return( list( vstats=vstats, clusters=clusters, clustertab=association, params=params, vcrit=params$CVAL, indxopt=indx ) )
}
# For each candidate correlation threshold in 'cvals', compute clustering statistics
# (number of clusters, number of clustered buckets, biggest cluster size, criterion)
estime_cval <- function(cor_mat, cvals, ncpu)
{
compute_crit <- function(cor_mat, cval) {
cor_mati <- cor_mat
cor_mati[ cor_mati < cval] <- 0
graph <- graph.adjacency(cor_mati>cval, weighted=TRUE, mode="upper")
E(graph)$weight<-t(cor_mati)[t(cor_mati)>cval]
V(graph)$label<- V(graph)$name
# sizes of the connected components of the correlation graph
cliques <- sapply(decompose.graph(graph), vcount)
ordcli <- order(cliques,decreasing = T)
nb_clusters<-0
nb_clusters_2<-0
nb_buckets<-0
for (i in 1:length(ordcli)) {
if (cliques[ordcli[i]]>=2) {
nb_clusters <- nb_clusters + 1
nb_buckets <- nb_buckets + cliques[ordcli[i]]
}
if (cliques[ordcli[i]]==2)
nb_clusters_2 <- nb_clusters_2 + 1
}
size_max <- cliques[ordcli[1]]
CRIT<-size_max/nb_clusters
c(cval,nb_clusters,nb_buckets,size_max,nb_clusters_2,-20*log10(CRIT))
}
cl <- parallel::makeCluster(ncpu)
doParallel::registerDoParallel(cl)
i<-0
vstats <- foreach::foreach(i=1:length(cvals), .combine=rbind, .packages = c("igraph", "doParallel")) %dopar% { compute_crit(cor_mat, cvals[i]); }
parallel::stopCluster(cl)
colnames(vstats) <- c("Cval","Nb Clusters","Nb Vars","Max Size","Nb Clusters 2","Criterion")
vstats
}
#' getMergedDataset
#'
#' Merges the variables of each cluster (based on their average)
#'
#' @param data the matrix including the integrations of the areas defined by the buckets (columns)
#' on each spectrum (rows)
#' @param clustObj a list generated by the \code{getClusters} function
#' @param onlycluster boolean - specifies if the merged data matrix at output must only contain
#' the merged clusters (TRUE) or if it must also contain the buckets that are not included within a
#' cluster (FALSE)
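#' @examples
#' \donttest{
#' # a minimal sketch following the getClusters example above
#' data_dir <- system.file("extra", package = "Rnmr1D")
#' cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#' samplefile <- file.path(data_dir, "Samples.txt")
#' out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile,
#'                                 samplefile=samplefile, ncpu=2)
#' outMat <- getBucketsDataset(out, norm_meth='CSN')
#' clustcorr <- getClusters(outMat, method='corr', cval=0, dC=0.003, ncpu=2)
#' # merge the buckets of each cluster by their average, keeping unclustered buckets
#' outMatMerged <- getMergedDataset(outMat, clustcorr, onlycluster=FALSE)
#' }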
getMergedDataset <- function(data, clustObj, onlycluster=FALSE)
{
association <- clustObj$clustertab
matrix <- data
if ("matrix" %in% class(association) && dim(association)[1]>1) {
Clusters <- sort(unique(association[,2]))
M <- simplify2array(lapply( 1:length(Clusters), function(x) { apply( matrix[, colnames(matrix) %in% association[association[,2] == Clusters[x],1] ], 1, mean ) } ))
colnames(M) <- Clusters
if (!onlycluster)
M <- cbind(M, matrix[, !(colnames(matrix) %in% association[,1])])
matrix <- M
}
matrix
}
## ---- Graphic Functions ----
##########################################################################
## plot.Criterion
##########################################################################
#' plotCriterion
#'
#' Plots the curves that show the number of clusters, the number of clustered buckets and the
#' size of the biggest cluster versus the criterion, namely the correlation threshold for the
#' 'corr' method or the cutting value for the 'hca' method.
#'
#' @param clustObj a list generated by the \code{getClusters} function
#' @param reverse Boolean - indicates if the x-axis must be reversed (TRUE) or not (FALSE)
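#' @examples
#' \donttest{
#' # a minimal sketch following the getClusters example above
#' data_dir <- system.file("extra", package = "Rnmr1D")
#' cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#' samplefile <- file.path(data_dir, "Samples.txt")
#' out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile,
#'                                 samplefile=samplefile, ncpu=2)
#' outMat <- getBucketsDataset(out, norm_meth='CSN')
#' clustcorr <- getClusters(outMat, method='corr', cval=0, dC=0.003, ncpu=2)
#' plotCriterion(clustcorr, reverse=TRUE)
#' }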
plotCriterion <- function(clustObj, reverse=FALSE)
{
Vstats <- clustObj$vstats
Vcrit <- clustObj$vcrit
n <- clustObj$indxopt
params <- clustObj$params
MaxSize <- params$MAXCSIZE
method <- params$method
association <- clustObj$clustertab
lclust <- unique(sort(association[,2]))
# PNG image - Vstats plots
legNames <- c( colnames(Vstats)[3], colnames(Vstats)[2], colnames(Vstats)[4])
graphics::par(mar = c(5, 4, 4, 4) + 0.3) # Leave space for z axis
if( reverse==TRUE ) {
xlim <- c(max(Vstats[,1]), min(Vstats[,1]))
} else {
xlim <- c(min(Vstats[,1]), max(Vstats[,1]))
}
main <- sprintf("%s method, Criterion = %4.3f, Nb Clust = %d",toupper(method),Vcrit, length(lclust))
xlab <- "Criterion"
ylab <- "Number of variables / Clust Max Size"
graphics::plot(Vstats[,1],Vstats[,3], xlim=xlim, ylim=c(0,1.2*max(Vstats[,3])), type="l", col=colors[1], xlab=xlab, ylab=ylab, main=main)
if( method=='corr' ) {
cval_lim <- c( max(params$CVAL-params$dC,0.9), min(params$CVAL+params$dC,1) )
graphics::rect(cval_lim[1], -100, cval_lim[2], 1.5*max(Vstats[,3]), border = NA, col="azure2" ) # darkslategray2
graphics::par(new=TRUE)
graphics::plot(Vstats[,1],Vstats[,3], xlim=xlim, ylim=c(0,1.2*max(Vstats[,3])), type="l", col=colors[1], xlab=xlab, ylab=ylab, main=main)
}
graphics::lines(Vstats[,1],Vstats[,4], col=colors[3])
graphics::abline(v=Vcrit, col="red")
graphics::abline(h=MaxSize, col="green")
if( method=='hca' ) {
slines <- seq(from=params$Vcut_min, to=params$Vcut_max, by=0.025)
} else {
slines <- seq(from=params$CVAL_MIN, to=params$CVAL_MAX, by=0.0025)
}
graphics::abline(v = slines, col = "lightgray", lty = 3)
graphics::legend("top", legNames, col=colors[c(1:length(legNames))], lwd=2, lty=1, cex=0.6, horiz=TRUE)
graphics::par(new=TRUE)
graphics::plot(Vstats[,1],Vstats[,2], xlim=xlim, type="l", col=colors[2], xaxt="n", yaxt="n", xlab="",ylab="" )
graphics::axis(side=4, at = pretty(range(Vstats[,2])))
graphics::mtext("Number of Clusters", side=4, line=3)
}
##########################################################################
## plot.Clusters
##########################################################################
#' plotClusters
#'
#' Plots the boxplot of all clusters, giving an insight into the distribution of each cluster
#'
#' @param data the matrix including the integrations of the areas defined by the buckets (columns)
#' on each spectrum (rows)
#' @param clustObj a list generated by the \code{getClusters} function
#' @param horiz Boolean - Indicates if the plot is horizontal (TRUE) or vertical (FALSE)
#' @param main Main title of the plot
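#' @examples
#' \donttest{
#' # a minimal sketch following the getClusters example above
#' data_dir <- system.file("extra", package = "Rnmr1D")
#' cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#' samplefile <- file.path(data_dir, "Samples.txt")
#' out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile,
#'                                 samplefile=samplefile, ncpu=2)
#' outMat <- getBucketsDataset(out, norm_meth='CSN')
#' clustcorr <- getClusters(outMat, method='corr', cval=0, dC=0.003, ncpu=2)
#' plotClusters(outMat, clustcorr, horiz=TRUE)
#' }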
plotClusters <- function(data, clustObj, horiz=TRUE, main="Boxplot by clusters (log10 transformed)")
{
X <- getMergedDataset(data, clustObj, onlycluster=T)
CLUST <- colnames(X)
X[X<=0]<-0.00001*max(X)
cent <- log10(t(X))
Rank <- simplify2array(lapply(c(1:length(CLUST)), function(x) { round(mean(cent[x, ], na.rm=T)) }))
cols <- c('red', 'orange', 'darkgreen', 'blue', 'purple')
Rank <- Rank - min(Rank) + 1
nbrank <- max(Rank) - min(Rank) + 1
graphics::boxplot(t(cent), main=main,
ylab='\n\n\n\n\nClusters',xlab='Intensity (log10)', outline=F, horizontal=horiz, border=cols[Rank], las=2, cex.axis=0.5)
graphics::abline(h=c(1:length(CLUST)), col="gray95")
graphics::par(new=TRUE)
# redraw the boxplot on top of the grid lines
graphics::boxplot(t(cent), main=main,
ylab='\n\n\n\n\nClusters',xlab='Intensity (log10)', outline=F, horizontal=horiz, border=cols[Rank], las=2, cex.axis=0.5)
}
##########################################################################
## plot.Scores
##########################################################################
## Define the 'plotScores' function for plotting PCA scores.
## Input : data [N,PC] - N samples (rows), PC (columns)
## samples - matrix of samples generated by the Rnmr1D function
## factor - name (string) of the factor ; must be present as a column within the samples matrix
## level - confidence level for plotting the corresponding ellipse
##########################################################################
#' plotScores
#'
#' Plots the two components defined by pc1, pc2 of the matrix of scores coming from a
#' multivariate analysis, typically a Principal Component Analysis (PCA).
#'
#' @param data the matrix of scores coming from a multivariate analysis, typically a Principal Component Analysis (PCA)
#' @param pc1 the first component of the matrix of scores to be plotted.
#' @param pc2 the second component of the matrix of scores to be plotted.
#' @param samples the samples matrix with the correspondence of the raw spectra,
#' as well as the levels of the experimental factors if specified in the input.
#' See \code{doProcessing} or \code{generateMetadata}
#' @param cexlabel number indicating the amount by which plotting text and symbols should be scaled relative to the default.
#' @param level confidence level for plotting the corresponding ellipse
#' @param xlim gives the limit to be plotted of the first component
#' @param ylim gives the limit to be plotted of the second component
#' @param col colors vector for ellipses - automatically defined by default
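#' @examples
#' \donttest{
#' # a minimal sketch following the getClusters example above; it assumes that
#' # the processing object exposes the samples matrix as 'out$samples'
#' data_dir <- system.file("extra", package = "Rnmr1D")
#' cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#' samplefile <- file.path(data_dir, "Samples.txt")
#' out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile,
#'                                 samplefile=samplefile, ncpu=2)
#' outMat <- getBucketsDataset(out, norm_meth='CSN')
#' pca <- stats::prcomp(outMat, scale.=TRUE)
#' plotScores(pca$x, 1, 2, out$samples, factor=NULL)
#' }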
plotScores <- function(data, pc1, pc2, samples, factor=NULL, cexlabel=1.2, level=0.95, xlim=NULL, ylim=NULL, col=NULL) {
if (is.null(xlim)) xlim=1.5*c(min(data[,pc1]),max(data[,pc1]))
if (is.null(ylim)) ylim=1.5*c(min(data[,pc2]),max(data[,pc2]))
graphics::plot(data ,pch=20, adj=0, lwd=2, main="", xlim=xlim, ylim=ylim )
graphics::text( adj=0, data[,pc1], data[,pc2], rownames(data), pos=4, cex=cexlabel )
if (! is.null(factor)) {
G <- unique(samples[ , factor ])
if (is.null(col)) {
grcols=grDevices::rainbow(length(G), s=0.9, v=0.8)
} else {
grcols=col
}
for(i in 1:length(G)) {
XY <- data[rownames(data) %in% samples$Samplecode[samples[ , factor ]==G[i]], ]
plotEllipse( XY[,pc1], XY[,pc2], level=level, col=grcols[i], lty=graphics::par()$lty, lwd=graphics::par()$lwd )
}
graphics::title(factor)
}
}
##########################################################################
## plot.loadings
##########################################################################
## Loadings plot : Input : data [N,PC] N variables (rows), PC (columns)
##
## use with associations file: plot of ellipses corresponding to each cluster
##
##
##########################################################################
#' plotLoadings
#'
#' Plots the two components defined by pc1, pc2 of the matrix of variable loadings coming from a
#' multivariate analysis, typically a Principal Component Analysis (PCA).
#' If the associations matrix is not null, it also plots the ellipses corresponding to each
#' cluster (in fact, this is the main interest of this function).
#'
#' @param data the matrix of variable loadings coming from a multivariate analysis, typically a Principal Component Analysis (PCA)
#' @param pc1 the first component of the matrix of variable loadings to be plotted.
#' @param pc2 the second component of the matrix of variable loadings to be plotted.
#' @param associations the associations matrix that gives for each cluster (column 2) the corresponding buckets (column 1)
#' @param main Change the default plot title on the right corner
#' @param xlimu gives the limit to be plotted of the first component
#' @param ylimu gives the limit to be plotted of the second component
#' @param cexlabel number indicating the amount by which plotting text and symbols should be scaled relative to the default.
#' @param pch Plotting Symbols
#' @param ellipse boolean - specifies whether an ellipse is plotted for each cluster
#' @param level confidence level for plotting the ellipses
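#' @examples
#' \donttest{
#' # a minimal sketch following the getClusters example above: an ellipse is
#' # drawn for each cluster based on the bucket/cluster associations (clustertab)
#' data_dir <- system.file("extra", package = "Rnmr1D")
#' cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#' samplefile <- file.path(data_dir, "Samples.txt")
#' out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile,
#'                                 samplefile=samplefile, ncpu=2)
#' outMat <- getBucketsDataset(out, norm_meth='CSN')
#' clustcorr <- getClusters(outMat, method='corr', cval=0, dC=0.003, ncpu=2)
#' pca <- stats::prcomp(outMat, scale.=TRUE)
#' plotLoadings(pca$rotation, 1, 2, associations=clustcorr$clustertab, level=0.8)
#' }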
plotLoadings <- function (data, pc1, pc2, associations=NULL, main="Loadings",
xlimu=c(min(data[,pc1]),max(data[,pc1])), ylimu=c(min(data[,pc2]),max(data[,pc2])), cexlabel=1, pch=20, ellipse=TRUE, level=0.8)
{
V <- t(data[,c(pc1,pc2)])
colnames(V) <- rownames(data)
graphics::plot(t(V),pch=20, adj=1, lwd=1, main=main,
xlab= colnames(data)[pc1], ylab= colnames(data)[pc2],
xlim=xlimu, ylim=ylimu, col='red')
graphics::abline(h=0,v=0)
#plotEllipse( data[, pc1], data[, pc2], center=c(0,0), level=0.666, col="red", lty=3, lwd=0.1, type="l")
# Use Associations (assoc1=T)
if ("matrix" %in% class(associations) && dim(associations)[1]>1) {
graphics::points(t(V), pch=pch, col="grey")
graphics::text( adj=0, t(V)[,1], t(V)[,2], col="lightgrey", colnames(V), cex=cexlabel )
colnames(V) <- as.vector(sapply(rownames(data),function(x) { ifelse(sum( associations[,1] %in% x), associations[,2][associations[,1] %in% x ], x) }))
P <- V[,colnames(V) %in% associations[,2] ]
cols <- "red"
Clusters <- unique(associations[order(associations[,1],decreasing=T),2])
if (length(Clusters)<length(colnames(P))) {
clcols=grDevices::rainbow(length(Clusters), s=0.9, v=0.8)
for (i in 1:length(Clusters)) {
XY <- t(P[,colnames(P)==Clusters[i]])
M<-c(mean(XY[,1]),mean(XY[,2]))
graphics::points(XY, pch=pch, col=clcols[i])
if (dim(XY)[1]>1 && ellipse) plotEllipse( XY[,1], XY[,2], center=M, level= level, col=clcols[i], lty=graphics::par()$lty, lwd=graphics::par()$lwd )
graphics::text(adj=0, M[1], M[2], Clusters[i], col="black", cex=cexlabel, font=2)
}
}
else {
clcols=grDevices::rainbow(length(Clusters), s=0.9, v=0.8)
cols <- NULL; for (i in 1:length(colnames(P))) cols <- c(cols, clcols[Clusters == colnames(P)[i]])
graphics::points(t(P), pch=pch, col="red")
graphics::text( adj=0, t(P)[,1], t(P)[,2], col=cols, colnames(P), cex=cexlabel )
}
}
else {
graphics::text( adj=0, t(V)[,1], t(V)[,2], col="cornflowerblue", colnames(V), cex=cexlabel )
}
}
##########################################################################
## plot.ellipse
##########################################################################
## Compute confidence ellipses.
## x and y variables for drawing.
## level confidence level used to construct the ellipses. By default, 0.95.
## npoint number of points used to draw the ellipses.
## bary logical value. If TRUE, the coordinates of the ellipse around the barycentre of individuals are calculated.
## See https://github.com/kassambara/ggpubr/blob/master/R/stat_conf_ellipse.R
##
## Note : the size of the ellipses is related to the level of confidence chosen p:
## p = 1 - exp(-cellipse^2 / 2) => cellipse <- sqrt(qchisq(p, 2))
## With a confidence level at 66.66% (2/3), we have cellipse = 1.5
## See http://www.visiondummy.com/2014/04/draw-error-ellipse-representing-covariance-matrix/
## http://www.earth-time.org/projects/upb/public_docs/ErrorEllipses.pdf
##########################################################################
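##
## Example (not run) -- a minimal sketch:
##    x <- stats::rnorm(100); y <- x + stats::rnorm(100, sd=0.5)
##    graphics::plot(x, y, pch=20)
##    plotEllipse(x, y, level=0.95, col="red")
##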
plotEllipse <- function ( x, y, center=NULL, level = 0.95, npoint = 100, bary = FALSE, ... )
{
.ellipse <- function(x, scale = c(1, 1), centre = c(0, 0), t = 1.5, which = c(1,2), npoints = 100) {
names <- c("x", "y")
if (is.matrix(x)) {
xind <- which[1]
yind <- which[2]
r <- x[xind, yind]
if (missing(scale)) {
scale <- sqrt(c(x[xind, xind], x[yind, yind]))
if (scale[1] > 0)
r <- r/scale[1]
if (scale[2] > 0)
r <- r/scale[2]
}
if (!is.null(dimnames(x)[[1]]))
names <- dimnames(x)[[1]][c(xind, yind)]
}
else r <- x
r <- min(max(r, -1), 1)
d <- acos(r)
a <- seq(0, 2 * pi, len = npoints)
matrix(c(t*scale[1]*cos(a + d/2) + centre[1], t*scale[2]*cos(a - d/2) + centre[2]), npoints, 2, dimnames = list(NULL, names))
}
if (is.null(center)) center <- c(mean(x, na.rm = TRUE), mean(y, na.rm = TRUE))
if (length(as.vector(x))>2) {
tab <- data.frame(x = x, y = y)
mat.cov <- stats::cov(tab)
t = sqrt(stats::qchisq(level, 2))
if (bary) mat.cov = mat.cov/nrow(tab)
res <- .ellipse(mat.cov, centre = center, t=t, npoints = npoint)
graphics::lines(res, adj=1, main="", xlab="", ylab="", ... )
} else {
graphics::lines(cbind(x, y), adj=1, main="", xlab="", ylab="", ... )
}
}