# enterotype
# http://enterotype.embl.de/enterotypes_tutorial.sanger.R
#
# Modified by Walter Xie
# Accessed on 23 Nov 2016
#
#install.packages("cluster")
#install.packages("clusterSim")
#library(cluster)
#library(clusterSim)
#library(ade4)
#library(ggplot2)
#library(ellipse)
#' @name enterotype
#' @title Enterotype
#'
#' @description The enterotype method is introduced by
#' \url{http://enterotype.embl.de}.
#' Besides the original functions copied from the website,
#' we also add few more analyses to advancing this method,
#' such as correlation between enterotypes and given/known groups.
#'
#' @details
#' \code{enterotypePipeline} is a pipeline summarised from
#' \url{http://enterotype.embl.de/enterotypes_tutorial.sanger.R}.
#' The steps of this pipeline are described below:
#'
#' 1. \code{getRelativeAbundance} turns taxa.assign into normalized probability distributions;
#'
#' 2. \code{getJSD} calculates Jensen-Shannon divergence dissimilarity;
#'
#' 3. \code{getClusters} lists Calinski-Harabasz (CH) Indices from 1 to n clusters;
#'
#' 4. \code{data.cluster} picks up the optimised or selected k-cluster final result;
#'
#' 5. \code{plotOptClusters} and \code{plotEnterotypes} make plots including BCA and PCoA.
#'
#' ---------------------------------------------------------------------------------------
#'
#' @keywords enterotype
#' @export
#' @examples
#' enterotypePipeline(taxa.assign, n.max=20, fig.path="./figures", percent=0)
#'
#' @rdname enterotype
enterotypePipeline <- function(taxa.assign, n.max=20, k=NULL, fig.path=NA, percent=0.01, validate=TRUE, ...) {
# 1. turn taxa.assign into normalized probability distributions
relative.abund <- ComMA::getRelativeAbundance(taxa.assign)
# 2. calculate Jensen-Shannon divergence dissimilarity
jsd.dist <- ComMA::getJSD(relative.abund)
# 3. Calinski-Harabasz (CH) Index of n clusters
nclusters <- ComMA::getClusters(relative.abund, jsd.dist, n.max=n.max)
# 4. the optimised or selected k-cluster final result
data.cluster <- ComMA::getDataCluster(jsd.dist, k=k, nclusters=nclusters, validate=TRUE)
# 5. plot
ComMA::plotOptClusters(nclusters, fig.path=fig.path, n.max=n.max)
ComMA::plotEnterotypes(relative.abund, jsd.dist, data.cluster, fig.path=fig.path, percent=percent, ...)
}
#' @details
#' \code{getRelativeAbundance} returns a matrix of normalized probability distributions
#' of the abundance matrix for an input of \code{getJSD}, namely relative abundance.
#'
#' @param taxa.assign The data frame of taxonomic assignments with abudence
#' at the \code{rank}, where rownames are taxonomy at that rank,
#' and columns are the sample names. It can be
#' one element of the list generated by \code{\link{assignTaxaByRank}}.
#' @keywords enterotype
#' @export
#' @examples
#' relative.abund <- getRelativeAbundance(taxa.assign)
#'
#' @rdname enterotype
getRelativeAbundance <- function(taxa.assign) {
taxa.assign.prop <- prop.table(as.matrix(taxa.assign), 2)
}
#' @details
#' \code{getJSD} return a distance \code{\link{dist}} object
#' calculated by Jensen-Shannon divergence (JSD) metric.
#'
#' @param data The matrix of normalized probability distributions
#' of the abundance matrix, also called abundance distributions.
#' Rows are taxonomy at that rank and columns are samples.
#' @keywords enterotype
#' @export
#' @examples
#' jsd.dist <- getJSD(relative.abund)
#'
#' @rdname enterotype
getJSD <- function(data) {
#nrow(data);ncol(relative.abund)
#colSums(data)
jsd.dist=dist.JSD(data)
return(jsd.dist)
}
#' @details
#' \code{getClusters} return the Calinski-Harabasz (CH) Index
#' for choosig a number of clusters from 2 to \code{n.max}.
#'
#' @param data.dist A distance \code{\link{dist}} object
#' calculated by Jensen-Shannon divergence (JSD) metric.
#' @param n.max The number of clusters. Default to 20.
#' @keywords enterotype
#' @export
#' @examples
#' nclusters <- getClusters(relative.abund, jsd.dist)
#'
#' @rdname enterotype
getClusters <- function(data, data.dist, n.max=20, ...) {
if (n.max < 2)
stop("The max number of clusters must >= 2!")
require("clusterSim")
#nclusters = index.G1(t(data), data.cluster, d = data.dist, centrotypes = "medoids")
nclusters=NULL
for (k in 1:n.max) {
if (k==1) {
nclusters[k]=NA
} else {
data.cluster_temp=pam.clustering(data.dist, k)
nclusters[k]=index.G1(t(data), data.cluster_temp, d=data.dist, centrotypes = "medoids")
}
}
return(nclusters)
}
#' @details
#' \code{plotOptClusters} plots the Calinski-Harabasz (CH) Index
#' for choosig a number of clusters from 2 to \code{n.max}.
#'
#' @keywords enterotype
#' @export
#' @examples
#' plotOptClusters(nclusters, fig.path="./figures", n.max=10)
#'
#' @rdname enterotype
plotOptClusters <- function(nclusters, fig.path=NA, n.max=20) {
if (length(nclusters) < n.max)
stop("n.max cannot excess length(nclusters) !")
if (!is.na(fig.path))
pdf(file.path(fig.path, "optimal-clusters-JSD.pdf"), width=6, height=6)
# the optimal number of clusters
plot(nclusters, type="h", xlab="number of clusters", ylab="CH index",
main=paste0("Optimal number of clusters (Jensen-Shannon divergence)"), xaxt="n")
axis(1, at = seq(1, n.max, by = 2))
if (! is.na(fig.path))
invisible(dev.off())
}
#' @details
#' \code{getDataCluster} returns the optimised or selected solution having
#' \code{k} clusters assigned by \code{\link{pam}}.
#'
#' @param k The number of clusters chosen for the optimised solution.
#' If NULL, the defult, it will take the number of clusters having
#' the largest CH Index.
#' @param nclusters The vector of nclusters, such as
#' Calinski-Harabasz (CH) Index, to find the optimised solution.
#' @param validate Logical if it needs to validate. Default to TRUE.
#' @param as.vector Logical if TRUE, as default, to return a vector,
#' otherwise return a \code{\link{pam.object}} representing the clustering.
#' @keywords enterotype
#' @export
#' @examples
#' data.cluster <- getDataCluster(jsd.dist, nclusters=nclusters)
#'
#' @rdname enterotype
getDataCluster <- function(data.dist, k=NULL, nclusters=NULL, as.vector=TRUE, validate=TRUE) {
if (is.null(k)) {
if (is.null(nclusters))
stop("Please provide the nclusters, such as CH Index, to find the optimised solution !")
k=match(max(nclusters[!is.na(nclusters)]), nclusters)
cat("The optimised solution is ", k, " clusters.\n")
}
data.cluster=pam.clustering(data.dist, k=k, as.vector=as.vector)
attr(data.cluster, "k") <- k
if (as.vector)
return(data.cluster)
#nclusters = index.G1(t(data), data.cluster, d = data.dist, centrotypes = "medoids")
if (validate)
validateCluster(data.cluster, data.dist)
return(data.cluster)
}
#' @details
#' \code{noise.removal} removes any row whose sum is samller than
#' the given percentage of total sum of matrix.
#' Advise to apply this function to data generated
#' using short sequencing technologes, like Illumina or Solid.
#'
#' @keywords enterotype
#' @export
#' @examples
#' data=noise.removal(data, percent=0.1)
#'
#' @rdname enterotype
noise.removal <- function(data, percent=0.01){
bigones <- rowSums(data)*100/sum(data) > percent
Matrix_1 <- data[bigones,]
cat("percent = ", percent, ", remove", nrow(data)-nrow(Matrix_1), "rows.\n")
return(Matrix_1)
}
#' @details
#' \code{plotEnterotypes} is a mixed function to plot clusters,
#' BCA \code{\link{bca}} and PCoA \code{\link{dudi.pco}}
#' from the optimised or selected solution.
#'
#' Between-class analysis (BCA) was performed to support the
#' clustering and identify the drivers for the enterotypes.
#' It is only available when \code{k} > 2.
#'
#' @param data.cluster The clusters assigned by
#' partitioning clustering \code{\link{pam}}.
#' @param attr.data A data frame to provide additional attributes
#' for visualization. Default to give an empty data frame to do nothing.
#' @param percent The percentage threshold to remove the noise
#' (ie. low abundant genera). Prior to the analysis.
#' Default to 0.
#' @param addLabel Logical if add labels of points. Default to TRUE.
#' @param text.colour.id The column name in \code{attr.data}
#' to colour texts of points.
#' @param fig.path The folder path to save figures. If NA,
#' the default, do not plot the figure.
#' @keywords enterotype
#' @export
#' @examples
#' plotEnterotypes(relative.abund, jsd.dist, data.cluster)
#'
#' @rdname enterotype
plotEnterotypes <- function(data, data.dist, data.cluster, attr.data=data.frame(), fig.path=NA,
percent=0, addLabel=TRUE, text.colour.id=NULL, palette="Set1", postfix="",
plot.clusters=TRUE, plot.bca=TRUE, plot.pcoa=TRUE,
cl.width=9, cl.height=6, width=8, height=8, verbose=TRUE, ...) {
attr.data.cluster <- attributes(data.cluster)
k <- as.numeric(attr.data.cluster$k)
if (percent > 0)
data=noise.removal(data, percent)
if (plot.clusters)
p.list <- ComMA::plotClusterAbundence(data, data.cluster, fig.path=fig.path, width=cl.width, height=cl.height)
require(ade4)
if (plot.bca && k > 2) {
cat("Plot between-class analysis (BCA) for", k, "clusters.\n")
#Between-class analysis (BCA) was performed to support the clustering and identify the drivers for the enterotypes.
obs.pca=dudi.pca(data.frame(t(data)), scannf=F, nf=10)
obs.bet=bca(obs.pca, fac=as.factor(data.cluster), scannf=F, nf=k-1)
cat("\nplot", k, "clusters between-class analysis (BCA).\n")
bca.df <- as.data.frame(obs.bet$ls)
colnames(bca.df)[1:2] <- c("PC1", "PC2")
bca.df$cluster <- data.cluster
p.list[["BCA"]] <- plotSamples(bca.df, attr.data=attr.data, fig.path=fig.path,
addLabel=addLabel, text.colour.id=text.colour.id, palette=palette,
prefix=paste("enterotypes-bca", postfix, sep = "-"),
width=width, height=height, x.id="PC1", y.id="PC2",
title="Between Class Analysis", verbose=verbose, ...)
}
if (plot.pcoa) {
cat("Plot principal coordinates analysis (PCoA) for", k, "clusters.\n")
#principal coordinates analysis (PCoA) of a Euclidean distance matrix
obs.pcoa=dudi.pco(data.dist, scannf=F, nf=3)
cat("\nplot", k, "clusters principal coordinates analysis (PCoA).\n")
pcoa.df <- as.data.frame(obs.pcoa$li)
colnames(pcoa.df)[1:3] <- c("PCo1", "PCo2", "PCo3")
pcoa.df$cluster <- data.cluster
p.list[["PCoA"]] <- plotSamples(pcoa.df, attr.data=attr.data, fig.path=fig.path,
addLabel=addLabel, text.colour.id=text.colour.id, palette=palette,
prefix=paste("enterotypes-pcoa", postfix, sep = "-"),
width=width, height=height, x.id="PCo1", y.id="PCo2",
title="Principal Coordinates Analysis", verbose=verbose, ...)
}
return(p.list)
}
#' @details
#' \code{plotClusterAbundence} return a list of \code{\link{ggplot2}} objects
#' for relative abundance distribution in each cluster.
#'
#' @keywords enterotype
#' @export
#' @examples
#' p.list <- plotClusterAbundence(data, data.cluster)
#'
#' @rdname enterotype
plotClusterAbundence <- function(data, data.cluster, fig.path=NA, cluster.colours=c(), min.median=0,
x.lab="", y.lab="Relative abundence", width=9, height=6) {
attr.data.cluster <- attributes(data.cluster)
k <- attr.data.cluster$k
if (!is(data, "matrix"))
stop("Data input has to be 'matrix' !")
cat("\nPlot the relative abundance for", k, "clusters.\n")
da.cl <- as.data.frame(cbind(t(data), cluster=data.cluster))
require(reshape2)
da.cl.melt <- melt(da.cl, id=c("cluster"))
da.cl.melt <- da.cl.melt[da.cl.melt$value>0,]
# TODO: cannot order by cluster and median in one data frame
plot.list <- list()
require(gg1L)
for (cl in sort(unique(data.cluster))) {
gg <- da.cl.melt[da.cl.melt$cluster==cl,]
cat("Cluster", cl, "samples:", paste(rownames(da.cl[da.cl$cluster==cl,]), collapse = ", "), ".\n")
gg$cluster <- paste0("cluster", cl)
#if (min.median > 0)
gg$variable <- reorder(gg$variable, gg$value, median, order=TRUE)
if (length(cluster.colours) > 0 && cl <= length(cluster.colours))
cl.col <- cluster.colours[cl]
else
cl.col <- NULL
p <- gg1L::ggBoxWhiskersPlot(gg, x.id="variable", y.id="value", fill.id="cluster",
palette=cl.col, scale.type="manual",
x.lab=x.lab, y.lab=y.lab, title=paste("Cluster", cl),
x.text.angle=90, no.legend="fill")
plot.list[[cl]] <- p
if(!is.na(fig.path))
gg1L::pdf.ggplot(p, fig.path = file.path(fig.path, paste0("enterotype-",k,"-",cl,".pdf")), width=width, height=height)
}
return(plot.list)
}
######### correlation between enterotypes and known groups ##########
#' @details
#' \code{corrEnterotypeToGroup} calculates Cramér's V between the enterotypes
#' and the known groups (two categorical variables) from a same data set
#' using Chi-Squared test \code{\link{chisq.test}}.
#' \url{http://en.wikipedia.org/wiki/Cramér\%27s_V}.
#'
#' @param group.id The column name in \code{attr.data} contains the known groups to
#' compare with enterotypes.
#' @param simulate.p.value a logical indicating whether to compute p-values
#' by Monte Carlo simulation, the default is FALSE.
#' @keywords enterotype
#' @export
#' @examples
#' chi <- corrEnterotypeToGroup(jsd.dist, k=5, attr.data=env, group.id="land.use")
#'
#' @rdname enterotype
corrEnterotypeToGroup <- function(data.dist, k, attr.data, group.id, simulate.p.value=TRUE) {
k.clusters <- pam.clustering(data.dist, k, as.vector=F)$clustering
df.merge <- ComMA::mergeBy(as.data.frame(k.clusters), attr.data)
clusters.id <- "k.clusters"
# data frame bug: cannot use df.merge[, c(clusters.id, group.id)]
pair.freq <- ComMA::freqUniqueValue(df.merge[, c(clusters.id, group.id)])
pair.freq <- pair.freq[order(pair.freq[,clusters.id], pair.freq[,group.id]),]
nrow=length(unique(pair.freq[,clusters.id]))
ncol=length(unique(pair.freq[,group.id]))
tb.freq <- matrix(data=pair.freq$Freq, nrow=nrow, ncol=ncol, byrow=T)
rownames(tb.freq) <- unique(pair.freq[,clusters.id])
colnames(tb.freq) <- unique(pair.freq[,group.id])
chi2v = chisq.test(tb.freq, simulate.p.value=simulate.p.value)
#c(chi2v$statistic, chi2v$p.value)
# V=sqrt( X-squared / (N * min(k-1, r-1)) )
V=sqrt( as.numeric(chi2v$statistic) / ( sum(tb.freq) * min(ncol(tb.freq)-1, nrow(tb.freq)-1) ) )
list(freq=tb.freq, chi=chi2v, V=V, p.value=chi2v$p.value)
}
######### internal ##########
# Jensen-Shannon divergence (JSD) (Endres & Schindelin, 2003)
# Kullback-Leibler divergence
dist.JSD <- function(inMatrix, pseudocount=0.000001, ...) {
KLD <- function(x,y) sum(x *log(x/y))
JSD<- function(x,y) sqrt(0.5 * KLD(x, (x+y)/2) + 0.5 * KLD(y, (x+y)/2))
matrixColSize <- length(colnames(inMatrix))
matrixRowSize <- length(rownames(inMatrix))
colnames <- colnames(inMatrix)
resultsMatrix <- matrix(0, matrixColSize, matrixColSize)
inMatrix = apply(inMatrix,1:2,function(x) ifelse (x==0,pseudocount,x))
for(i in 1:matrixColSize) {
for(j in 1:matrixColSize) {
resultsMatrix[i,j]=JSD(as.vector(inMatrix[,i]),
as.vector(inMatrix[,j]))
}
}
colnames -> colnames(resultsMatrix) -> rownames(resultsMatrix)
as.dist(resultsMatrix)->resultsMatrix
attr(resultsMatrix, "method") <- "dist"
return(resultsMatrix)
}
#data.dist=dist.JSD(data)
# Partitioning around medoids (PAM) clustering algorithm to cluster the abundance profiles.
# PAM derives from the basic k-means algorithm, but has the advantage that it supports any
# arbitrary distance measure and is more robust than k-means. It is a supervised procedure,
# where the predetermined number of clusters is given as input to the procedure, which then
# partitions the data into exactly that many clusters.
pam.clustering=function(x,k, as.vector=TRUE) { # x is a distance matrix and k the number of clusters
require(cluster)
cluster=pam(as.dist(x), k, diss=TRUE)
if (as.vector) {
return(as.vector(cluster$clustering))
}
return(cluster)
}
#data.cluster=pam.clustering(data.dist, k=3)
# Observations with a large s(i) (almost 1) are very well clustered,
# a small s(i) (around 0) means that the observation lies between two clusters,
# and observations with a negative s(i) are probably placed in the wrong cluster.
validateCluster <- function(data.cluster, data.dist) {
require(cluster)
obs.silhouette=mean(silhouette(data.cluster, data.dist)[,3])
cat("obs.silhouette =", obs.silhouette, "\n")
}
# for both BCA and PCoA
plotSamples <- function(points.df, attr.data=data.frame(), fig.path=NA, addLabel=TRUE,
colour.id="cluster", ellipsed.id="cluster", shape.id=NULL, text.colour.id=NULL,
palette="Set1", prefix="enterotypes-pcoa-", width=8, height=8,
x.id="PCo1", y.id="PCo2", title="Principal Coordinates Analysis", verbose=TRUE, ...) {
cluster.levels <- sort(unique(points.df$cluster))
k <- length(cluster.levels)
points.df$cluster <- factor(points.df$cluster, levels = cluster.levels)
if (addLabel) {
text.id="Row.names"
if (nrow(attr.data) > 0) {
points.df <- ComMA::mergeBy(points.df, attr.data, rm)
#points.df[,] <- factor(points.df[,], levels = attr.levels)
} else {
points.df$Row.names <- rownames(points.df)
}
} else {
text.id=NULL
}
require(gg1L)
p <- gg1L::ggScatterPlot(points.df, x.id=x.id, y.id=y.id, colour.id=colour.id, ellipsed.id=ellipsed.id,
shape.id=shape.id, text.id=text.id, text.colour.id=text.colour.id,
palette=palette, title=title, xintercept=0, yintercept=0, verbose=verbose, ...)
gt <- gg1L::unclip.ggplot(p)
if(!is.na(fig.path))
gg1L::pdf.gtable(gt, fig.path=file.path(fig.path, paste0(prefix, k,".pdf")), width=width, height=height)
return(p)
}
######### vegan cascadeKM ##########
plotCascadeKM <- function(fig.path, data, n.max) {
library(vegan)
ccas <- cascadeKM(t(data), 2, n.max)
#ccas
pdf(file.path(fig.path, paste0("optimal-clusters-cascadeKM.pdf")), width=6, height=6)
plot(ccas, sortq=TRUE)
invisible(dev.off())
ccas <- cascadeKM(t(data), 2, 20, criterion = "ssi")
pdf(file.path(fig.path, paste0("optimal-clusters-cascadeKM-ssi.pdf")), width=6, height=6)
plot(ccas, sortq=TRUE)
invisible(dev.off())
}
######### original plot fuction ##########
plotEnterotypes.bak <- function(fig.path, data, data.dist, data.cluster, k, percent=0.01) {
if (percent > 0)
data=noise.removal(data, percent)
require(ade4)
## plot 1
obs.pca=dudi.pca(data.frame(t(data)), scannf=F, nf=10)
obs.bet=bca(obs.pca, fac=as.factor(data.cluster), scannf=F, nf=k-1)
pdf(file.path(fig.path, "enterotypes-between-class.pdf"), width=6, height=6)
# dev.new()
s.class(obs.bet$ls, fac=as.factor(data.cluster), grid=F,sub="Between-class analysis")
invisible(dev.off())
#plot 2
obs.pcoa=dudi.pco(data.dist, scannf=F, nf=k)
pdf(file.path(fig.path, "enterotypes-principal-coordiante.pdf"), width=6, height=6)
# dev.new()
s.class(obs.pcoa$li, fac=as.factor(data.cluster), grid=F,sub="Principal coordiante analysis")
invisible(dev.off())
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.