#' @title Dissimilarity-Overlap Curve (DOC)
#'
#' @description Given matrix of relative taxon abundances, plot the dissimilarity-overlap curve (DOC) sample-wise.
#' The DOC method was developed by Bashan and colleagues.
#'
#' @param x rows are taxa, columns are samples, abundance in each sample is assumed to sum to one
#' @param B bootstrap iterations, if set to 0 or below, no bootstraps are carried out
#' @param polygons draw a polygon instead of the lower and upper confidence line (in development)
#' @param rand randomize the data sample-wise before starting the DOC analysis
#' @param lower.conf lower limit of the confidence interval
#' @param upper.conf upper limit of the confidence interval
#' @param null.model the null model to use, permut shuffles x sample-wise, assembly selects for each non-zero taxon one of the values taken across the samples at random
#' @param doc.res the result of a previous run of the doc method is added in green with black points, current result is shown in red with grey points (to plot null model together with data)
#' @return a list with the overlaps, dissimilarities, lowess smoothed overlaps and dissimilarites and lower and upper confidence intervals
#' @references A. Bashan et al. (2016). Universality of human microbial dynamics, Nature 534, 259-262.
#' @examples
#' \dontrun{
#' data("david_stoolA_otus")
#' data=normalize(rarefyFilter(david_stoolA_otus,min=10000)[[1]])
#' doc.out=doc(data[,1:50],B=10) # apply the DOC method to the first 50 samples
#' # plot the DOC curve together with the curve for the null model
#' doc.null=doc(data[,1:50],B=10,rand=TRUE,doc.res=doc.out)
#' }
#' @export
doc<-function(x, B=100, polygons=FALSE, rand=FALSE, lower.conf=0.03, upper.conf=0.97, null.model="assembly", doc.res=NULL){
if(rand == TRUE){
if(null.model == "permut"){
# permute each sample separately
for(i in 1:ncol(x)){
x[,i]=sample(x[,i], replace=FALSE)
}
}else if(null.model == "assembly"){
# keep the assembly: replace non-zero taxa by a randomly selected value from their range
for(taxa in 1:nrow(x)){
taxonvec=x[taxa,]
nonzeroindices=which(taxonvec>0)
for(samples in 1:ncol(x)){
if(x[taxa,samples] > 0){
x[taxa,samples]=taxonvec[sample(nonzeroindices,1)]
} # taxon has a non-zero count
} # sample loop
} # taxon loop
# renormalize sample sums to 1
colsums = apply(x,2,sum)
for(i in 1:ncol(x)){
x[,i]=x[,i]/colsums[i]
}
}else{
stop("Please choose null model permut or assembly.")
}
} # randomize
res=computeOandD(x)
comparisons=0.5*ncol(x)*(ncol(x)-1)
bootstrapsO=matrix(NA,nrow=B,ncol=comparisons)
bootstrapsD=matrix(NA, nrow=B, ncol=comparisons)
lowerConfO=c()
upperConfO=c()
lowerConfD=c()
upperConfD=c()
if(B > 0){
# do B bootstraps
for(iteration in 1:B){
#print(paste("Bootstrap",iteration))
# sample with replacement
#selected=sample(1:ncol(x),replace=TRUE)
#bootX=x[,selected]
#resB=computeOandD(bootX)
# from the dissimilarities and overlaps previously computed
bootstrap.indices=sample(x=(1:length(res$overlap)), size=length(res$overlap), replace=TRUE)
resB=list(res$overlap[bootstrap.indices],res$dissim[bootstrap.indices])
names(resB)=c("overlap","dissim")
outLB=lowess(resB$overlap,resB$dissim)
bootstrapsO[iteration,]=outLB$x
bootstrapsD[iteration,]=outLB$y
}
# get confidence intervals
for(i in 1:comparisons){
lowerConfO=c(lowerConfO, quantile(bootstrapsO[,i],lower.conf))
upperConfO=c(upperConfO, quantile(bootstrapsO[,i],upper.conf))
lowerConfD=c(lowerConfD, quantile(bootstrapsD[,i],lower.conf))
upperConfD=c(upperConfD, quantile(bootstrapsD[,i],upper.conf))
}
}
outL=lowess(res$overlap,res$dissim)
if(!is.null(doc.res)){
xlim=c(min(c(res$overlap,doc.res$overlap)),max(c(res$overlap,doc.res$overlap)))
ylim=c(min(c(res$dissim,doc.res$dissim)),max(c(res$dissim,doc.res$dissim)))
}else{
xlim=c(min(res$overlap),max(res$overlap))
ylim=c(min(res$dissim),max(res$dissim))
}
if(!is.null(doc.res)){
points.color="darkgrey"
conf.line.col="orange"
lowess.col="red"
fillcol=rgb(1,0,0,alpha=0.5)
}else{
points.color="black"
lowess.col="darkgreen"
conf.line.col="green"
fillcol=rgb(0,1,0,alpha=0.5)
}
if(polygons == FALSE){
plot(res$overlap, res$dissim, xlim=xlim, ylim=ylim, type="p", pch=16, xlab="overlap", ylab="dissimilarity (rJSD)", main="DOC", col=points.color)
# Lowess line
lines(outL, type="l", col=lowess.col, lwd=2)
if(!is.null(doc.res)){
print("Adding previous results to plot...")
# add previously computed DOC result to the plot
lines(doc.res$overlap, doc.res$dissim, type="p", pch=16, col="black")
# Lowess line
outL.prev=list(doc.res$lowesso, doc.res$lowessd)
names(outL.prev)=c("x","y")
lines(outL.prev, type="l", col="darkgreen", lwd=2)
if(length(doc.res$lowconfo)>0){
# lower confidence interval
lines(doc.res$lowconfo, doc.res$lowconfd, type="l", col="green")
# upper confidence interval
lines(doc.res$highconfo, doc.res$highconfd, type="l", col="green")
}
}
if(B>0){
# lower confidence interval
lines(lowerConfO, lowerConfD, type="l", col=conf.line.col)
# upper confidence interval
lines(upperConfO, upperConfD, type="l", col=conf.line.col)
}
}else{
plot(res$overlap, res$dissim, type="n",xlim=xlim,ylim=ylim, pch=".", xlab="overlap", ylab="dissimilarity (rJSD)", main="DOC", col=points.color)
# Lowess line
lines(outL, type="l", col=lowess.col, lwd=2)
if(B>0){
polygon(c(outL$x,lowerConfO),c(outL$x,lowerConfD), col=fillcol, border="white")
polygon(c(outL$x,upperConfO),c(outL$x,upperConfD), border="white", col="white")
}
if(!is.null(doc.res)){
print("Adding previous results to plot...")
# add previously computed Lowess line
outL.prev=list(doc.res$lowesso, doc.res$lowessd)
names(outL.prev)=c("x","y")
lines(outL.prev, type="l", col="darkgreen", lwd=2)
if(length(doc.res$lowconfo)>0){
polygon(c(doc.res$lowesso,doc.res$lowconfo),c(doc.res$lowesso,doc.res$lowconfd), col=rgb(0,1,0,alpha=0.5), border="white")
polygon(c(doc.res$lowesso,doc.res$highconfo),c(doc.res$lowesso,doc.res$highconfd), border="white", col="white")
}
}
}
res=list(res$overlap, res$dissim, outL$x, outL$y, lowerConfO, lowerConfD, upperConfO, upperConfD)
names(res)=c("overlap","dissim","lowesso","lowessd", "lowconfo", "lowconfd", "highconfo", "highconfd")
return(res)
}
# rJSD of a single shared species is 0, since renormalization
# will lead to this species having an abundance of 1 in both samples.
# Thus, minShared is set to 2.
computeOandD<-function(x, minShared=2){
overlaps=c()
dissimilarities=c()
# loop all sample pairs
for(i in 1:(ncol(x)-1)){
for(j in (i+1):ncol(x)){
a=as.numeric(x[,i])
b=as.numeric(x[,j])
# shared species
nonnulla=which(a>0)
nonnullb=which(b>0)
indices.shared=intersect(nonnulla, nonnullb)
# sample pair shares at least the minimal number of taxa
if(length(indices.shared) >= minShared){
# overlap
o=(sum(a[indices.shared])+sum(b[indices.shared]))/2
# dissimilarity
renorma=a[indices.shared]/sum(a[indices.shared])
renormb=b[indices.shared]/sum(b[indices.shared])
dissim=rJSD(renorma, renormb)
# no taxon shared: overlap is zero, but rJSD is not defined (division by zero)
# for relative abundances, rJSD is bounded by [0, sqrt(log(2))]
# both are set to NA to skip the corresponding point
}else{
o = NA
dissim = NA
}
overlaps=c(overlaps,o)
dissimilarities=c(dissimilarities, dissim)
}
}
res=list(overlaps, dissimilarities)
names(res)=c("overlap","dissim")
return(res)
}
# taken from: http://enterotype.embl.de/enterotypes.html
rJSD<-function(x,y){
return(sqrt(0.5 * KLD(x, (x+y)/2) + 0.5 * KLD(y, (x+y)/2)))
}
# addition of a pseudocount, to avoid negative infinities
KLD<-function(x,y, pseudocount=10^-10){
x = x + pseudocount
y = y + pseudocount
return(sum(x * log(x/y)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.