R/BiotypeR.r

Defines functions s.potatoe noise.removal print.ecosimulation ecosimulator dist.JSD CH.index cluster.biotypes.dist cluster.biotypes.default cluster.biotypes plot.biotypes.test print.biotypes.test test.biotypes summary.biotyper.obj.simul print.biotyper.obj.simul biotyper.ecosimulation plot.biotyper biotyper.data.frame biotyper

Documented in biotyper biotyper.data.frame biotyper.ecosimulation CH.index cluster.biotypes cluster.biotypes.default cluster.biotypes.dist dist.JSD ecosimulator noise.removal plot.biotyper print.biotyper.obj.simul print.biotypes.test s.potatoe summary.biotyper.obj.simul test.biotypes

###################################################################
#                                                                 #
#           BiotypeR Methods                                      #
#                                                                 #
###################################################################


biotyper=function(x, ...) UseMethod("biotyper")

biotyper.data.frame=function(obs, k=3 , distance.jsd=NULL, cluster=NULL, manalysis=FALSE,  plot=FALSE, nf=3, no.unassigned=TRUE,...) {

require(ade4)
require(fpc)
require(cluster) #Require cluster package for use of PAM methods
  
  
if(k==1) stop("k cannot be 1")

if( is.null(distance.jsd)){
as.matrix(obs)-> obs.m
rownames(obs)-> rownames(obs.m)
dist.JSD(obs.m)-> distance.jsd
}

if (is.null(cluster) ){ cluster=cluster.biotypes(distance.jsd,k) }

obs.silhouette=mean(silhouette(cluster, distance.jsd)[,3])

obs.pca=NULL
obs.bet=NULL
obs.dpcoa=NULL

if(manalysis) {

if(no.unassigned){
dudi.pca(data.frame(t(obs[-1,])), scannf=F, nf=nf)-> obs.pca
} else { dudi.pca(data.frame(t(obs)), scannf=F, nf=nf)-> obs.pca }

bca(obs.pca, fac=as.factor(cluster), scannf=F, nf=nf)-> obs.bet
dpcoa(noise.removal(obs, percent=0.000001), distance.jsd, scannf=F, nf=nf) -> obs.dpcoa
}


biotyper.obj=list(biotypes=cluster, obs.silhouette=obs.silhouette, PCA=obs.pca, BET=obs.bet, dPCoA=obs.dpcoa)
class(biotyper.obj)="biotyper"

if (plot) { plot.biotyper(biotyper.obj) }

return(biotyper.obj)
}






plot.biotyper=function(biotyper.obj, xax=1, yax=2, potatoes=FALSE, ...) {

if (is.null(biotyper.obj$PCA)) stop ("there is no multivariate analysis inside this object\n=>restart 'biotyper' with the option 'manalysis=TRUE'")

par(mfrow=c(2,2))

#barplot
barplot(table(biotyper.obj$biotypes), xlab="biotypes", ylab="Nb samples", col=as.numeric(levels(as.factor(as.numeric(biotyper.obj$biotypes))))+1) #+1 to avoid black
box()


#PCA
plot(biotyper.obj$PCA$li[,xax], biotyper.obj$PCA$li[,yax], main="PCA", pch=16, col=as.numeric(biotyper.obj$biotypes)+1, xlab=paste("PC",xax), ylab=paste("PC",yax))
box()

if(!potatoes) {
#DPCOA
plot(biotyper.obj$dPCoA$dls[1:2], type="n", xlab=paste("PC",xax), ylab=paste("PC",yax), main="dPCoA")
s.class(biotyper.obj$dPCoA$dls, fac=factor(biotyper.obj$biotypes), add.plot=TRUE,clab=1, xax=xax, yax=yax)
points(biotyper.obj$dPCoA$dls[,xax], biotyper.obj$dPCoA$dls[,yax], col=as.numeric(biotyper.obj$biotypes)+1, cex=1, pch=16)
if(dim(biotyper.obj$BET$ls)[2] >= 2){ #Fix issue where an error is thrown if only using 2 classes
    #BET
    plot(biotyper.obj$BET$ls[,xax], biotyper.obj$BET$ls[,yax], type="n", xlab=paste("PC",xax), ylab=paste("PC",yax), main="between class")
    s.class(biotyper.obj$BET$ls, fac=as.factor(biotyper.obj$biotypes), grid=F, xax=xax, yax=yax, add.plot=TRUE)
    points(biotyper.obj$BET$ls[,xax], biotyper.obj$BET$ls[,yax], col=as.numeric(biotyper.obj$biotypes)+1, cex=1, pch=16)
}else{
    warning('Cannot display between class graph, requires k > 2')
  }
} else {
  #DPCOA
plot(biotyper.obj$dPCoA$l1[1:2], type="n", xlab=paste("PC",xax), ylab=paste("PC",yax), main="dPCoA")
s.potatoe(biotyper.obj$dPCoA$l1, fac=factor(biotyper.obj$biotypes), xax=xax, yax=yax)
s.class(biotyper.obj$dPCoA$l1, fac=factor(biotyper.obj$biotypes), add.plot=TRUE,clab=1, xax=xax, yax=yax, cell=0, cstar=0, cpoint=0)
points(biotyper.obj$dPCoA$l1[,xax], biotyper.obj$dPCoA$l1[,yax], cex=1, pch=16)
  if(dim(biotyper.obj$BET$ls)[2] >= 2){ #Fix an issue where an error is given if only using 2 classes
    #BET
    plot(biotyper.obj$BET$ls[,xax], biotyper.obj$BET$ls[,yax], type="n", xlab=paste("PC",xax), ylab=paste("PC",yax), main="between class")
    s.potatoe(biotyper.obj$BET$ls, fac=as.factor(biotyper.obj$biotypes), xax=xax, yax=yax)
    s.class(biotyper.obj$BET$ls, fac=as.factor(biotyper.obj$biotypes), grid=F, xax=xax, yax=yax, add.plot=TRUE, cell=0, cstar=0, cpoint=0)
    points(biotyper.obj$BET$ls[,xax], biotyper.obj$BET$ls[,yax], cex=1, pch=16)
  }else{
    warning('Cannot display between class graph, requires k > 2')
  }
}


}



biotyper.ecosimulation=function(obs, k , ...) {

biotypes.uniform=sapply(ecosimulation$uniform, biotyper.data.frame, k=k)
print("uniform: biotyping done")

biotypes.normal=sapply(ecosimulation$normal, biotyper.data.frame, k=k)
print("normal: biotyping done")

biotypes.log.normal=sapply(ecosimulation$log.normal, biotyper.data.frame, k=k)
print("log.normal: biotyping done")

simul=list(biotypes.uniform=biotypes.uniform, biotypes.normal=biotypes.normal, biotypes.log.normal=biotypes.log.normal)
class(simul)<-"biotyper.obj.simul"
return(simul)

}


print.biotyper.obj.simul=function(biotyper.obj.simul){

sil.uniform=unlist(t(biotyper.obj.simul$biotypes.uniform)[,2])
sil.normal=unlist(t(biotyper.obj.simul$biotypes.normal)[,2])
sil.log.normal=unlist(t(biotyper.obj.simul$biotypes.log.normal)[,2])

silhouette = data.frame(sil.uniform=sil.uniform, sil.normal=sil.normal, sil.log.normal=sil.log.normal)
print(silhouette)

 }
 



summary.biotyper.obj.simul=function(biotyper.obj.simul){

sil.uniform=unlist(t(biotyper.obj.simul$biotypes.uniform)[,2])
sil.normal=unlist(t(biotyper.obj.simul$biotypes.normal)[,2])
sil.log.normal=unlist(t(biotyper.obj.simul$biotypes.log.normal)[,2])

silhouette = data.frame(sil.uniform=sil.uniform, sil.normal=sil.normal, sil.log.normal=sil.log.normal)
sum.silhouette = summary(silhouette)
print(sum.silhouette)

 }



test.biotypes=function(biotyper.obj, biotyper.obj.simul) {

if(!class(biotyper.obj)=="biotyper") {stop("this is not a biotyper object")}

if(!class(biotyper.obj.simul)=="biotyper.obj.simul") {stop("this is not a biotyper object(simulation)")}

sil=biotyper.obj$obs.silhouette

sil.uniform=unlist(t(biotyper.obj.simul$biotypes.uniform)[,2])
sil.normal=unlist(t(biotyper.obj.simul$biotypes.normal)[,2])
sil.log.normal=unlist(t(biotyper.obj.simul$biotypes.log.normal)[,2])

p.value.sil.uniform=1-(which(sort(c(sil,sil.uniform))==sil)/ length(c(sil,sil.uniform)))
p.value.sil.normal=1-(which(sort(c(sil,sil.normal))==sil)/ length(c(sil,sil.normal)))
p.value.sil.log.normal=1-(which(sort(c(sil,sil.log.normal))==sil)/ length(c(sil,sil.log.normal)))

sil.test=list(p.value.sil.uniform=p.value.sil.uniform,p.value.sil.normal=p.value.sil.normal,p.value.sil.log.normal=p.value.sil.log.normal)

silhouette.simul = list(sil.uniform=sil.uniform, sil.normal=sil.normal, sil.log.normal=sil.log.normal)
test=list(silhouette.test=sil.test, silhouette.simul=silhouette.simul, observed.sil=biotyper.obj$obs.silhouette)


class(test)<-"biotypes.test"

return(test)

}



print.biotypes.test=function(biotypes.test, ...) {


cat("Silhouette cluster validation : is species comunauty follow a continuun ?\n")
cat("here is alpha risk to say that observed value is different from expected value\n")
print(biotypes.test$silhouette.test)



}




plot.biotypes.test=function(biotypes.test, ...) {

stripchart(biotypes.test$observed.sil, pch=16, xlim=c(0,biotypes.test$observed.sil+(0.1*biotypes.test$observed.sil)), col="black", cex=2, xlab="Silhouette Coef")
stripchart(c(biotypes.test$silhouette.simul$sil.uniform), pch=16, col="blue", add=TRUE, method="jitter")
stripchart(c(biotypes.test$silhouette.simul$sil.normal), pch=16, col="red", add=TRUE, method="jitter")
stripchart(c(biotypes.test$silhouette.simul$sil.log.normal), pch=16, col="green", add=TRUE, method="jitter")
legend("topright", c("observed","uniform","normal","log.normal"), pt.cex=c(2,1,1,1), col=c("black","blue","red","green"), pch=16)

}








# if (is.null(best.species.number) ){
# best.species.number=2
# }

# rbind(
# dataframe.bet$co[order(dataframe.bet$co[,1])[1:best.species.number],],
# dataframe.bet$co[rev(order(dataframe.bet$co[,1]))[1:best.species.number],],
# dataframe.bet$co[order(dataframe.bet$co[,2])[1:best.species.number],],
# dataframe.bet$co[rev(order(dataframe.bet$co[,2]))[1:best.species.number],])-> best.species

# data=list(best.species=best.species, between=dataframe.bet, PCA=dataframe.pca, cluster=cluster)

# return(data)

# }

#########################################################
#                                                       #
#                 Cluster based on JSD distance         #
#                                                       #
#########################################################

cluster.biotypes=function(x, ...) UseMethod("cluster.biotypes")

cluster.biotypes.default=function(x,k) {

as.matrix(x)-> x.m
rownames(x)-> rownames(x.m)
dist.JSD(x.m)-> distance.JSD

cluster = as.vector(pam(as.dist(distance.JSD), k, diss=TRUE)$clustering)

return(cluster)


}

cluster.biotypes.dist=function(x,k) {


cluster = as.vector(pam(as.dist(x), k, diss=TRUE)$clustering)

return(cluster)


}



#########################################################
#                                                       #
#                 CH index                              #
#                                                       #
#########################################################



CH.index=function(dataframe, distance.jsd=NULL, kvector=1:20, clusterSim=FALSE){

if(clusterSim) {
	require(clusterSim)
	kvector=unique(as.integer(kvector))
	nclusters=NULL

		for (k in kvector) { 
			if (k==1) {
				nclusters[k]=0
			}
		
			else {
				if(distance.jsd==NULL) stop("a distance matrix is needed")
				cluster=cluster.biotypes(distance.jsd, k)
				index.G1(t(dataframe),cluster,  d = distance.jsd, centrotypes = "medoids") -> nclusters[k]
			}

		}

	} else {
	
	nclusters=pamk(t(dataframe), criterion="ch", krange=kvector)$crit
	
	}



return(nclusters)

}






#########################################################
#                                                       #
#                 JSD Function Distance                 #
#                                                       #
#########################################################
 
dist.JSD <- function(inMatrix, pseudocount=10^(round(log10(min(as.matrix(inMatrix)[as.matrix(inMatrix) > 0])),0)-1), ...) {
	
	KLD <- function(x,y) sum(x *log(x/y))
	JSD<- function(x,y) sqrt(0.5 * KLD(x, (x+y)/2) + 0.5 * KLD(y, (x+y)/2))
	matrixColSize <- length(colnames(inMatrix))
	matrixRowSize <- length(rownames(inMatrix))
	colnames <- colnames(inMatrix)
	resultsMatrix <- matrix(0, matrixColSize, matrixColSize)
	
	for(k in 1:matrixRowSize) { 
		for(j in 1:matrixColSize) {
			if (inMatrix[k,j] == 0) { inMatrix[k,j] <- pseudocount	}
		}
	}

	for(i in 1:matrixColSize) {
		for(j in 1:matrixColSize) { 
			JSD(as.vector(inMatrix[,i]), as.vector(inMatrix[,j]))->resultsMatrix[i,j] 
		}
	}
	colnames -> colnames(resultsMatrix) -> rownames(resultsMatrix)
	as.dist(resultsMatrix)->resultsMatrix
	attr(resultsMatrix, "method") <- "dist"
	return(resultsMatrix) 
}

###################################################
#                                                 #
#  Template based communities  simulator          #
#                                                 #
###################################################


#x=matrix(rnorm(100000), nc=100)

ecosimulator=function(x,r) {

sites=names(x)

x=as.matrix(x)
pseudocount=10^(round(log10(min(as.matrix(x)[as.matrix(x) > 0])),0)-1)
x[which(x==0)]<- pseudocount
colnames(x)=sites

generator.unif=function(x){
n=dim(x)[2]
mx=apply(x,1,min)
sx=apply(x,1,max)
dx=data.frame(mx,sx)
g=function(x, c1, c2, n) {runif(n,min=x[c1], max=(x[c2]))}
simul=apply(dx,1,g,c1="mx",c2="sx", n)
rownames(simul)=sites
simul=as.data.frame(prop.table(t(simul),2))
return(simul)
}


generator.norm=function(x){
n=dim(x)[2]
mx=apply(x,1,mean)
sx=apply(x,1,sd)
dx=data.frame(mx,sx)
g=function(x, c1, c2, n) {rnorm(n,mean=x[c1], sd=(x[c2]))}
simul=apply(dx,1,g,c1="mx",c2="sx", n)
rownames(simul)=sites
simul=t(simul)
simul[which(simul < pseudocount)] <- pseudocount
simul=as.data.frame(prop.table(simul,2))
return(simul)
}


generator.lnorm=function(x){
n=dim(x)[2]

lmean=function(x) {mean(log(x[x>0]))}
lsd=function(x) {sd(log(x[x>0]))}

mx=apply(x,1,lmean)
sx=apply(x,1,lsd)
dx=data.frame(mx,sx)
g=function(x, c1, c2, n) {rlnorm(n,mean=x[c1], sd=(x[c2]))}
simul=apply(dx,1,g,c1="mx",c2="sx", n)
rownames(simul)=sites
simul=as.data.frame(prop.table(t(simul),2))
return(simul)
}

#TO DO : generator.permute=function(x) 

lx.unif=vector("list",r); for (i in 1:r) generator.unif(x)-> lx.unif[[i]]
lx.norm=vector("list",r); for (i in 1:r) generator.norm(x)-> lx.norm[[i]]
lx.lnorm=vector("list",r); for (i in 1:r) generator.lnorm(x)-> lx.lnorm[[i]]

lx=list(uniform=lx.unif, normal=lx.norm, log.normal=lx.lnorm)

class(lx)="ecosimulation"

return(lx)
}


print.ecosimulation=function(x,...){

cat(names(x),sep=", ")
cat(" = ",length(x)," lists of",length(x[[1]]), "simulated data\n")
cat("nsites =",dim(x[[1]][[1]])[2],", n[taxa or functions] =",dim(x[[1]][[1]])[1],"\n")

}




###################################################################
#                                                                 #
#                          Noise Removal                          #
#                                                                 #
###################################################################


noise.removal<-function(dataframe, percent=NULL, top=NULL, bysample=TRUE){


noise.removal.global <- function(dataframe, percent=NULL, top=NULL){
dataframe->Matrix

if (is.null(top)){

if (is.null(percent)) {
percent=100*(10^(mean(log10(rowSums(Matrix)/sum(rowSums(Matrix)))) + 1.96*sd(log10(rowSums(Matrix)/sum(rowSums(Matrix)))))) # this is percent
}
bigones <- rowSums(Matrix)*100/(sum(rowSums(Matrix))) > percent  #this is percent ### noize filter
Matrix_1 <- Matrix[bigones,]
print(percent)
}

else {

bigones <- rev(order(apply(Matrix, 1, mean)))[1:top]
Matrix_1 <- Matrix[bigones,]


}


return(Matrix_1)


}


noise.removal.spec <- function(dataframe, percent=NULL, top=NULL){

dataframe->Matrix
bigones<-NULL

for(i in 1:dim(Matrix)[2]){
	
	if (is.null(top)){
		if (is.null(percent)) {
			percent<- 100*(10^(median(log10(Matrix[Matrix[,i]>0,i])) + 1.96*sd(log10(Matrix[Matrix[,i]>0,i]))))
			cat(percent,"\n")
		}
			tmp <- which(Matrix[,i] > percent/100)
			bigones<-unique(c(bigones,tmp))
	}
	else {
		tmp <- rev(order(Matrix[,i]))[1:top]
		bigones<-unique(c(bigones,tmp))
	}	

}

return(Matrix[bigones,])
print(percent)
}

if(bysample) { noise.removal.spec(dataframe, percent=percent, top=top)}

else {noise.removal.global(dataframe, percent=percent, top=top)}


}


s.potatoe<-function(dfxy, fac, xax = 1, yax = 2, col.border = seq(1:length(levels(fac)))+1, col.fill = seq(1:length(levels(fac)))+1,
shape = -0.5, open = "FALSE", ...) {
dfxy <- data.frame(dfxy)
opar <- par(mar = par("mar"))
par(mar = c(0.1, 0.1, 0.1, 0.1))
on.exit(par(opar))
x <- dfxy[, xax]
y <- dfxy[, yax]
for (f in levels(fac)) {
xx <- x[fac == f]
yy <- y[fac == f]
xc <- chull(xx, yy)
qui <- which(levels(fac) == f)
border <- col.border[qui]
col <- col.fill[qui]
xspline(xx[xc], yy[xc], shape = shape, open = open, border = border,
col = col, ...)
}
}
tapj/biotyper documentation built on Oct. 23, 2021, 5:44 a.m.