# Set the knitr chunk option `echo = TRUE` as the default for all chunks.
knitr::opts_chunk$set(echo = TRUE)

In this document, we group the wells of the AM, PM1, and PM2A plates according to the similarity of the chemical signatures of the compounds they contain.

Well properties

# Load ChemmineR, installing it from Bioconductor first when it is missing.
# The legacy biocLite.R bootstrap script has been retired in favour of
# BiocManager, and sourcing an installer over plain HTTP is unsafe.
if (!requireNamespace("ChemmineR", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("ChemmineR")
}
library("ChemmineR")

# Well annotation table (tab-separated); column names are assigned explicitly.
# NOTE(review): read.table() keeps its default quote/comment handling here;
# confirm that no compound name in the file contains ' or # characters.
source_db <- read.table('../data/biolog_wells.txt', sep = '\t')
colnames(source_db) <- c(
  'plateID', 'wellID', 'sourceName', 'pubchem', 'manualAnno', 'chemoCluster'
)

head(source_db)
# initiate output list
ANNO <- list()
ENR <- list()
Analysis for AM plate sources

# Identifier of the plate processed by the statements that follow.
plateID <- "AM"

Chemoinformatics features extraction

# Keep only the wells that belong to the plate under analysis.
wells <- source_db[which(source_db$plateID == plateID), ]

head(wells)

# Fetch the structures for the wells' PubChem identifiers through ChemMine
# Tools. Pause between status checks instead of spinning in an empty loop,
# which would otherwise call status() as fast as the CPU allows.
job1 <- launchCMTool("pubchemID2SDF", wells$pubchem)
while (status(job1) == 'RUNNING') {
  Sys.sleep(2)
}
result1 <- result(job1)

# Compute OpenBabel descriptors for the retrieved structures.
job4 <- launchCMTool("OpenBabel Descriptors", result1)
while (status(job4) == 'RUNNING') {
  Sys.sleep(2)
}
result4 <- result(job4)
# Prepend the compound names; this relies on result4 having one row per well,
# in the same order as `wells`.
result4 <- cbind(wells$sourceName, result4)
head(result4)

# convert the structures to atom-pair descriptors, labelled by column 2 of
# `wells` (the well ID)
apset <- sdf2ap(result1)
cid(apset) <- as.character(wells[, 2])

ANNO[[plateID]] <- list('result' = result4, 'apset' = apset)

#save(result4,apset,file='../data/biolog_sugar_molecular_features.Rdata')

Similarity between carbon sources

We compute the Tanimoto similarity between each pair of sugar molecules, based on their binary fingerprints (one bit per structural pattern), as explained in https://www.surechembl.org/knowledgebase/84207-tanimoto-coefficient-and-fingerprint-generation.

#load('../data/biolog_sugar_molecular_features.Rdata')
# Turn the atom-pair set stored for the current plate into fingerprints.
fpset <- desc2fp(ANNO[[plateID]][["apset"]])

Here are two examples of the fingerprints computed for compounds retrieved from PubChem (first 20 bits only):

# Print the first 20 fingerprint bits of two example wells (rows 5 and 7).
invisible(lapply(c(5, 7), function(k) {
  cat(as.character(wells$sourceName[k]), head(fpset[[k]]@fp, 20), '\n')
}))

# Pairwise similarity of those two examples.
cat(
  "Tanimoto's Similarity between", as.character(wells$sourceName[5]),
  "and", as.character(wells$sourceName[7]), ":",
  fpSim(fpset[5], fpset[7], sorted = FALSE), '\n'
)

# Full similarity matrix: one column per compound, each holding its
# similarity to every compound of the plate (unsorted, so rows stay aligned).
simMA <- sapply(
  cid(fpset),
  function(well) fpSim(fpset[well], fpset, sorted = FALSE)
)
ANNO[[plateID]]$simMA <- simMA
#save(simMA,file='../data/pubchem_kegg/cluster_info.Rdata')
# Single-linkage clustering on the dissimilarity 1 - similarity.
hc <- hclust(as.dist(1 - simMA), method = "single")


# Install heatmaply only when it is missing, then attach it unconditionally so
# that a failed installation stops here with a clear error.
if (!requireNamespace("heatmaply", quietly = TRUE)) {
  install.packages("heatmaply")
}
library(heatmaply)

# Interactive heatmap of the similarity matrix; k_col/k_row = NA leave the
# choice of the number of clusters to heatmaply.
heatmaply(
  simMA,
  k_col = NA, k_row = NA,
  label_names = c("product1", "product2", "similarity"),
  labRow = rownames(simMA), labCol = colnames(simMA),
  fontsize_row = 4, fontsize_col = 4
) %>%
  layout(margin = list(l = 130, b = 40))

# 10 groups were found

# Install dendextend only when it is missing, then attach it unconditionally.
if (!requireNamespace("dendextend", quietly = TRUE)) {
  install.packages("dendextend")
}
library(dendextend)

# get cluster/well info: rebuild the row dendrogram used by the heatmap
tmp <- heatmapr(simMA, k_col = NA, k_row = NA)
dend <- as.dendrogram(tmp$rows)

# Branch colour of each leaf, recoded as an integer cluster ID (one ID per
# distinct colour). Both vectors below follow the dendrogram leaf order, not
# the row order of `wells`.
col.leaf <- as.numeric(as.factor(get_leaves_branches_col(dend)))
wells.dend <- labels(dend)

Here is the assignment of sugars in each cluster:

# First rows of the leaf label / cluster ID pairing.
print(utils::head(cbind(wells.dend, col.leaf), n = 6L))

Here is the distribution of cluster size:

# Number of wells per cluster, smallest cluster first.
print(sort(table(col.leaf), decreasing = FALSE))

Here is the distribution of molecular weight in each cluster:

# col.leaf is ordered like the dendrogram leaves (wells.dend), whereas the
# rows of result4 are ordered like `wells`. Realign the molecular weights to
# the leaf order so that each weight is attributed to its own cluster.
mw_by_leaf <- result4$MW[match(wells.dend, as.character(wells$wellID))]
boxplot(mw_by_leaf ~ col.leaf, xlab = 'cluster ID', ylab = 'Molecular Weight')

Functional analysis on the identified groups

Here we rely on the KEGG COMPOUND database to understand the underlying biology of the clusters obtained in the previous step:

# KEGGREST is distributed through Bioconductor, so install.packages() cannot
# install it from CRAN; go through BiocManager instead, then attach it.
if (!requireNamespace("KEGGREST", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("KEGGREST")
}
library(KEGGREST)

# Build DB.KEGG: one KEGG entry per well of the current plate, keyed by well
# ID. The mapping is manually curated and indexed by row position in `wells`
# (rows 1 to 95). By default a well is resolved by searching KEGG COMPOUND
# with its sourceName and keeping the first entry returned; the three tables
# below record the wells that deviate from that default.
DB.KEGG <- list()

# Wells resolved by fetching a KEGG identifier directly (no name search).
# NOTE(review): rows 38/39 share C00963 and rows 74/81 share C00064; confirm
# that these duplicates are intended.
kegg_direct <- c(
  "22" = "C00103", "23" = "C00092", "25" = "C04508", "26" = "C00137",
  "27" = "C00984", "33" = "C08243", "34" = "C05402", "35" = "C11911",
  "36" = "C04698", "37" = "C03619", "38" = "C00963", "39" = "C00963",
  "41" = "C00492", "47" = "C01083", "53" = "C05984", "58" = "C00256",
  "59" = "C00186", "60" = "C06010", "65" = "C00022", "67" = "C01180",
  "69" = "C00490", "70" = "C00552", "72" = "C19779", "74" = "C00064",
  "75" = "C00135", "76" = "C00188", "80" = "C00049", "81" = "C00064",
  "82" = "C00073", "83" = "C00148", "95" = "C00105"
)

# Wells searched with a hand-picked term instead of their sourceName.
kegg_query <- c(
  "3" = "N-Acetyl-D-mannosamine",
  "8" = "Cellobiose",
  "9" = "Cyclodextrin",
  "13" = "Erythritol"
)

# Wells whose retained entry is not the first one returned by keggGet().
# keggGet() is given every search hit at once, so the position refers to the
# list it returns for that full set of hits.
kegg_pick <- c(
  "1" = 2L, "2" = 2L, "14" = 2L, "15" = 2L, "16" = 2L, "21" = 3L,
  "24" = 2L, "32" = 2L, "63" = 10L, "68" = 2L, "84" = 2L, "90" = 4L,
  "91" = 4L, "93" = 5L
)

n_curated <- 95L
stopifnot(nrow(wells) >= n_curated)

for (well_idx in seq_len(n_curated)) {
  well_key <- as.character(well_idx)
  well_name <- as.character(wells$sourceName[well_idx])

  if (well_key %in% names(kegg_direct)) {
    kegg_entries <- keggGet(kegg_direct[[well_key]])
  } else {
    search_term <- if (well_key %in% names(kegg_query)) {
      kegg_query[[well_key]]
    } else {
      well_name
    }
    kegg_hits <- names(keggFind('compound', query = search_term))
    kegg_entries <- keggGet(kegg_hits)
  }

  # Diagnostics for the rendered report: the well's name followed by the
  # names of all candidate entries, so the curated choice can be audited.
  print(well_name)
  print(lapply(kegg_entries, function(x) x$NAME))

  pick <- if (well_key %in% names(kegg_pick)) kegg_pick[[well_key]] else 1L
  if (pick > length(kegg_entries)) {
    stop(
      "KEGG lookup for well row ", well_idx, " ('", well_name, "') returned ",
      length(kegg_entries), " entries; curated pick is ", pick,
      call. = FALSE
    )
  }
  DB.KEGG[[as.character(wells$wellID[well_idx])]] <- kegg_entries[[pick]]
}


ANNO[[plateID]]$DB.KEGG <- DB.KEGG

Here are the top 20 most represented KEGG pathways across all sugars:

# Pathway frequencies across all wells, most frequent first. head() returns at
# most 20 rows and does not pad with NA when fewer than 20 pathways exist.
head(
  sort(
    table(unlist(lapply(DB.KEGG, function(x) x$PATHWAY))),
    decreasing = TRUE
  ),
  20
)

After getting KEGG pathways corresponding to the different wells, we want to check if the clusters we got from PubChem similarity are biologically relevant.

# Pathway annotation of every well (NULL when the KEGG entry has none) and
# the distinct pathways seen on the plate.
pathways <- lapply(DB.KEGG, function(x) x$PATHWAY)
pathways.name <- unique(unlist(pathways)) # 146 pathways
length(pathways.name)

# enrichment: for each cluster and each pathway, Fisher's exact test of
# pathway membership in the cluster's wells against all remaining wells.
n_clusters <- max(col.leaf)
ENR <- vector("list", length = n_clusters)
for (i in seq_len(n_clusters)) {
  HITS <- NULL
  target_paths <- pathways[wells.dend[which(col.leaf == i)]]
  bgd_paths <- pathways[wells.dend[which(col.leaf != i)]]

  for (j in seq_along(pathways.name)) {
    target <- vapply(
      target_paths, function(x) pathways.name[j] %in% x, logical(1)
    )
    bgd <- vapply(
      bgd_paths, function(x) pathways.name[j] %in% x, logical(1)
    )

    # Explicit factor levels keep the contingency table 2 x 2 even when one
    # outcome is absent.
    tmp <- fisher.test(
      x = factor(c(target, bgd), levels = c('TRUE', 'FALSE')),
      y = factor(
        c(rep('target', length(target)), rep('bgd', length(bgd))),
        levels = c('target', 'bgd')
      )
    )
    #if(tmp$estimate > 1 & tmp$p.value < 0.05) ENR[[i]] = c(ENR[[i]],pathways.name[j])
    # The stored value is the p-value signed by the direction of the effect:
    # negative when the odds ratio is below 1 (depletion).
    # NOTE(review): p-values below 0.0005 round to 0 and lose that sign.
    if (tmp$p.value < 0.03) {
      HITS <- rbind(
        HITS,
        c(pathways.name[j], round(sign(tmp$estimate - 1) * tmp$p.value, 3))
      )
    }
  }

  # we keep negative enrichment but won't display it in the app
  if (!is.null(HITS)) {
    colnames(HITS) <- c('Pathway', 'p-value')
    # drop = FALSE keeps a single hit as a one-row matrix instead of a vector.
    HITS <- HITS[
      order(abs(as.numeric(HITS[, 2])), decreasing = FALSE), ,
      drop = FALSE
    ]
  }
  # Assign through `[` with a list: `ENR[[i]] <- NULL` would delete the slot
  # and shorten ENR whenever a cluster has no significant pathway.
  ENR[i] <- list(HITS)
}

# This enrichment mapping might be useful when we detect significant clusters in a specific group of strains
ANNO[[plateID]]$ENR <- ENR
ANNO[[plateID]]$col.leaf <- col.leaf
ANNO[[plateID]]$pathways.name <- pathways.name
ANNO[[plateID]]$wells.dend <- wells.dend


#save(ENR,col.leaf,DB.KEGG,pathways.name,wells.dend,file='biolog_kegg_wells.Rdata')

Here is the content of each cluster in terms of enriched KEGG pathways:

# Widen the console so the pathway tables are not wrapped, then show them.
options(width = 1000)
print(ENR)

Analysis for PM1 plate sources

# Identifier of the plate processed by the statements that follow.
plateID <- "PM1"

Chemoinformatics features extraction

# Keep only the wells that belong to the plate under analysis.
wells <- source_db[which(source_db$plateID == plateID), ]

head(wells)

# Fetch the structures for the wells' PubChem identifiers through ChemMine
# Tools. Pause between status checks instead of spinning in an empty loop,
# which would otherwise call status() as fast as the CPU allows.
job1 <- launchCMTool("pubchemID2SDF", wells$pubchem)
while (status(job1) == 'RUNNING') {
  Sys.sleep(2)
}
result1 <- result(job1)

# Compute OpenBabel descriptors for the retrieved structures.
job4 <- launchCMTool("OpenBabel Descriptors", result1)
while (status(job4) == 'RUNNING') {
  Sys.sleep(2)
}
result4 <- result(job4)
# Prepend the compound names; this relies on result4 having one row per well,
# in the same order as `wells`.
result4 <- cbind(wells$sourceName, result4)
head(result4)

# convert the structures to atom-pair descriptors, labelled by column 2 of
# `wells` (the well ID)
apset <- sdf2ap(result1)
cid(apset) <- as.character(wells[, 2])

ANNO[[plateID]] <- list('result' = result4, 'apset' = apset)

#save(result4,apset,file='../data/biolog_sugar_molecular_features.Rdata')

Similarity between carbon sources

We compute the Tanimoto similarity between each pair of sugar molecules, based on their binary fingerprints (one bit per structural pattern), as explained in https://www.surechembl.org/knowledgebase/84207-tanimoto-coefficient-and-fingerprint-generation.

#load('../data/biolog_sugar_molecular_features.Rdata')
# Turn the atom-pair set stored for the current plate into fingerprints.
fpset <- desc2fp(ANNO[[plateID]][["apset"]])

Here are two examples of the fingerprints computed for compounds retrieved from PubChem (first 20 bits only):

# Print the first 20 fingerprint bits of two example wells (rows 5 and 7).
invisible(lapply(c(5, 7), function(k) {
  cat(as.character(wells$sourceName[k]), head(fpset[[k]]@fp, 20), '\n')
}))

# Pairwise similarity of those two examples.
cat(
  "Tanimoto's Similarity between", as.character(wells$sourceName[5]),
  "and", as.character(wells$sourceName[7]), ":",
  fpSim(fpset[5], fpset[7], sorted = FALSE), '\n'
)

# Full similarity matrix: one column per compound, each holding its
# similarity to every compound of the plate (unsorted, so rows stay aligned).
simMA <- sapply(
  cid(fpset),
  function(well) fpSim(fpset[well], fpset, sorted = FALSE)
)
ANNO[[plateID]]$simMA <- simMA
#save(simMA,file='../data/pubchem_kegg/cluster_info.Rdata')
# Single-linkage clustering on the dissimilarity 1 - similarity.
hc <- hclust(as.dist(1 - simMA), method = "single")


# Install heatmaply only when it is missing, then attach it unconditionally so
# that a failed installation stops here with a clear error.
if (!requireNamespace("heatmaply", quietly = TRUE)) {
  install.packages("heatmaply")
}
library(heatmaply)

# Interactive heatmap of the similarity matrix; k_col/k_row = NA leave the
# choice of the number of clusters to heatmaply.
heatmaply(
  simMA,
  k_col = NA, k_row = NA,
  label_names = c("product1", "product2", "similarity"),
  labRow = rownames(simMA), labCol = colnames(simMA),
  fontsize_row = 4, fontsize_col = 4
) %>%
  layout(margin = list(l = 130, b = 40))

# 8 groups were found

# Install dendextend only when it is missing, then attach it unconditionally.
if (!requireNamespace("dendextend", quietly = TRUE)) {
  install.packages("dendextend")
}
library(dendextend)

# get cluster/well info: rebuild the row dendrogram used by the heatmap
tmp <- heatmapr(simMA, k_col = NA, k_row = NA)
dend <- as.dendrogram(tmp$rows)

# Branch colour of each leaf, recoded as an integer cluster ID (one ID per
# distinct colour). Both vectors below follow the dendrogram leaf order, not
# the row order of `wells`.
col.leaf <- as.numeric(as.factor(get_leaves_branches_col(dend)))
wells.dend <- labels(dend)

Here is the assignment of sugars in each cluster:

# First rows of the leaf label / cluster ID pairing.
print(utils::head(cbind(wells.dend, col.leaf), n = 6L))

Here is the distribution of cluster size:

# Number of wells per cluster, smallest cluster first.
print(sort(table(col.leaf), decreasing = FALSE))

Here is the distribution of molecular weight in each cluster:

# col.leaf is ordered like the dendrogram leaves (wells.dend), whereas the
# rows of result4 are ordered like `wells`. Realign the molecular weights to
# the leaf order so that each weight is attributed to its own cluster.
mw_by_leaf <- result4$MW[match(wells.dend, as.character(wells$wellID))]
boxplot(mw_by_leaf ~ col.leaf, xlab = 'cluster ID', ylab = 'Molecular Weight')

Functional analysis on the identified groups

Here we rely on the KEGG COMPOUND database to understand the underlying biology of the clusters obtained in the previous step:

# KEGGREST is distributed through Bioconductor, so install.packages() cannot
# install it from CRAN; go through BiocManager instead, then attach it.
if (!requireNamespace("KEGGREST", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("KEGGREST")
}
library(KEGGREST)

DB.KEGG = list()

i=1
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=2
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[2]]

i=3
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=4
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[2]]

i=5
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[2]]

i=6
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=7
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=8
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=9
tmp2 = keggGet('C01083')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=10
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[2]]

i=11
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=12
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=13
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=14
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[2]]

i=15
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[2]]

i=16
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[2]]

i=17
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=18
tmp2 = keggGet('C00093')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=19
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=20
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=21
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=22
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=23 
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

# Manually curated KEGG annotation for well rows 24 to 95 of the current plate.
#
# Each well (a row of `wells`) is resolved in one of three ways:
#   * a hand-picked KEGG identifier (kegg_id_by_row), fetched directly;
#   * a KEGG compound search on a custom term (search_term_by_row);
#   * a KEGG compound search on the well's own sourceName (the default).
# For searches, hit_index_by_row gives the position of the search result to
# keep when it is not the first one. These positions were chosen by looking at
# the echoed NAME fields, so they depend on the order KEGG returns the hits.
# NOTE(review): keggGet() is documented to return at most 10 entries per call,
# so positions can only refer to the first 10 search results - confirm.
kegg_id_by_row <- c(
  "24" = "C00092", "25" = "C00880", "40" = "D05566", "43" = "C03619",
  "49" = "C00552", "50" = "C00103", "51" = "C00085", "53" = "C02630",
  "54" = "C05984", "55" = "C11611", "60" = "C00049", "62" = "C00137",
  "65" = "C00042", "72" = "C00624", "77" = "C05729", "79" = "C00645",
  "80" = "C14115", "84" = "C01833", "85" = "C05852", "86" = "C05593",
  "90" = "D01791", "92" = "C01040"
)
search_term_by_row <- c(
  "34" = "Melibiose", "44" = "Lactose", "70" = "Cellobiose"
)
hit_index_by_row <- c(
  "27" = 3, "30" = 2, "32" = 3, "34" = 2, "44" = 3, "47" = 5,
  "58" = 4, "59" = 9, "66" = 10, "71" = 4, "81" = 3, "94" = 2
)

for (i in 24:95) {
  row_key <- as.character(i)
  source_name <- as.character(wells$sourceName[i])

  if (row_key %in% names(kegg_id_by_row)) {
    # A direct identifier lookup returns exactly one entry.
    kegg_hits <- keggGet(kegg_id_by_row[[row_key]])
    hit_index <- 1
  } else {
    search_term <- if (row_key %in% names(search_term_by_row)) {
      search_term_by_row[[row_key]]
    } else {
      source_name
    }
    found_ids <- names(keggFind('compound', query = search_term))
    if (length(found_ids) == 0L) {
      stop("KEGG compound search returned no hit for well row ", i,
           " (query: ", search_term, ")", call. = FALSE)
    }
    kegg_hits <- keggGet(found_ids)
    hit_index <- if (row_key %in% names(hit_index_by_row)) {
      hit_index_by_row[[row_key]]
    } else {
      1
    }
  }

  # Fail with context instead of a bare "subscript out of bounds" when KEGG
  # returns fewer entries than the curated position expects.
  if (hit_index > length(kegg_hits)) {
    stop("Well row ", i, " (", source_name, "): expected at least ", hit_index,
         " KEGG entries but got ", length(kegg_hits), call. = FALSE)
  }

  # Echo the well name next to the candidate KEGG names so the curated choice
  # can be checked by eye in the rendered report.
  print(source_name)
  print(lapply(kegg_hits, function(x) x$NAME))

  DB.KEGG[[as.character(wells$wellID[i])]] <- kegg_hits[[hit_index]]
}

# Keep the per-well KEGG entries with the other results of this plate.
ANNO[[plateID]]$DB.KEGG <- DB.KEGG

Here are the top 20 most represented KEGG pathways across all sugars:

# Count how many wells are annotated with each KEGG pathway and show the most
# frequent ones. The selection is capped at the number of pathways available,
# so no NA entries are printed when there are fewer than 20.
pathway_counts <- sort(
  table(unlist(lapply(DB.KEGG, function(x) x$PATHWAY))),
  decreasing = TRUE
)
pathway_counts[seq_len(min(20L, length(pathway_counts)))]

After getting KEGG pathways corresponding to the different wells, we want to check if the clusters we got from PubChem similarity are biologically relevant.

# Pathway annotations of every well, keyed like DB.KEGG (by well ID).
pathways <- lapply(DB.KEGG, function(x) x$PATHWAY)
pathways.name <- unique(unlist(pathways)) # 153 pathways when originally run
length(pathways.name)

# Enrichment: for every cluster and every pathway, Fisher's exact test of
# "well carries the pathway" against "well belongs to the cluster".
# ENR[[i]] is a character matrix with columns 'Pathway' and 'p-value', sorted
# by increasing absolute p-value, or NULL when cluster i has no hit.
# The stored p-value is signed: negative when the odds ratio is below 1
# (depletion). We keep negative enrichment but won't display it in the app.
# NOTE(review): rounding to 3 digits turns p-values below 0.0005 into 0, which
# also drops their sign - confirm whether the app needs the direction there.
n_clusters <- max(col.leaf)
ENR <- vector("list", length = n_clusters)
for (i in seq_len(n_clusters)) {
  in_cluster <- col.leaf == i
  target_paths <- pathways[wells.dend[which(in_cluster)]]
  bgd_paths <- pathways[wells.dend[which(!in_cluster)]]

  # Cluster membership is the same for every pathway: build the factor once.
  membership <- factor(
    c(rep('target', length(target_paths)), rep('bgd', length(bgd_paths))),
    levels = c('target', 'bgd')
  )

  hit_rows <- list()
  for (j in seq_along(pathways.name)) {
    # vapply keeps this a logical vector even when one of the groups is empty.
    has_pathway <- vapply(
      c(target_paths, bgd_paths),
      function(x) pathways.name[j] %in% x,
      logical(1)
    )
    tmp <- fisher.test(
      x = factor(has_pathway, levels = c('TRUE', 'FALSE')),
      y = membership
    )
    #if(tmp$estimate > 1 & tmp$p.value < 0.05) ENR[[i]] = c(ENR[[i]],pathways.name[j])
    if (tmp$p.value < 0.03) {
      hit_rows[[length(hit_rows) + 1L]] <- c(
        pathways.name[j],
        round(sign(tmp$estimate - 1) * tmp$p.value, 3)
      )
    }
  }

  if (length(hit_rows) == 0L) {
    # `ENR[[i]] <- NULL` would delete the element and shorten the list;
    # assigning list(NULL) keeps one slot per cluster.
    ENR[i] <- list(NULL)
    next
  }

  HITS <- do.call(rbind, hit_rows)
  colnames(HITS) <- c('Pathway', 'p-value')
  # drop = FALSE keeps a one-hit result as a 1 x 2 matrix instead of a vector.
  ENR[[i]] <- HITS[order(abs(as.numeric(HITS[, 2])), decreasing = FALSE), , drop = FALSE]
}

# Store the enrichment results, the cluster IDs, the pathway names and the
# dendrogram-ordered well labels under this plate's entry. The enrichment
# mapping might be useful when we detect significant clusters in a specific
# group of strains.
plate_results <- list(
  ENR = ENR,
  col.leaf = col.leaf,
  pathways.name = pathways.name,
  wells.dend = wells.dend
)
for (field in names(plate_results)) {
  ANNO[[plateID]][[field]] <- plate_results[[field]]
}

Here is the content of each cluster in terms of enriched KEGG pathways:

# Raise the output width to 1000 characters, then print the per-cluster
# enrichment results.
options(list(width = 1000))
print(ENR)

Analysis for PM2A plate sources

# Plate whose wells are analysed from here on.
plateID <- "PM2A"

Chemoinformatics features extraction

# Wells that belong to the plate under analysis.
wells <- source_db[which(source_db$plateID == plateID), ]

head(wells)

# Fetch the structures for the wells' PubChem IDs through the ChemMine Tools
# job interface. Poll once per second rather than spinning in an empty loop
# while the job is still running.
job1 <- launchCMTool("pubchemID2SDF", wells$pubchem)
while (status(job1) == 'RUNNING') {
  Sys.sleep(1)
}
result1 <- result(job1)

# Compute the OpenBabel descriptors for the retrieved structures.
job4 <- launchCMTool("OpenBabel Descriptors", result1)
while (status(job4) == 'RUNNING') {
  Sys.sleep(1)
}
result4 <- result(job4)
# Put the well source names in front of the descriptor columns.
result4 <- cbind(wells$sourceName, result4)
head(result4)

# convert the structures to atom-pair descriptors, labelled by well ID
# (wellID is the second column of `wells`)
apset <- sdf2ap(result1)
cid(apset) <- as.character(wells$wellID)

ANNO[[plateID]] <- list('result' = result4, 'apset' = apset)

Similarity between carbon sources

We compute the Tanimoto similarity between every pair of carbon-source molecules, based on their binary fingerprints (one bit per pattern), as explained in https://www.surechembl.org/knowledgebase/84207-tanimoto-coefficient-and-fingerprint-generation.

# Turn the atom-pair descriptors stored for this plate into fingerprints.
plate_anno <- ANNO[[plateID]]
fpset <- desc2fp(plate_anno$apset)

Here are two examples of the fingerprints extracted from PubChem (only the first 20 bits are shown):

# Print the source name and the first 20 fingerprint values of the wells in
# rows 5 and 7.
for (example_row in c(5, 7)) {
  cat(as.character(wells$sourceName[example_row]), head(fpset[[example_row]]@fp, 20), '\n')
}

# Similarity between those two example wells.
cat(
  "Tanimoto's Similarity between", as.character(wells$sourceName[5]),
  "and", as.character(wells$sourceName[7]),
  ":", fpSim(fpset[5], fpset[7], sorted = FALSE), '\n'
)

# All-against-all similarity: each fingerprint is compared with the whole set.
simMA <- sapply(cid(fpset), function(well_id) {
  fpSim(fpset[well_id], fpset, sorted = FALSE)
})
ANNO[[plateID]]$simMA <- simMA
#save(simMA,file='../data/pubchem_kegg/cluster_info.Rdata')

# Single-linkage hierarchical clustering on the dissimilarity 1 - similarity.
hc <- hclust(as.dist(1 - simMA), method = "single")


# Load heatmaply, installing it first when it is not available.
if (!require(heatmaply)) {
  install.packages('heatmaply')
  library(heatmaply) # Loads the package
}


# Interactive heatmap of the similarity matrix, with wider left and bottom
# margins to leave room for the well labels.
heatmaply(
  simMA,
  k_col = NA,
  k_row = NA,
  label_names = c("product1", "product2", "similarity"),
  labRow = rownames(simMA),
  labCol = colnames(simMA),
  fontsize_row = 4,
  fontsize_col = 4
) %>%
  layout(margin = list(l = 130, b = 40))

# 10 groups were found

# get cluster/well info: rebuild the heatmap data with the same settings and
# take the row dendrogram from it
tmp <- heatmapr(simMA, k_col = NA, k_row = NA)
dend <- as.dendrogram(tmp$rows)

# Load dendextend, installing it first when it is not available.
if (!require(dendextend)) {
  install.packages('dendextend')
  library(dendextend) # Loads the package
}

# Branch colour of every leaf, recoded as a numeric cluster ID, together with
# the leaf labels taken from the same dendrogram.
leaf_colours <- get_leaves_branches_col(dend)
col.leaf <- as.numeric(as.factor(leaf_colours))
wells.dend <- labels(dend)

Here is the assignment of sugars in each cluster:

# First rows of the well label / cluster ID pairing.
cluster_assignment <- cbind(wells.dend, col.leaf)
head(cluster_assignment)

Here is the distribution of cluster size:

# Number of wells per cluster, smallest cluster first.
cluster_sizes <- table(col.leaf)
sort(cluster_sizes)

Here is the distribution of molecular weight in each cluster:

# col.leaf follows the dendrogram leaf order (the order of wells.dend), while
# result4 is in the row order of `wells`. Reorder the molecular weights to the
# leaf order first, otherwise each weight is grouped under another well's
# cluster.
mw_by_leaf <- result4$MW[match(wells.dend, as.character(wells$wellID))]
boxplot(mw_by_leaf ~ col.leaf, xlab = 'cluster ID', ylab = 'Molecular Weight')

Functional analysis on the identified groups

Here we rely on the KEGG compound database to understand the underlying biology of the clusters obtained in the previous step:

# KEGGREST is distributed through Bioconductor rather than CRAN, so a plain
# install.packages('KEGGREST') cannot find it. Install it through BiocManager
# (fetching BiocManager itself from CRAN when needed), then load it.
if (!require(KEGGREST)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("KEGGREST")
  library(KEGGREST) # Loads the package
}

DB.KEGG = list()

i=1
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[3]]

i=2 
tmp2 = keggGet('C00973')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=3
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=4
tmp2 = keggGet('C00973')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=5
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=6
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=7
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=8
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=9
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=10
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=11
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[3]]

i=12
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[2]]

i=13 
tmp2 = keggGet('C00270')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=14 
tmp2 = keggGet('C01487')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=15
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=16
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=17
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=18
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=19
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=20
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[3]]

i=21 
tmp2 = keggGet('C00503')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=22
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=23  
tmp2 = keggGet('C00243')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=24
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=25 
tmp2 = keggGet('C00031')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=26
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[3]]

i=27 
tmp2 = keggGet('C08243')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=28                       
tmp2 = keggGet('D04845')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=29 
tmp2 = keggGet('C03619')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=30
tmp2 = keggGet('C03619')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[2]]

i=31 
tmp2 = keggGet('C00031')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=32 
tmp2 = keggGet('C00257')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=33 
tmp2 = keggGet('C02603')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=34 
tmp2 = keggGet('C03619')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=35 
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=36 
tmp2 = keggGet('C00492')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=37
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=38 
tmp2 = keggGet('C02076')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=39
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=40
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=41 
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=42
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=43
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=44 
tmp2 = keggGet('C00140')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=45
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=46 
tmp2 = keggGet('C00431')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=47
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[8]]

i=48
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=49
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[2]]

i=50
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=51
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=52
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[3]]

i=53 
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=54 
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=55 
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=56 
tmp2 = keggGet('C00989')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=57
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=58 
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=59
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=60 
tmp2 = keggGet('C00256')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=61
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=62 
tmp2 = keggGet('C05402')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=63
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=64
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=65  
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=66 
tmp2 = keggGet('C00121')
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=67 
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=68
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

i=69 
tmp = names(keggFind('compound',query=as.character(wells$sourceName[i])))
tmp2 = keggGet(tmp)
as.character(wells$sourceName[i])
lapply(tmp2,function(x)x$NAME)
DB.KEGG[[as.character(wells$wellID[i])]] = tmp2[[1]]

# Manual KEGG curation for wells 70 to 95 of the current plate.
#
# For each well, the source name is searched in the KEGG COMPOUND database,
# the candidate entries are fetched, and their names are printed so the
# choice can be checked by eye. The retained entry is the first candidate
# unless a different position was chosen by hand (`kegg_pick`). For wells
# listed in `kegg_direct_id`, the name search is skipped and the given KEGG
# compound ID is fetched directly.
kegg_direct_id <- c(
  "73" = "C19779",
  "88" = "C00487",
  "89" = "C18706",
  "90" = "C04227",
  "94" = "C02845"
)
# Position of the retained entry among the fetched candidates (default: 1).
# NOTE(review): these positions depend on the order of the KEGG search
# results at the time of curation -- re-check them if KEGG content changes.
kegg_pick <- c("72" = 6L, "79" = 2L, "83" = 2L, "93" = 5L)

for (i in 70:95) {
  well_key <- as.character(i)
  well_source <- as.character(wells$sourceName[i])

  if (well_key %in% names(kegg_direct_id)) {
    tmp2 <- keggGet(kegg_direct_id[[well_key]])
  } else {
    tmp <- names(keggFind('compound', query = well_source))
    if (length(tmp) == 0L) {
      stop("No KEGG compound found for well ", i, " ('", well_source, "')",
           call. = FALSE)
    }
    # keggGet() is documented to return at most 10 entries per call, so a
    # hand-picked position must stay within the fetched entries (checked
    # below).
    tmp2 <- keggGet(tmp)
  }

  # Show the source name next to the candidate KEGG names for inspection.
  print(well_source)
  print(lapply(tmp2, function(x) x$NAME))

  well_pick <- if (well_key %in% names(kegg_pick)) kegg_pick[[well_key]] else 1L
  if (well_pick > length(tmp2)) {
    stop("Well ", i, " ('", well_source, "'): entry ", well_pick,
         " requested but only ", length(tmp2), " KEGG entries returned",
         call. = FALSE)
  }
  DB.KEGG[[as.character(wells$wellID[i])]] <- tmp2[[well_pick]]
}

# Keep the curated KEGG entries alongside the other annotations of this plate.
ANNO[[plateID]]$DB.KEGG <- DB.KEGG

Here are the top 20 most represented KEGG pathways across all sugars:

# Count how many wells are annotated with each KEGG pathway and show the most
# frequent ones. head() returns at most 20 entries and, unlike `[1:20]`, does
# not pad the output with NA when fewer than 20 pathways are present.
head(sort(table(unlist(lapply(DB.KEGG, function(x) x$PATHWAY))), decreasing = TRUE), 20)

After getting KEGG pathways corresponding to the different wells, we want to check if the clusters we got from PubChem similarity are biologically relevant.

# Pathway annotations of every well (one list element per DB.KEGG entry),
# followed by the distinct pathway labels found across all wells.
# (Author's note from the original run: 153 pathways.)
pathways <- lapply(DB.KEGG, function(entry) entry$PATHWAY)
pathways.name <- unique(unlist(pathways))
length(pathways.name)

# enrichment
# For every cluster id in `col.leaf`, test each KEGG pathway for a difference
# in frequency between the wells of the cluster ("target") and all remaining
# wells ("bgd") with Fisher's exact test. Pathways with p < 0.03 are kept in a
# two-column character matrix ('Pathway', 'p-value') sorted by increasing
# p-value. The stored p-value is signed: negative when the estimated odds
# ratio is below 1 (pathway depleted in the cluster).
# NOTE(review): rounding to 3 decimals turns very small p-values into "0",
# which loses the enrichment/depletion sign for those rows.
n_clusters <- max(col.leaf)
ENR <- vector("list", length = n_clusters)
for (i in seq_len(n_clusters)) {
  target_sets <- pathways[wells.dend[which(col.leaf == i)]]
  bgd_sets <- pathways[wells.dend[which(col.leaf != i)]]
  group <- factor(
    c(rep('target', length(target_sets)), rep('bgd', length(bgd_sets))),
    levels = c('target', 'bgd')
  )

  # One slot per pathway; slots of non-significant pathways stay NULL and are
  # ignored by rbind() below.
  hit_rows <- vector("list", length(pathways.name))
  for (j in seq_along(pathways.name)) {
    target <- vapply(target_sets, function(x) pathways.name[j] %in% x, logical(1))
    bgd <- vapply(bgd_sets, function(x) pathways.name[j] %in% x, logical(1))

    ft <- fisher.test(
      x = factor(c(target, bgd), levels = c('TRUE', 'FALSE')),
      y = group
    )
    if (ft$p.value < 0.03) {
      hit_rows[[j]] <- c(pathways.name[j], round(sign(ft$estimate - 1) * ft$p.value, 3))
    }
  }
  HITS <- do.call(rbind, hit_rows)

  # Clusters without any significant pathway keep their NULL slot. Assigning
  # NULL with `[[<-` would delete the element and could leave ENR shorter than
  # the number of clusters.
  if (!is.null(HITS)) {
    colnames(HITS) <- c('Pathway', 'p-value')
    # we keep negative enrichment but won't display it in the app
    ord <- order(abs(as.numeric(HITS[, 2])), decreasing = FALSE)
    # drop = FALSE keeps a one-row matrix when a single pathway is significant.
    ENR[[i]] <- HITS[ord, , drop = FALSE]
  }
}

# This enrichment mapping might be useful when we detect significant clusters in a specific group of strains
ANNO[[plateID]]$ENR <- ENR
ANNO[[plateID]]$col.leaf <- col.leaf
ANNO[[plateID]]$pathways.name <- pathways.name
ANNO[[plateID]]$wells.dend <- wells.dend

Here is the content of each cluster in terms of enriched KEGG pathways:

# Use a very wide output width so the enrichment tables are not wrapped,
# then display the enrichment results of every cluster.
options(width = 1000)
print(ENR)

SAVE FINAL OBJECT

# Write the complete annotation list (all plates processed so far) to disk.
save(list = "ANNO", file = '../data/biolog_kegg_wells.Rdata')


kevinVervier/CarboLogR documentation built on Sept. 25, 2019, 6:06 p.m.