#' ChemSpider.annotation
#'
#' The ChemSpider annotation utility maps the input mz to known compounds in
#' biological, chemical, and environmental databases. Users have the option to
#' specify which data sources in ChemSpider should be used for annotaion.
#' NOTE: Please obtain a security token from www.chemspider.com/Register.aspx
#' before using this function. You will have to specify your role as a Service
#' Subscriber.
#'
#' This utility uses the ChemSpider web services to search for compounds based
#' on the input mz, mass search threshold (+/- ppm), and adduct. The output is
#' generated as an HTML report and a text file that includes pathway and
#' compound annotations with links to several biological, chemical, and
#' environmental databases. The function takes as input a data frame with a
#' list of input m/z, a user-defined m/z threshold (ppm) to define the minimum
#' and maximum mass range, list of adducts; eg: c(<93>M+H<94>,
#' <93>M+H-H2O<94>), the output folder location, list of data sources to be
#' searched, and a security token string. ChemSpider_annotation_results.html
#'
#' @param dataA Feature table from apLCMS or XCMS. The first column should be
#' m/z.
#' @param max.mz.diff Metlin matching m/z threshold in ppm. eg: 5
#' @param queryadductlist List of adducts to be used for searching. eg:
#' c("M+H","M+Na","M+K"), c("positive") for all positive ion adducts, or
#' c("negative") for all negative ion adducts, or c("all") for all adducts as
#' defined in Metlin.
#' @param xMSannotator.outloc Output location where the HTML and text reports
#' will be written. eg: "C:/experiment1/xMSanalyzeroutput/"
#' @param numnodes Number of computing nodes to use. Default: 1
#' @param datasources Provide a list of data sources to search against. The
#' following data sources are supported: KEGG, Human Metabolome Database,
#' LipidMAPS, MassBank, PubChem, BioCyc, SMPDB Small Molecule Pathway Database,
#' EPA DSSTox, EPA Toxcast, Pesticide Common Names, ChEMBL, ChEBI, and NIST.
#' The names of the data sources should be provided exactly as listed above.
#' eg: c("KEGG", "PubChem", "EPA Toxcast", "Human Metabolome Database",
#' "BioCyc")
#' @param tokenstr Please register on ChemSpider website to obtain a security
#' token. www.chemspider.com/Register.aspx Provide that security token here.
#' Eg: "mysecuritycode"
#' @param maxhits Maximum number of result hits to retrieve. Default: 30
#' @param syssleep Wait time between queries. Default: 5
#' @return A list is returned. \item{html.res }{Annotation report in HTML
#' format} \item{text.res }{Text delimited annotation report}
#' @author Karan Uppal <kuppal2@@emory.edu>
#' @keywords ~annotation ~pesticides ~pathway ~ChemSpider
ChemSpider.annotation <- function(dataA, max.mz.diff = 10,
queryadductlist = c("M+H"), xMSannotator.outloc, numnodes = 1,
datasources = c("KEGG"), tokenstr = NA, maxhits = 30,
syssleep = 5) {
data(adduct_table)
adduct_table <- as.data.frame(adduct_table)
data_a <- as.data.frame(dataA)
# print('Using the 1st column as \'mz\' for
# annotation.')
if (is.na(tokenstr) == TRUE) {
stop("Please specify a valid ChemSpider security token. \nCreate a ChemSpider account to obtain a token: \n www.chemspider.com/Register.aspx")
}
mzlist <- data_a[, 1]
# =c('PubChem','MassBank', 'EPA DSSTox','EPA
# Toxcast','NIST Chemistry WebBook','KEGG',
#'Human Metabolome Database', 'ChEMBL', 'ChEBI', 'NIAID','Pesticide Common Names','SMPDB Small Molecule Pathway Database',
#'MeSH','LipidMAPS','ChemBank','BioCyc')
dir.create(xMSannotator.outloc, showWarnings = FALSE)
setwd(xMSannotator.outloc)
# adduct_table<-read.table('/Users/karanuppal/Documents/Emory/JonesLab/Projects/xMSannotator/adduct_table.txt',sep='\t',header=TRUE)
# adduct_table<-adduct_table[c(which(adduct_table[,6]=='S'),which(adduct_table[,6]=='Acetonitrile')),]
adduct_names <- as.character(adduct_table[, 1])
adductlist <- adduct_table[, 4]
mult_charge <- adduct_table[, 3]
num_mol <- adduct_table[, 2]
names(adductlist) <- as.character(adduct_names)
names(mult_charge) <- as.character(adduct_names)
names(num_mol) <- as.character(adduct_names)
alladducts <- adduct_names
if (queryadductlist[1] == "positive") {
queryadductlist <- adduct_names[which(adduct_table[,
5] == "positive")]
} else {
if (queryadductlist[1] == "negative") {
queryadductlist <- adduct_names[which(adduct_table[,
5] == "negative")]
} else {
if (queryadductlist[1] == "all") {
queryadductlist <- alladducts
} else {
if (length(which(queryadductlist %in% alladducts ==
FALSE)) > 0) {
errormsg <- paste("Adduct should be one of:",
sep = "")
for (i in alladducts) {
errormsg <- paste(errormsg, i, sep = " ; ")
}
stop(errormsg, "\n\nUsage: feat.batch.annotation.KEGG(dataA,max.mz.diff=10, queryadductlist=c(\"M+H\", \"M+Na\"), xMSannotator.outloc, numnodes=1)",
"\n\n OR feat.batch.annotation.KEGG(dataA,max.mz.diff=10, queryadductlist=c(\"positive\"), xMSannotator.outloc, numnodes=1)",
"\n\n OR feat.batch.annotation.KEGG(dataA,max.mz.diff=10, queryadductlist=c(\"negative\"), xMSannotator.outloc, numnodes=1)",
"\n\n OR feat.batch.annotation.KEGG(dataA,max.mz.diff=10, queryadductlist=c(\"all\"), xMSannotator.outloc, numnodes=1)")
}
}
}
}
parentres = {
}
# cnames<-c('','Adduct','Query.m/z', 'Search mass \n
# tolerance range (+/-)', 'Metlin', 'Compound.Name',
# 'CASID', 'KEGG.Compound.ID', 'KEGG.Pathway.name',
# 'KEGG.Pathway.ID', 'HMDB.ID', 'PubChem.Substance.ID',
# 'PubChem.Compound.ID', 'ChEBI.ID', 'LIPID.MAPS.ID')
# cnames<-c('','Adduct','Query.m/z', 'Search mass \n
# tolerance range (+/-)', 'ChemspiderID',
# 'Link.to.ChemSpider','CommonName',
# 'Molecular.Formula','SMILES','InChI', 'InChIKey',
# 'AverageMass','MolecularWeight','MonoisotopicMass',
# 'NominalMass', 'ALogP', 'XLogP')
cnames <- c("", "Adduct", "Query.m/z", "Search mass \n tolerance range (+/-)",
"ChemspiderID", "CommonName", "Molecular.Formula",
"SMILES", "InChI", "InChIKey", "AverageMass", "MolecularWeight",
"MonoisotopicMass", "NominalMass", "ALogP", "XLogP",
"Structure", "KEGG.Compound.ID", "HMDB", "LipidMAPS",
"PubChem", "MassBank", "BioCyc", "SMPDB", "EPA.DSSTox",
"EPA.Toxcast", "Pesticide.Common.Names", "ChEMBL",
"ChEBI", "NIST.Chem.WebBook", "WikiPathways", "DrugBank",
"Comparative Toxicogenomics Database", "ACToR: Aggregated Computational Toxicology Resource")
for (adnum in 1:length(queryadductlist)) {
Sys.sleep(syssleep)
adductname = queryadductlist[adnum]
adductmass = adductlist[as.character(adductname)]
cl <- makeSOCKcluster(numnodes)
clusterEvalQ(cl, library(XML))
clusterEvalQ(cl, library(R2HTML))
clusterEvalQ(cl, library(RCurl))
clusterEvalQ(cl, library(SSOAP))
clusterEvalQ(cl, "processWSDL")
clusterEvalQ(cl, library(png))
clusterEvalQ(cl, "chspider.batch.annotation.child")
# print(paste('Query adduct: ',adductname,sep=''))
mz.annot.res <- new("list")
min_mz <- min(mzlist)
max_mz <- max(mzlist)
# mz_group<-ceiling(max_mz/min_mz)
mz_group <- 10
# mz_group<-round(max_mz/10) length(mzlist)
num_mz <- 1
for (mind in seq(1, length(mzlist), mz_group)) {
stopind <- mind + mz_group - 1
if (stopind > length(mzlist)) {
stopind <- length(mzlist)
}
s1 <- mzlist[mind:stopind]
s1 <- unique(s1)
num_mz <- num_mz + length(s1)
if (num_mz%%50 > 0) {
Sys.sleep(syssleep)
} else {
Sys.sleep(syssleep)
}
if (length(s1) > 2) {
repeat {
cur.annot.res <- parLapply(cl, s1, chspider.batch.annotation.child,
max.mz.diff = max.mz.diff, adductname = adductname,
datasources, tokenstr, maxhits, syssleep,
adduct_table = adduct_table)
if (is(cur.annot.res[1], "try-error")) {
Sys.sleep(10)
cur.annot.res <- parLapply(cl, s1, chspider.batch.annotation.child,
max.mz.diff = max.mz.diff, adductname = adductname,
datasources, tokenstr, maxhits, syssleep,
adduct_table = adduct_table)
} else {
break
}
}
mz.annot.res <- c(mz.annot.res, cur.annot.res)
} else {
for (i in 1:length(s1)) {
# print('mz is') print(s1[i])
rescur <- chspider.batch.annotation.child(mz.val = s1[i],
max.mz.diff = max.mz.diff, adductname = adductname,
datasources, tokenstr, maxhits, syssleep,
adduct_table = adduct_table)
# print(length(rescur))
if (length(rescur) > 0) {
rescur <- as.matrix(rescur)
# print(dim(rescur))
if (dim(rescur)[2] == 1) {
rescur <- t(rescur)
rescur <- as.data.frame(rescur)
}
rescur <- as.data.frame(rescur)
# print(dim(rescur))
mz.annot.res[[length(mz.annot.res) +
1L]] <- rescur
}
}
}
if (mind%%10 > 0) {
Sys.sleep(syssleep/2)
} else {
Sys.sleep(1)
stopCluster(cl)
cl <- makeSOCKcluster(numnodes)
clusterEvalQ(cl, library(XML))
clusterEvalQ(cl, library(RCurl))
clusterEvalQ(cl, library(SSOAP))
clusterEvalQ(cl, library(png))
clusterEvalQ(cl, "processWSDL")
clusterEvalQ(cl, "feat.batch.annotation.child")
}
fname <- paste("cur_res.Rda", sep = "")
# save(mz.annot.res,file=fname)
}
stopCluster(cl)
res = {
}
# print(length(mz.annot.res))
if (length(mz.annot.res) > 0) {
# print(adductname)
for (mzl in 1:length(mz.annot.res)) {
# print(dim(mz.annot.res[[mzl]]))
res = rbind(res, mz.annot.res[[mzl]])
}
}
# print(mz.annot.res)
res <- unique(res)
# print('length of res') print(length(res))
fname2 <- paste(adductname, "res.Rda", sep = "")
# save(res,file=fname2)
text_res <- {
}
if (length(res) > 0) {
adductname1 = c(rep(adductname, dim(res)[1]))
temp_res <- cbind(adductname1, res)
temp_res <- as.matrix(temp_res)
if (dim(temp_res)[2] == 1) {
temp_res <- t(temp_res)
temp_res <- as.data.frame(temp_res)
}
bad_rows <- which(temp_res[, 2] == "1")
if (length(bad_rows) > 0) {
temp_res <- temp_res[-bad_rows, ]
temp_res <- as.matrix(temp_res)
# temp_res<-t(temp_res)
if (dim(temp_res)[2] == 1) {
temp_res <- t(temp_res)
}
}
# temp_res<-as.data.frame(temp_res)
colnames(temp_res) = NULL
# text_resindex<-c(1,2,5,6,7,8,11,10,13,15,17,19,21)
# text_resindex<-c(1,2,3,4,6:16)
text_resindex <- c(1, 2, 3, 6:9, 10:17, seq(18,
50, 2))
text_resindex <- text_resindex + 1
text_res <- temp_res[, c(1, text_resindex)]
text_res <- as.matrix(text_res)
if (dim(text_res)[2] == 1) {
text_res <- t(text_res)
}
text_res <- as.data.frame(text_res)
bad_rows <- which(text_res[, 2] == "1")
if (length(bad_rows) > 0) {
text_res <- text_res[-bad_rows, ]
text_res <- as.matrix(text_res)
text_res <- t(text_res)
}
text_res <- as.data.frame(text_res)
sernum = seq(1, dim(text_res)[1])
text_res <- cbind(sernum, text_res)
colnames(text_res) = cnames
parentres = rbind(parentres, temp_res)
rm(temp_res)
colnames(parentres) = NULL
}
fname = paste(xMSannotator.outloc, "/ChemSpider_annotation_results_",
queryadductlist[adnum], ".txt", sep = "")
write.table(text_res, file = fname, sep = "\t", row.names = FALSE)
Sys.sleep(2)
}
html_res <- {
}
if (length(parentres) > 0) {
res <- parentres[order(parentres[, 2]), ]
# html_resindex<-c(1,2,5,4,6:7,9,11:12,14,16,18,20,22)
res <- as.matrix(res)
if (dim(res)[2] == 1) {
res <- t(res)
}
# html_resindex<-c(1,2,5,6:7,9,11:12,14,16,18,20,22)
# text_resindex<-c(1,2,3,6:9,10:17,seq(18,50,2))
html_resindex <- c(1, 2, 5:16, 17, seq(19, 51, 2))
html_resindex <- html_resindex + 1
html_res <- res[, c(1, html_resindex)]
html_res <- as.matrix(html_res)
if (dim(html_res)[2] == 1) {
html_res <- t(html_res)
}
sernum = seq(1, dim(html_res)[1])
# html_res<-cbind(sernum,html_res)
html_res <- as.data.frame(html_res)
#'ChemspiderID',
# cnames<-c('','Adduct','Query.m/z', 'Search mass \n
# tolerance range (+/-)', 'Metlin.match.ID',
# 'Metlin.match.mass', 'Metlin.compound.name', 'CASID',
# 'KEGG.Compound.ID', 'KEGG.Pathway.name',
# 'KEGG.Pathway.ID', 'HMDB.ID', 'PubChem.Substance.ID',
# 'PubChem.Compound.ID', 'ChEBI.ID', 'LIPID.MAPS.ID')
cnames <- c("Adduct", "Query.m/z", "Search mass \n tolerance range (+/-)",
"Link.to.ChemSpider", "CommonName", "Molecular.Formula",
"SMILES", "InChI", "InChIKey", "AverageMass",
"MolecularWeight", "MonoisotopicMass", "NominalMass",
"ALogP", "XLogP", "Structure", "KEGG.Compound.ID",
"HMDB", "LipidMAPS", "PubChem", "MassBank", "BioCyc",
"SMPDB", "EPA.DSSTox", "EPA.Toxcast", "Pesticide.Common.Names",
"ChEMBL", "ChEBI", "NIST.Chem.WebBook", "WikiPathways",
"DrugBank", "Comparative Toxicogenomics Database",
"ACToR: Aggregated Computational Toxicology Resource")
colnames(html_res) <- cnames
rownames(html_res) <- sernum
cnames2 <- cnames[c(1:3, 4, 5:6, 12, 16, 7:9, 17:33)]
html_res <- html_res[, c(1:3, 4, 5:6, 12, 16, 7:9,
17:33)]
colnames(html_res) <- cnames2
fname = paste("ChemSpider_annotation_results", sep = "")
unlink(fname)
# fname=paste(xMSannotator.outloc,'/KEGG_annotation_results.html',sep='')
HTMLInitFile(filename = fname, Title = "ChemSpider annotation results",
outdir = xMSannotator.outloc)
fname = paste(xMSannotator.outloc, "/ChemSpider_annotation_results.html",
sep = "")
HTML(html_res, file = fname, Border = 1, innerBorder = 1,
useCSS = TRUE)
HTMLEndFile(file = fname)
if (length(queryadductlist) > 1) {
text_res <- {
}
text_resindex <- c(1, 2, 3, 6:16, 17, seq(18,
50, 2))
text_resindex <- text_resindex + 1
text_res <- res[, c(1, text_resindex)]
text_res <- as.matrix(text_res)
if (dim(text_res)[2] == 1) {
text_res <- t(text_res)
}
if (length(text_res) > 0) {
sernum = seq(1, dim(text_res)[1])
} else {
sernum = {
}
}
# text_res<-cbind(sernum,text_res)
#'Link.to.ChemSpider',
cnames <- c("Adduct", "Query.m/z", "Search mass \n tolerance range (+/-)",
"ChemspiderID", "CommonName", "Molecular.Formula",
"SMILES", "InChI", "InChIKey", "AverageMass",
"MolecularWeight", "MonoisotopicMass", "NominalMass",
"ALogP", "XLogP", "Structure", "KEGG.Compound.ID",
"HMDB", "LipidMAPS", "PubChem", "MassBank",
"BioCyc", "SMPDB", "EPA.DSSTox", "EPA.Toxcast",
"Pesticide.Common.Names", "ChEMBL", "ChEBI",
"NIST.Chem.WebBook", "WikiPathways", "DrugBank",
"Comparative Toxicogenomics Database", "ACToR: Aggregated Computational Toxicology Resource")
text_res <- as.data.frame(text_res)
colnames(text_res) = cnames
# text_res<-text_res[,c(1:3,4,6,13,17,7:9,18:29)]
# c('','Adduct','Query.m/z', 'Search mass \n tolerance
# range (+/-)', 'Metlin.match.ID', 'Metlin.match.mass',
# 'Metlin.compound.name', 'CASID', 'KEGG.Compound.ID',
# 'KEGG.Pathway.ID', 'KEGG.Pathway.name', 'HMDB.ID',
# 'PubChem.Substance.ID', 'PubChem.Compound.ID',
# 'ChEBI.ID', 'LIPID.MAPS.ID')
fname = paste(xMSannotator.outloc, "/ChemSpider_annotation_results_alladducts.txt",
sep = "")
write.table(text_res, file = fname, sep = "\t",
row.names = FALSE)
}
}
return(list(text.res = text_res, html.res = html_res))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.