R/ChemSpider.annotation.R

Defines functions ChemSpider.annotation

Documented in ChemSpider.annotation

#' ChemSpider.annotation
#' 
#' The ChemSpider annotation utility maps the input mz to known compounds in
#' biological, chemical, and environmental databases. Users have the option to
#' specify which data sources in ChemSpider should be used for annotaion.
#' NOTE: Please obtain a security token from www.chemspider.com/Register.aspx
#' before using this function. You will have to specify your role as a Service
#' Subscriber.
#' 
#' This utility uses the ChemSpider web services to search for compounds based
#' on the input mz, mass search threshold (+/- ppm), and adduct.  The output is
#' generated as an HTML report and a text file that includes pathway and
#' compound annotations with links to several biological, chemical, and
#' environmental databases. The function takes as input a data frame with a
#' list of input m/z, a user-defined m/z threshold (ppm) to define the minimum
#' and maximum mass range, list of adducts; eg: c(<93>M+H<94>,
#' <93>M+H-H2O<94>), the output folder location, list of data sources to be
#' searched, and a security token string.  ChemSpider_annotation_results.html
#' 
#' @param dataA Feature table from apLCMS or XCMS.  The first column should be
#' m/z.
#' @param max.mz.diff Metlin matching m/z threshold in ppm. eg: 5
#' @param queryadductlist List of adducts to be used for searching.  eg:
#' c("M+H","M+Na","M+K"), c("positive") for all positive ion adducts, or
#' c("negative") for all negative ion adducts, or c("all") for all adducts as
#' defined in Metlin.
#' @param xMSannotator.outloc Output location where the HTML and text reports
#' will be written.  eg: "C:/experiment1/xMSanalyzeroutput/"
#' @param numnodes Number of computing nodes to use. Default: 1
#' @param datasources Provide a list of data sources to search against.  The
#' following data sources are supported: KEGG, Human Metabolome Database,
#' LipidMAPS, MassBank, PubChem, BioCyc, SMPDB Small Molecule Pathway Database,
#' EPA DSSTox, EPA Toxcast, Pesticide Common Names, ChEMBL, ChEBI, and NIST.
#' The names of the data sources should be provided exactly as listed above.
#' eg: c("KEGG", "PubChem", "EPA Toxcast", "Human Metabolome Database",
#' "BioCyc")
#' @param tokenstr Please register on ChemSpider website to obtain a security
#' token.  www.chemspider.com/Register.aspx Provide that security token here.
#' Eg: "mysecuritycode"
#' @param maxhits Maximum number of result hits to retrieve. Default: 30
#' @param syssleep Wait time between queries. Default: 5
#' @return A list is returned.  \item{html.res }{Annotation report in HTML
#' format} \item{text.res }{Text delimited annotation report}
#' @author Karan Uppal <kuppal2@@emory.edu>
#' @keywords ~annotation ~pesticides ~pathway ~ChemSpider
ChemSpider.annotation <- function(dataA, max.mz.diff = 10, 
    queryadductlist = c("M+H"), xMSannotator.outloc, numnodes = 1, 
    datasources = c("KEGG"), tokenstr = NA, maxhits = 30, 
    syssleep = 5) {
    
    
    data(adduct_table)
    adduct_table <- as.data.frame(adduct_table)
    
    data_a <- as.data.frame(dataA)
    
    # print('Using the 1st column as \'mz\' for
    # annotation.')
    
    if (is.na(tokenstr) == TRUE) {
        
        stop("Please specify a valid ChemSpider security token. \nCreate a ChemSpider account to obtain a token: \n www.chemspider.com/Register.aspx")
        
    }
    mzlist <- data_a[, 1]
    
    # =c('PubChem','MassBank', 'EPA DSSTox','EPA
    # Toxcast','NIST Chemistry WebBook','KEGG',
    #'Human Metabolome Database', 'ChEMBL', 'ChEBI', 'NIAID','Pesticide Common Names','SMPDB Small Molecule Pathway Database',
    #'MeSH','LipidMAPS','ChemBank','BioCyc')
    
    dir.create(xMSannotator.outloc, showWarnings = FALSE)
    setwd(xMSannotator.outloc)
    
    # adduct_table<-read.table('/Users/karanuppal/Documents/Emory/JonesLab/Projects/xMSannotator/adduct_table.txt',sep='\t',header=TRUE)
    # adduct_table<-adduct_table[c(which(adduct_table[,6]=='S'),which(adduct_table[,6]=='Acetonitrile')),]
    
    adduct_names <- as.character(adduct_table[, 1])
    adductlist <- adduct_table[, 4]
    mult_charge <- adduct_table[, 3]
    num_mol <- adduct_table[, 2]
    names(adductlist) <- as.character(adduct_names)
    names(mult_charge) <- as.character(adduct_names)
    names(num_mol) <- as.character(adduct_names)
    alladducts <- adduct_names
    
    if (queryadductlist[1] == "positive") {
        queryadductlist <- adduct_names[which(adduct_table[, 
            5] == "positive")]
        
    } else {
        if (queryadductlist[1] == "negative") {
            
            queryadductlist <- adduct_names[which(adduct_table[, 
                5] == "negative")]
            
        } else {
            if (queryadductlist[1] == "all") {
                
                
                queryadductlist <- alladducts
                
                
            } else {
                if (length(which(queryadductlist %in% alladducts == 
                  FALSE)) > 0) {
                  
                  errormsg <- paste("Adduct should be one of:", 
                    sep = "")
                  for (i in alladducts) {
                    errormsg <- paste(errormsg, i, sep = " ; ")
                  }
                  stop(errormsg, "\n\nUsage: feat.batch.annotation.KEGG(dataA,max.mz.diff=10, queryadductlist=c(\"M+H\", \"M+Na\"), xMSannotator.outloc, numnodes=1)", 
                    "\n\n OR  feat.batch.annotation.KEGG(dataA,max.mz.diff=10, queryadductlist=c(\"positive\"), xMSannotator.outloc, numnodes=1)", 
                    "\n\n OR  feat.batch.annotation.KEGG(dataA,max.mz.diff=10, queryadductlist=c(\"negative\"), xMSannotator.outloc, numnodes=1)", 
                    "\n\n OR  feat.batch.annotation.KEGG(dataA,max.mz.diff=10, queryadductlist=c(\"all\"), xMSannotator.outloc, numnodes=1)")
                }
                
            }
        }
    }
    
    parentres = {
    }
    # cnames<-c('','Adduct','Query.m/z', 'Search mass \n
    # tolerance range (+/-)', 'Metlin', 'Compound.Name',
    # 'CASID', 'KEGG.Compound.ID', 'KEGG.Pathway.name',
    # 'KEGG.Pathway.ID', 'HMDB.ID', 'PubChem.Substance.ID',
    # 'PubChem.Compound.ID', 'ChEBI.ID', 'LIPID.MAPS.ID')
    # cnames<-c('','Adduct','Query.m/z', 'Search mass \n
    # tolerance range (+/-)', 'ChemspiderID',
    # 'Link.to.ChemSpider','CommonName',
    # 'Molecular.Formula','SMILES','InChI', 'InChIKey',
    # 'AverageMass','MolecularWeight','MonoisotopicMass',
    # 'NominalMass', 'ALogP', 'XLogP')
    
    cnames <- c("", "Adduct", "Query.m/z", "Search mass \n tolerance range (+/-)", 
        "ChemspiderID", "CommonName", "Molecular.Formula", 
        "SMILES", "InChI", "InChIKey", "AverageMass", "MolecularWeight", 
        "MonoisotopicMass", "NominalMass", "ALogP", "XLogP", 
        "Structure", "KEGG.Compound.ID", "HMDB", "LipidMAPS", 
        "PubChem", "MassBank", "BioCyc", "SMPDB", "EPA.DSSTox", 
        "EPA.Toxcast", "Pesticide.Common.Names", "ChEMBL", 
        "ChEBI", "NIST.Chem.WebBook", "WikiPathways", "DrugBank", 
        "Comparative Toxicogenomics Database", "ACToR: Aggregated Computational Toxicology Resource")
    for (adnum in 1:length(queryadductlist)) {
        Sys.sleep(syssleep)
        
        adductname = queryadductlist[adnum]
        adductmass = adductlist[as.character(adductname)]
        
        cl <- makeSOCKcluster(numnodes)
        
        clusterEvalQ(cl, library(XML))
        clusterEvalQ(cl, library(R2HTML))
        clusterEvalQ(cl, library(RCurl))
        clusterEvalQ(cl, library(SSOAP))
        clusterEvalQ(cl, "processWSDL")
        clusterEvalQ(cl, library(png))
        clusterEvalQ(cl, "chspider.batch.annotation.child")
        
        # print(paste('Query adduct: ',adductname,sep=''))
        
        mz.annot.res <- new("list")
        min_mz <- min(mzlist)
        max_mz <- max(mzlist)
        
        # mz_group<-ceiling(max_mz/min_mz)
        mz_group <- 10
        # mz_group<-round(max_mz/10) length(mzlist)
        num_mz <- 1
        for (mind in seq(1, length(mzlist), mz_group)) {
            
            stopind <- mind + mz_group - 1
            if (stopind > length(mzlist)) {
                
                stopind <- length(mzlist)
            }
            s1 <- mzlist[mind:stopind]
            s1 <- unique(s1)
            num_mz <- num_mz + length(s1)
            if (num_mz%%50 > 0) {
                Sys.sleep(syssleep)
            } else {
                Sys.sleep(syssleep)
            }
            
            if (length(s1) > 2) {
                
                repeat {
                  cur.annot.res <- parLapply(cl, s1, chspider.batch.annotation.child, 
                    max.mz.diff = max.mz.diff, adductname = adductname, 
                    datasources, tokenstr, maxhits, syssleep, 
                    adduct_table = adduct_table)
                  if (is(cur.annot.res[1], "try-error")) {
                    Sys.sleep(10)
                    cur.annot.res <- parLapply(cl, s1, chspider.batch.annotation.child, 
                      max.mz.diff = max.mz.diff, adductname = adductname, 
                      datasources, tokenstr, maxhits, syssleep, 
                      adduct_table = adduct_table)
                    
                  } else {
                    break
                  }
                  
                }
                
                mz.annot.res <- c(mz.annot.res, cur.annot.res)
                
            } else {
                for (i in 1:length(s1)) {
                  # print('mz is') print(s1[i])
                  rescur <- chspider.batch.annotation.child(mz.val = s1[i], 
                    max.mz.diff = max.mz.diff, adductname = adductname, 
                    datasources, tokenstr, maxhits, syssleep, 
                    adduct_table = adduct_table)
                  # print(length(rescur))
                  if (length(rescur) > 0) {
                    rescur <- as.matrix(rescur)
                    # print(dim(rescur))
                    if (dim(rescur)[2] == 1) {
                      rescur <- t(rescur)
                      rescur <- as.data.frame(rescur)
                    }
                    rescur <- as.data.frame(rescur)
                    # print(dim(rescur))
                    mz.annot.res[[length(mz.annot.res) + 
                      1L]] <- rescur
                  }
                }
            }
            if (mind%%10 > 0) {
                Sys.sleep(syssleep/2)
            } else {
                Sys.sleep(1)
                stopCluster(cl)
                cl <- makeSOCKcluster(numnodes)
                
                
                clusterEvalQ(cl, library(XML))
                clusterEvalQ(cl, library(RCurl))
                clusterEvalQ(cl, library(SSOAP))
                clusterEvalQ(cl, library(png))
                clusterEvalQ(cl, "processWSDL")
                clusterEvalQ(cl, "feat.batch.annotation.child")
            }
            
            fname <- paste("cur_res.Rda", sep = "")
            # save(mz.annot.res,file=fname)
        }
        
        stopCluster(cl)
        
        res = {
        }
        
        # print(length(mz.annot.res))
        if (length(mz.annot.res) > 0) {
            # print(adductname)
            for (mzl in 1:length(mz.annot.res)) {
                # print(dim(mz.annot.res[[mzl]]))
                res = rbind(res, mz.annot.res[[mzl]])
                
            }
        }
        # print(mz.annot.res)
        res <- unique(res)
        
        # print('length of res') print(length(res))
        
        fname2 <- paste(adductname, "res.Rda", sep = "")
        # save(res,file=fname2)
        
        text_res <- {
        }
        
        if (length(res) > 0) {
            
            adductname1 = c(rep(adductname, dim(res)[1]))
            
            temp_res <- cbind(adductname1, res)
            
            temp_res <- as.matrix(temp_res)
            
            
            
            if (dim(temp_res)[2] == 1) {
                
                temp_res <- t(temp_res)
                temp_res <- as.data.frame(temp_res)
                
            }
            
            bad_rows <- which(temp_res[, 2] == "1")
            
            if (length(bad_rows) > 0) {
                temp_res <- temp_res[-bad_rows, ]
                temp_res <- as.matrix(temp_res)
                
                # temp_res<-t(temp_res)
                if (dim(temp_res)[2] == 1) {
                  
                  temp_res <- t(temp_res)
                  
                  
                }
                
                
            }
            # temp_res<-as.data.frame(temp_res)
            colnames(temp_res) = NULL
            
            
            # text_resindex<-c(1,2,5,6,7,8,11,10,13,15,17,19,21)
            # text_resindex<-c(1,2,3,4,6:16)
            text_resindex <- c(1, 2, 3, 6:9, 10:17, seq(18, 
                50, 2))
            text_resindex <- text_resindex + 1
            text_res <- temp_res[, c(1, text_resindex)]
            
            text_res <- as.matrix(text_res)
            
            
            
            if (dim(text_res)[2] == 1) {
                
                text_res <- t(text_res)
                
                
            }
            text_res <- as.data.frame(text_res)
            bad_rows <- which(text_res[, 2] == "1")
            
            
            if (length(bad_rows) > 0) {
                text_res <- text_res[-bad_rows, ]
                text_res <- as.matrix(text_res)
                text_res <- t(text_res)
            }
            text_res <- as.data.frame(text_res)
            
            sernum = seq(1, dim(text_res)[1])
            text_res <- cbind(sernum, text_res)
            colnames(text_res) = cnames
            
            
            parentres = rbind(parentres, temp_res)
            rm(temp_res)
            colnames(parentres) = NULL
            
        }
        
        fname = paste(xMSannotator.outloc, "/ChemSpider_annotation_results_", 
            queryadductlist[adnum], ".txt", sep = "")
        write.table(text_res, file = fname, sep = "\t", row.names = FALSE)
        
        
        
        Sys.sleep(2)
        
    }
    
    html_res <- {
    }
    
    
    
    if (length(parentres) > 0) {
        
        
        res <- parentres[order(parentres[, 2]), ]
        
        # html_resindex<-c(1,2,5,4,6:7,9,11:12,14,16,18,20,22)
        res <- as.matrix(res)
        
        if (dim(res)[2] == 1) {
            res <- t(res)
        }
        
        
        
        # html_resindex<-c(1,2,5,6:7,9,11:12,14,16,18,20,22)
        
        
        # text_resindex<-c(1,2,3,6:9,10:17,seq(18,50,2))
        html_resindex <- c(1, 2, 5:16, 17, seq(19, 51, 2))
        html_resindex <- html_resindex + 1
        html_res <- res[, c(1, html_resindex)]
        
        html_res <- as.matrix(html_res)
        if (dim(html_res)[2] == 1) {
            html_res <- t(html_res)
        }
        
        sernum = seq(1, dim(html_res)[1])
        
        # html_res<-cbind(sernum,html_res)
        html_res <- as.data.frame(html_res)
        #'ChemspiderID',
        # cnames<-c('','Adduct','Query.m/z', 'Search mass \n
        # tolerance range (+/-)', 'Metlin.match.ID',
        # 'Metlin.match.mass', 'Metlin.compound.name', 'CASID',
        # 'KEGG.Compound.ID', 'KEGG.Pathway.name',
        # 'KEGG.Pathway.ID', 'HMDB.ID', 'PubChem.Substance.ID',
        # 'PubChem.Compound.ID', 'ChEBI.ID', 'LIPID.MAPS.ID')
        
        cnames <- c("Adduct", "Query.m/z", "Search mass \n tolerance range (+/-)", 
            "Link.to.ChemSpider", "CommonName", "Molecular.Formula", 
            "SMILES", "InChI", "InChIKey", "AverageMass", 
            "MolecularWeight", "MonoisotopicMass", "NominalMass", 
            "ALogP", "XLogP", "Structure", "KEGG.Compound.ID", 
            "HMDB", "LipidMAPS", "PubChem", "MassBank", "BioCyc", 
            "SMPDB", "EPA.DSSTox", "EPA.Toxcast", "Pesticide.Common.Names", 
            "ChEMBL", "ChEBI", "NIST.Chem.WebBook", "WikiPathways", 
            "DrugBank", "Comparative Toxicogenomics Database", 
            "ACToR: Aggregated Computational Toxicology Resource")
        
        
        colnames(html_res) <- cnames
        rownames(html_res) <- sernum
        
        cnames2 <- cnames[c(1:3, 4, 5:6, 12, 16, 7:9, 17:33)]
        html_res <- html_res[, c(1:3, 4, 5:6, 12, 16, 7:9, 
            17:33)]
        colnames(html_res) <- cnames2
        
        fname = paste("ChemSpider_annotation_results", sep = "")
        unlink(fname)
        # fname=paste(xMSannotator.outloc,'/KEGG_annotation_results.html',sep='')
        HTMLInitFile(filename = fname, Title = "ChemSpider annotation results", 
            outdir = xMSannotator.outloc)
        fname = paste(xMSannotator.outloc, "/ChemSpider_annotation_results.html", 
            sep = "")
        HTML(html_res, file = fname, Border = 1, innerBorder = 1, 
            useCSS = TRUE)
        HTMLEndFile(file = fname)
        
        if (length(queryadductlist) > 1) {
            
            text_res <- {
            }
            text_resindex <- c(1, 2, 3, 6:16, 17, seq(18, 
                50, 2))
            text_resindex <- text_resindex + 1
            text_res <- res[, c(1, text_resindex)]
            
            text_res <- as.matrix(text_res)
            if (dim(text_res)[2] == 1) {
                text_res <- t(text_res)
            }
            
            if (length(text_res) > 0) {
                sernum = seq(1, dim(text_res)[1])
            } else {
                sernum = {
                }
            }
            # text_res<-cbind(sernum,text_res)
            #'Link.to.ChemSpider',
            cnames <- c("Adduct", "Query.m/z", "Search mass \n tolerance range (+/-)", 
                "ChemspiderID", "CommonName", "Molecular.Formula", 
                "SMILES", "InChI", "InChIKey", "AverageMass", 
                "MolecularWeight", "MonoisotopicMass", "NominalMass", 
                "ALogP", "XLogP", "Structure", "KEGG.Compound.ID", 
                "HMDB", "LipidMAPS", "PubChem", "MassBank", 
                "BioCyc", "SMPDB", "EPA.DSSTox", "EPA.Toxcast", 
                "Pesticide.Common.Names", "ChEMBL", "ChEBI", 
                "NIST.Chem.WebBook", "WikiPathways", "DrugBank", 
                "Comparative Toxicogenomics Database", "ACToR: Aggregated Computational Toxicology Resource")
            text_res <- as.data.frame(text_res)
            colnames(text_res) = cnames
            # text_res<-text_res[,c(1:3,4,6,13,17,7:9,18:29)]
            # c('','Adduct','Query.m/z', 'Search mass \n tolerance
            # range (+/-)', 'Metlin.match.ID', 'Metlin.match.mass',
            # 'Metlin.compound.name', 'CASID', 'KEGG.Compound.ID',
            # 'KEGG.Pathway.ID', 'KEGG.Pathway.name', 'HMDB.ID',
            # 'PubChem.Substance.ID', 'PubChem.Compound.ID',
            # 'ChEBI.ID', 'LIPID.MAPS.ID')
            
            fname = paste(xMSannotator.outloc, "/ChemSpider_annotation_results_alladducts.txt", 
                sep = "")
            write.table(text_res, file = fname, sep = "\t", 
                row.names = FALSE)
        }
        
        
    }
    return(list(text.res = text_res, html.res = html_res))
}
yufree/xMSannotator documentation built on Oct. 31, 2022, 12:20 a.m.