#'Compute Tanimoto similarity network
#'@description compute a network for compounds based on chemical structure similarity.
#'The function computes Tanimoto distances between the given compounds using PubChem fingerprints.
#'@usage computeSimilarity(txtinput, coef, returnas)
#'@param txtinput a character vector of PubChem CIDs.
#'@param coef a numeric value specifying the minimum Tanimoto similarity correlation coefficient to be included in the output (from 0 to 1, default is 0.7).
#'@param returnas a string specifying output type. It can be one of dataframe, list, json. Default is dataframe.
#'@return list of network information with the following components:
#'
#'nodes:
#'
#'\code{id} = pubchem CID or node neo4j id
#'
#'\code{gid} = pubchem CID
#'
#'\code{nodename} = pubchem CID or node name
#'
#'\code{nodelabel} = node type
#'
#'edges:
#'
#'\code{source, target} = pubchem CID or node neo4j id
#'
#'\code{coef} = Tanimoto similarity coefficient
#'
#'\code{type} = relationship type
#'
#'Return empty list if error or found nothing.
#'@note If the database is installed, node attributes will be automatically retrieved from the database. Otherwise node attributes will be the original input.
#'@author Kwanjeera W \email{kwanich@@ucdavis.edu}
#'@references Willett P., Barnard JM. and Downs GM. (1998) Chemical similarity searching. J. Chem. Inf. Comput. Sci., 38, 983–996.
#'@references Barupal KD., et al. (2012) MetaMapp: mapping and visualizing metabolomic data by integrating information from biochemical pathways and chemical and mass spectral similarity. BMC Bioinformatics. 13:99.
#'@references Grapov D., Wanichthanarak K. and Fiehn O. (2015) MetaMapR: pathway independent metabolomic network analysis incorporating unknowns. Bioinformatics. 31(16):2757-60.
#'@references Cao Y., Charisi A., Cheng L., Jiang T. and Girke T. (2008) ChemmineR: a compound mining framework for R. Bioinformatics, 24(15), pp. 1733–1734.
#'@references \url{ftp://ftp.ncbi.nih.gov/pubchem/specifications/pubchem_fingerprints.txt}
#'@seealso \pkg{\link{metabomapr}}, \code{\link{CID_tanimoto}}, \code{\link{fpSim}}, \code{\link{sdfid}}
#'@examples
#'#simnw <- computeSimilarity(c(1110,10413,196,51,311,43,764,790)) #compute similarity network for given pubchem compounds
#'@export
computeSimilarity <- function(txtinput, coef=0.7, returnas="dataframe") UseMethod("computeSimilarity")
#'@export
computeSimilarity.default <- function (txtinput, coef=0.7, returnas="dataframe")
{
out <- tryCatch(
{
flagdf = FALSE
if (class(txtinput) == "data.frame") {#get result from statistical analysis
if(!is.null(txtinput$PubChem)){
datinput = txtinput #keep input data
flagdf = TRUE
txtinput = txtinput$PubChem
}else if(!is.null(txtinput$pubchem)){
datinput = txtinput #keep input data
flagdf = TRUE
txtinput = txtinput$pubchem
}
}
txtinput = unique(stringr::str_trim(unlist(gsub("[^[:digit:]]","",txtinput)))) #remove whiteline, duplicate, words
txtinput = txtinput[txtinput!=""]
cat("Computing Tanimoto similarity ...\n")
# tanmt = metabomapr::CID_tanimoto(txtinput)
#format output
# tanmt = CID_tanimoto(txtinput)
# nRow = nrow(tanmt)
# nNames = dimnames(tanmt)[[1]]
# rowMat = matrix(c(1:nRow), nRow, nRow, byrow = TRUE)
# colMat = matrix(c(1:nRow), nRow, nRow)
# dstRows = as.dist(rowMat)
# dstCols = as.dist(colMat)
# network = data.frame(source = as.character(nNames[dstRows]), target = as.character(nNames[dstCols]), coef = tanmt[lower.tri(tanmt)], stringsAsFactors = FALSE)
# network = network[network$coef > coef, ]
network = getChemSimNet(txtinput, cutoff = coef)
cat("Format and returning network of size ",nrow(network)," ...\n")
if(nrow(network)>0){#pass cutoff
network = data.frame(source = as.character(network[,1]), target = as.character(network[,2]), coef = as.numeric(network[,3]), stringsAsFactors = FALSE)
network$type = "TANIMOTO_SIMILARITY"
cat("Format and returning network nodes ...\n")
#format nodeList from edgeList
so = data.frame(id=network$source, gid=network$source, nodename=network$source, nodelabel="Compound", stringsAsFactors = FALSE)
ta = data.frame(id=network$target, gid=network$target, nodename=network$target, nodelabel="Compound", stringsAsFactors = FALSE)
sota = unique(rbind(so,ta))
if(foundDb()){#have db
nodels = lapply(sota$id, formatNode.LIST, y="compound", z="grinnid") #query nodes by gid
networknode = plyr::ldply(nodels, data.frame)
#format edge, change gid to id, fix edge row order
s = dplyr::right_join(networknode[,1:2],network[,1:2],by=c('gid' = 'source'))
t = dplyr::right_join(networknode[,1:2],network[,1:2],by=c('gid' = 'target'))
network$source = s$id
network$target = t$id
}else{#no db
cat("No database installed, returning original input ...\n")
networknode = sota
}
if(flagdf){#keep input data
networknode = merge(networknode,datinput,by.x='gid',by.y='PubChem',all.x=TRUE)
networknode = networknode[,c(2,1,3:ncol(networknode))]
networknode[is.na(networknode)] = ""
}
## output
switch(returnas,
dataframe = list(nodes = networknode, edges = network),
list = list(nodes = split(networknode, seq(nrow(networknode))), edges = split(network, seq(nrow(network)))),
json = list(nodes = jsonlite::toJSON(networknode), edges = jsonlite::toJSON(network)),
stop("Error: incorrect 'returnas' type"))
}else{#not pass cutoff
switch(returnas,
dataframe = list(nodes = data.frame(), edges = data.frame()),
list = list(nodes = list(), edges = list()),
json = list(nodes = "", edges = ""))
}
},error=function(e) {
message(e)
cat("\nError: RETURN no network ..\n")
switch(returnas,
dataframe = list(nodes = data.frame(), edges = data.frame()),
list = list(nodes = list(), edges = list()),
json = list(nodes = "", edges = ""))
})
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.