# R/fetchNode.R

#'Query nodes from the database
#'@description Look up nodes in the database and return their information (node attributes),
#'i.e. neo4jid, grinnid, nodename, nodelabel, datasource, description, organism, synonyms and xref, where applicable.
#'@usage fetchNode(txtinput, nodetype, searchby, exactmatch, returnas)
#'@param txtinput a character vector (or list) of entities to search for.
#'Each entry is a value of the node property key selected by \code{searchby},
#'e.g. \code{txtinput} = c('pubchemId1', 'pubchemId2').
#'@param nodetype a string giving the node type; one of compound, protein, gene, pathway, rna, dna, phenotype.
#'@param searchby a string giving the node property key to query on;
#'one of neo4jid (default), grinnid, name, synonyms, description, properties, xref, datasource.
#'@param exactmatch a logical value. If TRUE (default), values must match exactly (case-sensitive and data type-sensitive).
#'@param returnas a string giving the output type; one of dataframe (default), list, json.
#'@details
#'Two id systems are used by the database. The neo4j id is a numeric, internal id generated automatically by the database system.
#'The grinn id (gid) is the id system of the Grinn database and reuses the main ids of standard resources,
#'i.e. ENSEMBL for genes (e.g. ENSG00000139618), UniProt for proteins (e.g. P0C9J6),
#'PubChem CID for compounds (e.g. 5793), KEGG for pathways (e.g. hsa00010).
#'@return node information. An empty list or data frame is returned on error or when nothing is found.
#'@note maximum no. of nodes returned = 30000 nodes
#'@author Kwanjeera W \email{kwanich@@ucdavis.edu}
#'@examples
#'# Query compounds by name
#'#kw <- list('1-Methylhistidine','D-Lactic acid')
#'#result <- fetchNode(txtinput=kw, nodetype="compound", searchby="name")
#'# Query compounds by KEGG ids
#'#kw <- c('C01152','C00256','C00345')
#'#result <- fetchNode(txtinput=kw, nodetype="compound", searchby="xref")
#'# Query compounds by synonyms
#'#kw <- '1-Methyl histidine'
#'#result <- fetchNode(txtinput=kw, nodetype="compound", searchby="synonyms", exactmatch=FALSE)
#'@export
fetchNode <- function(txtinput, nodetype, searchby="neo4jid", exactmatch=TRUE, returnas="dataframe") {
  # S3 generic: the work is done by the method selected for class(txtinput).
  UseMethod("fetchNode")
}
#'@export
fetchNode.default <- function(txtinput, nodetype, searchby="neo4jid", exactmatch=TRUE, returnas="dataframe"){
  # Default method of fetchNode(): validates the arguments, builds a Cypher query
  # from a package-level query template, sends it with curlRequest.TRANSACTION()
  # and formats the result with formatNodeOutput().
  #
  # txtinput   character vector or list of keywords (trimmed and de-duplicated here).
  # nodetype   node label; partial, case-insensitive match against the valid types.
  # searchby   node property key used for the match.
  # exactmatch TRUE for exact matching, FALSE for the "regex" query templates.
  # returnas   "dataframe", "list" or "json"; passed on to formatNodeOutput().
  #
  # Any error is reported on stderr/stdout and turned into an empty
  # data frame ("dataframe") or empty list ("list", "json").
  out <- tryCatch(
  {
    nodetypes <- c("compound","protein","gene","pathway","rna","dna","phenotype")
    searchkeys <- c("neo4jid","grinnid","name","synonyms","description","properties","xref","datasource")

    tmparg <- try(match.arg(tolower(nodetype), nodetypes, several.ok = FALSE), silent = TRUE)
    if (inherits(tmparg, "try-error")) {
      stop("argument 'nodetype' is not valid, choose one from the list: ", paste(nodetypes, collapse = ","))
    }
    nodetype <- tmparg

    tmparg <- try(match.arg(tolower(searchby), searchkeys, several.ok = FALSE), silent = TRUE)
    if (inherits(tmparg, "try-error")) {
      # The list is generated from searchkeys so it cannot drift from the accepted values.
      stop("argument 'searchby' is not valid, choose one from the list: ", paste(searchkeys, collapse = ","))
    }
    searchby <- tmparg

    # Resolve exactmatch to a single TRUE/FALSE once, instead of comparing with == in every branch.
    exactmatch <- as.logical(exactmatch)
    if (length(exactmatch) != 1L || is.na(exactmatch)) {
      stop("argument 'exactmatch' must be TRUE or FALSE")
    }

    #construct query
    txtinput <- unique(stringr::str_trim(unlist(txtinput))) #remove surrounding whitespace and duplicates
    maxkw <- 500 #maximum keywords per query
    nodetype <- Hmisc::capitalize(nodetype)
    # propertyList and nodeList are package-level objects defined outside this file.
    isString <- searchby %in% propertyList$stringVal
    useUnwind <- FALSE #TRUE: all keywords go into one list literal; FALSE: one query per keyword
    if (exactmatch && isString) {
      querystring <- nodeList["exactMatch"]
      useUnwind <- TRUE
    } else if (isString) {
      querystring <- nodeList["regexMatch"]
    } else if (exactmatch) {
      querystring <- nodeList["exactCollection"]
    } else {
      querystring <- nodeList["regexCollection"]
    }

    if (searchby == "neo4jid") {
      querystring <- gsub("node.property = x", "ID(node) = toInt(x)", querystring)
      txtinput <- txtinput[!is.na(suppressWarnings(as.numeric(txtinput)))] #remove string, ID accepts integer only
    } else if (searchby == "grinnid") {
      querystring <- gsub("property", "GID", querystring)
    } else if (searchby == "datasource") {
      querystring <- gsub("property", "dataSource", querystring)
    } else {
      querystring <- gsub("property", searchby, querystring)
    }
    querystring <- gsub("label", nodetype, querystring)
    querystring <- paste(querystring, "RETURN DISTINCT node")

    # Count keywords only after the neo4jid filter above, so the chunking decision uses the real number.
    len <- length(txtinput)

    # fixed = TRUE: the keyword is inserted literally; without it gsub() would treat
    # backslashes in user input as regex-replacement escapes and mangle them.
    fillKeyword <- function(value) gsub("keyword", value, querystring, fixed = TRUE)
    # Build a Cypher list literal ['a','b',...]; backslashes and single quotes are
    # escaped so names such as 5'-AMP neither break nor alter the query.
    quoteList <- function(values) {
      escaped <- gsub("\\", "\\\\", values, fixed = TRUE)
      escaped <- gsub("'", "\\'", escaped, fixed = TRUE)
      paste0("['", paste0(escaped, collapse = "','"), "']")
    }

    cat("Querying group of nodes ...\n")
    if (useUnwind) {
      if (len <= maxkw) {
        qstring <- fillKeyword(quoteList(txtinput))
        cat(qstring, "\n")
        nodes <- curlRequest.TRANSACTION(cypher = qstring)
      } else {
        cat("Split queries for more than 500 nodes ...\nWarning: querying a large number of nodes will take long time. \n")
        # Send at most maxkw keywords per request rather than one request per keyword.
        chunks <- unname(split(txtinput, ceiling(seq_along(txtinput) / maxkw)))
        nodes <- lapply(chunks, function(chunk) curlRequest.TRANSACTION(cypher = fillKeyword(quoteList(chunk))))
        nodes <- unlist(nodes, recursive = FALSE)
      }
    } else {
      cat("Querying each node ...\nWarning: querying a large number of nodes will take long time. \n")
      # NOTE(review): how these templates quote the keyword is not visible here,
      # so the keyword is inserted as-is (no Cypher escaping) - confirm against nodeList.
      nodes <- lapply(txtinput, function(x) curlRequest.TRANSACTION(cypher = fillKeyword(x)))
      nodes <- unlist(nodes, recursive = FALSE)
    }
    formatNodeOutput(nodes, returnas)
  }, error = function(e) {
    # Print only the text: message(e) would re-signal the error condition itself,
    # which an enclosing tryCatch(error = ...) in the caller would catch, bypassing
    # the empty-result fallback below.
    message(conditionMessage(e), appendLF = FALSE)
    cat("\nError: RETURN no node ..\n")
    switch(returnas,
           dataframe = data.frame(),
           list = list(),
           json = list())
  })
  return(out)
}
# Source: kwanjeeraw/metabox (documentation built on May 20, 2019, 7:07 p.m.)