R/fetchHetNetworkByGID.R

#'Query networks containing one or several relationship types from the database
#'@description query networks of a relationship pattern that contains the given \code{from} and/or \code{to} nodes using grinn id (gid), see details.
#'@usage fetchHetNetworkByGID(from, to, pattern, returnas)
#'@param from a character vector of start nodes e.g. from = c('id1', 'id2'). Given \code{from} = NULL, will result in all possible start nodes.
#'Otherwise the value must be a grinn id, see details and see \code{\link{convertId}} for how to convert ids.
#'@param to a character vector of end nodes e.g. to = c('id1', 'id2'). Given \code{to} = NULL, will result in all possible end nodes, see \code{from} for details.
#'@param pattern a string specifying the relationship pattern.
#'
#'Providing a node as \code{(from:Nodetype)} is to specify the start node.
#'Providing a node as \code{(to:Nodetype)} is to specify the end node.
#'Providing a node as \code{(:Nodetype)} means any node of the given node type.
#'
#'A relationship type is indicated by a pair of square brackets between the arrow e.g. \code{-[:RELATIONSHIP_TYPE]->}
#'
#'@param returnas a string specifying output type. It can be one of dataframe, list, json. Default is dataframe.
#'@details The function is used to query networks containing one or more relationship types (heterogeneous network). Use \code{\link{fetchNetworkByGID}} to query networks containing one relationship type.
#'
#'The database uses two id systems. The neo4j id is a numeric, internal id automatically generated by the database system.
#'The grinn id (gid) is an id system of Grinn database that uses main ids of standard resources
#'i.e. ENSEMBL for genes (e.g.ENSG00000139618), UniProt for proteins (e.g.P0C9J6), PubChem CID for compounds (e.g.5793), KEGG for pathways (e.g.hsa00010).
#'@return list of network information with the following components:
#'
#'nodes:
#'
#'\code{id} = node neo4j id
#'
#'\code{gid} = node grinn id
#'
#'\code{nodename} = node name
#'
#'\code{nodelabel} = node type
#'
#'\code{nodexref} = node cross references
#'
#'edges:
#'
#'\code{source, target} = node neo4j id
#'
#'\code{type} = relationship type
#'
#'\code{datasource} = relationship resource
#'
#'\code{properties} = relationship properties
#'
#'Return empty list if error or found nothing.
#'@author Kwanjeera W \email{kwanich@@ucdavis.edu}
#'@seealso \code{\link{convertId}}, \code{\link{fetchNetworkByGID}}
#'
#'For database structure see \url{http://grinnhomepage}
#'@examples
#'# Query the network of (from:Compound)-BIOCHEMICAL_REACTION->(to:Compound)<-ANNOTATION-(:Pathway)
#'#from = list('1060','284','760') #By default, PubChem ids can be used as grinn ids for compounds
#'#to = list('222656','107689','5950','71080') #By default, PubChem ids can be used as grinn ids for compounds
#'#pattern = "(from:Compound)-[:BIOCHEMICAL_REACTION]->(to:Compound)<-[:ANNOTATION]-(:Pathway)"
#'#result = fetchHetNetworkByGID(from=from, to=to, pattern=pattern)
#'@export
fetchHetNetworkByGID <- function(from=NULL, to=NULL, pattern, returnas="dataframe") UseMethod("fetchHetNetworkByGID")
#'@export
fetchHetNetworkByGID.default <- function(from=NULL, to=NULL, pattern, returnas="dataframe"){
  out <- tryCatch(
    {
      if (!is.character(pattern)) stop("argument 'pattern' must be a character vector")
      if (length(from) == 0) from = NULL #reset empty list to NULL
      if (length(to) == 0) to = NULL #reset empty list to NULL
      flagfromdf = FALSE
      if (class(from) == "data.frame" && !is.null(from)) {
        datfrominput = from #keep input data
        flagfromdf = TRUE
        if(!is.null(from$grinn)){
          from = from$grinn
        }else if(!is.null(from$PubChem)){
          from = from$PubChem
          colnames(datfrominput) = gsub("PubChem","grinn",colnames(datfrominput))
        }else if(!is.null(from$pubchem)){
          from = from$pubchem
          colnames(datfrominput) = gsub("pubchem","grinn",colnames(datfrominput))
        }else if(!is.null(from$uniprot)){
          from = from$uniprot
          colnames(datfrominput) = gsub("uniprot","grinn",colnames(datfrominput))
        }else if(!is.null(from$ensembl)){
          from = from$ensembl
          colnames(datfrominput) = gsub("ensembl","grinn",colnames(datfrominput))
        }
      }
      flagtodf = FALSE
      if (class(to) == "data.frame" && !is.null(to)) {
          dattoinput = to #keep input data
          flagtodf = TRUE
          if(!is.null(to$grinn)){
            to = to$grinn
          }else if(!is.null(to$PubChem)){
            to = to$PubChem
            colnames(dattoinput) = gsub("PubChem","grinn",colnames(dattoinput))
          }else if(!is.null(to$pubchem)){
            to = to$pubchem
            colnames(dattoinput) = gsub("pubchem","grinn",colnames(dattoinput))
          }else if(!is.null(to$uniprot)){
            to = to$uniprot
            colnames(dattoinput) = gsub("uniprot","grinn",colnames(dattoinput))
          }else if(!is.null(to$ensembl)){
            to = to$ensembl
            colnames(dattoinput) = gsub("ensembl","grinn",colnames(dattoinput))
          }
      }
      #construct query
      maxkw = 500 #maximum keywords
      doPar = FALSE
      if (!is.null(from) && !is.null(to)) {
        querystring = pathsList_GID["fromto"]
        doPar = TRUE #do loop
        from = unique(stringr::str_trim(unlist(from))) #remove whiteline, duplicate
        to = unique(stringr::str_trim(unlist(to))) #remove whiteline, duplicate
      }else if (!is.null(from) && is.null(to)) {
        querystring = pathsList_GID["from"]
        txtinput = unique(stringr::str_trim(unlist(from))) #remove whiteline, duplicate
        len = length(txtinput)
      }else if (is.null(from) && !is.null(to)) {
        querystring = pathsList_GID["to"]
        txtinput = unique(stringr::str_trim(unlist(to))) #remove whiteline, duplicate
        len = length(txtinput)
      }else{
        stop("Error: No query provided")
      }
      querystring = gsub("relpattern", pattern, querystring)
      cat("Querying network ...\n")
      if(!doPar){
        if(len <= maxkw){
          qstring = gsub("keyword", paste0("['",paste0(txtinput, collapse = "','"),"']"), querystring)
cat(qstring,"\n")
          paths = curlRequest.TRANSACTION(cypher=qstring)
        }else{
          cat("Split queries for more than 500 nodes ...\nWarning: querying a large number of nodes will take long time. \n")
#           subinp = split(txtinput, ceiling(seq_along(txtinput)/maxkw)) #split keywords
#           paths = foreach(i=1:length(subinp), .combine=c) %dopar% {
#             qstring = gsub("keyword", paste0("['",paste0(unlist(subinp[i]), collapse = "','"),"']"), querystring)
# cat(qstring,"\n")
#             curlRequest.TRANSACTION(cypher=qstring)
#           }
          paths = lapply(txtinput, function (x) curlRequest.TRANSACTION(cypher=gsub("keyword", paste0("['",paste0(x, collapse = "','"),"']"), querystring)))
          paths = unlist(paths, recursive = FALSE)
        }
      }else{
        cat("Querying from combination of nodes ...\nWarning: querying a large number of nodes will take long time. \n")
#         path = foreach(i=1:length(from), .combine=rbind) %dopar% {
#           foreach(j=1:length(to)) %dopar% {
#             qstring = gsub("keyfrom", from[i], querystring)
#             qstring = gsub("keyto", to[j], qstring)
# cat(qstring,"\n")
#             curlRequest.TRANSACTION(cypher=qstring)
#           }
#         }
#         paths = unlist(path, recursive = FALSE)
        ft = expand.grid(from,to)
        path = apply(ft, 1, function (x) curlRequest.TRANSACTION(cypher=gsub("keyfrom", x[1], gsub("keyto", x[2], querystring))))
        paths = unlist(path, recursive = FALSE)
      }
      network = formatNetworkOutput(paths)
      networknode = network$nodes
      if(nrow(network$edges)>0){
        if(flagfromdf && flagtodf){#keep from-to input data
          fromtoinput = plyr::rbind.fill(datfrominput,dattoinput)
          fromtoinput = unique(fromtoinput)
          networknode = merge(networknode,fromtoinput,by.x='gid',by.y='grinn',all.x=TRUE)
          networknode = networknode[,c(2,1,3:ncol(networknode))]
        }
        if(flagfromdf && !flagtodf){#keep from input data
          networknode = merge(networknode,datfrominput,by.x='gid',by.y='grinn',all.x=TRUE)
          networknode = networknode[,c(2,1,3:ncol(networknode))]
        }
        if(!flagfromdf && flagtodf){#keep to input data
          networknode = merge(networknode,dattoinput,by.x='gid',by.y='grinn',all.x=TRUE)
          networknode = networknode[,c(2,1,3:ncol(networknode))]
        }
        networknode[is.na(networknode)] = ""
      }
      out = switch(returnas,
                   dataframe = list(nodes=networknode, edges=network$edges),
                   list = list(nodes = split(networknode, seq(nrow(networknode))), edges = split(network$edges, seq(nrow(network$edges)))),
                   json = list(nodes=jsonlite::toJSON(networknode), edges=jsonlite::toJSON(network$edges)),
                   stop("incorrect return type"))
    },error=function(e) {
      message(e)
      cat("\nError: RETURN no network ..\n")
      out = switch(returnas,
             dataframe = list(nodes = data.frame(), edges = data.frame()),
             list = list(nodes = list(), edges = list()),
             json = list(nodes = "", edges = ""))
    })
  return(out)
}
kwanjeeraw/metabox documentation built on May 20, 2019, 7:07 p.m.