# R/convertId.R

#'Convert ids
#'@description Convert input entities to internal neo4j ids and grinn ids.
#'@usage convertId(x, nodetype, searchby, exactmatch, returnas)
#'@param x a character vector or data frame of input entities, see details.
#'Each entity can be the value of any node property key, e.g. \code{x} = c('pubchemId1', 'pubchemId2'); see also \code{searchby}.
#'@param nodetype a string specifying the type of a query node. It can be one of compound, protein, gene, pathway, rna, dna, phenotype.
#'@param searchby a string specifying the node property key used for the query. It can be one of neo4jid, grinnid, name, synonyms, xref (default).
#'@param exactmatch a logical value. If TRUE (default), match exactly, case-sensitive and data type-sensitive.
#'@param returnas a string specifying the output type. It can be one of dataframe, list, json. Default is dataframe.
#'@details If \code{x} is a character vector, the results include the input entities, neo4j ids and grinn ids.
#'
#'\code{x} can also be a data frame carrying other information, e.g. statistical values. Its first column must hold the input entities for the mapping.
#'The results then include the input entities, neo4j ids, grinn ids and the remaining input columns.
#'
#'The original input is returned for unmapped entities.
#'
#'The database uses two id systems. The neo4j id is a numeric, internal id generated automatically by the database system.
#'The grinn id (gid) is the id system of the Grinn database, which uses the main ids of standard resources,
#'i.e. ENSEMBL for genes (e.g. ENSG00000139618), UniProt for proteins (e.g. P0C9J6), PubChem CID for compounds (e.g. 5793), KEGG for pathways (e.g. hsa00010).
#'@return The input entities together with the matched neo4j ids and grinn ids. An empty list or data frame is returned on error.
#'@author Kwanjeera W \email{kwanich@@ucdavis.edu}
#'@examples
#'# Convert compound names
#'#kw <- list('1-Methylhistidine','D-Lactic acid')
#'#result <- convertId(x=kw, nodetype="compound", searchby="name")
#'# Query compounds by KEGG ids
#'#kw <- data.frame(kegg=c('C01152','C00256','C00345'),pval=c(0.01,0.05,0.3))
#'#result <- convertId(x=kw, nodetype="compound", searchby="xref")
#'@export
convertId <- function(x, nodetype, searchby="xref", exactmatch=TRUE, returnas="dataframe") {
  # S3 generic: dispatch on the class of `x`.
  UseMethod("convertId")
}
# Default method for convertId(): query each input entity and collect its
# neo4j id and grinn id.
#
# Arguments (same as the generic):
#   x          character vector / list of entities, or a two-dimensional object
#              (e.g. data frame) whose first column holds the entities; any
#              further columns are carried through to the output.
#   nodetype   one of compound, protein, gene, pathway, rna, dna, phenotype
#              (matched case-insensitively).
#   searchby   one of xref, name, synonyms, grinnid, neo4jid
#              (matched case-insensitively).
#   exactmatch accepted for interface compatibility; not used by this method.
#   returnas   one of "dataframe", "list", "json".
#
# Returns the mapping with columns input, neo4jid, grinnid (plus the extra
# input columns, if any) in the requested format. Any error is reported on the
# console and an empty data frame / list is returned instead of raising.
#'@export
convertId.default <- function(x, nodetype, searchby="xref", exactmatch=TRUE, returnas="dataframe"){
  out <- tryCatch(
    {
      tmparg <- try(nodetype <- match.arg(tolower(nodetype), c("compound","protein","gene","pathway","rna","dna","phenotype"), several.ok = FALSE), silent = TRUE)
      if (inherits(tmparg, "try-error")) {
        stop("argument 'nodetype' is not valid, choose one from the list: compound,protein,gene,pathway,rna,dna,phenotype")
      }
      tmparg <- try(searchby <- match.arg(tolower(searchby), c("xref","name","synonyms","grinnid","neo4jid"), several.ok = FALSE), silent = TRUE)
      if (inherits(tmparg, "try-error")) {
        # The message lists exactly the keys accepted by match.arg() above.
        stop("argument 'searchby' is not valid, choose one from the list: grinnid,name,neo4jid,synonyms,xref")
      }
      if (!is.null(dim(x))) { # data frame input
        keys <- stringr::str_trim(unlist(x[, 1])) # remove surrounding whitespace
        keep <- !duplicated(keys)
        txtinput <- keys[keep]
        # Drop the duplicated rows as well, so that txtinput[i] and x[i, ]
        # still refer to the same entity after de-duplication.
        x <- x[keep, , drop = FALSE]
        isDF <- ncol(x) > 1
      } else { # list input
        txtinput <- unique(stringr::str_trim(unlist(x))) # remove whitespace, duplicates
        isDF <- FALSE
      }
      if (length(txtinput) == 0L) {
        stop("no input entities to convert")
      }
      # construct query
      nodetype <- Hmisc::capitalize(nodetype)
      cat("Converting ids ...\n")
      cat("Register parallel computing ...\nWarning: querying a large number of nodes will take long time. \n")
      if (isDF) { # return all input data
        extra_cols <- seq_len(ncol(x))[-1]
        # Collect one piece per entity and bind once, instead of growing the
        # result with rbind() on every iteration.
        pieces <- lapply(seq_along(txtinput), function(i) {
          res <- formatNode.LIST(x = txtinput[i], y = nodetype, z = searchby)[, 1:2] # id and gid
          # drop = FALSE keeps a one-row table even when there is one extra column
          data.frame(input = txtinput[i], res, x[i, extra_cols, drop = FALSE], stringsAsFactors = FALSE)
        })
        nodes <- do.call(rbind, pieces)
        colnames(nodes) <- c("input", "neo4jid", "grinnid", colnames(x)[extra_cols])
        row.names(nodes) <- NULL
      } else { # list input
        pieces <- lapply(txtinput, function(entity) {
          res <- formatNode.LIST(entity, y = nodetype, z = searchby)[, 1:2] # id and gid
          data.frame(input = entity, res, stringsAsFactors = FALSE)
        })
        nodes <- do.call(rbind, pieces) # total no. of entities
        colnames(nodes) <- c("input", "neo4jid", "grinnid")
        row.names(nodes) <- NULL
      }
      cat("Format and returning output of size ",nrow(nodes)," ...\n")
      ## output
      switch(returnas,
             dataframe = nodes,
             list = split(nodes, seq_len(nrow(nodes))),
             json = jsonlite::toJSON(nodes),
             stop("Error: incorrect 'returnas' type"))
    }, error = function(e) {
      # Print only the text of the error. message(e) would signal the error
      # condition again, so a caller's own error handler could intercept it
      # and the empty result below would never be returned.
      message(conditionMessage(e), appendLF = FALSE)
      cat("\nError: RETURN no data ..\n")
      switch(returnas,
             dataframe = data.frame(),
             list = list(),
             json = list())
    })
  return(out)
}
# kwanjeeraw/metabox documentation built on May 20, 2019, 7:07 p.m.