R/computeNodeOverrep.R

#'Perform overrepresentation analysis on the input entities
#'@description perform overrepresentation analysis using hypergeometric tests to retrieve overrepresented annotation terms of the input entities.
#'@usage computeNodeOverrep(txtinput, nodetype, annotation, internalid, size)
#'@param txtinput a character vector of entities e.g. c('pubchemId1', 'pubchemId2').
#'The value can be neo4j ids or grinn ids, see details and see \code{\link{convertId}} for how to convert ids.
#'For Mesh annotation, PubChem CIDs are required.
#'@param nodetype a string specifying a node type. It can be one of compound (default), protein, gene, pathway, rna, dna.
#'@param annotation a string specifying the annotation type e.g. pathway (default) and mesh. Pathway annotation requires the database.
#'Mesh annotation doesn't require the database but it is available for PubChem compounds only.
#'@param internalid a logical value indicating whether \code{txtinput} is the neo4j id, if TRUE (default). If not, \code{txtinput} is expected to be the grinn id or PubChem CID.
#'See details and see \code{\link{convertId}} for how to convert ids. It has no effect on Mesh annotation.
#'@param size a numeric vector specifying the minimum number of members in each annotation term to be used in the analysis. Default is 3.
#'@details The database uses two id systems. The neo4j id is a numeric, internal id automatically generated by the database system.
#'The grinn id (gid) is an id system of Grinn database that uses main ids of standard resources
#'i.e. ENSEMBL for genes (e.g.ENSG00000139618), UniProt for proteins (e.g.P0C9J6), PubChem CID for compounds (e.g.5793), KEGG for pathways (e.g.hsa00010).
#'@return list of data frame of nodes, edges and overrepresentation. The nodes data frame contains input attributes. The edges data frame contains annotation pairs.
#'The data frame of overrepresentation contains the following components:
#'
#'\code{rank} = rank sort by p adj
#'
#'\code{id} = annotation id or annotation neo4j id
#'
#'\code{gid} = annotation id or annotation grinn id
#'
#'\code{nodename} = annotation name
#'
#'\code{nodelabel} = annotation type
#'
#'\code{nodexref} = cross references
#'
#'\code{p} = raw p-values
#'
#'\code{p_adj} = adjusted p-values
#'
#'\code{no_of_entities} = number of input entities in each annotation term
#'
#'\code{annotation_size} = total number of entities in each annotation term from the database
#'
#'\code{background_size} = total number of annotated entities in the database
#'
#'\code{member} = list of entity members of the annotation term
#'
#'Return list of empty data frame if error or found nothing.
#'@author Kwanjeera W \email{kwanich@@ucdavis.edu}
#'@references Johnson NL., Kotz S., and Kemp AW. (1992) Univariate Discrete Distributions, Second Edition. New York: Wiley.
#'@seealso \code{\link{phyper}}, \code{\link{p.adjust}}
#'@examples
#'#txtinput <- c(1110,10413,196,51,311,43,764,790) #compute overrepresented terms for given pubchem compounds
#'#result <- computeNodeOverrep(txtinput=txtinput, nodetype="compound", annotation="mesh", internalid=FALSE)
#'@export
computeNodeOverrep <- function(txtinput, nodetype="compound", annotation="pathway", internalid = TRUE, size=3) UseMethod("computeNodeOverrep")
#'@export
computeNodeOverrep.default <- function (txtinput, nodetype="compound", annotation="pathway", internalid = TRUE, size=3){
  out <- tryCatch(
    {
      tmparg <- try(nodetype <- match.arg(tolower(nodetype), c("compound","protein","gene","rna","dna"), several.ok = FALSE), silent = TRUE)
      if (class(tmparg) == "try-error") {
        stop("argument 'nodetype' is not valid, choose one from the list: compound,protein,gene,rna,dna")
      }
      tmparg <- try(annotation <- match.arg(tolower(annotation), c("pathway","mesh"), several.ok = FALSE), silent = TRUE)
      if (class(tmparg) == "try-error") {
        stop("argument 'annotation' is not valid, choose one from the list: pathway,mesh")
      }
      require('dplyr')#load dplyr for opencpu
      nodetype = Hmisc::capitalize(nodetype)
      flagdf = FALSE
      if (class(txtinput) == "data.frame" && !is.null(txtinput)) {#get result from statistical analysis
        datinput = txtinput #keep input data
        flagdf = TRUE
        if(!is.null(txtinput$grinn)){
          txtinput = txtinput$grinn
        }else if(!is.null(txtinput$PubChem)){
          txtinput = txtinput$PubChem
          colnames(datinput) = gsub("PubChem","grinn",colnames(datinput))
        }else if(!is.null(txtinput$pubchem)){
          txtinput = txtinput$pubchem
          colnames(datinput) = gsub("pubchem","grinn",colnames(datinput))
        }else if(!is.null(txtinput$uniprot)){
          txtinput = txtinput$uniprot
          colnames(datinput) = gsub("uniprot","grinn",colnames(datinput))
        }else if(!is.null(txtinput$ensembl)){
          txtinput = txtinput$ensembl
          colnames(datinput) = gsub("ensembl","grinn",colnames(datinput))
        }
      }
      txtinput = unique(stringr::str_trim(unlist(txtinput))) #remove whiteline, duplicate
      txtinput = txtinput[!is.na(txtinput)]
      if(tolower(annotation) == 'pathway' && foundDb()){#pathway overrepresentation
        cat("Querying database ...\n")
        if(internalid){
          annols = lapply(txtinput, function(x) fetchNetwork(to=x, fromtype="pathway", totype = nodetype, reltype = "ANNOTATION")) #query annotation pairs
        }else{
          annols = lapply(txtinput, function(x) fetchNetworkByGID(to=x, fromtype="pathway", totype = nodetype, reltype = "ANNOTATION")) #query annotation pairs
        }
        if(!is.null(unlist(annols))){#found annotation
          annonws = combineNetworks(annols) #combine annotation pairs
          annomem = plyr::ddply(annonws$edges,c('source'),plyr::summarise,member=list(target)) #get members
          subanno = annonws$edges %>% dplyr::group_by(source) %>% dplyr::tally() #count entities
          colnames(subanno) = c('id','count')
          subanno = subanno[subanno$count >= size,] #filter by annotation size
          cat("Performing overrepresentation analysis ...\n")
          if(nrow(subanno) > 0){
            overDF = callHypergeo.pathway(edgelist=subanno, nodelist=annonws$nodes, member=annomem, nodetype=nodetype, inputsize=length(txtinput))
          }else{#less than minSize
            cat("No. of members is too small, returning no data ...\n")
            overDF = data.frame()
          }
          networknode = annonws$nodes[annonws$nodes$nodelabel != "Pathway", ] #not return pathway nodes
          if(flagdf){#keep input data
            networknode = merge(networknode,datinput,by.x='gid',by.y='grinn',all.x=TRUE)
            networknode = networknode[,c(2,1,3:ncol(networknode))]
            networknode[is.na(networknode)] = ""
          }
          meminfo = merge(annonws$edges, networknode, by.x='target', by.y='id')
          memname = plyr::ddply(meminfo,c('source'),plyr::summarise,membername=list(nodename))
          overDF = dplyr::left_join(overDF, memname, by=c('id'='source'))
          list(nodes=networknode, edges=annonws$edges, overrepresentation=overDF) #output
        }else{#no annotation found
          list(nodes=data.frame(), edges=data.frame(), overrepresentation=data.frame()) #output
        }
      }else if(tolower(annotation) == 'mesh'){#mesh overrepresentation
        if(internalid || tolower(nodetype) != "compound"){
          cat("Error: Accept only PubChem compounds, returning no data ...\n")
          annols = NULL
        }else{
          cat("Connecting PubChem ...\n")
          annols = lapply(txtinput, function(x) callMesh(pcid=x)) #query pubchem annotation pairs
        }
        if(!is.null(unlist(annols))){#found annotation
          annonws = combineNetworks(annols) #combine annotation pairs
          if(foundDb()){#have db
            nodels = lapply(txtinput, formatNode.LIST, y="compound", z="grinnid") #query nodes by gid
            networknode = plyr::ldply(nodels, data.frame)
            networknode$id = networknode$gid
          }else{#no db
            cat("No database installed, returning original input ...\n")
            networknode = data.frame(id=txtinput, gid=txtinput, nodename=txtinput, nodelabel="Compound", nodexref='', stringsAsFactors = FALSE)
          }
          overDF = callHypergeo.mesh(edgelist=annonws$edges, nodelist=annonws$nodes, size=size, inputsize=length(txtinput))
          if(flagdf){#keep input data
            networknode = merge(networknode,datinput,by.x='gid',by.y='grinn',all.x=TRUE)
            networknode = networknode[,c(2,1,3:ncol(networknode))]
            networknode[is.na(networknode)] = ""
          }
          meminfo = merge(annonws$edges, networknode, by.x='target', by.y='id')
          memname = plyr::ddply(meminfo,c('source'),plyr::summarise,membername=list(nodename))
          overDF = dplyr::left_join(overDF, memname, by=c('id'='source'))
          list(nodes=networknode, edges=annonws$edges, overrepresentation=overDF) #output
        }else{#no annotation found
          list(nodes=data.frame(), edges=data.frame(), overrepresentation=data.frame()) #output
        }
      }else{
        cat('Error: No database installed, returning no data ..\n')
        list(nodes=data.frame(), edges=data.frame(), overrepresentation=data.frame()) #output
      }
    },error=function(e) {
      message(e)
      cat("\nError: RETURN no data ..\n")
      list(nodes=data.frame(), edges=data.frame(), overrepresentation=data.frame()) #output
    })
  return(out)
}
kwanjeeraw/metabox documentation built on May 20, 2019, 7:07 p.m.