R/computeNodeEnrichment.R

#'Perform enrichment analysis on the input entities
#'@description perform enrichment analysis from p-values of the input entities. The function wraps around the main functions of \pkg{\link{piano}}.
#'@usage computeNodeEnrichment(nodedata, pcol, nodetype, annotation, internalid, method, size)
#'@param nodedata a two-column data frame of entities and the statistical values. The 1st column is entities and the second is statistical values e.g. p-values.
#'@param pcol a string specifying columnname containing p-values. This parameter is only for accepting the input from GUI.
#'@param nodetype a string specifying a node type. It can be one of compound (default), protein, gene, pathway, rna, dna.
#'@param annotation a string specifying the annotation type e.g. pathway (default) and mesh. Pathway annotation requires the database.
#'Mesh annotation doesn't require the database but it is available for PubChem compounds only.
#'@param internalid a logical value indicating whether the input entities are the neo4j ids, if TRUE (default). If not, the entities are expected to be the grinn id or PubChem CID.
#'See details and see \code{\link{convertId}} for how to convert ids. It has no effect on Mesh annotation.
#'@param method a string specifying the enrichment analysis method. It can be one of reporter (default), fisher, median, mean, stouffer. See \code{\link{runGSA}}
#'@param size a numeric vector specifying the minimum and maximum number of members in each annotation term to be used in the analysis. Default is c(3,500).
#'@param returnas a string specifying output type. It can be one of dataframe, list, json. Default is dataframe.
#'@details The database uses two id systems. The neo4j id is a numeric, internal id automatically generated by the database system.
#'The grinn id (gid) is an id system of Grinn database that uses main ids of standard resources
#'i.e. ENSEMBL for genes (e.g.ENSG00000139618), UniProt for proteins (e.g.P0C9J6), PubChem CID for compounds (e.g.5793), KEGG for pathways (e.g.hsa00010).
#'@return list of data frame of nodes, edges and enrichment. The nodes data frame contains input attributes. The edges data frame contains annotation pairs.
#'The data frame of enrichment contains the following components:
#'@param method a string specifying the enrichment analysis method. It can be one of reporter (default), fisher, median, mean, stouffer. See \code{\link{runGSA}}
#'@param size a numeric vector specifying the minimum and maximum number of members in each annotation term to be used in the analysis. Default is c(3,500).
#'@details The database uses two id systems. The neo4j id is a numeric, internal id automatically generated by the database system.
#'The grinn id (gid) is an id system of Grinn database that uses main ids of standard resources
#'i.e. ENSEMBL for genes (e.g.ENSG00000139618), UniProt for proteins (e.g.P0C9J6), PubChem CID for compounds (e.g.5793), KEGG for pathways (e.g.hsa00010).
#'@return list of data frame of nodes, edges, enrichment and pairs. The pairs data frame contains annotation pairs. The data frame of enrichment contains the following components:
#'
#'\code{rank} = rank sort by p
#'
#'\code{id} = annotation id or annotation neo4j id
#'
#'\code{gid} = annotation id or annotation grinn id
#'
#'\code{nodename} = annotation name
#'
#'\code{nodelabel} = annotation type
#'
#'\code{nodexref} = cross references
#'
#'\code{p} = raw p-values
#'
#'\code{p_adj} = adjusted p-values
#'
#'\code{no_of_entities} = number of input entities in each annotation term
#'
#'\code{annotation_size} = total number of entities in each annotation term from the database
#'
#'\code{member} = list of entity members of the annotation term
#'
#'Return list of empty data frame if error or found nothing.
#'@author Kwanjeera W \email{kwanich@@ucdavis.edu}
#'@references Fisher R. (1932) Statistical methods for research workers. Oliver and Boyd, Edinburgh.
#'@references Stouffer S., Suchman E., Devinney L., Star S., and Williams R. (1949) The American soldier: adjustment during army life. Princeton University Press, Oxford, England.
#'@references Patil K. and Nielsen J. (2005) Uncovering transcriptional regulation of metabolism by using metabolic network topology. Proceedings of the National Academy of Sciences of the United States of America 102(8), 2685.
#'@references Oliveira A., Patil K., and Nielsen J. (2008) Architecture of transcriptional regulatory circuits is knitted over the topology of bio-molecular interaction networks. BMC Systems Biology 2(1), 17.
#'@references Väremo L., Nielsen J., and Nookaew I. (2013) Enriching the gene set analysis of genome-wide data by incorporating directionality of gene expression and combining statistical hypotheses and methods. Nucleic Acids Research, 41(8), pp. 4378-4391.
#'@seealso \code{\link{loadGSC}}, \code{\link{runGSA}}, \code{\link{GSAsummaryTable}}
#'@examples
#'#dt <- data.frame(pubchem=c(1110,10413,196,51,311,43,764,790), stat=runif(8, 0, 0.06)) #statistical values of pubchem compounds
#'#result <- computeNodeEnrichment(nodedata=dt, nodetype="compound", annotation="mesh", internalid=FALSE)
#'@export
computeNodeEnrichment <- function(nodedata, pcol=NULL, nodetype="compound", annotation="pathway", internalid = TRUE, method="reporter", size=c(3,500)) UseMethod("computeNodeEnrichment")
#'@export
computeNodeEnrichment.default <- function (nodedata, pcol=NULL, nodetype="compound", annotation="pathway", internalid = TRUE, method="reporter", size=c(3,500)){
  out <- tryCatch(
    {
      tmparg <- try(nodetype <- match.arg(tolower(nodetype), c("compound","protein","gene","rna","dna"), several.ok = FALSE), silent = TRUE)
      if (class(tmparg) == "try-error") {
        stop("argument 'nodetype' is not valid, choose one from the list: compound,protein,gene,rna,dna")
      }
      tmparg <- try(annotation <- match.arg(tolower(annotation), c("pathway","mesh"), several.ok = FALSE), silent = TRUE)
      if (class(tmparg) == "try-error") {
        stop("argument 'annotation' is not valid, choose one from the list: pathway,mesh")
      }
      tmparg <- try(method <- match.arg(tolower(method), c("reporter","fisher","median","mean","stouffer"), several.ok = FALSE), silent = TRUE)
      if (class(tmparg) == "try-error") {
        stop("argument 'method' is not valid, choose one from the list: reporter,fisher,median,mean,stouffer")
      }
      if (class(nodedata) != "data.frame"){
        stop("argument 'nodedata' is not valid, data frame is required.")
      }
      flagdf = FALSE
      if(!is.null(pcol)){#get result from statistical analysis
        datinput = nodedata #keep input data
        flagdf = TRUE
        if(!is.null(nodedata$grinn)){
          nodedata = nodedata[,c('grinn',pcol)]
        }else if(!is.null(nodedata$PubChem)){
          nodedata = nodedata[,c('PubChem',pcol)]
          colnames(datinput) = gsub("PubChem","grinn",colnames(datinput))
        }else if(!is.null(nodedata$pubchem)){
          nodedata = nodedata[,c('pubchem',pcol)]
          colnames(datinput) = gsub("pubchem","grinn",colnames(datinput))
        }else if(!is.null(nodedata$uniprot)){
          nodedata = nodedata[,c('uniprot',pcol)]
          colnames(datinput) = gsub("uniprot","grinn",colnames(datinput))
        }else if(!is.null(nodedata$ensembl)){
          nodedata = nodedata[,c('ensembl',pcol)]
          colnames(datinput) = gsub("ensembl","grinn",colnames(datinput))
        }
        #nodedata = na.omit(nodedata)
      }
      require('dplyr')#load dplyr for opencpu
      nodetype = Hmisc::capitalize(nodetype)
      if(tolower(annotation) == 'pathway' && foundDb()){#pathway enrichment
        cat("Querying database ...\n")
        if(internalid){
          annols = apply(nodedata, 1, function(x) fetchNetwork(to=x[1], fromtype="pathway", totype = nodetype, reltype = "ANNOTATION")) #query annotation pairs
        }else{
          annols = apply(nodedata, 1, function(x) fetchNetworkByGID(to=x[1], fromtype="pathway", totype = nodetype, reltype = "ANNOTATION")) #query annotation pairs
        }
        if(!is.null(unlist(annols))){#found annotation
          annonws = combineNetworks(annols) #combine annotation pairs
          if(internalid){
            #format pval: 1st id, 2nd pval, ... ,use only 1st and 2nd column
            pv = nodedata[,2]
            pv = as.numeric(pv)
            names(pv) = nodedata[,1]
            pval = pv
          }else{
            #format pval: 1st gid, 2nd pval, ... ,use only 1st and 2nd column, change gid to id
            merged = merge(annonws$nodes, nodedata, by.x = colnames(annonws$nodes)[2], by.y = colnames(nodedata)[1], all.y = TRUE)
            pv = merged[,ncol(annonws$nodes)+1]
            pv = as.numeric(pv)
            names(pv) = merged$id
            pval = pv
          }
          era = computeEnrichment(edgelist = annonws$edges[,2:1], pval = pval, fc = NULL, method = method, size=size, returnas="dataframe") #compute enrichment
          era = era[order(era$p),]
          era = era[ , !(colnames(era) == 'no_of_entities')] #hide sum columns
          era$rank = seq(1:nrow(era))
          era = merge(annonws$nodes, era, by='id') #merge annotation attributes and enrichemt results
          era = era[,c(ncol(era),1:(ncol(era)-1))] #rearrange columns
          #count subtotal entities
          tatt = dplyr::left_join(annonws$edges[,1:2],annonws$nodes,by=c('target'='id')) #get target attributes
          subtotls = tatt %>% dplyr::group_by(source) %>% dplyr::tally() #count entities
          colnames(subtotls) = c('id','no_of_entities')
          era = dplyr::left_join(era, subtotls, by=c('id'='id'))
          #count total entities
          cat("Querying pathway statistics ...\n")
          ptwls = unique(era$id) #list of pathways
          ptwstat = lapply(ptwls, function(x){#get total number of annotated nodes of a pathway
            qstring = paste0('MATCH (from:Pathway)-[r:ANNOTATION]->(to:',nodetype,') where ID(from) = ',x,' RETURN toString(ID(from)), labels(to), count(to)')
            annosize = as.data.frame(curlRequest.TRANSACTION.row(qstring)[[1]]$row, stringsAsFactors = FALSE) #get annotation info from db
            colnames(annosize) = c('id','nodelabel','annotation_size')
            annosize
          })
          ptwstat = plyr::ldply(ptwstat, data.frame)
          era = dplyr::left_join(era,ptwstat[,-2],by=c('id'='id')) #merge with annotation info
          era = era[c(1:(ncol(era)-3),(ncol(era)-1),ncol(era),(ncol(era)-2))] #rearrange columns
          networknode = annonws$nodes[annonws$nodes$nodelabel != "Pathway", ] #not return pathway nodes
          if(flagdf){#keep input data
            networknode = merge(networknode,datinput,by.x='gid',by.y='grinn',all.x=TRUE)
            networknode = networknode[,c(2,1,3:ncol(networknode))]
            networknode[is.na(networknode)] = ""
          }
          meminfo = merge(annonws$edges, networknode, by.x='target', by.y='id')
          memname = plyr::ddply(meminfo,c('source'),plyr::summarise,membername=list(nodename))
          era = dplyr::left_join(era, memname, by=c('id'='source'))
          list(nodes=networknode, edges=annonws$edges, enrichment=era) #output
        }
        else{#no annotation found
          list(nodes=data.frame(), edges=data.frame(), enrichment=data.frame()) #output
        }
      }else if(tolower(annotation) == 'mesh'){#mesh enrichment
        if(internalid || tolower(nodetype) != "compound"){
          cat("Error: Accept only PubChem compounds, returning no data ...\n")
          annols = NULL
        }else{
          cat("Connecting PubChem ...\n")
          annols = apply(nodedata, 1, function(x) callMesh(pcid=x[1])) #query pubchem annotation pairs
        }
        if(!is.null(unlist(annols))){#found annotation
          annonws = combineNetworks(annols) #combine annotation pairs
          if(foundDb()){#have db
            nodels = lapply(nodedata[,1], formatNode.LIST, y="compound", z="grinnid") #query nodes by gid
            networknode = plyr::ldply(nodels, data.frame)
            networknode$id = networknode$gid
          }else{#no db
            cat("No database installed, returning original input ...\n")
            networknode = data.frame(id=nodedata[,1], gid=nodedata[,1], nodename=nodedata[,1], nodelabel="Compound", nodexref='', stringsAsFactors = FALSE)
          }
          #format pval: 1st gid, 2nd pval, ... ,use only 1st and 2nd column
          pv = nodedata[,2]
          pv = as.numeric(pv)
          names(pv) = nodedata[,1]
          pval = pv
          era = computeEnrichment(edgelist = annonws$edges[,2:1], pval = pval, fc = NULL, method = method, size=size, returnas="dataframe") #compute enrichment
          era = era[order(era$p),]
          era$rank = seq(1:nrow(era))
          era = merge(annonws$nodes[,1:6], era, by='id') #merge annotation attributes and enrichemt results
          era = era[,c(ncol(era),1:(ncol(era)-6),(ncol(era)-3),(ncol(era)-2),(ncol(era)-4),(ncol(era)-5),(ncol(era)-1))] #rearrange columns
          if(flagdf){#keep input data
            networknode = merge(networknode,datinput,by.x='gid',by.y='grinn',all.x=TRUE)
            networknode = networknode[,c(2,1,3:ncol(networknode))]
            networknode[is.na(networknode)] = ""
          }
          meminfo = merge(annonws$edges, networknode, by.x='target', by.y='id')
          memname = plyr::ddply(meminfo,c('source'),plyr::summarise,membername=list(nodename))
          era = dplyr::left_join(era, memname, by=c('id'='source'))
          list(nodes=networknode, edges=annonws$edges, enrichment=era) #output
        }
        else{#no annotation found
          list(nodes=data.frame(), edges=data.frame(), enrichment=data.frame()) #output
        }
      }else{
        cat('Error: No database installed, returning no data ..\n')
        list(nodes=data.frame(), edges=data.frame(), enrichment=data.frame()) #output
      }
    },error=function(e) {
      message(e)
      cat("\nError: RETURN no data ..\n")
      list(nodes=data.frame(), edges=data.frame(), enrichment=data.frame()) #output
    })
  return(out)
}
kwanjeeraw/metabox documentation built on May 20, 2019, 7:07 p.m.