R/computeNwOverrep.R

#'Perform overrepresentation analysis on the input network
#'@description perform overrepresentation analysis using hypergeometric tests to retrieve overrepresented annotation terms of the network nodes.
#'If there are more than one node types, Fisher's method is used to combine p-values of the nodes before adjusted.
#'The input network is generated from any function such as \code{\link{computeSimilarity}}, \code{\link{computeCorrelation}}, \code{\link{computeParCorrelation}}, \code{\link{computeSubnetwork}},
#'\code{\link{fetchHetNetwork}}, \code{\link{fetchHetNetworkByGID}}, \code{\link{fetchNetwork} and \code{\link{fetchNetworkByGID}}.
#'@usage computeNwOverrep(edgelist, nodelist, annotation, internalid, size)
#'@param edgelist a data frame of edges contains at least a source column (1st column) and a target column (2nd column).
#'@param nodelist a data frame of nodes contains at least two columns of node attributes.
#'1st column is id or neo4j id, 2nd column is id or grinn id. The 2nd column is used for Mesh annotation.
#'@param annotation a string specifying the annotation type e.g. pathway (default) and mesh. Pathway annotation requires the database.
#'Mesh annotation doesn't require the database but it is available for PubChem compounds only.
#'@param internalid a logical value indicating whether the network nodes are neo4j ids, if TRUE (default). If not, the network nodes are expected to be any ids.
#'See details and see \code{\link{convertId}} for how to convert ids. It has no effect on Mesh annotation.
#'@param size a numeric vector specifying the minimum number of members in each annotation term to be used in the analysis. Default is 3.
#'@details The database uses two id systems. The neo4j id is a numeric, internal id automatically generated by the database system.
#'The grinn id (gid) is an id system of Grinn database that uses main ids of standard resources
#'i.e. ENSEMBL for genes (e.g.ENSG00000139618), UniProt for proteins (e.g.P0C9J6), PubChem CID for compounds (e.g.5793), KEGG for pathways (e.g.hsa00010).
#'@return list of data frame of nodes, edges, overrepresentation and pairs. The pairs data frame contains annotation pairs. The data frame of overrepresentation contains the following components:
#'
#'\code{rank} = rank sort by p adj
#'
#'\code{id} = annotation id or annotation neo4j id
#'
#'\code{gid} = annotation id or annotation grinn id
#'
#'\code{nodename} = annotation name
#'
#'\code{nodelabel} = annotation type
#'
#'\code{nodexref} = cross references
#'
#'\code{p_combine} = combined-raw p-values, if there are more than one node types
#'
#'\code{p_combine_adj} = adjusted combined p-values, if there are more than one node types
#'
#'\code{p} = raw p-values
#'
#'\code{p_adj} = adjusted p-values
#'
#'\code{no_of_entities} = number of input entities in each annotation term
#'
#'\code{annotation_size} = total number of entities in each annotation term from the database
#'
#'\code{background_size} = total number of annotated entities in the database
#'
#'\code{member} = list of entity members of the annotation term
#'
#'Return list of empty data frame if error or found nothing.
#'@author Kwanjeera W \email{kwanich@@ucdavis.edu}
#'@references Johnson NL., Kotz S., and Kemp AW. (1992) Univariate Discrete Distributions, Second Edition. New York: Wiley.
#'@references Fisher R. (1932) Statistical methods for research workers. Oliver and Boyd, Edinburgh.
#'@seealso \code{\link{phyper}}, \code{\link{p.adjust}},\code{\link{pchisq}}
#'@examples
#'#simnw <- computeSimilarity(c(1110,10413,196,51,311,43,764,790)) #compute similarity network for given pubchem compounds
#'#result <- computeNwOverrep(simnw$edges, simnw$nodes, annotation="mesh", internalid = FALSE)
#'@export
computeNwOverrep <- function(edgelist, nodelist, annotation="pathway", internalid = TRUE, size=3, returnas="dataframe") UseMethod("computeNwOverrep")
#'@export
computeNwOverrep.default <- function (edgelist, nodelist, annotation="pathway", internalid = TRUE, size=3, returnas="dataframe"){
  out <- tryCatch(
    {
      tmparg <- try(annotation <- match.arg(tolower(annotation), c("pathway","mesh"), several.ok = FALSE), silent = TRUE)
      if (class(tmparg) == "try-error") {
        stop("argument 'annotation' is not valid, choose one from the list: pathway,mesh")
      }
      require('dplyr')#load dplyr for opencpu
      if(tolower(annotation) == 'pathway' && foundDb()){#pathway enrichment
        cat("Querying database ...\n")
        if(internalid){
          annols = apply(nodelist, 1, function(x) fetchNetwork(to=x["id"], fromtype="pathway", totype = x["nodelabel"], reltype = "ANNOTATION")) #query annotation pairs
        }else{
          annols = apply(nodelist, 1, function(x) fetchNetworkByGID(to=x["gid"], fromtype="pathway", totype = x["nodelabel"], reltype = "ANNOTATION")) #query annotation pairs
        }
        if(!is.null(unlist(annols))){#found annotation
          annonws = combineNetworks(annols) #combine annotation pairs
          annomem = plyr::ddply(annonws$edges,c('source'),plyr::summarise,member=list(target)) #get members
          tatt = dplyr::left_join(annonws$edges[,1:2],annonws$nodes,by=c('target'='id')) #get target attributes
          subanno = tatt %>% dplyr::group_by(source,nodelabel) %>% dplyr::tally() #count entities
          colnames(subanno) = c('id','nodelabel','count')
          subanno = subanno[subanno$count >= size,] #filter by annotation size
          if(nrow(subanno) > 0){
            ntypels = unique(annonws$nodes$nodelabel)
            ntypels = ntypels[ntypels!="Pathway"] #list of nodetype
            if(length(ntypels) > 1){#more than one node types
              cat("Performing overrepresentation analysis ...\n")
#               ntypestat = foreach(i=1:length(ntypels), .combine=rbind) %dopar% {#get db stat
#                 qstring = paste0('MATCH (:Pathway)-[r:ANNOTATION]->(n:',ntypels[i],') RETURN count(DISTINCT n)')#get db stat
#                 totEnt = data.frame(nl=ntypels[i], row=curlRequest.TRANSACTION.row(qstring), stringsAsFactors = FALSE) #total no. of entities
#               }
              ntypestat = lapply(ntypels, function (x) data.frame(nl=x, row=curlRequest.TRANSACTION.row(paste0('MATCH (:Pathway)-[r:ANNOTATION]->(n:',x,') RETURN count(DISTINCT n)')), stringsAsFactors = FALSE))
              ntypestat = do.call(rbind, lapply(ntypestat, data.frame, stringsAsFactors=FALSE)) #total no. of entities
              overDF = data.frame(stringsAsFactors = FALSE)
              for(i in 1:nrow(subanno)){#overrepresentation analysis
                totEnt = ntypestat[ntypestat$nl == subanno$nodelabel[i],2]
                qstring = paste0('MATCH (from:Pathway)-[r:ANNOTATION]->(to:',subanno$nodelabel[i],') where ID(from) = ',subanno$id[i],' RETURN toString(ID(from)), labels(to), count(to)')
                annosize = as.data.frame(curlRequest.TRANSACTION.row(qstring)[[1]]$row, stringsAsFactors = FALSE) #get annotation info from db
                colnames(annosize) = c('id','nodelabel','count')
                blackAnno = totEnt - annosize$count #no. of entities not in the annotation term
                pval = phyper(subanno$count[i]-1, annosize$count, blackAnno, nrow(nodelist), lower.tail = F) #hypergeometric test
                hyp = data.frame(id=as.character(subanno$id[i]), p=pval, no_of_entities=subanno$count[i],
                                 annotation_size=annosize$count, background_size=totEnt, nodelabel=subanno$nodelabel[i], stringsAsFactors = FALSE)
                overDF = rbind(overDF,hyp)
              }
              overDF$p_adj = p.adjust(overDF$p, method = "BH") #adjust raw p-values
              #format output table
              pcombine = plyr::ddply(overDF,c('id'),plyr::summarise,p_combine=combinePvals(p)) #Fisher combines raw p-values
              pcombine$p_combine_adj = p.adjust(pcombine$p_combine, method = "BH") #adjust raw p_combines
              pcombine$p = plyr::ddply(overDF,c('id'),plyr::summarise,p=list(paste0(nodelabel,' (',p,')')))$p
              pcombine$p_adj = plyr::ddply(overDF,c('id'),plyr::summarise,p_adj=list(paste0(nodelabel,' (',p_adj,')')))$p_adj
              pcombine$no_of_entities = plyr::ddply(overDF,c('id'),plyr::summarise,no_of_entities=list(paste0(nodelabel,' (',no_of_entities,')')))$no_of_entities
              pcombine$annotation_size = plyr::ddply(overDF,c('id'),plyr::summarise,annotation_size=list(paste0(nodelabel,' (',annotation_size,')')))$annotation_size
              pcombine$background_size = plyr::ddply(overDF,c('id'),plyr::summarise,background_size=list(paste0(nodelabel,' (',background_size,')')))$background_size
              pcombine = pcombine[order(pcombine$p_combine_adj),]
              pcombine$rank = seq(1:nrow(pcombine))
              pcombine = merge(annonws$nodes, pcombine, by='id') #merge annotation attributes and results
              pcombine = pcombine[,c(ncol(pcombine),1:(ncol(pcombine)-1))] #rearrange columns
              pcombine = dplyr::left_join(pcombine, annomem, by=c('id'='source'))
              overDF = pcombine #output
            }else{#one node type
              cat("Performing overrepresentation analysis ...\n")
              overDF = callHypergeo.pathway(edgelist=subanno, nodelist=annonws$nodes, member=annomem, nodetype=ntypels, inputsize=nrow(nodelist))
            }
          }else{#less than minSize
            cat("No. of members is too small, returning no data ...\n")
            overDF = data.frame()
          }
          meminfo = merge(annonws$edges, nodelist, by.x='target', by.y='id')
          memname = plyr::ddply(meminfo,c('source'),plyr::summarise,membername=list(nodename))
          overDF = dplyr::left_join(overDF, memname, by=c('id'='source'))
          list(nodes=nodelist, edges=edgelist, overrepresentation=overDF, pairs=annonws$edges) #output
        }else{#no annotation found
          list(nodes=data.frame(), edges=data.frame(), overrepresentation=data.frame(), pairs=data.frame()) #output
        }
      }else if(tolower(annotation) == 'mesh'){#mesh enrichment
        cat("Connecting PubChem ...\n")
        annols = apply(nodelist, 1, function(x) callMesh(pcid=x["gid"])) #query pubchem annotation pairs
        if(!is.null(unlist(annols))){#found annotation
          annonws = combineNetworks(annols) #combine annotation pairs
          #format edge, change gid to id, fix edge row order
          annopair = dplyr::right_join(nodelist[,1:2],annonws$edges[,1:2],by=c('gid' = 'target'))[,c(3,1)]
          colnames(annopair) = c('source','target')
          overDF = callHypergeo.mesh(edgelist=annopair, nodelist=annonws$nodes, size=size, inputsize=nrow(nodelist))
          meminfo = merge(annopair, nodelist, by.x='target', by.y='id')
          memname = plyr::ddply(meminfo,c('source'),plyr::summarise,membername=list(nodename))
          overDF = dplyr::left_join(overDF, memname, by=c('id'='source'))
          list(nodes=nodelist, edges=edgelist, overrepresentation=overDF, pairs=annopair) #output
        }else{#no annotation found
          list(nodes=data.frame(), edges=data.frame(), overrepresentation=data.frame(), pairs=data.frame()) #output
        }
      }else{
        cat('Error: No database installed, returning no data ..\n')
        list(nodes=data.frame(), edges=data.frame(), overrepresentation=data.frame(), pairs=data.frame()) #output
      }
    },error=function(e) {
      message(e)
      cat("\nError: RETURN no data ..\n")
      list(nodes=data.frame(), edges=data.frame(), overrepresentation=data.frame(), pairs=data.frame()) #output
    })
  return(out)
}
kwanjeeraw/metabox documentation built on May 20, 2019, 7:07 p.m.