R/xDefineNet.r

Defines functions xDefineNet

Documented in xDefineNet

#' Function to define a gene network
#'
#' \code{xDefineNet} is supposed to define a gene network sourced from the STRING database or the Pathway Commons database. It returns an object of class "igraph". 
#'
#' @param network the built-in network. Currently two sources of network information are supported: the STRING database (version 10) and the Pathway Commons database (version 7). STRING is a meta-integration of undirected interactions from the functional aspect, while Pathway Commons mainly contains both undirected and directed interactions from the physical/pathway aspect. Both have scores to control the confidence of interactions. Therefore, the user can choose the different quality of the interactions. In STRING, "STRING_highest" indicates interactions with highest confidence (confidence scores>=900), "STRING_high" for interactions with high confidence (confidence scores>=700), "STRING_medium" for interactions with medium confidence (confidence scores>=400), and "STRING_low" for interactions with low confidence (confidence scores>=150). For undirected/physical interactions from Pathway Commons, "PCommonsUN_high" indicates undirected interactions with high confidence (supported with the PubMed references plus at least 2 different sources), "PCommonsUN_medium" for undirected interactions with medium confidence (supported with the PubMed references). For directed (pathway-merged) interactions from Pathway Commons, "PCommonsDN_high" indicates directed interactions with high confidence (supported with the PubMed references plus at least 2 different sources), and "PCommonsDN_medium" for directed interactions with medium confidence (supported with the PubMed references). 
#' In addition to pooled version of pathways from all data sources, the user can also choose the pathway-merged network from individual sources, that is, "PCommonsDN_Reactome" for those from Reactome, "PCommonsDN_KEGG" for those from KEGG, "PCommonsDN_HumanCyc" for those from HumanCyc, "PCommonsDN_PID" for those from PID, "PCommonsDN_PANTHER" for those from PANTHER, "PCommonsDN_ReconX" for those from ReconX, "PCommonsDN_TRANSFAC" for those from TRANSFAC, "PCommonsDN_PhosphoSite" for those from PhosphoSite, and "PCommonsDN_CTD" for those from CTD. For direct (pathway-merged) interactions sourced from KEGG, it can be 'KEGG' for all, 'KEGG_metabolism' for pathways grouped into 'Metabolism', 'KEGG_genetic' for 'Genetic Information Processing' pathways, 'KEGG_environmental' for 'Environmental Information Processing' pathways, 'KEGG_cellular' for 'Cellular Processes' pathways, 'KEGG_organismal' for 'Organismal Systems' pathways, and 'KEGG_disease' for 'Human Diseases' pathways. 'REACTOME' for protein-protein interactions derived from Reactome pathways. 'TRRUST' for TRRUST curated TF-target relations
#' @param STRING.only the further restriction of STRING by interaction type. If NA, no such restriction. Otherwise, it can be one or more of "neighborhood_score","fusion_score","cooccurence_score","coexpression_score","experimental_score","database_score","textmining_score". Useful options are c("experimental_score","database_score"): only experimental data (extracted from BIND, DIP, GRID, HPRD, IntAct, MINT, and PID) and curated data (extracted from Biocarta, BioCyc, GO, KEGG, and Reactome) are used
#' @param weighted logical to indicate whether edge weights should be considered. By default, it sets to false. If true, it only works for the network from the STRING database 
#' @param verbose logical to indicate whether the messages will be displayed in the screen. By default, it sets to true for display
#' @param RData.location the characters to tell the location of built-in RData files. See \code{\link{xRDataLoader}} for details
#' @param guid a valid (5-character) Global Unique IDentifier for an OSF project. See \code{\link{xRDataLoader}} for details
#' @return
#' an object of class "igraph"
#' @note The input graph will be treated as an unweighted graph if there is no 'weight' edge attribute associated with it
#' @export
#' @seealso \code{\link{xRDataLoader}}
#' @include xDefineNet.r
#' @examples
#' RData.location <- "http://galahad.well.ox.ac.uk/bigdata"
#' \dontrun{
#' # STRING (high quality)
#' g <- xDefineNet(network="STRING_high", RData.location=RData.location)
#' # STRING (high quality), with edges weighted 
#' g <- xDefineNet(network="STRING_high", weighted=T, RData.location=RData.location)
#' # STRING (high quality), only edges sourced from experimental or curated data
#' g <- xDefineNet(network="STRING_high", STRING.only=c("experimental_score","database_score"), RData.location=RData.location)
#' 
#' # Pathway Commons 
#' g <- xDefineNet(network="PCommonsDN_medium", RData.location=RData.location)
#' 
#' # KEGG (all)
#' g <- xDefineNet(network="KEGG", RData.location=RData.location)
#' # KEGG ('Organismal Systems')
#' g <- xDefineNet(network="KEGG_organismal", RData.location=RData.location)
#' }

xDefineNet <- function(network=c("STRING_highest","STRING_high","STRING_medium","STRING_low","PCommonsUN_high","PCommonsUN_medium","PCommonsDN_high","PCommonsDN_medium","PCommonsDN_Reactome","PCommonsDN_KEGG","PCommonsDN_HumanCyc","PCommonsDN_PID","PCommonsDN_PANTHER","PCommonsDN_ReconX","PCommonsDN_TRANSFAC","PCommonsDN_PhosphoSite","PCommonsDN_CTD", "KEGG","KEGG_metabolism","KEGG_genetic","KEGG_environmental","KEGG_cellular","KEGG_organismal","KEGG_disease","REACTOME","TRRUST"), STRING.only=c(NA,"neighborhood_score","fusion_score","cooccurence_score","coexpression_score","experimental_score","database_score","textmining_score")[1], weighted=FALSE, verbose=TRUE, RData.location="http://galahad.well.ox.ac.uk/bigdata", guid=NULL)
{

    ## resolve 'network' to exactly one of the supported choices (the first one if not specified)
    network <- match.arg(network)

    ############################################################
    ## local helpers
    ############################################################

    ## Keep only those edges for which at least one of the edge attributes named in 'attrs'
    ## passes the cutoff (>=cutoff by default; >cutoff if strict=TRUE).
    ## Edge attributes are looked up by name, which avoids building code with eval(parse(text=...))
    keep_edges <- function(ig, attrs, cutoff, strict=FALSE){
        absent <- setdiff(attrs, igraph::edge_attr_names(ig))
        if(length(absent) > 0){
            stop(sprintf("The network '%s' has no edge attribute(s): %s", network, paste(absent, collapse=", ")), call.=FALSE)
        }
        passed <- lapply(attrs, function(a){
            score <- igraph::edge_attr(ig, a)
            if(strict) score > cutoff else score >= cutoff
        })
        ## combine the per-attribute tests with OR; which() drops NA so edges without a score are not retained
        igraph::subgraph.edges(ig, eids=which(Reduce(`|`, passed)))
    }

    ## Remove vertices whose name contains 'HLA-'
    drop_HLA <- function(ig){
        v <- V(ig)[grepl('HLA-', V(ig)$name)]
        igraph::delete_vertices(ig, v)
    }

    ## Shared clean-up for KEGG-derived graphs: drop the identifier vertex attributes,
    ## give every edge a unit weight, and remove HLA genes
    tidy_KEGG <- function(ig){
        for(a in c("hsa","GeneID","Symbol")){
            ig <- igraph::delete_vertex_attr(ig, a)
        }
        E(ig)$weight <- 1
        drop_HLA(ig)
    }

    ############################################################

    if(verbose){
        now <- Sys.time()
        message(sprintf("Load the network %s (%s) ...", network, as.character(now)), appendLF=TRUE)
    }

    ## the part after the first underscore (eg 'high' in 'STRING_high'); NA if there is no underscore
    flag <- unlist(strsplit(network,"_"))[2]

    if(grepl('STRING', network, fixed=TRUE)){
        g <- xRDataLoader(RData.customised='org.Hs.string', RData.location=RData.location, guid=guid, verbose=verbose)

        ## restrict to those edges with given confidence
        cutoffs <- c(highest=900, high=700, medium=400, low=150)
        g <- keep_edges(g, "combined_score", cutoffs[[flag]])

        ## further restricted by the evidence type: keep edges supported (score>0) by at least one requested type
        ## (an NA or otherwise unrecognised 'STRING.only' means no such restriction)
        default.STRING.only <- c("neighborhood_score","fusion_score","cooccurence_score","coexpression_score","experimental_score","database_score","textmining_score")
        STRING.only <- default.STRING.only[default.STRING.only %in% STRING.only]
        if(length(STRING.only)>0){
            g <- keep_edges(g, STRING.only, 0, strict=TRUE)
        }

        ########################
        # because of the way storing the network from the STRING database
        ## extract relations (by symbol)
        V(g)$name <- V(g)$symbol
        relations <- igraph::get.data.frame(g, what="edges")
        if(weighted){
            ## NOTE(review): column 10 is presumably 'combined_score' (from, to, 7 evidence scores, combined) - confirm against the stored graph
            relations <- relations[, c(1,2,10)]
            colnames(relations) <- c("from","to","weight")
        }else{
            relations <- relations[, c(1,2)]
            colnames(relations) <- c("from","to")
            relations$weight <- rep(1, nrow(relations))
        }
        ## (interactions between HLA genes are kept here; their removal was disabled in the original code)

        ## do removal for node extraction (without 'name'; otherwise failed to do so using the function 'igraph::get.data.frame')
        for(a in c("name","seqid","geneid")){
            g <- igraph::delete_vertex_attr(g, a)
        }
        nodes <- igraph::get.data.frame(g, what="vertices")
        ### remove the duplicated (drop=FALSE keeps a data frame even if a single column remains)
        nodes <- nodes[!duplicated(nodes), , drop=FALSE]
        ########################

        g <- igraph::graph.data.frame(d=relations, directed=FALSE, vertices=nodes)

    }else if(grepl('PCommonsUN', network, fixed=TRUE)){
        g <- xRDataLoader(RData.customised='org.Hs.PCommons_UN', RData.location=RData.location, guid=guid, verbose=verbose)

        ## restrict to those edges with physical interactions and with score>=102 (high) or >=101 (medium)
        cutoffs <- c(high=102, medium=101)
        g <- keep_edges(g, c("in_complex_with","interacts_with"), cutoffs[[flag]])

        ## NOTE(review): unlike the other branches, no 'weight' edge attribute is attached here
        relations <- igraph::get.data.frame(g, what="edges")[, c(1,2)]
        colnames(relations) <- c("from","to")
        nodes <- igraph::get.data.frame(g, what="vertices")[, c(3,4)]
        g <- igraph::graph.data.frame(d=relations, directed=FALSE, vertices=nodes)

    }else if(grepl('PCommonsDN', network, fixed=TRUE)){
        if(flag %in% c('high','medium')){
            ## pooled version: high confidence score>=102, medium confidence score>=101
            g <- xRDataLoader(RData.customised='org.Hs.PCommons_DN', RData.location=RData.location, guid=guid, verbose=verbose)
            cutoff <- if(flag=='high') 102 else 101
        }else{
            ## source-specific version: a list of graphs indexed by the source name, filtered at score>=101
            ls_ig <- xRDataLoader(RData.customised='org.Hs.PCommons_DN.source', RData.location=RData.location, guid=guid, verbose=verbose)
            g <- ls_ig[[ flag ]]
            if(is.null(g)){
                stop(sprintf("The source-specific network '%s' cannot be found in the loaded data", flag), call.=FALSE)
            }
            cutoff <- 101
        }
        dn_attrs <- c("catalysis_precedes","controls_expression_of","controls_phosphorylation_of","controls_state_change_of","controls_transport_of")
        g <- keep_edges(g, dn_attrs, cutoff)

        relations <- igraph::get.data.frame(g, what="edges")[, c(1,2)]
        colnames(relations) <- c("from","to")
        relations$weight <- rep(1, nrow(relations))
        nodes <- igraph::get.data.frame(g, what="vertices")[, c(3,4)]
        g <- igraph::graph.data.frame(d=relations, directed=TRUE, vertices=nodes)

    }else if(network=='KEGG'){
        g <- xRDataLoader(RData.customised='ig.KEGG.merged', RData.location=RData.location, guid=guid, verbose=verbose)
        g <- tidy_KEGG(g)

    }else if(grepl('KEGG_', network, fixed=TRUE)){
        ls_ig <- xRDataLoader(RData.customised='ig.KEGG.mergedCategory', RData.location=RData.location, guid=guid, verbose=verbose)
        categories <- c(
            KEGG_metabolism='Metabolism',
            KEGG_genetic='Genetic Information Processing',
            KEGG_environmental='Environmental Information Processing',
            KEGG_cellular='Cellular Processes',
            KEGG_organismal='Organismal Systems',
            KEGG_disease='Human Diseases'
        )
        g <- ls_ig[[ categories[[network]] ]]
        if(is.null(g) && network=='KEGG_environmental'){
            ## fall back to the alternative category label
            g <- ls_ig[['Environmental Process']]
        }
        if(is.null(g)){
            stop(sprintf("The category network '%s' cannot be found in the loaded data", network), call.=FALSE)
        }
        g <- tidy_KEGG(g)

    }else if(network=='REACTOME'){
        g <- xRDataLoader(RData.customised='ig.REACTOME.merged', RData.location=RData.location, guid=guid, verbose=verbose)
        g <- igraph::delete_vertex_attr(g, "geneid")
        g <- igraph::delete_vertex_attr(g, "symbol")
        E(g)$weight <- 1

    }else if(network=='TRRUST'){
        g <- xRDataLoader(RData.customised='ig.TRRUST', RData.location=RData.location, guid=guid, verbose=verbose)
        E(g)$weight <- 1

    }

    invisible(g)
}

Try the Pi package in your browser

Any scripts or data that you put into this service are public.

Pi documentation built on Nov. 29, 2021, 3 p.m.