#this is for latex debug, remove when all works well options(tinytex.verbose = TRUE)
MTGOsc comes with some example data from the Mouse Atlas in the form of a (simplified for storage reasons) Seurat object. Let's start with loading Seurat and MTGO:
library(Seurat) library(MTGOsc)
Loading MTGOsc also loads three objects: bladder, markers, and mouse.pathways.
#this is a (simplified) Seurat object print(bladder) #this come from applying Seurat function FindAllMarkers() to the bladder dataset head(markers) #this is a simple gene -> pathway dictionary head(mouse.pathways)
Bladder object contains three clusters:
table(bladder@ident)
For this tutorial we'll focus on Basal epitelial cells cluster, which is of intermediate size. We create two "selector" arrays, one for cells and one for genes.
cells.selected = bladder@ident == 'Basal epithelial cell(Bladder)' genes.selected = subset(markers, cluster == 'Basal epithelial cell(Bladder)')$gene my.bladder = bladder my.bladder@data = my.bladder@data[genes.selected, cells.selected]
MTGOsc needs a folder where to save temporary files and final results.
root = tempdir() #change this to your preferred local path dir.create(root, recursive = TRUE, showWarnings = FALSE)
We are now ready to start with MTGOsc:
#building a genes-pathways dictionary dict = write.dictionary(genes=mouse.pathways$gene, terms = mouse.pathways$pathway, outfolder = root) #computign gene coexpression (default function is 'cor') coexp = write.coexpressionMatrix(geneExpression = my.bladder@data, outfolder = root) #thinning coexpression network via scale free criterion edges = write.edges(coexpression = coexp, outfolder = root, keep.weights = FALSE, fun = thinning_scale_free) #writing a parameter file, useful for MGTO write.paramFile(outfolder = root)
All the groundwork is done and we can invoke MTGO:
#actual call to MTGO #call.MTGO(outfolder = root, verbose = TRUE) #this prints several log messages #building and saving representation of resulting network #network.collapsed = export.network.modules(infolder = root, collapse.modules = TRUE) #network.full = export.network.modules(infolder = root, collapse.modules = FALSE)
At this point networks are saved on disk, in "Network" subfolder, in the form of two html files. In the first one (ClusterNetwork.html) each functional module is collapsed to a single node, the second one (FullNetwork.html) where each gene is represented and functional modules are color coded.
Here we look for Reactome pathway enrichment of the genes constituting the thinned network. This procedure is complementary to the exctraction of Reactome pathways by MTGO-SC.
# load libraries for gene enrichment on Reactome (those are on Bioconductor, not CRAN) library(ReactomePA) library(clusterProfiler) library(org.Mm.eg.db) #a support function to take care of gene upper/lower case convention firstup = function(x) { x = tolower(x) substr(x, 1, 1) = toupper(substr(x, 1, 1)) return(x) } #the list of all genes involved in the cluster genes = unique(c(as.character(edges$gene1), as.character(edges$gene2))) #correct casing of gene names genes = firstup(genes) #translating gene names to ENTREZID via org.Mm.eg.db database genes = bitr(genes, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Mm.eg.db") #the actual enrichment enriched = enrichPathway(gene=genes$ENTREZID, pvalueCutoff=0.05, readable=T, organism = "mouse", pAdjustMethod = "BH")
# In this example we search the literature for the Basal Epithelial cell type and the pathway terms # retrieved by both MTGO-SC (pathway labelling gene modules) and ReactomePA (pathways enriched for the whole basal epithelial gene network library(RISmed) # set the search parameters file_MTGO<-"Modules_Best_QGO.txt" #path file_enriched<-"Basal epithelial cell" #passare la variabile tissue<-"Bladder" celltype<-"Basal Epithelial Cell" modules<-read.table(file_MTGO,sep="\t",header=T) terms_MTGO<-as.character(modules[,4]) terms_enriched<-setdiff(as.character(read.table(file_enriched,header=T)[,1]),NA) terms<-union(terms_MTGO,terms_enriched) # Perform all the PUBMED searchings pubmed_search_both<-c() pubmed_search_term<-c() for(k in terms) { print(k) res <- EUtilsSummary(paste(tissue,celltype,k), type="esearch", db="pubmed", datetype='pdat', mindate=2000, maxdate=2019, retmax=500) pubmed_search_both[[k]]<-QueryCount(res) res <- EUtilsSummary(paste(tissue,k), type="esearch", db="pubmed", datetype='pdat', mindate=2000, maxdate=2019, retmax=500) pubmed_search_term[[k]]<-QueryCount(res) Sys.sleep(0.5) # a connection error might occur without this pause } pubmed_search_universe<-QueryCount(EUtilsSummary(tissue, type="esearch", db="pubmed", datetype='pdat', mindate=2000, maxdate=2019, retmax=500)) pubmed_search_celltype<-QueryCount(EUtilsSummary(paste(tissue,celltype), type="esearch", db="pubmed", datetype='pdat', mindate=2000, maxdate=2019, retmax=500)) # Compute p.values pvals<-c() M<-pubmed_search_celltype N<-pubmed_search_universe for(i in terms) { x<-pubmed_search_both[[i]] k<-pubmed_search_term[[i]] pvals[[i]]<-signif(phyper(x-1,M,N-M,k,lower.tail=F),2) } PUBMEDsearch_results<-cbind(terms,pvals,pubmed_search_both,pubmed_search_term,pubmed_search_celltype,pubmed_search_universe)
In the previous example we extracted the gene interaction network that is fed to MTGO via a two step approach:
MTGOsc package offers several options for both steps, and it is always possible for the user to write and use their own functions. As such we propose an objective approach based on an (external) ground truth to select the best combination of functions.
In general terms, we'll select the method that maximise the Affinity Score (AS), which is a metric of how close the extracted network is to a selected external ground truth. As such, the first thing we'll need is said ground truth, which luckily is already saved in MTGOsc:
#gGT is an iGraph object derived from four different sources: # - the CORUM database [1] # - a protein interaction map [2] # - String and Reactome databases [3] summary(gGT)
For more details see the following references:
1 Giurgiu M, Reinhard J, Brauner B, Dunger-Kaltenbach I, Fobo G, Frishman G, Montrone C, Ruepp A. CORUM: the comprehensive resource of mammalian protein complexes-2019. Nucleic Acids Res. 2018 Oct 24. doi: 10.1093/nar/gky973. [Epub ahead of print] 30357367
2 Ramani, Arun K., et al. "A map of human protein interactions derived from co‐expression of human mRNAs and their orthologs." Molecular systems biology 4.1 (2008): 180.
3 Skinnider, Michael A., Jordan W. Squair, and Leonard J. Foster. "Evaluating measures of association for single-cell transcriptomics." Nature methods 16.5 (2019): 381.
For the sake of this tutorial we'll compare two methods for extracting the interaction network:
Extracted networks must be transformed in iGraph object, starting from the edges object, as follow:
#Edges for Method 1, computed above edges.M1 = edges #Edges for Method 2, on the same data coexp.M2 = write.coexpressionMatrix( geneExpression = my.bladder@data, outfolder = root, fun = coexpr_propr, metric = 'phs', symmetrize = TRUE, overwrite = TRUE) edges.M2 = write.edges( coexpression = coexp.M2, outfolder = root, keep.weights = FALSE, fun = thinning_percentile, overwrite = TRUE) #transforming into iGraph network.M1 = igraph::graph_from_edgelist(as.matrix(edges.M1), directed = FALSE) network.M2 = igraph::graph_from_edgelist(as.matrix(edges.M2), directed = FALSE)
The Affinity Scores can be directly computed as follows:
#support function to compute Affinity Score between two networks compute_AS = function(N1, N2){ E.N1 = length(igraph::E(N1)) E.N2 = length(igraph::E(N2)) overlap = length(igraph::E(igraph::intersection(N1, N2))) #the formula is [overlap^2] / (edges1 * edges2) #but for better computation we implement it as res = (overlap / E.N1) * (overlap / E.N2) #and we are done return(res) } #ready to compute Affinity Scores for the examples AS.M1 = compute_AS(network.M1, gGT) AS.M2 = compute_AS(network.M2, gGT)
However they cannot be directly compared yet. For a proper comparison we should compute for each extracted network a random population of scrambled networks and then compare each AS to the distribution of scores. This will allow us to compute Z-scores and consequently P values associated to each extracted method. We can then (finally!) compare the logarithms of the P-values.
#numerosity for each random population N = 100 #room for Affinity Scores for the two scrambled networks populations AS.distr.M1 = c() AS.distr.M2 = c() #doing the computation for (i in 1:100){ #creating a random scramble of both networks random.network.M1 = igraph::rewire(network.M1, with = keeping_degseq( niter = igraph::vcount(network.M1) * 10, loops = FALSE)) random.network.M2 = igraph::rewire(network.M2, with = keeping_degseq( niter = igraph::vcount(network.M2) * 10, loops = FALSE)) #computing the new Affinity Scores AS.distr.M1[i] = compute_AS(random.network.M1, gGT) AS.distr.M2[i] = compute_AS(random.network.M2, gGT) } #Computing Z scores Z.M1 = (AS.M1 - mean(AS.distr.M1)) / sd(AS.distr.M1) Z.M2 = (AS.M2 - mean(AS.distr.M2)) / sd(AS.distr.M2) #Computing -log(P values) logP.M1 = -log(pnorm(-Z.M1)) logP.M2 = -log(pnorm(-Z.M2))
At this point the methods can be compared directly. In our case Method 2 shows higher values and should thus be preferred.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.