# Global knitr chunk options for the whole vignette: echo chunk source,
# collapse source and output into one block, prefix output lines with "#>".
knitr::opts_chunk$set(
    echo = TRUE,
    collapse = TRUE,
    comment = "#>"
)
suppressMessages(library(synaptome.db))
suppressMessages(library(dplyr))
library(pander)
library(ggplot2)

Introduction

58 synaptic proteomic datasets published between 2000 and 2022, which together describe over 8,000 proteins, were integrated and combined with direct protein-protein interactions and functional metadata to build a network resource.

The set includes 29 post synaptic proteome (PSP) studies (2000 to 2019) contributing a total of 5,560 mouse and human unique gene identifiers; 19 presynaptic studies (2004 to 2022) describe 2,772 unique human and mouse gene IDs, and 11 studies that span the whole synaptosome and report 7,198 unique genes.

NB: With the latest update of the database we have added 6 new studies (1 presynaptic and 5 postsynaptic), a new compartment (Synaptic Vesicle), and coding mutations for Autism Spectrum Disorder (ASD) and epilepsy (Epi).

To reconstruct protein-protein interaction (PPI) networks for the pre- and post-synaptic proteomes we used human PPI data filtered for the highest confidence direct and physical interactions from BioGRID, Intact and DIP. The resulting postsynaptic proteome (PSP) network contains 4,817 nodes and 27,788 edges in the Largest Connected Component (LCC). The presynaptic network is significantly smaller and comprises 2,221 nodes and 8,678 edges in the LCC.

The database includes: proteomic and interactomic data with supporting information on compartment, species and brain region; GO function information for three species (mouse, rat and human); disease annotation for human (based on the Human Disease Ontology (HDO)); and the GeneToModel table, which links certain synaptic proteins to existing computational models of synaptic plasticity and synaptic signal transduction.

The original files are maintained at Edinburgh DataShare https://doi.org/10.7488/ds/3017. The updated database file can be found here: https://doi.org/10.7488/ds/3771

The dataset was described in the paper:

  1. Sorokina, O., Mclean, C., Croning, M.D.R. et al.
    A unified resource and configurable model of the synapse proteome and its role in disease. Sci Rep 11, 9967 (2021). https://doi.org/10.1038/s41598-021-88945-7.

Database content overview

The following diagrams illustrate the content of the database available for analysis by the package functions. The source code for constructing the figures is available in the Appendix, in the statistics code section.

Presynaptic datasets

# Genes found one or more times in compartment-paper pairs.
# `gp` is reused by the postsynaptic, synaptic vesicle and brain region chunks.
gp <- findGeneByCompartmentPaperCnt(1)

# presynaptic stats
presgp <- gp[gp$Localisation == "Presynaptic", ]
syngp <- gp[gp$Localisation == "Synaptosome", ]
presg <- getGeneInfoByIDs(presgp$GeneID)
mpres <- merge(presgp, presg, by = c("GeneID", "Localisation"))
mmpres <- mpres[, c("GeneID",
                    "HumanEntrez.x",
                    "HumanName.x",
                    "Npmid",
                    "PaperPMID",
                    "Paper",
                    "Year")]

# Keep only the records whose paper is itself listed as presynaptic.
# `papers` is reused by the following chunks.
papers <- getPapers()
prespap <- papers[papers$Localisation == "Presynaptic", ]
mmmpres <- mmpres[mmpres$PaperPMID %in% prespap$PaperPMID, ]

# Bin the identification count into ordered classes. cut() is vectorised, so
# unlike a `for (i in 1:nrow)` loop it also works on a zero-row table.
# Left-closed intervals: [1,2) -> '1', [2,4) -> '2-3', [4,10) -> '4-9',
# [10,Inf) -> '>=10' (the last class includes exactly 10, hence '>=').
mmmpres$found <- cut(mmmpres$Npmid,
                     breaks = c(1, 2, 4, 10, Inf),
                     labels = c("1", "2-3", "4-9", ">=10"),
                     right = FALSE,
                     ordered_result = TRUE)

# Order the papers by the number that follows the underscore in the paper
# label (presumably the publication year -- confirm against getPapers()).
tp <- unique(mmmpres$Paper)
tp_num <- as.numeric(sub("^[^0-9]+_([0-9]+)", "\\1", tp))
mmmpres$Paper <- factor(mmmpres$Paper,
                        levels = tp[order(tp_num)],
                        ordered = TRUE)

# One row per gene/paper/class, then one stacked bar per paper.
ummpres <- unique(mmmpres[, c("GeneID", "Paper", "found")])
ggplot(ummpres) + geom_bar(aes(y = Paper, fill = found))

Postsynaptic datasets

# postsynaptic stats
# Relies on `gp` and `papers` created in the presynaptic chunk above.
pstgp <- gp[gp$Localisation == "Postsynaptic", ]
postg <- getGeneInfoByIDs(pstgp$GeneID)
mpost <- merge(pstgp, postg, by = c("GeneID", "Localisation"))
mmpost <- mpost[, c("GeneID",
                    "HumanEntrez.x",
                    "HumanName.x", "Npmid",
                    "PaperPMID", "Paper", "Year")]

# Keep only the records whose paper is itself listed as postsynaptic.
postspap <- papers[papers$Localisation == "Postsynaptic", ]
mmmpost <- mmpost[mmpost$PaperPMID %in% postspap$PaperPMID, ]

# Bin the identification count into ordered classes. cut() is vectorised, so
# unlike a `for (i in 1:nrow)` loop it also works on a zero-row table.
# Left-closed intervals: [1,2) -> '1', [2,4) -> '2-3', [4,10) -> '4-9',
# [10,Inf) -> '>=10' (the last class includes exactly 10, hence '>=').
mmmpost$found <- cut(mmmpost$Npmid,
                     breaks = c(1, 2, 4, 10, Inf),
                     labels = c("1", "2-3", "4-9", ">=10"),
                     right = FALSE,
                     ordered_result = TRUE)

# Order the papers by the number that follows the underscore in the paper
# label (presumably the publication year -- confirm against getPapers()).
tp <- unique(mmmpost$Paper)
tp_num <- as.numeric(sub("^[^0-9]+_([0-9]+)", "\\1", tp))
mmmpost$Paper <- factor(mmmpost$Paper,
                        levels = tp[order(tp_num)],
                        ordered = TRUE)

# One row per gene/paper/class, then one stacked bar per paper.
ummpos <- unique(mmmpost[, c("GeneID", "Paper", "found")])
ggplot(ummpos) + geom_bar(aes(y = Paper, fill = found))

Synaptic Vesicle

# synaptic vesicle stats
# Relies on `gp` and `papers` created in the presynaptic chunk above.
# Vesicle-specific names are used here so that the postsynaptic objects are
# not overwritten with synaptic vesicle data.
svgp <- gp[gp$Localisation == "Synaptic_Vesicle", ]
svg <- getGeneInfoByIDs(svgp$GeneID)
msv <- merge(svgp, svg, by = c("GeneID", "Localisation"))

# Records whose Dataset is not 'FULL' get the '_SVR' suffix on the paper label
# so that they are drawn as a separate bar.
msv$Paper <- paste0(msv$Paper, ifelse("FULL" == msv$Dataset, "", "_SVR"))
mmsv <- msv[, c("GeneID", "HumanEntrez.x", "HumanName.x", "Npmid",
                "PaperPMID", "Paper", "Year")]

# Keep only the records whose paper is itself listed as synaptic vesicle.
svpap <- papers[papers$Localisation == "Synaptic_Vesicle", ]
mmmsv <- mmsv[mmsv$PaperPMID %in% svpap$PaperPMID, ]

# Bin the identification count into ordered classes. cut() is vectorised, so
# unlike a `for (i in 1:nrow)` loop it also works on a zero-row table.
# Left-closed intervals: [1,2) -> '1', [2,4) -> '2-3', [4,6) -> '4-5',
# [6,Inf) -> '>=6' (the last class includes exactly 6, hence '>=').
mmmsv$found <- cut(mmmsv$Npmid,
                   breaks = c(1, 2, 4, 6, Inf),
                   labels = c("1", "2-3", "4-5", ">=6"),
                   right = FALSE,
                   ordered_result = TRUE)

# Order the papers by the number that follows the underscore in the paper
# label (presumably the publication year); any trailing suffix such as
# '_SVR' is dropped by the pattern before the conversion to numeric.
tp <- unique(mmmsv$Paper)
tp_num <- as.numeric(sub("^[^0-9]+_([0-9]+)_?.*", "\\1", tp))
mmmsv$Paper <- factor(mmmsv$Paper,
                      levels = tp[order(tp_num)],
                      ordered = TRUE)

# One row per gene/paper/class, then one stacked bar per paper.
ummsv <- unique(mmmsv[, c("GeneID", "Paper", "found")])
g <- ggplot(ummsv) + geom_bar(aes(y = Paper, fill = found))
g

Brain region distribution

# brain region statistics
# Relies on `gp` created in the presynaptic chunk above.
totg <- getGeneInfoByIDs(gp$GeneID)
mtot <- merge(gp, totg, by = c("GeneID", "Localisation"))
mmptot <- mtot[, c("GeneID",
                   "HumanEntrez.x",
                   "HumanName.x",
                   "Localisation",
                   "Npmid",
                   "Paper",
                   "BrainRegion")]

# One row per gene / brain region / compartment combination.
untot <- unique(mmptot[, c("GeneID", "BrainRegion", "Localisation")])

# One fill colour per compartment present in the data. seq_len() yields an
# empty selection when there are no compartments, where 1:0 would pick one.
loccolors <- c("#3D5B59", "#B5E5CF", "#FCB5AC", "#B99095")
n_loc <- length(unique(untot$Localisation))
loccolors <- loccolors[seq_len(n_loc)]

# Stacked bars: genes per brain region, split by compartment.
ggplot(untot) + geom_bar(aes(y = BrainRegion, fill = Localisation)) +
    scale_fill_manual(values = loccolors)

# Same counts as a compartment-by-region contingency table in long format,
# drawn as side-by-side bars.
m <- table(untot$Localisation, untot$BrainRegion)
udf <- as.data.frame(m)
names(udf) <- c("Localisation", "BrainRegion", "value")
ggplot(udf, aes(fill = Localisation, y = value, x = BrainRegion)) +
    geom_bar(position = "dodge", stat = "identity") +
    scale_fill_manual(values = loccolors) +
    theme(axis.text.x = element_text(face = "plain",
                                     color = "#993333",
                                     angle = 45, vjust = 1,
                                     hjust = 1, size = rel(1.5)))

Overview of capabilities

1.Get information for a specific gene or gene set.

The dataset can be used to answer frequent questions such as “What is known about my favourite gene? Is it pre- or postsynaptic? Which brain region was it identified in?”, "Which publication it was reported in?" Information could be obtained by submitting gene EntrezID or Gene name

# Look up a single gene by Entrez ID.
info_by_entrez <- getGeneInfoByEntrez(1742)
pander(head(info_by_entrez))

# Look up a single gene by name.
info_by_name <- getGeneInfoByName("CASK")
pander(head(info_by_name))

# Look up several genes by name in one call.
info_by_names <- getGeneInfoByName(c("CASK", "DLG2"))
pander(head(info_by_names))

2.Get internal GeneIDs for the list of genes.

Obtaining Internal database GeneIDs is a useful intermediate step for more complex queries, including those for building protein-protein interaction (PPI) networks for compartments and brain regions. The Internal GeneID is species-neutral and unique, which allows exact identification of the object of interest in case of redundancy (e.g. one human gene matching several mouse genes, etc.).

# Internal GeneIDs for a set of Entrez IDs.
entrez_ids <- c(1742, 1741, 1739, 1740)
ids_by_entrez <- findGenesByEntrez(entrez_ids)
pander(head(ids_by_entrez))

# Internal GeneIDs for a set of gene names.
gene_names <- c("SRC", "SRCIN1", "FYN")
ids_by_name <- findGenesByName(gene_names)
pander(head(ids_by_name))

3.Get disease information for the gene set

Synaptic genes are annotated with disease information from the Human Disease Ontology, where available. To get disease information one can submit a list of Human Entrez IDs or human gene names; a list of Internal GeneIDs can also be used with the getGeneDiseaseByIDs function.

# Disease annotation for a set of gene names.
disease_by_name <- getGeneDiseaseByName(c("CASK", "DLG2", "DLG1"))
pander(head(disease_by_name))

# Disease annotation for a set of Entrez IDs.
disease_by_entrez <- getGeneDiseaseByEntres(c(8573, 1742, 1739))
pander(head(disease_by_entrez))

5.Get information about the studies, combined into dataset

One can obtain an overview of the synaptic proteome papers combined into the database, which includes the paper PMID, species Tax ID, year of publication, subcellular localisation, brain region and number of proteins identified in the paper. This information may help to choose the specific study(ies) for further work.

# Table of all studies in the database; `p` is reused in a later chunk to
# select studies by PMID.
p <- getPapers()
first_papers <- head(p)
pander(first_papers)

6.Get the table of frequently identified proteins

It is also possible to obtain the list of proteins found in more than one study, for the whole synaptic proteome. For that, the user needs to provide a "count" value as a desired minimal number of identifications (e.g. 2 or more). The command returns the table with gene identifiers and "Npmid" column, which contains the number of studies where this gene was identified.

# All proteins of the synaptic proteome identified two or more times.
# Note: this replaces the `gp` table built for the overview figures.
gp <- findGeneByPaperCnt(cnt = 2)
first_frequent <- head(gp)
pander(first_frequent)

7.Get the table of proteins identified in specific studies

Following section 5, when the information for all considered proteomic studies was obtained, user can select specific study(ies) by PMID and get the proteins identified in those studies. By providing count value user can extract either all proteins from specified studies (count = 1), or just frequently found ones (count >=2). As above, the command returns the table with gene identifiers with Npmid column, which contains the number of studies where this protein was identified

# Proteins reported by the first five studies of the papers table `p`
# (cnt = 1 keeps every protein, not only the frequently found ones).
selected_pmids <- p$PaperPMID[1:5]
spg <- findGeneByPapers(selected_pmids, cnt = 1)
pander(head(spg))

8.Get the table of proteins frequently identified in specific compartment

Most of the time, the user is interested in a specific compartment rather than in the total synaptic proteome. To help identify the genes most probably residing in the specific compartment and exclude possible contaminants,
the findGeneByCompartmentPaperCnt function provides the table of proteins found cnt or more times in different compartment-paper pairs.

# Proteins found two or more times in compartment-paper pairs.
gcp <- findGeneByCompartmentPaperCnt(cnt = 2)
pander(head(gcp))

# Restrict the frequently found proteins to one compartment of interest.
# `presgp` is reused by the PPI chunk that follows.
is_presynaptic <- gcp$Localisation == "Presynaptic"
presgp <- gcp[is_presynaptic, ]
dim(presgp)
pander(head(presgp))

9.Get PPI interactions for my list of genes

Custom protein-protein interactions based on bespoke subsets of molecules can be extracted in two general ways: "induced" and "limited". In the first case, the command will return all possible interactors for the genes within the whole interactome. In the second case, it will return only interactions between the genes of interest. PPIs can be obtained by submitting a list of EntrezIDs, gene names or Internal IDs - in all cases the interactions will be returned as a list of interacting pairs of Internal GeneIDs.

# Interactions among the listed genes only ("limited").
query_names <- c("CASK", "DLG4", "GRIN2A", "GRIN2B", "GRIN1")
ppi_by_name <- getPPIbyName(query_names, type = "limited")
pander(head(ppi_by_name))

# All interactions of the listed genes within the interactome ("induced").
ppi_by_entrez <- getPPIbyEntrez(c(1739, 1740, 1742, 1741), type = "induced")
pander(head(ppi_by_entrez))

# PPIs for the genes frequently found in the presynaptic compartment
# (`presgp` comes from the previous chunk).
ppi_presynaptic <- getPPIbyEntrez(presgp$HumanEntrez, type = "induced")
pander(head(ppi_presynaptic))

10.Get the molecular structure of synaptic compartment

Three main synaptic compartments considered in the database are "presynaptic", "postsynaptic" and "synaptosome". Genes are classified to compartments based on respective publications, so that each gene can belong to one or two, or even three compartments. The full list of genes for specific compartment could be obtained with command getAllGenes4Compartment, which returns the table with main gene identifiers, like internal GeneIDs, MGI ID, Human Entrez ID, Human Gene Name, Mouse Entrez ID, Mouse Gene Name, Rat Entrez ID, Rat Gene Name.

If you need to check which genes of your list belong to specific compartment, you can use getGenes4Compartment command, which will select from your list only genes associated with specific compartment. To obtain the PPI network for compartment one has to submit the list of Internal GeneIDs obtained with previous commands.

# List of compartments known to the database.
comp <- getCompartments()
pander(comp)

# All genes of the postsynaptic compartment (compartment ID 1).
gns <- getAllGenes4Compartment(compartmentID = 1)
pander(head(gns))

# Full ("induced") PPI network for the postsynaptic compartment.
ppi <- getPPIbyIDs4Compartment(
    gns$GeneID,
    compartmentID = 1,
    type = "induced")
pander(head(ppi))

11.Get the molecular structure of the brain region.

There are 12 brain regions considered in the database based on the respective publications, so that each gene can belong to a single brain region or to several. Brain regions differ between species, and species-specific brain region information is not 100% covered in the database (e.g. we don't yet have studies for Human Striatum, but do have them for Mouse and Rat); that's why, when querying the database for brain region information, you will need to specify the species. The full list of genes for a specific region can be obtained with the command getAllGenes4BrainRegion, which returns the table with the main gene identifiers, like internal Gene IDs, MGI ID, Human Entrez ID, Human Gene Name, Mouse Entrez ID, Mouse Gene Name, Rat Entrez ID, Rat Gene Name.

If you need to check which genes of your list were identified in specific region, you can use getGenes4BrainRegion command, which will select only genes associated with specific region from your list.

To obtain the PPI network for brain region you need to submit the list of Internal GeneIDs obtained with previous commands.

# Full list of brain regions known to the database.
reg <- getBrainRegions()
pander(reg)

# All genes reported for mouse (taxID 10090) Striatum.
gns <- getAllGenes4BrainRegion(brainRegion = "Striatum", taxID = 10090)
pander(head(gns))

# PPI network restricted ("limited") to the mouse Striatum genes.
ppi <- getPPIbyIDs4BrainRegion(gns$GeneID,
                               brainRegion = "Striatum",
                               taxID = 10090,
                               type = "limited")
pander(head(ppi))

12.Checking third-party list against Synaptic Proteome db

# Check which of the Entrez IDs 1 to 250 are present in the database.
candidate_entrez <- 1:250
listG <- findGenesByEntrez(candidate_entrez)
dim(listG)
head(listG)

# Of the genes found above, which are presynaptic? The compartment table is
# printed first so the meaning of compartment ID 2 can be read from it.
getCompartments()
presG <- getGenes4Compartment(listG$GeneID, 2)
dim(presG)
head(presG)

# Of the genes found above, which were identified in Cerebellum for
# taxID 10090 (the taxID used for mouse elsewhere in this vignette)?
getBrainRegions()
listR <- getGenes4BrainRegion(
    listG$GeneID,
    brainRegion = "Cerebellum",
    taxID = 10090)
dim(listR)
head(listR)

13.Visualisation of PPI network with igraph.

Combine information from PPI data.frame obtained with functions like getPPIbyName, getPPIbyEntrez, getPPIbyIDs4Compartment or getPPIbyIDs4BrainRegion with information about genes obtained from getGenesByID to make interpretable undirected PPI graph in igraph format. In this format network could be further analysed and visualized by algorithms from igraph package.

library(igraph)

# Interactions among the six listed Internal GeneIDs only. The type is spelled
# out as "limited", as in every other call in this vignette, instead of
# relying on the abbreviation 'lim' being resolved by partial matching.
ppi_small <- getPPIbyIDs(c(48, 129, 975, 4422, 5715, 5835), type = "limited")

# Convert the PPI table to an igraph object and label vertices with the
# RatName vertex attribute.
g <- getIGraphFromPPI(ppi_small)
plot(g, vertex.label = V(g)$RatName, vertex.size = 25)

14.Export of PPI network as a table.

If igraph is not an option, the PPI network can be exported as an interpretable table to be processed with other tools, e.g. Cytoscape, etc.

# Interactions among three Internal GeneIDs, converted to a plain table.
ppi_for_export <- getPPIbyIDs(c(48, 585, 710), type = "limited")
tbl <- getTableFromPPI(ppi_for_export)
tbl

References

  1. Sorokina, O., Mclean, C., Croning, M.D.R. et al.
    A unified resource and configurable model of the synapse proteome and its role in disease. Sci Rep 11, 9967 (2021). https://doi.org/10.1038/s41598-021-88945-7.

Appendix {.tabset}

Statistics graph code {#statistics .unnumbered}






Session Info

# Session details; stored under a name that does not shadow base::c().
si <- devtools::session_info()

# Platform fields as a one-column table (fields become rows after t()).
platform_tbl <- t(data.frame(c(si$platform)))
pander::pander(platform_tbl)

# Package table without columns 4, 5, 10 and 11.
package_tbl <- as.data.frame(si$packages)[, -c(4, 5, 10, 11)]
pander::pander(package_tbl)


lptolik/synaptome.db documentation built on Sept. 13, 2023, 2:50 p.m.