knitr::opts_chunk$set(echo = TRUE) packages = c("PItools", "downloader", "parallel", "R.utils", "PSICQUIC", "Biostrings", "BiocGenerics", "GenomeInfoDb", "GenomicRanges", "qvalue", "rtracklayer", "ggplot2") # install packages if not available if(mean(packages %in% names(installed.packages()[,"Package"])) != 1){ packages_to_install = packages[!packages %in% names(installed.packages()[,"Package"])] # specifying mirror is necessary for some Linux systems install.packages(packages_to_install, dependencies = T, repos = "http://mirrors.ebi.ac.uk/CRAN/") packages_to_install = packages[!packages %in% names(installed.packages()[,"Package"])] source("https://bioconductor.org/biocLite.R") biocLite(packages_to_install) devtools::install_github("vitkl/PItools", dependencies = T) } suppressPackageStartupMessages({ library(PItools) library(parallel) library(downloader) library(ggplot2) })
This article and accompanying slides cover fundamentals of working with PPI data and the problem of study bias in protein interactions networks.
This workshop covers the following topics:
0. Brief intro into R Markdown
1. The fundamentals of protein-protein interactions (PPI) and networks (lecture)
2. Getting PPI data into R (from IntAct, using PSICQUIC)
3. Finding publication (Pubmed) IDs for every protein
4. Why should you care about the study bias in your network and how to evaluate it
Learn more:
http://rmarkdown.rstudio.com/
Cheat sheet:
https://www.rstudio.com/wp-content/uploads/2015/02/rmarkdown-cheatsheet.pdf
Online course on Reproducible data analysis: https://www.coursera.org/learn/reproducible-research/lecture/5NzHN/r-markdown
Slides:
https://github.com/vitkl/PPI_biases_workshop/blob/master/PPI_networks_and_study_bias.pptx
The PowerPoint presentation for this lecture can be found in the project folder (PPI_networks_and_study_bias.pptx).
You can download protein-protein interaction data directly from IntAct or using PSICQUIC and PSICQUIC client implemented in PSICQUIC R package. Both ways are implemented in PItools package. Downloading data from IntAct gets you all data from all IMEx consortium databases. PSICQUIC route allows to query non-IMEx databases and construct complex search queries.
As implemented in PItools, PSICQUIC route is slow to download (limit of 2500 interactions per second to lower the load on servers). IntAct route is faster but requires memory intensive processing locally (3 GB table).
First, let's focus on the IntAct route. Later we will download molecular interactions from a large non-IMEx database called BioGRID.
By species search is taxonomy-hierarchy aware.
# find the date of the latest locally available release human = fullInteractome(taxid = 9606, database = "IntActFTP", # 9606 - human taxid clean = TRUE, protein_only = TRUE, directory = "./data_files/", # NULL to keep data files inside R library - default releaseORdate = NULL) # useful to keep track of the release date e.g. 2019Mar23, but for the first download set to NULL
10239 is the top viral clade identifier.
# read all IntAct data (filter later) IntAct = loadIntActFTP(dir = "./data_files/IntActRelease_2019Mar23/") # filter for viral-human interaction human_viral = interSpeciesInteractome(MITABdata = IntAct, taxid1 = 9606, taxid2 = 10239, database = "IntActFTP", clean = TRUE, protein_only = TRUE, directory = "./data_files/", releaseORdate = "2019Mar23") uniqueNinteractions(human_viral) uniqueNinteractors(human_viral, taxid = 9606) uniqueNinteractors(human_viral, taxid = 9606, inverse_filter = T) # retrieve all VirHostNet data # which does not understand taxonomy hierarhy so interSpeciesInteractome() will not work VirHostNet_raw = queryPSICQUICrlib(query = "taxid:9606", format = "tab27", database = "VirHostNet", directory = "./data_files/", releaseORdate = NULL) VirHostNet = interSpeciesInteractome(taxid1 = 9606, taxid2 = 10239, MITABdata = VirHostNet_raw, clean = TRUE, protein_only = TRUE, directory = "./data_files/") # filter out human-human and viral-viral interactions VirHostNet$data = VirHostNet$data[!Taxid_interactor_A == Taxid_interactor_B] uniqueNinteractions(VirHostNet) uniqueNinteractors(VirHostNet, taxid = 9606) uniqueNinteractors(VirHostNet, taxid = 9606, inverse_filter = T) # Find which interactions are uniquely present in VirHostNet (not IntAct) VirHostNet$data = VirHostNet$data[!pair_id %in% human_viral$data$pair_id] uniqueNinteractions(VirHostNet) uniqueNinteractors(VirHostNet, taxid = 9606) uniqueNinteractors(VirHostNet, taxid = 9606, inverse_filter = T)
# filter for viral-mouse interaction human_mouse = interSpeciesInteractome(taxid1 = 9606, taxid2 = 10090, database = "IntActFTP", MITABdata = IntAct, clean = TRUE, protein_only = TRUE, directory = "./data_files/", releaseORdate = "2019Mar23") # save RData file to save time processing this data if you want to proceed to later sections immediately rm(IntAct) save(list = ls(), file = "./RData_steps/study_bias_doc_1.RData")
You can find out any other taxid if you know species name using UniProt taxonomy search: https://www.uniprot.org/taxonomy/
# load previosly saved RData load("./RData_steps/study_bias_doc_1.RData") uniqueNinteractions(human) uniqueNinteractors(human) uniqueNinteractions(human_viral) uniqueNinteractors(human_viral) uniqueNinteractions(human_mouse) uniqueNinteractors(human_mouse) # how many of these proteins are human? uniqueNinteractors(human_mouse, taxid = 9606)
# subset two-hybrid interactions human_two_hybrid = subsetMITABbyMethod(MITABdata = human, Interaction_detection_methods = "MI:0018") uniqueNinteractions(human_two_hybrid) uniqueNinteractors(human_two_hybrid) # subset all interactions but two-hybrid human_NOT_two_hybrid = subsetMITABbyMethod(MITABdata = human, Interaction_detection_methods = "MI:0018", inverse_filter = T) uniqueNinteractions(human_NOT_two_hybrid) uniqueNinteractors(human_NOT_two_hybrid) # subset affinity purification - mass spectrometry interactions human_AP_MS = subsetMITABbyMethod(MITABdata = human, Interaction_detection_methods = "MI:0004", Identification_method = "MI:0433") uniqueNinteractions(human_AP_MS) uniqueNinteractors(human_AP_MS)
# subset both published and unpublished Vidal group data Vidal_all = subsetMITABbyPMIDs(MITABdata = human, PMIDs = c("25416956", "unassigned1304")) uniqueNinteractions(Vidal_all) uniqueNinteractors(Vidal_all) # subset Mattias Mann 2015 paper data Mann = subsetMITABbyPMIDs(MITABdata = human, PMIDs = "26496610") uniqueNinteractions(Mann) uniqueNinteractors(Mann)
You can get help and more details on these functions (for example, how to find which molecular ontology terms correspond to which methods): ?subsetMITABbyMethod
mediator_complex_proteins = fread("http://www.uniprot.org/uniprot/?query=GO:0016592%20AND%20taxonomy:9606&format=tab&columns=id") mediator_complex = subsetMITABbyID(Vidal_all, ID_seed = mediator_complex_proteins$Entry, within_seed = T, only_seed2nonseed = F) uniqueNinteractions(mediator_complex) uniqueNinteractors(mediator_complex)
mediator_complex_interactions = subsetMITABbyID(Vidal_all, ID_seed = mediator_complex_proteins$Entry, within_seed = F, only_seed2nonseed = T) uniqueNinteractions(mediator_complex_interactions) uniqueNinteractors(mediator_complex_interactions)
Let's retrieve all human-viral interactions from BioGrid database. But first we need to find correct database names.
client = PSICQUIC() providers(client)
# Don't run human_viral_BioGrid = interSpeciesInteractome(taxid1 = 9606, taxid2 = 10239, database = "BioGrid", clean = TRUE, protein_only = TRUE, directory = "./data_files/") human_BioGrid = fullInteractome(taxid = 9606, database = "BioGrid", # 9606 - human taxid clean = TRUE, protein_only = TRUE, directory = "./data_files/")
This can be useful in you need interactions for a small number of proteins or if you want to query non-IMEx databases. Note that queryPSICQUIC doesn't keep track of database version data, while queryPSICQUICrlib does.
# Query for interactions of bacterial RNA polymerase sigma factor SigA identified using two-hybrid methods in all imex databases queryPSICQUIC(query = "id:P74565 AND detmethod:\"MI:0018\"", format = "tab27", database = "imex", file = "./data_files/P74565_2H_interactions_imex_tab27.tsv") # Query for interactions of sigma factor SigA identified using two-hybrid methods in mentha (a database that aggregates data from all primary databases, but does no interaction predition) queryPSICQUIC(query = "id:P74565 AND detmethod:\"MI:0018\"", format = "tab25", database = "mentha", file = "./data_files/P74565_2H_interactions_mentha_tab25.tsv") # Query for interactions of sigma factor SigA in mentha queryPSICQUIC(query = "id:P74565", format = "tab25", database = "mentha", file = "./data_files/P74565_2H_interactions_mentha_tab25.tsv") # Retrieve interaction of any proteins encoded by Beta-adrenergic receptor kinase 1 gene (Entrez GeneID 156) from BioGRID (which recognises only this type of ID) queryPSICQUIC(query = "id:156", format = "tab25", database = "BioGrid", file = "./data_files/entrezgene156_interactions_BioGrid_tab25.tsv") # The function return the report of how many interaction were found in each database, not the data itself. Reading data into R. fread("./data_files/entrezgene156_interactions_BioGrid_tab25.tsv", header = T, stringsAsFactors = F)[1:5]
All the same operations can be done using function queryPSICQUICrlib but with the convienience of automatic tracking of database release date and the exact query text. This function also return the data in object of class RAW_MItab that after cleaned make data ready for use with other tools in the package.
BioGrid_156 = queryPSICQUICrlib(query = "id:156", format = "tab25", database = "BioGrid", directory = "./data_files/") # The same protein, but only two-hybrid interactions BioGrid_156_2H = queryPSICQUICrlib(query = "id:156 AND detmethod:\"MI:0018\"", format = "tab25", database = "BioGrid", directory = "./data_files/") # The data returned by queryPSICQUICrlib constains auxillary information that is not necessary for most analysis. Let's clean the data. cleanMITAB(BioGrid_156)[1:5]
There are many ways to find publications for a given protein (all proteins). We will discuss 3 and focus on one (UniProtKB references).
https://www.uniprot.org/help/publications_section#additional_bibliography:
"As a comprehensive and high-quality resource of protein sequence and functional information, UniProtKB strives to provide comprehensive literature citations associated with protein sequences and their characterization. Currently about 2 thirds of the UniProtKB PubMed citations are found in UniProtKB/Swiss-Prot, as a result of active integration in the course of manual curation.
In order to keep up with the explosive growth of literature and to give our users access to additional publications, we decided to integrate additional sources of literature from other annotated databases into UniProtKB. For this purpose we selected a number external databases, e.g. Entrez Gene (GeneRIFs), SGD, MGI, GAD and PDB, and extracted citations that were mapped to UniProtKB entries. This additional protein bibliography information helps our users to better explore the existing knowledge of their proteins of interest."
Europe PMC text-mining efforts yeld multiple identifier-publication associations. Details can be found here: https://europepmc.org/downloads. UniProtKB to pubmedID mapping can be downloaded from ftp: ftp://ftp.ebi.ac.uk/pub/databases/pmc/TextMinedTerms/
NCBI portal: references for entrez geneID. Similar to UniProtKB references, but different ID, documentation about how these links are maintained is provided here: https://www.ncbi.nlm.nih.gov/entrez/query/static/entrezlinks.html#gene. GeneID to pubmedID mapping can be downloaded from NCBI ftp: ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz
Let's download the list of NCBI PubMed identifiers (PubMed IDs) associated with the UniProtKB entry (human proteins) and computationally mapped to the UniProtKB entry (human proteins).
# retrieve human interactions (taxonomy:9606) from SwissProt, a manually reviewed part or the UniProt (reviewed:yes) url = "https://www.uniprot.org/uniprot/?query=taxonomy:9606&compress=yes&format=tab&columns=id,citation,citationmapping" filename.gz = "./data_files/uniprot2pubmed_human.tsv.gz" filename = "./data_files/uniprot2pubmed_human.tsv" if(!file.exists(filename.gz)) download(url, filename.gz) R.utils::gunzip(filename.gz, filename, remove = F, overwrite = T) # Read into R uniprot2pubmed_human = fread(filename, header = T, stringsAsFactors = F) # We need to do some processing ## 1. Merge manually associated and computationally mapped publications uniprot2pubmed_human[, PMIDs := paste0(`PubMed ID`,";",`Mapped PubMed ID`)] uniprot2pubmed_human[`PubMed ID` == "" & `Mapped PubMed ID` == "", PMIDs := ""] uniprot2pubmed_human[`PubMed ID` == "" & `Mapped PubMed ID` != "", PMIDs := `Mapped PubMed ID`] uniprot2pubmed_human[`PubMed ID` != "" & `Mapped PubMed ID` == "", PMIDs := `PubMed ID`] uniprot2pubmed_human$`PubMed ID` = NULL uniprot2pubmed_human$`Mapped PubMed ID` = NULL ## 2. Split concatenated PubMed IDs and count their number per protein uniprot2pubmed_human = uniprot2pubmed_human[, .(N_PMIDs = uniqueN(unlist(tstrsplit(PMIDs,";")))), by = Entry] # remove objects necessary for the next part, save rm(list = ls()[!ls() %in% c("human", "uniprot2pubmed_human", "Vidal_all", "Mann", "human_two_hybrid", "human_AP_MS")]) save(list = ls(), file = "./RData_steps/study_bias_doc_2.RData")
Back to the presentation, slide 61. Study bias is the phenomenon that causes certain properties of proteins to appear biologically meaningful but actually confounded with how well studies proteins are overall. Study bias may cause certain proteins to appear as hubs in the protein interaction network. This may lead to false conclusions regarding the importance of the protein in the network. Therefore, it is important to estimate and if possible correct the effect of study bias on the property of your interest.
Let's use the number of UniProt reference as a proxy for how well studied the protein is. We will bin all proteins that have interactions in our network by the number of publications.
load("./RData_steps/study_bias_doc_2.RData") uniprot2pubmed_human = uniprot2pubmed_human[order(N_PMIDs, decreasing = T)] uniprot2pubmed_human_w = uniprot2pubmed_human[Entry %in% extractInteractors(human)] # Most proteins have very low number of publications: hist(log10(uniprot2pubmed_human_w$N_PMIDs), xlab = "log10(number of publications) per protein") n_bins = 50 n_proteins = nrow(uniprot2pubmed_human_w) n_proteins_per_bin = round(n_proteins/n_bins) bins = rep(1:50, each = n_proteins_per_bin) if(n_proteins > length(bins)){ bins = c(bins, rep(50, n_proteins - length(bins))) } else if(n_proteins < length(bins)){ bins = bins[1:n_proteins] } uniprot2pubmed_human_w[, bins := bins] bin_lists = split(uniprot2pubmed_human_w$Entry, uniprot2pubmed_human_w$bins)
This method shows the gist of the problem of study bias but needs further improvement.
unique_interactions = unique(human$data[, .(IDs_interactor_A, IDs_interactor_B)]) N_interaction_table = lapply(1:length(bin_lists), function(bin_list1_N, unique_interactions, bin_lists){ N_interaction_table = lapply(1:length(bin_lists), function(bin_list2_N, bin_list1_N, unique_interactions, bin_lists) { bin_list1 = bin_lists[[bin_list1_N]] bin_list2 = bin_lists[[bin_list2_N]] N_proteins = uniqueN(unique_interactions[(IDs_interactor_A %in% bin_list1 & IDs_interactor_B %in% bin_list2) | (IDs_interactor_A %in% bin_list2 & IDs_interactor_B %in% bin_list1), c(IDs_interactor_A, IDs_interactor_B)]) N_interactions = unique_interactions[,sum((IDs_interactor_A %in% bin_list1 & IDs_interactor_B %in% bin_list2) | (IDs_interactor_A %in% bin_list2 & IDs_interactor_B %in% bin_list1))] data.table(N_interactions_per_protein = N_interactions / N_proteins, x = bin_list1_N, y = bin_list2_N, N_interactions = N_interactions, N_proteins = N_proteins) }, bin_list1_N, unique_interactions, bin_lists) Reduce(rbind, N_interaction_table) }, unique_interactions, bin_lists) N_interaction_table = Reduce(rbind, N_interaction_table)
cols = c(colorRampPalette(c("white", "#95cbee", "#0099dc", "#4ab04a", "#ffd73e"), bias = 2)(10), #"#c9e2f6" colorRampPalette(c("#eec73a", "#e29421", "#e29421", "#f05336","#ce472e"))(20)) ggplot(N_interaction_table, aes(x, y)) + geom_raster(aes(fill = N_interactions_per_protein)) + xlab("rank by number of studies") + ylab("rank by number of studies") + theme_bw() + scale_fill_gradientn(colours=cols, na.value=rgb(246, 246, 246, max=255), guide=guide_colourbar(ticks=T, nbin=50, barheight=15, label=T))
First, let's put what we just did into a function.
interactionBias = function(ppi_network, uniprot2pubmed, n_bins = 50){ uniprot2pubmed = copy(uniprot2pubmed) uniprot2pubmed = uniprot2pubmed[order(N_PMIDs, decreasing = T)] uniprot2pubmed = uniprot2pubmed[Entry %in% extractInteractors(human)] n_proteins = nrow(uniprot2pubmed) n_proteins_per_bin = round(n_proteins/n_bins) bins = rep(1:n_bins, each = n_proteins_per_bin) if(n_proteins > length(bins)){ bins = c(bins, rep(n_bins, n_proteins - length(bins))) } else if(n_proteins < length(bins)){ bins = bins[1:n_proteins] } uniprot2pubmed[, bins := bins] bin_lists = split(uniprot2pubmed$Entry, uniprot2pubmed$bins) unique_interactions = unique(ppi_network$data[, .(IDs_interactor_A, IDs_interactor_B)]) N_interaction_table = lapply(1:length(bin_lists), function(bin_list1_N, unique_interactions, bin_lists){ N_interaction_table = lapply(1:length(bin_lists), function(bin_list2_N, bin_list1_N, unique_interactions, bin_lists) { bin_list1 = bin_lists[[bin_list1_N]] bin_list2 = bin_lists[[bin_list2_N]] N_proteins = uniqueN(unique_interactions[(IDs_interactor_A %in% bin_list1 & IDs_interactor_B %in% bin_list2) | (IDs_interactor_A %in% bin_list2 & IDs_interactor_B %in% bin_list1), c(IDs_interactor_A, IDs_interactor_B)]) N_interactions = unique_interactions[,sum((IDs_interactor_A %in% bin_list1 & IDs_interactor_B %in% bin_list2) | (IDs_interactor_A %in% bin_list2 & IDs_interactor_B %in% bin_list1))] data.table(N_interactions_per_protein = N_interactions / N_proteins, x = bin_list1_N, y = bin_list2_N, N_interactions = N_interactions, N_proteins = N_proteins) }, bin_list1_N, unique_interactions, bin_lists) Reduce(rbind, N_interaction_table) }, unique_interactions, bin_lists) N_interaction_table = Reduce(rbind, N_interaction_table) return(N_interaction_table) }
N_interaction_table[, title := "All IntAct data"] N_interaction_Mann = interactionBias(ppi_network = Mann, uniprot2pubmed = uniprot2pubmed_human, n_bins = 50) N_interaction_Mann[, title := "Mattias Mann unbiased AP-MS dataset"] N_interaction_Vidal = interactionBias(ppi_network = Vidal_all, uniprot2pubmed = uniprot2pubmed_human, n_bins = 50) N_interaction_Vidal[, title := "Marc Vidal unbiased two-hybrid dataset"] N_interaction_two_hybrid = interactionBias(ppi_network = human_two_hybrid, uniprot2pubmed = uniprot2pubmed_human, n_bins = 50) N_interaction_two_hybrid[, title := "all two-hybrid data"] N_interaction_AP_MS = interactionBias(ppi_network = human_AP_MS, uniprot2pubmed = uniprot2pubmed_human, n_bins = 50) N_interaction_AP_MS[, title := "all AP-MS data"] N_interaction = Reduce(rbind, list(N_interaction_Mann, N_interaction_Vidal, N_interaction_two_hybrid, N_interaction_AP_MS, N_interaction_table)) ggplot(N_interaction, aes(x, y)) + geom_raster(aes(fill = N_interactions_per_protein)) + xlab("rank by number of studies") + ylab("rank by number of studies") + facet_wrap( ~ title, ncol = 2) + theme_bw() + scale_fill_gradientn(colours=cols, na.value=rgb(246, 246, 246, max=255), guide=guide_colourbar(ticks=T, nbin=50, barheight=20, label=T))
Sys.Date() devtools::session_info()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.