knitr::opts_chunk$set( collapse = TRUE, comment = "#>", crop = NULL ## Related to https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016656.html )
library(BioPlex) library(AnnotationDbi) library(AnnotationHub) library(graph)
Connect to AnnotationHub:
ah <- AnnotationHub::AnnotationHub()
Connect to ExperimentHub:
eh <- ExperimentHub::ExperimentHub()
OrgDb package for human:
orgdb <- AnnotationHub::query(ah, c("orgDb", "Homo sapiens")) orgdb <- orgdb[[1]] orgdb keytypes(orgdb)
Get core set of complexes:
core <- getCorum(set = "core", organism = "Human")
Turn the CORUM complexes into a list of graph instances, where all nodes of a complex are connected to all other nodes of that complex with undirected edges.
core.glist <- corum2graphlist(core, subunit.id.type = "UNIPROT")
Identify complexes that have a subunit of interest:
has.cdk2 <- hasSubunit(core.glist, subunit = "CDK2", id.type = "SYMBOL")
Check the answer:
table(has.cdk2) cdk2.glist <- core.glist[has.cdk2] lapply(cdk2.glist, function(g) unlist(graph::nodeData(g, attr = "SYMBOL")))
We can then also inspect the graph with plotting utilities from the Rgraphviz package:
plot(cdk2.glist[[1]], main = names(cdk2.glist)[1])
Get the latest version of the 293T PPI network:
bp.293t <- getBioPlex(cell.line = "293T", version = "3.0")
Turn the BioPlex PPI network into one big graph where bait and prey relationship are represented by directed edges from bait to prey.
bp.gr <- bioplex2graph(bp.293t)
Now we can also easily pull out a BioPlex subnetwork for a CORUM complex of interest:
n <- graph::nodes(cdk2.glist[[1]]) bp.sgr <- graph::subGraph(n, bp.gr) bp.sgr
Add PFAM domain annotations to the node metadata:
bp.gr <- BioPlex::annotatePFAM(bp.gr, orgdb)
Create a map from PFAM to UNIPROT:
unip2pfam <- graph::nodeData(bp.gr, graph::nodes(bp.gr), "PFAM") pfam2unip <- stack(unip2pfam) pfam2unip <- split(as.character(pfam2unip$ind), pfam2unip$values) head(pfam2unip, 2)
Let's focus on PF02023, corresponding to the zinc finger-associated SCAN domain. For each protein containing the SCAN domain, we now extract PFAM domains connected to the SCAN domain by an edge in the BioPlex network.
scan.unip <- pfam2unip[["PF02023"]] getIAPfams <- function(n) graph::nodeData(bp.gr, graph::edges(bp.gr)[[n]], "PFAM") unip2iapfams <- lapply(scan.unip, getIAPfams) unip2iapfams <- lapply(unip2iapfams, unlist) names(unip2iapfams) <- scan.unip
Looking at the top 5 PFAM domains most frequently connected to the SCAN domain by an edge in the BioPlex network ...
pfam2iapfams <- unlist(unip2iapfams) sort(table(pfam2iapfams), decreasing = TRUE)[1:5]
... we find PF02023, the SCAN domain itself, and PF00096, a C2H2 type zinc finger domain. This finding is consistent with results reported in the BioPlex 3.0 publication.
See also the PFAM domain-domain association analysis vignette for a more comprehensive analysis of PFAM domain associations in the BioPlex network.
Genomic data from whole-genome sequencing for six different lineages of the human embryonic kidney HEK293 cell line can be obtained from hek293genome.org.
This includes copy number variation (CNV) data for the 293T cell line. Available CNV tracks include (i) CNV regions inferred from sequencing read-depth analysis, and (ii) CNV regions inferred from Illumina SNP arrays.
Here, we check agreement between inferred copy numbers from both assay types.
We start by obtaining genomic coordinates and copy number scores from the sequencing track ...
cnv.hmm <- getHEK293GenomeTrack(track = "cnv.hmm", cell.line = "293T") cnv.hmm
... and from the SNP track.
cnv.snp <- getHEK293GenomeTrack(track = "cnv.snp", cell.line = "293T") cnv.snp
We reduce the check for agreement between both CNV tracks by transferring copy numbers to overlapping genes, and subsequently, assess the agreement between the resulting gene copy numbers for both tracks.
As the genomic coordinates from both CNV tracks is based on the hg18 human genome assembly, we obtain gene coordinates for hg18 from AnnotationHub:
AnnotationHub::query(ah, c("TxDb", "Homo sapiens")) txdb <- ah[["AH52257"]] gs <- GenomicFeatures::genes(txdb) gs
We then transfer SNP-inferred copy numbers to genes by overlap ...
olaps <- GenomicRanges::findOverlaps(gs, cnv.snp) joined <- gs[S4Vectors::queryHits(olaps)] joined$score <- cnv.snp$score[S4Vectors::subjectHits(olaps)] joined
... and, analogously, transfer sequencing-inferred copy numbers to genes by overlap.
olaps <- GenomicRanges::findOverlaps(gs, cnv.hmm) joined2 <- gs[S4Vectors::queryHits(olaps)] joined2$score <- cnv.hmm$score[S4Vectors::subjectHits(olaps)] joined2
We then restrict both tracks to common genes.
isect <- intersect(names(joined), names(joined2)) joined <- joined[isect] joined2 <- joined2[isect]
Now, can assess agreement by testing the correlation between SNP-inferred gene copy numbers and the corresponding sequencing-inferred gene copy numbers.
cor(joined$score, joined2$score) cor.test(joined$score, joined2$score)
We also inspect the correlation via a boxplot.
spl <- split(joined2$score, joined$score) boxplot(spl, xlab = "SNP-inferred copy number", ylab = "sequencing-inferred copy number") rho <- cor(joined$score, joined2$score) rho <- paste("cor", round(rho, digits=3), sep=" = ") p <- cor.test(joined$score, joined2$score) p <- p$p.value p <- " p < 2.2e-16" legend("topright", legend=c(rho, p))
Get RNA-seq data for HEK293 cells from GEO: GSE122425
se <- getGSE122425() se
Inspect expression of prey genes:
bait <- unique(bp.293t$SymbolA) length(bait) prey <- unique(bp.293t$SymbolB) length(prey)
ind <- match(prey, rowData(se)$SYMBOL) par(las = 2) boxplot(log2(assay(se, "rpkm") + 0.5)[ind,], names = se$title, ylab = "log2 RPKM")
How many prey genes are expressed (raw read count > 0) in all 3 WT reps:
# background: how many genes in total are expressed in all three WT reps gr0 <- rowSums(assay(se)[,1:3] > 0) table(gr0 == 3) # prey: expressed in all three WT reps table(gr0[ind] == 3) # prey: expressed in at least one WT rep table(gr0[ind] > 0)
Are prey genes overrepresented in the expressed genes?
exprTable <- matrix(c(9346, 1076, 14717, 32766), nrow = 2, dimnames = list(c("Expressed", "Not.expressed"), c("In.prey.set", "Not.in.prey.set"))) exprTable
Test using hypergeometric test (i.e. one-sided Fisher's exact test):
fisher.test(exprTable, alternative = "greater")
Alternatively: permutation test, i.e. repeatedly sample number of prey genes from the background, and assess how often we have as many or more than 9346 genes expressed:
permgr0 <- function(gr0, nr.genes = length(prey)) { ind <- sample(seq_along(gr0), nr.genes) sum(gr0[ind] == 3) }
perms <- replicate(permgr0(gr0), 1000) summary(perms) (sum(perms >= 9346) + 1) / 1001
Check which genes turn up most frequently as prey:
prey.freq <- sort(table(bp.293t$SymbolB), decreasing = TRUE) preys <- names(prey.freq) prey.freq <- as.vector(prey.freq) names(prey.freq) <- preys head(prey.freq) summary(prey.freq) hist(prey.freq, breaks = 50, main = "", xlab = "Number of PPIs", ylab = "Number of genes")
Prey genes are involved in r round(mean(as.vector(prey.freq)))
PPIs on average.
There doesn't seem to be a strong correlation between expression level and the frequency of gene to turn up as prey:
ind <- match(names(prey.freq), rowData(se)$SYMBOL) rmeans <- rowMeans(assay(se, "rpkm")[ind, 1:3]) log.rmeans <- log2(rmeans + 0.5) par(pch = 20) plot( x = prey.freq, y = log.rmeans, xlab = "prey frequency", ylab = "log2 RPKM") cor(prey.freq, log.rmeans, use = "pairwise.complete.obs")
See also the BioNet maximum scoring subnetwork analysis vignette for a more comprehensive analysis of the 293T transcriptome data from GSE122425 when mapped onto BioPlex PPI network.
Get the relative protein expression data comparing 293T and HCT116 cells from Supplementary Table S4A of the BioPlex 3 paper:
bp.prot <- getBioplexProteome() bp.prot rowData(bp.prot)
A couple of quick sanity checks:
rowSums(assay(bp.prot)[1:5,])
rowData
column log2ratio
corresponds to the mean of the five HEK samples,
divided by the mean of the five HCT samples (and then taking log2 of it):ratio <- rowMeans(assay(bp.prot)[1:5, 1:5]) / rowMeans(assay(bp.prot)[1:5, 6:10]) log2(ratio)
rowData
column adj.pvalue
stores Benjamini-Hochberg adjusted p-values
from a t-test between the five HEK samples and the five HCT samples:t.test(assay(bp.prot)[1, 1:5], assay(bp.prot)[1, 6:10])
The Transcriptome-Proteome analysis vignette also explores the agreement between differential gene expression and differential protein expression when comparing HEK293 against HCT116 cells.
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.