In ccb-hms/BioPlexAnalysis: Applications and downstream analysis of BioPlex PPI data

Setup

library(BioPlex)
library(AnnotationDbi)
library(AnnotationHub)
library(ExperimentHub)
library(ExpressionAtlas)
library(GenomicFeatures)
library(edgeR)
library(ggplot2)
library(limma)

CCLE transcriptome and proteome data for HCT116

Get CCLE transcriptome data for HCT116:

atlasRes <- ExpressionAtlas::searchAtlasExperiments(
              properties = "Cancer Cell Line Encyclopedia", 
              species = "human" )
atlasRes
ccle.trans <- ExpressionAtlas::getAtlasExperiment("E-MTAB-2770")
ccle.trans <- ccle.trans[[1]]
ccle.trans <- ccle.trans[,grep("HCT 116", ccle.trans$cell_line)]
ccle.trans

There is currently an issue with obtaining E-MTAB-2770 via the ExpressionAtlas package. We therefore pull the file directly from ftp as a workaround.

ebi.ftp.url <- "ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments"
mtab.url <- "E-MTAB-2770/archive/E-MTAB-2770-atlasExperimentSummary.Rdata.1"
mtab.url <- file.path(ebi.ftp.url, mtab.url)
download.file(mtab.url, "E-MTAB-2770.Rdata")
load("E-MTAB-2770.Rdata")
ccle.trans <- experimentSummary
file.remove("E-MTAB-2770.Rdata")

ccle.trans <- BioPlexAnalysis:::.getResourceFromCache("ccle.trans", update.value = NA)
if(is.null(ccle.trans))
{
    ebi.ftp.url <- "ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments"
    mtab.url <- "E-MTAB-2770/archive/E-MTAB-2770-atlasExperimentSummary.Rdata.1"
    mtab.url <- file.path(ebi.ftp.url, mtab.url)
    download.file(mtab.url, "E-MTAB-2770.Rdata")
    load("E-MTAB-2770.Rdata")
    ccle.trans <- experimentSummary
    file.remove("E-MTAB-2770.Rdata")
    BioPlexAnalysis:::.cacheResource(ccle.trans, "ccle.trans")
}

We proceed as before:

ccle.trans <- ccle.trans[[1]]
ccle.trans <- ccle.trans[,grep("HCT 116", ccle.trans$cell_line)]
ccle.trans

Get the CCLE proteome data for HCT116:

eh <- ExperimentHub::ExperimentHub()
AnnotationHub::query(eh, c("gygi", "depmap"))
ccle.prot <- eh[["EH3459"]]
ccle.prot <- as.data.frame(ccle.prot)
ccle.prot <- BioPlex::ccleProteome2SummarizedExperiment(ccle.prot)
ccle.prot

Connect to AnnotationHub and obtain OrgDb package for human:

ah <- AnnotationHub::AnnotationHub()
orgdb <- AnnotationHub::query(ah, c("orgDb", "Homo sapiens"))
orgdb <- orgdb[[1]]

Map to ENSEMBL for comparison with CCLE transcriptome data for HCT116:

rnames <- AnnotationDbi::mapIds(orgdb,
                      keytype = "UNIPROT",
                      column = "ENSEMBL",
                      keys = rownames(ccle.prot))

Subset to the ENSEMBL IDs that both datasets have in common

isect <- intersect(rnames, rownames(ccle.trans))
ind <- match(isect, rnames)

This should be rather RPKM, provided gene length from EDASeq:

assay(ccle.trans, "cpm") <- edgeR::cpm(assay(ccle.trans), log = TRUE)

A look at general correlation between transcriptome and proteome:

cor.test(assay(ccle.trans, "cpm")[isect,], 
         assay(ccle.prot)[ind,],
         use = "complete.obs")

df <- data.frame(trans = assay(ccle.trans, "cpm")[isect,],
                 prot = assay(ccle.prot)[ind,])
ggplot(df, aes(x = trans, y = prot) ) +
    geom_bin2d(bins = 70) +
    scale_fill_continuous(type = "viridis") +
    xlab("log2 CPM") +
    ylab("log2 intensity") +
    theme_bw()

Let's check whether this looks very different when accounting for gene length. We therefore obtain gene length for the hg38 genome assembly (used for CCLE).

AnnotationHub::query(ah, c("TxDb", "Homo sapiens"))
txdb <- ah[["AH92591"]]
gs <- GenomicFeatures::genes(txdb)
gs
len <- GenomicRanges::width(gs)
names(len) <- names(gs)
head(len)

This requires to map from Entrez IDs present for the gene length data to ENSEMBL IDs present in the transcriptomic data.

eids <- AnnotationDbi::mapIds(orgdb,
                              column = "ENTREZID",
                              keytype = "ENSEMBL", 
                              keys = rownames(ccle.trans))
rowData(ccle.trans)$length <- len[eids]

We can now compute RPKM given the obtained gene lengths as input.

assay(ccle.trans, "rpkm") <- edgeR::rpkm(assay(ccle.trans), 
                                         gene.length = rowData(ccle.trans)$length,
                                         log = TRUE)

cor.test(assay(ccle.trans, "rpkm")[isect,],
         assay(ccle.prot)[ind,],
         use = "complete.obs")

df <- data.frame(trans = assay(ccle.trans, "rpkm")[isect,],
                 prot = assay(ccle.prot)[ind,])
ggplot(df, aes(x = trans, y = prot) ) +
    geom_bin2d(bins = 70) +
    scale_fill_continuous(type = "viridis") +
    xlab("log2 RPKM") +
    ylab("log2 intensity") +
    theme_bw()

DE analysis HEK293 vs. HCT116 (transcriptomic and proteomic level)

Pull the HEK293 data:

gse.293t <- BioPlex::getGSE122425()

Pull the HCT116 data:

klijn <- ExpressionAtlas::getAtlasData("E-MTAB-2706")
klijn <- klijn$`E-MTAB-2706`$rnaseq
klijn

Combine the both HCT116 samples:

ind2 <- grep("HCT 116", klijn$cell_line)
isect <- intersect(rownames(ccle.trans), rownames(klijn))
emat <- cbind(assay(ccle.trans)[isect,], assay(klijn)[isect,ind2])
colnames(emat) <- c("ccle", "klijn")
head(emat)

Combine with the HEK293 wildtype samples:

isect <- intersect(rownames(emat), rownames(gse.293t))
emat <- cbind(emat[isect,], assay(gse.293t)[isect, 1:3])
colnames(emat) <- paste0(rep(c("HCT", "HEK"), c(2,3)), c(1:2, 1:3))

Compute logCPMs to bring samples from different cell lines and experiments on the same scale using the limma-trend approach:

dge <- edgeR::DGEList(counts = emat)
dge$group <- rep(c("HCT", "HEK"), c(2,3))
design <- model.matrix(~ dge$group)
keep <- edgeR::filterByExpr(dge, design)
dge <- dge[keep,,keep.lib.sizes = FALSE]
dim(dge)

dge <- edgeR::calcNormFactors(dge)
logCPM <- edgeR::cpm(dge, log = TRUE, prior.count = 3)
fit <- limma::lmFit(logCPM, design)
fit <- limma::eBayes(fit, trend = TRUE)
limma::topTable(fit, coef = ncol(design))
tt <- limma::topTable(fit, coef = ncol(design), number = nrow(logCPM), sort.by = "none")

Now let's pull the BioPlex3 proteome data:

bp.prot <- BioPlex::getBioplexProteome()
rowData(bp.prot)

Compare differential expression results on transcriptomic and proteomic level based on gene symbols as those are readily available:

isect <- intersect(rowData(bp.prot)$SYMBOL, 
                   rowData(gse.293t)[rownames(logCPM), "SYMBOL"])
length(isect)

ind.trans <- match(isect, rowData(gse.293t)[rownames(logCPM), "SYMBOL"])
ind.prot <- match(isect, rowData(bp.prot)$SYMBOL)

We need to switch here the sign of the fold change because the transcriptome is HEK-vs-HCT, the proteome is HCT-vs-HEK:

cor.test(-1 * tt[ind.trans, "logFC"], 
         rowData(bp.prot)[ind.prot, "log2ratio"])

df <- data.frame(trans = -1 * tt[ind.trans, "logFC"],
                 prot = rowData(bp.prot)[ind.prot, "log2ratio"])
ggplot(df, aes(x = trans, y = prot) ) +
    geom_bin2d(bins = 70) +
    scale_fill_continuous(type = "viridis") +
    xlab("log2FC (transcriptome)") +
    ylab("log2FC (proteome)") +
    theme_bw()

Interaction networks of cell-line specific proteins

We can now inspect the interactions of proteins that are strongly differentially expressed between cell lines as in Supplementary Figure S3, Panels J-M, of the BioPlex 3.0 publication.

Here, we inspect the interactions of CDH2, a protein that was observed in ~5-fold lower abundance in the HCT116 cell line when compared to the 293T cell line.

subset(rowData(bp.prot), SYMBOL == "CDH2")

We therefore obtain the latest versions of the both BioPlex PPI networks ...

bp.293t <- BioPlex::getBioPlex(cell.line = "293T", version = "3.0")
bp.hct <- BioPlex::getBioPlex(cell.line = "HCT116", version = "1.0")

Get all interactions involving CDH2:

cdh2.293t <- subset(bp.293t, SymbolA == "CDH2" | SymbolB == "CDH2")
cdh2.293t
cdh2.hct <- subset(bp.hct, SymbolA == "CDH2" | SymbolB == "CDH2")
cdh2.hct

Expand by including interactions between interactors of CDH2:

cdh2i <- cdh2.293t$SymbolA
cdh2i.293t <- subset(bp.293t, SymbolA %in% cdh2i & SymbolB %in% cdh2i)
cdh2i.293t
cdh2i.hct <- subset(bp.hct, SymbolA %in% cdh2i & SymbolB %in% cdh2i)
cdh2i.hct

Now we construct a joined network of interactions involving CDH2 or one of its interactors for both networks:

cdh2.293t <- rbind(cdh2.293t, cdh2i.293t)
cdh2.293t$cell.line <- "293T"
cdh2.hct <- cdh2i.hct
cdh2.hct$cell.line <- "HCT116"
cdh2.df <- rbind(cdh2.293t, cdh2.hct)
cdh2.df

We turn the resulting data.frame into a graph representation

cdh2.gr <- BioPlex::bioplex2graph(cdh2.df)
cdh2.gr

And map the proteome data on the graph:

cdh2.gr <- BioPlex::mapSummarizedExperimentOntoGraph(cdh2.gr, bp.prot)

And annotate for each edge whether it is present for both cell lines or only one of them.

graph::edgeDataDefaults(cdh2.gr, "cell.line") <- "BOTH"
rel.cols <- paste0("Uniprot", c("A", "B"))
rel.cols <- c(rel.cols, "cell.line")
jdf <-  cdh2.df[,rel.cols]
jdf[,1:2] <- apply(jdf[,1:2], 2, function(x) sub("-[0-9]{1,2}$", "", x))
dind <- duplicated(jdf[,1:2])
dup <- jdf[dind,]
ind <- jdf$UniprotA %in% dup$UniprotA & jdf$UniprotB %in% dup$UniprotB 
ind <- ind & jdf$cell.line == "293T"
jdf[ind,"cell.line"] <- "BOTH"  
jdf <- jdf[!dind,]
jdf

We add this information to the graph:

graph::edgeData(cdh2.gr, jdf$UniprotA, jdf$UniprotB, "cell.line") <- jdf$cell.line

Inspect the resulting graph:

p <- BioPlexAnalysis::plotGraph(cdh2.gr, 
                           edge.color = "grey",
                           node.data = "log2ratio",
                           edge.data = "cell.line")
p + scale_color_gradient2(low = "blue",
                          mid = "lightgrey",
                          high = "red",
                          name = "log2ratio")

ccb-hms/BioPlexAnalysis documentation built on Jan. 29, 2023, 6:55 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ccb-hms/BioPlexAnalysis
Applications and downstream analysis of BioPlex PPI data

In ccb-hms/BioPlexAnalysis: Applications and downstream analysis of BioPlex PPI data

Setup

CCLE transcriptome and proteome data for HCT116

DE analysis HEK293 vs. HCT116 (transcriptomic and proteomic level)

Interaction networks of cell-line specific proteins

R Package Documentation

Browse R Packages

We want your feedback!

ccb-hms/BioPlexAnalysis Applications and downstream analysis of BioPlex PPI data

In ccb-hms/BioPlexAnalysis: Applications and downstream analysis of BioPlex PPI data

Setup

CCLE transcriptome and proteome data for HCT116

DE analysis HEK293 vs. HCT116 (transcriptomic and proteomic level)

Interaction networks of cell-line specific proteins

R Package Documentation

Browse R Packages

We want your feedback!

ccb-hms/BioPlexAnalysis
Applications and downstream analysis of BioPlex PPI data