Setup

library(BioPlex)
library(AnnotationDbi)
library(AnnotationHub)
library(ExperimentHub)
library(ExpressionAtlas)
library(GenomicFeatures)
library(edgeR)
library(ggplot2)
library(limma)

CCLE transcriptome and proteome data for HCT116

Get CCLE transcriptome data for HCT116:

atlasRes <- ExpressionAtlas::searchAtlasExperiments(
              properties = "Cancer Cell Line Encyclopedia", 
              species = "human" )
atlasRes
ccle.trans <- ExpressionAtlas::getAtlasExperiment("E-MTAB-2770")
ccle.trans <- ccle.trans[[1]]
ccle.trans <- ccle.trans[,grep("HCT 116", ccle.trans$cell_line)]
ccle.trans

There is currently an issue with obtaining E-MTAB-2770 via the ExpressionAtlas package. We therefore pull the file directly from ftp as a workaround.

ebi.ftp.url <- "ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments"
mtab.url <- "E-MTAB-2770/archive/E-MTAB-2770-atlasExperimentSummary.Rdata.1"
mtab.url <- file.path(ebi.ftp.url, mtab.url)
download.file(mtab.url, "E-MTAB-2770.Rdata")
load("E-MTAB-2770.Rdata")
ccle.trans <- experimentSummary
file.remove("E-MTAB-2770.Rdata")
ccle.trans <- BioPlexAnalysis:::.getResourceFromCache("ccle.trans", update.value = NA)
if(is.null(ccle.trans))
{
    ebi.ftp.url <- "ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments"
    mtab.url <- "E-MTAB-2770/archive/E-MTAB-2770-atlasExperimentSummary.Rdata.1"
    mtab.url <- file.path(ebi.ftp.url, mtab.url)
    download.file(mtab.url, "E-MTAB-2770.Rdata")
    load("E-MTAB-2770.Rdata")
    ccle.trans <- experimentSummary
    file.remove("E-MTAB-2770.Rdata")
    BioPlexAnalysis:::.cacheResource(ccle.trans, "ccle.trans")
}

We proceed as before:

ccle.trans <- ccle.trans[[1]]
ccle.trans <- ccle.trans[,grep("HCT 116", ccle.trans$cell_line)]
ccle.trans

Get the CCLE proteome data for HCT116:

eh <- ExperimentHub::ExperimentHub()
AnnotationHub::query(eh, c("gygi", "depmap"))
ccle.prot <- eh[["EH3459"]]
ccle.prot <- as.data.frame(ccle.prot)
ccle.prot <- BioPlex::ccleProteome2SummarizedExperiment(ccle.prot)
ccle.prot

Connect to AnnotationHub and obtain OrgDb package for human:

ah <- AnnotationHub::AnnotationHub()
orgdb <- AnnotationHub::query(ah, c("orgDb", "Homo sapiens"))
orgdb <- orgdb[[1]]

Map to ENSEMBL for comparison with CCLE transcriptome data for HCT116:

rnames <- AnnotationDbi::mapIds(orgdb,
                      keytype = "UNIPROT",
                      column = "ENSEMBL",
                      keys = rownames(ccle.prot))

Subset to the ENSEMBL IDs that both datasets have in common

isect <- intersect(rnames, rownames(ccle.trans))
ind <- match(isect, rnames)

This should be rather RPKM, provided gene length from EDASeq:

assay(ccle.trans, "cpm") <- edgeR::cpm(assay(ccle.trans), log = TRUE)

A look at general correlation between transcriptome and proteome:

cor.test(assay(ccle.trans, "cpm")[isect,], 
         assay(ccle.prot)[ind,],
         use = "complete.obs")
df <- data.frame(trans = assay(ccle.trans, "cpm")[isect,],
                 prot = assay(ccle.prot)[ind,])
ggplot(df, aes(x = trans, y = prot) ) +
    geom_bin2d(bins = 70) +
    scale_fill_continuous(type = "viridis") +
    xlab("log2 CPM") +
    ylab("log2 intensity") +
    theme_bw()

Let's check whether this looks very different when accounting for gene length. We therefore obtain gene length for the hg38 genome assembly (used for CCLE).

AnnotationHub::query(ah, c("TxDb", "Homo sapiens"))
txdb <- ah[["AH92591"]]
gs <- GenomicFeatures::genes(txdb)
gs
len <- GenomicRanges::width(gs)
names(len) <- names(gs)
head(len)

This requires to map from Entrez IDs present for the gene length data to ENSEMBL IDs present in the transcriptomic data.

eids <- AnnotationDbi::mapIds(orgdb,
                              column = "ENTREZID",
                              keytype = "ENSEMBL", 
                              keys = rownames(ccle.trans))
rowData(ccle.trans)$length <- len[eids]

We can now compute RPKM given the obtained gene lengths as input.

assay(ccle.trans, "rpkm") <- edgeR::rpkm(assay(ccle.trans), 
                                         gene.length = rowData(ccle.trans)$length,
                                         log = TRUE) 
cor.test(assay(ccle.trans, "rpkm")[isect,],
         assay(ccle.prot)[ind,],
         use = "complete.obs")
df <- data.frame(trans = assay(ccle.trans, "rpkm")[isect,],
                 prot = assay(ccle.prot)[ind,])
ggplot(df, aes(x = trans, y = prot) ) +
    geom_bin2d(bins = 70) +
    scale_fill_continuous(type = "viridis") +
    xlab("log2 RPKM") +
    ylab("log2 intensity") +
    theme_bw()

DE analysis HEK293 vs. HCT116 (transcriptomic and proteomic level)

Pull the HEK293 data:

gse.293t <- BioPlex::getGSE122425()

Pull the HCT116 data:

klijn <- ExpressionAtlas::getAtlasData("E-MTAB-2706")
klijn <- klijn$`E-MTAB-2706`$rnaseq
klijn

Combine the both HCT116 samples:

ind2 <- grep("HCT 116", klijn$cell_line)
isect <- intersect(rownames(ccle.trans), rownames(klijn))
emat <- cbind(assay(ccle.trans)[isect,], assay(klijn)[isect,ind2])
colnames(emat) <- c("ccle", "klijn")
head(emat)

Combine with the HEK293 wildtype samples:

isect <- intersect(rownames(emat), rownames(gse.293t))
emat <- cbind(emat[isect,], assay(gse.293t)[isect, 1:3])
colnames(emat) <- paste0(rep(c("HCT", "HEK"), c(2,3)), c(1:2, 1:3)) 

Compute logCPMs to bring samples from different cell lines and experiments on the same scale using the limma-trend approach:

dge <- edgeR::DGEList(counts = emat)
dge$group <- rep(c("HCT", "HEK"), c(2,3))
design <- model.matrix(~ dge$group)
keep <- edgeR::filterByExpr(dge, design)
dge <- dge[keep,,keep.lib.sizes = FALSE]
dim(dge)
dge <- edgeR::calcNormFactors(dge)
logCPM <- edgeR::cpm(dge, log = TRUE, prior.count = 3)
fit <- limma::lmFit(logCPM, design)
fit <- limma::eBayes(fit, trend = TRUE)
limma::topTable(fit, coef = ncol(design))
tt <- limma::topTable(fit, coef = ncol(design), number = nrow(logCPM), sort.by = "none")

Now let's pull the BioPlex3 proteome data:

bp.prot <- BioPlex::getBioplexProteome()
rowData(bp.prot)

Compare differential expression results on transcriptomic and proteomic level based on gene symbols as those are readily available:

isect <- intersect(rowData(bp.prot)$SYMBOL, 
                   rowData(gse.293t)[rownames(logCPM), "SYMBOL"])
length(isect)
ind.trans <- match(isect, rowData(gse.293t)[rownames(logCPM), "SYMBOL"])
ind.prot <- match(isect, rowData(bp.prot)$SYMBOL)

We need to switch here the sign of the fold change because the transcriptome is HEK-vs-HCT, the proteome is HCT-vs-HEK:

cor.test(-1 * tt[ind.trans, "logFC"], 
         rowData(bp.prot)[ind.prot, "log2ratio"])
df <- data.frame(trans = -1 * tt[ind.trans, "logFC"],
                 prot = rowData(bp.prot)[ind.prot, "log2ratio"])
ggplot(df, aes(x = trans, y = prot) ) +
    geom_bin2d(bins = 70) +
    scale_fill_continuous(type = "viridis") +
    xlab("log2FC (transcriptome)") +
    ylab("log2FC (proteome)") +
    theme_bw()

Interaction networks of cell-line specific proteins

We can now inspect the interactions of proteins that are strongly differentially expressed between cell lines as in Supplementary Figure S3, Panels J-M, of the BioPlex 3.0 publication.

Here, we inspect the interactions of CDH2, a protein that was observed in ~5-fold lower abundance in the HCT116 cell line when compared to the 293T cell line.

subset(rowData(bp.prot), SYMBOL == "CDH2")

We therefore obtain the latest versions of the both BioPlex PPI networks ...

bp.293t <- BioPlex::getBioPlex(cell.line = "293T", version = "3.0")
bp.hct <- BioPlex::getBioPlex(cell.line = "HCT116", version = "1.0")

Get all interactions involving CDH2:

cdh2.293t <- subset(bp.293t, SymbolA == "CDH2" | SymbolB == "CDH2")
cdh2.293t
cdh2.hct <- subset(bp.hct, SymbolA == "CDH2" | SymbolB == "CDH2")
cdh2.hct 

Expand by including interactions between interactors of CDH2:

cdh2i <- cdh2.293t$SymbolA
cdh2i.293t <- subset(bp.293t, SymbolA %in% cdh2i & SymbolB %in% cdh2i)
cdh2i.293t
cdh2i.hct <- subset(bp.hct, SymbolA %in% cdh2i & SymbolB %in% cdh2i)
cdh2i.hct

Now we construct a joined network of interactions involving CDH2 or one of its interactors for both networks:

cdh2.293t <- rbind(cdh2.293t, cdh2i.293t)
cdh2.293t$cell.line <- "293T"
cdh2.hct <- cdh2i.hct
cdh2.hct$cell.line <- "HCT116"
cdh2.df <- rbind(cdh2.293t, cdh2.hct)
cdh2.df

We turn the resulting data.frame into a graph representation

cdh2.gr <- BioPlex::bioplex2graph(cdh2.df)
cdh2.gr

And map the proteome data on the graph:

cdh2.gr <- BioPlex::mapSummarizedExperimentOntoGraph(cdh2.gr, bp.prot)

And annotate for each edge whether it is present for both cell lines or only one of them.

graph::edgeDataDefaults(cdh2.gr, "cell.line") <- "BOTH"
rel.cols <- paste0("Uniprot", c("A", "B"))
rel.cols <- c(rel.cols, "cell.line")
jdf <-  cdh2.df[,rel.cols]
jdf[,1:2] <- apply(jdf[,1:2], 2, function(x) sub("-[0-9]{1,2}$", "", x))
dind <- duplicated(jdf[,1:2])
dup <- jdf[dind,]
ind <- jdf$UniprotA %in% dup$UniprotA & jdf$UniprotB %in% dup$UniprotB 
ind <- ind & jdf$cell.line == "293T"
jdf[ind,"cell.line"] <- "BOTH"  
jdf <- jdf[!dind,]
jdf

We add this information to the graph:

graph::edgeData(cdh2.gr, jdf$UniprotA, jdf$UniprotB, "cell.line") <- jdf$cell.line

Inspect the resulting graph:

p <- BioPlexAnalysis::plotGraph(cdh2.gr, 
                           edge.color = "grey",
                           node.data = "log2ratio",
                           edge.data = "cell.line")
p + scale_color_gradient2(low = "blue",
                          mid = "lightgrey",
                          high = "red",
                          name = "log2ratio")


ccb-hms/BioPlexAnalysis documentation built on Jan. 29, 2023, 6:55 p.m.