library(BioPlex) library(AnnotationDbi) library(AnnotationHub) library(ExperimentHub) library(ExpressionAtlas) library(GenomicFeatures) library(edgeR) library(ggplot2) library(limma)
Get CCLE transcriptome data for HCT116:
atlasRes <- ExpressionAtlas::searchAtlasExperiments( properties = "Cancer Cell Line Encyclopedia", species = "human" ) atlasRes ccle.trans <- ExpressionAtlas::getAtlasExperiment("E-MTAB-2770") ccle.trans <- ccle.trans[[1]] ccle.trans <- ccle.trans[,grep("HCT 116", ccle.trans$cell_line)] ccle.trans
There is currently an issue with obtaining E-MTAB-2770 via the ExpressionAtlas package. We therefore pull the file directly from ftp as a workaround.
ebi.ftp.url <- "ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments" mtab.url <- "E-MTAB-2770/archive/E-MTAB-2770-atlasExperimentSummary.Rdata.1" mtab.url <- file.path(ebi.ftp.url, mtab.url) download.file(mtab.url, "E-MTAB-2770.Rdata") load("E-MTAB-2770.Rdata") ccle.trans <- experimentSummary file.remove("E-MTAB-2770.Rdata")
ccle.trans <- BioPlexAnalysis:::.getResourceFromCache("ccle.trans", update.value = NA) if(is.null(ccle.trans)) { ebi.ftp.url <- "ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments" mtab.url <- "E-MTAB-2770/archive/E-MTAB-2770-atlasExperimentSummary.Rdata.1" mtab.url <- file.path(ebi.ftp.url, mtab.url) download.file(mtab.url, "E-MTAB-2770.Rdata") load("E-MTAB-2770.Rdata") ccle.trans <- experimentSummary file.remove("E-MTAB-2770.Rdata") BioPlexAnalysis:::.cacheResource(ccle.trans, "ccle.trans") }
We proceed as before:
ccle.trans <- ccle.trans[[1]] ccle.trans <- ccle.trans[,grep("HCT 116", ccle.trans$cell_line)] ccle.trans
Get the CCLE proteome data for HCT116:
eh <- ExperimentHub::ExperimentHub() AnnotationHub::query(eh, c("gygi", "depmap")) ccle.prot <- eh[["EH3459"]] ccle.prot <- as.data.frame(ccle.prot) ccle.prot <- BioPlex::ccleProteome2SummarizedExperiment(ccle.prot) ccle.prot
Connect to AnnotationHub and obtain OrgDb package for human:
ah <- AnnotationHub::AnnotationHub() orgdb <- AnnotationHub::query(ah, c("orgDb", "Homo sapiens")) orgdb <- orgdb[[1]]
Map to ENSEMBL for comparison with CCLE transcriptome data for HCT116:
rnames <- AnnotationDbi::mapIds(orgdb, keytype = "UNIPROT", column = "ENSEMBL", keys = rownames(ccle.prot))
Subset to the ENSEMBL IDs that both datasets have in common
isect <- intersect(rnames, rownames(ccle.trans)) ind <- match(isect, rnames)
This should be rather RPKM, provided gene length from EDASeq:
assay(ccle.trans, "cpm") <- edgeR::cpm(assay(ccle.trans), log = TRUE)
A look at general correlation between transcriptome and proteome:
cor.test(assay(ccle.trans, "cpm")[isect,], assay(ccle.prot)[ind,], use = "complete.obs")
df <- data.frame(trans = assay(ccle.trans, "cpm")[isect,], prot = assay(ccle.prot)[ind,]) ggplot(df, aes(x = trans, y = prot) ) + geom_bin2d(bins = 70) + scale_fill_continuous(type = "viridis") + xlab("log2 CPM") + ylab("log2 intensity") + theme_bw()
Let's check whether this looks very different when accounting for gene length. We therefore obtain gene length for the hg38 genome assembly (used for CCLE).
AnnotationHub::query(ah, c("TxDb", "Homo sapiens")) txdb <- ah[["AH92591"]] gs <- GenomicFeatures::genes(txdb) gs len <- GenomicRanges::width(gs) names(len) <- names(gs) head(len)
This requires to map from Entrez IDs present for the gene length data to ENSEMBL IDs present in the transcriptomic data.
eids <- AnnotationDbi::mapIds(orgdb, column = "ENTREZID", keytype = "ENSEMBL", keys = rownames(ccle.trans)) rowData(ccle.trans)$length <- len[eids]
We can now compute RPKM given the obtained gene lengths as input.
assay(ccle.trans, "rpkm") <- edgeR::rpkm(assay(ccle.trans), gene.length = rowData(ccle.trans)$length, log = TRUE)
cor.test(assay(ccle.trans, "rpkm")[isect,], assay(ccle.prot)[ind,], use = "complete.obs")
df <- data.frame(trans = assay(ccle.trans, "rpkm")[isect,], prot = assay(ccle.prot)[ind,]) ggplot(df, aes(x = trans, y = prot) ) + geom_bin2d(bins = 70) + scale_fill_continuous(type = "viridis") + xlab("log2 RPKM") + ylab("log2 intensity") + theme_bw()
Pull the HEK293 data:
gse.293t <- BioPlex::getGSE122425()
Pull the HCT116 data:
klijn <- ExpressionAtlas::getAtlasData("E-MTAB-2706") klijn <- klijn$`E-MTAB-2706`$rnaseq klijn
Combine the both HCT116 samples:
ind2 <- grep("HCT 116", klijn$cell_line) isect <- intersect(rownames(ccle.trans), rownames(klijn)) emat <- cbind(assay(ccle.trans)[isect,], assay(klijn)[isect,ind2]) colnames(emat) <- c("ccle", "klijn") head(emat)
Combine with the HEK293 wildtype samples:
isect <- intersect(rownames(emat), rownames(gse.293t)) emat <- cbind(emat[isect,], assay(gse.293t)[isect, 1:3]) colnames(emat) <- paste0(rep(c("HCT", "HEK"), c(2,3)), c(1:2, 1:3))
Compute logCPMs to bring samples from different cell lines and experiments on the same scale using the limma-trend approach:
dge <- edgeR::DGEList(counts = emat) dge$group <- rep(c("HCT", "HEK"), c(2,3)) design <- model.matrix(~ dge$group) keep <- edgeR::filterByExpr(dge, design) dge <- dge[keep,,keep.lib.sizes = FALSE] dim(dge)
dge <- edgeR::calcNormFactors(dge) logCPM <- edgeR::cpm(dge, log = TRUE, prior.count = 3) fit <- limma::lmFit(logCPM, design) fit <- limma::eBayes(fit, trend = TRUE) limma::topTable(fit, coef = ncol(design)) tt <- limma::topTable(fit, coef = ncol(design), number = nrow(logCPM), sort.by = "none")
Now let's pull the BioPlex3 proteome data:
bp.prot <- BioPlex::getBioplexProteome() rowData(bp.prot)
Compare differential expression results on transcriptomic and proteomic level based on gene symbols as those are readily available:
isect <- intersect(rowData(bp.prot)$SYMBOL, rowData(gse.293t)[rownames(logCPM), "SYMBOL"]) length(isect)
ind.trans <- match(isect, rowData(gse.293t)[rownames(logCPM), "SYMBOL"]) ind.prot <- match(isect, rowData(bp.prot)$SYMBOL)
We need to switch here the sign of the fold change because the transcriptome is HEK-vs-HCT, the proteome is HCT-vs-HEK:
cor.test(-1 * tt[ind.trans, "logFC"], rowData(bp.prot)[ind.prot, "log2ratio"])
df <- data.frame(trans = -1 * tt[ind.trans, "logFC"], prot = rowData(bp.prot)[ind.prot, "log2ratio"]) ggplot(df, aes(x = trans, y = prot) ) + geom_bin2d(bins = 70) + scale_fill_continuous(type = "viridis") + xlab("log2FC (transcriptome)") + ylab("log2FC (proteome)") + theme_bw()
We can now inspect the interactions of proteins that are strongly differentially expressed between cell lines as in Supplementary Figure S3, Panels J-M, of the BioPlex 3.0 publication.
Here, we inspect the interactions of CDH2, a protein that was observed in ~5-fold lower abundance in the HCT116 cell line when compared to the 293T cell line.
subset(rowData(bp.prot), SYMBOL == "CDH2")
We therefore obtain the latest versions of the both BioPlex PPI networks ...
bp.293t <- BioPlex::getBioPlex(cell.line = "293T", version = "3.0") bp.hct <- BioPlex::getBioPlex(cell.line = "HCT116", version = "1.0")
Get all interactions involving CDH2:
cdh2.293t <- subset(bp.293t, SymbolA == "CDH2" | SymbolB == "CDH2") cdh2.293t cdh2.hct <- subset(bp.hct, SymbolA == "CDH2" | SymbolB == "CDH2") cdh2.hct
Expand by including interactions between interactors of CDH2:
cdh2i <- cdh2.293t$SymbolA cdh2i.293t <- subset(bp.293t, SymbolA %in% cdh2i & SymbolB %in% cdh2i) cdh2i.293t cdh2i.hct <- subset(bp.hct, SymbolA %in% cdh2i & SymbolB %in% cdh2i) cdh2i.hct
Now we construct a joined network of interactions involving CDH2 or one of its interactors for both networks:
cdh2.293t <- rbind(cdh2.293t, cdh2i.293t) cdh2.293t$cell.line <- "293T" cdh2.hct <- cdh2i.hct cdh2.hct$cell.line <- "HCT116" cdh2.df <- rbind(cdh2.293t, cdh2.hct) cdh2.df
We turn the resulting data.frame
into a graph representation
cdh2.gr <- BioPlex::bioplex2graph(cdh2.df) cdh2.gr
And map the proteome data on the graph:
cdh2.gr <- BioPlex::mapSummarizedExperimentOntoGraph(cdh2.gr, bp.prot)
And annotate for each edge whether it is present for both cell lines or only one of them.
graph::edgeDataDefaults(cdh2.gr, "cell.line") <- "BOTH" rel.cols <- paste0("Uniprot", c("A", "B")) rel.cols <- c(rel.cols, "cell.line") jdf <- cdh2.df[,rel.cols] jdf[,1:2] <- apply(jdf[,1:2], 2, function(x) sub("-[0-9]{1,2}$", "", x)) dind <- duplicated(jdf[,1:2]) dup <- jdf[dind,] ind <- jdf$UniprotA %in% dup$UniprotA & jdf$UniprotB %in% dup$UniprotB ind <- ind & jdf$cell.line == "293T" jdf[ind,"cell.line"] <- "BOTH" jdf <- jdf[!dind,] jdf
We add this information to the graph:
graph::edgeData(cdh2.gr, jdf$UniprotA, jdf$UniprotB, "cell.line") <- jdf$cell.line
Inspect the resulting graph:
p <- BioPlexAnalysis::plotGraph(cdh2.gr, edge.color = "grey", node.data = "log2ratio", edge.data = "cell.line") p + scale_color_gradient2(low = "blue", mid = "lightgrey", high = "red", name = "log2ratio")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.