knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
library(MultiAssayExperiment) library(curatedTCGAData) library(TCGAutils) library(EnrichmentBrowser) library(GSEABenchmarkeR) library(RTCGAToolbox) library(BiocParallel) library(ComplexHeatmap) library(ggpubr)
data("diseaseCodes") head(avail <- diseaseCodes[ diseaseCodes[["Available"]] == "Yes", "Study.Abbreviation"])
Some datasets were missing normals and we suspected that there were different datasets being produced by RTCGAToolbox. Upon further inspection, TCGA provides 'illuminaga' and 'illumina HiSeq' datasets.
Note. First, download RTCGAToolbox from Bioc-devel and obtain correlations for samples that are in both platforms in "COAD", "READ", and "UCEC".
dl <- c("COAD", "READ", "UCEC") redl <- lapply(setNames(dl, dl), function(x) { tx <- getFirehoseData(x, RNASeq2GeneNorm = TRUE, clinical = FALSE) txl <- biocExtract(tx, "RNASeq2GeneNorm") sameSamps <- Reduce(intersect, lapply(txl, colnames)) # correlation between platforms 'illuminaga' and 'illuminahiseq' message(x) datm <- do.call(cbind.data.frame, lapply(txl, function(g) { assay(g[,sameSamps]) }) ) if (length(datm)) print(cor(datm[,1], datm[,2])) newassay <- cbind(assay(txl[[1]]), assay(txl[[2]])) newassay <- data.matrix(newassay[, !duplicated(colnames(newassay))]) newmeta <- c(metadata(txl[[1]]), metadata(txl[[2]])) newColData <- DataFrame(row.names = colnames(newassay)) SummarizedExperiment(assays = list(TPM = newassay), metadata = newmeta, colData = newColData) } ) exper <- as(redl, "ExperimentList") sampleTables(MultiAssayExperiment(exper))
allSamps <- unlist(colnames(exper)) clinphone <- DataFrame(patientID = TCGAbarcode(allSamps)) rownames(clinphone) <- clinphone[["patientID"]] sum(duplicated(rownames(clinphone))) ## [1] 77 clinphone <- clinphone[!duplicated(rownames(clinphone)), , drop = FALSE]
MultiAssayExperiment
:outAssays <- c("COAD_RNASeq2GeneNorm-20160128", "READ_RNASeq2GeneNorm-20160128", "UCEC_RNASeq2GeneNorm-20160128") core <- exper[ c("COAD", "READ", "UCEC") ] names(core) <- outAssays mapper <- generateMap(core, clinphone, idConverter = TCGAbarcode) coremae <- MultiAssayExperiment( experiments = core, sampleMap = mapper, colData = clinphone )
Obtain the RNAseq data from curatedTCGAData
:
rnacomp <- curatedTCGAData("*", "RNASeq2*", FALSE) rnacomp
Replace above processed datasets in the full RNASeq MultiAssayExperiment
:
rnacopy <- rnacomp rnacomp <- rnacomp[, , !names(rnacomp) %in% outAssays] rnacomp <- c(rnacomp, coremae)
Here we check for the samples of interest using sampleTables
:
sampleTables(rnacomp)
Which cancer codes have both tumors and normal samples?
okCA <- vapply(sampleTables(rnacomp), function(x) { all(c("01", "11") %in% names(x)) }, logical(1L) ) ( okCodes <- vapply( strsplit(names(which(okCA)), "_"), `[`, character(1L), 1L) )
Assemble appropriate MultiAssayExperiment
object with
cancer codes that have both tumors and normals.
rnacomp2 <- rnacomp[ , , okCA]
Isolate tumor and normal samples for each cancer that have more than 10 samples of each:
selects <- TCGAsampleSelect(colnames(rnacomp2), c("01", "11")) rnacomp3 <- rnacomp2[, selects] enoughNorm <- vapply(sampleTables(rnacomp3), function(samp) { samp["11"] >= 10L }, logical(1L)) rnacomp4 <- rnacomp3[, , enoughNorm] sampleTables(rnacomp4)
Create an index to annotate each of the SummarizedExperiment
objects
contained in the ExperimentList
:
tlogic <- TCGAsampleSelect(colnames(rnacomp4), "01") sampIndx <- IntegerList(lapply(tlogic, ifelse, 1L, 0L)) naks <- mendoapply(function(x, y) { colData(x)[["GROUP"]] <- y colData(x)[["BLOCK"]] <- TCGAbarcode(rownames(colData(x))) x }, x = experiments(rnacomp4), y = sampIndx) rnacomp5 <- BiocGenerics:::replaceSlots(rnacomp4, ExperimentList = naks, check = FALSE) table(rnacomp5[[1L]]$GROUP)
Split, create matched tumors and normals, and then recombine into a
single MultiAssayExperiment
:
matchlist <- lapply(setNames(seq_along(rnacomp5), names(rnacomp5)), function(idx) { as(splitAssays(rnacomp5[, , idx]), "MatchedAssayExperiment") } ) newEXPS <- ExperimentList( lapply(matchlist, function(expr) { do.call(cbind, experiments(expr)) }) ) rnacomp6 <- BiocGenerics:::replaceSlots(rnacomp5, ExperimentList = newEXPS, check = FALSE )
Save point:
saveRDS(experiments(rnacomp6), file = "../inst/data/pancan_exprs.rds") r6ex <- loadRDS(file = "../inst/data/pancan_exprs.rds")
Differential Expression analysis:
exp.list <- experiments(rnacomp6) names(exp.list) <- substring(names(exp.list), 1, 4) exp.list <- lapply(exp.list, function(se) { # if assay is matrix: assay(se) <- log(assay(se) + 1, base=2) assays(se) <- list(TPM=log(as.matrix(assay(se)) + 1, base=2)) return(se) }) exp.list <- runDE(exp.list) # for(n in names(exp.list)) saveRDS(exp.list[[n]], file = paste(n, "rds", sep="."))
Extract genes with consistent expression changes:
wfcs <- metaFC(exp.list, max.na=3) # Heatmap top.wfcs.genes <- names(wfcs)[1:50] extractFC <- function(exp.list, top.wfcs.genes) { fcs <- vapply(exp.list, function(d) rowData(d)[top.wfcs.genes, "FC"], numeric(length(top.wfcs.genes))) rownames(fcs) <- top.wfcs.genes return(fcs) } fcs <- extractFC(exp.list, top.wfcs.genes) # pdf("log2fc.pdf", paper = "special", width = 8, height = 12) Heatmap(fcs, name="log2FC", show_row_names=TRUE) # dev.off()
# Boxplot top.wfcs.genes <- names(c(wfcs[wfcs > 0][1:8], wfcs[wfcs < 0][1:8])) fcs <- extractFC(exp.list, top.wfcs.genes) df <- reshape2::melt(fcs) medians <- apply(fcs, 1, median, na.rm=TRUE) o <- names(sort(medians)) p <- ggboxplot(df, x = "Var1", y = "value", width = 0.8, ylab="log2 fold change", xlab="", order=o, color="Var1", add ="jitter") p <- ggpar(p, x.text.angle=45, palette = "simpsons", legend="none") gg <- p + geom_hline(yintercept=0, linetype="dashed", color = "darkgrey") + geom_vline(xintercept=8.5, linetype="dashed", color = "darkgrey") gg
ggsaving...
ggsave("boxplot_pancan.svg", gg, device = "svg") ## pdf ggsave("boxplot_pancan.pdf", gg, device = "pdf")
ID mapping
exp.list <- lapply(exp.list, function(se) idMap(se, org="hsa", from="SYMBOL", to="ENTREZID"))
Obtain gene sets
go.gs <- getGenesets(org="hsa", db="go") ids <- vapply(names(go.gs), function(n) unlist(strsplit(n, "_"))[1], character(1)) ids <- sub(":", "", ids) extractTitle <- function(n) { spl <- unlist(strsplit(n, "_"))[-1] paste(spl, collapse = " ") } go.i2n <- vapply(names(go.gs), extractTitle, character(1)) names(go.i2n) <- ids
Execute ORA (has been pre-computed on a cluster, skip this step):
res <- runEA(exp.list, methods=c("ora", "padog"), gs=go.gs, perm=c(0, 1000), save2file=TRUE, out.dir="../inst/data")
Read pre-computed results for ORA and PADOG for GO-BP gene sets
res <- readResults(data.dir="../inst/data", data.ids=names(exp.list), methods=c("ora", "padog"), type="ranking")
Determine enrichment across datasets:
isEnriched <- function(res, pthresh=0.05, nr.top=1:15) { for(i in seq_along(res)) rownames(res[[i]]) <- res[[i]]$GENE.SET pmat <- vapply(res, function(r) r[rownames(res[[1]]), "PVAL"], res[[1]]$PVAL) rownames(pmat) <- rownames(res[[1]]) is.enriched <- pmat < pthresh rse <- rowSums(is.enriched) top.gs <- names(sort(rse, decreasing=TRUE)[nr.top]) top.mat <- is.enriched[top.gs,] mode(top.mat) <- "integer" return(top.mat) } go.ora <- isEnriched(res$ora) go.padog <- isEnriched(res$padog, nr.top=rownames(go.ora)) rownames(go.ora) <- rownames(go.padog) <- go.i2n[rownames(go.ora)] extract <- function(n, ind) paste(unlist(strsplit(n, " "))[ind], collapse = " ") rownames(go.ora)[4] <- extract(rownames(go.ora)[4], 1:2) rownames(go.ora)[10] <- extract(rownames(go.ora)[10], c(1, 3:4)) rownames(go.ora)[11] <- extract(rownames(go.ora)[11], 1:2) rownames(go.ora)[15] <- extract(rownames(go.ora)[15], 1:5) rownames(go.padog) <- rownames(go.ora)
h1 <- Heatmap(go.ora, cluster_rows=FALSE, cluster_columns=FALSE, show_row_names=FALSE, show_heatmap_legend = FALSE, col=c("white", "black"), column_title="ORA p < 0.05", row_names_gp = gpar(fontsize = 11)) h2 <- Heatmap(go.padog, cluster_rows=FALSE, cluster_columns=FALSE, show_row_names=TRUE, show_heatmap_legend = FALSE, col=c("white", "black"), column_title="PADOG p < 0.05", row_names_gp = gpar(fontsize = 11)) hl <- h1 + h2 draw(hl, ht_gap = unit(1, "cm"))
Code for saving to PDF:
pdf("orapadog.pdf", paper = "special", width = 12, height = 8) draw(hl, ht_gap = unit(1, "cm")) dev.off()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.