knitr::opts_chunk$set(echo = TRUE, cache = TRUE)

Dependencies

library(MultiAssayExperiment)
library(curatedTCGAData)
library(TCGAutils)
library(EnrichmentBrowser)
library(GSEABenchmarkeR)
library(RTCGAToolbox)
library(BiocParallel)
library(ComplexHeatmap)
library(ggpubr)

Obtain ACC data from curatedTCGAData:

data("diseaseCodes")
head(avail <- diseaseCodes[
    diseaseCodes[["Available"]] == "Yes", "Study.Abbreviation"])

Download and reshape data

Some datasets were missing normals and we suspected that there were different datasets being produced by RTCGAToolbox. Upon further inspection, TCGA provides 'illuminaga' and 'illumina HiSeq' datasets.

Note. First, download RTCGAToolbox from Bioc-devel and obtain correlations for samples that are in both platforms in "COAD", "READ", and "UCEC".

dl <- c("COAD", "READ", "UCEC")
redl <- lapply(setNames(dl, dl), function(x) {
    tx <- getFirehoseData(x, RNASeq2GeneNorm = TRUE, clinical = FALSE)
    txl <- biocExtract(tx, "RNASeq2GeneNorm")
    sameSamps <- Reduce(intersect, lapply(txl, colnames))

    # correlation between platforms 'illuminaga' and 'illuminahiseq'
    message(x)
    datm <- do.call(cbind.data.frame,
        lapply(txl, function(g) { assay(g[,sameSamps]) })
    )
    if (length(datm))
        print(cor(datm[,1], datm[,2]))

    newassay <- cbind(assay(txl[[1]]), assay(txl[[2]]))
    newassay <- data.matrix(newassay[, !duplicated(colnames(newassay))])
    newmeta <- c(metadata(txl[[1]]), metadata(txl[[2]]))
    newColData <- DataFrame(row.names = colnames(newassay))
    SummarizedExperiment(assays = list(TPM = newassay),
        metadata = newmeta, colData = newColData)
    }
)
exper <- as(redl, "ExperimentList")
sampleTables(MultiAssayExperiment(exper))

Create a pseudo clinical DataFrame for new samples

allSamps <- unlist(colnames(exper))
clinphone <- DataFrame(patientID = TCGAbarcode(allSamps))
rownames(clinphone) <- clinphone[["patientID"]]
sum(duplicated(rownames(clinphone)))
## [1] 77
clinphone <- clinphone[!duplicated(rownames(clinphone)), , drop = FALSE]

Combine cancer datasets into one MultiAssayExperiment:

outAssays <- c("COAD_RNASeq2GeneNorm-20160128",
    "READ_RNASeq2GeneNorm-20160128", "UCEC_RNASeq2GeneNorm-20160128")
core <- exper[ c("COAD", "READ", "UCEC") ]
names(core) <- outAssays
mapper <- generateMap(core, clinphone, idConverter = TCGAbarcode)
coremae <-
    MultiAssayExperiment(
        experiments = core, sampleMap = mapper, colData = clinphone
    )

PanCan RNASeq2*

Obtain the RNAseq data from curatedTCGAData:

rnacomp <- curatedTCGAData("*", "RNASeq2*", FALSE)
rnacomp

Replace above processed datasets in the full RNASeq MultiAssayExperiment:

rnacopy <- rnacomp
rnacomp <- rnacomp[, , !names(rnacomp) %in% outAssays]
rnacomp <- c(rnacomp, coremae)

Here we check for the samples of interest using sampleTables:

sampleTables(rnacomp)

Which cancer codes have both tumors and normal samples?

okCA <- vapply(sampleTables(rnacomp),
    function(x) {
        all(c("01", "11") %in% names(x))
    },
    logical(1L)
)

(
okCodes <- vapply(
    strsplit(names(which(okCA)), "_"),
    `[`,
    character(1L),
    1L)
)

Assemble appropriate MultiAssayExperiment object with cancer codes that have both tumors and normals.

rnacomp2 <- rnacomp[ , , okCA]

Isolate tumor and normal samples for each cancer that have more than 10 samples of each:

selects <- TCGAsampleSelect(colnames(rnacomp2), c("01", "11"))

rnacomp3 <- rnacomp2[, selects]
enoughNorm <- vapply(sampleTables(rnacomp3),
    function(samp) { samp["11"] >= 10L }, logical(1L))

rnacomp4 <- rnacomp3[, , enoughNorm]
sampleTables(rnacomp4)

Create an index to annotate each of the SummarizedExperiment objects contained in the ExperimentList:

tlogic <- TCGAsampleSelect(colnames(rnacomp4), "01")
sampIndx <- IntegerList(lapply(tlogic, ifelse, 1L, 0L))

naks <- mendoapply(function(x, y) {
    colData(x)[["GROUP"]] <- y
    colData(x)[["BLOCK"]] <- TCGAbarcode(rownames(colData(x)))
    x
}, x = experiments(rnacomp4), y = sampIndx)

rnacomp5 <- BiocGenerics:::replaceSlots(rnacomp4,
    ExperimentList = naks,
    check = FALSE)

table(rnacomp5[[1L]]$GROUP)

Split, create matched tumors and normals, and then recombine into a single MultiAssayExperiment:

matchlist <- lapply(setNames(seq_along(rnacomp5), names(rnacomp5)),
    function(idx) {
        as(splitAssays(rnacomp5[, , idx]), "MatchedAssayExperiment")
    }
)
newEXPS <- ExperimentList(
    lapply(matchlist, function(expr) {
        do.call(cbind, experiments(expr))
    })
)

rnacomp6 <- BiocGenerics:::replaceSlots(rnacomp5,
    ExperimentList = newEXPS,
    check = FALSE
)

Save point:

saveRDS(experiments(rnacomp6), file = "../inst/data/pancan_exprs.rds")
r6ex <- loadRDS(file = "../inst/data/pancan_exprs.rds")

Differential Expression analysis:

exp.list <- experiments(rnacomp6)
names(exp.list) <- substring(names(exp.list), 1, 4)
exp.list <- lapply(exp.list,
    function(se)
    {
        # if assay is matrix: assay(se) <- log(assay(se) + 1, base=2)
        assays(se) <- list(TPM=log(as.matrix(assay(se)) + 1, base=2))
        return(se)
    })
exp.list <- runDE(exp.list)
# for(n in names(exp.list)) saveRDS(exp.list[[n]], file = paste(n, "rds", sep="."))

Figure 3 BoxPlot

Extract genes with consistent expression changes:

wfcs <- metaFC(exp.list, max.na=3)

# Heatmap
top.wfcs.genes <- names(wfcs)[1:50]
extractFC <- function(exp.list, top.wfcs.genes)
{
    fcs <- vapply(exp.list,
        function(d) rowData(d)[top.wfcs.genes, "FC"],
        numeric(length(top.wfcs.genes)))
    rownames(fcs) <- top.wfcs.genes
    return(fcs)
}
fcs <- extractFC(exp.list, top.wfcs.genes)

# pdf("log2fc.pdf", paper = "special", width = 8, height = 12)
Heatmap(fcs, name="log2FC", show_row_names=TRUE)
# dev.off()

Figure 3

Pan-cancer differential expression analysis

# Boxplot
top.wfcs.genes <- names(c(wfcs[wfcs > 0][1:8], wfcs[wfcs < 0][1:8]))
fcs <- extractFC(exp.list, top.wfcs.genes)
df <- reshape2::melt(fcs)
medians <- apply(fcs, 1, median, na.rm=TRUE)
o <- names(sort(medians))
p <- ggboxplot(df, x = "Var1", y = "value", width = 0.8,
    ylab="log2 fold change", xlab="", order=o, color="Var1", add ="jitter")
p <- ggpar(p, x.text.angle=45, palette = "simpsons", legend="none")
gg <- p + geom_hline(yintercept=0, linetype="dashed", color = "darkgrey") +
    geom_vline(xintercept=8.5, linetype="dashed", color = "darkgrey")
gg

ggsaving...

ggsave("boxplot_pancan.svg", gg, device = "svg")

## pdf
ggsave("boxplot_pancan.pdf", gg, device = "pdf")

GSEA

ID mapping

exp.list <- lapply(exp.list,
    function(se) idMap(se, org="hsa", from="SYMBOL", to="ENTREZID"))

Obtain gene sets

go.gs <- getGenesets(org="hsa", db="go")
ids <- vapply(names(go.gs), function(n) unlist(strsplit(n, "_"))[1], character(1))
ids <- sub(":", "", ids)
extractTitle <- function(n)
{
    spl <- unlist(strsplit(n, "_"))[-1]
    paste(spl, collapse = " ")
}
go.i2n <- vapply(names(go.gs), extractTitle, character(1))
names(go.i2n) <- ids

Execute ORA (has been pre-computed on a cluster, skip this step):

res <- runEA(exp.list, methods=c("ora", "padog"),
    gs=go.gs, perm=c(0, 1000), save2file=TRUE, out.dir="../inst/data")

Read pre-computed results for ORA and PADOG for GO-BP gene sets

res <- readResults(data.dir="../inst/data", data.ids=names(exp.list),
    methods=c("ora", "padog"), type="ranking")

Determine enrichment across datasets:

isEnriched <- function(res, pthresh=0.05, nr.top=1:15)
{
    for(i in seq_along(res)) rownames(res[[i]]) <- res[[i]]$GENE.SET
    pmat <- vapply(res, function(r) r[rownames(res[[1]]), "PVAL"], res[[1]]$PVAL)
    rownames(pmat) <- rownames(res[[1]])
    is.enriched <- pmat < pthresh
    rse <- rowSums(is.enriched)

    top.gs <- names(sort(rse, decreasing=TRUE)[nr.top])
    top.mat <- is.enriched[top.gs,]
    mode(top.mat) <- "integer"
    return(top.mat)
}

go.ora <- isEnriched(res$ora)
go.padog <- isEnriched(res$padog, nr.top=rownames(go.ora))

rownames(go.ora) <- rownames(go.padog) <- go.i2n[rownames(go.ora)]
extract <- function(n, ind) paste(unlist(strsplit(n, " "))[ind], collapse = " ")
rownames(go.ora)[4] <- extract(rownames(go.ora)[4], 1:2)
rownames(go.ora)[10] <- extract(rownames(go.ora)[10], c(1, 3:4))
rownames(go.ora)[11] <- extract(rownames(go.ora)[11], 1:2)
rownames(go.ora)[15] <- extract(rownames(go.ora)[15], 1:5)

rownames(go.padog) <- rownames(go.ora)

Figure 4

Pan-cancer gene set enrichment analysis

h1 <- Heatmap(go.ora, cluster_rows=FALSE, cluster_columns=FALSE,
    show_row_names=FALSE, show_heatmap_legend = FALSE,
    col=c("white", "black"), column_title="ORA p < 0.05",
    row_names_gp = gpar(fontsize = 11))
h2 <- Heatmap(go.padog, cluster_rows=FALSE, cluster_columns=FALSE,
    show_row_names=TRUE, show_heatmap_legend = FALSE,
    col=c("white", "black"), column_title="PADOG p < 0.05",
    row_names_gp = gpar(fontsize = 11))
hl <- h1 + h2
draw(hl, ht_gap = unit(1, "cm"))

Code for saving to PDF:

pdf("orapadog.pdf", paper = "special", width = 12, height = 8)
draw(hl, ht_gap = unit(1, "cm"))
dev.off()


LiNk-NY/curatedTCGAManu documentation built on July 22, 2020, 4:06 p.m.