## nolint start
suppressPackageStartupMessages({
    ## CRAN packages.
    library(R.utils)
    library(ggnewscale)
    library(stringi)
    library(withr)
    ## Bioconductor packages.
    library(DESeq2)
    library(DOSE)
    library(clusterProfiler)
    library(enrichplot)
    library(pathview)
    ## Acid Genomics packages.
    library(goalie)
    library(AcidGenomes)
    library(basejump)
    library(DESeqAnalysis)
    library(AcidGSEA)
    library(bcbioRNASeq)
})
prepareTemplate()
source("_setup.R")
## May encounter warning from ggrepel package otherwise.
options("ggrepel.max.overlaps" = Inf)
## Avoid logical coercion error in DOSE.
Sys.setenv(
    "_R_CHECK_LENGTH_1_CONDITION_" = "false",
    "_R_CHECK_LENGTH_1_LOGIC2_" = "false"
)
## nolint end
keggPlotsDir <- file.path(params[["results_dir"]], "kegg-plots")
invisible(lapply(
    X = c(
        params[["data_dir"]],
        params[["results_dir"]],
        keggPlotsDir
    ),
    FUN = initDir
))
assert(isSubset(params[["go_class"]], c("BP", "CC", "MF")))
pathview <- params[["pathview"]]

Prepare variables for enrichment analysis

First, we need to load the DESeqAnalysis object, which has been generated from our bcbioRNASeq object during the differential expression step.

## Ensure that our DESeqAnalysis object contains NCBI gene identifiers.
deseq <- import(params[["deseq_file"]])
deseq <- updateObject(deseq)
assert(
    is(deseq, "DESeqAnalysis"),
    validObject(deseq),
    isSubset(c("geneId", "ncbiGeneId"), colnames(rowData(deseq@data)))
)
rowRanges(deseq@data) <- EnsemblGenes(rowRanges(deseq@data))
if (!isSubset("geneIdNoVersion", colnames(rowData(deseq@data)))) {
    rowData(deseq@data)[["geneIdNoVersion"]] <-
        rowData(deseq@data)[["geneId"]]
    rowData(deseq@transform)[["geneIdNoVersion"]] <-
        rowData(deseq@transform)[["geneId"]]
}
rownames(deseq@data) <-
    as.character(rowData(deseq@data)[["geneIdNoVersion"]])
rownames(deseq@transform) <-
    as.character(rowData(deseq@transform)[["geneIdNoVersion"]])
if (is.list(deseq@results)) {
    deseq@results <- lapply(
        X = deseq@results,
        rownames = rownames(deseq@data),
        FUN = function(res, rownames) {
            rownames(res) <- rownames
            res
        }
    )
}
if (is.list(deseq@lfcShrink)) {
    deseq@lfcShrink <- lapply(
        X = deseq@lfcShrink,
        rownames = rownames(deseq@data),
        FUN = function(res, rownames) {
            rownames(res) <- rownames
            res
        }
    )
}
assert(validObject(deseq))
print(deseq)

We need to extract the corresponding DESeqDataSet and DESeqResults objects.

dds <- as.DESeqDataSet(deseq)
print(dds)
## > help(topic = "results", package = "DESeqAnalysis")
## > help(topic = "resultsNames", package = "DESeqAnalysis")
res <- DESeqAnalysis::results(object = deseq, i = params[["results_name"]])
assert(
    is(res, "DESeqResults"),
    validObject(res)
)
## Not allowing the user to apply post-hoc LFC filtering in params.
alphaThreshold <- alphaThreshold(res)
lfcThreshold <- lfcThreshold(res)
print(res)

We need to get the relevant Bioconductor OrgDb package from the full Latin organism name (e.g. "Homo sapiens" to "org.Hs.eg.db").

organism <- organism(dds)
match <- stri_match_first_regex(
    str = organism,
    pattern = "^([A-Z])[a-z]+\\s([a-z])[a-z]+$"
)
orgDb <- paste(
    "org",
    paste(match[, 2L:3L], collapse = ""),
    "eg",
    "db",
    sep = "."
)
assert(isInstalled(orgDb))
rm(match)

Significant results vectors

Now we can prepare the input required for enrichment analysis.

## Subset NA adjusted P values.
res <- res[!is.na(res[["padj"]]), , drop = FALSE]
res <- res[order(res[["padj"]]), , drop = FALSE]
## Match the DESeqDataSet.
dds <- dds[rownames(res), , drop = FALSE]
## Set our background genes vector (aka universe).
allEnsemblGenes <- rownames(res)
## > help(topic = "deg", package = "DESeqAnalysis")
sigEnsemblGenes <- deg(object = res, direction = "both")
sigEnsemblRes <- res[sigEnsemblGenes, , drop = FALSE]
## LFC ranked list vector.
ensemblLfcVec <- sigEnsemblRes[["log2FoldChange"]]
names(ensemblLfcVec) <- rownames(sigEnsemblRes)
## Sort from upregulated to downregulated.
ensemblLfcVec <- sort(ensemblLfcVec, decreasing = TRUE)
summary(ensemblLfcVec)

Ensembl-to-NCBI gene identifier mappings

NCBI gene (Entrez) identifiers are required for [Kyoto Encyclopedia of Genes and Genomes (KEGG)][KEGG] analysis. Here we are defining 1:1 mappings of the Ensembl gene identifiers to NCBI gene identifiers. For genes that map to multiple NCBI identifiers, we are using the oldest identifier to define the 1:1 mapping.

map <- EnsemblToNcbi(dds)
assert(isSubset(rownames(map), rownames(dds)))
allNcbiGenes <- map[["ncbiGeneId"]]
names(allNcbiGenes) <- rownames(map)
str(allNcbiGenes)

Now we can obtain a vector of significant genes and LFCs that map to Entrez.

idx <- na.omit(match(x = sigEnsemblGenes, table = names(allNcbiGenes)))
sigNcbiGenes <- allNcbiGenes[idx]
ncbiLfcVec <- sigEnsemblRes[names(sigNcbiGenes), "log2FoldChange", drop = TRUE]
names(ncbiLfcVec) <- unname(sigNcbiGenes)
## Sort from upregulated to downregulated.
ncbiLfcVec <- sort(ncbiLfcVec, decreasing = TRUE)
str(ncbiLfcVec)

GSEA ranked lists

A common approach in analyzing gene expression profiles was identifying differential expressed genes that are deemed interesting. The enrichment analysis we demonstrated previously were based on these differentially expressed genes. This approach will find genes where the difference is large, but it will not detect a situation where the difference is small, but evidenced in coordinated way in a set of related genes. [Gene Set Enrichment Analysis (GSEA)][GSEA] directly addresses this limitation. All genes can be used in [GSEA][]; [GSEA][] aggregates the per gene statistics across genes within a gene set, therefore making it possible to detect situations where all genes in a predefined set change in a small but coordinated way. Since it is likely that many relevant phenotypic differences are manifested by small but consistent changes in a set of genes.

## > help(topic = "RankedList", package = "AcidGSEA")
## Assuming the DESeq analysis input contains Ensembl reference genome. The
## gene identifiers should be defined as `"geneId"` in DESeqDataSet rowData.
rankedListEnsembl <- RankedList(
    object = deseq,
    value = "stat",
    keyType = "ensemblGeneId"
)
rankedListEnsembl <- rankedListEnsembl[[params[["results_name"]]]]
## NCBI gene identifiers must be defined in the `"ncbiGeneId"` column of
## DESeqDataSet rowData, otherwise this step will error.
rankedListNcbi <- RankedList(
    object = deseq,
    value = "stat",
    keyType = "ncbiGeneId"
)
rankedListNcbi <- rankedListNcbi[[params[["results_name"]]]]
assert(
    is.numeric(rankedListEnsembl),
    is.numeric(rankedListNcbi)
)

GO enrichment analysis

[Gene Ontology (GO)][GO] term enrichment is a technique for interpreting sets of genes making use of the [Gene Ontology][GO] system of classification, in which genes are assigned to a set of predefined bins depending on their functional characteristics.

## > help(topic = "enrichGO", package = "clusterProfiler")
enrichGo <- enrichGO(
    gene = sigEnsemblGenes,
    OrgDb = orgDb,
    keyType = "ENSEMBL",
    ont = params[["go_class"]],
    universe = allEnsemblGenes,
    qvalueCutoff = 0.05,
    readable = TRUE
)
enrichGoDf <-
    enrichGo |>
    slot("result") |>
    as.data.frame() |>
    camelCase()
saveData(
    enrichGo, enrichGoDf,
    dir = params[["data_dir"]]
)
export(
    object = enrichGoDf,
    con = file.path(
        params[["results_dir"]],
        paste0(
            paste(
                "go",
                tolower(params[["go_class"]]),
                "clusterprofiler",
                "padj",
                alphaThreshold,
                "lfc",
                lfcThreshold,
                sep = "_"
            ),
            ".csv.gz"
        )
    )
)

Dot plot

## > help(topic = "dotplot", package = "enrichplot")
try({
    suppressWarnings({
        dotplot(enrichGo, showCategory = 25L)
    })
})

GO terms map

## > help(topic = "emapplot", package = "enrichplot")
try({
    suppressWarnings({
        emapplot(
            x = pairwise_termsim(enrichGo),
            showCategory = 25L
        )
    })
})

Gene map

In order to consider the potentially biological complexities in which a gene may belong to multiple annotation categories, and provide information of numeric changes if available.

Here we are plotting genes colored by LFC for top significant [GO][] terms.

## > help(topic = "cnetplot", package = "enrichplot")
try({
    suppressWarnings({
        cnetplot(enrichGo, foldChange = ensemblLfcVec)
    })
})

GO GSEA analysis

Now we're ready to run GSEA using the GO terms.

## > help(topic = "gseGO", package = "clusterProfiler")
gseaGo <- gseGO(
    geneList = rankedListEnsembl,
    ont = params[["go_class"]],
    OrgDb = get(orgDb, envir = asNamespace(orgDb)),
    keyType = "ENSEMBL",
    pvalueCutoff = 0.05
)
gseaGoDf <-
    gseaGo |>
    slot("result") |>
    as.data.frame() |>
    camelCase()
saveData(
    gseaGo, gseaGoDf,
    dir = params[["data_dir"]]
)
export(
    object = gseaGoDf,
    con = file.path(
        params[["results_dir"]],
        paste0(
            paste(
                "gsea",
                "clusterprofiler",
                "padj",
                alphaThreshold,
                "lfc",
                lfcThreshold,
                sep = "_"
            ),
            ".csv.gz"
        )
    )
)

KEGG enrichment analysis

Match the KEGG organism code (e.g. "hsa" for Homo sapiens).

## > help(topic = "search_kegg_organism", package = "clusterProfiler")
keggCode <-
    search_kegg_organism(organism, "scientific_name") |>
    getElement("kegg_code")
assert(
    isString(keggCode),
    isMatchingRegex(pattern = "^[a-z]{3}$", x = keggCode)
)
## > help(topic = "enrichKEGG", package = "clusterProfiler")
kegg <- enrichKEGG(
    gene = as.character(sigNcbiGenes),
    organism = keggCode,
    keyType = "ncbi-geneid",
    universe = as.character(allNcbiGenes),
    qvalueCutoff = 0.05
)
keggDf <-
    kegg |>
    slot("result") |>
    as.data.frame() |>
    camelCase()
saveData(
    kegg, keggDf,
    dir = params[["data_dir"]]
)
export(
    object = keggDf,
    con = file.path(
        params[["results_dir"]],
        paste0(
            paste(
                "kegg",
                "clusterprofiler",
                "padj",
                alphaThreshold,
                "lfc",
                lfcThreshold,
                sep = "_"
            ),
            ".csv.gz"
        )
    )
)

KEGG GSEA analysis

[GSEA][] analysis is performed with the [clusterProfiler][] tool using KEGG gene sets and using the log2 fold changes as input. By using the log2 fold changes as the input, we are identifying pathways with genes that exhibit coordinated fold changes that are larger than might be expected by chance. The significant pathways can be visualized using the log2 fold changes with the Pathview tool.

Gene set enrichment analysis tools use ranked lists of genes (here ranked by log2FC) without using a threshold. This allows the tools to use more information to identify enriched biological processes. The introduction to gene set enrichment analysis goes into more detail about some of the advantages of this approach. By using the log2 fold changes as the input, we are identifying pathways with genes that exhibit coordinated fold changes that are larger than might be expected by chance. The significant pathways can be visualized using the log2 fold changes with the [pathview][] tool.

The significantly dysregulated pathways (q-value (FDR) less than 0.05) are displayed below in the pathway images, which show the degree of dysregulation of the genes (the minus direction (green) is down-regulated, while the positive direction (red) is up-regulated).

When performing [GSEA][] analysis it may be useful to adjust the minGSSize and/or maxGSSize parameter based on the pathways you would like to search for significance. If you are interested in smaller pathways (e.g. a gene set size of 24 genes), then you would want to adjust the minGSSize to less than 24. If you are only interested in larger pathways, then you would want to adjust the GSSize to a larger number. The fewer pathways tested, the less we need to correct for multiple test correction, so by adjusting the minGSSize and maxGSSize parameters you can test fewer pathways and limit testing to the pathways of interest.

## > help(topic = "gseKEGG", package = "clusterProfiler")
gseaKegg <- gseKEGG(
    geneList = ncbiLfcVec,
    organism = keggCode,
    keyType = "ncbi-geneid"
)
gseaKeggDf <-
    gseaKegg |>
    slot("result") |>
    as.data.frame() |>
    camelCase()
saveData(
    gseaKegg, gseaKeggDf,
    dir = params[["data_dir"]]
)
export(
    object = gseaKeggDf,
    con = file.path(
        params[["results_dir"]],
        paste0(
            paste(
                "gsea",
                "kegg",
                "clusterprofiler",
                "padj",
                alphaThreshold,
                "lfc",
                lfcThreshold,
                sep = "_"
            ),
            ".csv.gz"
        )
    )
)

Pathview {.tabset}

## > help(topic = "pathview", package = "pathview")
##
## This requires Entrez identifiers to be defined in the DESeqAnalysis object
## defined in params.
##
## The `gene.data` vector must be numeric with Entrez identifiers as names.
##
## There is currently no way to set the output path of the pathview PNG files,
## so we're changing the working directory temporarily using the withr package.
##
## Also, We're using `tryCatch()` here to return to the user any pathways that
## didn't output graphics correctly.
##
## Using `withTimeout` here to harden against very large pathway sets
## (e.g.`"mmu01100"` for F1000 paper example) that can fail to plot.
##
## dplyr may need to be unloaded for pathview to work.
pathways <- gseaKeggDf[["id"]]
if (hasLength(pathways)) {
    ## > suppressWarnings({
    ## >     detach("package:dplyr", unload = TRUE, force = TRUE))
    ## > })
    with_dir(
        new = keggPlotsDir,
        code = {
            invisible(lapply(
                X = pathways,
                FUN = function(pathway) {
                    tryCatch(
                        expr = withTimeout(
                            expr = {
                                pathview(
                                    gene.data = ncbiLfcVec,
                                    pathway.id = pathway,
                                    gene.idtype = "entrez",
                                    gene.annotpkg = NULL,
                                    species = keggCode,
                                    ## Use bidirectional coloring.
                                    limit = list(gene = 2L)
                                )
                            },
                            timeout = 60L
                        ),
                        error = function(e) {
                            warning(sprintf("%s failed to plot.", pathway))
                        }
                    )
                }
            ))
        }
    )
    figures <- list.files(
        path = keggPlotsDir,
        pattern = "pathview",
        full.names = TRUE
    )
    invisible(lapply(
        X = figures,
        FUN = function(figure) {
            markdownHeader(basename(figure), level = 4L, asis = TRUE)
            cat(paste0("<img src=\"", figure, "\">\n"))
        }
    ))
}




hbc/bcbioRNASeq documentation built on March 28, 2024, 3:01 p.m.