R/fgseaORA.R

Defines functions collapsePathwaysORA fora

Documented in collapsePathwaysORA fora

#' Simple overrepresentation analysis based on hypergeometric test
#' @param pathways List of gene sets to check.
#' @param genes Set of query genes
#' @param universe A universe from whiche `genes` were selected
#' @param minSize Minimal size of a gene set to test. All pathways below the threshold are excluded.
#' @param maxSize Maximal size of a gene set to test. All pathways above the threshold are excluded.
#' @return A table with ORA results. Each row corresponds to a tested pathway.
#' The columns are the following:
#' \itemize{
#'  \item pathway -- name of the pathway as in `names(pathway)`;
#'  \item pval -- an enrichment p-value from hypergeometric test;
#'  \item padj -- a BH-adjusted p-value;
#'  \item overlap -- size of the overlap;
#'  \item size -- size of the gene set;
#'  \item leadingEdge -- vector with overlapping genes.
#' }
#'
#' @export
#' @examples
#' data(examplePathways)
#' data(exampleRanks)
#' foraRes <- fora(examplePathways, genes=tail(names(exampleRanks), 200), universe=names(exampleRanks))
fora <- function(pathways, genes, universe, minSize=1, maxSize=Inf) {
    # Error if pathways is not a list
    if (!is.list(pathways)) {
        stop("pathways should be a list with each element containing genes from the universe")
    }

    # Warning message for duplicate gene names
    if (any(duplicated(universe))) {
        warning("There were duplicate genes in universe, they were collapsed")
        universe <- unique(universe)
    }

    minSize <- max(minSize, 1)

    pathwaysFiltered <- lapply(pathways, function(p) { unique(na.omit(fmatch(p, universe))) })
    pathwaysSizes <- sapply(pathwaysFiltered, length)

    toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize)

    pathwaysFiltered <- pathwaysFiltered[toKeep]
    pathwaysSizes <- pathwaysSizes[toKeep]


    if (!all(genes %in% universe)) {
        warning("Not all of the input genes belong to the universe, such genes were removed")
    }
    genesFiltered <- unique(na.omit(fmatch(genes, universe)))

    overlaps <- sapply(pathwaysFiltered, intersect, genesFiltered)

    overlapGenes <- lapply(overlaps, function(x) universe[x])

    overlapsT <- data.table(
        q=sapply(overlaps, length),
        m=sapply(pathwaysFiltered, length),
        n=length(universe)-sapply(pathwaysFiltered, length),
        k=length(genesFiltered))

    # q-1 because we want probability of having >=q white balls
    pathways.pvals <- with(overlapsT,
                           phyper(q-1, m, n, k, lower.tail = FALSE))

    res <- data.table(pathway=names(pathwaysFiltered),
                      pval=pathways.pvals,
                      padj=p.adjust(pathways.pvals, method="BH"),
                      overlap=overlapsT$q,
                      size=overlapsT$m,
                      overlapGenes=overlapGenes)

    res <- res[order(pval),]
    res
}

#' Collapse list of enriched pathways to independent ones. Version for ORA hypergeometric test.
#'
#' @param foraRes Table with results of running fgsea(), should be filtered
#'                 by p-value, for example by selecting ones with padj < 0.01.
#' @param pathways List of pathways, should contain all the pathways present in
#'                 `fgseaRes`.
#' @param genes Set of query genes, same as in `fora()`
#' @param universe A universe from whiche `genes` were selected, same as in `fora()`
#' @param pval.threshold Two pathways are considered dependent when p-value
#'                       of enrichment of one pathways on background of another
#'                       is greater then `pval.threshold`.
#' @return Named list with two elments: `mainPathways` containing IDs of pathways
#'         not reducable to each other, and `parentPathways` with vector describing
#'         for all the pathways to which ones they can be reduced. For
#'         pathways from `mainPathwyas` vector `parentPathways` contains `NA` values.
#' @examples
#' data(examplePathways)
#' data(exampleRanks)
#' foraRes <- fora(examplePathways, genes=tail(names(exampleRanks), 200), universe=names(exampleRanks))
#' collapsedPathways <- collapsePathwaysORA(foraRes[order(pval)][padj < 0.01],
#'                                              examplePathways,
#'                                              genes=tail(names(exampleRanks), 200),
#'                                              universe=names(exampleRanks))
#'
#' mainPathways <- foraRes[pathway %in% collapsedPathways$mainPathways][
#'                           order(pval), pathway]
#' @export
collapsePathwaysORA <- function(foraRes,
                             pathways,
                             genes,
                             universe,
                             pval.threshold=0.05) {

    pathways <- pathways[foraRes$pathway]
    pathways <- lapply(pathways, intersect, universe)

    parentPathways <- setNames(rep(NA, length(pathways)), names(pathways))

    for (i in seq_along(pathways)) {
        p <- names(pathways)[i]
        if (!is.na(parentPathways[p])) {
            next
        }

        pathwaysToCheck <- setdiff(names(which(is.na(parentPathways))), p)

        if (length(pathwaysToCheck) == 0) {
            break
        }

        minPval <- setNames(rep(1, length(pathwaysToCheck)), pathwaysToCheck)

        u1 <- setdiff(universe, pathways[[p]])
        foraRes1 <- fora(pathways = pathways[pathwaysToCheck],
                         genes=intersect(genes, u1),
                         universe=u1,
                         maxSize=length(u1)-1)
        minPval[foraRes1$pathway] <- pmin(minPval[foraRes1$pathway], foraRes1$pval)

        u2 <- pathways[[p]]
        foraRes2 <- fora(pathways = pathways[pathwaysToCheck],
                                genes=intersect(genes, u2),
                                universe=u2,
                                maxSize=length(u2)-1)
        minPval[foraRes2$pathway] <- pmin(minPval[foraRes2$pathway], foraRes2$pval)

        parentPathways[names(which(minPval > pval.threshold))] <- p
    }

    return(list(mainPathways=names(which(is.na(parentPathways))),
                parentPathways=parentPathways))
}

Try the fgsea package in your browser

Any scripts or data that you put into this service are public.

fgsea documentation built on Nov. 8, 2020, 5:22 p.m.