R/gsea.R

Defines functions fgseaCellID integrateGSEASeurat GetGSEAMatrix RunGroupGSEA.SingleCellExperiment RunGroupGSEA.Seurat RunGroupGSEA RunCellGSEA.SingleCellExperiment RunCellGSEA.Seurat RunCellGSEA

Documented in fgseaCellID GetGSEAMatrix integrateGSEASeurat RunCellGSEA RunCellGSEA.Seurat RunCellGSEA.SingleCellExperiment RunGroupGSEA RunGroupGSEA.Seurat RunGroupGSEA.SingleCellExperiment

#' Run Gene Set Enrichment Analysis on cells
#'
#' Calculate cells gene specificty ranking and then perform geneset enrichment analysis on it.
#'
#' @param X Seurat or SingleCellExperiment object
#' @param pathways List of gene sets to check
#' @param reduction Which dimensionality reduction to use, must be based on MCA.
#' @param dims A vector of integers indicating which dimensions to use with reduction embeddings and loadings for distance calculation.
#' @param features Character vector of feature names to subset feature coordinates. If not specified will take all features available from specified reduction Loadings.
#' @param cells Character vector of cell names to subset cell coordinates. If not specified will take all features available from specified reduction Embeddings
#' @param nperm Number of permutations to do. Minimial possible nominal p-value is about 1/nperm
#' @param minSize Minimal size of a gene set to test. All pathways below the threshold are excluded.
#' @param maxSize Maximal size of a gene set to test. All pathways above the threshold are excluded.
#' @param gseaParam GSEA parameter value, all gene-level statis are raised to the power of 'gseaParam' before calculation of GSEA enrichment scores
#' @param n.core A single integer to specify the number of core for parallelisation.
#'
#' @return A data.table with geneset enrichment analysis statistics.
#' @export
#'
#' @examples
#' seuratPbmc <- RunMCA(seuratPbmc, nmcs = 5)
#' GSEAResults <- RunCellGSEA(seuratPbmc, Hallmark, dims = 1:5)
RunCellGSEA <- function(X, pathways, reduction, dims, features, cells, nperm, minSize, maxSize, gseaParam, n.core) {
    UseMethod("RunCellGSEA", X)
}

#' @rdname RunCellGSEA
#' @export
RunCellGSEA.Seurat <-
    function(X,
             pathways,
             reduction = "mca",
             dims = seq(50),
             features = NULL,
             cells = NULL,
             nperm = 1000,
             minSize = 10,
             maxSize = 500,
             gseaParam = 0,
             n.core = 1) {
        # Check -------------------------------------------------------------------
        check <-
            checkCellIDArg(
                X         = X,
                reduction = reduction,
                dims      = dims,
                features  = features,
                cells     = cells
            )
        features <- check$features
        dims     <- check$dims
        cells    <- check$cells

        # Parallel ----------------------------------------------------------------
        BPPARAM <- BiocParallel::bpparam()
        BPPARAM$workers <- n.core
        BPPARAM$tasks <- as.integer(n.core * 5)
        BPPARAM$progressbar <- TRUE
        
        # Ranking -----------------------------------------------------------------
        CellGeneDist <-
            GetCellGeneDistance(
                X,
                reduction = reduction,
                dims      = dims,
                features  = features,
                cells     = cells
            )
        CellGeneDist <- as.data.table(CellGeneDist, "features")
        features <- CellGeneDist$features
        message("starting GSEA")
        gseaResults <- bplapply(CellGeneDist[,-1], fgseaCellIDPar,
                   pathways = pathways,
                   features = features,
                   nperm = nperm,
                   minSize = minSize,
                   maxSize = maxSize,
                   BPPARAM = BPPARAM
        )
        gseaResults <- rbindlist(gseaResults, idcol = "cell")
        return(gseaResults)
    }

#' @rdname RunCellGSEA
#' @export
RunCellGSEA.SingleCellExperiment <-
    function(X,
             pathways,
             reduction = "mca",
             dims = seq(50),
             features = NULL,
             cells = NULL,
             nperm = 1000,
             minSize = 10,
             maxSize = 500,
             gseaParam = 0,
             n.core = 1) {
        # Check -------------------------------------------------------------------
        check <-
            checkCellIDArg(
                X = X,
                reduction = reduction,
                dims = dims,
                features = features,
                cells = cells
            )
        features <- check$features
        dims <- check$dims
        cells <- check$cells

        # Parallel ----------------------------------------------------------------
        BPPARAM <- BiocParallel::bpparam()
        BPPARAM$workers <- n.core
        BPPARAM$tasks <- as.integer(n.core * 5)
        BPPARAM$progressbar <- TRUE
        
        # Ranking -----------------------------------------------------------------
        CellGeneDist <-
            GetCellGeneDistance(
                X,
                reduction = reduction,
                dims = dims,
                features = features,
                cells = cells
            )
        CellGeneDist <- as.data.table(CellGeneDist, "features")
        features <- CellGeneDist$features
        message("starting GSEA")
        bplapply(CellGeneDist[,-1], fgseaCellIDPar,
                 pathways = pathways,
                 features = features,
                 nperm = nperm,
                 minSize = minSize,
                 maxSize = maxSize,
                 BPPARAM = BPPARAM
        )
        gseaResults <- rbindlist(gseaResults, idcol = "cell")
        return(gseaResults)
    }


#' Run GSEA on cluster/groups
#'
#' Calculate group gene specificty ranking and then perform geneset enrichment analysis on it.
#'
#' @param X pathways List of gene sets to check
#' @param pathways reduction Which dimensionality reduction to use, must be based on MCA.
#' @param group.by dims A vector of integers indicating which dimensions to use with reduction embeddings and loadings for distance calculation.
#' @param reduction features Character vector of feature names to subset feature coordinates. If not specified will take all features available from specified reduction Loadings.
#' @param dims cells Character vector of cell names to subset cell coordinates. If not specified will take all features available from specified reduction Embeddings
#' @param features cells Character vector of cell names to subset cell coordinates. If not specified will take all features available from specified reduction Embeddings
#' @param nperm nperm Number of permutations to do. Minimial possible nominal p-value is about 1/nperm
#' @param minSize minSize Minimal size of a gene set to test. All pathways below the threshold are excluded.
#' @param maxSize maxSize Maximal size of a gene set to test. All pathways above the threshold are excluded.
#' @param gseaParam gseaParam GSEA parameter value, all gene-level statis are raised to the power of 'gseaParam' before calculation of GSEA enrichment scores
#'
#' @return A data.table with geneset enrichment analysis statistics.
#' @export
#'
#' @examples
#' seuratPbmc <- RunMCA(seuratPbmc, nmcs = 5)
#' GSEAResults <- RunGroupGSEA(seuratPbmc, Hallmark, group.by = "seurat_clusters", dims = 1:5)
RunGroupGSEA <-
    function(X, pathways, group.by, reduction, dims, features, nperm, minSize, maxSize, gseaParam) {
        UseMethod("RunGroupGSEA", X)
    }


#' @rdname RunGroupGSEA
#' @export
RunGroupGSEA.Seurat <-
    function(X, pathways, group.by = NULL, reduction = "mca", dims = seq(50), features = NULL, nperm = 1000, minSize = 10, maxSize = 500, gseaParam = 0) {
        check <-
            checkCellIDArg(
                X = X,
                group.by = group.by,
                reduction = reduction,
                dims = dims,
                features = features,
                cells = NULL
            )
        features <- check$features
        dims <- check$dims
        group.by <- check$group.by
        message("\nranking genes \n")
        # Ranking -----------------------------------------------------------------
        GroupGeneRanking <-
            GetGroupGeneRanking(
                X = X,
                group.by = group.by,
                reduction = reduction,
                dims = dims,
                features = features
            )
        GroupGeneRanking <-
            lapply(GroupGeneRanking, function(i)
                (i + 0.1)^-1)
        gseaResultsAll <- lapply(GroupGeneRanking, function(x,
                                                                pathways,
                                                                nperm,
                                                                minSize,
                                                                maxSize) {
            stats <- x
            gseaResults <- CellID::fgseaCellID(
                pathways = pathways,
                stats = stats,
                nperm = nperm,
                minSize = minSize,
                maxSize = maxSize
            )}, pathways = pathways,
            nperm = nperm,
            minSize = minSize,
            maxSize = maxSize)
        gseaResultsAll <-
            rbindlist(gseaResultsAll, idcol = "cell")
        setnames(gseaResultsAll, "cell", "group")
        return(gseaResultsAll)
    }

#' @rdname RunGroupGSEA
#' @export
RunGroupGSEA.SingleCellExperiment <-
    function(X, pathways, group.by, reduction = "MCA", dims = seq(50), features = NULL, nperm = 1000, minSize = 10, maxSize = 500, gseaParam = 0) {
        check <-
            checkCellIDArg(
                X = X,
                group.by = group.by,
                reduction = reduction,
                dims = dims,
                features = features,
                cells = NULL
            )
        features <- check$features
        dims <- check$dims
        group.by <- check$group.by
        message("\nranking genes \n")
        # Ranking -----------------------------------------------------------------
        GroupGeneRanking <-
            GetGroupGeneRanking(
                X = X,
                group.by = group.by,
                reduction = reduction,
                dims = dims,
                features = features
            )
        GroupGeneRanking <-
            lapply(GroupGeneRanking, function(i)
                (i + 0.1)^-1)
        gseaResultsAll <- lapply(GroupGeneRanking, function(x,
                                                                pathways,
                                                                features,
                                                                nperm,
                                                                minSize,
                                                                maxSize) {
            stats <- x
            gseaResults <- CellID::fgseaCellID(
                pathways = pathways,
                stats = stats,
                nperm = nperm,
                minSize = minSize,
                maxSize = maxSize,
                .progress = TRUE
            )})
        gseaResultsAll <-
            rbindlist(gseaResultsAll, idcol = "cell")
        setnames(gseaResultsAll, "cell", "group")
        return(gseaResultsAll)
    }


#' Get Matrix from Enrichment Results
#'
#' @param X an enrichment results obtained by RunGroupGSEA or RunCellGSEA
#' @param metric a character indicating which metric to use as value of matrix (ES, NES, padj, pval)
#'
#' @return A matrix of geneset enrichment metric with cell/group at columns and pathways/genesets at rows
#' @export
#'
#' @examples
#' seuratPbmc <- RunMCA(seuratPbmc, nmcs = 5)
#' GSEAResults <- RunGroupGSEA(seuratPbmc, Hallmark, group.by = "seurat_clusters", dims = 1:5)
#' GSEAMatrix <- GetGSEAMatrix(GSEAResults)
GetGSEAMatrix <- function(X, metric = "ES") {
    names(X)[1] <- "cell"
    dt <- dcast(X, pathway ~ cell, value.var = metric)
    gsea_mat <- as.matrix(dt[, -1])
    rownames(gsea_mat) <- dt$pathway
    return(gsea_mat)
}

#' GSEA Results Integration in Seurat object
#'
#' @param X a Seurat object
#' @param gseaResults Results of GSEA obtained by RunCellGSEA or RunGroupGSEA
#' @param assay.name The name of the new Seurat assay slot to include GSEA results
#' @return a Seurat object with a new assay slot filled with GSE analysis resultsq
integrateGSEASeurat <- function(X, gseaResults, assay.name) {
    message("\n\ngenerating seurat object\n")
    ES_matrix <-
        GetGSEAMatrix(gseaResults, metric = "ES")
    coln <- sort(colnames(ES_matrix))
    rown <- sort(rownames(ES_matrix))
    ES_matrix <- ES_matrix[rown, coln]
    NES_matrix <-
        GetGSEAMatrix(gseaResults, metric = "NES")
    Scaled_NES_matrix <- t(apply(NES_matrix, 1, scale))
    colnames(Scaled_NES_matrix) <- colnames(NES_matrix)
    message("\nreturning seurat object\n")
    X[[assay.name]] <-
        CreateAssayObject(counts = Matrix::Matrix(ES_matrix, sparse = TRUE))
    X <-
        SetAssayData(X, slot = "data", NES_matrix, assay = assay.name)
    X <-
        SetAssayData(X, slot = "scale.data", Scaled_NES_matrix, assay = assay.name)
    return(X)
}


# fgsea -------------------------------------------------------------------

# Extracted from fgsea
calcGseaStatCumulativeBatch <-
    getFromNamespace("calcGseaStatCumulativeBatch", "fgsea")

#' reworked fgsea for ram and speed efficiency in CellID
#'
#' @param pathways List of gene sets to check
#' @param stats Named vector of gene-level stats. Names should be the same as in 'pathways'
#' @param nperm Number of permutations to do. Minimial possible nominal p-value is about 1/nperm
#' @param minSize Minimal size of a gene set to test. All pathways below the threshold are excluded.
#' @param maxSize Maximal size of a gene set to test. All pathways above the threshold are excluded.
#' @param gseaParam GSEA parameter value, all gene-level statis are raised to the power of 'gseaParam' before calculation of GSEA enrichment scores
#'
#' @return A table with GSEA results. Each row corresponds to a tested pathway.
#' The columns are the following:
#' \itemize{
#'   \item pathway -- name of the pathway as in `names(pathway)`;
#'   \item pval -- an enrichment p-value;
#'   \item padj -- a BH-adjusted p-value;
#'   \item ES -- enrichment score, same as in Broad GSEA implementation;
#'   \item NES -- enrichment score normalized to mean enrichment of random samples of the same size;
#'   \item nMoreExtreme` -- a number of times a random gene set had a more
#'      extreme enrichment score value;
#'   \item size -- size of the pathway after removing genes not present in `names(stats)`.
#'   \item leadingEdge -- vector with indexes of leading edge genes that drive the enrichment, see \url{http://software.broadinstitute.org/gsea/doc/GSEAUserGuideTEXT.htm#_Running_a_Leading}.
#' }
#' @export
fgseaCellID <-
    function(pathways, stats, nperm = 1000, minSize = 10, maxSize = 500, gseaParam = 0) {
        # remove note
        . <- NULL
        ties <- sum(duplicated(stats[stats != 0]))
        if (any(duplicated(names(stats)))) {
            warning("There are duplicate gene names, fgsea may produce unexpected results")
        }
        granularity <- 1000
        seeds <- 1
        minSize <- max(minSize, 1)
        stats <- sort(stats, decreasing = TRUE)
        stats <- abs(stats)^gseaParam
        pathwaysFiltered <- lapply(pathways, function(p) {
            as.vector(na.omit(fastmatch::fmatch(p, names(stats))))
        })
        pathwaysSizes <-
            vapply(X = pathwaysFiltered, FUN = length, FUN.VALUE = as.integer(0)
            )
        toKeep <-
            which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize)
        m <- length(toKeep)
        if (m == 0) {
            return(
                data.table(
                    pathway = character(),
                    pval = numeric(),
                    padj = numeric(),
                    ES = numeric(),
                    NES = numeric()
                )
            )
        }
        pathwaysFiltered <- pathwaysFiltered[toKeep]
        pathwaysSizes <- pathwaysSizes[toKeep]
        K <- max(pathwaysSizes)
        gseaStatRes <-
            vapply(
                X = pathwaysFiltered,
                FUN = calcGseaStat,
                FUN.VALUE = 0,
                stats = stats
            )
        pathwayScores <- gseaStatRes
        universe <- seq_along(stats)
        leEs <- rep(0, m)
        geEs <- rep(0, m)
        leZero <- rep(0, m)
        geZero <- rep(0, m)
        leZeroSum <- rep(0, m)
        geZeroSum <- rep(0, m)
        if (m == 1) {
            for (i in seq_len(nperm)) {
                randSample <- sample.int(length(universe), K)
                randEsP <- calcGseaStat(
                    stats = stats,
                    selectedStats = randSample,
                    gseaParam = gseaParam
                )
                leEs <- leEs + (randEsP <= pathwayScores)
                geEs <- geEs + (randEsP >= pathwayScores)
                leZero <- leZero + (randEsP <= 0)
                geZero <- geZero + (randEsP >= 0)
                leZeroSum <- leZeroSum + pmin(randEsP, 0)
                geZeroSum <- geZeroSum + pmax(randEsP, 0)
            }
        } else {
            aux <- calcGseaStatCumulativeBatch(
                stats = stats,
                gseaParam = gseaParam,
                pathwayScores = pathwayScores,
                pathwaysSizes = pathwaysSizes,
                iterations = nperm,
                seed = 1
            )
            leEs <- get("leEs", aux)
            geEs <- get("geEs", aux)
            leZero <- get("leZero", aux)
            geZero <- get("geZero", aux)
            leZeroSum <- get("leZeroSum", aux)
            geZeroSum <- get("geZeroSum", aux)
        }
        counts <- data.table(
            pathway = seq_len(m),
            leEs = leEs,
            geEs = geEs,
            leZero = leZero,
            geZero = geZero,
            leZeroSum = leZeroSum,
            geZeroSum = geZeroSum
        )

        leEs <- leZero <- geEs <- geZero <- leZeroSum <- geZeroSum <- NULL
        pathway <-
            padj <- pval <- ES <- NES <- geZeroMean <- leZeroMean <- NULL
        nMoreExtreme <- nGeEs <- nLeEs <- size <- NULL
        leadingEdge <- NULL
        pvals <- counts[, list(
            pval = min(
                (1 + sum(leEs)) / (1 + sum(leZero)),
                (1 + sum(geEs)) / (1 + sum(geZero))
            ),
            leZeroMean = sum(leZeroSum) / sum(leZero),
            geZeroMean = sum(geZeroSum) / sum(geZero),
            nLeEs = sum(leEs),
            nGeEs = sum(geEs)
        ), by = .(pathway)]
        pvals[, `:=`(padj, p.adjust(pval, method = "BH"))]
        pvals[, `:=`(ES, pathwayScores[pathway])]
        pvals[, `:=`(NES, ES / ifelse(ES > 0, geZeroMean, abs(leZeroMean)))]
        pvals[, `:=`(leZeroMean, NULL)]
        pvals[, `:=`(geZeroMean, NULL)]
        pvals[, `:=`(nLeEs, NULL)]
        pvals[, `:=`(nGeEs, NULL)]
        pvals[, `:=`(pathway, names(pathwaysFiltered)[pathway])]
        pvals <- pvals[]
        return(pvals)
    }

fgseaCellIDPar <- function(x, pathways, features, nperm, minSize, maxSize) {
    x <- (x+0.1)^-1
    names(x) <- features
    gseaResults <- CellID::fgseaCellID(
        pathways = pathways,
        stats = x,
        nperm = nperm,
        minSize = minSize,
        maxSize = maxSize
    )}
Cortalak/cellID documentation built on Aug. 3, 2020, 9:01 p.m.