R/run_fgsea.R

Defines functions run_gsea

Documented in run_gsea

#' Run GSEA to compare a gene list(s) to per cell or
#' per cluster expression data
#' @description Use fgsea algorithm to compute normalized enrichment
#' scores and pvalues for gene
#' set ovelap
#' @param expr_mat single-cell expression matrix or Seurat object
#' @param query_genes A vector or named list of vectors of genesets of interest
#' to compare via GSEA. If supplying a named list, then the gene set names
#' will appear in the output.
#' @param cluster_ids vector of cell cluster assignments, supplied as a
#' vector with order that
#' matches columns in `expr_mat`. Not required if running per cell.
#' @param n_perm Number of permutation for fgsea function. Defaults to 1000.
#' @param per_cell if true run per cell, otherwise per cluster.
#' @param scale convert expr_mat into zscores prior to running GSEA?,
#' default = FALSE
#' @param no_warnings suppress warnings from gsea ties
#' @return dataframe of gsea scores (pval, NES), with clusters as rownames
#' @examples
#' run_gsea(
#'     expr_mat = pbmc_matrix_small,
#'     query_genes = pbmc_vargenes[1:100],
#'     n_perm = 10,
#'     cluster_ids = pbmc_meta$classified,
#'     no_warnings = TRUE
#' )
#' @export
run_gsea <- function(expr_mat,
    query_genes,
    cluster_ids = NULL,
    n_perm = 1000,
    per_cell = FALSE,
    scale = FALSE,
    no_warnings = TRUE) {
    if (!is.list(query_genes)) {
        geneset_list <- list("query_genes" = query_genes)
    } else {
        geneset_list <- query_genes
    }

    if (!per_cell & (ncol(expr_mat) != length(cluster_ids))) {
        stop("cluster_ids do not match number of cells (columns) ",
            "in expr_mat",
            call. = FALSE
        )
    }

    if (n_perm > 1e4 & per_cell) {
        warning(
            "run_gsea() take a long time if running many ",
            "permutations and running per cell"
        )
    }

    if (scale) {
        expr_mat <- t(scale(t(as.matrix(expr_mat))))
    }

    if (!per_cell) {
        avg_mat <- average_clusters(expr_mat,
            metadata = cluster_ids
        )
    } else {
        avg_mat <- expr_mat
    }

    res <- list()
    for (i in seq_along(colnames(avg_mat))) {
        if (!(no_warnings)) {
            gsea_res <- fgsea::fgsea(
                geneset_list,
                avg_mat[, i],
                minSize = 1,
                maxSize = max(vapply(geneset_list,
                    length,
                    FUN.VALUE = numeric(1)
                )),
                nproc = 1,
                nperm = n_perm
            )
        } else {
            suppressWarnings(
                gsea_res <- fgsea::fgsea(
                    geneset_list,
                    avg_mat[, i],
                    minSize = 1,
                    maxSize = max(vapply(geneset_list,
                        length,
                        FUN.VALUE = numeric(1)
                    )),
                    nproc = 1,
                    nperm = n_perm
                )
            )
        }
        res[[i]] <- gsea_res[, c("pathway", "pval", "NES")]
    }
    gsea_res <- dplyr::bind_rows(res)
    gsea_res <-
        as.data.frame(dplyr::mutate(gsea_res, cell = colnames(avg_mat)))
    if (tibble::has_rownames(gsea_res)) {
        gsea_res <- tibble::remove_rownames(gsea_res)
    }
    gsea_res <- tibble::column_to_rownames(gsea_res, "cell")

    gsea_res
}

Try the clustifyr package in your browser

Any scripts or data that you put into this service are public.

clustifyr documentation built on Nov. 8, 2020, 5:32 p.m.