knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(cache = TRUE) knitr::opts_chunk$set(warning = FALSE) knitr::opts_chunk$set(message = FALSE) knitr::opts_chunk$set(out.extra = '')
This vignette describes how the gsfisher package can be used to test for over-representation of Gene Ontology (GO) categories, KEGG pathways and arbitrary genesets (including xCell cell type annotations) amongst the markers of single cell clusters.
We demonstrate the approach using data from the Seurat 3K PBMC tutorial. First we load the required R libraries.
library(dplyr) library(kableExtra) library(knitr) library(Seurat) library(patchwork) library(gsfisher)
Here we run a minimal set of commands to obtain the cluster markers for this dataset:
if(!file.exists("filtered_gene_bc_matrices/hg19/matrix.mtx")) { download.file("https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz", "pbmc3k_filtered_gene_bc_matrices.tar.gz") untar("pbmc3k_filtered_gene_bc_matrices.tar.gz") } if(!file.exists("pbmc.markers.rds") | !file.exists("pbmc.rds")) { pbmc.data <- Read10X(data.dir = "filtered_gene_bc_matrices/hg19/") pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200) pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5) pbmc <- NormalizeData(pbmc) pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000) all.genes <- rownames(pbmc) pbmc <- ScaleData(pbmc, features = all.genes) pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) pbmc <- FindNeighbors(pbmc, dims = 1:10) pbmc <- FindClusters(pbmc, resolution = 0.5) pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) saveRDS(pbmc, "pbmc.rds") saveRDS(pbmc.markers, "pbmc.markers.rds") } pbmc <- readRDS("pbmc.rds") pbmc.markers <- readRDS("pbmc.markers.rds")
We now have a table containing the positive marker genes for each cluster.
Before we can test the lists of cluster markers for geneset over-representation we must define a suitable background list of expressed genes. This is also sometimes referred to as the gene universe.
Here, to construct the background list we select genes that are expressed in the same fraction of cells as was required for inclusion in the testing with the Seurat FindAllMarkers function (i.e. the min.pct parameter).
In this case, we need to identify the set of genes which are expressed in 25% of the cluster cells, or 25% of the other cells (for any the clusters).
# Helper function for extracting the expressed genes from a seurat object getExpressedGenesFromSeuratObject <- function(seurat_object, clusters, min.pct=0.1) { expressed <- c() for(cluster in clusters) { # get genes detected in the cluster cluster_cells <- names(pbmc$seurat_clusters[pbmc$seurat_clusters==cluster]) clust_pcts <- apply(pbmc@assays$RNA@counts[,cluster_cells], 1, function(x) sum(x>0)/length(x)) detected_in_clust <- names(clust_pcts[clust_pcts>min.pct]) # get genes detected in the other cells other_cells <- names(pbmc$seurat_clusters[pbmc$seurat_clusters!=cluster]) other_pcts <- apply(pbmc@assays$RNA@counts[,other_cells], 1, function(x) sum(x>0)/length(x)) detected_in_other_cells <- names(other_pcts[other_pcts>min.pct]) expressed <- c(expressed, detected_in_clust, detected_in_other_cells) } expressed <- unique(expressed) expressed } expressed_genes <- getExpressedGenesFromSeuratObject( pbmc,unique(pbmc$seurat_clusters), min.pct=0.25)
Most of the genesets and pathways we are interested in are mapped to Entrez gene identifiers. In order to map the gene symbols (which are used by Seurat) to Entrez identifiers we use the "fetchAnnotation()" function from gsfisher to retrieve a table of gene annotations.
if(!file.exists("annotation.rds")) { annotation <- fetchAnnotation(species="hs", ensembl_version=NULL, ensembl_host = NULL) saveRDS(annotation, "annotation.rds") } annotation <- readRDS("annotation.rds")
Next we add the Entrez ids to the FindAllMarkers() results table and also translate the background gene symbols to entrez ids.
pbmc.markers$entrez_id <- as.character(annotation$entrez_id[ match(pbmc.markers$gene, annotation$gene_name)]) pbmc.markers <- pbmc.markers[!is.na(pbmc.markers$entrez_id),] background_entrez <- as.character(annotation$entrez_id[ match(expressed_genes, annotation$gene_name)]) background_entrez <- background_entrez[!is.na(background_entrez)]
We can now use the gsfisher "runGO.all()" function to test for GO category enrichments. Note that before doing so we filter the markers table to the level of significance (after multiple testing correction) that we are interested in.
pbmc.markers.filtered <- pbmc.markers[pbmc.markers$p_val_adj < 0.05,] if(!file.exists("go.results.rds")) { go.results <- runGO.all(results=pbmc.markers.filtered, species = "hs", background_ids = background_entrez, gene_id_col="entrez_id", gene_id_type="entrez", sample_col="cluster", p_col="p_val_adj", p_threshold=0.05) saveRDS(go.results, "go.results.rds") } go.results <- readRDS("go.results.rds")
We now have a results table that contains odds ratios and p-values for all GO categories from the biological process (BP), cellular component (CC) and molecular function (MF) sub-ontologies. Here the first 3 rows of the table are shown:
x <- go.results[1:3,] x$gene_names <- gsub(",",", ",x$gene_names) x$entrez_ids <- gsub(",",", ",x$entrez_ids) kable(x) %>% column_spec(1, width = "2em") %>% column_spec(2, width = "5em") %>% column_spec(3, width = "10em") %>% column_spec(4, width = "5em") %>% column_spec(5, width = "5em") %>% column_spec(6, width = "5em") %>% column_spec(7, width = "2em") %>% column_spec(8, width = "2em") %>% column_spec(9, width = "2em") %>% column_spec(10, width = "2em") %>% column_spec(11, width = "5em") %>% column_spec(12, width = "20em") %>% column_spec(13, width = "20em") %>% column_spec(14, width = "5em") %>% kable_styling(latex_options="scale_down")
We next filter the results table to discard genesets that we not interested in. In order to maintain statistical rigour these criteria should be decided in advance.
To filter the results table we use the gsfisher "filterGenesets()" convenience function which will also correct the p-values for multiple testing after the filtering.
Here we are only interested in the biological processes, GO categories that have at least 3 genes in the foreground list, GO categories that have no more than 500 genes (to avoid generic enrichments) and GO categories that have an over-representation odds ratio of at least 2 (we are not interested in small effect sizes).
go.results.filtered <- go.results[go.results$ontology=="BP",] go.results.filtered <- filterGenesets(go.results.filtered, min_foreground_genes = 3, max_genes_geneset = 500, min_odds_ratio = 2, p_col = "p.val", padjust_method = "BH", use_adjusted_pvalues = TRUE, pvalue_threshold = 0.05)
We can use the gsfisher "sampleEnrichmentDotplot()" function to make a dotplot showing the enrichments of the top 5 most significant categories by cluster. Here dot size is proportional to the number of genes in the category that were found as cluster markers (n_fg). The dot color is used to communicate the effect size (odds ratio).
go.results.top <- go.results.filtered %>% group_by(cluster) %>% top_n(n=5, -p.val) sampleEnrichmentDotplot(go.results.top, selection_col = "description", selected_genesets = unique(go.results.top$description), sample_id_col = "cluster", fill_var = "odds.ratio", maxl=50, title="GO biological pathway")
Sometimes many semi-redundant categories will be enriched for a cluster. To help understand the results we can cluster the enriched categories according to the similarity of their gene membership and visualise the resulting dendrogram using the gsfisher "visualiseClusteredGenesets()" function.
go.results.filtered.1 <- go.results.filtered[go.results.filtered$cluster==1,] visualiseClusteredGenesets(go.results.filtered.1, geneset_clust = NULL, odds_ratio_max_quantile = 0.9, highlight = c("T cell differentiation"), id_col = "geneset_id", name_col = "description", ngenes_col = "n_fg", odds_ratio_col = "odds.ratio")
To test for overenriched KEGG pathways we can use the gsfisher "runKEGG.all()" function.
if(!file.exists("kegg.results.rds")) { kegg.results <- runKEGG.all(results=pbmc.markers.filtered, species = "hs", background_ids = background_entrez, gene_id_col="entrez_id", gene_id_type="entrez", sample_col="cluster", p_col="p_val_adj", p_threshold=0.05) saveRDS(kegg.results,"kegg.results.rds") } kegg.results <- readRDS("kegg.results.rds") kegg.results.filtered <- filterGenesets(kegg.results) kegg.results.top <- kegg.results.filtered %>% group_by(cluster) %>% top_n(n=5, -p.val) sampleEnrichmentDotplot(kegg.results.top, selected_genesets = unique(kegg.results.top$description), maxl=50, title="KEGG pathway")
We can use the genesets from the xCell package to help identify the cell types. First we need to extract the genesets from the xCell R package and translate them to entrez ids.
library(devtools) library(GSEABase) # Install the xCell R package devtools::install_github('dviraran/xCell') library(xCell) # Export the xCell genesets as a gmt file xx <- as.list(xCell.data) toGmt(xx$signatures,"xcell.gmt") # Clean up the xCell geneset names xcell <- readGMT("xcell.gmt") names(xcell) <- gsub("%","_",names(xcell)) # Translate the lists to entrez ids xcell_entrez <- list() for(gs in names(xcell)) { symbols <- xcell[[gs]] entrez_ids <- unique(annotation$entrez_id[match(symbols, annotation$gene_name)]) entrez_ids <- entrez_ids[!is.na(entrez_ids)] xcell_entrez[[gs]] <- as.character(entrez_ids) } writeGMT(xcell_entrez, "xcell_entrez.gmt")
Now we can test for enrichment of the xCell genesets in the cluster markers. Note that the gsfisher "runGMT.all()" function can be used to test for enrichments of any genesets in the gene matrix transposed (GMT) format. More details on this format can be found here.
# run the enricment tests. This function can be used to test arbitrary gmt files xcell.results <- runGMT.all(results=pbmc.markers.filtered, species = "hs", background_ids = background_entrez, gene_id_col="entrez_id", gene_id_type="entrez", gmt_file="xcell_entrez.gmt", sample_col="cluster", p_col="p_val_adj", p_threshold=0.05) xcell.results.filtered <- filterGenesets(xcell.results) xcell.results.top <- xcell.results.filtered %>% group_by(cluster) %>% top_n(n=5, -p.val) sampleEnrichmentDotplot(xcell.results.top, selected_genesets = unique(xcell.results.top$geneset_id), selection_col = "geneset_id", maxl=50, title="xCell geneset")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.