knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width=7, fig.height=5 )
We present here some usage case of Cell-ID, a clustering-free multivariate statistical method for the robust extraction of per-cell gene signatures from single-cell RNA-seq. Cell-ID allows unbiased cell identity recognition across different donors, tissues-of-origin, model organisms and single-cell omics protocols.
In order to install CellID, the user will have to set the repositories option to enable downloading Bioconductor dependencies.
if(!require("tidyverse")) install.packages("tidyverse") if(!require("ggpubr")) install.packages("ggpubr") if(!require("devtools")) install.packages("devtools") setRepositories(ind = c(1, 2, 3, 4)) devtools::install_github("CellID")
library(CellID) library(tidyverse) # general purpose library for data handling library(ggpubr) #library for plotting
To illustrate CellID usage we will use two different pancreatic data sets from Baron and Segerstolpe.
#download data download.file(url = "https://storage.googleapis.com/cellid-cbl/BaronMatrix.rds",destfile = "BaronMatrix.rds") download.file(url = "https://storage.googleapis.com/cellid-cbl/BaronMetaData.rds",destfile = "BaronMetaData.rds") download.file(url = "https://storage.googleapis.com/cellid-cbl/SegerstolpeMatrix.rds",destfile = "SegerstolpeMatrix.rds") download.file(url = "https://storage.googleapis.com/cellid-cbl/SegerstolpeMetaData2.rds",destfile = "SegerstolpeMetaData.rds") #read data BaronMatrix <- readRDS("BaronMatrix.rds") BaronMetaData <- readRDS("BaronMetaData.rds") SegerMatrix <- readRDS("SegerstolpeMatrix.rds") SegerMetaData <- readRDS("SegerstolpeMetaData.rds")
CellID can be either used on Seurat or SingleCellExperiment object, here before creating the Seurat object we filter the matrix genes with protein coding genes.
# Remove non protein coding genes BaronMatrixProt <- BaronMatrix[rownames(BaronMatrix) %in% HgProteinCodingGenes,] SegerMatrixProt <- SegerMatrix[rownames(SegerMatrix) %in% HgProteinCodingGenes,] # Create Seurat object and remove remove low detection genes BaronSeurat <- CreateSeuratObject(counts = BaronMatrixProt, project = "Baron", min.cells = 5, meta.data = BaronMetaData) SegerSeurat <- CreateSeuratObject(counts = SegerMatrixProt, project = "Segerstolpe", min.cells = 5, meta.data = SegerMetaData)
Before running CellID we perform standard seurat workflow as desribed here.
BaronSeurat <- NormalizeData(BaronSeurat) BaronSeurat <- ScaleData(BaronSeurat, features = rownames(BaronSeurat)) BaronSeurat <- RunPCA(BaronSeurat, features = rownames(BaronSeurat)) BaronSeurat <- RunUMAP(BaronSeurat, dims = 1:30) BaronSeurat <- RunTSNE(BaronSeurat, dims = 1:30) PCA <- DimPlot(BaronSeurat, reduction = "pca", group.by = "cell.type") + ggtitle("PCA") + theme(legend.text = element_text(size =10), aspect.ratio = 1) tSNE <- DimPlot(BaronSeurat, reduction = "tsne", group.by = "cell.type")+ ggtitle("tSNE") + theme(legend.text = element_text(size =10), aspect.ratio = 1) UMAP <- DimPlot(BaronSeurat, reduction = "umap", group.by = "cell.type") + ggtitle("UMAP") + theme(legend.text = element_text(size =10), aspect.ratio = 1) ggarrange(tSNE, UMAP, common.legend = T, legend = "top")
CellID is based on Multiple Correspondence Analysis (MCA), a dimensionality reduction methods that allows the representation of both cells and genes in a single euclidean space.
Running MCA enables to compute afterwards the distance betwwen genes and cells to obtain a ranking of gene specificity for each cells.
MCA can be Run like other dimensionality reduction methods in Seurat by using the command RunMCA
.
Using the DimPlotMC
it is also possible to plot both cells and genes in the same space.
BaronSeurat <- RunMCA(BaronSeurat) MCA <- DimPlot(BaronSeurat, reduction = "mca", group.by = "cell.type") + ggtitle("MCA") + theme(legend.text = element_text(size =10), aspect.ratio = 1) ggarrange(PCA, MCA, common.legend = T, legend = "top") DimPlotMC(BaronSeurat, reduction = "mca", group.by = "cell.type", features = c("CTRL", "INS", "MYZAP", "CDH11"), as.text = T) + ggtitle("MCA with some key gene markers")
After running MCA, the user can perform enrichment analysis on each cells in the data by performing Hypergeometric test. This enables using pre-established cell types gene sets in the litterature to predict cell types in the data. Here we will use the panglao database curated signature database to predict the cell types in the Baron data. We will use two different collection signatures: one with only known pancreatic cell types and one with all cell types.
#download signatures from panglaoDB panglao <- read_tsv("https://panglaodb.se/markers/PanglaoDB_markers_27_Mar_2020.tsv.gz") #filter to get pancreas specific genes panglao_pancreas <- panglao %>% filter(organ == "Pancreas") #filter to get human specific genes panglao_pancreas <- panglao_pancreas %>% filter(str_detect(species,"Hs")) # convert dataframes to a list of vectors which is the format for CellID input panglao_pancreas <- panglao_pancreas %>% group_by(`cell type`) %>% summarise(geneset = list(`official gene symbol`)) pancreas_gs <- setNames(panglao_pancreas$geneset, panglao_pancreas$`cell type`)
#filter to get human specific genes panglao_all <- panglao %>% filter(str_detect(species,"Hs")) # convert dataframes to a list of named vectors which is the format for CellID input panglao_all <- panglao_all %>% group_by(`cell type`) %>% summarise(geneset = list(`official gene symbol`)) all_gs <- setNames(panglao_all$geneset, panglao_all$`cell type`) #remove very short signatures all_gs <- all_gs[sapply(all_gs, length) >= 10]
After downloading and formatting the signatures, we perform the Hypergeometric test to test each cells against each signatures. By assigning the most highly enriched signature to each cells, it is possible to determine the cell type composition of the dataset. The results quality is highly dependant on the quality of the signatures.
#perform hypergeometric test HGT_pancreas_gs <- RunCellHGT(BaronSeurat, pathways = pancreas_gs, dims = 1:50) #for each cells detect the maximum enrichment pancreas_gs_prediction <- rownames(HGT_pancreas_gs)[apply(HGT_pancreas_gs, 2, which.max)] #test if maximum enrichment is significant pancreas_gs_prediction_signif <- ifelse(apply(HGT_pancreas_gs, 2, max)>2, yes = pancreas_gs_prediction, "unassigned") #put predictions in Seurat metadata BaronSeurat$pancreas_gs_prediction <- pancreas_gs_prediction_signif #compare original labels with predictions color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey") ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "unassigned")) OriginalPlot <- DimPlot(BaronSeurat, reduction = "tsne", group.by = "cell.type") + scale_color_manual(values = ggcolor) + theme(legend.text = element_text(size =10), aspect.ratio = 1) Predplot1 <- DimPlot(BaronSeurat, reduction = "tsne", group.by = "pancreas_gs_prediction") + scale_color_manual(values = ggcolor) + theme(legend.text = element_text(size =10), aspect.ratio = 1) ggarrange(OriginalPlot, Predplot1, legend = "top",common.legend = T)
HGT_all_gs <- RunCellHGT(BaronSeurat, pathways = all_gs, dims = 1:50) all_gs_prediction <- rownames(HGT_all_gs)[apply(HGT_all_gs, 2, which.max)] all_gs_prediction_signif <- ifelse(apply(HGT_all_gs, 2, max)>2, yes = all_gs_prediction, "unassigned") BaronSeurat$all_gs_prediction <- ifelse(all_gs_prediction_signif %in% c(names(pancreas_gs), "Schwann cells", "Endothelial cells", "Macrophages", "Mast cells", "T cells","Fibroblasts", "unassigned"), all_gs_prediction_signif,"other") color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "#D575FE", "#F962DD", "grey", "black") ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "Fibroblasts", "Schwann cells", "unassigned", "other")) BaronSeurat$pancreas_gs_prediction <- factor(BaronSeurat$pancreas_gs_prediction,c(sort(unique(BaronSeurat$cell.type)), "Fibroblasts", "Schwann cells", "unassigned", "other")) PredPlot2 <- DimPlot(BaronSeurat, group.by = "all_gs_prediction", reduction = "tsne") + scale_color_manual(values = ggcolor, drop =FALSE) + theme(legend.text = element_text(size =10), aspect.ratio = 1) ggarrange(OriginalPlot, PredPlot2, legend = "top",common.legend = T)
CellID allows signatures extraction from data set both at group level and cell level. These signatures can later be used as reference gene sets. Here we use Baron as a Reference dataset to extract both cell signatures and cell type signatures, and we use the Segerstolpe dataset as a query dataset to prdict the cell type composition.
# extract gene signatures of baron cells Baron_cell_gs <- GetCellGeneSet(BaronSeurat, dims = 1:50, n.features = 200) # extract gene signatures of baron cell types Baron_group_gs <- GetGroupGeneSet(BaronSeurat, dims = 1:50, n.features = 200, group.by = "cell.type")
Again, here we run the basic seurat protocole and MCA.
SegerSeurat <- NormalizeData(SegerSeurat) SegerSeurat <- FindVariableFeatures(SegerSeurat) SegerSeurat <- ScaleData(SegerSeurat) SegerSeurat <- RunPCA(SegerSeurat) SegerSeurat <- RunMCA(SegerSeurat, nmcs = 50) SegerSeurat <- RunUMAP(SegerSeurat, dims = 1:30) SegerSeurat <- RunTSNE(SegerSeurat, dims = 1:30) tSNE <- DimPlot(SegerSeurat, reduction = "tsne", group.by = "cell.type", pt.size = 0.1) + ggtitle("tSNE") + theme(aspect.ratio = 1) UMAP <- DimPlot(SegerSeurat, reduction = "umap", group.by = "cell.type", pt.size = 0.1) + ggtitle("UMAP") + theme(aspect.ratio = 1) ggarrange(tSNE, UMAP, common.legend = T, legend = "top")
Here we perform Hypergeometric test on Segerstolpe dataset using Baron cell type signatures. As we did for the pre-established signatures, we choose the signature with the highest enrichment for each cells to assign cell types to the query dataset.
HGT_baron_group_gs <- RunCellHGT(SegerSeurat, pathways = Baron_group_gs, dims = 1:50) baron_group_gs_prediction <- rownames(HGT_baron_group_gs)[apply(HGT_baron_group_gs, 2, which.max)] baron_group_gs_prediction_signif <- ifelse(apply(HGT_baron_group_gs, 2, max)>2, yes = baron_group_gs_prediction, "unassigned") SegerSeurat$baron_group_gs_prediction <- baron_group_gs_prediction_signif color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey") ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "unassigned")) ggPredictions <- DimPlot(SegerSeurat, group.by = "baron_group_gs_prediction", pt.size = 0.2, reduction = "tsne") + ggtitle("Predicitons") + scale_color_manual(values = ggcolor, drop =FALSE) + theme(legend.text = element_text(size =10), aspect.ratio = 1) ggOriginal <- DimPlot(SegerSeurat, group.by = "cell.type", pt.size = 0.2, reduction = "tsne") + ggtitle("Original") + scale_color_manual(values = ggcolor) + theme(legend.text = element_text(size =10), aspect.ratio = 1) ggarrange(ggOriginal, ggPredictions, legend = "top", common.legend = T)
Alternatively we can also use Baron individual cell signatures to match one Baron data cells to a Segerstolpe data cells and the transfer the label to predict cell types in Segerstolpe.
HGT_baron_cell_gs <- RunCellHGT(SegerSeurat, pathways = Baron_cell_gs, dims = 1:50) baron_cell_gs_match <- rownames(HGT_baron_cell_gs)[apply(HGT_baron_cell_gs, 2, which.max)] baron_cell_gs_prediction <- BaronSeurat$cell.type[baron_cell_gs_match] baron_cell_gs_prediction_signif <- ifelse(apply(HGT_baron_cell_gs, 2, max)>2, yes = baron_cell_gs_prediction, "unassigned") SegerSeurat$baron_cell_gs_prediction <- baron_cell_gs_prediction_signif color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey") ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "unassigned")) ggPredictionsCellMatch <- DimPlot(SegerSeurat, group.by = "baron_cell_gs_prediction", pt.size = 0.2, reduction = "tsne") + ggtitle("Predicitons") + scale_color_manual(values = ggcolor, drop =FALSE) + theme(legend.text = element_text(size =10), aspect.ratio = 1) ggOriginal <- DimPlot(SegerSeurat, group.by = "cell.type", pt.size = 0.2, reduction = "tsne") + ggtitle("Original") + scale_color_manual(values = ggcolor) + theme(legend.text = element_text(size =10), aspect.ratio = 1) ggarrange(ggOriginal, ggPredictionsCellMatch, legend = "top", common.legend = T)
Once MCA is performed, any type of signatures can be tested and that includes functionnal pathways. Here we use the Hallmark and KEGG pathways in HyperGeometric test and integrate the results into the Seurat object to visualise the -log10 pvalue of the enrichment into an UMAP.
KEGG <- fgsea::gmtPathways("https://amp.pharm.mssm.edu/Enrichr/geneSetLibrary?mode=text&libraryName=KEGG_2019_Human") HGT_Hallmark <- RunCellHGT(SegerSeurat, pathways = Hallmark, dims = 1:50) HGT_KEGG <- RunCellHGT(SegerSeurat, pathways = KEGG, dims = 1:50) SegerSeurat@assays[["Hallmark"]] <- CreateAssayObject(HGT_Hallmark) SegerSeurat@assays[["KEGG"]] <- CreateAssayObject(HGT_KEGG) ggG2Mcell <- FeaturePlot(SegerSeurat, "G2M-CHECKPOINT", order = T, reduction = "tsne", min.cutoff = 2) + theme(legend.text = element_text(size =10), aspect.ratio = 1) ggPancSecr <- FeaturePlot(SegerSeurat, "Pancreatic secretion", order = T, reduction = "tsne", min.cutoff = 2) + theme(legend.text = element_text(size =10), aspect.ratio = 1) ggarrange(ggG2Mcell, ggPancSecr)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.