knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width=7, fig.height=5
)

Introduction

We present here some usage case of Cell-ID, a clustering-free multivariate statistical method for the robust extraction of per-cell gene signatures from single-cell RNA-seq. Cell-ID allows unbiased cell identity recognition across different donors, tissues-of-origin, model organisms and single-cell omics protocols.

Installation

In order to install CellID, the user will have to set the repositories option to enable downloading Bioconductor dependencies.

if(!require("tidyverse")) install.packages("tidyverse")
if(!require("ggpubr")) install.packages("ggpubr")
if(!require("devtools")) install.packages("devtools")
setRepositories(ind = c(1, 2, 3, 4))
devtools::install_github("CellID")

Load library

library(CellID)
library(tidyverse) # general purpose library for data handling
library(ggpubr) #library for plotting

Download data

To illustrate CellID usage we will use two different pancreatic data sets from Baron and Segerstolpe.

#download data
download.file(url = "https://storage.googleapis.com/cellid-cbl/BaronMatrix.rds",destfile = "BaronMatrix.rds")
download.file(url = "https://storage.googleapis.com/cellid-cbl/BaronMetaData.rds",destfile = "BaronMetaData.rds")
download.file(url = "https://storage.googleapis.com/cellid-cbl/SegerstolpeMatrix.rds",destfile = "SegerstolpeMatrix.rds")
download.file(url = "https://storage.googleapis.com/cellid-cbl/SegerstolpeMetaData2.rds",destfile = "SegerstolpeMetaData.rds")


#read data
BaronMatrix <- readRDS("BaronMatrix.rds")
BaronMetaData <- readRDS("BaronMetaData.rds")
SegerMatrix <- readRDS("SegerstolpeMatrix.rds")
SegerMetaData <- readRDS("SegerstolpeMetaData.rds")

Preprocessing and Create Seurat Object

CellID can be either used on Seurat or SingleCellExperiment object, here before creating the Seurat object we filter the matrix genes with protein coding genes.

# Remove non protein coding genes
BaronMatrixProt <- BaronMatrix[rownames(BaronMatrix) %in% HgProteinCodingGenes,]
SegerMatrixProt <- SegerMatrix[rownames(SegerMatrix) %in% HgProteinCodingGenes,]

# Create Seurat object  and remove remove low detection genes
BaronSeurat <- CreateSeuratObject(counts = BaronMatrixProt, project = "Baron", min.cells = 5, meta.data = BaronMetaData)
SegerSeurat <- CreateSeuratObject(counts = SegerMatrixProt, project = "Segerstolpe", min.cells = 5, meta.data = SegerMetaData)

Running Standard Seurat Protocole

Before running CellID we perform standard seurat workflow as desribed here.

BaronSeurat <- NormalizeData(BaronSeurat)
BaronSeurat <- ScaleData(BaronSeurat, features = rownames(BaronSeurat))
BaronSeurat <- RunPCA(BaronSeurat, features = rownames(BaronSeurat))
BaronSeurat <- RunUMAP(BaronSeurat, dims = 1:30)
BaronSeurat <- RunTSNE(BaronSeurat, dims = 1:30)
PCA <- DimPlot(BaronSeurat, reduction = "pca", group.by = "cell.type")  + ggtitle("PCA") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
tSNE <- DimPlot(BaronSeurat, reduction = "tsne", group.by = "cell.type")+ ggtitle("tSNE") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
UMAP <- DimPlot(BaronSeurat, reduction = "umap", group.by = "cell.type") + ggtitle("UMAP") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(tSNE, UMAP, common.legend = T, legend = "top")

Running MCA

CellID is based on Multiple Correspondence Analysis (MCA), a dimensionality reduction methods that allows the representation of both cells and genes in a single euclidean space. Running MCA enables to compute afterwards the distance betwwen genes and cells to obtain a ranking of gene specificity for each cells. MCA can be Run like other dimensionality reduction methods in Seurat by using the command RunMCA. Using the DimPlotMC it is also possible to plot both cells and genes in the same space.

BaronSeurat <- RunMCA(BaronSeurat)
MCA <- DimPlot(BaronSeurat, reduction = "mca", group.by = "cell.type")  + ggtitle("MCA") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(PCA, MCA, common.legend = T, legend = "top")
DimPlotMC(BaronSeurat, reduction = "mca", group.by = "cell.type", features = c("CTRL", "INS", "MYZAP", "CDH11"), as.text = T) + ggtitle("MCA with some key gene markers")

Using pre-established gene sets

After running MCA, the user can perform enrichment analysis on each cells in the data by performing Hypergeometric test. This enables using pre-established cell types gene sets in the litterature to predict cell types in the data. Here we will use the panglao database curated signature database to predict the cell types in the Baron data. We will use two different collection signatures: one with only known pancreatic cell types and one with all cell types.

Pancreatic cell type gene sets

#download signatures from panglaoDB
panglao <- read_tsv("https://panglaodb.se/markers/PanglaoDB_markers_27_Mar_2020.tsv.gz")

#filter to get pancreas specific genes
panglao_pancreas <- panglao %>% filter(organ == "Pancreas")

#filter to get human specific genes
panglao_pancreas <- panglao_pancreas %>%  filter(str_detect(species,"Hs"))

# convert dataframes to a list of vectors which is the format for CellID input
panglao_pancreas <- panglao_pancreas %>%  group_by(`cell type`) %>%  summarise(geneset = list(`official gene symbol`))
pancreas_gs <- setNames(panglao_pancreas$geneset, panglao_pancreas$`cell type`)

All cell type gene sets

#filter to get human specific genes
panglao_all <- panglao %>%  filter(str_detect(species,"Hs"))

# convert dataframes to a list of named vectors which is the format for CellID input
panglao_all <- panglao_all %>%  group_by(`cell type`) %>%  summarise(geneset = list(`official gene symbol`))
all_gs <- setNames(panglao_all$geneset, panglao_all$`cell type`)

#remove very short signatures
all_gs <- all_gs[sapply(all_gs, length) >= 10]

Cell type predictions using preestablished gene sets

After downloading and formatting the signatures, we perform the Hypergeometric test to test each cells against each signatures. By assigning the most highly enriched signature to each cells, it is possible to determine the cell type composition of the dataset. The results quality is highly dependant on the quality of the signatures.

#perform hypergeometric test
HGT_pancreas_gs <- RunCellHGT(BaronSeurat, pathways = pancreas_gs, dims = 1:50)

#for each cells detect the maximum enrichment
pancreas_gs_prediction <- rownames(HGT_pancreas_gs)[apply(HGT_pancreas_gs, 2, which.max)]

#test if maximum enrichment is significant
pancreas_gs_prediction_signif <- ifelse(apply(HGT_pancreas_gs, 2, max)>2, yes = pancreas_gs_prediction, "unassigned")

#put predictions in Seurat metadata
BaronSeurat$pancreas_gs_prediction <- pancreas_gs_prediction_signif

#compare original labels with predictions
color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey")
ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "unassigned"))
OriginalPlot <- DimPlot(BaronSeurat, reduction = "tsne", group.by = "cell.type") + 
  scale_color_manual(values = ggcolor) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
Predplot1 <- DimPlot(BaronSeurat, reduction = "tsne", group.by = "pancreas_gs_prediction") + 
  scale_color_manual(values = ggcolor) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(OriginalPlot, Predplot1, legend = "top",common.legend = T)
HGT_all_gs <- RunCellHGT(BaronSeurat, pathways = all_gs, dims = 1:50)
all_gs_prediction <- rownames(HGT_all_gs)[apply(HGT_all_gs, 2, which.max)]
all_gs_prediction_signif <- ifelse(apply(HGT_all_gs, 2, max)>2, yes = all_gs_prediction, "unassigned")
BaronSeurat$all_gs_prediction <- ifelse(all_gs_prediction_signif %in% c(names(pancreas_gs), "Schwann cells", "Endothelial cells", "Macrophages", "Mast cells", "T cells","Fibroblasts", "unassigned"), all_gs_prediction_signif,"other")

color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "#D575FE", "#F962DD", "grey", "black")
ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "Fibroblasts", "Schwann cells", "unassigned", "other"))
BaronSeurat$pancreas_gs_prediction <- factor(BaronSeurat$pancreas_gs_prediction,c(sort(unique(BaronSeurat$cell.type)), "Fibroblasts", "Schwann cells", "unassigned", "other"))
PredPlot2 <- DimPlot(BaronSeurat, group.by = "all_gs_prediction", reduction = "tsne") + 
  scale_color_manual(values = ggcolor, drop =FALSE) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(OriginalPlot, PredPlot2, legend = "top",common.legend = T)

Gene Set Extraction from Reference dataset

CellID allows signatures extraction from data set both at group level and cell level. These signatures can later be used as reference gene sets. Here we use Baron as a Reference dataset to extract both cell signatures and cell type signatures, and we use the Segerstolpe dataset as a query dataset to prdict the cell type composition.

# extract gene signatures of baron cells
Baron_cell_gs <- GetCellGeneSet(BaronSeurat, dims = 1:50, n.features = 200)

# extract gene signatures of baron cell types
Baron_group_gs <- GetGroupGeneSet(BaronSeurat, dims = 1:50, n.features = 200, group.by = "cell.type")

Preprocessing query dataset

Again, here we run the basic seurat protocole and MCA.

SegerSeurat <- NormalizeData(SegerSeurat)
SegerSeurat <- FindVariableFeatures(SegerSeurat)
SegerSeurat <- ScaleData(SegerSeurat)
SegerSeurat <- RunPCA(SegerSeurat)
SegerSeurat <- RunMCA(SegerSeurat, nmcs = 50)
SegerSeurat <- RunUMAP(SegerSeurat, dims = 1:30)
SegerSeurat <- RunTSNE(SegerSeurat, dims = 1:30)
tSNE <- DimPlot(SegerSeurat, reduction = "tsne", group.by = "cell.type", pt.size = 0.1) + ggtitle("tSNE") + theme(aspect.ratio = 1)
UMAP <- DimPlot(SegerSeurat, reduction = "umap", group.by = "cell.type", pt.size = 0.1) + ggtitle("UMAP") + theme(aspect.ratio = 1)
ggarrange(tSNE, UMAP, common.legend = T, legend = "top")

Predictions using cell type gene sets

Here we perform Hypergeometric test on Segerstolpe dataset using Baron cell type signatures. As we did for the pre-established signatures, we choose the signature with the highest enrichment for each cells to assign cell types to the query dataset.

HGT_baron_group_gs <- RunCellHGT(SegerSeurat, pathways = Baron_group_gs, dims = 1:50)
baron_group_gs_prediction <- rownames(HGT_baron_group_gs)[apply(HGT_baron_group_gs, 2, which.max)]
baron_group_gs_prediction_signif <- ifelse(apply(HGT_baron_group_gs, 2, max)>2, yes = baron_group_gs_prediction, "unassigned")
SegerSeurat$baron_group_gs_prediction <- baron_group_gs_prediction_signif

color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey")
ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "unassigned"))

ggPredictions <- DimPlot(SegerSeurat, group.by = "baron_group_gs_prediction", pt.size = 0.2, reduction = "tsne") + ggtitle("Predicitons") + scale_color_manual(values = ggcolor, drop =FALSE) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggOriginal <- DimPlot(SegerSeurat, group.by = "cell.type", pt.size = 0.2, reduction = "tsne") + ggtitle("Original") + scale_color_manual(values = ggcolor) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(ggOriginal, ggPredictions, legend = "top", common.legend = T)

Predictions using cell matching

Alternatively we can also use Baron individual cell signatures to match one Baron data cells to a Segerstolpe data cells and the transfer the label to predict cell types in Segerstolpe.

HGT_baron_cell_gs <- RunCellHGT(SegerSeurat, pathways = Baron_cell_gs, dims = 1:50)
baron_cell_gs_match <- rownames(HGT_baron_cell_gs)[apply(HGT_baron_cell_gs, 2, which.max)]
baron_cell_gs_prediction <- BaronSeurat$cell.type[baron_cell_gs_match]
baron_cell_gs_prediction_signif <- ifelse(apply(HGT_baron_cell_gs, 2, max)>2, yes = baron_cell_gs_prediction, "unassigned")
SegerSeurat$baron_cell_gs_prediction <- baron_cell_gs_prediction_signif

color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey")
ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "unassigned"))

ggPredictionsCellMatch <- DimPlot(SegerSeurat, group.by = "baron_cell_gs_prediction", pt.size = 0.2, reduction = "tsne") + ggtitle("Predicitons") + scale_color_manual(values = ggcolor, drop =FALSE) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggOriginal <- DimPlot(SegerSeurat, group.by = "cell.type", pt.size = 0.2, reduction = "tsne") + ggtitle("Original") + scale_color_manual(values = ggcolor) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(ggOriginal, ggPredictionsCellMatch, legend = "top", common.legend = T)

Functionnal Enrichment

Once MCA is performed, any type of signatures can be tested and that includes functionnal pathways. Here we use the Hallmark and KEGG pathways in HyperGeometric test and integrate the results into the Seurat object to visualise the -log10 pvalue of the enrichment into an UMAP.

KEGG <- fgsea::gmtPathways("https://amp.pharm.mssm.edu/Enrichr/geneSetLibrary?mode=text&libraryName=KEGG_2019_Human")
HGT_Hallmark <- RunCellHGT(SegerSeurat, pathways = Hallmark, dims = 1:50)
HGT_KEGG <- RunCellHGT(SegerSeurat, pathways = KEGG, dims = 1:50)
SegerSeurat@assays[["Hallmark"]] <- CreateAssayObject(HGT_Hallmark)
SegerSeurat@assays[["KEGG"]] <- CreateAssayObject(HGT_KEGG)
ggG2Mcell <- FeaturePlot(SegerSeurat, "G2M-CHECKPOINT", order = T, reduction = "tsne", min.cutoff = 2) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggPancSecr <- FeaturePlot(SegerSeurat, "Pancreatic secretion", order = T, reduction = "tsne", min.cutoff = 2) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(ggG2Mcell, ggPancSecr)


cbl-imagine/cellID documentation built on July 24, 2020, 9:35 p.m.