In khodosevichlab/CellAnnotatoR: Automated marker-based cell type annotation

# Global knitr chunk options for this vignette: collapse source and output
# into one block and prefix every output line with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

This vignette shows how to annotate the BM+CB dataset from the Conos tutorial across multiple samples.

library(CellAnnotatoR)
library(conos)
library(pagoda2)
library(dplyr)
library(ggplot2)
library(pbapply)
library(cowplot)

# Make the black-and-white ggplot2 theme the default for all following plots.
ggplot2::theme_set(ggplot2::theme_bw())

Pre-processing

Let's load and pre-process the data:

# Read the example panel shipped with conos (a list with one entry per sample).
panel <- readRDS(
  system.file("extdata", "panel.rds", package = "conos")
)

# Run the basic pagoda2 processing on every sample with identical settings.
panel_preprocessed <- lapply(
  panel,
  basicP2proc,
  n.cores = 4,
  min.cells.per.gene = 0,
  n.odgenes = 2e3,
  get.largevis = FALSE,
  make.geneknn = FALSE
)

Now we can integrate it with Conos:

# Create the joint Conos object from the pre-processed samples, build the
# joint graph, detect communities on it and compute a UMAP embedding.
con <- Conos$new(panel_preprocessed, n.cores = 4)
con$buildGraph()
con$findCommunities(method = conos::leiden.community, resolution = 3)
# Use the same core count as the rest of the vignette: the previous hard-coded
# value of 30 oversubscribes machines that have fewer cores.
con$embedGraph(method = "UMAP", min.dist = 1, spread = 2, n.cores = 4)

# Plot the joint embedding. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
con$plotGraph(size = 0.2, shuffle.colors = TRUE)

Prepare data for annotation:

# Locate the marker definition file bundled with CellAnnotatoR.
marker_path <- system.file("extdata", "bm_cb.md", package = "CellAnnotatoR")

# Parse the markers once up front so the file is not re-read for every sample
# inside the lapply below.
markers <- parseMarkerFile(marker_path)

# Build classification data for each sample from its transposed raw counts.
clf_datas <- lapply(con$samples, function(p2) {
  raw_counts_t <- Matrix::t(p2$misc$rawCounts)
  getClassificationData(raw_counts_t, markers)
})

# Marker score information, computed independently for every sample.
score_infos <- lapply(clf_datas, function(cd) getMarkerScoreInfo(cd))

Annotation

Now we can run individual annotation on each dataset:

# Annotate every sample separately on its own PCA graph. The three lists are
# iterated in parallel, so they must share the same sample order.
ann_by_dataset <- pbmapply(
  function(cd, ms, p2) {
    assignCellsByScores(p2$graphs$PCA, score.info = ms, clf.data = cd)
  },
  clf_datas, score_infos, panel_preprocessed,
  # Keep the result as a list; `FALSE` is spelled out because `F` can be
  # reassigned.
  SIMPLIFY = FALSE
) %>%
  setNames(names(clf_datas))

# Concatenate the first-level labels of all samples into one vector,
# once for the raw annotation and once for the filtered one.
all_annotations <- ann_by_dataset %>%
  lapply(function(an) an$annotation$l1) %>%
  Reduce(c, .)

all_annotations_filt <- ann_by_dataset %>%
  lapply(function(an) an$annotation.filt$l1) %>%
  Reduce(c, .)

# Per-sample annotation on the joint embedding: all labels vs. filtered labels.
plot_grid(
  plotlist = lapply(
    list(all_annotations, all_annotations_filt),
    function(grp) con$plotGraph(groups = grp, size = 0.2)
  ),
  labels = c("All", "Filtered")
)

We can see that running annotation on individual samples doesn't necessarily guarantee smooth labeling on the joint graph, as such an approach can't utilize the joint structure. To deal with this, we can run annotation on the whole graph:

# Merge the per-sample score information into a single object. `TRUE` is
# spelled out because `T` can be reassigned.
all_score_info <- mergeScoreInfos(score_infos, verbose = TRUE)

# Annotate all cells at once on the joint Conos graph.
# NOTE(review): clf.data is taken from the first sample only; presumably just
# the sample-independent parts (e.g. the classification hierarchy) are used
# when score.info is supplied. Confirm against assignCellsByScores.
ann_by_level <- assignCellsByScores(
  con$graph,
  score.info = all_score_info,
  clf.data = clf_datas[[1]]
)

# Joint-graph annotation: all labels vs. filtered labels.
plot_grid(
  plotlist = lapply(
    list(ann_by_level$annotation$l1, ann_by_level$annotation.filt$l1),
    function(grp) con$plotGraph(groups = grp, size = 0.2)
  ),
  labels = c("All", "Filtered")
)

To further deal with noise, we can use clustering information:

# Take the Leiden community assignment computed on the joint graph ...
clusters <- con$clusters$leiden$groups

# ... and pass it to the annotation as additional cluster information.
ann_by_level <- assignCellsByScores(
  con$graph,
  score.info = all_score_info,
  clf.data = clf_datas[[1]],
  clusters = clusters
)

# Cluster-aware annotation: all labels vs. filtered labels.
plot_grid(
  plotlist = lapply(
    list(ann_by_level$annotation$l1, ann_by_level$annotation.filt$l1),
    function(grp) con$plotGraph(groups = grp, size = 0.2)
  ),
  labels = c("All", "Filtered")
)

In the current example, the clustering resolution is too low to separate all subpopulations, which leads to a lack of CLP and DC populations. Let's increase the resolution:

# First-level labels from the cluster-aware annotation.
ann <- ann_by_level$annotation$l1

# Ids of the clusters containing at least one cell labelled "Progenitors" or
# "Plasma"; these are the clusters to split further.
target_clusters <- clusters[names(ann)[ann %in% c("Progenitors", "Plasma")]] %>%
  as.character() %>%
  unique()

# Sub-cluster only the selected clusters and plot the refined clustering.
clusters_inc <- findSubcommunities(con, target_clusters, groups = clusters, resolution = 1)
# `TRUE` is spelled out because `T` can be reassigned.
con$plotGraph(groups = clusters_inc, size = 0.2, shuffle.colors = TRUE)

And now we can re-run annotation:

# Repeat the joint annotation, this time with the refined clustering.
ann_by_level <- assignCellsByScores(
  con$graph,
  score.info = all_score_info,
  clf.data = clf_datas[[1]],
  clusters = clusters_inc
)

# Annotation with refined clusters: all labels vs. filtered labels.
plot_grid(
  plotlist = lapply(
    list(ann_by_level$annotation$l1, ann_by_level$annotation.filt$l1),
    function(grp) con$plotGraph(groups = grp, size = 0.2)
  ),
  labels = c("All", "Filtered")
)