options(tinytex.verbose = TRUE) knitr::opts_chunk$set( cache = TRUE, cache.lazy = FALSE, tidy = TRUE )
# we would like to demonstrate the hybrid strategy using Seurat workflow library(Seurat) library(cowplot) # packages for cell hashing demultiplexing library(HTOreader) library(cellhashR) # this package integrated the following methods: multiseq, htodemux, gmm_demux, bff_raw, bff_cluster and dropletutils
We load 8pool-CA dataset from the 10X folder using Seurat
# Load in the RNA UMI matrix for doublet negative (DN) dataset and antigen specific (AS) dataset DN.data <- Read10X(data.dir = "../data/8pool-Carr-RNA/filtered_feature_bc_matrix/")
split different modalities: Cell hashing (HTO), antigen probes (Probe) and CITE-seq surface protein panel (ADT) In 8pool DN and AS datasets, we have applied 8 cell hashing tags, 18 antigen probes, and a large CITE-seq panel
DN.probe <- DN.data[["Antibody Capture"]][1:18,] # expression of antigen probes, Carrier data set doesn't have this measurement DN.hto <- DN.data[["Antibody Capture"]][19:26,] # expression of cell hashing tags DN.ADT <- DN.data[["Antibody Capture"]][27:163,] # expression of CITE-seq surface protein panel
# create Seurat object DN <- CreateSeuratObject(counts = DN.data$`Gene Expression`, project = "Carr") # create multi-modality assay and attach them to Seurat object DN[['HTO']] <- CreateAssayObject(counts = DN.hto) DN[['ADT']] <- CreateAssayObject(counts = DN.ADT) # percent of MT genes DN[["percent.mt"]] <- PercentageFeatureSet(object = DN, pattern = "^MT-") # percent of IG genes (for B cell study) DN[["percent.ig"]] <- PercentageFeatureSet(object = DN, pattern = "^IG[HKL]")
Filter out unwanted cells (optional)
DN <- subset(DN, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 10)
Normalize HTO and ADT data
DN <- NormalizeData(DN, assay = "HTO", normalization.method = "CLR") DN <- NormalizeData(DN, assay = "ADT", normalization.method = "CLR")
load data (we hide the plots of this step)
# parse cell hashing output, providing a barcode whitelist. (please refer to cellhashR tutorial for more information: https://github.com/BimberLab/cellhashR?tab=readme-ov-file#example) barcodeData <- ProcessCountMatrix(rawCountData = '../data/8pool-Carr-RNA/filtered_feature_bc_matrix/', barcodeWhitelist = c("hashtag1","hashtag2","hashtag3","hashtag4","hashtag5","hashtag6","hashtag7","hashtag8"), datatypeName = 'Antibody Capture') rownames(barcodeData) <- c("S282","S283","S284","S344","S397","S417","S421","S423")
QC (we hide the plots of this step)
# Create QC plots of barcode normalization PlotNormalizationQC(barcodeData)
Calling cell hashing using different methods (we hide the plots of this step)
# Generate the final cell hashing calls calls <- GenerateCellHashingCalls(barcodeMatrix = barcodeData, methods = c('multiseq', 'gmm_demux','bff_raw','bff_cluster','dropletutils'))
# Inspect negative cells: SummarizeCellsByClassification(calls = calls, barcodeMatrix = barcodeData)
DN <- NormalizeData(DN, assay = "HTO", normalization.method = "CLR") DN <- HTODemux(DN, assay = "HTO", positive.quantile = 0.99)
DN <- HTOClassification(DN, assay = 'HTO', method = 'log') plots <- PlotHTO(DN, assay = 'HTO', method = 'log') plot_grid(plotlist = plots, nrow = 2)
rownames(calls) <- calls$cellbarcode DN_barcode <- names(DN@active.ident) DN_barcode <- sub('-1','',DN_barcode) DN_hq_barcode <- intersect(DN_barcode, calls$cellbarcode) calls_hq <- calls[DN_hq_barcode,] HTOid.hq <- DN$HTOid names(HTOid.hq) <- sub('-1','',names(HTOid.hq)) calls_hq$HTOid <- HTOid.hq[rownames(calls_hq)] Seuratid.hq <- DN$hash.ID names(Seuratid.hq) <- sub('-1','',names(Seuratid.hq)) calls_hq$seurat <- as.character(Seuratid.hq[rownames(calls_hq)])
res_table <- matrix(data = c(table(calls_hq$multiseq), table(calls_hq$gmm_demux), table(calls_hq$bff_raw), table(calls_hq$bff_cluster), table(calls_hq$dropletutils), table(calls_hq$HTOid), table(calls_hq$seurat)), nrow = 7, ncol = 10, byrow = TRUE,dimnames = list(c('multiseq', 'gmm_demux','bff_raw','bff_cluster', 'dropletutils','HTOreader','htodemux(Seurat)'), names(table(calls_hq$multiseq)))) # singlet rate singlet_count <- rowSums(res_table[,which(colnames(res_table) %in% rownames(barcodeData))]) singlet_rate <- singlet_count / rowSums(res_table) final_table <- cbind(res_table, singlet_count, singlet_rate) colnames(final_table) <- c(colnames(res_table), 'singlet','singlet rate')
Cell hashing Demultiplexing results:
final_table
We found two BFF models failed to identify S282. The best singlet rate is 83.9%, average rate is ~80% (exlcuding BFF models due to failure of S282)
In order to apply hybrid demultiplexing, we need to add SNP-based labels into Seurat object:
# add soupercell labels into Seurat object souporcell.res <- read.csv('./8pool/clusters.tsv', sep = '\t') souporcell.res$identity <- souporcell.res$status souporcell.res$identity[which(souporcell.res$status == 'singlet')] <- paste0(souporcell.res$status[which(souporcell.res$status == 'singlet')], souporcell.res$assignment[which(souporcell.res$status == 'singlet')]) rownames(souporcell.res) <- souporcell.res$barcode DN$souporcell <- souporcell.res[names(DN@active.ident), 'identity']
Then we reveal cell labels using hybrid demultiplexing ("HybridDemultiplexing" function in HTOreader package). In this case, we use HTOreader result for cell hashing demultiplexing
DN <- HybridDemultiplexing(object = DN, cellhashing_label = 'HTOid', genotype_label = 'souporcell', hto_names = c("S282","S283","S284","S344","S397","S417","S421","S423"))
table(DN$hybridID)
BTW, The Hybrid strategy is highly flexiable to cell hashing demultiplexing method and quality, it works with all cell hashing methods. For example:
# Hybrid demultiplexing using HTOdemux (Seurat) DN <- HybridDemultiplexing(object = DN, cellhashing_label = 'hash.ID', genotype_label = 'souporcell', hto_names = c("S282","S283","S284","S344","S397","S417","S421","S423"))
table(DN$hybridID)
Furthermore, The Hybrid strategy even works with two BFF models that fail to identify S282. HybridDemultiplexing correctly linked singlet6 with S282.
rownames(calls) <- paste0(calls$cellbarcode, '-1') DN$bff_cluster <- calls[names(DN@active.ident), 'bff_cluster'] DN <- HybridDemultiplexing(object = DN, cellhashing_label = 'bff_cluster', genotype_label = 'souporcell', hto_names = c("S282","S283","S284","S344","S397","S417","S421","S423"))
We observe that results from hybrid runs using various cell hashing methods may exhibit minor variations. This is due to our adoption of a conservative strategy, which categorizes all Case 2/3 inconsistency as unassigned to guarantee the accuracy and precision of the outcomes. For detailed information, please refer to the Methods section of our paper.
table(DN$hybridID)
In this case, we have 8pool-AS, which shares the same group of human donors but doesn't have cell hashing for some reason. Using Hybrid demultiplexing, we're still able to handle it. We load the 8pool-AS data first:
AS.data <- Read10X(data.dir = "../data/8pool-AS-RNA/filtered_feature_bc_matrix/") AS.probe <- AS.data[["Antibody Capture"]][1:18,] # expression of antigen probes AS.hto <- AS.data[["Antibody Capture"]][19:26,] # expression of cell hashing tags, Antigen specific data set doesn't have this measurement, they have 0 counts AS.ADT <- AS.data[["Antibody Capture"]][27:163,] # expression of CITE-seq surface protein panel AS <- CreateSeuratObject(counts = AS.data$`Gene Expression`, project = "AS") AS[['ADT']] <- CreateAssayObject(counts = AS.ADT) AS[['Probe']] <- CreateAssayObject(counts = AS.probe) AS[["percent.mt"]] <- PercentageFeatureSet(object = AS, pattern = "^MT-") AS[["percent.ig"]] <- PercentageFeatureSet(object = AS, pattern = "^IG[HKL]") AS <- subset(AS, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 10) AS <- NormalizeData(AS, assay = "Probe", normalization.method = "CLR") AS <- NormalizeData(AS, assay = "ADT", normalization.method = "CLR")
We ran souporcell on the entire 8pool dataset (AS + DN) so that the genotypes are across two datasets. We annotate souporcell genotype clusters to 8pool-AS dataset
souporcell.res <- read.csv('./8pool/clusters.tsv', sep = '\t') souporcell.res$identity <- souporcell.res$status souporcell.res$identity[which(souporcell.res$status == 'singlet')] <- paste0(souporcell.res$status[which(souporcell.res$status == 'singlet')], souporcell.res$assignment[which(souporcell.res$status == 'singlet')]) rownames(souporcell.res) <- souporcell.res$barcode AS$souporcell <- souporcell.res[names(AS@active.ident), 'identity']
We get high singlet rate for 8pool-AS using hybrid demultiplexing and saved reagents of cell hashing on 8pool-AS. Beside the economy, there is a more critical reason for not using cell hashing in 8pool-AS dataset: we aim to measure the binding affinity between antigen specific B cells and our antigen probes. We don't want any disruptions from the cell hashing due to they share the same channel(Antibody Capture)
Singlet rate of hybrid on 8pool-AS:
(1 - length(which(AS$souporcell %in% c('doublet','unassigned'))) / length(AS$souporcell)) * 100 table(AS$souporcell)
We have illustrated the effectiveness of hybrid demultiplexing on "high quality" data, where cell hashing and SNP-based methods are highly consistent with each other. However, there are instances where the two modalities do not align well. In such cases, our hybrid demultiplexing approach adeptly identifies significant misalignment issues via the convergence score and issues a warning to the user. Discrepancies between the two modalities often suggest a high doublet rate or a complex genetic background, necessitating careful examination by users with relevant biological expertise. To exemplify this, we use the dataset labeled '9pool-CA':
DN.9pool.data <- Read10X(data.dir = "../data/9pool-CA-RNA/filtered_feature_bc_matrix/") DN.9pool.hto <- DN.9pool.data[["Antibody Capture"]][138:146,] DN.9pool.data[["Antibody Capture"]] <- DN.9pool.data[["Antibody Capture"]][1:137,] DN.9pool <- CreateSeuratObject(counts = DN.9pool.data$`Gene Expression`, project = "Carr") DN.9pool[['HTO']] <- CreateAssayObject(counts = DN.9pool.hto) DN.9pool$subject <- "CA" DN.9pool <- RenameCells(object = DN.9pool, add.cell.id = "CA") DN.9pool[["percent.mt"]] <- PercentageFeatureSet(object = DN.9pool, pattern = "^MT-") DN.9pool[["percent.ig"]] <- PercentageFeatureSet(object = DN.9pool, pattern = "^IG[HKL]") DN.9pool <- subset(DN.9pool, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 10) DN.9pool <- NormalizeData(DN.9pool, assay = "HTO", normalization.method = "CLR")
Cell hashing demultiplexing using HTOreader
DN.9pool <- HTOClassification(DN.9pool, assay = 'HTO', method = 'log')
Load souporcell results. There are three runs, 1) using all cells, 2) only remove cells that expressed both B and T cell markers, 3) remove all doublet cells that identified by cell hashing
# souporcell results using all cells sampleID <- read.table(file = '9pool/clusters.csv',header = TRUE, sep = ',', row.names = 1) soupID <- as.character(DN.9pool@meta.data[["orig.ident"]]) i <- 1 for (barcode in names(DN.9pool@active.ident)){ barcode1 <- gsub('^[A-Za-z]+_','',barcode) if(barcode1 %in% rownames(sampleID)) { soupID[i] <- sampleID[barcode1,'identity'] }else { soupID[i] <- 'unassigned' } i <- i + 1 } DN.9pool$soupID1 <- soupID # souporcell results on the dataset, after excluding cells that expressed both B and T cell markers sampleID <- read.table(file = '9pool/clusters1.csv',header = TRUE, sep = ',', row.names = 1) soupID <- as.character(DN.9pool@meta.data[["orig.ident"]]) i <- 1 for (barcode in names(DN.9pool@active.ident)){ barcode1 <- gsub('^[A-Za-z]+_','',barcode) if(barcode1 %in% rownames(sampleID)) { soupID[i] <- sampleID[barcode1,'identity'] }else { soupID[i] <- 'unassigned' } i <- i + 1 } DN.9pool$soupID2 <- soupID # souporcell results on the dataset, , after excluding all doublet cells that identified by cell hashing sampleID <- read.table(file = '9pool/clusters2.csv',header = TRUE, sep = ',', row.names = 1) soupID <- as.character(DN.9pool@meta.data[["orig.ident"]]) i <- 1 for (barcode in names(DN.9pool@active.ident)){ barcode1 <- gsub('^[A-Za-z]+_','',barcode) if(barcode1 %in% rownames(sampleID)) { soupID[i] <- sampleID[barcode1,'identity'] }else { soupID[i] <- 'unassigned' } i <- i + 1 } DN.9pool$soupID3 <- soupID
We then run HybridDemultiplexing for using all three genotype labels. For soupID1, a low convergence score was identified, and a warning was poped. HybridID was set to the cell hashing labels cause the genotype labels are not accurate.
DN.9pool <- HybridDemultiplexing(object = DN.9pool, cellhashing_label = 'HTOid', genotype_label = 'soupID1', hto_names = DN.9pool.hto@Dimnames[[1]])
For soupID2, the convergence score was improved, but still too low. HybridID was set to the cell hashing labels cause the genotype labels are not accurate.
DN.9pool <- HybridDemultiplexing(object = DN.9pool, cellhashing_label = 'HTOid', genotype_label = 'soupID2', hto_names = DN.9pool.hto@Dimnames[[1]])
For soupID3, the convergence score was highly improved. HybridID was determined by both genotype labels and cell hashing labels.
DN.9pool <- HybridDemultiplexing(object = DN.9pool, cellhashing_label = 'HTOid', genotype_label = 'soupID3', hto_names = DN.9pool.hto@Dimnames[[1]])
Session info
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.