options(tinytex.verbose = TRUE)
knitr::opts_chunk$set(
  cache = TRUE,
  cache.lazy = FALSE,
  tidy = TRUE
)

Load packages

# we would like to demonstrate the hybrid strategy using Seurat workflow
library(Seurat)
library(cowplot)

# packages for cell hashing demultiplexing 
library(HTOreader)
library(cellhashR) # this package integrated the following methods: multiseq, htodemux, gmm_demux, bff_raw, bff_cluster and dropletutils

Step 1 Load data from 10X folder

We load 8pool-CA dataset from the 10X folder using Seurat

# Load in the RNA UMI matrix for doublet negative (DN) dataset and antigen specific (AS) dataset
DN.data <- Read10X(data.dir = "../data/8pool-Carr-RNA/filtered_feature_bc_matrix/")

split different modalities: Cell hashing (HTO), antigen probes (Probe) and CITE-seq surface protein panel (ADT) In 8pool DN and AS datasets, we have applied 8 cell hashing tags, 18 antigen probes, and a large CITE-seq panel

DN.probe <- DN.data[["Antibody Capture"]][1:18,]                            # expression of antigen probes, Carrier data set doesn't have this measurement
DN.hto <- DN.data[["Antibody Capture"]][19:26,]                             # expression of cell hashing tags  
DN.ADT <- DN.data[["Antibody Capture"]][27:163,]                            # expression of CITE-seq surface protein panel

Step 2 Create Seurat object and attached multi-modality data

# create Seurat object
DN <- CreateSeuratObject(counts = DN.data$`Gene Expression`, project = "Carr")

# create multi-modality assay and attach them to Seurat object
DN[['HTO']] <- CreateAssayObject(counts = DN.hto)
DN[['ADT']] <- CreateAssayObject(counts = DN.ADT)

# percent of MT genes
DN[["percent.mt"]] <- PercentageFeatureSet(object = DN, pattern = "^MT-")
# percent of IG genes (for B cell study)
DN[["percent.ig"]] <- PercentageFeatureSet(object = DN, pattern = "^IG[HKL]")

Step 3 Pre-process

Filter out unwanted cells (optional)

DN <- subset(DN, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 10)

Normalize HTO and ADT data

DN <- NormalizeData(DN, assay = "HTO", normalization.method = "CLR")
DN <- NormalizeData(DN, assay = "ADT", normalization.method = "CLR")

Step 4 Cell hashing demultiplexing

1) Demultiplexing of cell hashing using existing methods integrated in cellhashR: multiseq, gmm_demux, bff_raw, bff_cluster, dropletutils

load data (we hide the plots of this step)

#  parse cell hashing output, providing a barcode whitelist. (please refer to cellhashR tutorial for more information: https://github.com/BimberLab/cellhashR?tab=readme-ov-file#example) 
barcodeData <- ProcessCountMatrix(rawCountData = '../data/8pool-Carr-RNA/filtered_feature_bc_matrix/', barcodeWhitelist = c("hashtag1","hashtag2","hashtag3","hashtag4","hashtag5","hashtag6","hashtag7","hashtag8"), datatypeName = 'Antibody Capture')
rownames(barcodeData) <- c("S282","S283","S284","S344","S397","S417","S421","S423")

QC (we hide the plots of this step)

# Create QC plots of barcode normalization
PlotNormalizationQC(barcodeData)

Calling cell hashing using different methods (we hide the plots of this step)

# Generate the final cell hashing calls
calls <- GenerateCellHashingCalls(barcodeMatrix = barcodeData, methods = c('multiseq', 'gmm_demux','bff_raw','bff_cluster','dropletutils'))
# Inspect negative cells:
SummarizeCellsByClassification(calls = calls, barcodeMatrix = barcodeData)

2) Demultiplexing of cell hashing using Seurat (htodemux)

DN <- NormalizeData(DN, assay = "HTO", normalization.method = "CLR")
DN <- HTODemux(DN, assay = "HTO", positive.quantile = 0.99)

3) Demultiplexing of cell hashing using HTOreader

DN <- HTOClassification(DN, assay = 'HTO', method = 'log')

plots <- PlotHTO(DN, assay = 'HTO', method = 'log')
plot_grid(plotlist = plots, nrow = 2)

4)integrate HTOreader, Seurat results with cellhashR results

rownames(calls) <- calls$cellbarcode
DN_barcode <- names(DN@active.ident)
DN_barcode <- sub('-1','',DN_barcode)
DN_hq_barcode <- intersect(DN_barcode, calls$cellbarcode)

calls_hq <- calls[DN_hq_barcode,]

HTOid.hq <- DN$HTOid
names(HTOid.hq) <- sub('-1','',names(HTOid.hq))
calls_hq$HTOid <- HTOid.hq[rownames(calls_hq)]

Seuratid.hq <- DN$hash.ID
names(Seuratid.hq) <- sub('-1','',names(Seuratid.hq))
calls_hq$seurat <- as.character(Seuratid.hq[rownames(calls_hq)])
res_table <- matrix(data = c(table(calls_hq$multiseq), table(calls_hq$gmm_demux), table(calls_hq$bff_raw), table(calls_hq$bff_cluster), table(calls_hq$dropletutils), table(calls_hq$HTOid), table(calls_hq$seurat)), nrow = 7, ncol = 10, byrow = TRUE,dimnames = list(c('multiseq', 'gmm_demux','bff_raw','bff_cluster', 'dropletutils','HTOreader','htodemux(Seurat)'), names(table(calls_hq$multiseq))))

# singlet rate
singlet_count <- rowSums(res_table[,which(colnames(res_table) %in% rownames(barcodeData))])
singlet_rate <- singlet_count / rowSums(res_table)

final_table <- cbind(res_table, singlet_count, singlet_rate)
colnames(final_table) <- c(colnames(res_table), 'singlet','singlet rate')

Cell hashing Demultiplexing results:

final_table

We found two BFF models failed to identify S282. The best singlet rate is 83.9%, average rate is ~80% (exlcuding BFF models due to failure of S282)

Step 4, Apply hybrid strategy: combine cell hashing demultiplexing with SNP-based method

In order to apply hybrid demultiplexing, we need to add SNP-based labels into Seurat object:

# add soupercell labels into Seurat object
souporcell.res <- read.csv('./8pool/clusters.tsv', sep = '\t')
souporcell.res$identity <- souporcell.res$status
souporcell.res$identity[which(souporcell.res$status == 'singlet')] <- paste0(souporcell.res$status[which(souporcell.res$status == 'singlet')], souporcell.res$assignment[which(souporcell.res$status == 'singlet')])
rownames(souporcell.res) <- souporcell.res$barcode
DN$souporcell <- souporcell.res[names(DN@active.ident), 'identity']

Then we reveal cell labels using hybrid demultiplexing ("HybridDemultiplexing" function in HTOreader package). In this case, we use HTOreader result for cell hashing demultiplexing

DN <- HybridDemultiplexing(object = DN, cellhashing_label = 'HTOid', genotype_label = 'souporcell', hto_names = c("S282","S283","S284","S344","S397","S417","S421","S423"))
table(DN$hybridID)

BTW, The Hybrid strategy is highly flexiable to cell hashing demultiplexing method and quality, it works with all cell hashing methods. For example:

# Hybrid demultiplexing using HTOdemux (Seurat)
DN <- HybridDemultiplexing(object = DN, cellhashing_label = 'hash.ID', genotype_label = 'souporcell', hto_names = c("S282","S283","S284","S344","S397","S417","S421","S423"))
table(DN$hybridID)

Furthermore, The Hybrid strategy even works with two BFF models that fail to identify S282. HybridDemultiplexing correctly linked singlet6 with S282.

rownames(calls) <- paste0(calls$cellbarcode, '-1')
DN$bff_cluster <- calls[names(DN@active.ident), 'bff_cluster']

DN <- HybridDemultiplexing(object = DN, cellhashing_label = 'bff_cluster', genotype_label = 'souporcell', hto_names = c("S282","S283","S284","S344","S397","S417","S421","S423"))

We observe that results from hybrid runs using various cell hashing methods may exhibit minor variations. This is due to our adoption of a conservative strategy, which categorizes all Case 2/3 inconsistency as unassigned to guarantee the accuracy and precision of the outcomes. For detailed information, please refer to the Methods section of our paper.

table(DN$hybridID)

Step 5, hybrid demultiplexing for multiple datasets from the same group of donors (optional)

In this case, we have 8pool-AS, which shares the same group of human donors but doesn't have cell hashing for some reason. Using Hybrid demultiplexing, we're still able to handle it. We load the 8pool-AS data first:

AS.data <- Read10X(data.dir = "../data/8pool-AS-RNA/filtered_feature_bc_matrix/")

AS.probe <- AS.data[["Antibody Capture"]][1:18,]                            # expression of antigen probes
AS.hto <- AS.data[["Antibody Capture"]][19:26,]                             # expression of cell hashing tags, Antigen specific data set doesn't have this measurement, they have 0 counts 
AS.ADT <- AS.data[["Antibody Capture"]][27:163,]                            # expression of CITE-seq surface protein panel

AS <- CreateSeuratObject(counts = AS.data$`Gene Expression`, project = "AS")

AS[['ADT']] <- CreateAssayObject(counts = AS.ADT)
AS[['Probe']] <- CreateAssayObject(counts = AS.probe)

AS[["percent.mt"]] <- PercentageFeatureSet(object = AS, pattern = "^MT-")
AS[["percent.ig"]] <- PercentageFeatureSet(object = AS, pattern = "^IG[HKL]")

AS <- subset(AS, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 10)

AS <- NormalizeData(AS, assay = "Probe", normalization.method = "CLR")
AS <- NormalizeData(AS, assay = "ADT", normalization.method = "CLR")

We ran souporcell on the entire 8pool dataset (AS + DN) so that the genotypes are across two datasets. We annotate souporcell genotype clusters to 8pool-AS dataset

souporcell.res <- read.csv('./8pool/clusters.tsv', sep = '\t')
souporcell.res$identity <- souporcell.res$status
souporcell.res$identity[which(souporcell.res$status == 'singlet')] <- paste0(souporcell.res$status[which(souporcell.res$status == 'singlet')], souporcell.res$assignment[which(souporcell.res$status == 'singlet')])
rownames(souporcell.res) <- souporcell.res$barcode

AS$souporcell <- souporcell.res[names(AS@active.ident), 'identity']

We get high singlet rate for 8pool-AS using hybrid demultiplexing and saved reagents of cell hashing on 8pool-AS. Beside the economy, there is a more critical reason for not using cell hashing in 8pool-AS dataset: we aim to measure the binding affinity between antigen specific B cells and our antigen probes. We don't want any disruptions from the cell hashing due to they share the same channel(Antibody Capture)

Singlet rate of hybrid on 8pool-AS:

(1 - length(which(AS$souporcell %in% c('doublet','unassigned'))) / length(AS$souporcell)) * 100
table(AS$souporcell)

Supplementary, hybrid demultiplexing for dataset that has high doublet rate

We have illustrated the effectiveness of hybrid demultiplexing on "high quality" data, where cell hashing and SNP-based methods are highly consistent with each other. However, there are instances where the two modalities do not align well. In such cases, our hybrid demultiplexing approach adeptly identifies significant misalignment issues via the convergence score and issues a warning to the user. Discrepancies between the two modalities often suggest a high doublet rate or a complex genetic background, necessitating careful examination by users with relevant biological expertise. To exemplify this, we use the dataset labeled '9pool-CA':

DN.9pool.data <- Read10X(data.dir = "../data/9pool-CA-RNA/filtered_feature_bc_matrix/")
DN.9pool.hto <- DN.9pool.data[["Antibody Capture"]][138:146,]
DN.9pool.data[["Antibody Capture"]] <- DN.9pool.data[["Antibody Capture"]][1:137,]
DN.9pool <- CreateSeuratObject(counts = DN.9pool.data$`Gene Expression`, project = "Carr")
DN.9pool[['HTO']] <- CreateAssayObject(counts = DN.9pool.hto)
DN.9pool$subject <- "CA"
DN.9pool <- RenameCells(object = DN.9pool, add.cell.id = "CA")
DN.9pool[["percent.mt"]] <- PercentageFeatureSet(object = DN.9pool, pattern = "^MT-")
DN.9pool[["percent.ig"]] <- PercentageFeatureSet(object = DN.9pool, pattern = "^IG[HKL]")
DN.9pool <- subset(DN.9pool, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 10)
DN.9pool <- NormalizeData(DN.9pool, assay = "HTO", normalization.method = "CLR")

Cell hashing demultiplexing using HTOreader

DN.9pool <- HTOClassification(DN.9pool, assay = 'HTO', method = 'log')

Load souporcell results. There are three runs, 1) using all cells, 2) only remove cells that expressed both B and T cell markers, 3) remove all doublet cells that identified by cell hashing

# souporcell results using all cells
sampleID <- read.table(file = '9pool/clusters.csv',header = TRUE, sep = ',', row.names = 1)
soupID <- as.character(DN.9pool@meta.data[["orig.ident"]])
i <- 1
for (barcode in names(DN.9pool@active.ident)){
  barcode1 <- gsub('^[A-Za-z]+_','',barcode)
  if(barcode1 %in% rownames(sampleID)) {
    soupID[i] <- sampleID[barcode1,'identity']
  }else {
    soupID[i] <- 'unassigned'
  }
  i <- i + 1
}
DN.9pool$soupID1 <- soupID

# souporcell results on the dataset, after excluding cells that expressed both B and T cell markers
sampleID <- read.table(file = '9pool/clusters1.csv',header = TRUE, sep = ',', row.names = 1)
soupID <- as.character(DN.9pool@meta.data[["orig.ident"]])
i <- 1
for (barcode in names(DN.9pool@active.ident)){
  barcode1 <- gsub('^[A-Za-z]+_','',barcode)
  if(barcode1 %in% rownames(sampleID)) {
    soupID[i] <- sampleID[barcode1,'identity']
  }else {
    soupID[i] <- 'unassigned'
  }
  i <- i + 1
}
DN.9pool$soupID2 <- soupID

# souporcell results on the dataset, , after excluding all doublet cells that identified by cell hashing
sampleID <- read.table(file = '9pool/clusters2.csv',header = TRUE, sep = ',', row.names = 1)
soupID <- as.character(DN.9pool@meta.data[["orig.ident"]])
i <- 1
for (barcode in names(DN.9pool@active.ident)){
  barcode1 <- gsub('^[A-Za-z]+_','',barcode)
  if(barcode1 %in% rownames(sampleID)) {
    soupID[i] <- sampleID[barcode1,'identity']
  }else {
    soupID[i] <- 'unassigned'
  }
  i <- i + 1
}
DN.9pool$soupID3 <- soupID

We then run HybridDemultiplexing for using all three genotype labels. For soupID1, a low convergence score was identified, and a warning was poped. HybridID was set to the cell hashing labels cause the genotype labels are not accurate.

DN.9pool <- HybridDemultiplexing(object = DN.9pool, cellhashing_label = 'HTOid', genotype_label = 'soupID1', hto_names = DN.9pool.hto@Dimnames[[1]])

For soupID2, the convergence score was improved, but still too low. HybridID was set to the cell hashing labels cause the genotype labels are not accurate.

DN.9pool <- HybridDemultiplexing(object = DN.9pool, cellhashing_label = 'HTOid', genotype_label = 'soupID2', hto_names = DN.9pool.hto@Dimnames[[1]])

For soupID3, the convergence score was highly improved. HybridID was determined by both genotype labels and cell hashing labels.

DN.9pool <- HybridDemultiplexing(object = DN.9pool, cellhashing_label = 'HTOid', genotype_label = 'soupID3', hto_names = DN.9pool.hto@Dimnames[[1]])

Session info

sessionInfo()


LeiLi-Uchicago/HTOreader documentation built on Aug. 14, 2024, 8:38 p.m.