library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
library('knitr')
library('dplyr')
library('ggrepel')
library('patchwork')
knit_hooks$set(optipng = hook_optipng)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  cache = FALSE,
  warning = FALSE,
  digits = 2,
  tidy = TRUE,
  tidy.opts = list(width.cutoff=80),
  optipng = '-o 5 -strip all -quiet',
  fig.width=6.5, fig.height=2.5, dpi=100, out.width = '80%'
)
options(dplyr.summarise.inform = FALSE)
options(tibble.width = Inf)
old_theme <- theme_set(theme_bw(base_size=10))
# We use the Future API for parallel processing; set parameters here
future::plan(strategy = 'multicore', workers = 1)
options(future.globals.maxSize = 8 * 1024 ^ 3)
options(future.fork.enable = FALSE)
set.seed(6646428)
tic <- proc.time()

NOTE: This document was generated with sctransform version r packageVersion(pkg = 'sctransform')

Introduction

With this vignette we introduce the non-parametric differential expression test for sparse non-negative data as seen in single-cell RNA sequencing.

The test is a model-free randomization test where the observed difference in mean between two groups is compared against a null distribution that is obtained by random shuffling of the group labels.

Given the observed difference and the null distribution, empirical p-values are calculated: emp_pval = (b + 1) / (R + 1) where b is the number of times the absolute difference in mean from a random permutation is at least as large as the absolute value of the observed difference in mean, R is the number of random permutations. This is an upper bound of the real empirical p-value that would be obtained by enumerating all possible group label permutations.

Additionally, we approximate the empirical null distribution with a normal distribution and turn the observed difference in mean into a z-score and then into a p-value. Finally, all p-values (for the tested genes) are adjusted using the Benjamini & Hochberg method (fdr).

The log2FC values in the output are log2(mean1 / mean2).

The current implementation only supports sparse matrices of class dgCMatrix from the Matrix package and has been optimized for speed (see benchmarking results below).

Load some data

We use the publicly available "10k PBMCs from a Healthy Donor (v3 chemistry)" data (11,769 cells) from 10x Genomics available at https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0/pbmc_10k_v3

The data is the same as the demo dataset used in Azimuth. We load the results obtained from Azimuth (Azimuth version: 0.3.2; Seurat version: 4.0.0; Reference version: 1.0.0; default parameters in web-app).

We then keep only those cell types that have at least 5 cells with a prediction and mapping score > 0.66 and further remove all genes that have not been detected in at least 5 cells.

counts <- Seurat::Read10X_h5('~/Projects/data_warehouse/raw_public_10x/pbmc_10k_v3_filtered_feature_bc_matrix.h5')
predictions <- read.delim('~/Projects/data_warehouse/raw_public_10x/pbmc_10k_v3_azimuth_pred.tsv', row.names = 1)

tab <- table(predictions$predicted.celltype.l2, predictions$predicted.celltype.l2.score > 0.66 & predictions$mapping.score > 0.66)
keep_types <- rownames(tab)[tab[, 2] >= 5]
keep_cells <- rownames(predictions)[predictions$predicted.celltype.l2 %in% keep_types]

counts <- counts[, keep_cells]
counts <- counts[rowSums(counts > 0) >= 5, ]
predictions <- predictions[keep_cells, ]

cell_types <- factor(predictions$predicted.celltype.l2, levels = names(sort(table(predictions$predicted.celltype.l2), decreasing = TRUE)))

We now have a count matrix of r nrow(counts) genes and r ncol(counts) cells with the following cell type labels:

data.frame(table(cell_types))

Motivation

Here we illustrate the concept of the test using CD14 Monocytes as group 1 and all remaining cells as group 2. We will show two example genes: HINT1 (not differentially expressed), and CD14.

goi <- c('HINT1', 'CD14')
df <- melt(t(as.matrix(counts[goi, , drop = FALSE])), varnames = c('cell', 'gene'), value.name = 'counts')
df$cell_type <- factor(c('rest', 'CD14 Mono')[(cell_types == 'CD14 Mono') + 1])
# calculate the (geometric) mean per group
df_sum <- group_by(df, gene, cell_type) %>% 
  summarise(mean = expm1(mean(log1p(counts))), mid = median(range(counts)), .groups = 'drop')
# and the difference of means
df_diff <- group_by(df_sum, gene) %>% 
  summarise(diff_mean = mean[1] - mean[2], 
            label = sprintf('Difference in mean: %1.2g\nlog2 fold-change: %1.2g', diff_mean, log2(mean[1] / mean[2])), 
            x = max(mid), 
            y = Inf, 
            .groups = 'drop')
p1 <- ggplot(df, aes(counts, y = ..density.., fill = cell_type)) +
  geom_histogram(binwidth = 1, position = 'identity', alpha = 0.4) +
  geom_vline(data = df_sum, aes(xintercept = mean, color = cell_type)) +
  geom_label(data = df_diff, aes(x, y, label = label), 
            vjust = 1, inherit.aes = FALSE, size = 3) +
  facet_wrap(~ gene, scales = 'free') +
  xlab('Gene counts') + ylab('Proportion of cells') +
  ggtitle('Observed data and differences in geometric mean')
plot(p1)

The plot above shows the UMI counts per gene per group. Also shown is the difference in mean (mean1 - mean2) and the log2 fold-change (log2(mean1 / mean2)). To find out whether the observed difference in mean is significant we look at the null distribution of difference in mean, i.e. we shuffle the labels (here we use 99 repetitions) and calculate the difference in mean.

# calculate null distribution of difference in mean for each gene
grp <- factor(c('rest', 'CD14 Mono')[(cell_types == 'CD14 Mono') + 1])
tmp_counts <- counts[goi, , drop = FALSE]
R <- 99
diff_mean_null <- sapply(1:R, function(i) {
  mean_r <- sctransform:::row_gmean_grouped_dgcmatrix(matrix = tmp_counts, group = grp, eps = 1, shuffle = TRUE)
  mean_r[, 1] - mean_r[, 2]
})
df_null <- melt(diff_mean_null, varnames = c('gene', 'iteration'), value.name = 'diff_mean')

p2 <- ggplot(df_null, aes(diff_mean)) + 
  geom_histogram(bins = 33) +
  facet_wrap(~ gene, scales = 'free') +
  xlab('Difference in geometric mean') + ylab('Count') +
  ggtitle('Null distribution of differences in geometric mean')
plot(p2)

The null distribution of 'difference in mean' shown above indicates what values to expect if the null is true (no difference in mean between the two groups). We can use the distribution to obtain an empirical p-value by asking how often the absolute value of the null distribution is larger or equal to the observed difference in mean. We use the absolute value since this is a two-tailed test, and use a pseudo-count in nominator and denominator when turning the observed frequencies into p-values (see @phipson2010 and @hartwig2013 for discussions).

# given the null distribution, get empirical p-value, fit a gaussian and get
# approximated p-value
df_res <- left_join(df_null, df_diff, by = 'gene') %>% 
  group_by(gene) %>% 
  summarise(
    emp_pval = (sum((abs(diff_mean.x) - abs(diff_mean.y)) >= 0) + 1) / (R + 1), 
    sds = sqrt(sum(diff_mean.x^2)/(R-1)),
    zscore = (diff_mean.y[1] - mean(diff_mean.x)) / sds,
    pval = 2 * pnorm(-abs(zscore)),
    min_r = min(diff_mean.x),
    max_r = max(diff_mean.x),
    mean_r = mean(diff_mean.x),
    observed = diff_mean.y[1],
    .groups = 'drop')
df_fit <- group_by(df_res, gene) %>% 
  summarise(x = seq(from = min(min_r, observed), to = max(max_r, observed), length.out = 333),
            y = dnorm(x = x, mean = mean_r, sd = sds), .groups = 'drop')
df_anno <- group_by(df_res, gene) %>% 
  summarise(x = max(max_r, observed),
            y = Inf,
            label = sprintf('Empirical p-value: %1.2g\nApprox. p-value: %1.2g', emp_pval, pval))

p3 <- ggplot(df_null, aes(diff_mean, y = ..density..)) + 
  geom_histogram(bins = 33, aes(fill = 'gray70')) +
  geom_line(data = df_fit, aes(x = x, y = y, linetype = '1')) +
  geom_vline(data = df_res, aes(xintercept = observed, linetype = '2'), show_guide=FALSE) +
  geom_label(data = df_anno, aes(x, y, label = label), hjust = 1, vjust = 1, size = 3) +
  facet_wrap(~ gene, scales = 'free') +
  xlab('Difference in geometric mean') + ylab('Distribution density') +
  ggtitle('Using the null distribution to obtain p-values') +
  scale_fill_manual(name = '', values = 'gray70', labels = sprintf('null distribution', R)) +
  scale_linetype_manual(name = '', values = c(1, 2), labels = c('Approximated null', 'Observed difference\nin mean'))
plot(p3)

The lowest possible empirical p-value is 1/(R+1) whith R being the number of random permutation used. However, the gaussian approximation of the null distribution allows us to calculate z-scores and consequently p-values that are lower than that. While the approximation using a gaussian might not be exact, especially for genes with very low detection rate or when cell numbers are very low, it generally agrees well with the empirical data.

Example 1: DE of CD14 Mono vs CD16 Mono (one vs one)

First, we will take the count matrix and fit a model using sctransform::vst, and in a second step obtain corrected counts (with the sequencing depth effect removed). Then compare the two groups.

vst_out <- vst(umi = counts, method = 'qpoisson', residual_type = 'none', return_cell_attr = TRUE, verbosity = 0)
counts_corrected <- correct_counts(x = vst_out, umi = counts, verbosity = 0)

By default sctransform::diff_mean_test applies some moderate pre-filtering and tests only genes with:

Here we disable the first filter, but require a mean of at least 0.1 in at least one of the groups. We show results as a volcano plot and highlight the top DE genes (based on p-value or log-fold-change).

bm <- bench::mark(
  de_res <- diff_mean_test(y = counts_corrected, 
                           group_labels = cell_types, 
                           compare = c('CD14 Mono', 'CD16 Mono'),
                           log2FC_th = 0, 
                           mean_th = 0.1),
  max_iterations = 1, 
  filter_gc = FALSE
)
knitr::kable(data.frame(bm[, 5:9]), caption = "Benchmarking details")

The runtime for the tests was r as.character(bm$total_time). In total, r nrow(de_res) genes were tested. r sum(de_res$emp_pval_adj <= 0.05 & abs(de_res$log2FC) >= 1) genes showed a mean that was significantly different between groups (FDR <= 0.05 based on empirical p-values) AND had an absolute log2 fold-change of at least 1 (i.e. abs(log2(mean1/mean2)) >= 1).

top_markers <- arrange(de_res, sign(log2FC), -abs(log2FC)) %>%
  group_by(sign(log2FC)) %>%
  filter(rank(-abs(zscore), ties.method = "first") <= 4 |
         rank(-abs(log2FC), ties.method = "first") <= 4) %>%
  ungroup() %>%
  select(gene, mean1, mean2, log2FC, emp_pval_adj, pval_adj, zscore)

p1 <- ggplot(de_res, aes(log2FC, pmax(-0.5, log10(abs(zscore))))) + 
  geom_point(aes(color = emp_pval_adj < 0.05 & pval_adj < 0.05)) + 
  geom_point(data = top_markers, color = 'deeppink') +
  geom_text_repel(data = top_markers, mapping = aes(label = gene)) +
  theme(legend.position = 'bottom') +
  ylab('Zscore [log10 of absolute value, clipped at -0.5]') +
  xlab('log2 fold-change (log2(mean1 / mean2))')

show(p1)

Top markers per cell type

filter(top_markers, log2FC < 0) %>% DT::datatable(rownames = FALSE, options = list(paging = FALSE, searching = FALSE)) %>% DT::formatRound(2:7, digits = 2)
filter(top_markers, log2FC > 0) %>% DT::datatable(rownames = FALSE, options = list(paging = FALSE, searching = FALSE)) %>% DT::formatRound(2:7, digits = 2)

Example 2: Top markers for all cell types (each vs rest)

Here we use the test to find genes that are high for each cell type compared to the rest. This is the default behavior of the test function. To speed things up, we use fewer random permutations (49) and test only the 222 genes with highest log2 fold-change.

bm <- bench::mark(
  de_res <- diff_mean_test(y = counts_corrected, 
                           group_labels = cell_types, 
                           R = 49, 
                           only_pos = TRUE, 
                           only_top_n = 222,
                           verbosity = 0)
)
knitr::kable(data.frame(bm[, 5:9]), caption = "Benchmarking details")

Show one plot per cell type and highlight the top 4 markers with respect to p-value and the top 4 markers with respect to log2FC.

top_markers <- group_by(de_res, group1) %>% 
  filter(rank(-zscore, ties.method = "first") <= 4 | rank(-log2FC, ties.method = "first") <= 4) %>% select(group1, gene, mean1, mean2, log2FC, zscore, emp_pval_adj)

p1 <- ggplot(de_res, aes(pmin(log2FC, 10), pmin(log10(zscore), 4))) + 
  geom_point(aes(color = emp_pval_adj < 0.05)) + 
  geom_point(data = top_markers, color = 'deeppink') +
  geom_text_repel(data = top_markers, mapping = aes(label = gene)) +
  facet_wrap(~ group1, ncol = 3) + 
  theme(legend.position = 'bottom')
show(p1)

Table of top markers per cell type

DT::datatable(top_markers, rownames = FALSE) %>% DT::formatRound(3:7, digits = 2)

Example 3: Unique markers for all cell types (all vs all)

Since the test is pretty fast we can run all pairwise comparisons to try to find genes that are cell type specific across many individual comparisons.

bm <- bench::mark(
  de_res <- diff_mean_test(y = counts_corrected, 
                           group_labels = cell_types, 
                           compare = 'all_vs_all',
                           R = 49, 
                           mean_th = 0.1, 
                           log2FC_th = 0,
                           verbosity = 0)
)
knitr::kable(data.frame(bm[, 5:9]), caption = "Benchmarking details")

There are r length(unique(paste(de_res$group1, de_res$group2))) unique cell type pairs in the output. On average r round(nrow(de_res)/length(unique(paste(de_res$group1, de_res$group2)))) genes were tested in each comparison. Below we list the top genes per cell type ranked by fold change and the number of times it was a cell type marker (absolute log2FC >= 0 and p-value <= 0.05). We also show the minimum log2 fold-change and the maximum p-value across all comparisons.

log2fc_th <- 0#log2(1.2)
p_th <- 0.05
fn <- function(fc, pval, fc_th = 0, p_th = Inf) {
  worst <- which.min(fc)
  wp <- pval[worst]
  if (fc[worst] < 0) {
    wp <- 1 - wp
  }
  data.frame(n = sum(fc >= fc_th & pval <= p_th),
             log2FC = fc[worst],
             pval = wp)
}
tmp1 <- group_by(de_res, gene, group1) %>% 
  summarise(fn(log2FC, pval))

tmp1 <- group_by(de_res, gene, group1) %>% 
  summarise(n = sum(log2FC >= log2fc_th),
            n_sig = sum(log2FC >= log2fc_th & pval <= p_th),
            log2FC = min(log2FC), 
            pval = max(pval, log2FC < 0))

tmp2 <- group_by(de_res, gene, group2) %>% 
  summarise(n = sum(log2FC <= -log2fc_th),
            n_sig = sum(log2FC <= -log2fc_th & pval <= p_th),
            log2FC = min(-log2FC), 
            pval = max(pval, log2FC < 0))

top_markers <- full_join(tmp1, tmp2, by = c('group1' = 'group2', 'gene' = 'gene')) %>%
  rename(cell_type = group1) %>%
  mutate(comparisons = mapply(sum, n.x, n.y, na.rm = TRUE),
         n_sig = mapply(sum, n_sig.x, n_sig.y, na.rm = TRUE),
         log2FC = pmin(log2FC.x, log2FC.y, na.rm = TRUE),
         pval = pmax(pval.x, pval.y, na.rm = TRUE)) %>%
  arrange(cell_type, -comparisons, -log2FC*n_sig, pval) %>%
  group_by(cell_type) %>%
  slice_head(n = 4) %>%
  select(cell_type, gene, comparisons, log2FC, pval)

DT::datatable(top_markers, rownames = FALSE, caption = 'Cell type specific markers') %>% 
  DT::formatRound(4:5, digits = 2)

Heatmap of top markers

mat <- sctransform:::row_gmean_grouped_dgcmatrix(
  matrix = counts_corrected[unique(top_markers$gene), ], group = cell_types, eps = 1, shuffle = FALSE)
mat <- t(scale(t(mat)))

p1 <- melt(mat, varnames = c('gene', 'celltype')) %>% 
  mutate(celltype = factor(celltype, levels = rev(colnames(mat)))) %>%
  ggplot(aes(gene, celltype, fill = value)) +
  geom_tile(colour = "gray66") +
  scale_fill_gradient2(low = 'white', mid = 'white', high = 'black', 
                       name = 'Geometric mean of each gene per cell type, scaled per gene') +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(x = NULL, y = NULL) +
  scale_x_discrete(expand = c(0, 0)) + 
  scale_y_discrete(expand = c(0, 0)) +
  theme(legend.position = 'top', axis.ticks = element_blank()) 
show(p1)

Example 4: CD4 vs CD8 T cells (some vs some)

We can also test one or more groups against one or more other groups. As an example we test CD4 vs CD8 cells (in each group we combine Naive, TCM, TEM subgroups). The cell type identities for the comparison are passed as a list of length two to the compare argument of the diff_mean_test function.

bm <- bench::mark(
  de_res <- diff_mean_test(y = counts_corrected, 
                           group_labels = cell_types, 
                           compare = list(c('CD4 Naive', 'CD4 TCM', 'CD4 TEM'),
                                          c('CD8 Naive', 'CD8 TCM', 'CD8 TEM')),
                           log2FC_th = 0, 
                           mean_th = 0.1),
  max_iterations = 1, 
  filter_gc = FALSE
)
knitr::kable(data.frame(bm[, 5:9]), caption = "Benchmarking details")
top_markers <- arrange(de_res, sign(log2FC), -abs(log2FC)) %>%
  group_by(sign(log2FC)) %>%
  filter(rank(-abs(zscore), ties.method = "first") <= 4 |
         rank(-abs(log2FC), ties.method = "first") <= 4) %>%
  ungroup() %>%
  select(gene, mean1, mean2, log2FC, emp_pval_adj, pval_adj, zscore)

p1 <- ggplot(de_res, aes(log2FC, pmax(-0.5, log10(abs(zscore))))) + 
  geom_point(aes(color = emp_pval_adj < 0.05 & pval_adj < 0.05)) + 
  geom_point(data = top_markers, color = 'deeppink') +
  geom_text_repel(data = top_markers, mapping = aes(label = gene)) +
  theme(legend.position = 'bottom') +
  ylab('Zscore [log10 of absolute value, clipped at -0.5]') +
  xlab('log2 fold-change (log2(mean1 / mean2))')

show(p1)

Top markers per group

filter(top_markers, log2FC < 0) %>% DT::datatable(rownames = FALSE, options = list(paging = FALSE, searching = FALSE), caption = 'Higher in CD8') %>% DT::formatRound(2:7, digits = 2)
filter(top_markers, log2FC > 0) %>% DT::datatable(rownames = FALSE, options = list(paging = FALSE, searching = FALSE), caption = 'Higher in CD4') %>% DT::formatRound(2:7, digits = 2)

Example 5: Using the test with Seurat

The functionality outlined in the examples above can also be applied to Seurat objects. We demonstrate this with an example below where we perform unsupervised clustering followed by cluster marker identification (as in example 2).

library('Seurat')
s <- CreateSeuratObject(counts = counts)
# run sctransform
s <- SCTransform(s, verbose = FALSE, method = 'qpoisson')
# 'standard' workflow
s <- RunPCA(s, verbose = FALSE)
s <- RunUMAP(s, dims = 1:30, verbose = FALSE)
s <- FindNeighbors(s, dims = 1:30, verbose = FALSE)
s <- FindClusters(s, verbose = FALSE)
DimPlot(s, label = TRUE) + NoLegend() + ggtitle('Unsupervised clustering')

The cells have been grouped into r length(levels(s$seurat_clusters)) clusters. We are now going to use sctransform::diff_mean_test to find the top cluster markers. For this we access the normalized counts that are in the SCT assay and the cluster labels.

bm <- bench::mark(
  de_res <- diff_mean_test(y = GetAssayData(s, assay = 'SCT', slot = 'counts'), 
                           group_labels = s$seurat_clusters, 
                           R = 49, 
                           only_pos = TRUE, 
                           only_top_n = 222,
                           verbosity = 0)
)
knitr::kable(data.frame(bm[, 5:9]), caption = "Benchmarking details")
top_markers <- group_by(de_res, group1) %>% 
  filter(rank(-zscore, ties.method = "first") <= 4 | rank(-log2FC, ties.method = "first") <= 4) %>% select(group1, gene, mean1, mean2, log2FC, zscore, emp_pval_adj)

p1 <- ggplot(de_res, aes(pmin(log2FC, 10), pmin(log10(zscore), 4))) + 
  geom_point(aes(color = emp_pval_adj < 0.05)) + 
  geom_point(data = top_markers, color = 'deeppink') +
  geom_text_repel(data = top_markers, mapping = aes(label = gene)) +
  facet_wrap(~ group1, ncol = 4) + 
  theme(legend.position = 'bottom')
show(p1)

Table of top markers per cluster

DT::datatable(top_markers, rownames = FALSE) %>% DT::formatRound(3:7, digits = 2)

Seurat heatmap

marker_genes <- group_by(de_res, group1) %>% 
  filter(rank(-log2FC, ties.method = "first") <= 4) %>% 
  pull(gene)
# make sure all markers are in the scale.data slot (by default only the highly variable genes are there)
s <- GetResidual(s, features = marker_genes, verbose = FALSE)
DoHeatmap(s, features = marker_genes, slot = 'scale.data')

Example 6: Conserved markers after integration

Here we show how to use the test to identify the conserved (or consistent) cluster markers after multi-sample integration. This is different from the examples above because additionally to a cluster label the cells also have a sample label and we run the tests per sample and combine the results.

First, follow the Seurat integration vignette to obtain an integrated object.

library('Seurat')
library('SeuratData')
LoadData("ifnb")
ifnb.list <- SplitObject(ifnb, split.by = "stim")
ifnb.list <- lapply(X = ifnb.list, FUN = SCTransform, method = 'qpoisson', verbose = FALSE)
features <- SelectIntegrationFeatures(object.list = ifnb.list, nfeatures = 3000,
                                      verbose = FALSE)
ifnb.list <- PrepSCTIntegration(object.list = ifnb.list, anchor.features = features,
                                verbose = FALSE)
immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list,
                                         normalization.method = "SCT",
                                         anchor.features = features,
                                         verbose = FALSE)
immune.combined.sct <- IntegrateData(anchorset = immune.anchors,
                                     normalization.method = "SCT",
                                     verbose = FALSE)

Note that in this example both samples have been sequenced to similar depth. When working with data where there is a stark discrepancy between sequencing depth, one should make sure that the corrected counts used for differential expression testing were generated using the same 'target' depth. Since this is not supported by Seurat::SCTransform it requires running sctransform::correct() with identical cell meta data for all samples and setting the as_is parameter to TRUE. However, in this example we can proceed without any extra steps.

In an unsupervised analysis one would now perform dimensionality reduction and clustering. To keep it simple, we skip these steps and directly use the cell type labels that are provided with the original seurat object.

Now we identify cluster marker genes (each vs rest) that are conserved, i.e. consistent across conditions.

bm <- bench::mark(
  de_res <- diff_mean_test_conserved(
    y = GetAssayData(immune.combined.sct, assay = 'SCT', slot = 'counts'), 
    group_labels = immune.combined.sct$seurat_annotations, 
    sample_labels = immune.combined.sct$stim, 
    only_pos = TRUE, 
    verbosity = 0)
)
knitr::kable(data.frame(bm[, 5:9]), caption = "Benchmarking details")

The function we used calls diff_mean_test repeatedly (once per sample given this balanced design) and aggregates the results per group and gene. The output table has the following columns

The output is ordered by group1, -de_tests, -abs(log2FC_median), pval_max

Show the top marker genes

top_markers <- filter(de_res, de_tests == 2, p.adjust(pval_max) <= 1e-3) %>%
  group_by(group1) %>% 
  filter(rank(-log2FC_median, ties.method = "first") <= 6)
DT::datatable(top_markers, rownames = FALSE, caption = 'Conserved cell type specific markers') %>% 
  DT::formatRound(5:9, digits = 2) %>% DT::formatSignif(10)

Show a heatmap of the marker genes. The mean expression per gene is scaled separately for each sample (CTRL and STIM) but they are shown interleaved to highlight that the differential expression is conserved across samples.

celltypes <- immune.combined.sct$seurat_annotations
samples <- immune.combined.sct$stim
immune.combined.sct$type_by_stim <- factor(x = paste(celltypes, samples, sep = ' '),
                                           levels = paste(rep(levels(top_markers$group1), each = length(unique(samples))), unique(samples)))
mat <- sctransform:::row_gmean_grouped_dgcmatrix(
  matrix = GetAssayData(immune.combined.sct, assay = 'SCT', slot = 'counts')[top_markers$gene, ], 
  group = factor(immune.combined.sct$type_by_stim), 
  eps = 1, 
  shuffle = FALSE)
# scale samples separately
sel <- grepl(' CTRL$', colnames(mat))
mat[, sel] <- t(scale(t(mat[, sel])))
sel <- grepl(' STIM$', colnames(mat))
mat[, sel] <- t(scale(t(mat[, sel])))
mat <- mat[order(apply(mat, 1, which.max)), ]

p1 <- melt(mat, varnames = c('gene', 'celltype')) %>% 
  mutate(celltype = factor(celltype, levels = rev(colnames(mat)))) %>%
  ggplot(aes(gene, celltype, fill = value)) +
  geom_tile(colour = "gray66") +
  scale_fill_gradient2(low = 'white', mid = 'white', high = 'black', 
                       name = 'Geometric mean of each gene per cell type and sample, scaled per gene and sample') +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(x = NULL, y = NULL) +
  scale_x_discrete(expand = c(0, 0)) + 
  scale_y_discrete(expand = c(0, 0)) +
  theme(legend.position = 'top', axis.ticks = element_blank()) 
show(p1)

To identify differences between stimulated and control conditions within a celltype, use the test as shown in the examples above. For example, below we look for genes differentially expressed between conditions within B cells.

de_res <- diff_mean_test(
    y = GetAssayData(immune.combined.sct, assay = 'SCT', slot = 'counts'), 
    group_labels = immune.combined.sct$type_by_stim, 
    compare = c('B CTRL', 'B STIM'),
    verbosity = 0)
top_markers1 <- arrange(de_res, zscore) %>% head(10)
top_markers2 <- arrange(de_res, -zscore) %>% head(10)
rbind(top_markers1, top_markers2) %>% select(gene, mean1, mean2, log2FC, pval_adj) %>%
  DT::datatable(rownames = FALSE, caption = 'Top 20 B cell DE genes between control (group 1) and stimulated (group 2)') %>% 
  DT::formatRound(2:4, digits = 2) %>% DT::formatSignif(5)

Session info and runtime

Session info

sessionInfo()

Runtime

print(proc.time() - tic)

References



ChristophH/sctransform documentation built on Oct. 22, 2023, 3:28 p.m.