library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
library('knitr')
knit_hooks$set(optipng = hook_optipng)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  digits = 2,
  tidy = TRUE,
  tidy.opts = list(width.cutoff=80),
  optipng = '-o 5 -strip all -quiet',
  fig.width=5, fig.height=3, dpi=100, out.width = '70%'
)
old_theme <- theme_set(theme_classic(base_size=8))
# some of the vst steps can use multiple cores
# We use the Future API for parallel processing; set parameters here
future::plan(strategy = 'multicore', workers = 3)
options(future.globals.maxSize = 8 * 1024 ^ 3)
options(future.fork.enable = TRUE)

In this vignette we show how the regression model in the variance stabilizing transformation can also be used to mitigate batch effects. We use data from Shekhar et al., Cell 2016 to demonstrate the functionality. The original analysis and data are available here.

We first load the data and transform without using the batch information.

# load data and cluster identities, and calculate some cell attributes
cm <- readRDS(file = '~/Projects/data/BipolarCell2016_GSE81904_sparse_matrix.Rds')
cell_attr <- read.table(file = '~/Projects/data/BipolarCell2016_GSE81904_ClustAssignFile.txt', header = TRUE, stringsAsFactors = FALSE, row.names = 1)
cell_attr <- cell_attr[colnames(cm), ]
cell_attr$CLUSTER <- factor(cell_attr$CLUSTER)
cell_attr$replicate <- as.integer(sapply(strsplit(rownames(cell_attr), '_'), function(x) gsub('Bipolar', '', x[1])))
cell_attr$batch <- factor(as.integer(cell_attr$replicate %in% c(5, 6)) + 1)
cell_attr$gene <- Matrix::colSums(cm > 0)
cell_attr$umi <- as.integer(Matrix::colSums(cm))
cell_attr$log_umi <- log10(cell_attr$umi)
cell_attr$umi_per_gene <- cell_attr$umi / cell_attr$gene
cell_attr$log_umi_per_gene <- log10(cell_attr$umi_per_gene)

# remove 50 weird 'cells' 
weird <- cell_attr$log_umi_per_gene < 0.05 | cell_attr$log_umi_per_gene > 0.35
cm <- cm[, !weird]
cell_attr <- cell_attr[!weird, ]

# remove genes as in the paper; goes from 24904 down to 13181
bad_genes <- Matrix::rowSums(cm > 0) < 30 | Matrix::rowSums(cm) < 60
cm <- cm[!bad_genes, ]

# use fewer cells to speed up vignette compilation
set.seed(42)
keep <- sample(1:ncol(cm), 10000)
cm <- cm[, keep]
cell_attr <- cell_attr[keep, ]

# apply vst
set.seed(42)
vst_out <- vst(cm, cell_attr = cell_attr, latent_var = 'log_umi_per_gene', verbosity = 1)

In order to get an overview of the data, we perform dimensionality reduction (PCA) and keep the top 35 dimensions. For 2D plotting, we run t-SNE and display the batch label and the original clustering.

pca <- irlba::prcomp_irlba(t(vst_out$y), n = 35)
tsne <- Rtsne::Rtsne(scale(pca$x), pca = FALSE)
cell_attr$tSNE1 <- tsne$Y[, 1]
cell_attr$tSNE2 <- tsne$Y[, 2]
ggplot(cell_attr, aes(tSNE1, tSNE2, color = batch)) + geom_point(alpha=0.5, shape=16) + 
  theme(legend.position="bottom")
ggplot(cell_attr, aes(tSNE1, tSNE2, color = factor(CLUSTER))) + geom_point(alpha=0.5, shape=16) +
  theme(legend.position="bottom")

We can see a clear separation between batches, while the main source of variability in the data seems to come from the heterogeneity between cell types.

We will now repeat the analysis but this time specify a batch variable when calling vst. This changes the regression model specification, adding batch indicator variable and its interaction with the regression coefficients. This effectively fits the regression model per batch, but keeps a single theta variable (overdispersion parameter) per gene. Parameter regularization is performed per batch.

set.seed(42)
vst_out2 <- vst(cm, cell_attr = cell_attr, latent_var = 'log_umi_per_gene', batch_var = 'batch', verbosity = 1)

# dimensionality reduction via PCA
pca2 <- irlba::prcomp_irlba(t(vst_out2$y), n = 35)

tsne2 <- Rtsne::Rtsne(scale(pca2$x), pca = FALSE)
cell_attr$tSNE1 <- tsne2$Y[, 1]
cell_attr$tSNE2 <- tsne2$Y[, 2]
ggplot(cell_attr, aes(tSNE1, tSNE2, color = batch)) + geom_point(alpha=0.5, shape=16) + 
  theme(legend.position="bottom")
ggplot(cell_attr, aes(tSNE1, tSNE2, color = factor(CLUSTER))) + geom_point(alpha=0.5, shape=16) +
  theme(legend.position="bottom")

After including the batch variable in the model the data separates by cluster only. To demonstrate the difference between the models before and after including batch, we plot the observed UMI counts and the model fit for the gene mt-Rnr2 which exhibits a large batch effect. The left plot shows the first model, ignoring the batch label, while the right plot shows the second model compensating for differences between batches. Cells are ordered first by batch, then by number of UMI (decreasing). The pink line shows the expected mt-Rnr2 UMI count given the model, the shaded region shows the model uncertainty as one standard deviation above and below the expected value.

# plot expression and model for one gene as example
cell_attr$idx <- NA
cell_attr$idx[order(cell_attr$batch, -cell_attr$log_umi_per_gene)] <- 1:ncol(cm)
goi <- 'mt-Rnr2'
plot_model(vst_out, cm, goi, x_var = 'idx', cell_attr = cell_attr,
           batches = cell_attr$batch, plot_residual = TRUE, show_density = FALSE)
plot_model(vst_out2, cm, goi, x_var = 'idx', cell_attr = cell_attr,
           batches = cell_attr$batch, plot_residual = TRUE, show_density = FALSE)


ChristophH/sctransform documentation built on Oct. 22, 2023, 3:28 p.m.