In VPetukhov/dropEstAnalysis: Analysis to produce figures from dropEst paper

knitr::read_chunk("../../analysis/chunks.R")

library(ggplot2)
library(ggrastr)
library(ggpubr)
library(dplyr)
library(parallel)
library(Seurat)
library(dropestr)
library(dropEstAnalysis)

theme_set(theme_base)

set.seed(42)
kOutputFolder <- '../../output/'
kDataPath <- '../../data/'
kEstDataPath <- paste0(kDataPath, 'dropest/10x/frozen_bmmc_healthy_donor1/')
kAnnotationDataPath <- paste0(kDataPath, 'annotation/')
kEstFolder <- paste0(kEstDataPath, 'est_11_10_umi_quality/')
k10xFolder <- paste0(kEstDataPath, 'filtered_matrices_mex/hg19/')

Read data

Link to the original dataset.

holder <- readRDS(paste0(kEstFolder, 'bmmc_no_umi.rds'))

cm_10x <- Read10xMatrix(k10xFolder)
cm_10x <- cm_10x[, order(Matrix::colSums(cm_10x), decreasing=F)]

umis_per_cell <- sort(Matrix::colSums(holder$cm_raw), decreasing=T)
est_cell_num <- EstimateCellsNumber(umis_per_cell)
drop_est_cbs <- names(umis_per_cell)[1:est_cell_num$expected]

Here we compare threshold selection, so we can set quality score threshold to 0.5.

intersect_cbs <- intersect(colnames(cm_10x), drop_est_cbs)
rescued_cbs <- setdiff(drop_est_cbs, colnames(cm_10x))

c(Unchanged=length(intersect_cbs), Rescued=length(rescued_cbs))

r_cm_rescued <- holder$cm_raw[, c(drop_est_cbs, colnames(cm_10x)) %>% unique()]
r_cm_rescued <- r_cm_rescued[grep("^[^;]+$", rownames(r_cm_rescued)),]

if (!all(colnames(cm_10x) %in% colnames(r_cm_rescued)))
  stop("All 10x cells must be presented")

Rescued cells

r_rescued <- GetPagoda(r_cm_rescued, n.cores=30)

# You need to run "annotation/annotation_bmmc1.Rmd" first
clusters_annotated <- paste0(kAnnotationDataPath, 'bmmc1_clusters_annotated.csv') %>%
  read.csv() %>% (function(x) setNames(as.character(x$Type), x$Barcode))

notannotated_cells <- setdiff(colnames(r_cm_rescued), names(clusters_annotated))
clusters_annotated_resc <- AnnotateClustersByGraph(r_rescued$graphs$PCA,
                                                   clusters_annotated,
                                                   notannotated_cells, mc.cores=10)

rescued_clusters <- clusters_annotated_resc[rescued_cbs]
intersect_clusters <- clusters_annotated[intersect_cbs]

unchanged_clusters <- names(clusters_annotated) %>% setdiff(rescued_cbs)

long_type_names <- c("CD14+ Monocytes", "Non-dividing Pro B cells", "Monocyte progenitors", 
                     "Epithelial cells", "Cytotoxic T cells", "Immature B cells", 
                     "Dendritic cells", "Pre-pro B cells")

plot_clusters <- clusters_annotated[unchanged_clusters]
plot_rescued_clusters <- rescued_clusters
for (type in long_type_names) {
  plot_clusters[plot_clusters == type] <- sub(" ", "\n", type)
  plot_rescued_clusters[plot_rescued_clusters == type] <- sub(" ", "\n", type)
}

gg_tsne <- PlotFiltrationResults(r_rescued, plot_clusters, filtered.cbs=NULL,
                                 rescued.clusters=plot_rescued_clusters,
                                 raster.width=4.28, raster.height=4.16,
                                 rescued.alpha=0.5, rescued.size=1.5, lineheight=0.8) +
  theme_pdf(legend.pos=c(0, 1), show.ticks=F)

gg_tsne

Number of rescued cells per cluster

rescued_table <- TableOfRescuedCells(clusters_annotated_resc[c(intersect_cbs, rescued_cbs)], 
                                     rescued_cbs)
write.csv(rescued_table, paste0(kOutputFolder, "tables/rescued_cbc_bmmc1.csv"), row.names=F)
rescued_table

Comparison of unchanged and rescued cells

Seurat

Parameters are the same as in the demonstration.

presented_cbs <- intersect(colnames(r_cm_rescued), names(clusters_annotated))
seurat_cm <- r_cm_rescued[, presented_cbs]
seurat_cm <- seurat_cm[Matrix::rowSums(seurat_cm) > 200, ]

srt <- CreateSeuratObject(raw.data = seurat_cm, project = "bmmc1", display.progress=F)
srt <- NormalizeData(object = srt, normalization.method = "LogNormalize",
                     scale.factor = 10000, display.progress=F)
srt <- FindVariableGenes(object = srt, mean.function = ExpMean,
                         dispersion.function = LogVMR, x.low.cutoff = 0.0125,
                         x.high.cutoff = 3, y.cutoff = 1, do.plot=F, display.progress=F)
srt <- ScaleData(object = srt, vars.to.regress = "nUMI", display.progress=F)

Find genes to compare cell types:

srt@ident <- as.factor(clusters_annotated[colnames(srt@raw.data)])
names(srt@ident) <- colnames(srt@raw.data)
compared_clusters <- unique(srt@ident) %>% as.character()
cluster_markers <- mclapply(compared_clusters, function(i)
  mclapply(setdiff(compared_clusters, i), FindClusterMarkers, i, srt, mc.cores=4),
  mc.cores=11)

de_genes <- GetOverexpressedGenes(srt, compared_clusters, cluster_markers)

r length(de_genes) differentially expressed genes found.

Plots

10x CellRanger used wrong threshold:

scores <- ScorePipelineCells(holder, mit.chromosome.name='MT', 
                             predict.all=T)[names(umis_per_cell)]

smoothScatter(scores[1:6000], bandwidth=c(60, 0.015), xlab='Cell rank',
              ylab='Quality score')
abline(v=ncol(cm_10x), col='#bc2727', lty=2, lw=2.5)
abline(v=est_cell_num$expected, col='#0a6607', lty=2, lw=2.5)
arrows(x0=c(1000, 4000), y0=c(0.6, 0.7), 
       x1=c(ncol(cm_10x) - 100, est_cell_num$expected + 100), y1=c(0.42, 0.52),
       lw=2)
text(x=c(900, 4100), y=c(0.67, 0.75),
     labels=c("10x threshold", "dropEst threshold"), cex=1.3)

tested_clusts <- clusters_annotated[presented_cbs]

separation <- c(setNames(rep('rescued', length(rescued_cbs)), rescued_cbs),
                setNames(rep('real', length(intersect_cbs)), intersect_cbs))

umis_per_cb_subset <- log10(Matrix::colSums(r_cm_rescued[, names(tested_clusts)]))
tested_clusts <- tested_clusts[order(tested_clusts, -umis_per_cb_subset)]

Prepare heatmaps:

plot_df <- ExpressionMatrixToDataFrame(r_rescued$counts[names(tested_clusts), de_genes],
                                       umis_per_cb_subset, tested_clusts,
                                       filtration.type=separation)

plot_df$Cluster <- as.character(plot_df$Cluster)
plot_df$Cluster[plot_df$Cluster == "Non-dividing Pro B cells"] <- "Non-dividing\nPro B cells"

plot_df <- plot_df %>% filter(UmisPerCb < 3.4)
plot_dfs <- split(plot_df, plot_df$FiltrationType)

ggs <- lapply(plot_dfs, HeatmapAnnotGG, umi.per.cell.limits=range(plot_df$UmisPerCb),
              raster.width=3, raster.height=3, raster.dpi=100)

legend_guides <- list(HeatmapLegendGuide('Expression'),
                      HeatmapLegendGuide('Cell type', guide=guide_legend, ncol=3),
                      HeatmapLegendGuide('log10(#molecules)'))
gg_legends <- mapply(`+`, ggs$real, legend_guides, SIMPLIFY=F) %>%
  lapply(`+`, theme(legend.margin=margin(l=4, r=4, unit='pt'))) %>% lapply(get_legend)

ggs$real$heatmap <- ggs$real$heatmap + rremove('xlab') + ylab('Cells')
ggs$rescued$heatmap <- ggs$rescued$heatmap + labs(x = 'Genes', y = 'Cells')
ggs_annot <- lapply(ggs, function(gg) cowplot::plot_grid(
  plotlist=lapply(gg, `+`, theme(legend.position="none", plot.margin=margin())),
  nrow=1, rel_widths=c(1.5, 0.1, 0.1), align='h'))

gg_legends_plot <- cowplot::plot_grid(plotlist=gg_legends, nrow=3, align='v')

Compile plot parts:

gg_left <- cowplot::plot_grid(ggs_annot$real, ggs_annot$rescued, nrow=2,
                              labels=c('B', 'C'))
gg_right <- gg_tsne + theme(plot.margin=margin(l=0.1, unit='in'),
                            axis.text=element_blank(), axis.ticks=element_blank())
gg_bottom <- cowplot::plot_grid(plotlist=gg_legends[c(1, 3, 2)], ncol=3,
                                rel_widths=c(1, 1, 2.6))

gg_filtration <- cowplot::plot_grid(gg_left, gg_right, labels=c('', 'D'), ncol=2) %>%
  cowplot::plot_grid(gg_bottom, nrow=2, rel_heights=c(1, 0.27), align='v')

coords <- list(optimal=list(x_l=est_cell_num$expected,
                            x_t=4000, y_l=3.2e5, y_t=5e5),
               `10x`=list(x_l=ncol(cm_10x),
                          x_t=1300, y_l=2.5e5, y_t=4e5))

gg_repel <- function(coords, label) {
  coords$label <- label
  gg <- ggrepel::geom_label_repel(
    data=as.data.frame(coords),
    mapping=aes(x=x_l, y=y_l, label=label), nudge_x=coords$x_t - coords$x_l,
    nudge_y=coords$y_t - coords$y_l, size=5.5, segment.size=0.7, force=0,
    arrow=ggplot2::arrow(length = unit(0.03, 'npc')), fill=alpha("white", 0.7)
  )
  return(gg)
}

gg_cell_number <- PlotCellsNumberLine(Matrix::colSums(holder$cm_raw)) +
  geom_vline(aes(xintercept=coords$`10x`$x_l), linetype='dashed',
             color='#bc2727', size=1) +
  geom_vline(aes(xintercept=coords$optimal$x_l), linetype='dashed',
             color='#0a6607', size=1) +
  gg_repel(coords$`10x`, label="10x threshold") +
  gg_repel(coords$optimal, label="dropEst threshold") +
  scale_x_continuous(limits=c(0, 5750), expand=c(0, 0)) +
  theme_pdf() +
  theme(axis.ticks=element_blank(), axis.text.y=element_blank(),
        panel.grid.major.y=element_blank(), panel.grid.minor.y=element_blank())

Final figure

gg_fig <- cowplot::plot_grid(gg_cell_number + theme(plot.margin=margin(b=0.1, unit="in")),
                             gg_filtration, nrow=2, rel_heights=c(1.2, 3),
                             labels=c('A', '')) +
  theme(plot.margin=margin(1, 1, 1, 1))

ggsave(paste0(kOutputFolder, 'figures/fig_bmmc_filtration.pdf'), gg_fig, width=7.5, height=7)

gg_fig