knitr::read_chunk("../../analysis/chunks.R")



library(cowplot)
library(ggplot2)
library(ggsci)
library(ggpubr)
library(ggrastr)
library(ggrepel)
library(dplyr)
library(parallel)
library(Seurat)
library(dropestr)
library(dropEstAnalysis)

theme_set(theme_base)

set.seed(42)
kOutputFolder <- '../../output/'
kPlotsFolder <- paste0(kOutputFolder, 'figures/')
kDataPath <- '../../data/dropest/10x/pbmc8k/'
kEstFolder <- paste0(kDataPath, 'est_11_11_umi_quality/')
k10xFolder <- paste0(kDataPath, 'filtered_gene_bc_matrices/GRCh38/')
# holder <- readRDS(paste0(kEstFolder, 'pbmc8k.rds'))
# holder$reads_per_umi_per_cell <- NULL
# saveRDS(holder, paste0(kEstFolder, 'pbmc8k_no_umi.rds'))

holder <- readRDS(paste0(kEstFolder, 'pbmc8k_no_umi.rds'))
cm_10x <- Read10xMatrix(k10xFolder, use.gene.names=T)
cm_10x <- cm_10x[, order(Matrix::colSums(cm_10x), decreasing=F)]
umis_per_cell <- sort(Matrix::colSums(holder$cm_raw), decreasing=T)
est_cell_num <- EstimateCellsNumber(umis_per_cell)

Initial labeling

PlotCellsNumberLogLog(umis_per_cell, estimate.cells.number=T) + 
  theme_pdf(legend.pos=c(1, 1))

Quality scores

scores <- ScorePipelineCells(holder, mit.chromosome.name='MT', predict.all=T, 
                             verbose=T)[names(umis_per_cell)]
PlotCellScores(scores[1:20000], cells.number=est_cell_num)
pdf(paste0(kPlotsFolder, 'lq_pbmc8k_scores.pdf'), width=7.5, height=3.5)
par(mar=c(2.7,2.5,0,0.3), tck=-0.02, mgp=c(1.4,0.3,0))
PlotCellScores(scores[1:20000], cells.number=est_cell_num)
invisible(dev.off())

dropEst threshold

intersect_cbs <- names(scores[1:est_cell_num$expected])
intersect_cbs <- intersect_cbs[scores[intersect_cbs] > 0.9]

unknown_cell_scores <- scores[(est_cell_num$expected + 1):length(scores)]
rescued_cbs <- names(unknown_cell_scores)[unknown_cell_scores > 0.5]

unknown_cell_scores <- scores[1:est_cell_num$expected]
filtered_cbs <- names(unknown_cell_scores)[unknown_cell_scores < 0.1]

c(Uncahnged=length(intersect_cbs), Rescued=length(rescued_cbs), 
  Filtered=length(filtered_cbs))
r_cm_rescued <- holder$cm_raw[, c(names(umis_per_cell)[1:est_cell_num$expected], 
                                  rescued_cbs)]
r_cm_rescued <- r_cm_rescued[grep("^[^;]+$", rownames(r_cm_rescued)),]

if (!all(colnames(cm_10x) %in% colnames(r_cm_rescued))) 
  stop("All 10x cells must be presented")
# r_rescued <- GetPagoda(r_cm_rescued, n.cores=30, tsne.iters.num=3000)
# r_rescued$getEmbedding(type='PCA', perplexity=50, embeddingType = 'tSNE', max_iter=3000)
# saveRDS(r_rescued, paste0(kEstFolder, 'pagoda.rds'))
r_rescued <- readRDS(paste0(kEstFolder, 'pagoda.rds'))
clusters_annotated <- read.csv(paste0(kEstFolder, 'clusters_annotated.csv')) %>% 
  (function(x) setNames(as.character(x$Type), x$Barcode))
clusters <- r_rescued$clusters$PCA$infomap

notannotated_cells <- setdiff(names(clusters), names(clusters_annotated))
clusters_annotated_resc <- AnnotateClustersByGraph(r_rescued$graphs$PCA, clusters_annotated, 
                                                   notannotated_cells, mc.cores=10)
rescued_clusters <- clusters_annotated_resc[rescued_cbs]
tsne <- r_rescued$embeddings$PCA$tSNE
intergenic_inf <- names(clusters)[clusters == 11] %>% intersect(filtered_cbs) %>%
  GetCellsChull(tsne, offset.y=-0.5)
mitochondrial_inf <- names(clusters)[clusters == 16] %>% intersect(filtered_cbs) %>%
  GetCellsChull(tsne)
low_rpu_inf <- names(clusters)[clusters == 7] %>% intersect(filtered_cbs) %>%
  GetCellsChull(tsne, chull.quantile=0.8)

chull_list <- list(intergenic_inf, mitochondrial_inf, low_rpu_inf) %>% 
  lapply(`[[`, 'chull') %>% lapply(as.data.frame) %>% 
  setNames(c('Intergenic', 'Mitochondrial', 'Low expression'))

label_colors <- c('#c63737', '#1989c6', '#d3d631') %>% setNames(names(chull_list))

labels_df <- Reduce(rbind,  lapply(chull_list, colMeans)) %>% as.data.frame() %>% 
  mutate(Text=paste0(names(chull_list), ' (', c('C', 'B', 'D'), ')'), Color=label_colors) %>%
  mutate(OffsetX=c(4, -4, -2), OffsetY=c(8, 3, -6))
gg_labs <- labs(x='tSNE-1', y='tSNE-2')
gg_all <- PlotPagodaEmbeding(r_rescued, clusters=r_rescued$clusters$PCA$infomap, 
                             mark.clusters=F, font.size=NULL, raster=T) +
  theme_pdf(show.ticks=F) + gg_labs
plot_clusters <- setdiff(names(clusters_annotated), c(filtered_cbs, rescued_cbs))
gg_base <- PlotFiltrationResults(r_rescued, r_rescued$clusters$PCA$infomap[plot_clusters],
                            filtered.cbs=filtered_cbs, raster.width=5, raster.height=5,
                            mark.clusters=F) +
   theme_pdf(legend.pos=c(1, 1), show.ticks=F) + gg_labs

gg <- Reduce(`+`, lapply(names(chull_list), function(n) 
  geom_polygon(data=chull_list[[n]], mapping=aes(x=V1, y=V2), alpha=0.3, 
               color=label_colors[n])), init=gg_base)

gg_repels <- lapply(1:3, function(i) 
  geom_label_repel(data=labels_df[i,], mapping=aes(x=V1, y=V2, label=Text), 
                   fill=alpha(labels_df$Color[i], 0.5), nudge_x=labels_df$OffsetX[i], 
                   nudge_y=labels_df$OffsetY[i], segment.size=1, size=3, point.padding=0,
                   arrow=arrow(length = unit(0.02, 'npc')), color='black', 
                   segment.color=labels_df$Color[i], fontface='bold'))

gg_filt <- Reduce(`+`, gg_repels, init=gg)

t-SNE with noise

plot_clusters <- setdiff(names(clusters_annotated), c(filtered_cbs, rescued_cbs))
gg <- PlotFiltrationResults(r_rescued, clusters_annotated[plot_clusters],
                            filtered.cbs=filtered_cbs, rescued.clusters=rescued_clusters, 
                            raster.width=4.28, raster.height=6)

gg <- Reduce(`+`, lapply(names(chull_list), function(n) 
  geom_polygon(data=chull_list[[n]], mapping=aes(x=V1, y=V2), alpha=0.3, 
               color=label_colors[n])), init=gg)

gg_repels <- lapply(1:3, function(i) 
  geom_label_repel(data=labels_df[i,], mapping=aes(x=V1, y=V2, label=Text), 
                   fill=alpha(labels_df$Color[i], 0.5), nudge_x=labels_df$OffsetX[i], 
                   nudge_y=labels_df$OffsetY[i], segment.size=1, size=3, point.padding=0,
                   arrow=arrow(length = unit(0.02, 'npc')), color='black', 
                   segment.color=labels_df$Color[i], fontface='bold'))

gg <- Reduce(`+`, gg_repels, init=gg)
gg_tsne_noise <- gg + theme_pdf(legend.pos=c(1, 1), show.ticks=F)
gg_tsne_noise

Error sources

bc_data <- PrepareLqCellsDataPipeline(holder, mit.chromosome.name='MT', scale=F)
real_cell_color <- '#0faf00'

gg <- function(y.max, filt.color) {
  ggplot() + scale_y_continuous(expand=c(0, 0), name='Fraction of cells', 
                                limits=c(0, y.max)) + 
    guides(fill=guide_legend(title=NULL)) + theme_pdf(legend.pos=c(1, 1)) +
    scale_fill_manual(values=as.vector(c(filt.color, real_cell_color)))
}

gg1 <- gg(0.9, label_colors['Mitochondrial']) + 
  geom_histogram(aes(x=bc_data[mitochondrial_inf$cbs, ]$MitochondrionFraction, 
                     y=..count.. / sum(..count..), fill = 'Mitochondrial'), 
                 color='black') +
  geom_histogram(aes(x=bc_data[intersect_cbs, ]$MitochondrionFraction, 
                     y=..count.. / sum(..count..), fill='Real'), alpha=0.5, 
                 color='black') +
  scale_x_continuous(limits=c(0, 1.01), expand=c(0, 0), 
                     name='Fraction of mitochondrial reads')

gg2 <- gg(0.51, label_colors['Intergenic']) + 
  geom_histogram(aes(x=bc_data[intergenic_inf$cbs, ]$IntergenicFrac, 
                     y=..count.. / sum(..count..), fill = 'Intergenic'), 
                 color='black') +
  geom_histogram(aes(x=bc_data[intersect_cbs, ]$IntergenicFrac, 
                     y=..count.. / sum(..count..), fill='Real'), 
                 alpha=0.5, color='black') +
  scale_x_continuous(limits=c(0, 1.01), expand=c(0, 0), 
                     name='Fraction of intergenic reads')

gg3 <- gg(0.6, label_colors['Low expression']) + 
  geom_histogram(aes(x=bc_data[low_rpu_inf$cbs, ]$UmiPerGene, 
                     y=..count.. / sum(..count..), fill = 'Low expression'), 
                 color='black') +
  geom_histogram(aes(x=bc_data[intersect_cbs, ]$UmiPerGene, 
                     y=..count.. / sum(..count..), fill='Real'), alpha=0.5, 
                 color='black') +
  scale_x_continuous(limits=c(1, 5.5), expand=c(0, 0), name='Mean #UMIs per gene')

gg_distributions <- plot_grid(gg1, gg2, gg3, nrow=3, align='v', labels=c('B', 'C', 'D'), 
                              label_x=0.025)
gg_distributions

Final figure

gg_noise_fig <- plot_grid(gg_tsne_noise, gg_distributions, ncol=2, rel_widths=c(1, 0.75), 
          labels=c('A', NULL))
gg_noise_fig
ggsave(paste0(kPlotsFolder, 'fig_pbmc8k_lq_cells.pdf'), gg_noise_fig, width=7.5, height=6)

Comparison with 10x

cbs_10x <- colnames(cm_10x)
rescued_cbs <- names(scores)[scores > 0.9] %>% setdiff(cbs_10x)
filtered_cbs <- names(scores)[scores < 0.1] %>% intersect(cbs_10x)
intersect_cbs <- names(scores)[scores > 0.9] %>% intersect(cbs_10x)

rescued_clusters <- clusters_annotated_resc[rescued_cbs]

c(Unchanged=length(cbs_10x), Rescued=length(rescued_cbs), 
  Filtered=length(filtered_cbs))
plot_clusters <- setdiff(cbs_10x, filtered_cbs)
gg_tsne <- PlotFiltrationResults(r_rescued, clusters=clusters_annotated[plot_clusters],
                                 filtered.cbs=filtered_cbs, rescued.clusters=rescued_clusters, 
                                 raster.width=4.28, raster.height=5) +
  theme_pdf(legend.pos=c(0, 1), show.ticks=F)

gg_tsne
tested_clusts <- sort(c(clusters_annotated[intersect_cbs], rescued_clusters))
tested_clusts <- as.factor(tested_clusts) %>% setNames(names(tested_clusts))
large_rescued <- which(table(clusters_annotated[rescued_cbs]) >= 10) %>% names()
tested_clusts <- tested_clusts[tested_clusts %in% large_rescued]

Number of rescued cells per cluster

rescued_table <- TableOfRescuedCells(clusters_annotated_resc[c(intersect_cbs, rescued_cbs)], 
                                     rescued_cbs)
write.csv(rescued_table, paste0(kOutputFolder, "tables/rescued_cbc_pbmc8k.csv"), row.names=F)
rescued_table

Seurat

seurat_clusters <- tested_clusts
seurat_cm <- r_cm_rescued[, names(seurat_clusters)]
seurat_cm <- seurat_cm[Matrix::rowSums(seurat_cm) > 200, ]

srt <- CreateSeuratObject(raw.data = seurat_cm, project = "pbmc8k", display.progress=F)
srt <- NormalizeData(object = srt, normalization.method = "LogNormalize",
                     scale.factor = 10000, display.progress=F)
srt <- FindVariableGenes(object = srt, mean.function = ExpMean,
                         dispersion.function = LogVMR, x.low.cutoff = 0.0125,
                         x.high.cutoff = 3, y.cutoff = 1, do.plot=F, display.progress=F)
srt <- ScaleData(object = srt, vars.to.regress = "nUMI", display.progress=F)
save(srt, file="../../output/pbmc8k_srt.Rda")
# load("../../output/pbmc8k_srt.Rda")
srt@ident <- as.factor(seurat_clusters[colnames(srt@raw.data)])
names(srt@ident) <- colnames(srt@raw.data)
compared_clusters <- unique(srt@ident) %>% as.character()
cluster_markers <- mclapply(compared_clusters, function(i) 
  mclapply(setdiff(compared_clusters, i), FindClusterMarkers, i, srt, mc.cores=5), 
  mc.cores=6)

overexpressed_genes <- GetOverexpressedGenes(srt, compared_clusters, cluster_markers, 
                                             expression.threshold=0.4)
length(overexpressed_genes)

Heatmaps

tested_clusts <- seurat_clusters

separation <- c(setNames(rep('rescued', length(rescued_cbs)), rescued_cbs), 
                setNames(rep('real', length(intersect_cbs)), intersect_cbs))

umis_per_cb_subset <- log10(Matrix::colSums(r_cm_rescued[, names(tested_clusts)]))
tested_clusts <- tested_clusts[order(tested_clusts, -umis_per_cb_subset)]

de_genes <- overexpressed_genes
raster_width <-  3
raster_height <- 3
raster_dpi <- 100

plot_df <- ExpressionMatrixToDataFrame(r_rescued$counts[names(tested_clusts), de_genes],
                                       umis_per_cb_subset, tested_clusts,
                                       filtration.type=separation)
plot_df <- plot_df %>% filter(UmisPerCb < 3.4)
plot_dfs <- split(plot_df, plot_df$FiltrationType)

ggs <- lapply(plot_dfs, HeatmapAnnotGG, umi.per.cell.limits=range(plot_df$UmisPerCb),
              raster.width=raster_width, raster.height=raster_height, raster.dpi=raster_dpi)

legend_guides <- list(HeatmapLegendGuide('Expression'),
                      HeatmapLegendGuide('Cell type', guide=guide_legend, ncol=3),
                      HeatmapLegendGuide('log10(#molecules)'))
gg_legends <- mapply(`+`, ggs$real, legend_guides, SIMPLIFY=F) %>%
  lapply(`+`, theme(legend.margin=margin(l=4, r=4, unit='pt'))) %>% lapply(get_legend)

ggs$real$heatmap <- ggs$real$heatmap + rremove('xlab') + ylab('Cells')
ggs$rescued$heatmap <- ggs$rescued$heatmap + labs(x = 'Genes', y = 'Cells')
ggs_annot <- lapply(ggs, function(gg)
  plot_grid(plotlist=lapply(gg, `+`, theme(legend.position="none", plot.margin=margin())),
            nrow=1, rel_widths=c(1.5, 0.1, 0.1), align='h'))

gg_legends_plot <- plot_grid(plotlist=gg_legends, nrow=3, align='v')
gg_left <- plot_grid(ggs_annot$real, ggs_annot$rescued, nrow=2, labels=c('B', 'C'))
gg_right <- gg_tsne + theme(plot.margin=margin(l=0.1, unit='in'), axis.text=element_blank(),
                            axis.ticks=element_blank())
gg_bottom <- plot_grid(plotlist=gg_legends[c(1, 3, 2)], ncol=3, rel_widths=c(1, 1, 2.6))

gg_10x <- plot_grid(gg_left, gg_right, labels=c('', 'D'), ncol=2) %>%
  plot_grid(gg_bottom, nrow=2, rel_heights=c(1, 0.21), align='v')

Supplementary figure

gg_10x
ggsave(paste0(kPlotsFolder, 'supp_comparison_with_10x.pdf'), gg_10x, width=7.5, height=6)

Initial labels and scores

kEmbeddingType <- 'tSNE'
# kEmbeddingType <- 'largeVis'

cbs_full <- names(scores)[1:20000]
r_cm_full <- holder$cm_raw[, cbs_full]
r_full <- GetPagoda(r_cm_full, n.cores=1, embeding.type=kEmbeddingType)
rast_width <- 7.5 / 2
rast_height <- 5
rast_dpi <- 150
initial_labels <- dropestr:::EstimateCellsQuality(Matrix::colSums(r_cm_full))
gg1 <- PlotPagodaEmbeding(r_full, clusters=initial_labels, mark.clusters=F, show.legend=T, alpha=0.6, embeding.type=kEmbeddingType,
                          size=0.5, raster=T, raster.width=rast_width, raster.height=rast_height, raster.dpi=rast_dpi) +
  scale_color_manual(values=scales::hue_pal(l=55)(3)[c(2, 1, 3)], name='Initial label') +
  guides(color=guide_legend(override.aes=list(size=1.2))) +
  theme_pdf(legend.pos=c(1, 1), show.ticks=F)

gg2 <- PlotPagodaEmbeding(r_full, colors=scores, mark.clusters=F, show.legend=T, alpha=0.6, embeding.type=kEmbeddingType, 
                          size=0.5, raster=T, raster.width=rast_width, raster.height=rast_height, raster.dpi=rast_dpi) +
  scale_color_distiller(palette='RdYlBu', direction=1, limits=c(0, 1), name='Score') +
  theme_pdf(legend.pos=c(1, 1), show.ticks=F)

gg_fig <- plot_grid(gg1, gg2 + rremove("ylab"), labels='AUTO', label_x=c(0, -0.05))
gg_fig
ggsave(paste0(kPlotsFolder, 'supp_pbmc8k_initial_labeling.pdf'), gg_fig, width=7.5, height=5)

Session information




VPetukhov/dropEstAnalysis documentation built on Dec. 28, 2019, 8:16 p.m.