knitr::read_chunk("../../analysis/chunks.R")
library(cowplot) library(ggplot2) library(ggsci) library(ggpubr) library(ggrastr) library(ggrepel) library(dplyr) library(parallel) library(Seurat) library(dropestr) library(dropEstAnalysis) theme_set(theme_base) set.seed(42) kOutputFolder <- '../../output/' kPlotsFolder <- paste0(kOutputFolder, 'figures/') kDataPath <- '../../data/dropest/10x/pbmc8k/' kEstFolder <- paste0(kDataPath, 'est_11_11_umi_quality/') k10xFolder <- paste0(kDataPath, 'filtered_gene_bc_matrices/GRCh38/')
# holder <- readRDS(paste0(kEstFolder, 'pbmc8k.rds')) # holder$reads_per_umi_per_cell <- NULL # saveRDS(holder, paste0(kEstFolder, 'pbmc8k_no_umi.rds')) holder <- readRDS(paste0(kEstFolder, 'pbmc8k_no_umi.rds'))
cm_10x <- Read10xMatrix(k10xFolder, use.gene.names=T) cm_10x <- cm_10x[, order(Matrix::colSums(cm_10x), decreasing=F)]
umis_per_cell <- sort(Matrix::colSums(holder$cm_raw), decreasing=T) est_cell_num <- EstimateCellsNumber(umis_per_cell)
PlotCellsNumberLogLog(umis_per_cell, estimate.cells.number=T) + theme_pdf(legend.pos=c(1, 1))
scores <- ScorePipelineCells(holder, mit.chromosome.name='MT', predict.all=T, verbose=T)[names(umis_per_cell)] PlotCellScores(scores[1:20000], cells.number=est_cell_num)
pdf(paste0(kPlotsFolder, 'lq_pbmc8k_scores.pdf'), width=7.5, height=3.5) par(mar=c(2.7,2.5,0,0.3), tck=-0.02, mgp=c(1.4,0.3,0)) PlotCellScores(scores[1:20000], cells.number=est_cell_num) invisible(dev.off())
intersect_cbs <- names(scores[1:est_cell_num$expected]) intersect_cbs <- intersect_cbs[scores[intersect_cbs] > 0.9] unknown_cell_scores <- scores[(est_cell_num$expected + 1):length(scores)] rescued_cbs <- names(unknown_cell_scores)[unknown_cell_scores > 0.5] unknown_cell_scores <- scores[1:est_cell_num$expected] filtered_cbs <- names(unknown_cell_scores)[unknown_cell_scores < 0.1] c(Uncahnged=length(intersect_cbs), Rescued=length(rescued_cbs), Filtered=length(filtered_cbs))
r_cm_rescued <- holder$cm_raw[, c(names(umis_per_cell)[1:est_cell_num$expected], rescued_cbs)] r_cm_rescued <- r_cm_rescued[grep("^[^;]+$", rownames(r_cm_rescued)),] if (!all(colnames(cm_10x) %in% colnames(r_cm_rescued))) stop("All 10x cells must be presented")
# r_rescued <- GetPagoda(r_cm_rescued, n.cores=30, tsne.iters.num=3000) # r_rescued$getEmbedding(type='PCA', perplexity=50, embeddingType = 'tSNE', max_iter=3000) # saveRDS(r_rescued, paste0(kEstFolder, 'pagoda.rds')) r_rescued <- readRDS(paste0(kEstFolder, 'pagoda.rds'))
clusters_annotated <- read.csv(paste0(kEstFolder, 'clusters_annotated.csv')) %>% (function(x) setNames(as.character(x$Type), x$Barcode)) clusters <- r_rescued$clusters$PCA$infomap notannotated_cells <- setdiff(names(clusters), names(clusters_annotated))
clusters_annotated_resc <- AnnotateClustersByGraph(r_rescued$graphs$PCA, clusters_annotated, notannotated_cells, mc.cores=10) rescued_clusters <- clusters_annotated_resc[rescued_cbs]
tsne <- r_rescued$embeddings$PCA$tSNE intergenic_inf <- names(clusters)[clusters == 11] %>% intersect(filtered_cbs) %>% GetCellsChull(tsne, offset.y=-0.5) mitochondrial_inf <- names(clusters)[clusters == 16] %>% intersect(filtered_cbs) %>% GetCellsChull(tsne) low_rpu_inf <- names(clusters)[clusters == 7] %>% intersect(filtered_cbs) %>% GetCellsChull(tsne, chull.quantile=0.8) chull_list <- list(intergenic_inf, mitochondrial_inf, low_rpu_inf) %>% lapply(`[[`, 'chull') %>% lapply(as.data.frame) %>% setNames(c('Intergenic', 'Mitochondrial', 'Low expression')) label_colors <- c('#c63737', '#1989c6', '#d3d631') %>% setNames(names(chull_list)) labels_df <- Reduce(rbind, lapply(chull_list, colMeans)) %>% as.data.frame() %>% mutate(Text=paste0(names(chull_list), ' (', c('C', 'B', 'D'), ')'), Color=label_colors) %>% mutate(OffsetX=c(4, -4, -2), OffsetY=c(8, 3, -6))
gg_labs <- labs(x='tSNE-1', y='tSNE-2') gg_all <- PlotPagodaEmbeding(r_rescued, clusters=r_rescued$clusters$PCA$infomap, mark.clusters=F, font.size=NULL, raster=T) + theme_pdf(show.ticks=F) + gg_labs
plot_clusters <- setdiff(names(clusters_annotated), c(filtered_cbs, rescued_cbs)) gg_base <- PlotFiltrationResults(r_rescued, r_rescued$clusters$PCA$infomap[plot_clusters], filtered.cbs=filtered_cbs, raster.width=5, raster.height=5, mark.clusters=F) + theme_pdf(legend.pos=c(1, 1), show.ticks=F) + gg_labs gg <- Reduce(`+`, lapply(names(chull_list), function(n) geom_polygon(data=chull_list[[n]], mapping=aes(x=V1, y=V2), alpha=0.3, color=label_colors[n])), init=gg_base) gg_repels <- lapply(1:3, function(i) geom_label_repel(data=labels_df[i,], mapping=aes(x=V1, y=V2, label=Text), fill=alpha(labels_df$Color[i], 0.5), nudge_x=labels_df$OffsetX[i], nudge_y=labels_df$OffsetY[i], segment.size=1, size=3, point.padding=0, arrow=arrow(length = unit(0.02, 'npc')), color='black', segment.color=labels_df$Color[i], fontface='bold')) gg_filt <- Reduce(`+`, gg_repels, init=gg)
plot_clusters <- setdiff(names(clusters_annotated), c(filtered_cbs, rescued_cbs)) gg <- PlotFiltrationResults(r_rescued, clusters_annotated[plot_clusters], filtered.cbs=filtered_cbs, rescued.clusters=rescued_clusters, raster.width=4.28, raster.height=6) gg <- Reduce(`+`, lapply(names(chull_list), function(n) geom_polygon(data=chull_list[[n]], mapping=aes(x=V1, y=V2), alpha=0.3, color=label_colors[n])), init=gg) gg_repels <- lapply(1:3, function(i) geom_label_repel(data=labels_df[i,], mapping=aes(x=V1, y=V2, label=Text), fill=alpha(labels_df$Color[i], 0.5), nudge_x=labels_df$OffsetX[i], nudge_y=labels_df$OffsetY[i], segment.size=1, size=3, point.padding=0, arrow=arrow(length = unit(0.02, 'npc')), color='black', segment.color=labels_df$Color[i], fontface='bold')) gg <- Reduce(`+`, gg_repels, init=gg) gg_tsne_noise <- gg + theme_pdf(legend.pos=c(1, 1), show.ticks=F) gg_tsne_noise
bc_data <- PrepareLqCellsDataPipeline(holder, mit.chromosome.name='MT', scale=F)
real_cell_color <- '#0faf00' gg <- function(y.max, filt.color) { ggplot() + scale_y_continuous(expand=c(0, 0), name='Fraction of cells', limits=c(0, y.max)) + guides(fill=guide_legend(title=NULL)) + theme_pdf(legend.pos=c(1, 1)) + scale_fill_manual(values=as.vector(c(filt.color, real_cell_color))) } gg1 <- gg(0.9, label_colors['Mitochondrial']) + geom_histogram(aes(x=bc_data[mitochondrial_inf$cbs, ]$MitochondrionFraction, y=..count.. / sum(..count..), fill = 'Mitochondrial'), color='black') + geom_histogram(aes(x=bc_data[intersect_cbs, ]$MitochondrionFraction, y=..count.. / sum(..count..), fill='Real'), alpha=0.5, color='black') + scale_x_continuous(limits=c(0, 1.01), expand=c(0, 0), name='Fraction of mitochondrial reads') gg2 <- gg(0.51, label_colors['Intergenic']) + geom_histogram(aes(x=bc_data[intergenic_inf$cbs, ]$IntergenicFrac, y=..count.. / sum(..count..), fill = 'Intergenic'), color='black') + geom_histogram(aes(x=bc_data[intersect_cbs, ]$IntergenicFrac, y=..count.. / sum(..count..), fill='Real'), alpha=0.5, color='black') + scale_x_continuous(limits=c(0, 1.01), expand=c(0, 0), name='Fraction of intergenic reads') gg3 <- gg(0.6, label_colors['Low expression']) + geom_histogram(aes(x=bc_data[low_rpu_inf$cbs, ]$UmiPerGene, y=..count.. / sum(..count..), fill = 'Low expression'), color='black') + geom_histogram(aes(x=bc_data[intersect_cbs, ]$UmiPerGene, y=..count.. / sum(..count..), fill='Real'), alpha=0.5, color='black') + scale_x_continuous(limits=c(1, 5.5), expand=c(0, 0), name='Mean #UMIs per gene') gg_distributions <- plot_grid(gg1, gg2, gg3, nrow=3, align='v', labels=c('B', 'C', 'D'), label_x=0.025)
gg_distributions
gg_noise_fig <- plot_grid(gg_tsne_noise, gg_distributions, ncol=2, rel_widths=c(1, 0.75), labels=c('A', NULL))
gg_noise_fig
ggsave(paste0(kPlotsFolder, 'fig_pbmc8k_lq_cells.pdf'), gg_noise_fig, width=7.5, height=6)
cbs_10x <- colnames(cm_10x) rescued_cbs <- names(scores)[scores > 0.9] %>% setdiff(cbs_10x) filtered_cbs <- names(scores)[scores < 0.1] %>% intersect(cbs_10x) intersect_cbs <- names(scores)[scores > 0.9] %>% intersect(cbs_10x) rescued_clusters <- clusters_annotated_resc[rescued_cbs] c(Unchanged=length(cbs_10x), Rescued=length(rescued_cbs), Filtered=length(filtered_cbs))
plot_clusters <- setdiff(cbs_10x, filtered_cbs) gg_tsne <- PlotFiltrationResults(r_rescued, clusters=clusters_annotated[plot_clusters], filtered.cbs=filtered_cbs, rescued.clusters=rescued_clusters, raster.width=4.28, raster.height=5) + theme_pdf(legend.pos=c(0, 1), show.ticks=F) gg_tsne
tested_clusts <- sort(c(clusters_annotated[intersect_cbs], rescued_clusters)) tested_clusts <- as.factor(tested_clusts) %>% setNames(names(tested_clusts)) large_rescued <- which(table(clusters_annotated[rescued_cbs]) >= 10) %>% names() tested_clusts <- tested_clusts[tested_clusts %in% large_rescued]
rescued_table <- TableOfRescuedCells(clusters_annotated_resc[c(intersect_cbs, rescued_cbs)], rescued_cbs) write.csv(rescued_table, paste0(kOutputFolder, "tables/rescued_cbc_pbmc8k.csv"), row.names=F) rescued_table
seurat_clusters <- tested_clusts seurat_cm <- r_cm_rescued[, names(seurat_clusters)] seurat_cm <- seurat_cm[Matrix::rowSums(seurat_cm) > 200, ] srt <- CreateSeuratObject(raw.data = seurat_cm, project = "pbmc8k", display.progress=F) srt <- NormalizeData(object = srt, normalization.method = "LogNormalize", scale.factor = 10000, display.progress=F) srt <- FindVariableGenes(object = srt, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 1, do.plot=F, display.progress=F) srt <- ScaleData(object = srt, vars.to.regress = "nUMI", display.progress=F) save(srt, file="../../output/pbmc8k_srt.Rda") # load("../../output/pbmc8k_srt.Rda")
srt@ident <- as.factor(seurat_clusters[colnames(srt@raw.data)]) names(srt@ident) <- colnames(srt@raw.data) compared_clusters <- unique(srt@ident) %>% as.character() cluster_markers <- mclapply(compared_clusters, function(i) mclapply(setdiff(compared_clusters, i), FindClusterMarkers, i, srt, mc.cores=5), mc.cores=6) overexpressed_genes <- GetOverexpressedGenes(srt, compared_clusters, cluster_markers, expression.threshold=0.4) length(overexpressed_genes)
tested_clusts <- seurat_clusters separation <- c(setNames(rep('rescued', length(rescued_cbs)), rescued_cbs), setNames(rep('real', length(intersect_cbs)), intersect_cbs)) umis_per_cb_subset <- log10(Matrix::colSums(r_cm_rescued[, names(tested_clusts)])) tested_clusts <- tested_clusts[order(tested_clusts, -umis_per_cb_subset)] de_genes <- overexpressed_genes
raster_width <- 3 raster_height <- 3 raster_dpi <- 100 plot_df <- ExpressionMatrixToDataFrame(r_rescued$counts[names(tested_clusts), de_genes], umis_per_cb_subset, tested_clusts, filtration.type=separation) plot_df <- plot_df %>% filter(UmisPerCb < 3.4) plot_dfs <- split(plot_df, plot_df$FiltrationType) ggs <- lapply(plot_dfs, HeatmapAnnotGG, umi.per.cell.limits=range(plot_df$UmisPerCb), raster.width=raster_width, raster.height=raster_height, raster.dpi=raster_dpi) legend_guides <- list(HeatmapLegendGuide('Expression'), HeatmapLegendGuide('Cell type', guide=guide_legend, ncol=3), HeatmapLegendGuide('log10(#molecules)')) gg_legends <- mapply(`+`, ggs$real, legend_guides, SIMPLIFY=F) %>% lapply(`+`, theme(legend.margin=margin(l=4, r=4, unit='pt'))) %>% lapply(get_legend) ggs$real$heatmap <- ggs$real$heatmap + rremove('xlab') + ylab('Cells') ggs$rescued$heatmap <- ggs$rescued$heatmap + labs(x = 'Genes', y = 'Cells') ggs_annot <- lapply(ggs, function(gg) plot_grid(plotlist=lapply(gg, `+`, theme(legend.position="none", plot.margin=margin())), nrow=1, rel_widths=c(1.5, 0.1, 0.1), align='h')) gg_legends_plot <- plot_grid(plotlist=gg_legends, nrow=3, align='v')
gg_left <- plot_grid(ggs_annot$real, ggs_annot$rescued, nrow=2, labels=c('B', 'C')) gg_right <- gg_tsne + theme(plot.margin=margin(l=0.1, unit='in'), axis.text=element_blank(), axis.ticks=element_blank()) gg_bottom <- plot_grid(plotlist=gg_legends[c(1, 3, 2)], ncol=3, rel_widths=c(1, 1, 2.6)) gg_10x <- plot_grid(gg_left, gg_right, labels=c('', 'D'), ncol=2) %>% plot_grid(gg_bottom, nrow=2, rel_heights=c(1, 0.21), align='v')
gg_10x
ggsave(paste0(kPlotsFolder, 'supp_comparison_with_10x.pdf'), gg_10x, width=7.5, height=6)
kEmbeddingType <- 'tSNE' # kEmbeddingType <- 'largeVis' cbs_full <- names(scores)[1:20000] r_cm_full <- holder$cm_raw[, cbs_full] r_full <- GetPagoda(r_cm_full, n.cores=1, embeding.type=kEmbeddingType)
rast_width <- 7.5 / 2 rast_height <- 5 rast_dpi <- 150 initial_labels <- dropestr:::EstimateCellsQuality(Matrix::colSums(r_cm_full)) gg1 <- PlotPagodaEmbeding(r_full, clusters=initial_labels, mark.clusters=F, show.legend=T, alpha=0.6, embeding.type=kEmbeddingType, size=0.5, raster=T, raster.width=rast_width, raster.height=rast_height, raster.dpi=rast_dpi) + scale_color_manual(values=scales::hue_pal(l=55)(3)[c(2, 1, 3)], name='Initial label') + guides(color=guide_legend(override.aes=list(size=1.2))) + theme_pdf(legend.pos=c(1, 1), show.ticks=F) gg2 <- PlotPagodaEmbeding(r_full, colors=scores, mark.clusters=F, show.legend=T, alpha=0.6, embeding.type=kEmbeddingType, size=0.5, raster=T, raster.width=rast_width, raster.height=rast_height, raster.dpi=rast_dpi) + scale_color_distiller(palette='RdYlBu', direction=1, limits=c(0, 1), name='Score') + theme_pdf(legend.pos=c(1, 1), show.ticks=F) gg_fig <- plot_grid(gg1, gg2 + rremove("ylab"), labels='AUTO', label_x=c(0, -0.05))
gg_fig
ggsave(paste0(kPlotsFolder, 'supp_pbmc8k_initial_labeling.pdf'), gg_fig, width=7.5, height=5)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.