library(ggplot2) library(ggrastr) library(dplyr) library(dropestr) theme_set(theme_base) FillNa <- function(data, value=0) { data[is.na(data)] <- value return(data) } # kPlotsFolder <- '~/Data/Plots/PaperReview/CBMerge/'
kDataPath <- '../../data/dropest/10x/hgmm_6k/' kDataFolders <- c(poisson='est_01_14_precise/', real='est_01_14_barcodes/', unmerged='est_01_14_unmerged/', merge_all='est_01_16_merge_all/', simple='est_01_14_simple/') # holders <- mclapply(kDataFolders, function(folder) readRDS(paste0(kDataPath, folder, kDatasetName, '.rds')), mc.cores=length(kDataFolders)) # length(holders)
# holder <- readRDS(paste0(kDataPath, kDataFolders['merge_all'], kDatasetName, '.rds')) # validation_data$merge_targets$merge_all <- unlist(holder$merge_targets) # validation_data$cms_raw$merge_all <- holder$cm_raw # validation_data$cms$merge_all <- holder$cm
# validation_data <- list( # merge_targets = lapply(holders, function(holder) unlist(holder$merge_targets)), # cms_raw = lapply(holders, `[[`, 'cm_raw'), # cms = lapply(holders, `[[`, 'cm') # ) # # saveRDS(validation_data, paste0(kDataPath, 'est_01_14_validation_data.rds')) validation_data <- readRDS(paste0(kDataPath, 'est_01_14_validation_data.rds'))
validation_data$cms_raw <- lapply(validation_data$cms_raw, function(cm) cm[grep("^[^;]+$", rownames(cm)),]) validation_data$cms <- lapply(validation_data$cms, function(cm) cm[grep("^[^;]+$", rownames(cm)),])
umis_per_cb <- Matrix::colSums(validation_data$cms$real) %>% sort(decreasing=T) real_cbs <- names(umis_per_cb)[1:6000] PlotCellsNumberLine(umis_per_cb[1:10000])
gene_species <- ifelse(substr(rownames(validation_data$cms_raw$unmerged), 1, 2) == "hg", 'Human', 'Mouse') %>% as.factor() cell_species <- lapply(levels(gene_species), function(l) validation_data$cms_raw$unmerged[gene_species == l,] %>% Matrix::colSums()) %>% (function(x) levels(gene_species)[as.integer(x[[1]] < x[[2]]) + 1]) %>% setNames(colnames(validation_data$cms_raw$unmerged)) %>% as.factor() table(cell_species[real_cbs]) table(cell_species) / sum(table(cell_species))
qplot(Matrix::colSums(validation_data$cms_raw$unmerged[gene_species == 'Mouse',]) / Matrix::colSums(validation_data$cms_raw$unmerged))
merge_targets <- lapply(validation_data$merge_targets, function(mt) mt[mt %in% real_cbs]) (comparison <- MergeComparisonSummary(merge_targets, cell_species, dataset="10x hgmm6k")) write.csv(comparison, paste0(kDataPath, 'merge_comparison.csv'))
umis_per_cb <- lapply(validation_data$cms_raw, function(cm) sort(Matrix::colSums(cm), decreasing=T)) # for (n in names(umis_per_cb)) { # names(umis_per_cb[[n]]) <- substr(names(umis_per_cb[[n]]), 1, 14) # filt_cbs <- substr(names(umis_per_cb[[n]]), 1, 14) %>% table() %>% (function(x) names(x)[x == 1]) # umis_per_cb[[n]] <- umis_per_cb[[n]][filt_cbs] # }
linewidth <- 1 alpha <- 0.7 gg1 <- PlotCellsNumberLogLog(umis_per_cb$real, plot.label='Poisson, real barcodes', plot.border=F, linewidth=linewidth, alpha=alpha) gg2 <- PlotCellsNumberLogLog(umis_per_cb$poisson, gg.base=gg1, plot.label='Poisson, no barcodes', plot.border=F, linewidth=linewidth, alpha=alpha) gg3 <- PlotCellsNumberLogLog(umis_per_cb$unmerged, gg.base=gg2, plot.label='No merge', plot.border=F, linewidth=linewidth, alpha=alpha) gg4 <- PlotCellsNumberLogLog(umis_per_cb$merge_all, gg.base=gg3, plot.label='Simple, no barcodes', plot.border=F, linewidth=linewidth, alpha=alpha) gg_merge_sizes <- gg4 + guides(color=guide_legend(title='Merge type')) + scale_color_hue(l=55) + # scale_x_continuous(limits=c(0, 10000)) + theme_pdf(legend.pos=c(1, 1)) gg_merge_sizes # ggsave(filename=paste0(kPlotsFolder, 'merge_sizes.pdf'), plot=gg_merge_sizes, width=6, height=4)
gg1 <- PlotCellsNumberLine(umis_per_cb$real, plot.label='Real barodes merge', breaks=50) gg2 <- PlotCellsNumberLine(umis_per_cb$poisson, gg.base=gg1, plot.label='No barcodes merge', breaks=50) gg3 <- PlotCellsNumberLine(umis_per_cb$unmerged, gg.base=gg2, plot.label='No merge', breaks=50) gg4 <- PlotCellsNumberLine(umis_per_cb$merge_all, gg.base=gg3, plot.label='Merge all', breaks=50) gl <- guide_legend(title='Merge type') gg4 + scale_x_continuous(limits=c(0, 15000), expand=c(0.01, 0.01)) + scale_y_continuous(limits=c(0, 4.5e7), expand=c(0.01, 0.01)) + # scale_fill_npg(alpha=0.6) + guides(fill=gl, linetype=gl) + theme_pdf(legend.pos=c(1, 1)) # ggsave(paste0(kPlotsFolder, 'cell_number_ridges.', kDatasetName, '.pdf'), width=4, height=4)
# holder <- readRDS(paste0(kDataPath, kDataFolders['unmerged'], kDatasetName, '.rds')) # holder$reads_per_umi_per_cell <- NULL # saveRDS(holder, paste0(kDataPath, kDataFolders['unmerged'], kDatasetName, '_no_umis.rds')) holder <- readRDS('../../data/dropest/10x/hgmm_1k/est_2018_01_25_filtered/hgmm_1k.rds') # holder <- readRDS('../../data/dropest/10x/hgmm_1k/est_2018_01_25_not_filtered/hgmm_1k.rds') bc_data <- PrepareLqCellsDataPipeline(holder, mit.chromosome.name='hg19_MT', scale=FALSE) bc_data <- bc_data %>% tibble::rownames_to_column("CB") %>% rename(MitochondrionFractionHuman=MitochondrionFraction) %>% mutate(MitochondrionFractionMouse=GetChromosomeFraction(holder$reads_per_chr_per_cells$Exon, 'mm10_MT')[CB], MitochondrionFraction = (MitochondrionFractionMouse * Mouse + MitochondrionFractionHuman * Human) / Total) cell_number <- 1000
cm_real <- holder$cm_raw gene_species <- ifelse(substr(rownames(cm_real), 1, 2) == "hg", 'Human', 'Mouse') %>% as.factor() umi_by_species <- lapply(levels(gene_species), function(l) cm_real[gene_species == l,] %>% Matrix::colSums()) %>% as.data.frame() %>% `colnames<-`(levels(gene_species)) %>% tibble::rownames_to_column('CB') %>% as_tibble() %>% mutate(Total = Human + Mouse, Organism=ifelse(Human > Mouse, "Human", "Mouse"), IsReal=order(Total, decreasing=T) <= cell_number) reads_per_chr <- FillNa(holder$reads_per_chr_per_cells$Exon[umi_by_species$CB,]) umi_by_species <- umi_by_species %>% mutate( MitReads = reads_per_chr$mm10_MT + reads_per_chr$hg19_MT, TotalReads = rowSums(reads_per_chr), MitochondrionFraction = MitReads / TotalReads )
# smoothScatter(log10(umi_by_species$Human), log10(umi_by_species$Mouse)) ggplot(umi_by_species %>% filter(!IsReal)) + geom_histogram(aes(x=pmin(Human, Mouse) / Total, y=..density..), binwidth=0.005) + theme_pdf() ggplot(umi_by_species) + geom_histogram(aes(x=pmin(Human, Mouse), y=..density..), binwidth=2) + xlim(0, 200) + theme_pdf() ggplot(umi_by_species) + geom_histogram(aes(x=pmin(Human, Mouse) %>% log10(), y=..density..), bins=100)
plot_frac <- bc_data$MitochondrionFraction %>% setNames(bc_data$CB) # plot_frac <- bc_data$LowExpressedGenesFrac %>% setNames(bc_data$CB) # plot_frac[plot_frac > 0.2] <- 0.2 ggplot(umi_by_species) + geom_point(aes(x=Total, y=pmin(Human, Mouse), color=plot_frac[CB]), size=0.1, alpha=0.1) + scale_x_log10(name='Real UMIs', limits=c(10, 2e5)) + scale_y_log10(name='Wrong UMIs') + annotation_logticks() + theme_pdf(legend.pos=c(0, 1))
ggplot(umi_by_species) + geom_point(aes(x=Total, y=pmin(Human, Mouse) / Total, color=Organism), size=0.1, alpha=0.1) + scale_x_log10(name='Real UMIs', limits=c(10, 2e5)) + annotation_logticks() + ylab('Fraction of mixed UMIs') + theme_pdf(legend.pos=c(1, 1))
Consistence with common distribution
mouse_frac <- umi_by_species %>% filter(IsReal) %>% summarise(Mouse=sum(Mouse[Organism == 'Mouse']), Human=sum(Human[Organism == 'Human']), MF=Mouse / (Mouse + Human)) %>% .$MF ggplot(umi_by_species) + geom_histogram(aes(x=Mouse / Total, y=..density..), binwidth=0.005) + geom_vline(xintercept=mouse_frac) + theme_pdf()
ggplot(umi_by_species) + geom_point(aes(x=Mouse, y=Human, color=MitochondrionFraction), size=0.1, alpha=0.1) + geom_abline(aes(slope=1, intercept=0), linetype='dashed', alpha=0.5) + scale_x_log10(limits=c(10, 2e5)) + scale_y_log10() + annotation_logticks() + guides(color=guide_colorbar(title='Mitochondrial\nfraction')) + theme_pdf(legend.pos=c(0, 1))
ggplot(umi_by_species) + geom_point(aes(x=Mouse, y=Human, color=plot_frac[CB]), size=0.1, alpha=0.1) + geom_abline(aes(slope=1, intercept=0), linetype='dashed', alpha=0.5) + scale_x_log10(limits=c(10, 2e5)) + scale_y_log10() + annotation_logticks() + theme_pdf(legend.pos=c(0, 1))
plot_df <- umi_by_species %>% filter(IsReal, Organism == 'Human', Mouse < 900) ggplot(plot_df, aes(x=Mouse, y=Human)) + geom_point(size=0.1, alpha=0.1) + scale_x_log10(limits=c(10, 2e5)) + scale_y_log10() + annotation_logticks() + theme_pdf(legend.pos=c(0, 1))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.