library(ggplot2) library(ggsci) library(ggpubr) library(dplyr) library(parallel) library(dropestr) library(reshape2) library(Matrix) source("../Functions/PlotFuncs.R") source("../Functions/Functions.R") knitr::opts_chunk$set(fig.width=5, fig.height=3, echo=FALSE, warning=FALSE, message=FALSE) theme_set(theme_base) kPlotsFolder <- '~/Data/Plots/PaperReview/CBMerge/'
kDatasetName <- 'thousand' kDataPath <- paste0('/d0-mendel/home/viktor_petukhov/Data/dropseq/thousand/') kDataFolders <- c(poisson='est_01_16_precise/', unmerged='est_01_16_unmerged/', merge_all='est_01_16_merge_all/', simple='est_01_16_simple/') # holders <- mclapply(kDataFolders, function(folder) readRDS(paste0(kDataPath, folder, kDatasetName, '.rds')), mc.cores=length(kDataFolders)) # length(holders)
# validation_data <- list( # merge_targets = lapply(holders, function(holder) unlist(holder$merge_targets)), # cms_raw = lapply(holders, `[[`, 'cm_raw'), # cms = lapply(holders, `[[`, 'cm') # ) # # saveRDS(validation_data, paste0(kDataPath, 'est_01_16_validation_data.rds')) validation_data <- readRDS(paste0(kDataPath, 'est_01_16_validation_data.rds'))
umis_per_cb <- Matrix::colSums(validation_data$cms$unmerged) %>% sort(decreasing=T) real_cbs <- names(umis_per_cb)[1:1000] PlotCellsNumberLine(umis_per_cb[1:5000])
gene_species <- ifelse(substr(rownames(validation_data$cms_raw$unmerged), 1, 5) == "HUMAN", 'Human', 'Mouse') %>% as.factor() cell_species <- lapply(levels(gene_species), function(l) validation_data$cms_raw$unmerged[gene_species == l,] %>% Matrix::colSums()) %>% (function(x) levels(gene_species)[as.integer(x[[1]] < x[[2]]) + 1]) %>% setNames(colnames(validation_data$cms_raw$unmerged)) %>% as.factor() table(cell_species[real_cbs]) table(cell_species)
qplot(Matrix::colSums(validation_data$cms_raw$unmerged[gene_species == 'Mouse',]) / Matrix::colSums(validation_data$cms_raw$unmerged))
merge_targets <- lapply(validation_data$merge_targets, function(mt) mt[mt %in% real_cbs]) (comparison <- MergeComparisonSummary(merge_targets, cell_species, dataset='Drop-seq thousand')) write.csv(comparison, paste0(kDataPath, 'merge_comparison.csv'))
umis_per_cb <- lapply(validation_data$cms_raw, function(cm) sort(Matrix::colSums(cm), decreasing=T))
linewidth <- 1 alpha <- 0.7 gg1 <- PlotCellsNumberLogLog(umis_per_cb$poisson, plot.label='Poisson, no barcodes', plot.border=F, linewidth=linewidth, alpha=alpha) gg2 <- PlotCellsNumberLogLog(umis_per_cb$unmerged, gg.base=gg1, plot.label='No merge', plot.border=F, linewidth=linewidth, alpha=alpha) gg3 <- PlotCellsNumberLogLog(umis_per_cb$merge_all, gg.base=gg2, plot.label='Simple, no barcodes', plot.border=F, linewidth=linewidth, alpha=alpha) gg_merge_sizes <- gg3 + guides(color=guide_legend(title='Merge type')) + scale_color_hue(l=55) + scale_x_continuous(limits=c(1, 5000), trans='log10') + scale_y_continuous(limits=c(1e3, 3e5), trans='log10') + theme_pdf(legend.pos=c(1, 1)) gg_merge_sizes # ggsave(filename=paste0(kPlotsFolder, 'merge_sizes.pdf'), plot=gg_merge_sizes, width=6, height=4)
gg1 <- PlotCellsNumberLine(umis_per_cb$poisson, plot.label='No barcodes merge', breaks=50) gg2 <- PlotCellsNumberLine(umis_per_cb$unmerged, gg.base=gg1, plot.label='No merge', breaks=50) gg3 <- PlotCellsNumberLine(umis_per_cb$merge_all, gg.base=gg2, plot.label='Merge all', breaks=50) gl <- guide_legend(title='Merge type') gg3 + scale_x_continuous(limits=c(0, 10000), expand=c(0.01, 0.01)) + # scale_y_continuous(limits=c(0, 4.5e7), expand=c(0.01, 0.01)) + # scale_fill_npg(alpha=0.6) + guides(fill=gl, linetype=gl) + theme_pdf(legend.pos=c(1, 1)) # ggsave(paste0(kPlotsFolder, 'cell_number_ridges.', kDatasetName, '.pdf'), width=4, height=4)
# holder <- readRDS(paste0(kDataPath, kDataFolders['unmerged'], kDatasetName, '.rds')) # holder$reads_per_umi_per_cell <- NULL # saveRDS(holder, paste0(kDataPath, kDataFolders['unmerged'], kDatasetName, '_no_umis.rds')) holder <- readRDS(paste0(kDataPath, kDataFolders['unmerged'], kDatasetName, '_no_umis.rds')) bc_data <- PrepareLqCellsDataPipeline(holder, mit.chromosome.name='HUMAN_MT', scale=FALSE) bc_data <- bc_data %>% tibble::rownames_to_column("CB") %>% rename(MitochondrionFractionHuman=MitochondrionFraction) %>% mutate(MitochondrionFractionMouse=GetChromosomeFraction(holder$reads_per_chr_per_cells$Exon, 'MOUSE_MT')[CB], MitochondrionFraction = MitochondrionFractionMouse + MitochondrionFractionHuman)
cm_real <- validation_data$cms_raw$unmerged real_cbs <- (Matrix::colSums(cm_real) %>% sort(decreasing=T) %>% names())[1:1000] gene_species <- ifelse(substr(rownames(cm_real), 1, 2) == "HU", 'Human', 'Mouse') %>% as.factor() umi_by_species <- lapply(levels(gene_species), function(l) cm_real[gene_species == l,] %>% Matrix::colSums()) %>% as.data.frame() %>% `colnames<-`(levels(gene_species)) %>% tibble::rownames_to_column('CB') %>% as_tibble() %>% mutate(Total = Human + Mouse, Organism=ifelse(Human > Mouse, "Human", "Mouse"), IsReal=CB %in% real_cbs)
# smoothScatter(log10(umi_by_species$Human), log10(umi_by_species$Mouse)) ggplot(umi_by_species %>% filter(!IsReal)) + geom_histogram(aes(x=pmin(Human, Mouse) / Total, y=..density..), binwidth=0.005) + theme_pdf() ggplot(umi_by_species) + geom_histogram(aes(x=pmin(Human, Mouse), y=..density..), binwidth=2) + xlim(0, 200) + theme_pdf() ggplot(umi_by_species) + geom_histogram(aes(x=pmin(Human, Mouse) %>% log10(), y=..density..), bins=100)
# plot_frac <- bc_data$MitochondrionFraction %>% setNames(bc_data$CB) plot_frac <- bc_data$LowExpressedGenesFrac %>% setNames(bc_data$CB) # plot_frac[plot_frac > 0.2] <- 0.2 ggplot(umi_by_species) + geom_point(aes(x=Total, y=pmin(Human, Mouse), color=plot_frac[CB]), size=0.1, alpha=0.1) + scale_x_log10(name='Real UMIs', limits=c(10, 2e5)) + scale_y_log10(name='Wrong UMIs') + annotation_logticks() + theme_pdf(legend.pos=c(0, 1))
ggplot(umi_by_species) + geom_point(aes(x=Total, y=pmin(Human, Mouse) / Total, color=Organism), size=0.1, alpha=0.1) + scale_x_log10(name='Real UMIs', limits=c(10, 2e5)) + annotation_logticks() + ylab('Fraction of mixed UMIs') + theme_pdf(legend.pos=c(1, 1))
Consistence with common distribution
mouse_frac <- umi_by_species %>% filter(IsReal) %>% summarise(Mouse=sum(Mouse[Organism == 'Mouse']), Human=sum(Human[Organism == 'Human']), MF=Mouse / (Mouse + Human)) %>% .$MF ggplot(umi_by_species) + geom_histogram(aes(x=Mouse / Total, y=..density..), binwidth=0.005) + geom_vline(xintercept=mouse_frac) + theme_pdf()
plot_frac <- bc_data$MitochondrionFraction %>% setNames(bc_data$CB) smoothScatter(plot_frac[umi_by_species$CB[order(umi_by_species$Total, decreasing=T)]]) # plot_frac[plot_frac > 0.2] <- 0.2 ggplot(umi_by_species) + geom_point(aes(x=Mouse, y=Human, color=plot_frac[CB]), size=0.1, alpha=0.15) + geom_abline(aes(slope=1, intercept=0), linetype='dashed', alpha=0.5) + scale_x_log10(limits=c(10, 2e5)) + scale_y_log10() + annotation_logticks() + scale_color_gradientn(colours=c("#1200ba", "#347fff", "#cc4000", "#ff3333"), values=scales::rescale(c(0, 0.1, 0.3, 0.8))) + guides(color=guide_colorbar(direction="horizontal", title.position="top", title="Mitochondrial\nfraction", barwidth=unit(1.0, units="in"))) + theme_pdf(legend.pos=c(0.97, 0.05)) + theme(legend.margin=margin(l=3, r=3, unit="pt"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.