knitr::read_chunk("../../analysis/chunks.R")



library(cowplot)
library(ggplot2)
library(ggsci)
library(ggpubr)
library(ggrastr)
library(dplyr)
library(parallel)
library(reshape2)
library(dropestr)
library(dropEstAnalysis)

theme_set(theme_base)
kPlotsFolder <- '../../output/figures/'
kDataPath <- '../../data/dropest/10x/aml035_post_transplant/'
kTrimmedLengths <- 6:9

Data loading

# holders <- mclapply(paste0(kDataPath, c('est_01_21_umi_quality/aml035ost_transplant.rds', 'est_01_21_10x_umi/aml035ost_transplant.rds')), readRDS, mc.cores=2) %>%
#   setNames(c('raw', '10x'))
# 
# reads_per_umi_per_cell <- holders$raw$reads_per_umi_per_cell
# reads_per_umi_per_cell$reads_per_umi <- FilterNUmis(reads_per_umi_per_cell$reads_per_umi)
# 
# saveRDS(reads_per_umi_per_cell, paste0(kDataPath, 'reads_per_umi_per_cell.rds'))

reads_per_umi_per_cell <- readRDS(paste0(kDataPath, 'reads_per_umi_per_cell.rds'))
# reads_per_umi_per_cell$reads_per_umi <- reads_per_umi_per_cell$reads_per_umi[1:50000]
# raw <- readRDS(paste0(kDataPath, 'raw.rds'))

length(reads_per_umi_per_cell$reads_per_umi)

Trimming

# # WARNING: it requires a lot of memory and CPU. Because of weird R multiprocessing, for me it took up to 650Gb RAM for 30 cores.
# trimmed <- mclapply(kTrimmedLengths, function(i) 
#   TrimAndCorrect(reads_per_umi_per_cell, umi.trim.length=i, mc.cores.large=7, 
#                  mc.cores.small=7, verbosity.level=1), mc.cores=4)
# 
# raw <- list()
# 
# raw$umi_distribution <- GetUmisDistribution(reads_per_umi_per_cell$reads_per_umi, smooth=10)
# raw$umi_probabilities <- raw$umi_distribution / sum(raw$umi_distribution)
# 
# max_umi_per_gene <- sapply(reads_per_umi_per_cell$reads_per_umi, length) %>% max()
# raw$collisions_info <- FillCollisionsAdjustmentInfo(raw$umi_probabilities, max_umi_per_gene)
# raw$correction_info <- PrepareUmiCorrectionInfo(umi.probabilities=raw$umi_probabilities, 
#                                                 raw$collisions_info[max_umi_per_gene])
# 
# raw$filt_reads_per_umi_simple <- reads_per_umi_per_cell %>% 
#   CorrectUmiSequenceErrors(raw$umi_probabilities, method='Classic', 
#                            correction.info=raw$correction_info, 
#                            collisions.info=raw$collisions_info, 
#                            mc.cores=30, verbosity.level=2, return='reads')
# 
# raw$filt_umis_per_gene_simple <- sapply(raw$filt_reads_per_umi_simple, length)
# 
# raw$umis_per_gene <- sapply(reads_per_umi_per_cell$reads_per_umi, length)
# 
# saveRDS(raw, paste0(kDataPath, 'raw5.rds'))
# saveRDS(trimmed, paste0(kDataPath, 'trimmed5.rds'))
raw <- readRDS(paste0(kDataPath, 'raw5.rds'))
trimmed <- readRDS(paste0(kDataPath, 'trimmed5.rds'))

Plots

Supp. Figure Collisions

umis_per_gene_df <- as.data.frame(lapply(trimmed[1:4], `[[`, 'umis.per.gene'))
umis_per_gene_df$raw <- raw$umis_per_gene
colnames(umis_per_gene_df)[1:4] <- paste0(kTrimmedLengths)

umis_per_gene_adj_df <- trimmed[1:4] %>%
  lapply(function(tr) tr$collisions.info[tr$umis.per.gene]) %>% as.data.frame()
colnames(umis_per_gene_adj_df) <- paste0(kTrimmedLengths)

umis_per_gene_adj_classic_df <- mclapply(kTrimmedLengths, function(i) sapply(
  trimmed[[i - 5]]$umis.per.gene, AdjustGeneExpressionUniform, 4^i), mc.cores=5) %>%
  as.data.frame()
colnames(umis_per_gene_adj_classic_df) <- paste0(kTrimmedLengths)

plot_df <- melt(umis_per_gene_df, id.vars='raw', variable.name='UmiLen', 
                value.name='NoAdjustment')
plot_df$Adjusted <- melt(umis_per_gene_adj_df)$value
plot_df$AdjustedClassic <- melt(umis_per_gene_adj_classic_df)$value

plot_df <- melt(plot_df, id.vars=c('raw', 'UmiLen'), variable.name='Adjustment') %>%
  filter(raw > 5)
plot_dfs <- split(plot_df, plot_df$UmiLen)
scale_y_low <- scale_y_continuous(limits=c(-3000, 0), breaks=seq(0, -2500, by=-2500), expand=c(0, 0))

ggs <- lapply(names(plot_dfs), function(n) 
  ggplot(plot_dfs[[n]], aes(x=raw, y=(value - raw), linetype=Adjustment, col=UmiLen)) + 
    geom_smooth() + 
    scale_color_manual(values=scales::hue_pal()(4)[as.integer(n) - 5]) +
    scale_linetype_manual(labels=c('No adjustment', 'Empirical', 'Uniform'), 
                          values=c('solid', 'dashed', 'dotted')) +
    scale_x_continuous(expand=c(0, 0), limits=c(0, 12300), 
                       breaks=seq(0, 10000, 2500)) + 
    scale_y_continuous(expand=c(0, 0), limits=c(-10000, 0)) +
    theme_pdf())

ggs[3:4] <- lapply(ggs[3:4], `+`, scale_y_low)
ggs[[1]] <- ggs[[1]] + theme_pdf(legend.pos=c(0, 0)) + theme(legend.box='horizontal') +
  scale_color_manual(values=alpha(scales::hue_pal()(4), 0.7), name='Length of\ntrimmed UMI', 
                     labels=names(plot_dfs), drop=F) +
  guides(color=guide_legend(ncol=2, nrow=2, byrow=T))

ggs <- BuildPanel4(ggs, "", "", return.raw=T, legend.plot.id=1)

ga1 <- plot_grid(plotlist=ggs[c(1,3)], nrow=2, rel_heights=c(2.5, 1), align='v')
ga2 <- plot_grid(plotlist=ggs[c(2,4)], nrow=2, rel_heights=c(2.5, 1), align='v')

supp_fig_collisions <- annotate_figure(ggarrange(ga1, ga2, ncol=2), 
                                 bottom=text_grob("#Molecules after trimming", size=14), 
                                 left = text_grob("Error after trimming (#molecules)", 
                                                  rot=90, size=14))

ggsave(paste0(kPlotsFolder, 'supp_umi_trim_collisions.pdf'), supp_fig_collisions,
       height=4, width=8)
supp_fig_collisions

Supp. Figure Corrections

plots <- lapply(kTrimmedLengths, function(i) 
  PlotTrimmedCorrections(trimmed[[i - 5]], raw, i, log=T, rast.width=4,
                         rast.height=4.5, rast.dpi=150, heights.ratio=c(2.5, 3)))
large_plots <- lapply(plots, `[[`, 'large') %>% lapply(`+`, annotation_logticks(sides='b'))
small_plots <- lapply(plots, `[[`, 'small')

title_theme <- theme(plot.title=element_text(size=12))
gp_large <- BuildPanel4(large_plots, xlabel="Corrected #UMIs without trimming", 
                        ylabel="Error on trimmed data, %", legend.plot.id=3, labels=NULL, 
                        plot.theme=title_theme)
gp_small <- BuildPanel4(small_plots, xlabel="Corrected #UMIs without trimming", 
                        ylabel="Error on trimmed data, %", labels=NULL, 
                        plot.theme=title_theme)
supp_fig_errors <- plot_grid(gp_large, gp_small, nrow=2, labels=c('A', 'B'), 
                             rel_heights=c(5, 6))
supp_fig_errors
ggsave(paste0(kPlotsFolder, 'supp_umi_trim_error.pdf'), supp_fig_errors, width=8, height=9)

Main figure

umis_per_gene <- as.data.frame(lapply(trimmed, `[[`, 'umis.per.gene'))
colnames(umis_per_gene) <- paste0(kTrimmedLengths)
errors <- ((umis_per_gene - raw$filt_umis_per_gene_simple) / raw$filt_umis_per_gene_simple) %>%
  cbind(Real=raw$filt_umis_per_gene_simple) %>% 
  melt(id.vars='Real', variable.name='UmiLength', value.name='Error')

errors_sum <- errors %>% group_by(Real, UmiLength) %>% summarise(Error=mean(Error, trim=0.2))

Figure A

kPlotWidth <- 8
kPlotHeight <- 7
gp1 <- ggplot(errors_sum, aes(x=Real, y=100 * Error, color=UmiLength)) +
  geom_point_rast(size=0.3, alpha=0.5, width=kPlotWidth / 2, height=kPlotHeight / 3) +
  geom_smooth() +
  scale_x_log10(expand=c(0.01, 0)) + annotation_logticks(sides='b') + 
  scale_y_continuous(limits=c(-65, 60), expand=c(0, 0)) +
  theme_pdf(legend.pos=c(0, 0)) +
  scale_color_npg() +
  guides(color=guide_legend(title='Length of trimmed UMI', ncol=4))

gp1

Figure B

fig_umi_length <- 6
trimmed_corrected_cells <- lapply(raw$filt_reads_per_umi_simple, TrimUmis, fig_umi_length)
umis_per_gene_plt <- data.frame(none=sapply(trimmed_corrected_cells, length), 
                                real=raw$filt_umis_per_gene_simple)

umis_per_gene_plt$empirical <- trimmed[[fig_umi_length - 5]]$collisions.info[umis_per_gene_plt$none]
umis_per_gene_plt$uniform <- sapply(umis_per_gene_plt$none, AdjustGeneExpressionUniform, 
                                    4^fig_umi_length)
umis_per_gene_plt_df <- melt(as.data.frame(umis_per_gene_plt), id.vars='real', 
                             variable.name='Adjustment', value.name='value')

umis_per_gene_plt_df <- umis_per_gene_plt_df %>% group_by(Adjustment, real) %>% 
  summarize(value=mean(value, 0.2))

gp2 <- umis_per_gene_plt_df %>%
  ggplot(aes(x=real, y=100 * (value - real) / real, col=Adjustment)) +
  geom_point_rast(size=0.3, alpha=0.3, width=kPlotWidth / 2, height=kPlotHeight / 3) +
  geom_smooth() +
  scale_x_log10(expand=c(0.01, 0)) + annotation_logticks(sides='b') +
  scale_y_continuous(expand=c(0, 0), limits=c(-65, 11)) +
  scale_color_nejm() +
  labs(x='Corrected #molecules without trimming', y='Error, %') +
  guides(color = guide_legend(title='Collision adjustment', ncol=3)) +
  theme_pdf(legend.pos=c(0, 0))

gp2

Figure C

fig_umi_length <- 7
corrected_data <- as_tibble(trimmed[[fig_umi_length - 5]]$filt_cells)
colnames(corrected_data) <- c('Bayesian', 'cluster', 'cluster-neq', 'directional')
corrected_data$`no correction` <- trimmed[[fig_umi_length - 5]]$umis.per.gene
corrected_data$real <- raw$filt_umis_per_gene_simple

plot_df <- melt(corrected_data, id.vars='real', variable.name='Correction') %>%
  group_by(Correction, real) %>% summarize(value=median(value))

gp3 <- ggplot(plot_df, aes(x=real, y=100 * (value - real) / real, color=Correction)) +
  geom_point_rast(size=0.3, alpha=0.3, width=kPlotWidth / 2, height=kPlotHeight / 3) +
  geom_smooth() +
  scale_x_log10(expand=c(0.01, 0)) + annotation_logticks(sides='b') +
  scale_y_continuous(limits=c(-101, 51), expand=c(0, 0)) +
  # scale_color_manual(values=correction_colors[2:length(correction_colors)]) +
  guides(color=guide_legend(ncol=3)) + theme_pdf(legend.pos=c(0, 0))

gp3
aml035_plots <- list(gp1=gp1, gp2=gp2, gp3=gp3)
# You need to run "umi_correction/umi_bmmc1.Rmd" first to produce these plots
bmmc_plots_data <- readRDS('../../data/plot_data/bmmc_umi_fig_part2.rds')

Complete figure

colors <- alpha(bmmc_plots_data$correction_colors, 0.7) %>% 
  setNames(names(bmmc_plots_data$correction_colors))
scale_color_short <- scale_color_manual(values=colors)
figure_trim <- plot_grid(
  aml035_plots$gp1 + theme_pdf() + ylab('Error, %') + rremove("xlab") + 
    theme(plot.margin=margin(b=2, unit='pt')),
  aml035_plots$gp2 + theme_pdf() + rremove("xlab") + 
    theme(plot.margin=margin(b=2, unit='pt')),
  aml035_plots$gp3 + theme_pdf() + ylab('Error, %') + scale_color_short +
    guides(color=guide_legend(title='Correction (see Figure D)', 
                              label.theme=element_blank(), ncol=5, reverse=T)) +
    rremove("xlab") + theme(plot.margin=margin()),
  ncol=1, nrow=3, labels = c("A", "B", "C"), align='v')

figure_trim_ann <- (figure_trim + theme(plot.margin=margin(t=10, l=0, unit='pt'))) %>%
  annotate_figure(bottom=text_grob("Gene expression magnitude (#molecules)", size=14, 
                                   hjust=0.43))
fig_full <- plot_grid(figure_trim_ann + theme(plot.margin=margin()), 
                      bmmc_plots_data$gg_fig + theme(plot.margin=margin()), ncol=2)
fig_full_ann <- annotate_figure(fig_full, fig.lab="Figure 2", fig.lab.pos="top.right")
fig_full_ann
ggsave(paste0(kPlotsFolder, 'umi_correction_figure.pdf'), fig_full_ann, width=8, height=7)

Session information




VPetukhov/dropEstAnalysis documentation built on Dec. 28, 2019, 8:16 p.m.