This document contains all the analyses done on RNA data that we generated for the tailfindr paper. Knit this document after you have successfully run drake::r_make()
.
knitr::opts_chunk$set(echo = TRUE) save_figures <- FALSE
Load the required libraries first:
pacman::p_load(dplyr, magrittr, ggplot2, drake, knitr, ggpubr, here, tidyverse)
# functions for plotting double single and double-sided violins "%||%" <- function(a, b) { if (!is.null(a)) a else b } geom_one_sided_flat_violin <- function(mapping = NULL, data = NULL, stat = "ydensity", position = "dodge", trim = TRUE, scale = "area", show.legend = NA, inherit.aes = TRUE, ...) { layer(data = data, mapping = mapping, stat = stat,geom = GeomFlatViolinOneSided, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list(trim = trim, scale = scale, ...)) } #' @rdname ggplot2-ggproto #' @format NULL #' @usage NULL #' @export GeomFlatViolinOneSided <- ggproto("GeomFlatViolin", Geom, setup_data = function(data, params) { data$width <- data$width %||% params$width %||% (resolution(data$x, FALSE) * 0.9) # ymin, ymax, xmin, and xmax define the bounding rectangle for each group data %>% group_by(group) %>% mutate(ymin = min(y), ymax = max(y), xmin = x, xmax = x - width / 2) }, draw_group = function(data, panel_scales, coord) { # Find the points for the line to go all the way around data <- transform(data, xminv = x, xmaxv = x + violinwidth * (xmax - x)) # Make sure it's sorted properly to draw the outline newdata <- rbind(plyr::arrange(transform(data, x = xminv), y), plyr::arrange(transform(data, x = xmaxv), -y)) # Close the polygon: set first and last point the same # Needed for coord_polar and such newdata <- rbind(newdata, newdata[1,]) ggplot2:::ggname("geom_flat_violin", GeomPolygon$draw_panel(newdata, panel_scales, coord)) }, draw_key = draw_key_polygon, default_aes = aes(weight = 1, colour = "grey20", fill = "white", size = 0.5, alpha = NA, linetype = "solid"), required_aes = c("x", "y") ) geom_two_sided_flat_violin <- function(mapping = NULL, data = NULL, stat = "ydensity", position = "dodge", trim = TRUE, scale = "area", show.legend = NA, inherit.aes = TRUE, ...) { layer(data = data, mapping = mapping, stat = stat, geom = GeomFlatViolinTwoSided, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list(trim = trim, scale = scale, ...)) } #' @rdname ggplot2-ggproto #' @format NULL #' @usage NULL #' @export GeomFlatViolinTwoSided <- ggproto("GeomFlatViolin", Geom, setup_data = function(data, params) { data$width <- data$width %||% params$width %||% (resolution(data$x, FALSE) * 0.9) # ymin, ymax, xmin, and xmax define the bounding rectangle for each group data %>% group_by(group) %>% mutate(ymin = min(y), ymax = max(y), xmin = x, xmax = x - width / 2) %>% mutate(xmax = ifelse((group %% 2) == 0, x + width / 2, xmax) ) }, draw_group = function(data, panel_scales, coord) { # Find the points for the line to go all the way around data <- transform(data, xminv = x, xmaxv = x + violinwidth * (xmax - x)) # Make sure it's sorted properly to draw the outline newdata <- rbind(plyr::arrange(transform(data, x = xminv), y), plyr::arrange(transform(data, x = xmaxv), -y)) # Close the polygon: set first and last point the same # Needed for coord_polar and such newdata <- rbind(newdata, newdata[1,]) ggplot2:::ggname("geom_flat_violin", GeomPolygon$draw_panel(newdata, panel_scales, coord)) }, draw_key = draw_key_polygon, default_aes = aes(weight = 1, colour = "grey20", fill = "white", size = 0.5, alpha = NA, linetype = "solid"), required_aes = c("x", "y") )
Now load the data:
loadd(rna_kr_data)
rna_kr_data
is made automatically when drake::r_make()
command is run. It is a dataframe containing RNA data from three replicates of Oxford Nanopore Direct RNA sequencing experiments. The first two replicates were obtained using RNA-SQK001 kit with reverse transcription, while the third replicate was obtained using RNA-SQK002 sequencing kit and omitting the reverse transcription step during library preparation.
For each read, rna_kr_data
dataframe contains:
Output of tailfindr
Output of Nanopolish
Barcode label of the read, and
Location of the poly(A) end adjacent to eGFP transcript found by alignment of the expected eGFP sequence.
The raw CSV files from which rna_kr_data
was made are present in the extdata/
directory. The code used to consolidate all these disparate pieces of information into a single data frame (rna_kr_data
) is present in the Analyses/
and R/
directories.
Here is a description of columns:
col_names_df <- data.frame(Columns = c("read_id", "barcode", "replicate", "file_path", "tail_start_tf", "tail_end_tf", "samples_per_nt_tf", "tail_length_tf", "tail_start_np", "tail_end_np", "read_rate_np", "tail_length_np", "qc_tag_np", "samples_per_nt_np","transcript_alignment_start","kit"), Description = c("Unique read ID generated by MinKNOW", "Barcode assigning the expected poly(A) tail length", "Replicate number", "Full file path (not relevant)", "tailfindr's estimate of poly(A) start site", "tailfindr's estimate of poly(A) end site", "tailfindr's estimate of read-specific nucleotide translocation rate (samples per nucleotide)", "tailfindr's estimate of poly(A) tail length", "Nanopolish estimate of poly(A) start site", "Nanopolish estimate of poly(A) end site", "Nanopolish read rate", "Nanopolish estimate of poly(A) tail length", "Nanopolish QC tag", "Nanopolish estimate of read-specific nucleotide translocation rate (samples per nucleotide) calculated by 3012/read_rate", "Location of poly(A) end as detected by eGFP sequence alignment", 'ONT Library prep kit used'))
knitr::kable(col_names_df)
Create a dataset in which tail lengths are capped to 300 nt.
rna_kr_data_capped <- rna_kr_data %>% dplyr::mutate(tail_length_tf = ifelse(tail_length_tf >= 300, 300, tail_length_tf), tail_length_np = ifelse(tail_length_np >= 300, 300, tail_length_np))
p <- ggplot(data = rna_kr_data_capped, aes(x = barcode, y = tail_length_tf, fill = barcode)) + geom_one_sided_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .5) + facet_grid(~barcode, scales = 'free') + geom_hline(aes(yintercept=as.numeric(as.character(barcode))))
p <- p + theme(panel.background = element_blank(), panel.grid.minor = ggplot2::element_line(colour="grey", size=0.5), panel.grid.major = ggplot2::element_line(colour="grey", size=0.5), panel.grid.major.x = element_blank(), axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.spacing = unit(1.5, "lines"), legend.position = 'none') + scale_y_continuous(minor_breaks = seq(0, 300, 25)) + ylab('tailfindr tail length [nt]') + labs(caption = 'The black horizontal lines represent the expected poly(A) tail length.') p
if (save_figures) { RSvgDevice::devSVG(here('figures', 'rna_kr_tf_tail_length_all_bc_all_reps_density_plot.svg'), width = 5, height = 3.5, onefile = TRUE, xmlHeader = TRUE) print(p) dev.off() }
# define the function for computing standard error std_err <- function(x) sd(x, na.rm = TRUE)/sqrt(length(x)) # summarize the data and display a table summary_data <- rna_kr_data %>% group_by(barcode) %>% summarise(read_count = n(), mean = mean(tail_length_tf, na.rm = TRUE), median = median(tail_length_tf, na.rm = TRUE), std_dev = sd(tail_length_tf, na.rm = TRUE), std_err = std_err(tail_length_tf)) summary_data %<>% mutate(cof_var = std_dev/mean) kable(summary_data)
p <- ggplot(data = rna_kr_data_capped, aes(x = barcode, y = tail_length_tf, color = kit, fill = kit)) + geom_two_sided_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .5) + facet_grid(~barcode, scales = 'free') + geom_hline(aes(yintercept=as.numeric(as.character(barcode))))
lengend_name <- 'Library prep kit' legend_labels <- c('SQK-RNA001 +RT', 'SQK-RNA002 -RT') p <- p + theme(panel.background = element_blank(), panel.grid.minor = ggplot2::element_line(colour="grey", size=0.5), panel.grid.major = ggplot2::element_line(colour="grey", size=0.5), panel.grid.major.x = element_blank(), axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.spacing = unit(1.5, "lines"), legend.direction = "horizontal", legend.position = "top") + scale_color_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) + scale_fill_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) + scale_y_continuous(minor_breaks = seq(0, 300, 25)) + ylab('tailfindr tail length (nt(') + labs(caption = 'tailfindr tail length estimate is indepedent of sequencing kit used and whether or not reverse transcription step was performed. The black horizontal lines represent the expected poly(A) tail length.') p
if (save_figures) { RSvgDevice::devSVG(here('figures', 'rna_kr_tail_length_rna001_vs_rna002_density_plot.svg'), width = 5, height = 3.5, onefile = TRUE, xmlHeader = TRUE) print(p) dev.off() }
# define the function for computing standard error std_err <- function(x) sd(x, na.rm = TRUE)/sqrt(length(x)) # summarize the data and display a table summary_data <- rna_kr_data %>% group_by(barcode, kit) %>% summarise(read_count = n(), mean = mean(tail_length_tf, na.rm = TRUE), median = median(tail_length_tf, na.rm = TRUE), std_dev = sd(tail_length_tf, na.rm = TRUE), std_err = std_err(tail_length_tf)) summary_data %<>% mutate(cof_var = std_dev/mean) kable(summary_data)
The first two replicates (both obtained with SQK-RNA001) are loaded from complete dataset.
rna001_data <- rna_kr_data_capped %>% filter(replicate == 1 | replicate == 2)
p <- ggplot(rna001_data, aes(x = barcode, y = tail_length_tf, color = replicate, fill = replicate)) + geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) + facet_grid(~barcode, scales = 'free') + geom_hline(aes(yintercept = as.numeric(as.character(barcode))))
lengend_name <- 'Replicate' legend_labels <- c('SQK-RNA001 Replicate 1', 'SQK-RNA001 Replicate 2') p <- p + theme(panel.background = element_blank(), panel.grid.minor = ggplot2::element_line(colour="grey", size=0.5), panel.grid.major = ggplot2::element_line(colour="grey", size=0.5), panel.grid.major.x = element_blank(), axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.spacing = unit(1.5, "lines"), legend.direction = "horizontal", legend.position = "top") + scale_color_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) + scale_fill_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) + ggplot2::scale_y_continuous(minor_breaks = seq(0, 300, 25)) + ylab('tailfindr tail length [nt]') + labs(caption = 'tailfindr tail length estimate is robust across technical replicates. The black horizontal lines represent the expected poly(A) tail length.', fill = 'Replicate') p
if (save_figures) { RSvgDevice::devSVG(here('figures', 'rna_kr_tail_length_rna001_rep1_vs_rna001_rep2_density_plot.svg'), width = 5, height = 3.5, onefile = TRUE, xmlHeader = TRUE) print(p) dev.off() }
summary_data <- rna001_data %>% group_by(barcode, replicate) %>% summarise(read_count = n(), mean = mean(tail_length_tf, na.rm = TRUE), median = median(tail_length_tf, na.rm = TRUE), std_dev = sd(tail_length_tf, na.rm = TRUE), std_err = std_err(tail_length_tf)) summary_data %<>% mutate(cof_var = std_dev/mean) kable(summary_data)
# make a long data frame tf_vs_np_data <- rna_kr_data_capped %>% select(barcode, tail_length_tf, tail_length_np) %>% rename(tailfindr = tail_length_tf, nanopolish = tail_length_np) %>% gather(key = 'tool', value = 'tail_length', tailfindr, nanopolish) %>% mutate(tool = fct_relevel(tool, c('tailfindr', 'nanopolish')))
p <- ggplot(tf_vs_np_data, aes(x = barcode, y = tail_length, color = tool, fill = tool)) + geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) + facet_grid(~barcode, scales = 'free') + geom_hline(aes(yintercept = as.numeric(as.character(barcode))))
lengend_name <- 'Tool' legend_labels <- c('tailfindr', 'Nanopolish') p <- p + theme(panel.background = element_blank(), panel.grid.minor = ggplot2::element_line(colour = "grey", size = 0.3), panel.grid.major = ggplot2::element_line(colour = "grey", size = 0.4), panel.grid.major.x = element_blank(), axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.spacing = unit(1.5, "lines"), legend.direction = "horizontal", legend.position = "top") + scale_color_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) + scale_fill_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) + ggplot2::scale_y_continuous(minor_breaks = seq(0, 300, 25)) + ylab('tail length (nt)') + labs(caption = 'The black horizontal lines represent the expected poly(A) tail length.', fill = 'Tool') p
if (save_figures) { RSvgDevice::devSVG(here('figures', 'rna_kr_tail_length_tf_vs_np_all_barcodes_density.svg'), width = 5, height = 3.5, onefile = TRUE, xmlHeader = TRUE) print(p) dev.off() }
summary_data <- tf_vs_np_data %>% group_by(barcode, tool) %>% summarise(read_count = n(), mean = mean(tail_length, na.rm = TRUE), median = median(tail_length, na.rm = TRUE), std_dev = sd(tail_length, na.rm = TRUE), std_err = std_err(tail_length)) summary_data %<>% mutate(cof_var = std_dev/mean) kable(summary_data)
# subset data tf_vs_np_data_two_barcode <- tf_vs_np_data %>% filter(barcode == 40 | barcode == 150) p <- ggplot(data = tf_vs_np_data_two_barcode, aes(x = barcode, y = tail_length, color = tool, fill = tool)) + geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) + facet_grid(~barcode, scales = 'free') + geom_hline(aes(yintercept = as.numeric(as.character(barcode))))
lengend_name <- 'Tool' legend_labels <- c('Nanopolish', 'tailfindr') p <- p + theme(panel.background = element_blank(), panel.grid.minor = ggplot2::element_line(colour = "grey", size = 0.5), panel.grid.major = ggplot2::element_line(colour = "grey", size = 0.5), panel.grid.major.x = element_blank(), axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.spacing = unit(3, "lines"), legend.direction = "horizontal", legend.position = "top") + scale_color_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) + scale_fill_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) + ggplot2::scale_y_continuous(minor_breaks = seq(0, 300, 25)) + ylab('tail length (nt)') + labs(caption = 'The black horizontal lines represent the expected poly(A) tail length.', fill = 'Tool') p
if (save_figures) { RSvgDevice::devSVG(here('figures', 'rna_kr_tail_length_tf_vs_np_two_barcodes_density.svg'), width = 4, height = 3, onefile = TRUE, xmlHeader = TRUE) print(p) dev.off() }
To test whether poly(A) end as defined by tailfindr and the sequence end of eGFP as defined by sequence alignment match, a scatter plot is generated:
p <- ggplot(rna_kr_data, aes(x = tail_end_tf, y = transcript_alignment_start)) + geom_point(shape = 21, colour = 'black', fill = 'black', size = 2, stroke=0, alpha = 0.1) + geom_abline(intercept = 0, slope = 1, color="red", linetype = 'dotted', size = 0.7) + geom_smooth(method = 'lm', formula = y~x, color="#797979", fullrange = TRUE, se = FALSE, size = 0.5) + stat_cor(method = "pearson", label.x = 10, label.y = 52000)
p <- p + theme_bw() + theme( axis.line =element_line(colour = "black"), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank() ) + xlab('tailfindr tail end') + ylab('Tail end defined by eGFP alignment') + labs(caption = 'The red dotted line represents x = y. The grey line represents the linear model fit.') + coord_fixed() p
if (save_figures) { ggsave(plot = p, filename = here('figures', 'rna_kr_tf_tail_end_vs_egfp_end_scatter_plot.png'), bg = "transparent", width = 5, height = 5, units = 'in') }
To better visualize the precision of end definition, a histogram of the difference between tailfindr poly(A) end and the sequence alignment position is generated:
data <- mutate(rna_kr_data, diff = tail_end_tf - transcript_alignment_start) p <- ggplot(data, aes(x = diff)) + geom_histogram(binwidth = 1, fill = "grey50", size = 0, alpha = 0.6)
p <- p + theme_bw() + coord_cartesian(xlim = c(-500, 500)) + scale_x_continuous(minor_breaks = seq(-500, 500, 50)) + xlab('tailfindr tail end - sequence end defined by eGFP alignment') + ylab('Read count') p
if (save_figures) { ggsave(plot = p, filename = here('figures', 'rna_kr_tf_tail_end_vs_egfp_end_scatter_histogram.png'), bg = "transparent", width = 6.5, height = 5, units = 'in') }
Let us generate a scatter plot of tailfindr and Nanopolish poly(A) tail start estimates:
p <- ggplot(rna_kr_data, aes(x = tail_length_tf, y = tail_length_np)) + geom_point(shape = 21, colour = 'black', fill = 'black', size = 2, stroke = 0, alpha = 0.1) + geom_abline(intercept = 0, slope = 1, color = "red", linetype = 'dotted', size = 0.7) + geom_smooth(method = 'lm', formula = y~x, color="#797979", fullrange = TRUE, se = FALSE, size = 0.5) + stat_cor(method = "pearson", label.x = 10, label.y = 580)
p <- p + theme_bw() + theme( axis.line = element_line(colour = "black"), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank(), aspect.ratio = 1 ) + scale_y_continuous(breaks = seq(0, 600, 100),minor_breaks=NULL) + scale_x_continuous(breaks = seq(0, 600, 100),minor_breaks=NULL) + xlab('tailfindr tail length') + ylab('Nanopolish tail length') + labs(caption = 'The red dotted line represents x = y. The grey line represents the linear model fit.') p
if (save_figures) { ggsave(plot = p, filename = here('figures', 'rna_kr_tf_vs_np_tail_length.png'), bg = "transparent", width = 10, height = 6, units = 'in') }
To better visualize the overlap in poly(A) start detection, a histogram of the difference between Nanopolish and tailfindr poly(A) start is generated:
data <- mutate(rna_kr_data, diff = tail_length_np - tail_length_tf) p <- ggplot(data, aes(x = diff)) + geom_histogram(binwidth = 1, fill = "grey50", size = 0, alpha=0.6) + theme_bw() + coord_cartesian(xlim = c(-250, 250)) + scale_x_continuous(minor_breaks = seq(-250, 250, 50)) + xlab('Nanopolish tail length - tailfindr tail length') + ylab('Read count') p
Let us generate a scatter plot of tailfindr and Nanopolish poly(A) tail start estimates:
p <- ggplot(rna_kr_data, aes(x = tail_start_tf, y = tail_start_np)) + geom_point(shape = 21, colour = 'black', fill = 'black', size = 2, stroke = 0, alpha = 0.1) + geom_abline(intercept = 0, slope = 1, color = "red", linetype = 'dotted', size = 0.7) + geom_smooth(method = 'lm', formula = y~x, color="#797979", fullrange = TRUE, se = FALSE, size = 0.5) + stat_cor(method = "pearson", label.x = 10, label.y = 52000)
p <- p + theme_bw() + theme( axis.line = element_line(colour = "black"), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank() ) + xlab('tailfindr tail start') + ylab('Nanopolish tail start') + labs(caption = 'The red dotted line represents x = y. The grey line represents the linear model fit.') + coord_fixed() p
if (save_figures) { ggsave(plot = p, filename = here('figures', 'rna_kr_tf_vs_np_tail_tail_start_scatter_plot.png'), bg = "transparent", width = 10, height = 6, units = 'in') }
To better visualize the overlap in poly(A) start detection, a histogram of the difference between Nanopolish and tailfindr poly(A) start is generated:
data <- mutate(rna_kr_data, diff = tail_start_np - tail_start_tf) p <- ggplot(data, aes(x = diff)) + geom_histogram(binwidth = 1, fill = "grey50", size = 0, alpha=0.6) + theme_bw() + coord_cartesian(xlim = c(-250, 250)) + scale_x_continuous(minor_breaks = seq(-250, 250, 50)) + xlab('Nanopolish tail start - tailfindr tail start') + ylab('Read count') p
Let us generate a scatter plot of tailfindr and Nanopolish poly(A) tail end estimates:
p <- ggplot(rna_kr_data, aes(x = tail_end_tf, y = tail_end_np)) + geom_point(shape = 21, colour = 'black', fill = 'black', size = 2, stroke = 0, alpha = 0.1) + geom_abline(intercept = 0, slope = 1, color = "red", linetype = 'dotted', size = 0.7) + geom_smooth(method = 'lm', formula = y~x, color = "#797979", fullrange = TRUE, se = FALSE, size = 0.5) + stat_cor(method = "pearson", label.x = 10, label.y = 52000)
p <- p + theme_bw() + theme( axis.line = element_line(colour = "black"), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank() ) + xlab('tailfindr tail end') + ylab('Nanopolish tail end') + labs(caption = 'The red dotted line represents x = y. The grey line represents the linear model fit.') + coord_fixed() p
if (save_figures) { ggsave(plot = p, filename = here('figures', 'rna_kr_tf_vs_np_tail_end_scatter_plot.png'), bg = "transparent", width = 10, height = 6, units = 'in') }
To better visualize the overlap in poly(A) start detection, a histogram of the difference between Nanopolish and tailfindr poly(A) start is generated:
data <- mutate(rna_kr_data, diff = tail_end_np - tail_end_tf) p <- ggplot(data, aes(x = diff)) + geom_histogram(binwidth = 1, fill = "grey50", size = 0, alpha=0.6) + theme_bw() + coord_cartesian(xlim = c(-250, 250)) + scale_x_continuous(minor_breaks = seq(-250, 250, 50)) + xlab('Nanopolish tail end - tailfindr tail end') + ylab('Read count') p
p <- ggplot(rna_kr_data, aes(x = samples_per_nt_tf, y = samples_per_nt_np)) + geom_point(shape = 21, colour = 'black', fill = 'black', size = 1.2, stroke = 0, alpha = 0.1) + geom_abline(intercept = 0, slope = 1, color="red", linetype = 'dotted', size = 0.7) + geom_smooth(method = 'lm', formula = y~x, color = "#797979", fullrange = TRUE, se = FALSE, size = 0.5) + stat_cor(method = "pearson", label.x = 21, label.y = 92)
p <- p + theme_bw() + theme( axis.line = element_line(colour = "black"), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank() ) + xlab('tailfindr normalizer (Samples/nt)') + ylab('Nanopolish normalizer (Samples/nt)') + labs(caption = 'The red dotted line represents x = y. The grey line represents the linear model fit.') + coord_fixed(ratio = 1, xlim = c(20, 90), ylim = c(20, 90)) p
if (save_figures) { ggsave(plot = p, filename = here('figures', 'rna_kr_tf_vs_np_normalizer.png'), bg = "transparent", width = 10, height = 6, units = 'in') }
To define poly(A) segments from signal data, we defined a threshold of 0.3 as the lower limit of the poly(A) level expected after squiggle data z-normalisation. We chose this threshold, as the expected mean signal level for homopolymer A stretches in RNA is well above zero (see below). To make the threshold robust against data variation, we further chose the threshold to not be violated even when substracting two standard deviations of the expected mean signal. The results of expected signal after normalisation and with standard deviation substraction can be found below.
model_180mv <- here('data', 'r9.4_180mv_70bps_5mer_RNA_template_median69pA.model') df_180mv <- read.csv(model_180mv, header = TRUE, stringsAsFactors = FALSE, sep = '\t') df_180mv <- as.tibble(df_180mv) # sample 1000 numbers from a Gaussian distribution for each kmer mu and sd set.seed(5) df_180mv %<>% select(kmer, level_mean, level_stdv) %>% rowwise() %>% mutate(sampled_numbers = list(rnorm(n = 1000, level_mean, level_stdv))) sampled_numbers <- purrr::flatten(df_180mv$sampled_numbers) sampled_numbers <- unlist(sampled_numbers, recursive = FALSE) # Calculate mean and sd of AAAAA state, and all the kmers combined mean_AAAAA <- mean(unlist(df_180mv$sampled_numbers[1])) sd_AAAAA <- sd(unlist(df_180mv$sampled_numbers[1])) mean_all_kmers <- mean(sampled_numbers) sd_all_kmers <- sd(sampled_numbers) paste0("Mean expected AAAAA level after normalisation: ", (mean_AAAAA - mean_all_kmers) / sd_all_kmers) minimum_expected_normalized_AAAAA_level <- ((mean_AAAAA - 2 * sd_AAAAA) - mean_all_kmers) / sd_all_kmers paste0("Minimum expected normalized AAAAA level: ", minimum_expected_normalized_AAAAA_level)
p <- ggplot(rna_kr_data, aes(x = samples_per_nt_tf, color = replicate)) + geom_line(stat = 'density', size = 0.7, position = 'identity')
lengend_name <- 'Replicates ' legend_labels <- c('Replicate 1 SQK-RNA001 +RT', 'Replicate 2 SQK-RNA001 +RT', 'Replicate 3 SQK-RNA002 -RT') p <- p + theme(panel.background = element_blank(), panel.grid.minor = ggplot2::element_line(colour="grey", size=0.5), panel.grid.major = ggplot2::element_line(colour="grey", size=0.5), panel.grid.minor.y = element_blank(), panel.grid.major.y = element_blank(), legend.direction = "vertical", legend.position = "top") + scale_color_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) + coord_cartesian(xlim = c(20, 60)) + ylab('Density') + xlab('Translocation rate (Samples/nucleotide)') + labs(caption = 'RNA without reverse transcription transloactes slightly slower than RNA with reverse transcription.', fill = 'Condition') p
if (save_figures) { RSvgDevice::devSVG(here('figures', 'rna_kr_tail_all_three_replicates_normalizer.svg'), width = 10, height = 3.5, onefile = TRUE, xmlHeader = TRUE) print(p) dev.off() }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.