This document contains all the analyses done on RNA data that we generated for the tailfindr paper. Knit this document after you have successfully run drake::r_make().

knitr::opts_chunk$set(echo = TRUE)
save_figures <- FALSE

Load the required libraries first:

pacman::p_load(dplyr, magrittr, ggplot2, drake, knitr, ggpubr, here, tidyverse)
# functions for plotting double single and double-sided violins
"%||%" <- function(a, b) {
  if (!is.null(a)) a else b
}
geom_one_sided_flat_violin <- function(mapping = NULL, data = NULL, stat = "ydensity",
                                       position = "dodge", trim = TRUE, scale = "area",
                                       show.legend = NA, inherit.aes = TRUE, ...) {
  layer(data = data, mapping = mapping, stat = stat,geom = GeomFlatViolinOneSided,
        position = position, show.legend = show.legend, inherit.aes = inherit.aes,
        params = list(trim = trim, scale = scale, ...))
}

#' @rdname ggplot2-ggproto
#' @format NULL
#' @usage NULL
#' @export
GeomFlatViolinOneSided <-
  ggproto("GeomFlatViolin", Geom,
          setup_data = function(data, params) {
            data$width <- data$width %||%
              params$width %||% (resolution(data$x, FALSE) * 0.9)
            # ymin, ymax, xmin, and xmax define the bounding rectangle for each group
            data %>%
              group_by(group) %>%
              mutate(ymin = min(y),
                     ymax = max(y),
                     xmin = x,
                     xmax = x - width / 2) 
          },
  draw_group = function(data, panel_scales, coord) {
    # Find the points for the line to go all the way around
    data <- transform(data, xminv = x,
                      xmaxv = x + violinwidth * (xmax - x))
    # Make sure it's sorted properly to draw the outline
    newdata <- rbind(plyr::arrange(transform(data, x = xminv), y),
                     plyr::arrange(transform(data, x = xmaxv), -y))
    # Close the polygon: set first and last point the same
    # Needed for coord_polar and such
    newdata <- rbind(newdata, newdata[1,])
    ggplot2:::ggname("geom_flat_violin", GeomPolygon$draw_panel(newdata, panel_scales, coord))
  },
  draw_key = draw_key_polygon,
  default_aes = aes(weight = 1, colour = "grey20", fill = "white", size = 0.5,
                    alpha = NA, linetype = "solid"),

  required_aes = c("x", "y")
  )

geom_two_sided_flat_violin <- function(mapping = NULL, data = NULL, stat = "ydensity",
                                          position = "dodge", trim = TRUE, scale = "area",
                                          show.legend = NA, inherit.aes = TRUE, ...) {
  layer(data = data, mapping = mapping, stat = stat, geom = GeomFlatViolinTwoSided,
        position = position, show.legend = show.legend, inherit.aes = inherit.aes,
        params = list(trim = trim, scale = scale, ...))
}

#' @rdname ggplot2-ggproto
#' @format NULL
#' @usage NULL
#' @export
GeomFlatViolinTwoSided <-
  ggproto("GeomFlatViolin", Geom,
          setup_data = function(data, params) {
            data$width <- data$width %||%
              params$width %||% (resolution(data$x, FALSE) * 0.9)
            # ymin, ymax, xmin, and xmax define the bounding rectangle for each group
            data %>%
              group_by(group) %>%
              mutate(ymin = min(y),
                     ymax = max(y),
                     xmin = x,
                     xmax = x - width / 2) %>% 
              mutate(xmax = ifelse((group %% 2) == 0, x + width / 2, xmax) )
          },
  draw_group = function(data, panel_scales, coord) {
    # Find the points for the line to go all the way around
    data <- transform(data, xminv = x,
                      xmaxv = x + violinwidth * (xmax - x))
    # Make sure it's sorted properly to draw the outline
    newdata <- rbind(plyr::arrange(transform(data, x = xminv), y),
                     plyr::arrange(transform(data, x = xmaxv), -y))
    # Close the polygon: set first and last point the same
    # Needed for coord_polar and such
    newdata <- rbind(newdata, newdata[1,])
    ggplot2:::ggname("geom_flat_violin", GeomPolygon$draw_panel(newdata, panel_scales, coord))
  },
  draw_key = draw_key_polygon,
  default_aes = aes(weight = 1, colour = "grey20", fill = "white", size = 0.5,
                    alpha = NA, linetype = "solid"),
  required_aes = c("x", "y")
  )

Now load the data:

loadd(rna_kr_data)

About the data

rna_kr_data is made automatically when drake::r_make() command is run. It is a dataframe containing RNA data from three replicates of Oxford Nanopore Direct RNA sequencing experiments. The first two replicates were obtained using RNA-SQK001 kit with reverse transcription, while the third replicate was obtained using RNA-SQK002 sequencing kit and omitting the reverse transcription step during library preparation.

For each read, rna_kr_data dataframe contains:

  1. Output of tailfindr

  2. Output of Nanopolish

  3. Barcode label of the read, and

  4. Location of the poly(A) end adjacent to eGFP transcript found by alignment of the expected eGFP sequence.

The raw CSV files from which rna_kr_data was made are present in the extdata/ directory. The code used to consolidate all these disparate pieces of information into a single data frame (rna_kr_data) is present in the Analyses/ and R/ directories.

Here is a description of columns:

col_names_df <- data.frame(Columns = c("read_id", "barcode", "replicate", "file_path", "tail_start_tf", "tail_end_tf", "samples_per_nt_tf", "tail_length_tf", "tail_start_np", "tail_end_np", "read_rate_np", "tail_length_np", "qc_tag_np", "samples_per_nt_np","transcript_alignment_start","kit"), Description = c("Unique read ID generated by MinKNOW",
"Barcode assigning the expected poly(A) tail length", "Replicate number", "Full file path (not relevant)", "tailfindr's estimate of poly(A) start site",  "tailfindr's estimate of poly(A) end site",
"tailfindr's estimate of read-specific nucleotide translocation rate (samples per nucleotide)",  "tailfindr's estimate of poly(A) tail length",  "Nanopolish estimate of poly(A) start site", "Nanopolish estimate of poly(A) end site", "Nanopolish read rate", "Nanopolish estimate of poly(A) tail length", "Nanopolish QC tag", "Nanopolish estimate of read-specific nucleotide translocation rate (samples per nucleotide) calculated by 3012/read_rate", "Location of poly(A) end as detected by eGFP sequence alignment", 'ONT Library prep kit used'))
knitr::kable(col_names_df)

Tail length across all three replicates

Create a dataset in which tail lengths are capped to 300 nt.

rna_kr_data_capped <- rna_kr_data %>% 
  dplyr::mutate(tail_length_tf = ifelse(tail_length_tf >= 300, 300, tail_length_tf),
                tail_length_np = ifelse(tail_length_np >= 300, 300, tail_length_np))

Tail length densities

p <- ggplot(data = rna_kr_data_capped, aes(x = barcode, y = tail_length_tf, 
                                           fill = barcode)) +
  geom_one_sided_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .5) +
  facet_grid(~barcode, scales = 'free') +
  geom_hline(aes(yintercept=as.numeric(as.character(barcode)))) 
p <- p + theme(panel.background = element_blank(),
        panel.grid.minor = ggplot2::element_line(colour="grey", size=0.5),
        panel.grid.major = ggplot2::element_line(colour="grey", size=0.5),
        panel.grid.major.x = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.spacing = unit(1.5, "lines"),
        legend.position = 'none') +
  scale_y_continuous(minor_breaks = seq(0, 300, 25)) +
  ylab('tailfindr tail length [nt]') +
  labs(caption = 'The black horizontal lines represent the expected poly(A) tail length.')
p
if (save_figures) {
  RSvgDevice::devSVG(here('figures', 'rna_kr_tf_tail_length_all_bc_all_reps_density_plot.svg'), 
                     width = 5, height = 3.5, onefile = TRUE, xmlHeader = TRUE)
  print(p)
  dev.off()
}

Tail length statistics

# define the function for computing standard error 
std_err <- function(x) sd(x, na.rm = TRUE)/sqrt(length(x))

# summarize the data and display a table
summary_data <- rna_kr_data %>% group_by(barcode) %>% 
  summarise(read_count = n(),
            mean = mean(tail_length_tf, na.rm = TRUE),
            median = median(tail_length_tf, na.rm = TRUE),
            std_dev = sd(tail_length_tf, na.rm = TRUE),
            std_err = std_err(tail_length_tf)) 
summary_data %<>% mutate(cof_var = std_dev/mean)
kable(summary_data)

Comparing ONT Library prep conditions: SQK-RNA001 vs. SQK-RNA002

Tail length densities

p <- ggplot(data = rna_kr_data_capped, aes(x = barcode, y = tail_length_tf, 
                                           color = kit, fill = kit)) +
  geom_two_sided_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .5) +
  facet_grid(~barcode, scales = 'free') +
  geom_hline(aes(yintercept=as.numeric(as.character(barcode)))) 
lengend_name <- 'Library prep kit'
legend_labels <- c('SQK-RNA001 +RT', 
                   'SQK-RNA002 -RT')
p <- p + theme(panel.background = element_blank(),
        panel.grid.minor = ggplot2::element_line(colour="grey", size=0.5),
        panel.grid.major = ggplot2::element_line(colour="grey", size=0.5),
        panel.grid.major.x = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.spacing = unit(1.5, "lines"),
        legend.direction = "horizontal",
        legend.position = "top") +
  scale_color_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) +
  scale_fill_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) +
  scale_y_continuous(minor_breaks = seq(0, 300, 25)) +
  ylab('tailfindr tail length (nt(') +
  labs(caption = 'tailfindr tail length estimate is indepedent of sequencing kit used 
       and whether or not reverse transcription step was performed. 
       The black horizontal lines represent the expected poly(A) tail length.')
p
if (save_figures) {
  RSvgDevice::devSVG(here('figures', 'rna_kr_tail_length_rna001_vs_rna002_density_plot.svg'), 
                     width = 5, height = 3.5, onefile = TRUE, xmlHeader = TRUE)
  print(p)
  dev.off()
}

Tail length statistics

# define the function for computing standard error 
std_err <- function(x) sd(x, na.rm = TRUE)/sqrt(length(x))

# summarize the data and display a table
summary_data <- rna_kr_data %>% group_by(barcode, kit) %>% 
  summarise(read_count = n(),
            mean = mean(tail_length_tf, na.rm = TRUE),
            median = median(tail_length_tf, na.rm = TRUE),
            std_dev = sd(tail_length_tf, na.rm = TRUE),
            std_err = std_err(tail_length_tf)) 
summary_data %<>% mutate(cof_var = std_dev/mean)
kable(summary_data)

Comparing technical replicates: Replicate 1 vs Replicate 2 (both SQK-RNA001)

The first two replicates (both obtained with SQK-RNA001) are loaded from complete dataset.

rna001_data <- rna_kr_data_capped %>% filter(replicate == 1 | replicate == 2)

Tail length densities

p <- ggplot(rna001_data, aes(x = barcode, y = tail_length_tf, 
                             color = replicate, fill = replicate)) +
  geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) +
  facet_grid(~barcode, scales = 'free') +
  geom_hline(aes(yintercept = as.numeric(as.character(barcode)))) 
lengend_name <- 'Replicate'
legend_labels <- c('SQK-RNA001 Replicate 1', 
                   'SQK-RNA001 Replicate 2')
p <- p + theme(panel.background = element_blank(),
        panel.grid.minor = ggplot2::element_line(colour="grey", size=0.5),
        panel.grid.major = ggplot2::element_line(colour="grey", size=0.5),
        panel.grid.major.x = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.spacing = unit(1.5, "lines"),
        legend.direction = "horizontal",
        legend.position = "top") +
  scale_color_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) +
  scale_fill_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) +
  ggplot2::scale_y_continuous(minor_breaks = seq(0, 300, 25)) +
  ylab('tailfindr tail length [nt]') +
  labs(caption = 'tailfindr tail length estimate is robust across technical replicates.
       The black horizontal lines represent the expected poly(A) tail length.',
       fill = 'Replicate')
p
if (save_figures) {
  RSvgDevice::devSVG(here('figures', 'rna_kr_tail_length_rna001_rep1_vs_rna001_rep2_density_plot.svg'), 
                     width = 5, height = 3.5, onefile = TRUE, xmlHeader = TRUE)
  print(p)
  dev.off()
}

Tail length summaries

summary_data <- rna001_data %>% group_by(barcode, replicate) %>% 
  summarise(read_count = n(),
            mean = mean(tail_length_tf, na.rm = TRUE),
            median = median(tail_length_tf, na.rm = TRUE),
            std_dev = sd(tail_length_tf, na.rm = TRUE),
            std_err = std_err(tail_length_tf)) 
summary_data %<>% mutate(cof_var = std_dev/mean)
kable(summary_data)

tailfindr estimations vs. Nanopolish estimation

Density plot (all barcodes)

# make a long data frame
tf_vs_np_data <- rna_kr_data_capped %>% 
  select(barcode, tail_length_tf, tail_length_np) %>% 
  rename(tailfindr = tail_length_tf, nanopolish = tail_length_np) %>% 
  gather(key = 'tool', value = 'tail_length', tailfindr, nanopolish) %>% 
  mutate(tool = fct_relevel(tool, c('tailfindr', 'nanopolish')))
p <- ggplot(tf_vs_np_data, aes(x = barcode, y = tail_length, 
                               color = tool, fill = tool)) +
  geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) +
  facet_grid(~barcode, scales = 'free') +
  geom_hline(aes(yintercept = as.numeric(as.character(barcode)))) 
lengend_name <- 'Tool'
legend_labels <- c('tailfindr', 
                   'Nanopolish')
p <- p + theme(panel.background = element_blank(),
        panel.grid.minor = ggplot2::element_line(colour = "grey", size = 0.3),
        panel.grid.major = ggplot2::element_line(colour = "grey", size = 0.4),
        panel.grid.major.x = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.spacing = unit(1.5, "lines"),
        legend.direction = "horizontal",
        legend.position = "top") +
  scale_color_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) +
  scale_fill_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) +
  ggplot2::scale_y_continuous(minor_breaks = seq(0, 300, 25)) +
  ylab('tail length (nt)') +
  labs(caption = 'The black horizontal lines represent the expected poly(A) tail length.',
       fill = 'Tool')
p
if (save_figures) {
  RSvgDevice::devSVG(here('figures', 'rna_kr_tail_length_tf_vs_np_all_barcodes_density.svg'), 
                     width = 5, height = 3.5, onefile = TRUE, xmlHeader = TRUE)
  print(p)
  dev.off()
}

Tail length summaries

summary_data <- tf_vs_np_data %>% group_by(barcode, tool) %>% 
  summarise(read_count = n(),
            mean = mean(tail_length, na.rm = TRUE),
            median = median(tail_length, na.rm = TRUE),
            std_dev = sd(tail_length, na.rm = TRUE),
            std_err = std_err(tail_length)) 
summary_data %<>% mutate(cof_var = std_dev/mean)
kable(summary_data)

Density plot (just two barcodes)

# subset data
tf_vs_np_data_two_barcode <- tf_vs_np_data %>% 
  filter(barcode == 40 | barcode == 150) 

p <- ggplot(data = tf_vs_np_data_two_barcode, 
            aes(x = barcode, y = tail_length, color = tool, fill = tool)) +
  geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) +
  facet_grid(~barcode, scales = 'free') +
  geom_hline(aes(yintercept = as.numeric(as.character(barcode))))
lengend_name <- 'Tool'
legend_labels <- c('Nanopolish', 
                   'tailfindr')
p <- p + theme(panel.background = element_blank(),
        panel.grid.minor = ggplot2::element_line(colour = "grey", size = 0.5),
        panel.grid.major = ggplot2::element_line(colour = "grey", size = 0.5),
        panel.grid.major.x = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.spacing = unit(3, "lines"),
        legend.direction = "horizontal",
        legend.position = "top") +
  scale_color_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) +
  scale_fill_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) +
  ggplot2::scale_y_continuous(minor_breaks = seq(0, 300, 25)) +
  ylab('tail length (nt)') +
  labs(caption = 'The black horizontal lines represent the expected poly(A) tail length.',
       fill = 'Tool')
p
if (save_figures) {
  RSvgDevice::devSVG(here('figures', 'rna_kr_tail_length_tf_vs_np_two_barcodes_density.svg'), 
                     width = 4, height = 3, onefile = TRUE, xmlHeader = TRUE)
  print(p)
  dev.off()
}

tailfindr poly(A) end vs. sequence end by alignment of eGFP

To test whether poly(A) end as defined by tailfindr and the sequence end of eGFP as defined by sequence alignment match, a scatter plot is generated:

p <- ggplot(rna_kr_data, aes(x = tail_end_tf, y = transcript_alignment_start)) +
    geom_point(shape = 21, colour = 'black', fill = 'black', 
               size = 2, stroke=0, alpha = 0.1) +
    geom_abline(intercept = 0, slope = 1, color="red", 
                linetype = 'dotted', size = 0.7) +
    geom_smooth(method = 'lm', formula = y~x, 
                color="#797979", fullrange = TRUE, se = FALSE, size = 0.5) +
    stat_cor(method = "pearson", label.x = 10, label.y = 52000) 
p <- p + theme_bw() +
  theme(
    axis.line =element_line(colour = "black"),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank()
  ) +
  xlab('tailfindr tail end') +
  ylab('Tail end defined by eGFP alignment') + 
  labs(caption = 'The red dotted line represents x = y.
       The grey line represents the linear model fit.') +
  coord_fixed() 
p
if (save_figures) {
  ggsave(plot = p,  
         filename = here('figures', 'rna_kr_tf_tail_end_vs_egfp_end_scatter_plot.png'),
         bg = "transparent", width = 5, height = 5, units = 'in')
}

To better visualize the precision of end definition, a histogram of the difference between tailfindr poly(A) end and the sequence alignment position is generated:

data <- mutate(rna_kr_data, diff = tail_end_tf - transcript_alignment_start)
p <- ggplot(data, aes(x = diff)) + 
  geom_histogram(binwidth = 1, fill = "grey50", size = 0, alpha = 0.6) 
p <- p +
  theme_bw() +
  coord_cartesian(xlim = c(-500, 500)) +
  scale_x_continuous(minor_breaks = seq(-500, 500, 50)) +
  xlab('tailfindr tail end - sequence end defined by eGFP alignment') +
  ylab('Read count')
p
if (save_figures) {
  ggsave(plot = p,  
         filename = here('figures', 'rna_kr_tf_tail_end_vs_egfp_end_scatter_histogram.png'),
         bg = "transparent", width = 6.5, height = 5, units = 'in')
}

Nanopolish tail length estimate vs. tailfindr tail length estimate

Let us generate a scatter plot of tailfindr and Nanopolish poly(A) tail start estimates:

p <- ggplot(rna_kr_data, aes(x = tail_length_tf, y = tail_length_np)) +
    geom_point(shape = 21, colour = 'black', fill = 'black', 
               size = 2, stroke = 0, alpha = 0.1) +
    geom_abline(intercept = 0, slope = 1, color = "red", 
                linetype = 'dotted', size = 0.7) +
    geom_smooth(method = 'lm', formula = y~x, 
                color="#797979", fullrange = TRUE, se = FALSE, size = 0.5) +
    stat_cor(method = "pearson", label.x = 10, label.y = 580) 
p <- p + theme_bw() +
  theme(
    axis.line = element_line(colour = "black"),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    aspect.ratio = 1
  ) +
  scale_y_continuous(breaks = seq(0, 600, 100),minor_breaks=NULL) +
  scale_x_continuous(breaks = seq(0, 600, 100),minor_breaks=NULL) +
  xlab('tailfindr tail length') +
  ylab('Nanopolish tail length') + 
  labs(caption = 'The red dotted line represents x = y.
       The grey line represents the linear model fit.') 
p
if (save_figures) {
  ggsave(plot = p,  
         filename = here('figures', 'rna_kr_tf_vs_np_tail_length.png'), 
         bg = "transparent", width = 10, height = 6, units = 'in')
}

To better visualize the overlap in poly(A) start detection, a histogram of the difference between Nanopolish and tailfindr poly(A) start is generated:

data <- mutate(rna_kr_data, diff = tail_length_np - tail_length_tf)
p <- ggplot(data, aes(x = diff)) + 
  geom_histogram(binwidth = 1, fill = "grey50", size = 0, alpha=0.6) + 
  theme_bw() + 
  coord_cartesian(xlim = c(-250, 250)) +
  scale_x_continuous(minor_breaks = seq(-250, 250, 50)) +
  xlab('Nanopolish tail length - tailfindr tail length') +
  ylab('Read count')
p

Nanopolish tail start estimate vs. tailfindr tail start estimate

Let us generate a scatter plot of tailfindr and Nanopolish poly(A) tail start estimates:

p <- ggplot(rna_kr_data, aes(x = tail_start_tf, y = tail_start_np)) +
    geom_point(shape = 21, colour = 'black', fill = 'black', 
               size = 2, stroke = 0, alpha = 0.1) +
    geom_abline(intercept = 0, slope = 1, color = "red", 
                linetype = 'dotted', size = 0.7) +
    geom_smooth(method = 'lm', formula = y~x, 
                color="#797979", fullrange = TRUE, se = FALSE, size = 0.5) +
    stat_cor(method = "pearson", label.x = 10, label.y = 52000) 
p <- p + theme_bw() +
  theme(
    axis.line = element_line(colour = "black"),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank()
  ) +
  xlab('tailfindr tail start') +
  ylab('Nanopolish tail start') + 
  labs(caption = 'The red dotted line represents x = y.
       The grey line represents the linear model fit.') +
  coord_fixed()
p
if (save_figures) {
  ggsave(plot = p,  
         filename = here('figures', 'rna_kr_tf_vs_np_tail_tail_start_scatter_plot.png'), 
         bg = "transparent", width = 10, height = 6, units = 'in')
}

To better visualize the overlap in poly(A) start detection, a histogram of the difference between Nanopolish and tailfindr poly(A) start is generated:

data <- mutate(rna_kr_data, diff = tail_start_np - tail_start_tf)
p <- ggplot(data, aes(x = diff)) + 
  geom_histogram(binwidth = 1, fill = "grey50", size = 0, alpha=0.6) + 
  theme_bw() + 
  coord_cartesian(xlim = c(-250, 250)) +
  scale_x_continuous(minor_breaks = seq(-250, 250, 50)) +
  xlab('Nanopolish tail start - tailfindr tail start') +
  ylab('Read count')
p

Nanopolish tail end estimate vs. tailfindr tail end estimate

Let us generate a scatter plot of tailfindr and Nanopolish poly(A) tail end estimates:

p <- ggplot(rna_kr_data, aes(x = tail_end_tf, y = tail_end_np)) +
    geom_point(shape = 21, colour = 'black', fill = 'black', 
               size = 2, stroke = 0, alpha = 0.1) +
    geom_abline(intercept = 0, slope = 1, color = "red", 
                linetype = 'dotted', size = 0.7) +
    geom_smooth(method = 'lm', formula = y~x, 
                color = "#797979", fullrange = TRUE, se = FALSE, size = 0.5) +
    stat_cor(method = "pearson", label.x = 10, label.y = 52000) 
p <- p + theme_bw() +
  theme(
    axis.line = element_line(colour = "black"),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank()
  ) +
  xlab('tailfindr tail end') +
  ylab('Nanopolish tail end') + 
  labs(caption = 'The red dotted line represents x = y.
       The grey line represents the linear model fit.') +
  coord_fixed()
p
if (save_figures) {
  ggsave(plot = p,  
         filename = here('figures', 'rna_kr_tf_vs_np_tail_end_scatter_plot.png'), 
         bg = "transparent", width = 10, height = 6, units = 'in')
}

To better visualize the overlap in poly(A) start detection, a histogram of the difference between Nanopolish and tailfindr poly(A) start is generated:

data <- mutate(rna_kr_data, diff = tail_end_np - tail_end_tf)
p <- ggplot(data, aes(x = diff)) + 
  geom_histogram(binwidth = 1, fill = "grey50", size = 0, alpha=0.6) + 
  theme_bw() + 
  coord_cartesian(xlim = c(-250, 250)) +
  scale_x_continuous(minor_breaks = seq(-250, 250, 50)) +
  xlab('Nanopolish tail end - tailfindr tail end') +
  ylab('Read count')
p

Nanopolish read rate vs. tailfindr read rate

p <- ggplot(rna_kr_data, aes(x = samples_per_nt_tf, y = samples_per_nt_np)) +
    geom_point(shape = 21, colour = 'black', fill = 'black', 
               size = 1.2, stroke = 0, alpha = 0.1) +
    geom_abline(intercept = 0, slope = 1, color="red", 
                linetype = 'dotted', size = 0.7) +
    geom_smooth(method = 'lm', formula = y~x, 
                color = "#797979", fullrange = TRUE, se = FALSE, size = 0.5) +
    stat_cor(method = "pearson", label.x = 21, label.y = 92) 
p <- p + theme_bw() +
  theme(
    axis.line = element_line(colour = "black"),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank()
  ) +
  xlab('tailfindr normalizer (Samples/nt)') +
  ylab('Nanopolish normalizer (Samples/nt)') + 
  labs(caption = 'The red dotted line represents x = y.
       The grey line represents the linear model fit.') +
  coord_fixed(ratio = 1, xlim = c(20, 90), ylim = c(20, 90))
p
if (save_figures) {
  ggsave(plot = p,  
         filename = here('figures', 'rna_kr_tf_vs_np_normalizer.png'), 
         bg = "transparent", width = 10, height = 6, units = 'in')
}

Justification for RNA thresholds

To define poly(A) segments from signal data, we defined a threshold of 0.3 as the lower limit of the poly(A) level expected after squiggle data z-normalisation. We chose this threshold, as the expected mean signal level for homopolymer A stretches in RNA is well above zero (see below). To make the threshold robust against data variation, we further chose the threshold to not be violated even when substracting two standard deviations of the expected mean signal. The results of expected signal after normalisation and with standard deviation substraction can be found below.

model_180mv <- here('data', 'r9.4_180mv_70bps_5mer_RNA_template_median69pA.model')
df_180mv <- read.csv(model_180mv, header = TRUE, 
                     stringsAsFactors = FALSE, sep = '\t') 

df_180mv <- as.tibble(df_180mv)

# sample 1000 numbers from a Gaussian distribution for each kmer mu and sd
set.seed(5)
df_180mv %<>% select(kmer, level_mean, level_stdv) %>% 
  rowwise() %>% 
  mutate(sampled_numbers = list(rnorm(n = 1000, level_mean, level_stdv)))

sampled_numbers <- purrr::flatten(df_180mv$sampled_numbers)
sampled_numbers <- unlist(sampled_numbers, recursive = FALSE)

# Calculate mean and sd of AAAAA state, and all the kmers combined
mean_AAAAA <- mean(unlist(df_180mv$sampled_numbers[1])) 
sd_AAAAA <- sd(unlist(df_180mv$sampled_numbers[1])) 
mean_all_kmers <- mean(sampled_numbers) 
sd_all_kmers <- sd(sampled_numbers)

paste0("Mean expected AAAAA level after normalisation: ", 
      (mean_AAAAA - mean_all_kmers) / sd_all_kmers)

minimum_expected_normalized_AAAAA_level  <- 
  ((mean_AAAAA - 2 * sd_AAAAA) - mean_all_kmers) / sd_all_kmers

paste0("Minimum expected normalized AAAAA level: ", 
       minimum_expected_normalized_AAAAA_level)

Translocation rate across sequencing kits

p <- ggplot(rna_kr_data, aes(x = samples_per_nt_tf, color = replicate)) +
  geom_line(stat = 'density', size = 0.7, position = 'identity')
lengend_name <- 'Replicates '
legend_labels <- c('Replicate 1 SQK-RNA001 +RT', 
                   'Replicate 2 SQK-RNA001 +RT', 
                   'Replicate 3 SQK-RNA002 -RT')
p <- p + theme(panel.background = element_blank(),
        panel.grid.minor = ggplot2::element_line(colour="grey", size=0.5),
        panel.grid.major = ggplot2::element_line(colour="grey", size=0.5),
        panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_blank(),
        legend.direction = "vertical",
        legend.position = "top") +
  scale_color_brewer(palette = "Paired", name = lengend_name, labels = legend_labels) +
  coord_cartesian(xlim = c(20, 60)) +
  ylab('Density') +
  xlab('Translocation rate (Samples/nucleotide)') +
  labs(caption = 'RNA without reverse transcription transloactes slightly slower than RNA with reverse transcription.',
       fill = 'Condition')
p
if (save_figures) {
  RSvgDevice::devSVG(here('figures', 'rna_kr_tail_all_three_replicates_normalizer.svg'), 
                     width = 10, height = 3.5, onefile = TRUE, xmlHeader = TRUE)
  print(p)
  dev.off()
}


adnaniazi/krauseNiazi2019Analyses documentation built on June 9, 2019, 7:22 p.m.