In HIVDiversity/MotifBinner2: Processes NGS Primer ID Datasets

opts_chunk$set(echo=FALSE)
opts_chunk$set(warning=FALSE)
opts_chunk$set(fig.pos = 'H')

library(MotifBinner2)

if (!exists('result'))
{
  result <- all_results[[grep('basicQC', names(all_results))[1]]]
}
if (class(result) != 'basicQC')
{
  result <- all_results[[grep('basicQC', names(all_results))[1]]]
}

figure_path <- paste('figure_', result$config$op_full_name, '/', sep = '')

`r result$config$op_full_name`

Basic Quality Control reports on the raw reads from the sequencing run.

Inspired by FastQC: Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data. Available online at: http://www.bioinformatics.babraham.ac.uk/projects/fastqc

is_miseq <- tolower(config$header_format) == "miseq"
seq_df <- result$metrics$seq_df
zone_qual <- result$metrics$zone_qual
tile_qual <- result$metrics$tile_qual
pos_qual <- result$metrics$pos_qual
gap_follow <- result$metrics$gap_follow
aligned_to_gap <- result$metrics$aligned_to_gap
zone_qual$zone_quality_botcut <- zone_qual$zone_quality
zone_qual$zone_quality_botcut[zone_qual$zone_quality < sort(zone_qual$zone_quality)[50]] <- sort(zone_qual$zone_quality)[50]

fig.cap1 <- "Figure: Histograms of read lengths and average qualities."
fig.cap2 <- "Figure: Read length and quality by position on slide."
fig.cap3 <- "Figure: Average sequence quality by tile and cycle."
fig.cap4 <- "Figure: Distribution of sequence qualities by cycle. For plotting reasons, cycles are grouped into bins of size two. This means that y-axis is double the range it would be if each cycle was plotted independently. Also, cycles 300 and 301 (250 and 251) are grouped together and since the max length is 300 (250), the final bar is expected to be half the height of the bar immediately preceeding it."
fig.cap5 <- 'Figure: Distribution of insertions in the reads relative to the profile used for mapping.'
fig.cap6 <- 'Figure: Distribution of deletions in the reads relative to the profile used for mapping.'

options(scipen=99)
n_breaks <- ceiling(quantile(seq_df$read_widths, 0.95)/15)
read_width_breaks <- c(-Inf,(0:n_breaks)*15, Inf)
qual_breaks <- (0:21)*2
p1 <- 
ggplot(seq_df, aes(x=read_widths)) + 
  geom_histogram(breaks=read_width_breaks) +
  xlab('Read Length') +
  ylab('Number of Reads')
p2 <- 
ggplot(seq_df, aes(x=qual)) + 
  geom_histogram(breaks=qual_breaks) +
  xlab('Quality Score') +
  ylab('Number of Reads')
grid.arrange(p1, p2, ncol=2)

cat('Since the sequence headers are not MiSeq formatted, no reports based on the location of the reads on the slide can be generated.')

p3 <-
ggplot(zone_qual,
       aes(x=xcat, y=ycat, fill=n_seq)) + 
       geom_tile() + 
       theme(legend.position = 'bottom') +
       xlab(NULL) + ylab(NULL) +
       labs(list(fill = 'Number of Sequences'))

p4 <-
ggplot(zone_qual,
       aes(x=xcat, y=ycat, fill=zone_quality_botcut)) + 
       geom_tile() + 
       theme(legend.position = 'bottom') +
       xlab(NULL) + ylab(NULL) +
       labs(list(fill = 'Average Quality Score'))
grid.arrange(p3, p4, ncol=2)

print(
  ggplot(data.frame(tile_qual), aes(cycle, as.factor(tile_indx))) +
  geom_tile(aes(fill=avg_qual)) +
  xlab("Sequencing Cycle - nucleotide position in the raw reads") +
  ylab("Tile - Region on the slide")+
  scale_fill_continuous(guide = guide_legend("Avg. Q\nScore"))
)

print(
  ggplot(na.omit(pos_qual), aes(x = cycle_cat, 
    fill = reorder(qual, desc(qual), ordered=T))) +
         geom_bar(aes(y=count), width=2, stat = 'identity') +
         xlab("Sequencing Cycle - nucleotide position in the raw reads") +
         ylab("Number of Reads (x2)") +
         scale_fill_hue(guide = guide_legend(title = "Q Score"),
                        h = c(360, 30)+15)
)

print(
  ggplot(gap_follow, aes(x = pos)) +
         geom_bar(aes(y=n_gaps), width=1, stat = 'identity') +
         xlab("Sequencing Cycle - nucleotide position in the raw reads") +
         ylab("Number of Reads with Insertions")
)

print(
  ggplot(aligned_to_gap, aes(x = pos)) +
         geom_bar(aes(y=n_gaps), width=1, stat = 'identity') +
         xlab("Sequencing Cycle - nucleotide position in the raw reads") +
         ylab("Number of Reads with Deletions")
)

summary_tab <- result$summary
cat("\n\nTable: Basic statistics of the QC'd sequences.\n\n")
kept_tab <- summary_tab[,grep('k_', names(summary_tab))]
names(kept_tab) <- gsub('k_', '', names(kept_tab))
print(format_table(kept_tab))

timingTable(result)