library(goldclipReport)
library(fastqcr)
library(dplyr)
library(ggplot2)
library(cowplot)
library(knitr)
suppressPackageStartupMessages(library(kableExtra))

knitr::opts_chunk$set(fig.width = 12, 
                      fig.height = 8, 
                      fig.path = 'Figs/',
                      echo = FALSE,
                      eval = TRUE,
                      cache = FALSE,
                      prompt = FALSE,
                      tidy = FALSE,
                      comment = NA,
                      message = FALSE,
                      warning = FALSE,
                      rownames.print = FALSE)
options(width=150)

print_df <- function(x){
  x %>%
  kableExtra::kable() %>%
  kableExtra::kable_styling() %>%
  kableExtra::scroll_box(width = "100%", height = "400px")
}

qc_path <- params$qc_path

1. Summary

qc <- qc_aggregate(qc_path, progressbar = FALSE)
qcgs <- qc_stats(qc)
qcgs <- dplyr::mutate(qcgs, 
               tot.seq = as.numeric(tot.seq),
               # tot.seq = as.numeric(tot.seq) / 1e6,
               pct.dup = paste0(pct.dup, '%'),
               pct.gc  = paste0(pct.gc, '%')) %>%
  dplyr::select(sample, tot.seq, pct.dup, pct.gc, seq.length)
names(qcgs) <- c('Sample', 'Reads', '% Dup', '% GC', 'Length (nt)')
print_df(as.data.frame(qcgs))

2. Number of reads

library(ggplot2)
library(dplyr)

nmax <- 10^(nchar(max(qcgs$Reads)) - 1)
ymax <- ceiling(max(qcgs$Reads) / nmax) * nmax
ymax <- ymax / 1e6

p1 <- qcgs %>%
  mutate(count = round(as.numeric(Reads) / 1e6, 2)) %>%
  ggplot(aes(Sample, count)) +
  geom_bar(stat = "identity", fill = "grey80", color = "black", size = .5) +
  geom_text(aes(label = count), vjust = .5, hjust = -0.1) +
  scale_x_discrete(limits = rev(unique(sort(qcgs$Sample)))) +
  scale_y_continuous(limits = c(0, ymax),
                     breaks = seq(0, ymax, length.out = 5),
                     labels = seq(0, ymax, length.out = 5),
                     position = "right") +
  coord_flip() +
  xlab(NULL) + 
  ylab(paste0("Number of reads (Million)")) +
  theme_bw() +
  theme(panel.grid   = element_blank(),
        panel.border = element_blank(),
        plot.title   = element_text(color = "black", size = rel(1.5),
                                    face = "bold", hjust = .5),
        axis.line    = element_line(color = "black", size = .5),
        axis.title   = element_text(color = "black", size = rel(1.2)),
        axis.text.x  = element_text(color = "black", size = rel(1.2)),
        axis.text.y  = element_text(color = "black", size = rel(1.2)))

print(p1)

3. Distribution of Base Quality

From Illumina:

Q scores are defined as a property that is logarithmically related to the base calling error probabilities (P)

$$Q = - 10 log_{10} P $$

For example, if Phred assigns a Q score of 30 (Q30) to a base, this is equivalent to the probability of an incorrect base call 1 in 1000 times (Table 1). This means that the base call accuracy (i.e., the probability of a correct base call) is 99.9%. A lower base call accuracy of 99% (Q20) will have an incorrect base call probability of 1 in 100, meaning that every 100 bp sequencing read will likely contain an error. When sequencing quality reaches Q30, virtually all of the reads will be perfect, having zero errors and ambiguities. This is why Q30 is considered a benchmark for quality in next-generation sequencing. By comparison, Sanger sequencing systems generally produce base call accuracy of ~99.4%, or $Q20^{3}$. Low Q scores can increase false-positive variant calls, which can result in inaccurate conclusions and higher costs for validation experiments ^[Illumina, Inc, https://www.illumina.com/documents/products/technotes/technote_Q-Scores.pdf].

qual <- data.frame("Score" = as.character(c(10, 20, 30, 40, 50)),
                   "Incorrect" = c("1 in 10", 
                                   "1 in 100",
                                   "1 in 1000", 
                                   "1 in 10000",
                                   "1 in 100000"),
                   "Accuracy" = c("90%", "99%", "99.9%", "99.99%", "99.999%"),
                   stringsAsFactors = FALSE)
print_df(qual)

Score = "Phred Quality Score", Incorrect = "Probability of Incorrect Base Call", "Accuracy" = "Base Call Accuracy".

## grid plots
qc_files <- FastqcFiles(qc_path)

for(x in qc_files) {
  p <- fastqc_plot(x)
  print(p)
}

3. Quality Control Report in details

qc_html <- list.files(qc_path, "*_fastqc.html", all.files = TRUE, 
                      full.names = FALSE, recursive = FALSE)
qc_html_name <- gsub("_fastqc.html", "", basename(qc_html))
qc_url <- glue::glue("[{qc_html_name}](./{qc_html})")

qc_df <- data.frame(Number = as.character(seq_len(length(qc_html))),
                    Sample = qc_url,
                    stringsAsFactors = FALSE)

qc_df

4. Overall report

qc_overall <- summary(qc)

qc_overall %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(width = "100%", height = "300px")

5. Problems

qc_problem <- qc_problems(qc, "sample")

qc_problem %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(width = "100%", height = "300px")

6. Failed modules

qc_fail <- qc_fails(qc, "module")

qc_fail %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(width = "100%", height = "300px")

[#EOF]



bakerwm/goldclipReport documentation built on July 9, 2019, 4:54 p.m.