In bakerwm/goldclipReport: Reporter for GoldCLIP (Title Case)

library(goldclipReport)
library(fastqcr)
library(dplyr)
library(ggplot2)
library(cowplot)
library(knitr)
suppressPackageStartupMessages(library(kableExtra))

knitr::opts_chunk$set(fig.width = 12, 
                      fig.height = 8, 
                      fig.path = 'Figs/',
                      echo = FALSE,
                      eval = TRUE,
                      cache = FALSE,
                      prompt = FALSE,
                      tidy = FALSE,
                      comment = NA,
                      message = FALSE,
                      warning = FALSE,
                      rownames.print = FALSE)
options(width=150)

print_df <- function(x){
  x %>%
  kableExtra::kable() %>%
  kableExtra::kable_styling() %>%
  kableExtra::scroll_box(width = "100%", height = "400px")
}

qc_path <- params$qc_path

1. Summary

qc <- qc_aggregate(qc_path, progressbar = FALSE)
qcgs <- qc_stats(qc)
qcgs <- dplyr::mutate(qcgs, 
               tot.seq = as.numeric(tot.seq),
               # tot.seq = as.numeric(tot.seq) / 1e6,
               pct.dup = paste0(pct.dup, '%'),
               pct.gc  = paste0(pct.gc, '%')) %>%
  dplyr::select(sample, tot.seq, pct.dup, pct.gc, seq.length)
names(qcgs) <- c('Sample', 'Reads', '% Dup', '% GC', 'Length (nt)')
print_df(as.data.frame(qcgs))

2. Number of reads

library(ggplot2)
library(dplyr)

nmax <- 10^(nchar(max(qcgs$Reads)) - 1)
ymax <- ceiling(max(qcgs$Reads) / nmax) * nmax
ymax <- ymax / 1e6

p1 <- qcgs %>%
  mutate(count = round(as.numeric(Reads) / 1e6, 2)) %>%
  ggplot(aes(Sample, count)) +
  geom_bar(stat = "identity", fill = "grey80", color = "black", size = .5) +
  geom_text(aes(label = count), vjust = .5, hjust = -0.1) +
  scale_x_discrete(limits = rev(unique(sort(qcgs$Sample)))) +
  scale_y_continuous(limits = c(0, ymax),
                     breaks = seq(0, ymax, length.out = 5),
                     labels = seq(0, ymax, length.out = 5),
                     position = "right") +
  coord_flip() +
  xlab(NULL) + 
  ylab(paste0("Number of reads (Million)")) +
  theme_bw() +
  theme(panel.grid   = element_blank(),
        panel.border = element_blank(),
        plot.title   = element_text(color = "black", size = rel(1.5),
                                    face = "bold", hjust = .5),
        axis.line    = element_line(color = "black", size = .5),
        axis.title   = element_text(color = "black", size = rel(1.2)),
        axis.text.x  = element_text(color = "black", size = rel(1.2)),
        axis.text.y  = element_text(color = "black", size = rel(1.2)))

print(p1)

3. Distribution of Base Quality

From Illumina:

Q scores are defined as a property that is logarithmically related to the base calling error probabilities (P)

$$Q = - 10 log_{10} P $$

For example, if Phred assigns a Q score of 30 (Q30) to a base, this is equivalent to the probability of an incorrect base call 1 in 1000 times (Table 1). This means that the base call accuracy (i.e., the probability of a correct base call) is 99.9%. A lower base call accuracy of 99% (Q20) will have an incorrect base call probability of 1 in 100, meaning that every 100 bp sequencing read will likely contain an error. When sequencing quality reaches Q30, virtually all of the reads will be perfect, having zero errors and ambiguities. This is why Q30 is considered a benchmark for quality in next-generation sequencing. By comparison, Sanger sequencing systems generally produce base call accuracy of ~99.4%, or $Q20^{3}$. Low Q scores can increase false-positive variant calls, which can result in inaccurate conclusions and higher costs for validation experiments ^[Illumina, Inc, https://www.illumina.com/documents/products/technotes/technote_Q-Scores.pdf].

qual <- data.frame("Score" = as.character(c(10, 20, 30, 40, 50)),
                   "Incorrect" = c("1 in 10", 
                                   "1 in 100",
                                   "1 in 1000", 
                                   "1 in 10000",
                                   "1 in 100000"),
                   "Accuracy" = c("90%", "99%", "99.9%", "99.99%", "99.999%"),
                   stringsAsFactors = FALSE)
print_df(qual)

Score = "Phred Quality Score", Incorrect = "Probability of Incorrect Base Call", "Accuracy" = "Base Call Accuracy".

## grid plots
qc_files <- FastqcFiles(qc_path)

for(x in qc_files) {
  p <- fastqc_plot(x)
  print(p)
}

3. Quality Control Report in details

qc_html <- list.files(qc_path, "*_fastqc.html", all.files = TRUE, 
                      full.names = FALSE, recursive = FALSE)
qc_html_name <- gsub("_fastqc.html", "", basename(qc_html))
qc_url <- glue::glue("[{qc_html_name}](./{qc_html})")

qc_df <- data.frame(Number = as.character(seq_len(length(qc_html))),
                    Sample = qc_url,
                    stringsAsFactors = FALSE)

qc_df

4. Overall report

qc_overall <- summary(qc)

qc_overall %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(width = "100%", height = "300px")

5. Problems

qc_problem <- qc_problems(qc, "sample")

qc_problem %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(width = "100%", height = "300px")

6. Failed modules

qc_fail <- qc_fails(qc, "module")

qc_fail %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(width = "100%", height = "300px")

[#EOF]

bakerwm/goldclipReport documentation built on July 9, 2019, 4:54 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

bakerwm/goldclipReport
Reporter for GoldCLIP (Title Case)

In bakerwm/goldclipReport: Reporter for GoldCLIP (Title Case)

1. Summary

2. Number of reads

3. Distribution of Base Quality

3. Quality Control Report in details

4. Overall report

5. Problems

6. Failed modules

R Package Documentation

Browse R Packages

We want your feedback!

bakerwm/goldclipReport Reporter for GoldCLIP (Title Case)

In bakerwm/goldclipReport: Reporter for GoldCLIP (Title Case)

1. Summary

2. Number of reads

3. Distribution of Base Quality

3. Quality Control Report in details

4. Overall report

5. Problems

6. Failed modules

R Package Documentation

Browse R Packages

We want your feedback!

bakerwm/goldclipReport
Reporter for GoldCLIP (Title Case)