knitr::opts_chunk$set(
    echo = FALSE,
    results = "asis",
    message = FALSE, 
    warning = FALSE,
    error = FALSE
    )
library(ngsReports)
library(magrittr)
library(stringr)
library(dplyr)
library(readr)
library(tibble)
library(ggplot2)
library(scales)
library(DT)
library(pander)
globals <- list(
    usePlotly = params$usePlotly,
    cluster = TRUE,
    dendrogram = TRUE,
    theoreticalGC = TRUE,
    gcType = params$gcType,
    species = params$species
    )
fastqcFiles <- list.files(pattern = "(fastqc.zip|fastqc)$")
stopifnot(length(fastqcFiles) > 1)
message("FastQC files found. Loading FastQC data")
fastqcData <- tryCatch(FastqcDataList(fastqcFiles))
plotLabels <- structure(
    gsub(".(fastq|fastq.gz|bam)", "", fqName(fastqcData)),  
    names = fqName(fastqcData)
)
n <- length(fastqcData)
fh <- max(0.25*n, 6)
message("Checking for ", params$tgtsFile)
if (file.exists(params$tgtsFile)) {
    message("Found targets.csv...checking columns")
    targets <- read_csv(params$tgtsFile)
    fCol <- grep("[Ff]ile[Nn]ame", colnames(targets))
    lCol <- grep("[Ll]abel", colnames(targets))
    if (length(fCol) == 1 && length(lCol) == 1) {
        stopifnot(all(fqName(fastqcData) %in% targets[[fCol]]))
        message("Alternate labels found")
        plotLabels <- structure(targets[[lCol]], names = targets[[fCol]])
    }
    else{
        message("No valid labels found")
    }
}
if (!file.exists(params$tgtsFile)) {
    message(params$tgtsFile, " not found. Using default labels")
}

FastQC Summary

bs <- getModule(fastqcData, "Basic_Statistics")
bs %>%
    mutate(
        Sequence_length = paste(Shortest_sequence, Longest_sequence, sep = "-"),
        `%GC` = as.numeric(`%GC`) / 100
    ) %>%
    dplyr::select(Filename, contains("sequence"), `%GC`, File_type, Encoding, -contains("est")) %>%
    set_names(gsub("_", " ", names(.))) %>%
    rename_all(str_to_title) %>%
    set_names(str_remove_all(names(.), "[Ss]equences")) %>%
    rename_all(str_trim) %>%
    dplyr::rename(`%GC` = `%Gc`) %>%
    datatable(
        caption = "Summary statistics for all libraries",
        rownames = FALSE,
        options = list(
            pageLength = 25
        ),
        class = "stripe"
    ) %>%
    formatPercentage(
        columns = "%GC"
    ) %>%
    formatRound(
        columns = c("Total", "Flagged As Poor Quality"),
        digits = 0,
        mark = ","
    )

Read Totals

Library Sizes ranged between r pander(comma(range(readTotals(fastqcData)$Total_Sequences))) reads.

plotReadTotals(fastqcData, labels = plotLabels, usePlotly = globals$usePlotly)

FastQC Summary

plotSummary(fastqcData, labels = plotLabels, usePlotly = globals$usePlotly)

Per Base Sequence Quality

plotBaseQuals(fastqcData, labels = plotLabels, cluster = globals$cluster, dendrogram = globals$dendrogram, usePlotly = globals$usePlotly)

Per Sequence Quality Scores

plotSeqQuals(fastqcData, labels = plotLabels, cluster = globals$cluster, dendrogram = globals$dendrogram, usePlotly = globals$usePlotly)

Per Base Sequence Content

plotSeqContent(fastqcData, labels = plotLabels, cluster = globals$cluster, dendrogram = globals$dendrogram, usePlotly = globals$usePlotly)

Per Sequence GC Content

plotGcContent(fastqcData, labels = plotLabels, theoreticalGC = globals$theoreticalGC, gcType = globals$gcType, species = globals$species, cluster = globals$cluster, dendrogram = globals$dendrogram, usePlotly = globals$usePlotly)
plotGcContent(fastqcData, labels = plotLabels, theoreticalGC = globals$theoreticalGC, gcType = globals$gcType, species = globals$species, plotType = "line", usePlotly = globals$usePlotly)

Sequence Length Distribution

plotSeqLengthDistn(fastqcData, labels = plotLabels, cluster = globals$cluster, dendrogram = globals$dendrogram, usePlotly = globals$usePlotly)
plotSeqLengthDistn(fastqcData, plotType = "cdf", labels = plotLabels, cluster = globals$cluster, dendrogram = globals$dendrogram, usePlotly = globals$usePlotly)

Sequence Duplication Levels

plotDupLevels(fastqcData, labels = plotLabels, cluster = globals$cluster, dendrogram = globals$dendrogram, usePlotly = globals$usePlotly)

Adapter Content

plotAdapterContent(fastqcData, labels = plotLabels, cluster = globals$cluster, dendrogram = globals$dendrogram, usePlotly = globals$usePlotly) 

Overrepresented Summary

plotOverrep(fastqcData, labels = plotLabels, usePlotly = globals$usePlotly)

Overrepresented Sequences

os <- getModule(fastqcData, "Overrepresented_sequences") 
if (length(os)) {
    os %>% 
        mutate(Filename = plotLabels[Filename]) %>%
        group_by(Sequence, Possible_Source) %>%
        summarise(
            Total = sum(Count),
            Files = n(),
            Max_Percentage = max(Percentage/100)
        ) %>%
        ungroup() %>%
        dplyr::arrange(desc(Total)) %>%
        dplyr::slice(1:params$nOver) %>%
        mutate(
            `% Across All Files` = Total / sum(bs$Total_Sequences)
        ) %>%
        dplyr::select(
            Sequence, Total, `Present In` = Files, `% Across All Files`, 
            `Max Individual %` = Max_Percentage, Possible_Source
        ) %>%
        set_names(gsub("_", " ", names(.))) %>%
        rename_all(str_to_title) %>%
        datatable(
            caption = paste(
                "Summary of the most overrepresented sequences in all files.",
                "A maximum of", params$nOver, "sequences are given"
            ),
            rownames = FALSE,
            options = list(
                pageLength = params$nOver,
                autoWidth = TRUE
            )
        ) %>%
        formatPercentage(
            columns = c("% Across All Files", "Max Individual %"),
            digits = 2
        ) %>%
        formatRound(
            columns = "Total",
            digits = 0,
            mark = ","
        )
}
if (length(os) == 0) {
    message("No overrepresented sequences were detected by FastQC")
}

Session Information

sessionInfo() %>% pander()


UofABioinformaticsHub/fastqcReports documentation built on April 1, 2024, 5:29 p.m.