#' Create ggplot2 figures from MultiQC results
#'
#' @param directory Folder containing all the data files generated by MultiQC,
#' e.g. "multiqc_data/"
#' @param font_size Base font size (defaults to 18)
#' @param threshold_line Provide a number to draw a red dashed line at the
#' indicated number of reads for FastQC read, STAR, and HTSeq plots. Defaults
#' to 10e6; set to NULL to disable.
#' @param limits Override the upper limit of FastQC read, STAR, and HTSeq plots.
#' Supply a single number to give all three plots the same limit, or a vector
#' of three values to modify each individually. Defaults to NULL, which sets
#' automatic limits.
#'
#' @return A list with elements "plot" containing the `ggplot` objects, and
#' "data" containing all the underlying data
#' @export
#'
#' @import dplyr
#' @import ggplot2
#' @importFrom forcats fct_inorder
#' @importFrom ggrepel geom_label_repel
#' @importFrom janitor clean_names
#' @importFrom readr cols read_delim
#' @importFrom stringr str_remove str_replace_all
#' @importFrom tidyr pivot_longer
#'
#' @description Creates four ggplot2 figures from MultiQC results, and returns
#' the data along with the plot objects. The plots are for FastQC (read counts
#' and Phred scores), STAR, and HTSeq. See Details for more information.
#'
#' @details For the Phred scores, one must open the MultiQC HTML report, and
#' export the data for "fastqc_per_base_sequence_quality_plot" as a tab-
#' delimited file (TSV), placing it inside the same directory as the rest.
#'
#' @references <https://multiqc.info/>
#' @seealso <https://www.github.com/travis-m-blimkie/tRavis>
#'
#' @examples
#' if (FALSE) tr_qc_plots("multiqc_data")
#'
tr_qc_plots <- function(
directory,
font_size = 18,
threshold_line = 10e6,
limits = NULL
) {
# Setup -----------------------------------------------------------------
file_phred_scores <-
file.path(directory, "fastqc_per_base_sequence_quality_plot.tsv")
file_fastqc_reads <- file.path(directory, "multiqc_fastqc.txt")
file_star <- file.path(directory, "multiqc_star.txt")
file_htseq <- file.path(directory, "multiqc_htseq.txt")
qc_theme <- theme_bw(base_size = font_size) +
theme(axis.text = element_text(colour = "black"))
draw_line <- ifelse(!is.null(threshold_line), TRUE, FALSE)
dashed_line <- geom_vline(
xintercept = threshold_line,
linetype = "dashed",
colour = "#EE2C2C",
linewidth = 1
)
colour_keys <- list(
"fastqc" = c(
"unique" = "#9AC0CD",
"duplicate" = "#7F7F7F"
),
"star" = c(
"unique" = "#104E8B",
"multimapped" = "#7CB5EC",
"too many" = "#F7A35C",
"too short" = "#F08080",
"unmapped" = "#7F0000"
),
"htseq" = c(
"assigned" = "#7CB5EC",
"ambiguous" = "#434348",
"not unique" = "#90ED7D",
"no feature" = "#F7A35C",
"low aQual" = "#8085E9"
)
)
output_list <- list()
# Phred scores ----------------------------------------------------------
if (file.exists(file_phred_scores)) {
phred_1 <- read_delim(
file = file_phred_scores,
delim = "\t",
col_types = cols()
)
phred_2 <- phred_1 %>%
pivot_longer(
-`Position (bp)`,
names_to = "sample",
values_to = "phred_score"
) %>%
rename("position" = `Position (bp)`)
bad_samples <- phred_2 %>%
filter(phred_score < 30) %>%
distinct(sample, .keep_all = TRUE) %>%
mutate(qc = sample)
phred_3 <- left_join(
phred_2,
bad_samples,
by = c("position", "sample", "phred_score")
)
max_phred <- round_any(
x = max(phred_3$phred_score),
accuracy = 5,
f = ceiling
)
line_alpha <- ifelse(
length(unique(phred_3$sample)) > 20,
0.3,
1
)
plot_phred_scores <-
ggplot(phred_3, aes(position, phred_score, group = sample)) +
geom_line(alpha = line_alpha) +
geom_hline(
yintercept = 30,
linetype = "dashed",
colour = "#00CD66",
linewidth = 1.5
) +
geom_label_repel(
aes(label = qc),
size = 4,
min.segment.length = 0,
show.legend = FALSE,
na.rm = TRUE
) +
scale_x_continuous(expand = expansion(mult = 0.02)) +
scale_y_continuous(limits = c(0, max_phred)) +
labs(x = "Position", y = "Phred score") +
qc_theme
output_list$plots$phred_scores <- plot_phred_scores
output_list$data$phred_scores <- phred_3
} else {
message(
"Required file 'fastqc_per_base_sequence_quality_plot.tsv' not found. ",
"To get this plot, open the MultiQC HTML report and export the data for ",
"the 'FastQC per-base sequence quality' as a tab-delimited file (TSV), ",
"saving into the same data directory."
)
}
# FastQC reads ----------------------------------------------------------
if (file.exists(file_fastqc_reads)) {
fastqc_1 <-
read_delim(file_fastqc_reads, delim = "\t", col_types = cols()) %>%
clean_names()
fastqc_2 <- fastqc_1 %>%
mutate(
unique = total_sequences * (total_deduplicated_percentage / 100),
duplicate = total_sequences - unique
) %>%
select(sample, unique, duplicate, total_sequences) %>%
arrange(total_sequences) %>%
mutate(sample = fct_inorder(sample))
fastqc_3 <- fastqc_2 %>%
select(-total_sequences) %>%
pivot_longer(
unique:duplicate,
names_to = "read_type",
values_to = "n_reads"
)
rounded_max_fastqc <-
if (is.null(limits)) {
get_rounded_max(fastqc_3)
} else if (length(limits) == 1) {
limits
} else if (length(limits) == 3) {
limits[1]
} else {
stop(
"Argument 'limits' must be NULL, a single value, or a numeric ",
"vector with length 3."
)
}
plot_fastqc_reads <-
ggplot(fastqc_3, aes(n_reads, sample, fill = read_type)) +
geom_col() +
{if (draw_line) dashed_line} +
scale_fill_manual(values = colour_keys$fastqc) +
scale_x_continuous(
expand = expansion(mult = c(0, 0.1)),
limits = c(0, rounded_max_fastqc),
labels = ~.x / 1e6
) +
labs(x = "Reads (M)", y = NULL, fill = "Read type") +
qc_theme +
theme(panel.grid.major.y = element_blank())
output_list$plots$fastqc_reads <- plot_fastqc_reads
output_list$data$fastqc_reads <- fastqc_3
} else {
message(
"No data found for FastQC reads; check that 'multiqc_fastqc.txt' exists."
)
}
# STAR ------------------------------------------------------------------
if (file.exists(file_star)) {
star_1 <- read_delim(
file = file_star,
delim = "\t",
col_types = cols()
) %>%
clean_names()
star_2 <- star_1 %>%
arrange(total_reads) %>%
mutate(sample = fct_inorder(sample)) %>%
select(
sample,
"unique" = uniquely_mapped,
multimapped,
"too_many" = multimapped_toomany,
"too_short" = unmapped_tooshort,
"unmapped" = unmapped_other
)
star_3 <- star_2 %>%
pivot_longer(
-sample,
names_to = "read_type",
values_to = "n_reads"
) %>%
mutate(
read_type = str_replace_all(read_type, pattern = c("_" = " ")),
read_type = factor(read_type, c(
"unmapped",
"too short",
"too many",
"multimapped",
"unique"
))
)
rounded_max_star <-
if (is.null(limits)) {
get_rounded_max(star_3)
} else if (length(limits) == 1) {
limits
} else if (length(limits) == 3) {
limits[2]
} else {
stop(
"Argument 'limits' must be NULL, a single value, or a numeric ",
"vector with length 3."
)
}
plot_star <- ggplot(star_3, aes(n_reads, sample, fill = read_type)) +
geom_col() +
{if (draw_line) dashed_line} +
scale_x_continuous(
expand = expansion(mult = c(0, 0.1)),
limits = c(0, rounded_max_star),
labels = ~.x / 1e6
) +
scale_fill_manual(values = colour_keys$star) +
labs(x = "Reads (M)", y = NULL, fill = "Read type") +
qc_theme +
theme(panel.grid.major.y = element_blank())
output_list$plots$star <- plot_star
output_list$data$star <- star_3
} else {
message("No data found for STAR; check that 'multiqc_star.txt' exists.")
}
# HTSeq -----------------------------------------------------------------
if (file.exists(file_htseq)) {
htseq_1 <- read_delim(
file = file_htseq,
delim = "\t",
col_types = cols()
) %>%
clean_names()
htseq_2 <- htseq_1 %>%
arrange(total_count) %>%
mutate(
sample = str_remove(sample, ".count$"),
sample = fct_inorder(sample)
) %>%
select(
sample,
assigned,
ambiguous,
"not_unique" = alignment_not_unique,
no_feature,
"low_aQual" = too_low_a_qual
)
htseq_3 <- htseq_2 %>%
pivot_longer(
-sample,
names_to = "read_type",
values_to = "n_reads"
) %>%
mutate(
read_type = str_replace_all(read_type, pattern = c("_" = " ")),
read_type = factor(read_type, c(
"low aQual",
"no feature",
"not unique",
"ambiguous",
"assigned"
))
)
rounded_max_htseq <-
if (is.null(limits)) {
get_rounded_max(htseq_3)
} else if (length(limits) == 1) {
limits
} else if (length(limits) == 3) {
limits[3]
} else {
stop(
"Argument 'limits' must be NULL, a single value, or a numeric ",
"vector with length 3."
)
}
plot_htseq <-
ggplot(htseq_3, aes(n_reads, sample, fill = read_type)) +
geom_col() +
{if (draw_line) dashed_line} +
scale_x_continuous(
expand = expansion(mult = c(0, 0.1)),
limits = c(0, rounded_max_htseq),
labels = ~.x / 1e6
) +
scale_fill_manual(values = colour_keys$htseq) +
labs(x = "Reads (M)", y = NULL, fill = "Read type") +
qc_theme +
theme(panel.grid.major.y = element_blank())
output_list$plots$htseq <- plot_htseq
output_list$data$htseq <- htseq_3
} else {
message("No data found for HTSeq; check that 'multiqc_htseq.txt' exists.")
}
return(output_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.