R/tr_qc_plots.R

Defines functions tr_qc_plots

Documented in tr_qc_plots

#' Create ggplot2 figures from MultiQC results
#'
#' @param directory Folder containing all the data files generated by MultiQC,
#'   e.g. "multiqc_data/"
#' @param font_size Base font size (defaults to 18)
#' @param threshold_line Provide a number to draw a red dashed line at the
#'   indicated number of reads for FastQC read, STAR, and HTSeq plots. Defaults
#'   to 10e6; set to NULL to disable.
#' @param limits Override the upper limit of FastQC read, STAR, and HTSeq plots.
#'   Supply a single number to give all three plots the same limit, or a vector
#'   of three values to modify each individually. Defaults to NULL, which sets
#'   automatic limits.
#'
#' @return A list with elements "plot" containing the `ggplot` objects, and
#'   "data" containing all the underlying data
#' @export
#'
#' @import dplyr
#' @import ggplot2
#' @importFrom forcats fct_inorder
#' @importFrom ggrepel geom_label_repel
#' @importFrom janitor clean_names
#' @importFrom readr cols read_delim
#' @importFrom stringr str_remove str_replace_all
#' @importFrom tidyr pivot_longer
#'
#' @description Creates four ggplot2 figures from MultiQC results, and returns
#'   the data along with the plot objects. The plots are for FastQC (read counts
#'   and Phred scores), STAR, and HTSeq. See Details for more information.
#'
#' @details For the Phred scores, one must open the MultiQC HTML report, and
#'   export the data for "fastqc_per_base_sequence_quality_plot" as a tab-
#'   delimited file (TSV), placing it inside the same directory as the rest.
#'
#' @references <https://multiqc.info/>
#' @seealso <https://www.github.com/travis-m-blimkie/tRavis>
#'
#' @examples
#' if (FALSE) tr_qc_plots("multiqc_data")
#'
tr_qc_plots <- function(
    directory,
    font_size = 18,
    threshold_line = 10e6,
    limits = NULL
) {


  # Setup -----------------------------------------------------------------

  file_phred_scores <-
    file.path(directory, "fastqc_per_base_sequence_quality_plot.tsv")
  file_fastqc_reads <- file.path(directory, "multiqc_fastqc.txt")
  file_star <- file.path(directory, "multiqc_star.txt")
  file_htseq <- file.path(directory, "multiqc_htseq.txt")

  qc_theme <- theme_bw(base_size = font_size) +
    theme(axis.text = element_text(colour = "black"))

  draw_line <- ifelse(!is.null(threshold_line), TRUE, FALSE)

  dashed_line <- geom_vline(
    xintercept = threshold_line,
    linetype = "dashed",
    colour = "#EE2C2C",
    linewidth = 1
  )

  colour_keys <- list(
    "fastqc" = c(
      "unique" = "#9AC0CD",
      "duplicate" = "#7F7F7F"
    ),
    "star" = c(
      "unique" = "#104E8B",
      "multimapped" = "#7CB5EC",
      "too many" = "#F7A35C",
      "too short" = "#F08080",
      "unmapped" = "#7F0000"
    ),
    "htseq" = c(
      "assigned" = "#7CB5EC",
      "ambiguous" = "#434348",
      "not unique" = "#90ED7D",
      "no feature" = "#F7A35C",
      "low aQual" = "#8085E9"
    )
  )

  output_list <- list()


  # Phred scores ----------------------------------------------------------

  if (file.exists(file_phred_scores)) {

    phred_1 <- read_delim(
      file = file_phred_scores,
      delim = "\t",
      col_types = cols()
    )

    phred_2 <- phred_1 %>%
      pivot_longer(
        -`Position (bp)`,
        names_to = "sample",
        values_to = "phred_score"
      ) %>%
      rename("position" = `Position (bp)`)

    bad_samples <- phred_2 %>%
      filter(phred_score < 30) %>%
      distinct(sample, .keep_all = TRUE) %>%
      mutate(qc = sample)

    phred_3 <- left_join(
      phred_2,
      bad_samples,
      by = c("position", "sample", "phred_score")
    )

    max_phred <- round_any(
      x = max(phred_3$phred_score),
      accuracy = 5,
      f = ceiling
    )

    line_alpha <- ifelse(
      length(unique(phred_3$sample)) > 20,
      0.3,
      1
    )

    plot_phred_scores <-
      ggplot(phred_3, aes(position, phred_score, group = sample)) +
      geom_line(alpha = line_alpha) +
      geom_hline(
        yintercept = 30,
        linetype = "dashed",
        colour = "#00CD66",
        linewidth = 1.5
      ) +
      geom_label_repel(
        aes(label = qc),
        size = 4,
        min.segment.length = 0,
        show.legend = FALSE,
        na.rm = TRUE
      ) +
      scale_x_continuous(expand = expansion(mult = 0.02)) +
      scale_y_continuous(limits = c(0, max_phred)) +
      labs(x = "Position", y = "Phred score") +
      qc_theme

    output_list$plots$phred_scores <- plot_phred_scores
    output_list$data$phred_scores <- phred_3
  } else {
    message(
      "Required file 'fastqc_per_base_sequence_quality_plot.tsv' not found. ",
      "To get this plot, open the MultiQC HTML report and export the data for ",
      "the 'FastQC per-base sequence quality' as a tab-delimited file (TSV), ",
      "saving into the same data directory."
    )
  }


  # FastQC reads ----------------------------------------------------------

  if (file.exists(file_fastqc_reads)) {

    fastqc_1 <-
      read_delim(file_fastqc_reads, delim = "\t", col_types = cols()) %>%
      clean_names()

    fastqc_2 <- fastqc_1 %>%
      mutate(
        unique = total_sequences * (total_deduplicated_percentage / 100),
        duplicate = total_sequences - unique
      ) %>%
      select(sample, unique, duplicate, total_sequences) %>%
      arrange(total_sequences) %>%
      mutate(sample = fct_inorder(sample))

    fastqc_3 <- fastqc_2 %>%
      select(-total_sequences) %>%
      pivot_longer(
        unique:duplicate,
        names_to = "read_type",
        values_to = "n_reads"
      )

    rounded_max_fastqc <-
      if (is.null(limits)) {
        get_rounded_max(fastqc_3)
      } else if (length(limits) == 1) {
        limits
      } else if (length(limits) == 3) {
        limits[1]
      } else {
        stop(
          "Argument 'limits' must be NULL, a single value, or a numeric ",
          "vector with length 3."
        )
      }

    plot_fastqc_reads <-
      ggplot(fastqc_3, aes(n_reads, sample, fill = read_type)) +
      geom_col() +
      {if (draw_line) dashed_line} +
      scale_fill_manual(values = colour_keys$fastqc) +
      scale_x_continuous(
        expand = expansion(mult = c(0, 0.1)),
        limits = c(0, rounded_max_fastqc),
        labels = ~.x / 1e6
      ) +
      labs(x = "Reads (M)", y = NULL, fill = "Read type") +
      qc_theme +
      theme(panel.grid.major.y = element_blank())

    output_list$plots$fastqc_reads <- plot_fastqc_reads
    output_list$data$fastqc_reads <- fastqc_3
  } else {
    message(
      "No data found for FastQC reads; check that 'multiqc_fastqc.txt' exists."
    )
  }


  # STAR ------------------------------------------------------------------

  if (file.exists(file_star)) {

    star_1 <- read_delim(
      file = file_star,
      delim = "\t",
      col_types = cols()
    ) %>%
      clean_names()

    star_2 <- star_1 %>%
      arrange(total_reads) %>%
      mutate(sample = fct_inorder(sample)) %>%
      select(
        sample,
        "unique" = uniquely_mapped,
        multimapped,
        "too_many" = multimapped_toomany,
        "too_short" = unmapped_tooshort,
        "unmapped" = unmapped_other
      )

    star_3 <- star_2 %>%
      pivot_longer(
        -sample,
        names_to = "read_type",
        values_to = "n_reads"
      ) %>%
      mutate(
        read_type = str_replace_all(read_type, pattern = c("_" = " ")),
        read_type = factor(read_type, c(
          "unmapped",
          "too short",
          "too many",
          "multimapped",
          "unique"
        ))
      )

    rounded_max_star <-
      if (is.null(limits)) {
        get_rounded_max(star_3)
      } else if (length(limits) == 1) {
        limits
      } else if (length(limits) == 3) {
        limits[2]
      } else {
        stop(
          "Argument 'limits' must be NULL, a single value, or a numeric ",
          "vector with length 3."
        )
      }

    plot_star <- ggplot(star_3, aes(n_reads, sample, fill = read_type)) +
      geom_col() +
      {if (draw_line) dashed_line} +
      scale_x_continuous(
        expand = expansion(mult = c(0, 0.1)),
        limits = c(0, rounded_max_star),
        labels = ~.x / 1e6
      ) +
      scale_fill_manual(values = colour_keys$star) +
      labs(x = "Reads (M)", y = NULL, fill = "Read type") +
      qc_theme +
      theme(panel.grid.major.y = element_blank())

    output_list$plots$star <- plot_star
    output_list$data$star <- star_3
  } else {
    message("No data found for STAR; check that 'multiqc_star.txt' exists.")
  }


  # HTSeq -----------------------------------------------------------------

  if (file.exists(file_htseq)) {

    htseq_1 <- read_delim(
      file = file_htseq,
      delim = "\t",
      col_types = cols()
    ) %>%
      clean_names()

    htseq_2 <- htseq_1 %>%
      arrange(total_count) %>%
      mutate(
        sample = str_remove(sample, ".count$"),
        sample = fct_inorder(sample)
      ) %>%
      select(
        sample,
        assigned,
        ambiguous,
        "not_unique" = alignment_not_unique,
        no_feature,
        "low_aQual" = too_low_a_qual
      )

    htseq_3 <- htseq_2 %>%
      pivot_longer(
        -sample,
        names_to = "read_type",
        values_to = "n_reads"
      ) %>%
      mutate(
        read_type = str_replace_all(read_type, pattern = c("_" = " ")),
        read_type = factor(read_type, c(
          "low aQual",
          "no feature",
          "not unique",
          "ambiguous",
          "assigned"
        ))
      )

    rounded_max_htseq <-
      if (is.null(limits)) {
        get_rounded_max(htseq_3)
      } else if (length(limits) == 1) {
        limits
      } else if (length(limits) == 3) {
        limits[3]
      } else {
        stop(
          "Argument 'limits' must be NULL, a single value, or a numeric ",
          "vector with length 3."
        )
      }

    plot_htseq <-
      ggplot(htseq_3, aes(n_reads, sample, fill = read_type)) +
      geom_col() +
      {if (draw_line) dashed_line} +
      scale_x_continuous(
        expand = expansion(mult = c(0, 0.1)),
        limits = c(0, rounded_max_htseq),
        labels = ~.x / 1e6
      ) +
      scale_fill_manual(values = colour_keys$htseq) +
      labs(x = "Reads (M)", y = NULL, fill = "Read type") +
      qc_theme +
      theme(panel.grid.major.y = element_blank())

    output_list$plots$htseq <- plot_htseq
    output_list$data$htseq <- htseq_3
  } else {
    message("No data found for HTSeq; check that 'multiqc_htseq.txt' exists.")
  }

  return(output_list)
}
travis-m-blimkie/tRavis documentation built on April 9, 2024, 11:45 p.m.