R/rnaseq_workflow_qc.R

Defines functions plot_gene_mapping_rate plot_dna_contamination_ratio plot_intergenic_genic_ratio plot_rin plot_partitioned_pc plot_partitioned_mean_expression plot_shannon_index plot_expression_fraction plot_pca plot_mapping_categories plot_inner_dist plot_junction_saturation plot_hisat2_vs_salmon plot_hisat2_mapped_reads plot_hisat2_mapping_rate plot_hisat2_stats plot_salmon_categories plot_failed_mapping_count plot_salmon_score_filtered_rate plot_dovetail_mapping_rate plot_decoy_mapping_rate plot_salmon_mapped_reads plot_salmon_mapping_pct plot_salmon_stats plot_frag_length plot_trimmomatic_paired plot_seq_duplication plot_seq_depth_vs_lib_complexity plot_unique_read_pct plot_adapter_content plot_gc_mean plot_gc_content_hist plot_gc_content plot_phred_mean plot_phred_per_bp plot_phred_per_read plot_seq_depth_hist plot_seq_depth parse_by_trim_status load_paired_end_qc_data

Documented in load_paired_end_qc_data parse_by_trim_status plot_adapter_content plot_decoy_mapping_rate plot_dna_contamination_ratio plot_dovetail_mapping_rate plot_expression_fraction plot_failed_mapping_count plot_frag_length plot_gc_content plot_gc_content_hist plot_gc_mean plot_gene_mapping_rate plot_hisat2_mapped_reads plot_hisat2_mapping_rate plot_hisat2_stats plot_hisat2_vs_salmon plot_inner_dist plot_intergenic_genic_ratio plot_junction_saturation plot_mapping_categories plot_partitioned_mean_expression plot_partitioned_pc plot_pca plot_phred_mean plot_phred_per_bp plot_phred_per_read plot_rin plot_salmon_categories plot_salmon_mapped_reads plot_salmon_mapping_pct plot_salmon_score_filtered_rate plot_salmon_stats plot_seq_depth plot_seq_depth_hist plot_seq_depth_vs_lib_complexity plot_seq_duplication plot_shannon_index plot_trimmomatic_paired plot_unique_read_pct

# Tools for munging and visualizing QC metrics for RNA-seq data.
# Author: Bryan Quach (bryancquach@gmail.com)

#' Load QC metrics data from paired-end RNA-seq processing
#'
#' Load QC data files generated by MultiQC for various stages of paired-end RNA-seq data
#' processing using the automated RNA-seq data processing WDL workflow.
#'
#' The automated RNA-seq data processing workflow produces several tables with various metrics.
#' Additionally, several tables can be exported from the MultiQC HTML report. This function will
#' import all of these data into a list for further processing in R. The assumed data files are
#' *  `multiqc_fastqc.txt`: An auto-generated workflow file with general FASTQC statistics.
#' *  `multiqc_hisat2.txt`: An auto-generated workflow file with HISAT2 alignment statistics.
#' *  `multiqc_rseqc_bam_stat.txt`: An auto-generated workflow file with alignment statistics
#' generated by RSeQC.
#' *  `multiqc_rseqc_read_distribution.txt`: An auto-generated workflow file with alignment
#' categorizations to different types of genomic regions (introns, exons, integenic, etc.).
#' *  `multiqc_salmon.txt`: An auto-generated workflow file with mapping statistics from Salmon.
#' *  `multiqc_trimmomatic.txt`: An auto-generated workflow file with trimming and read quality
#' filtering statistics using Trimmomatic.
#' *  `rseqc_inner_distance_plot.tsv`: Table exported from the MultiQC HTML report with inner
#' distance statistics between pairs of reads.
#' *  `salmon_plot.tsv`: Table exported from the MultiQC HTML report with RNA fragment length
#' distribution statistics as estimated by Salmon.
#' *  `fastqc_per_base_sequence_quality_plot.tsv`: Table exported from the MultiQC HTML report with
#' per base PHRED scores statistics generated by FASTQC.
#' *  `fastqc_per_sequence_quality_scores_plot.tsv`: Table exported from the MultiQC HTML report
#' with per sequence PHRED score statistics generated by FASTQC.
#' *  `fastqc_per_sequence_gc_content_plot.tsv`: Table exported from the MultiQC HTML report with
#' per sequence GC content statistics generated by FASTQC.
#' *  `fastqc_sequence_duplication_levels_plot.tsv`: Table exported from the MultiQC HTML report
#' with sequence duplication statistics generated by FASTQC.
#' *  `fastqc_adapter_content_plot.tsv`: Table exported from the MultiQC HTML report with adapter
#' content statistics generated by FASTQC.
#' *  `rseqc_known_junction_saturation_plot.tsv`: Table exported from the MultiQC HTML report with
#' junction saturation statistics for known splice junctions.
#' *  `rseqc_novel_junction_saturation_plot.tsv`: Table exported from the MultiQC HTML report with
#' junction saturation statistics for novel splice junctions.
#'
#' @param data_dir A string denoting the local path where all the data files are stored.
#' @return A list with several elements:
#' * `fastqc`: Data frame imported from `multiqc_fastqc.txt`.
#' * `hisat2`: Data frame imported from `multiqc_hisat2.txt`.
#' * `rseqc_bam`: Data frame imported from `multiqc_rseqc_bam_stat.txt`.
#' * `rseqc_alignment_category`: Data frame imported from `multiqc_rseqc_read_distribution.txt`.
#' * `salmon`: Data frame imported from `multiqc_salmon.txt`.
#' * `trimmomatic`: Data frame imported from `multiqc_trimmomatic.txt`.
#' * `inner_dist`: Data frame imported from `rseqc_inner_distance_plot.tsv`.
#' * `frag_length`: Data frame imported from `salmon_plot.tsv`.
#' * `phred_bp`: Data frame imported from `fastqc_per_base_sequence_quality_plot.tsv`.
#' * `phred_seq`: Data frame imported from `fastqc_per_sequence_quality_scores_plot.tsv`.
#' * `gc_content`: Data frame imported from `fastqc_per_sequence_gc_content_plot.tsv`.
#' * `seq_duplication`: Data frame imported from `fastqc_sequence_duplication_levels_plot.tsv`.
#' * `adapter_content`: Data frame imported from `fastqc_adapter_content_plot.tsv`.
#' * `known_junction`: Data frame imported from `rseqc_known_junction_saturation_plot.tsv`.
#' * `novel_junction`: Data frame imported from `rseqc_novel_junction_saturation_plot.tsv`.
#' @export
load_paired_end_qc_data <- function(data_dir) {
  qc_data <- list()
  qc_data$fastqc <- read.delim(
    file.path(data_dir, "/multiqc_fastqc.txt"),
    header = T,
    sep = "\t"
  )
  qc_data$hisat2 <- read.delim(
    file.path(data_dir, "/multiqc_hisat2.txt"),
    header = T,
    sep = "\t"
  )
  qc_data$rseqc_bam <- read.delim(
    file.path(data_dir, "/multiqc_rseqc_bam_stat.txt"),
    header = T,
    sep = "\t"
  )
  qc_data$rseqc_alignment_category <- read.delim(
    file.path(data_dir, "/multiqc_rseqc_read_distribution.txt"),
    header = T,
    sep = "\t"
  )
  qc_data$salmon <- read.delim(
    file.path(data_dir, "/multiqc_salmon.txt"),
    header = T,
    sep = "\t"
  )
  qc_data$trimmomatic <- read.delim(
    file.path(data_dir, "/multiqc_trimmomatic.txt"),
    header = T,
    sep = "\t"
  )
  qc_data$inner_dist <- read.delim(
    file.path(data_dir, "/rseqc_inner_distance_plot.tsv"),
    header = T,
    sep = "\t"
  )
  qc_data$frag_length <- read.delim(
    file.path(data_dir, "/salmon_plot.tsv"),
    header = T,
    sep = "\t"
  )
  qc_data$phred_bp <- read.delim(
    file.path(data_dir, "/fastqc_per_base_sequence_quality_plot.tsv"),
    header = T,
    sep = "\t"
  )
  qc_data$phred_seq <- read.delim(
    file.path(data_dir, "/fastqc_per_sequence_quality_scores_plot.tsv"),
    header = T,
    sep = "\t"
  )
  qc_data$gc_content <- read.delim(
    file.path(data_dir, "/fastqc_per_sequence_gc_content_plot.tsv"),
    header = T,
    sep = "\t"
  )
  qc_data$seq_duplication <- read.delim(
    file.path(data_dir, "/fastqc_sequence_duplication_levels_plot.tsv"),
    header = T,
    sep = "\t"
  )
  qc_data$adapter_content <- read.delim(
    file.path(data_dir, "/fastqc_adapter_content_plot.tsv"),
    header = T,
    sep = "\t"
  )
  qc_data$known_junction <- read.delim(
    file.path(data_dir, "/rseqc_known_junction_saturation_plot.tsv"),
    header = T,
    sep = "\t"
  )
  qc_data$novel_junction <- read.delim(
    file.path(data_dir, "/rseqc_novel_junction_saturation_plot.tsv"),
    header = T,
    sep = "\t"
  )

  # Formatting for sequence duplication data
  qc_data$seq_duplication <- t(qc_data$seq_duplication)
  colnames(qc_data$seq_duplication) <- paste("level", qc_data$seq_duplication[1, ], sep = "_")
  qc_data$seq_duplication <- qc_data$seq_duplication[-1, ]
  if (any(is.na(qc_data$seq_duplication))) {
    qc_data$seq_duplication[is.na(qc_data$seq_duplication)] <- 0
  }
  tmp_ids <- gsub(rownames(qc_data$seq_duplication), pattern = "^X", replacement = "", perl = T)
  tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
  qc_data$seq_duplication <- as.data.frame(qc_data$seq_duplication, stringAsFactors = F)
  rownames(qc_data$seq_duplication) <- tmp_ids

  # Formatting for per sequence PHRED score data
  qc_data$phred_seq <- t(qc_data$phred_seq)
  colnames(qc_data$phred_seq) <- paste("PHRED", qc_data$phred_seq[1, ], sep = "_")
  qc_data$phred_seq <- qc_data$phred_seq[-1, ]
  if (any(is.na(qc_data$phred_seq))) {
    qc_data$phred_seq[is.na(qc_data$phred_seq)] <- 0
  }
  tmp_ids <- gsub(rownames(qc_data$phred_seq), pattern = "^X", replacement = "", perl = T)
  tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
  qc_data$phred_seq <- as.data.frame(qc_data$phred_seq, stringAsFactors = F)
  rownames(qc_data$phred_seq) <- tmp_ids

  # Formatting for per bp PHRED score data
  qc_data$phred_bp <- t(qc_data$phred_bp)
  colnames(qc_data$phred_bp) <- paste("BP", qc_data$phred_bp[1, ], sep = "_")
  qc_data$phred_bp <- qc_data$phred_bp[-1, ]
  tmp_ids <- gsub(rownames(qc_data$phred_bp), pattern = "^X", replacement = "", perl = T)
  tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
  qc_data$phred_bp <- as.data.frame(qc_data$phred_bp, stringAsFactors = F)
  rownames(qc_data$phred_bp) <- tmp_ids

  # Formatting for adapter content data
  qc_data$adapter_content <- t(qc_data$adapter_content)
  colnames(qc_data$adapter_content) <- paste("BP", qc_data$adapter_content[1, ], sep = "_")
  qc_data$adapter_content <- qc_data$adapter_content[-1, ]
  tmp_ids <- gsub(rownames(qc_data$adapter_content), pattern = "^X", replacement = "", perl = T)
  tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
  qc_data$adapter_content <- as.data.frame(qc_data$adapter_content, stringAsFactors = F)
  qc_data$adapter_content$adapter <- sapply(
    strsplit(tmp_ids, split = " | ", fixed = T),
    function(id_pieces) {
      id_pieces[length(id_pieces)]
    },
    simplify = T
  )
  rownames(qc_data$adapter_content) <- tmp_ids

  # Formatting for GC content data
  qc_data$gc_content <- t(qc_data$gc_content)
  colnames(qc_data$gc_content) <- paste("GC", qc_data$gc_content[1, ], sep = "_")
  qc_data$gc_content <- qc_data$gc_content[-1, ]
  tmp_ids <- gsub(rownames(qc_data$gc_content), pattern = "^X", replacement = "", perl = T)
  tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
  qc_data$gc_content <- as.data.frame(qc_data$gc_content, stringAsFactors = F)
  rownames(qc_data$gc_content) <- tmp_ids

  # Formatting for fragment length data
  qc_data$frag_length <- t(qc_data$frag_length)
  colnames(qc_data$frag_length) <- paste("BP", qc_data$frag_length[1, ], sep = "_")
  qc_data$frag_length <- qc_data$frag_length[-1, ]
  tmp_ids <- gsub(
    rownames(qc_data$frag_length),
    pattern = "libParams...",
    replacement = "",
    perl = T
  )
  tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
  qc_data$frag_length <- as.data.frame(qc_data$frag_length, stringAsFactors = F)
  rownames(qc_data$frag_length) <- tmp_ids

  # Formatting for HISAT2 data
  qc_data$hisat2$id <- gsub(qc_data$hisat2$Sample, pattern = "\\|.+$", replacement = "", perl = T)
  qc_data$hisat2$id <- trimws(qc_data$hisat2$id)
  qc_data$hisat2 <- sapply(
    unique(qc_data$hisat2$id),
    function(tmp_id) {
      tmp_subset <- qc_data$hisat2[which(qc_data$hisat2$id == tmp_id), ]
      paired_total <- sum(tmp_subset$paired_total)
      paired_aligned_none <- sum(tmp_subset$paired_aligned_none)
      paired_aligned_one <- sum(tmp_subset$paired_aligned_one)
      paired_aligned_multi <- sum(tmp_subset$paired_aligned_multi)
      paired_aligned_discord_one <- sum(tmp_subset$paired_aligned_discord_one)
      unpaired_total <- sum(tmp_subset$unpaired_total)
      unpaired_aligned_none <- sum(tmp_subset$unpaired_aligned_none)
      unpaired_aligned_one <- sum(tmp_subset$unpaired_aligned_one)
      unpaired_aligned_multi <- sum(tmp_subset$unpaired_aligned_multi)
      paired_sum <- 2 * (paired_aligned_one + paired_aligned_multi + paired_aligned_discord_one)
      unpaired_sum <- unpaired_aligned_one + unpaired_aligned_multi
      overall_alignment_rate <- (paired_sum + unpaired_sum) / (2 * paired_total)
      revised_paired_sum <- 2 * (paired_aligned_one + paired_aligned_multi)
      revised_alignment_rate <- revised_paired_sum / ((2 * paired_total) - unpaired_total)
      merged_subset <- c(
        tmp_id,
        paired_total,
        paired_aligned_none,
        paired_aligned_one,
        paired_aligned_multi,
        paired_aligned_discord_one,
        unpaired_total,
        unpaired_aligned_none,
        unpaired_aligned_one,
        unpaired_aligned_multi,
        overall_alignment_rate,
        revised_alignment_rate
      )
      names(merged_subset) <- c(
        "id",
        "paired_total",
        "paired_aligned_none",
        "paired_aligned_one",
        "paired_aligned_multi",
        "paired_aligned_discord_one",
        "unpaired_total",
        "unpaired_aligned_none",
        "unpaired_aligned_one",
        "unpaired_aligned_multi",
        "overall_alignment_rate",
        "revised_alignment_rate"
      )
      return(merged_subset)
    }
  )
  qc_data$hisat2 <- as.data.frame(t(qc_data$hisat2))
  rownames(qc_data$hisat2) <- qc_data$hisat2$id

  # Formatting for splice junction data
  qc_data$known_junction <- t(qc_data$known_junction)
  colnames(qc_data$known_junction) <- paste("percent", qc_data$known_junction[1, ], sep = "_")
  qc_data$known_junction <- qc_data$known_junction[-1, ]
  if (any(is.na(qc_data$known_junction))) {
    qc_data$known_junction[is.na(qc_data$known_junction)] <- 0
  }
  tmp_ids <- gsub(rownames(qc_data$known_junction), pattern = "^X", replacement = "", perl = T)
  qc_data$known_junction <- as.data.frame(qc_data$known_junction, stringsAsFactors = F)
  rownames(qc_data$known_junction) <- tmp_ids

  qc_data$novel_junction <- t(qc_data$novel_junction)
  colnames(qc_data$novel_junction) <- paste("percent", qc_data$novel_junction[1, ], sep = "_")
  qc_data$novel_junction <- qc_data$novel_junction[-1, ]
  if (any(is.na(qc_data$novel_junction))) {
    qc_data$novel_junction[is.na(qc_data$novel_junction)] <- 0
  }
  tmp_ids <- gsub(rownames(qc_data$novel_junction), pattern = "^X", replacement = "", perl = T)
  qc_data$novel_junction <- as.data.frame(qc_data$novel_junction, stringsAsFactors = F)
  rownames(qc_data$novel_junction) <- tmp_ids

  # Formatting for read pair inner distance data
  qc_data$inner_dist <- t(qc_data$inner_dist)
  colnames(qc_data$inner_dist) <- paste("BP", qc_data$inner_dist[1, ], sep = "_")
  qc_data$inner_dist <- qc_data$inner_dist[-1, ]
  tmp_ids <- gsub(rownames(qc_data$inner_dist), pattern = "^X", replacement = "", perl = T)
  qc_data$inner_dist <- as.data.frame(qc_data$inner_dist, stringsAsFactors = F)
  rownames(qc_data$inner_dist) <- tmp_ids

  # Formatting for RSeQC read alignment categories
  keeper_col_ids <- c(
    "Sample",
    "other_intergenic_tag_pct",
    "tes_down_10kb_tag_pct",
    "tss_up_10kb_tag_pct",
    "introns_tag_pct",
    "X3_utr_exons_tag_pct",
    "X5_utr_exons_tag_pct",
    "cds_exons_tag_pct"
  )
  qc_data$rseqc_alignment_category <- qc_data$rseqc_alignment_category[, keeper_col_ids]
  colnames(qc_data$rseqc_alignment_category) <- c(
    "id",
    "Other_intergenic",
    "Downstream_10kb",
    "Upstream_10kb",
    "Intron",
    "UTR_3",
    "UTR_5",
    "CDS"
  )
  rownames(qc_data$rseqc_alignment_category) <- qc_data$rseqc_alignment_category$id
  rownames(qc_data$fastqc) <- qc_data$fastqc$Sample
  rownames(qc_data$rseqc_bam) <- qc_data$rseqc_bam$Sample
  rownames(qc_data$salmon) <- qc_data$salmon$Sample
  rownames(qc_data$trimmomatic) <- qc_data$trimmomatic$Sample
  return(qc_data)
}

#' Partition rows by trimming status.
#'
#' Partition rows of a data frame by trimming status using string matching on row names.
#'
#' Parses a quality metric table that has data from multiple stages of the RNA-seq data processing
#' workflow into separate tables for pre-Trimmomatic and post-Trimmomatic data. If the data are
#' from paired-end sequencing, additional partitions can be made that separate post-Trimmomatic
#' preserved read pairs from post-Trimmomatic unpaired reads, and read 1 (R1) and read 2 (R2). This
#' function is intended for use with auto-generated QC metrics tables from the RNA-seq data
#' processing workflow and tables exported from MultiQC HTML reports.
#'
#' @param data A data frame with rows corresponding to samples with names that differentiate
#' different stages of sample processing from the RNA-seq data processing workflow (pre vs.
#' post-Trimmomatic and "R1" and "R2" for read pairs in paired-end sequencing).
#' @param str_query A string in the row names for `data` that distinguish pre-Trimmomatic from
#' post-Trimmomatic records.
#' @param paired_end A logical. Is the data from paired-end sequencing? If so, additional
#' partitions will be made that separate the forward and reverse read pairs.
#' @param r1_query A string in the row names for `data` that distinguish R1 reads from others in
#' paired-end sequencing data.
#' @param r2_query A string in the row names for `data` that distinguish R2 reads from others in
#' paired-end sequencing data.
#' @param unpaired_query A string in the row names for `data` that distinguish unpaired reads from
#' preserved pairs in paired-end sequencing data.
#' @return For single-end sequencing data, a list with the elements `untrimmed` and `trimmed` to
#' denote data frames with pre-Trimmomatic and post-Trimmomatic data respectively. For
#' paired-end sequencing data, a list with the following elements:
#' *  `untrimmed_r1`: Data frame with pre-Trimmomatic R1 reads.
#' *  `untrimmed_r2`: Data frame with pre-Trimmomatic R2 reads.
#' *  `trimmed_r1`: Data frame with post-Trimmomatic R1 reads from preserved pairs.
#' *  `trimmed_r2`: Data frame with post-Trimmomatic R2 reads from preserved pairs.
#' *  `unpaired_r1`: Data frame with post-Trimmomatic R1 reads without a read pair.
#' *  `unpaired_r2`: Data frame with post-Trimmomatic R2 reads without a read pair.
#' @export
parse_by_trim_status <- function(data,
                                 str_query = "trimmed",
                                 paired_end = F,
                                 r1_query = "R1",
                                 r2_query = "R2",
                                 unpaired_query = "unpaired") {
  data <- as.data.frame(data, stringsAsFactors = F)
  all_ids <- rownames(data)
  data_untrimmed <- data[grep(x = all_ids, str_query, invert = T), ]
  data_trimmed <- data[grep(x = all_ids, str_query), ]
  if (!paired_end) {
    partitions <- list(untrimmed = data_untrimmed, trimmed = data_trimmed)
    if (nrow(partitions$untrimmed) != nrow(partitions$trimmed)) {
      warning("Differing number of pre- and post-Trimmomatic records")
    }
    return(partitions)
  }
  ids_untrimmed <- rownames(data_untrimmed)
  data_untrimmed_r1 <- data_untrimmed[grep(x = ids_untrimmed, r1_query), ]
  data_untrimmed_r2 <- data_untrimmed[grep(x = ids_untrimmed, r2_query), ]
  ids_trimmed <- rownames(data_trimmed)
  data_trimmed_paired <- data_trimmed[grep(x = ids_trimmed, unpaired_query, invert = T), ]
  data_trimmed_unpaired <- data_trimmed[grep(x = ids_trimmed, unpaired_query), ]
  ids_trimmed_paired <- rownames(data_trimmed_paired)
  data_trimmed_paired_r1 <- data_trimmed_paired[grep(x = ids_trimmed_paired, r1_query), ]
  data_trimmed_paired_r2 <- data_trimmed_paired[grep(x = ids_trimmed_paired, r2_query), ]
  ids_trimmed_unpaired <- rownames(data_trimmed_unpaired)
  data_trimmed_unpaired_r1 <- data_trimmed_unpaired[grep(x = ids_trimmed_unpaired, r1_query), ]
  data_trimmed_unpaired_r2 <- data_trimmed_unpaired[grep(x = ids_trimmed_unpaired, r2_query), ]
  partitions <- list(
    untrimmed_r1 = data_untrimmed_r1,
    untrimmed_r2 = data_untrimmed_r2,
    trimmed_r1 = data_trimmed_paired_r1,
    trimmed_r2 = data_trimmed_paired_r2,
    unpaired_r1 = data_trimmed_unpaired_r1,
    unpaired_r2 = data_trimmed_unpaired_r2
  )
  if (nrow(partitions$untrimmed_r1) != nrow(partitions$untrimmed_r2)) {
    warning("Differing number of pre-Trimmomatic R1 and R2 records")
  }
  if (nrow(partitions$trimmed_r1) != nrow(partitions$trimmed_r2)) {
    warning("Differing number of post-Trimmomatic preserved pair R1 and R2 records")
  }
  if (nrow(partitions$unpaired_r1) != nrow(partitions$unpaired_r2)) {
    warning("Differing number of unpaired R1 and R2 records")
  }
  if (nrow(partitions$untrimmed_r1) != nrow(partitions$trimmed_r1)) {
    warning("Differing number of pre- and post-Trimmomatic R1 records")
  }
  if (nrow(partitions$untrimmed_r2) != nrow(partitions$trimmed_r2)) {
    warning("Differing number of pre- and post-Trimmomatic R2 records")
  }
  return(partitions)
}

#' Sequencing depth per sample scatterplot
#'
#' Create a sequencing depth by sample scatterplot.
#'
#' @param data A data frame with sequencing depth data in the same format as `fastqc` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param point_size A numeric. The size for data points.
#' @param fill A string. The fill color for data points.
#' @param alpha A numeric. The alpha level for data points.
#' @param sort A logical. Should the x-axis be ordered by sequencing depth?
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_seq_depth <- function(data,
                           point_size = 4,
                           fill = "red4",
                           alpha = 0.75,
                           sort = T,
                           ids = NULL,
                           invert = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  if (sort) {
    data <- data[order(data$Total.Sequences), ]
  }
  ggout <- ggplot(data, aes(x = seq_len(nrow(data)), y = Total.Sequences / 1000000)) +
    geom_point(shape = 21, size = point_size, fill = fill, col = "white", alpha = alpha) +
    labs(y = "Reads (millions)", x = "Sample index") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  return(ggout)
}

#' Sequencing depth histogram
#'
#' Create a sequencing depth histogram.
#'
#' @param data A data frame with sequencing depth data in the same format as `fastqc` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param binsize A numeric value for the histogram bin widths.
#' @param fill A string. The fill color for histogram bars.
#' @param alpha A numeric. The alpha level for histogram bars.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_seq_depth_hist <- function(data,
                                binsize = NULL,
                                fill = "red4",
                                alpha = 0.75,
                                ids = NULL,
                                invert = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  if (is.null(binsize)) {
    binsize <- diff(range(data$Total.Sequences / 1000000)) / 25
  }
  ggout <- ggplot(data, aes(x = Total.Sequences / 1000000)) +
    geom_histogram(binwidth = binsize, fill = fill, color = "white", alpha = alpha) +
    labs(y = "Number of Samples", x = "Reads (millions)") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  return(ggout)
}

#' Per read PHRED frequency plot
#'
#' Create a line plot with per read PHRED frequencies.
#'
#' @param data A list of data frames with data in the same format as `phred_seq` returned
#' by \code{\link{load_paired_end_qc_data}}. Each data frame is intended to represent a different
#' category of data points (e.g., R1 vs. R2 reads in paired-end data).
#' @param labels A vector of strings for legend labels corresponding to the data frames in `data`.
#' @param line_colors A vector of strings. Line colors corresponding to the data frames in `data`.
#' @param alpha A numeric. The alpha level for lines.
#' @param line_width A numeric value for the line width.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_phred_per_read <- function(data,
                                labels = NULL,
                                line_colors = rep("red4", length(data)),
                                alpha = 0.25,
                                line_width = 1.5,
                                ids = NULL,
                                invert = F) {
  plot_df_list <- sapply(
    seq_len(length(data)),
    function(i) {
      df <- data[[i]]
      if (!is.null(ids)) {
        if (invert) {
          df <- df[!rownames(df) %in% ids, , drop = F]
        } else {
          df <- df[rownames(df) %in% ids, , drop = F]
        }
      }
      df$id <- rownames(df)
      new_df <- melt(
        df,
        id.vars = "id",
        variable.name = "PHRED",
        value.name = "count",
        factorsAsStrings = T
      )
      new_df$phred <- gsub(
        as.character(new_df$PHRED),
        pattern = "PHRED_",
        replacement = "",
        fixed = T
      )
      new_df$phred <- as.numeric(new_df$phred)
      new_df$id <- reorder(new_df$id, new_df$phred)
      if (is.null(labels)) {
        new_df$group <- names(data)[i]
      } else {
        new_df$group <- labels[i]
      }
      return(new_df)
    },
    simplify = F
  )
  plot_data <- do.call(rbind, plot_df_list)
  ggout <- ggplot(plot_data, aes(x = phred, y = log10(count + 1), group = id, col = group)) +
    geom_line(lwd = line_width, alpha = alpha) +
    scale_color_manual(values = line_colors) +
    xlim(0, 40) +
    labs(x = "Mean PHRED score", y = "Read frequency (log10)", col = "") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1),
      legend.title = element_text(size = 16),
      legend.text = element_text(size = 16)
    )
  return(ggout)
}

#' Per base pair PHRED score plot
#'
#' Create a line plot with per base pair mean PHRED scores.
#'
#' @param data A list of data frames with data in the same format as `phred_bp` returned
#' by \code{\link{load_paired_end_qc_data}}. Each data frame is intended to represent a different
#' category of data points (e.g., R1 vs. R2 reads in paired-end data).
#' @inheritParams plot_phred_per_read
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_phred_per_bp <- function(data,
                              labels = NULL,
                              line_colors = rep("red4", length(data)),
                              alpha = 0.25,
                              line_width = 1.5,
                              ids = NULL,
                              invert = F) {
  plot_df_list <- sapply(
    seq_len(length(data)),
    function(i) {
      df <- data[[i]]
      if (!is.null(ids)) {
        if (invert) {
          df <- df[!rownames(df) %in% ids, , drop = F]
        } else {
          df <- df[rownames(df) %in% ids, , drop = F]
        }
      }
      df$id <- rownames(df)
      new_df <- melt(
        df,
        id.vars = "id",
        variable.name = "BP",
        value.name = "score",
        factorsAsStrings = T
      )
      new_df$bp <- gsub(
        as.character(new_df$BP),
        pattern = "BP_",
        replacement = "",
        fixed = T
      )
      new_df$bp <- as.numeric(new_df$bp)
      new_df$id <- reorder(new_df$id, new_df$bp)
      if (is.null(labels)) {
        new_df$group <- names(data)[i]
      } else {
        new_df$group <- labels[i]
      }
      return(new_df)
    },
    simplify = F
  )
  plot_data <- do.call(rbind, plot_df_list)
  ggout <- ggplot(plot_data, aes(x = bp, y = score, group = id, col = group)) +
    geom_line(lwd = line_width, alpha = alpha) +
    scale_color_manual(values = line_colors) +
    ylim(0, 40) +
    labs(y = "Mean PHRED score", x = "Read position", col = "") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1),
      legend.title = element_text(size = 16),
      legend.text = element_text(size = 16)
    )
  return(ggout)
}

#' Plot per sample mean of per read mean PHRED scores
#'
#' Create a histogram and boxplot of per sample mean of per read mean PHRED scores. The two plots
#' are intended to be used as a single column, two row figure panel.
#'
#' @param data A data frame with data in the same format as `phred_seq` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match rownames
#' in `data`.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}} \code{\link{hist_boxplot2}}
#' @export
plot_phred_mean <- function(data,
                            group = NULL,
                            ids = NULL,
                            invert = F,
                            return_data = F,
                            binsize = NULL,
                            colors = NULL,
                            hist_alpha = 0.75,
                            box_fill = "goldenrod",
                            box_alpha = 0.5,
                            box_lwd = 1,
                            jitter_alpha = 0.75,
                            jitter_size = 1.75,
                            x_title = "Mean of mean PHRED score",
                            y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  phred_mean <- apply(
    data,
    1,
    function(phred_freq) {
      phred_scores <- gsub(names(phred_freq), pattern = "PHRED_", replacement = "", fixed = T)
      phred_scores <- as.numeric(phred_scores)
      phred_freq <- as.numeric(phred_freq)
      phred_mean <- sum(phred_freq * phred_scores, na.rm = T) / sum(phred_freq, na.rm = T)
      return(phred_mean)
    }
  )
  phred_mean <- data.frame(values = phred_mean)
  rownames(phred_mean) <- rownames(data)
  plot_data <- merge(x = phred_mean, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    data = plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' GC content distribution line plot
#'
#' Create a line plot with read GC content distributions.
#'
#' @param data A list of data frames with data in the same format as `gc_content` returned
#' by \code{\link{load_paired_end_qc_data}}. Each data frame is intended to represent a different
#' category of data points (e.g., R1 vs. R2 reads in paired-end data).
#' @inheritParams plot_phred_per_read
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_gc_content <- function(data,
                            labels = NULL,
                            line_colors = rep("red4", length(data)),
                            alpha = 0.25,
                            line_width = 1.5,
                            ids = NULL,
                            invert = F) {
  plot_df_list <- sapply(
    seq_len(length(data)),
    function(i) {
      df <- data[[i]]
      if (!is.null(ids)) {
        if (invert) {
          df <- df[!rownames(df) %in% ids, , drop = F]
        } else {
          df <- df[rownames(df) %in% ids, , drop = F]
        }
      }
      df$id <- rownames(df)
      new_df <- melt(
        df,
        id.vars = "id",
        variable.name = "GC",
        value.name = "percent",
        factorsAsStrings = T
      )
      new_df$gc <- gsub(
        as.character(new_df$GC),
        pattern = "GC_",
        replacement = "",
        fixed = T
      )
      new_df$gc <- as.numeric(new_df$gc)
      new_df$id <- reorder(new_df$id, new_df$gc)
      if (is.null(labels)) {
        new_df$group <- names(data)[i]
      } else {
        new_df$group <- labels[i]
      }
      return(new_df)
    },
    simplify = F
  )
  plot_data <- do.call(rbind, plot_df_list)
  ggout <- ggplot(plot_data, aes(x = gc, y = percent, group = id, col = group)) +
    geom_line(lwd = line_width, alpha = alpha) +
    scale_color_manual(values = line_colors) +
    labs(y = "% reads", x = "Mean % GC", col = "") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1),
      legend.title = element_text(size = 16),
      legend.text = element_text(size = 16)
    )
  return(ggout)
}

#' GC content histogram
#'
#' Create a GC content by sample histogram.
#'
#' @param data A list of data frames with data in the same format as `fastqc` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param labels A vector of strings for legend labels corresponding to the data frames in `data`.
#' @inheritParams plot_seq_depth_hist
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_gc_content_hist <- function(data,
                                 binsize = 2.5,
                                 labels = NULL,
                                 fill = rep("red4", length(data)),
                                 alpha = 0.25,
                                 ids = NULL,
                                 invert = F) {
  plot_df_list <- sapply(
    seq_len(length(data)),
    function(i) {
      df <- data[[i]]
      if (!is.null(ids)) {
        if (invert) {
          df <- df[!rownames(df) %in% ids, , drop = F]
        } else {
          df <- df[rownames(df) %in% ids, , drop = F]
        }
      }
      df$id <- rownames(df)
      new_df <- df[, c("id", "X.GC")]
      colnames(new_df) <- c("id", "gc")
      new_df$gc <- as.numeric(new_df$gc)
      if (is.null(labels)) {
        new_df$group <- names(data)[i]
      } else {
        new_df$group <- labels[i]
      }
      return(new_df)
    },
    simplify = F
  )
  plot_data <- do.call(rbind, plot_df_list)
  ggout <- ggplot(plot_data, aes(x = gc, fill = group)) +
    geom_histogram(
      position = "identity",
      binwidth = binsize,
      color = "white",
      alpha = alpha
    ) +
    scale_fill_manual(values = fill) +
    labs(y = "Number of Samples", x = "Mean % GC", fill = "") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1),
      legend.title = element_text(size = 16),
      legend.text = element_text(size = 16)
    )
  return(ggout)
}

#' Plot per sample mean read GC content
#'
#' Create a histogram and boxplot of per sample mean read GC content. The two plots
#' are intended to be used as a single column, two row figure panel.
#'
#' @param data A data frame with data in the same format as `gc_content` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @inheritParams plot_phred_mean
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean GC content and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}} \code{\link{hist_boxplot2}}
#' @export
plot_gc_mean <- function(data,
                         group = NULL,
                         ids = NULL,
                         invert = F,
                         return_data = F,
                         binsize = NULL,
                         colors = NULL,
                         hist_alpha = 0.75,
                         box_fill = "goldenrod",
                         box_alpha = 0.5,
                         box_lwd = 1,
                         jitter_alpha = 0.75,
                         jitter_size = 1.75,
                         x_title = "Mean read GC content (%)",
                         y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  gc_mean <- apply(
    data,
    1,
    function(gc_freq) {
      gc_scores <- gsub(names(gc_freq), pattern = "GC_", replacement = "", fixed = T)
      gc_scores <- as.numeric(gc_scores)
      gc_freq <- as.numeric(gc_freq)
      gc_mean <- sum(gc_freq * gc_scores, na.rm = T) / sum(gc_freq, na.rm = T)
      return(gc_mean)
    }
  )
  gc_mean <- data.frame(values = gc_mean)
  rownames(gc_mean) <- rownames(data)
  plot_data <- merge(x = gc_mean, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    data = plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Adapter content distribution line plot
#'
#' Create a line plot with per position adapter frequency.
#'
#' @param data A list of data frames with data in the same format as `adapter_content` returned
#' by \code{\link{load_paired_end_qc_data}}. Each data frame is intended to represent a different
#' category of data points (e.g., R1 vs. R2 reads in paired-end data).
#' @inheritParams plot_phred_per_read
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_adapter_content <- function(data,
                                 labels = NULL,
                                 line_colors = rep("red4", length(data)),
                                 alpha = 0.25,
                                 line_width = 1.5,
                                 ids = NULL,
                                 invert = F) {
  plot_df_list <- sapply(
    seq_len(length(data)),
    function(i) {
      df <- data[[i]]
      if (!is.null(ids)) {
        if (invert) {
          df <- df[!rownames(df) %in% ids, , drop = F]
        } else {
          df <- df[rownames(df) %in% ids, , drop = F]
        }
      }
      df$id <- rownames(df)
      new_df <- melt(
        df,
        id.vars = c("id", "adapter"),
        variable.name = "BP",
        value.name = "percent",
        factorsAsStrings = T
      )
      new_df$bp <- gsub(
        as.character(new_df$BP),
        pattern = "BP_",
        replacement = "",
        fixed = T
      )
      new_df$bp <- as.numeric(new_df$bp)
      new_df$id <- reorder(new_df$id, new_df$bp)
      if (is.null(labels)) {
        new_df$group <- names(data)[i]
      } else {
        new_df$group <- labels[i]
      }
      return(new_df)
    },
    simplify = F
  )
  plot_data <- do.call(rbind, plot_df_list)
  ggout <- ggplot(plot_data, aes(x = bp, y = percent, group = id, col = group)) +
    geom_line(lwd = line_width, alpha = alpha) +
    scale_color_manual(values = line_colors) +
    labs(y = "Cumulative % reads", x = "Read position", col = "") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1),
      legend.title = element_text(size = 16),
      legend.text = element_text(size = 16)
    )
  return(ggout)
}

#' Unique read percentage per sample scatterplot
#'
#' Create a unique read percentage by sample scatterplot.
#'
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match rownames
#' in `data`.
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams plot_seq_depth
#' @return A ggplot object.
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' unique read percentage and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_unique_read_pct <- function(data,
                                 group = NULL,
                                 point_size = 4,
                                 fill = NULL,
                                 alpha = 0.75,
                                 sort = T,
                                 ids = NULL,
                                 invert = F,
                                 return_data = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  plot_data <- data.frame(values = data$total_deduplicated_percentage)
  rownames(plot_data) <- rownames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (sort) {
    plot_data <- plot_data[order(plot_data$values), ]
  }
  if (return_data) {
    return(plot_data)
  }
  ggout <- ggplot(plot_data, aes(x = seq_len(nrow(plot_data)), y = values, fill = group)) +
    geom_point(shape = 21, size = point_size, col = "white", alpha = alpha) +
    labs(y = "% unique reads", x = "Sample index", fill = "") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  if (is.null(group0)) {
    ggout <- ggout +
      theme(legend.position = "none")
  }
  if (is.null(fill)) {
    ggout <- ggout + scale_fill_brewer(palette = "Dark2")
  } else {
    ggout <- ggout + scale_fill_manual(values = fill)
  }
  return(ggout)
}

#' Sequencing depth vs. library complexity scatterplot
#'
#' Create a sequencing depth vs. unique read percentage scatterplot.
#'
#' @param data A data frame with sequencing depth and unique read percentage data in the same
#' format as `fastqc` returned by \code{\link{load_paired_end_qc_data}}.
#' @param point_size A numeric. The size for data points.
#' @param fill A string. The fill color for data points.
#' @param alpha A numeric. The alpha level for data points.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_seq_depth_vs_lib_complexity <- function(data,
                                             point_size = 4,
                                             fill = "red4",
                                             alpha = 0.75,
                                             ids = NULL,
                                             invert = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  ggout <- ggplot(data, aes(x = total_deduplicated_percentage, y = Total.Sequences / 1000000)) +
    geom_point(shape = 21, size = point_size, fill = fill, col = "white", alpha = alpha) +
    xlim(0, 100) +
    labs(y = "Reads (millions)", x = "% unique reads") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  return(ggout)
}

#' Sequence duplication line plot
#'
#' Create a line plot with per sample sequence duplication levels.
#'
#' @param data A list of data frames with data in the same format as `seq_duplication` returned
#' by \code{\link{load_paired_end_qc_data}}. Each data frame is intended to represent a different
#' category of data points (e.g., R1 vs. R2 reads in paired-end data).
#' @inheritParams plot_phred_per_read
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_seq_duplication <- function(data,
                                 labels = NULL,
                                 line_colors = rep("red4", length(data)),
                                 alpha = 0.25,
                                 line_width = 1.5,
                                 ids = NULL,
                                 invert = F) {
  plot_df_list <- sapply(
    seq_len(length(data)),
    function(i) {
      df <- data[[i]]
      if (!is.null(ids)) {
        if (invert) {
          df <- df[!rownames(df) %in% ids, , drop = F]
        } else {
          df <- df[rownames(df) %in% ids, , drop = F]
        }
      }
      df$id <- rownames(df)
      new_df <- melt(
        df,
        id.vars = "id",
        variable.name = "Level",
        value.name = "percent",
        factorsAsStrings = T
      )
      new_df$level <- gsub(
        as.character(new_df$Level),
        pattern = "level_",
        replacement = "",
        fixed = T
      )
      new_df$level <- gsub(
        as.character(new_df$level),
        pattern = ">",
        replacement = "",
        fixed = T
      )
      new_df$level <- gsub(
        as.character(new_df$level),
        pattern = "(k.)|(k)",
        replacement = "000",
        perl = T
      )
      new_df$level <- as.numeric(new_df$level)
      new_df$percent <- as.numeric(new_df$percent)
      new_df$id <- reorder(new_df$id, new_df$level)
      if (is.null(labels)) {
        new_df$group <- names(data)[i]
      } else {
        new_df$group <- labels[i]
      }
      return(new_df)
    },
    simplify = F
  )
  plot_data <- do.call(rbind, plot_df_list)
  x_names <- c(
    "1", "2", "3", "4", "5",
    "6", "7", "8", "9", ">10",
    ">50", ">100", ">500", ">1K",
    ">5K", ">10K"
  )
  dup_level_names <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500, 1000, 5000, 10000)
  dup_levels <- seq_len(length(dup_level_names))
  names(dup_levels) <- dup_level_names
  plot_data$level <- dup_levels[as.character(plot_data$level)]
  ggout <- ggplot(plot_data, aes(x = level, y = percent, group = id, col = group)) +
    geom_line(lwd = line_width, alpha = alpha) +
    scale_color_manual(values = line_colors) +
    labs(y = "% reads", x = "Duplication level", col = "") +
    scale_x_continuous(breaks = dup_levels, labels = x_names) +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.text.x = element_text(angle = 45, hjust = 1),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1),
      legend.title = element_text(size = 16),
      legend.text = element_text(size = 16)
    )
  return(ggout)
}

#' Trimmomatic statistics histograms
#'
#' Create a histograms for read retention and filtering rates.
#'
#' Uses output generated by MultiQC on Trimmomatic output for paired-end sequencing data to
#' produce sample frequency histograms of read retention and filtering rates. Note: this function
#' only works with Trimmomatic output for paired-end sequencing data.
#'
#' @param data A data frame with sequencing depth data in the same format as `trimmomatic` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param binsize A 2-item numeric vector that denotes the bin width for histogram bars. The
#' first value is for the `excluded` plot, and the other value corresponds to the `retained`
#' plot.
#' @param fill A 3-item vector of strings that denotes the colors for histogram bars. The first
#' color is for the `excluded` plot, and the other colors correspond to R1 and R2 reads in the
#' `retained` plot.
#' @inheritParams plot_seq_depth_hist
#' @return A list with two ggplot objects, `excluded` and `retained` that correspond to ggplot
#' objects for reads excluded and retained after Trimmomatic respectively.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_trimmomatic_paired <- function(data,
                                    binsize = c(2.5, 0.5),
                                    fill = c("gray10", "goldenrod2", "steelblue4"),
                                    alpha = 0.5,
                                    ids = NULL,
                                    invert = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  ggout_list <- list()
  ggout_list$excluded <- ggplot(data, aes(x = dropped_pct)) +
    geom_histogram(binwidth = binsize[1], fill = fill[1], color = "white", alpha = alpha) +
    labs(y = "Number of Samples", x = "% reads excluded") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  plot_data <- melt(
    data[, c("forward_only_surviving_pct", "reverse_only_surviving_pct")],
    measure.vars = c("forward_only_surviving_pct", "reverse_only_surviving_pct"),
    variable.name = "group",
    value.name = "percent"
  )
  plot_data$percent <- plot_data$percent
  plot_data$group <- factor(
    x = plot_data$group,
    levels = c("forward_only_surviving_pct", "reverse_only_surviving_pct"),
    labels = c("R1", "R2")
  )
  ggout_list$retained <- ggplot(plot_data, aes(x = percent, fill = group)) +
    geom_histogram(position = "identity", binwidth = binsize[2], color = "white", alpha = alpha) +
    scale_fill_manual(values = fill[2:3]) +
    labs(y = "Number of Samples", x = "% unpaired reads retained", fill = "") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1),
      legend.title = element_text(size = 16),
      legend.text = element_text(size = 16)
    )
  return(ggout_list)
}

#' Fragment length frequency line plot
#'
#' Create a fragment length frequency line plot.
#'
#' @param data A data frame with fragment length data in the same format as `frag_length` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param line_width A numeric. The line size for plot lines.
#' @param line_color A string. The color for plot lines.
#' @param alpha A numeric. The alpha level for data points.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_frag_length <- function(data,
                             line_width = 1.5,
                             line_color = "red4",
                             alpha = 0.5,
                             ids = NULL,
                             invert = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  data$id <- rownames(data)
  data <- melt(
    data,
    id.vars = "id",
    variable.name = "BP",
    value.name = "percent",
    factorsAsStrings = T
  )
  data$percent <- data$percent * 100
  data$bp <- gsub(
    as.character(data$BP),
    pattern = "BP_",
    replacement = "",
    fixed = T
  )
  data$bp <- as.numeric(data$bp)
  data$id <- reorder(data$id, data$bp)
  ggout <- ggplot(data, aes(x = bp, y = percent, group = id)) +
    geom_line(lwd = line_width, alpha = alpha, color = line_color) +
    labs(y = "% RNA fragments", x = "Fragment length") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  return(ggout)
}

#' Plot Salmon mapping statistics as a scatterplot
#'
#' Create a read count total vs. reads mapped scatterplot.
#'
#' @param data A data frame Salmon mapping data in the same format as `salmon` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @inheritParams plot_seq_depth_vs_lib_complexity
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_salmon_stats <- function(data,
                              point_size = 4,
                              fill = "red4",
                              alpha = 0.75,
                              ids = NULL,
                              invert = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  axis_min <- min(data$num_mapped / 1000000, data$num_processed / 1000000, na.rm = T) * 0.85
  axis_max <- max(data$num_mapped / 1000000, data$num_processed / 1000000, na.rm = T) * 1.15
  ggout <- ggplot(data, aes(x = num_mapped / 1000000, y = num_processed / 1000000)) +
    geom_point(shape = 21, size = point_size, fill = fill, col = "white", alpha = alpha) +
    geom_abline(intercept = 0, slope = 1, size = 1, lty = 3, color = "gray60") +
    xlim(axis_min, axis_max) +
    ylim(axis_min, axis_max) +
    labs(y = "Reads input (millions)", x = "Reads mapped (millions)") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  return(ggout)
}

#' Plot Salmon mapping percentage
#'
#' Create a Salmon mapping percentage histogram and boxplot. The two plots are intended to be used
#' as a single column, two row figure panel.
#'
#' @param data A data frame with Salmon mapping data in the same format as `salmon` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match rownames
#' in `data`.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}} \code{\link{hist_boxplot2}}
#' @export
plot_salmon_mapping_pct <- function(data,
                                    group = NULL,
                                    ids = NULL,
                                    invert = F,
                                    return_data = F,
                                    binsize = NULL,
                                    colors = NULL,
                                    hist_alpha = 0.75,
                                    box_fill = "goldenrod",
                                    box_alpha = 0.5,
                                    box_lwd = 1,
                                    jitter_alpha = 0.75,
                                    jitter_size = 1.75,
                                    x_title = "% reads mapped",
                                    y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  plot_data <- data.frame(values = data$percent_mapped)
  rownames(plot_data) <- rownames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    data = plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Plot Salmon mapped read count
#'
#' Create a Salmon mapped read count histogram and boxplot. The two plots are intended to be used
#' as a single column, two row figure panel.
#'
#' @inheritParams plot_salmon_mapping_pct
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_salmon_mapped_reads <- function(data,
                                     group = NULL,
                                     ids = NULL,
                                     invert = F,
                                     return_data = F,
                                     binsize = NULL,
                                     colors = NULL,
                                     hist_alpha = 0.75,
                                     box_fill = "goldenrod",
                                     box_alpha = 0.5,
                                     box_lwd = 1,
                                     jitter_alpha = 0.75,
                                     jitter_size = 1.75,
                                     x_title = "Reads mapped (millions)",
                                     y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  plot_data <- data.frame(values = data$num_mapped / 1000000)
  rownames(plot_data) <- rownames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Plot Salmon percent decoy fragments
#'
#' Create a Salmon decoy mapping rate histogram and boxplot. The two plots are intended to be used
#' as a single column, two row figure panel.
#'
#' @inheritParams plot_salmon_mapping_pct
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso `\link{load_paired_end_qc_data}`
#' @export
plot_decoy_mapping_rate <- function(data,
                                    group = NULL,
                                    ids = NULL,
                                    invert = F,
                                    return_data = F,
                                    binsize = NULL,
                                    colors = NULL,
                                    hist_alpha = 0.75,
                                    box_fill = "goldenrod",
                                    box_alpha = 0.5,
                                    box_lwd = 1,
                                    jitter_alpha = 0.75,
                                    jitter_size = 1.75,
                                    x_title = "% mapped to decoys",
                                    y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  plot_data <- data.frame(values = (data$num_decoy_fragments / data$num_processed) * 100)
  rownames(plot_data) <- rownames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Plot Salmon percent dovetail fragments
#'
#' Create a Salmon dovetail fragment percentage histogram and boxplot. The two plots are intended
#' to be used as a single column, two row figure panel.
#'
#' @inheritParams plot_salmon_mapping_pct
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso `\link{load_paired_end_qc_data}`
#' @export
plot_dovetail_mapping_rate <- function(data,
                                       group = NULL,
                                       ids = NULL,
                                       invert = F,
                                       return_data = F,
                                       binsize = NULL,
                                       colors = NULL,
                                       hist_alpha = 0.75,
                                       box_fill = "goldenrod",
                                       box_alpha = 0.5,
                                       box_lwd = 1,
                                       jitter_alpha = 0.75,
                                       jitter_size = 1.75,
                                       x_title = "% dovetail fragments",
                                       y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  plot_data <- data.frame(values = (data$num_dovetail_fragments / data$num_processed) * 100)
  rownames(plot_data) <- rownames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Plot Salmon percent score filtered fragments
#'
#' Create a Salmon score-based filtering rate histogram and boxplot. The two plots are intended to
#' be used as a single column, two row figure panel.
#'
#' @inheritParams plot_salmon_mapping_pct
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso `\link{load_paired_end_qc_data}`
#' @export
plot_salmon_score_filtered_rate <- function(data,
                                            group = NULL,
                                            ids = NULL,
                                            invert = F,
                                            return_data = F,
                                            binsize = NULL,
                                            colors = NULL,
                                            hist_alpha = 0.75,
                                            box_fill = "goldenrod",
                                            box_alpha = 0.5,
                                            box_lwd = 1,
                                            jitter_alpha = 0.75,
                                            jitter_size = 1.75,
                                            x_title = "% low-score filtered fragments",
                                            y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  plot_data <- data.frame(values = (data$num_fragments_filtered_vm / data$num_processed) * 100)
  rownames(plot_data) <- rownames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Plot Salmon failed mapping count
#'
#' Create a Salmon failed mapping histogram and boxplot. The two plots are intended to be used as
#' a single column, two row figure panel.
#'
#' @inheritParams plot_salmon_mapping_pct
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso `\link{load_paired_end_qc_data}`
#' @export
plot_failed_mapping_count <- function(data,
                                      group = NULL,
                                      ids = NULL,
                                      invert = F,
                                      return_data = F,
                                      binsize = NULL,
                                      colors = NULL,
                                      hist_alpha = 0.75,
                                      box_fill = "goldenrod",
                                      box_alpha = 0.5,
                                      box_lwd = 1,
                                      jitter_alpha = 0.75,
                                      jitter_size = 1.75,
                                      x_title = paste0(
                                        c(
                                          "Number of failed mappings",
                                          "for mapped fragments (millions)"
                                        ),
                                        collapse = "\n"
                                      ),
                                      y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  failed_mappings <- data$num_alignments_below_threshold_for_mapped_fragments_vm
  plot_data <- data.frame(values = failed_mappings / 1000000)
  rownames(plot_data) <- rownames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Plot Salmon mapping category percentages
#'
#' Create barplots with percentages for types of Salmon mappings.
#'
#' Create barplots with the percentage of reads/read pairs input into Salmon that were
#' mapped to the transcriptome, decoy fragments, as dovetails, and as low-score filtered mappings.
#' Categories are not mutually exclusive. X-axes are ordered the same across plots.
#'
#' @param data A data frame with Salmon mapping data in the same format as `salmon` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param fill A string vector with 4 colors corresponding to bars for the barplots for the
#' transcriptome, decoys, dovetails, and filtered mappings respectively.
#' @param sort A logical. Should the x-axis be ordered by transcriptome mapping rate?
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A list with four ggplot objects `transcriptome`, `decoy`, `dovetail`, and `filtered`
#' that correspond to the plots for the transcriptome, decoys, dovetails, and filtered mappings
#' respectively.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_salmon_categories <- function(data,
                                   fill = brewer.pal(n = 4, name = "Spectral"),
                                   sort = F,
                                   ids = NULL,
                                   invert = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  plot_data <- data.frame(id = rownames(data))
  plot_data$transcriptome <- data$num_mapped / data$num_processed * 100
  plot_data$decoy <- data$num_decoy_fragments / data$num_processed * 100
  plot_data$dovetail <- data$num_dovetail_fragments / data$num_processed * 100
  plot_data$filtered <- data$num_fragments_filtered_vm / data$num_processed * 100
  if (sort) {
    plot_data$id <- reorder(plot_data$id, plot_data$transcriptome)
  }
  ggout_list <- list()
  ggout_list$transcriptome <- ggplot(plot_data, aes(x = id, y = transcriptome)) +
    geom_bar(position = "dodge", stat = "identity", color = "black", lwd = 0.5, fill = fill[1]) +
    ylim(0, 100) +
    labs(x = "Samples", y = "% of fragments", title = "Transcriptome mappings") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      legend.title = element_blank(),
      legend.text = element_blank(),
      axis.text = element_text(size = 18),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  ggout_list$decoy <- ggplot(plot_data, aes(x = id, y = decoy)) +
    geom_bar(position = "dodge", stat = "identity", color = "black", lwd = 0.5, fill = fill[2]) +
    ylim(0, 100) +
    labs(x = "Samples", y = "% of fragments", title = "Decoy mappings") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      legend.title = element_blank(),
      legend.text = element_blank(),
      axis.text = element_text(size = 18),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  ggout_list$dovetail <- ggplot(plot_data, aes(x = id, y = dovetail)) +
    geom_bar(position = "dodge", stat = "identity", color = "black", lwd = 0.5, fill = fill[3]) +
    ylim(0, 100) +
    labs(x = "Samples", y = "% of fragments", title = "Dovetail mappings") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      legend.title = element_blank(),
      legend.text = element_blank(),
      axis.text = element_text(size = 18),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  ggout_list$filtered <- ggplot(plot_data, aes(x = id, y = filtered)) +
    geom_bar(position = "dodge", stat = "identity", color = "black", lwd = 0.5, fill = fill[4]) +
    ylim(0, 100) +
    labs(x = "Samples", y = "% of fragments", title = "Low-score filtered mappings") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      legend.title = element_blank(),
      legend.text = element_blank(),
      axis.text = element_text(size = 18),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  return(ggout_list)
}

#' Plot HISAT2 mapping statistics as a scatterplot
#'
#' Create a concordant alignment vs. overall alignment rate scatterplot. Only applicable for
#' paired-end sequencing data.
#'
#' @param data A data frame HISAT2 mapping data in the same format as `hisat2` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @inheritParams plot_seq_depth_vs_lib_complexity
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_hisat2_stats <- function(data,
                              point_size = 4,
                              fill = "red4",
                              alpha = 0.75,
                              ids = NULL,
                              invert = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  data$overall_alignment_rate <- as.numeric(data$overall_alignment_rate)
  data$revised_alignment_rate <- as.numeric(data$revised_alignment_rate)
  axis_min <- min(data$overall_alignment_rate, data$revised_alignment_rate, na.rm = T) * 0.85
  axis_min <- max(axis_min, 0, na.rm = T)
  axis_max <- max(data$overall_alignment_rate, data$revised_alignment_rate, na.rm = T) * 1.15
  axis_max <- min(axis_max, 1, na.rm = T)
  ggout <- ggplot(data, aes(x = overall_alignment_rate, y = revised_alignment_rate)) +
    geom_point(shape = 21, size = point_size, fill = fill, col = "white", alpha = alpha) +
    geom_abline(intercept = 0, slope = 1, size = 1, lty = 3, color = "gray60") +
    xlim(axis_min, axis_max) +
    ylim(axis_min, axis_max) +
    labs(y = "Concordant alignment rate", x = "Overall alignment rate") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  return(ggout)
}

#' Plot HISAT2 mapping percentage
#'
#' Create a HISAT2 concordant alignment rate histogram and boxplot. The two plots are intended to
#' be used as a single column, two row figure panel. Only applicable for paired-end sequencing
#' data.
#'
#' @param data A data frame with HISAT2 mapping data in the same format as `hisat2` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @inheritParams hist_boxplot
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_hisat2_mapping_rate <- function(data,
                                     ids = NULL,
                                     invert = F,
                                     binsize = NULL,
                                     hist_fill = "gray10",
                                     hist_alpha = 0.75,
                                     box_fill = "goldenrod",
                                     box_alpha = 0.5,
                                     box_lwd = 1,
                                     jitter_color = "gray30",
                                     jitter_alpha = 0.75,
                                     jitter_size = 1.75,
                                     x_title = "Concordant alignment rate",
                                     y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  data$revised_alignment_rate <- as.numeric(data$revised_alignment_rate)
  if (is.null(binsize)) {
    binsize <- diff(range(data$revised_alignment_rate)) / 25
  }
  ggout_list <- hist_boxplot(
    data[, "revised_alignment_rate", drop = F],
    binsize = binsize,
    hist_fill = hist_fill,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_color = jitter_color,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  return(ggout_list)
}


#' Plot HISAT2 mapped read count
#'
#' Create a HISAT2 mapped read count histogram and boxplot. The two plots are intended to be used
#' as a single column, two row figure panel. Only applicable for paired-end sequencing data.
#'
#' @param data A data frame with HISAT2 mapping data in the same format as `hisat2` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @inheritParams plot_hisat2_mapping_rate
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_hisat2_mapped_reads <- function(data,
                                     ids = NULL,
                                     invert = F,
                                     binsize = NULL,
                                     hist_fill = "gray10",
                                     hist_alpha = 0.75,
                                     box_fill = "goldenrod",
                                     box_alpha = 0.5,
                                     box_lwd = 1,
                                     jitter_color = "gray30",
                                     jitter_alpha = 0.75,
                                     jitter_size = 1.75,
                                     x_title = "Reads pairs aligned (millions)",
                                     y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  data$revised_alignment_rate <- as.numeric(data$revised_alignment_rate)
  data$paired_total <- as.numeric(data$paired_total)
  data$aligned_reads_count <- (data$paired_total * data$revised_alignment_rate) / 1000000
  if (is.null(binsize)) {
    binsize <- diff(range(data$aligned_reads_count)) / 25
  }
  plot_data <- data[, "aligned_reads_count", drop = F]
  ggout_list <- hist_boxplot(
    plot_data,
    binsize = binsize,
    hist_fill = hist_fill,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_color = jitter_color,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  return(ggout_list)
}

#' Plot HISAT2 vs. Salmon mapping rates
#'
#' Create a HISAT2 vs Salmon mapping rate scatterplot. Only applicable for
#' paired-end sequencing data.
#'
#' Creates a scatterplot comparing mapping rates between HISAT2 and Salmon for samples that have
#' matchable IDs between data from the two tools. IDs are formatted  for matching based on the
#' assumption that sample names are equivalent up until the first `|` delimiter (if any) in the
#' name.
#'
#' @param hisat_data A data frame of HISAT2 mapping data in the same format as `hisat2` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param salmon_data A data frame of Salmon mapping data in the same format as `hisat2` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param ids A vector of rownames for subsetting `hisat_data` and `salmon_data` for plotting.
#' @inheritParams plot_seq_depth_vs_lib_complexity
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_hisat2_vs_salmon <- function(hisat_data,
                                  salmon_data,
                                  point_size = 4,
                                  fill = "red4",
                                  alpha = 0.75,
                                  ids = NULL,
                                  invert = F) {
  if (!is.null(ids)) {
    if (invert) {
      hisat_data <- hisat_data[!rownames(hisat_data) %in% ids, , drop = F]
      salmon_data <- salmon_data[!rownames(salmon_data) %in% ids, , drop = F]
    } else {
      hisat_data <- hisat_data[rownames(hisat_data) %in% ids, , drop = F]
      salmon_data <- salmon_data[rownames(salmon_data) %in% ids, , drop = F]
    }
  }
  hisat_data$revised_alignment_rate <- as.numeric(hisat_data$revised_alignment_rate)
  hisat_data$id <- rownames(hisat_data)
  hisat_data$id <- gsub(hisat_data$id, pattern = " \\|.+$", replacement = "", perl = T)
  hisat_data <- hisat_data[, c("id", "revised_alignment_rate")]
  salmon_data$id <- rownames(salmon_data)
  salmon_data$id <- gsub(salmon_data$id, pattern = " \\|.+$", replacement = "", perl = T)
  salmon_data <- salmon_data[, c("id", "percent_mapped")]
  plot_data <- merge(x = hisat_data, y = salmon_data, by = "id")
  plot_data$salmon_rate <- plot_data$percent_mapped / 100

  axis_min <- min(plot_data$revised_alignment_rate, plot_data$salmon_rate, na.rm = T) * 0.85
  axis_min <- max(axis_min, 0, na.rm = T)
  axis_max <- max(plot_data$revised_alignment_rate, plot_data$salmon_rate, na.rm = T) * 1.15
  axis_max <- min(axis_max, 1, na.rm = T)
  ggout <- ggplot(plot_data, aes(x = salmon_rate, y = revised_alignment_rate)) +
    geom_point(shape = 21, size = point_size, fill = fill, col = "white", alpha = alpha) +
    geom_abline(intercept = 0, slope = 1, size = 1, lty = 3, color = "gray60") +
    xlim(axis_min, axis_max) +
    ylim(axis_min, axis_max) +
    labs(x = "Salmon mapping rate", y = "HISAT2 mapping rate") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  return(ggout)
}

#' Splice junction saturation plot
#'
#' Create a splice junction saturation line plot.
#'
#' @param data A data frame with splice junction data in the same format as `known_junction`
#' or `novel_junction` returned by \code{\link{load_paired_end_qc_data}}.
#' @inheritParams plot_frag_length
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_junction_saturation <- function(data,
                                     line_width = 1.5,
                                     line_color = "red4",
                                     alpha = 0.5,
                                     ids = NULL,
                                     invert = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  data$id <- rownames(data)
  data <- melt(
    data,
    id.vars = "id",
    variable.name = "percent",
    value.name = "count",
    factorsAsStrings = F
  )
  data$percent <- gsub(
    data$percent,
    pattern = "percent_",
    replacement = "",
    fixed = T
  )
  data$percent <- as.numeric(data$percent)
  data$count <- data$count / 1000
  data$id <- reorder(data$id, data$percent)
  ggout <- ggplot(data, aes(x = percent, y = count, group = id)) +
    geom_line(lwd = line_width, alpha = alpha, color = line_color) +
    labs(y = "Recovered splice junctions\n(thousands)", x = "% of reads") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  return(ggout)
}

#' Read pair inner distance distribution
#'
#' Create a paired-end read inner distance line plot. Applicable only to paired-end sequencing
#' data.
#'
#' @param data A data frame with fragment length data in the same format as `inner_dist` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param line_width A numeric. The line size for plot lines.
#' @param line_color A string. The color for plot lines.
#' @param alpha A numeric. The alpha level for data points.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_inner_dist <- function(data,
                            line_width = 1.5,
                            line_color = "red4",
                            alpha = 0.5,
                            ids = NULL,
                            invert = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  data$id <- rownames(data)
  data <- melt(
    data,
    id.vars = "id",
    variable.name = "distance",
    value.name = "count",
    factorsAsStrings = F
  )
  data$distance <- gsub(
    data$distance,
    pattern = "BP_.",
    replacement = "-",
    fixed = T
  )
  data$distance <- gsub(
    data$distance,
    pattern = "BP_",
    replacement = "",
    fixed = T
  )
  data$distance <- as.numeric(data$distance)
  data$count <- data$count / 1000
  data$id <- reorder(data$id, data$distance)
  ggout <- ggplot(data, aes(x = distance, y = count, group = id)) +
    geom_line(lwd = line_width, alpha = alpha, color = line_color) +
    labs(y = "Alignment count (thousands)", x = "Distance between read pairs") +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1)
    )
  return(ggout)
}

#' Plot annotation category percentages for alignments
#'
#' Create stacked barplots with percentages for types of genomic annotations.
#'
#' Create barplots with the percentage of alignments that map to a genomic region with annotations
#' of intergenic, proximally down and upstream of genes, intronic, UTRs or coding sequence.
#' Includes the option to consolidate categories into more general categories for intergenic,
#' intronic, and exonic regions.
#'
#' @param data A data frame with annotation data in the same format as `rseqc_alignment_category`
#' returned by \code{\link{load_paired_end_qc_data}}.
#' @param fill A string vector with 3 or 6 colors corresponding to cateogires for the barplot.
#' @param sort A logical. Should the x-axis be ordered by coding sequence/exonic alignment
#' percentage?
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param consolidate A logical. Should mapping categories be consolidated into the three more
#' general categories of exonic, intronic, and intergenic?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @return A ggplot object.  If `return_data` is `TRUE`, then a data frame with the plot data
#' values for each genomic annotation category as columns.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_mapping_categories <- function(data,
                                    fill = NULL,
                                    sort = F,
                                    ids = NULL,
                                    invert = F,
                                    consolidate = F,
                                    return_data = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  if (sort) {
    data$id <- reorder(data$id, data$CDS)
  }
  if (consolidate) {
    intergenic_col_ids <- c("Other_intergenic", "Downstream_10kb", "Upstream_10kb")
    intronic_col_ids <- c("Intron")
    exonic_col_ids <- c("UTR_3", "UTR_5", "CDS")
    intergenic_index <- which(colnames(data) %in% intergenic_col_ids)
    intronic_index <- which(colnames(data) %in% intronic_col_ids)
    exonic_index <- which(colnames(data) %in% exonic_col_ids)
    sample_index <- which(colnames(data) == "id")
    data <- t(
      apply(
        data,
        1,
        function(x) {
          new_row <- data.frame(
            id = x[sample_index],
            Intergenic = sum(as.numeric(x[intergenic_index])),
            Intronic = sum(as.numeric(x[intronic_index])),
            Exonic = sum(as.numeric(x[exonic_index]))
          )
          return(new_row)
        }
      )
    )
    data <- as.data.frame(do.call("rbind", data))
    if (sort) {
      data$id <- reorder(data$id, data$Exonic)
    }
  }
  if (return_data) {
    rownames(data) <- data$id
    data <- data[, -1 * which(colnames(data) == "id")]
    return(data)
  }
  plot_data <- melt(
    data,
    id.vars = "id",
    variable.name = "type",
    value.name = "percent",
    factorsAsStrings = T
  )
  ggout <- ggplot(plot_data, aes(y = percent, x = id, fill = type)) +
    geom_bar(position = "stack", stat = "identity", color = NA, lwd = 0) +
    labs(x = "Samples", y = "% primary alignments", fill = "") +
    geom_hline(
      yintercept = c(12.5, 25, 37.5, 50, 62.5, 75, 87.5),
      lty = 3,
      lwd = 1,
      color = "white",
      alpha = 0.5
    ) +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
      title = element_text(size = 16),
      axis.text = element_text(size = 18),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.title = element_text(size = 16),
      axis.title.y = element_text(vjust = 3),
      axis.title.x = element_text(vjust = -1),
      legend.title = element_text(size = 16),
      legend.text = element_text(size = 16)
    )
  if (is.null(fill)) {
    if (consolidate) {
      ggout <- ggout + scale_fill_manual(values = c("#3b7854", "#d49b00", "#30485e"))
    } else {
      ggout <- ggout + scale_fill_brewer(palette = "Spectral")
    }
  } else {
    ggout <- ggout + scale_fill_manual(values = fill)
  }
  return(ggout)
}

#' ggplot2-based PCA plot
#'
#' Create an eigenvectors scatterplot from principal component analysis.
#'
#' Conducts a principal component analysis (PCA) then produces a scatterplot using the
#' user-specified eigenvectors (ie., principal components or PCs) from the eigenvector matrix. Data
#' points can be optionally color-coded based on a user-specified variable.
#'
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @inheritParams ggpca
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' user-specified PCs, grouping variable, and an attribute for the percent variace explained for
#' each user-specified PC.
#' @seealso \code{\link{prcomp}}, \code{\link{SummarizedExperiment}}, \code{\link{assay}},
#' \code{\link{colData}}
#' @export
plot_pca <- function(data,
                     ids = NULL,
                     invert = F,
                     group_var = NULL,
                     pc_x = 1,
                     pc_y = 2,
                     ntop = nrow(data),
                     center = T,
                     scale = T,
                     equal_axes = F,
                     point_size = 3,
                     alpha = 0.75,
                     color = "red4",
                     palette = "Paired",
                     return_data = F) {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[, !colnames(data) %in% ids, drop = F]
    } else {
      data <- data[, colnames(data) %in% ids, drop = F]
    }
  }
  ggout <- ggpca(
    data = data,
    group_var = group_var,
    pc_x = pc_x,
    pc_y = pc_y,
    ntop = ntop,
    center = center,
    scale = scale,
    equal_axes = equal_axes,
    point_size = point_size,
    alpha = alpha,
    color = color,
    palette = palette,
    return_data = return_data
  )
  return(ggout)
}

#' Plot feature set expression fraction
#'
#' Create a feature set fraction histogram and boxplot.
#'
#' Calculates the fraction of feature (e.g., gene) expression represented by a user-specified set
#' of features and plots this as a histogram and boxplot. The two plots are intended to be used as
#' a single column, two row figure panel.
#'
#' @param data A feature (row) by sample (column) data frame or matrix with expression count
#' values.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match
#' rownames in `data`.
#' @param feature_ids A vector of row names for subsetting `data` for plotting.
#' @param sample_ids A vector of column names for subsetting `data` for plotting.
#' @param invert_rows A logical. Should `feature_ids` be used for excluding rows from plotting
#' instead?
#' @param invert_cols A logical. Should `sample_ids` be used for excluding columns from plotting
#' instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}} \code{\link{hist_boxplot2}}
#' @export
plot_expression_fraction <- function(data,
                                     group = NULL,
                                     feature_ids = NULL,
                                     sample_ids = NULL,
                                     invert_rows = F,
                                     invert_cols = F,
                                     return_data = F,
                                     binsize = NULL,
                                     colors = NULL,
                                     hist_alpha = 0.75,
                                     box_fill = "goldenrod",
                                     box_alpha = 0.5,
                                     box_lwd = 1,
                                     jitter_alpha = 0.75,
                                     jitter_size = 1.75,
                                     x_title = "% of total expression",
                                     y_title = "Sample count") {
  if (!is.null(sample_ids)) {
    if (invert_cols) {
      data <- data[, !colnames(data) %in% sample_ids, drop = F]
    } else {
      data <- data[, colnames(data) %in% sample_ids, drop = F]
    }
  }
  total_expr <- colSums(data, na.rm = F)
  if (!is.null(feature_ids)) {
    if (invert_rows) {
      data <- data[!rownames(data) %in% feature_ids, ]
    } else {
      data <- data[rownames(data) %in% feature_ids, ]
    }
  }
  subset_expr <- colSums(data, na.rm = F)
  expr_pct <- (subset_expr / total_expr) * 100
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, ncol(data))))
    rownames(group) <- colnames(data)
  }
  plot_data <- data.frame(values = expr_pct)
  rownames(plot_data) <- colnames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (ncol(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Plot shannon diversity index
#'
#' Create a shannon index histogram and boxplot.
#'
#' Calculates the shannon index for a user-specified set of expression features (e.g., genes) and
#' plots this as a histogram and boxplot. The two plots are intended to be used as a single column,
#' two row figure panel.
#'
#' @param min_value The minimum value (i.e., expression level) in `data` for which greater values
#' are considered in the shannon index calculation.
#' @inheritParams plot_expression_fraction
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}} \code{\link{hist_boxplot2}}
#' @export
plot_shannon_index <- function(data,
                               group = NULL,
                               min_value = 0,
                               feature_ids = NULL,
                               sample_ids = NULL,
                               invert_rows = F,
                               invert_cols = F,
                               return_data = F,
                               binsize = NULL,
                               colors = NULL,
                               hist_alpha = 0.75,
                               box_fill = "goldenrod",
                               box_alpha = 0.5,
                               box_lwd = 1,
                               jitter_alpha = 0.75,
                               jitter_size = 1.75,
                               x_title = "Shannon index",
                               y_title = "Sample count") {
  if (!is.null(sample_ids)) {
    if (invert_cols) {
      data <- data[, !colnames(data) %in% sample_ids, drop = F]
    } else {
      data <- data[, colnames(data) %in% sample_ids, drop = F]
    }
  }
  if (!is.null(feature_ids)) {
    if (invert_rows) {
      data <- data[!rownames(data) %in% feature_ids, ]
    } else {
      data <- data[rownames(data) %in% feature_ids, ]
    }
  }
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, ncol(data))))
    rownames(group) <- colnames(data)
  }

  diversity <- apply(
    data,
    2,
    function(sample_data) {
      sample_data <- sample_data[which(sample_data > min_value)]
      p <- sample_data / sum(sample_data, na.rm = T)
      shannon_index <- -1 * sum(p * log(p))
      return(shannon_index)
    }
  )
  plot_data <- data.frame(values = diversity, row.names = colnames(data))
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (ncol(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Plot partitioned mean expression
#'
#' Create a histogram and boxplot with per sample mean expression levels partitioned by a
#' categorical variable.
#'
#' Calculates the mean across a user-specified set of expression features (e.g., genes) for each
#' sample and plots this as a histogram and boxplot. The two plots are intended to be used as a
#' single column, two row figure panel. Data points in the plots are partitioned by a categorical
#' variable.
#'
#' @param data A SummarizedExperiment-like object. Must be compatible with `assay()` and
#' `colData()`.
#' @param group_var A string for the categorical variable to use for partitioning.
#' @param feature_ids A vector of row names for subsetting `data` for plotting.
#' @param sample_ids A vector of column names for subsetting `data` for plotting.
#' @param invert_rows A logical. Should `feature_ids` be used for excluding rows from plotting
#' instead?
#' @param invert_cols A logical. Should `sample_ids` be used for excluding columns from plotting
#' instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{hist_boxplot2}}
#' @export
plot_partitioned_mean_expression <- function(data,
                                             group_var,
                                             feature_ids = NULL,
                                             sample_ids = NULL,
                                             invert_rows = F,
                                             invert_cols = F,
                                             return_data = F,
                                             binsize = NULL,
                                             colors = NULL,
                                             hist_alpha = 0.75,
                                             box_fill = "gray50",
                                             box_alpha = 0.5,
                                             box_lwd = 1,
                                             jitter_alpha = 0.5,
                                             jitter_size = 1.75,
                                             x_title = "Mean expression",
                                             y_title = "Sample count") {
  if (!is.null(sample_ids)) {
    if (invert_cols) {
      data <- data[, !colnames(data) %in% sample_ids, drop = F]
    } else {
      data <- data[, colnames(data) %in% sample_ids, drop = F]
    }
  }
  if (!is.null(feature_ids)) {
    if (invert_rows) {
      data <- data[!rownames(data) %in% feature_ids, ]
    } else {
      data <- data[rownames(data) %in% feature_ids, ]
    }
  }
  plot_data <- data.frame(
    values = colMeans(assay(data)),
    group = colData(data)[, group_var, drop = T]
  )
  rownames(plot_data) <- colnames(data)
  if (return_data) {
    return(plot_data)
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  ggout_list <- hist_boxplot2(
    data = plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  return(ggout_list)
}

#' Plot partitioned principal component
#'
#' Create a histogram and boxplot with per sample principal component values partitioned by a
#' categorical variable.
#'
#' Runs PCA using a user-specified set of expression features (e.g., genes) and plots a
#' user-specified PC as a histogram and boxplot. The two plots are intended to be used as a
#' single column, two row figure panel. Data points in the plots are partitioned by a categorical
#' variable.
#'
#' @param pc An integer for the principal component (ranked from highest to lowest variance
#' contribution) to use for plotting.
#' @param center A logical. Should the data be zero-centered prior to PCA?
#' @param scale A logical. Should the data be scaled to unit variance prior to PCA?
#' @inheritParams plot_partitioned_mean_expression
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{hist_boxplot2}} \code{\link{prcomp}}
#' @export
plot_partitioned_pc <- function(data,
                                pc = 1,
                                group_var,
                                center = T,
                                scale = T,
                                feature_ids = NULL,
                                sample_ids = NULL,
                                invert_rows = F,
                                invert_cols = F,
                                return_data = F,
                                binsize = NULL,
                                colors = NULL,
                                hist_alpha = 0.75,
                                box_fill = "gray50",
                                box_alpha = 0.5,
                                box_lwd = 1,
                                jitter_alpha = 0.5,
                                jitter_size = 1.75,
                                x_title = NULL,
                                y_title = "Sample count") {
  if (!is.null(sample_ids)) {
    if (invert_cols) {
      data <- data[, !colnames(data) %in% sample_ids, drop = F]
    } else {
      data <- data[, colnames(data) %in% sample_ids, drop = F]
    }
  }
  if (!is.null(feature_ids)) {
    if (invert_rows) {
      data <- data[!rownames(data) %in% feature_ids, ]
    } else {
      data <- data[rownames(data) %in% feature_ids, ]
    }
  }
  pc_name <- paste0("PC", pc)
  pca <- prcomp(t(SummarizedExperiment::assay(data)), center = center, scale. = scale)
  pct_var <- pca$sdev[pc]^2 / sum(pca$sdev^2)
  plot_data <- data.frame(
    values = pca$x[, pc],
    group = SummarizedExperiment::colData(data)[, group_var, drop = T]
  )
  rownames(plot_data) <- colnames(data)
  if (return_data) {
    return(plot_data)
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (is.null(x_title)) {
    x_title <- paste0(pc_name, " (", round(pct_var * 100), "% variance)")
  }
  ggout_list <- hist_boxplot2(
    data = plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  return(ggout_list)
}

#' Plot RIN scores
#'
#' Create a histogram and boxplot of RIN scores. The two plots are intended to be used as a single
#' column, two row figure panel.
#'
#' @param data A single column data frame with RIN score values and sample IDs as rownames.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match
#' rownames in `data`.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{hist_boxplot2}}
#' @export
plot_rin <- function(data,
                     group = NULL,
                     ids = NULL,
                     invert = F,
                     return_data = F,
                     binsize = NULL,
                     colors = NULL,
                     hist_alpha = 0.75,
                     box_fill = "goldenrod",
                     box_alpha = 0.5,
                     box_lwd = 1,
                     jitter_alpha = 0.75,
                     jitter_size = 1.75,
                     x_title = "RIN",
                     y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, , drop = F]
    } else {
      data <- data[rownames(data) %in% ids, , drop = F]
    }
  }
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  plot_data <- data.frame(values = data[, 1])
  rownames(plot_data) <- rownames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Plot intergenic-genic ratio
#'
#' Create an intergenic-genic mapping proportion histogram and boxplot.
#'
#' Calculates the ratio between intergenic and genic mapping rates and plots this as a histogram
#' and boxplot. The two plots are intended to be used as a single column, two row figure panel. The
#' intergenic and genic mapping rates can be calculated uing
#' \code{\link{plot_mapping_categories}}.
#'
#' @param data A three column data frame with intergenic, intronic, and exonic mapping rates as 
#' the first, second, and third column, respectively.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match
#' rownames in `data`.
#' @param ids A vector of row names for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{plot_mapping_categories}} \code{\link{hist_boxplot2}}
#' @export
plot_intergenic_genic_ratio <- function(data,
                                        group = NULL,
                                        ids = NULL,
                                        invert = F,
                                        return_data = F,
                                        binsize = NULL,
                                        colors = NULL,
                                        hist_alpha = 0.75,
                                        box_fill = "goldenrod",
                                        box_alpha = 0.5,
                                        box_lwd = 1,
                                        jitter_alpha = 0.75,
                                        jitter_size = 1.75,
                                        x_title = "Intergenic-genic mappping ratio",
                                        y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, ]
    } else {
      data <- data[rownames(data) %in% ids, ]
    }
  }
  colnames(data) <- c("intergenic", "intronic", "exonic")
  contamination_ratio <- data$intergenic / (data$intronic + data$exonic)
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  plot_data <- data.frame(values = contamination_ratio)
  rownames(plot_data) <- rownames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Plot DNA contamination ratio
#'
#' Create an intergenic-intronic mapping proportion histogram and boxplot.
#'
#' Calculates the ratio between intergenic and intronic mapping rates and plots this as a histogram
#' and boxplot. The two plots are intended to be used as a single column, two row figure panel. The
#' intergenic and intronic mapping rates can be calculated uing
#' \code{\link{plot_mapping_categories}}.
#'
#' @param data A two column data frame with intergenic mapping rates as the first column and
#' intronic mapping rates as the second column.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match
#' rownames in `data`.
#' @param ids A vector of row names for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{plot_mapping_categories}} \code{\link{hist_boxplot2}}
#' @export
plot_dna_contamination_ratio <- function(data,
                                         group = NULL,
                                         ids = NULL,
                                         invert = F,
                                         return_data = F,
                                         binsize = NULL,
                                         colors = NULL,
                                         hist_alpha = 0.75,
                                         box_fill = "goldenrod",
                                         box_alpha = 0.5,
                                         box_lwd = 1,
                                         jitter_alpha = 0.75,
                                         jitter_size = 1.75,
                                         x_title = "Intergenic-intronic mappping ratio",
                                         y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, ]
    } else {
      data <- data[rownames(data) %in% ids, ]
    }
  }
  colnames(data) <- c("intergenic", "intronic")
  mapping_ratio <- data$intergenic / data$intronic
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  plot_data <- data.frame(values = mapping_ratio)
  rownames(plot_data) <- rownames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}

#' Plot gene mapping rate
#'
#' Create a gene mapping rate histogram and boxplot.
#'
#' Calculates the gene mapping rate (combined exonic and intronic mapping rates) and plots this as
#' a histogram and boxplot. The two plots are intended to be used as a single column, two row
#' figure panel. The exonic and intronic mapping rates can be calculated uing
#' \code{\link{plot_mapping_categories}}.
#'
#' @param data A two column data frame with exonic mapping rates as the first column and
#' intronic mapping rates as the second column.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match
#' rownames in `data`.
#' @param ids A vector of row names for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{plot_mapping_categories}} \code{\link{hist_boxplot2}}
#' @export
plot_gene_mapping_rate <- function(data,
                                   group = NULL,
                                   ids = NULL,
                                   invert = F,
                                   return_data = F,
                                   binsize = NULL,
                                   colors = NULL,
                                   hist_alpha = 0.75,
                                   box_fill = "goldenrod",
                                   box_alpha = 0.5,
                                   box_lwd = 1,
                                   jitter_alpha = 0.75,
                                   jitter_size = 1.75,
                                   x_title = "Gene mappping %",
                                   y_title = "Sample count") {
  if (!is.null(ids)) {
    if (invert) {
      data <- data[!rownames(data) %in% ids, ]
    } else {
      data <- data[rownames(data) %in% ids, ]
    }
  }
  colnames(data) <- c("exonic", "intronic")
  mapping_pct <- (data$exonic + data$intronic)
  group0 <- group
  if (is.null(group)) {
    group <- data.frame(group = factor(rep(1, nrow(data))))
    rownames(group) <- rownames(data)
  }
  plot_data <- data.frame(values = mapping_pct)
  rownames(plot_data) <- rownames(data)
  plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
  rownames(plot_data) <- plot_data[, 1]
  plot_data <- plot_data[, -1] # remove new row name column
  plot_data$values <- as.numeric(plot_data$values)
  if (nrow(data) > nrow(plot_data)) {
    warning("Not all samples from 'data' have a value in 'group'")
  }
  if (is.null(binsize)) {
    binsize <- diff(range(plot_data$values)) / 25
  }
  if (return_data) {
    return(plot_data)
  }
  ggout_list <- hist_boxplot2(
    plot_data,
    binsize = binsize,
    colors = colors,
    hist_alpha = hist_alpha,
    box_fill = box_fill,
    box_alpha = box_alpha,
    box_lwd = box_lwd,
    jitter_alpha = jitter_alpha,
    jitter_size = jitter_size,
    x_title = x_title,
    y_title = y_title
  )
  if (is.null(group0)) {
    ggout_list$hist <- ggout_list$hist +
      theme(legend.position = "none")
    ggout_list$boxplot <- ggout_list$boxplot +
      theme(legend.position = "none")
  }
  return(ggout_list)
}
bryancquach/omixjutsu documentation built on Jan. 29, 2023, 3:47 p.m.