# Tools for munging and visualizing QC metrics for RNA-seq data.
# Author: Bryan Quach (bryancquach@gmail.com)
#' Load QC metrics data from paired-end RNA-seq processing
#'
#' Load QC data files generated by MultiQC for various stages of paired-end RNA-seq data
#' processing using the automated RNA-seq data processing WDL workflow.
#'
#' The automated RNA-seq data processing workflow produces several tables with various metrics.
#' Additionally, several tables can be exported from the MultiQC HTML report. This function will
#' import all of these data into a list for further processing in R. The assumed data files are
#' * `multiqc_fastqc.txt`: An auto-generated workflow file with general FASTQC statistics.
#' * `multiqc_hisat2.txt`: An auto-generated workflow file with HISAT2 alignment statistics.
#' * `multiqc_rseqc_bam_stat.txt`: An auto-generated workflow file with alignment statistics
#' generated by RSeQC.
#' * `multiqc_rseqc_read_distribution.txt`: An auto-generated workflow file with alignment
#' categorizations to different types of genomic regions (introns, exons, integenic, etc.).
#' * `multiqc_salmon.txt`: An auto-generated workflow file with mapping statistics from Salmon.
#' * `multiqc_trimmomatic.txt`: An auto-generated workflow file with trimming and read quality
#' filtering statistics using Trimmomatic.
#' * `rseqc_inner_distance_plot.tsv`: Table exported from the MultiQC HTML report with inner
#' distance statistics between pairs of reads.
#' * `salmon_plot.tsv`: Table exported from the MultiQC HTML report with RNA fragment length
#' distribution statistics as estimated by Salmon.
#' * `fastqc_per_base_sequence_quality_plot.tsv`: Table exported from the MultiQC HTML report with
#' per base PHRED scores statistics generated by FASTQC.
#' * `fastqc_per_sequence_quality_scores_plot.tsv`: Table exported from the MultiQC HTML report
#' with per sequence PHRED score statistics generated by FASTQC.
#' * `fastqc_per_sequence_gc_content_plot.tsv`: Table exported from the MultiQC HTML report with
#' per sequence GC content statistics generated by FASTQC.
#' * `fastqc_sequence_duplication_levels_plot.tsv`: Table exported from the MultiQC HTML report
#' with sequence duplication statistics generated by FASTQC.
#' * `fastqc_adapter_content_plot.tsv`: Table exported from the MultiQC HTML report with adapter
#' content statistics generated by FASTQC.
#' * `rseqc_known_junction_saturation_plot.tsv`: Table exported from the MultiQC HTML report with
#' junction saturation statistics for known splice junctions.
#' * `rseqc_novel_junction_saturation_plot.tsv`: Table exported from the MultiQC HTML report with
#' junction saturation statistics for novel splice junctions.
#'
#' @param data_dir A string denoting the local path where all the data files are stored.
#' @return A list with several elements:
#' * `fastqc`: Data frame imported from `multiqc_fastqc.txt`.
#' * `hisat2`: Data frame imported from `multiqc_hisat2.txt`.
#' * `rseqc_bam`: Data frame imported from `multiqc_rseqc_bam_stat.txt`.
#' * `rseqc_alignment_category`: Data frame imported from `multiqc_rseqc_read_distribution.txt`.
#' * `salmon`: Data frame imported from `multiqc_salmon.txt`.
#' * `trimmomatic`: Data frame imported from `multiqc_trimmomatic.txt`.
#' * `inner_dist`: Data frame imported from `rseqc_inner_distance_plot.tsv`.
#' * `frag_length`: Data frame imported from `salmon_plot.tsv`.
#' * `phred_bp`: Data frame imported from `fastqc_per_base_sequence_quality_plot.tsv`.
#' * `phred_seq`: Data frame imported from `fastqc_per_sequence_quality_scores_plot.tsv`.
#' * `gc_content`: Data frame imported from `fastqc_per_sequence_gc_content_plot.tsv`.
#' * `seq_duplication`: Data frame imported from `fastqc_sequence_duplication_levels_plot.tsv`.
#' * `adapter_content`: Data frame imported from `fastqc_adapter_content_plot.tsv`.
#' * `known_junction`: Data frame imported from `rseqc_known_junction_saturation_plot.tsv`.
#' * `novel_junction`: Data frame imported from `rseqc_novel_junction_saturation_plot.tsv`.
#' @export
load_paired_end_qc_data <- function(data_dir) {
qc_data <- list()
qc_data$fastqc <- read.delim(
file.path(data_dir, "/multiqc_fastqc.txt"),
header = T,
sep = "\t"
)
qc_data$hisat2 <- read.delim(
file.path(data_dir, "/multiqc_hisat2.txt"),
header = T,
sep = "\t"
)
qc_data$rseqc_bam <- read.delim(
file.path(data_dir, "/multiqc_rseqc_bam_stat.txt"),
header = T,
sep = "\t"
)
qc_data$rseqc_alignment_category <- read.delim(
file.path(data_dir, "/multiqc_rseqc_read_distribution.txt"),
header = T,
sep = "\t"
)
qc_data$salmon <- read.delim(
file.path(data_dir, "/multiqc_salmon.txt"),
header = T,
sep = "\t"
)
qc_data$trimmomatic <- read.delim(
file.path(data_dir, "/multiqc_trimmomatic.txt"),
header = T,
sep = "\t"
)
qc_data$inner_dist <- read.delim(
file.path(data_dir, "/rseqc_inner_distance_plot.tsv"),
header = T,
sep = "\t"
)
qc_data$frag_length <- read.delim(
file.path(data_dir, "/salmon_plot.tsv"),
header = T,
sep = "\t"
)
qc_data$phred_bp <- read.delim(
file.path(data_dir, "/fastqc_per_base_sequence_quality_plot.tsv"),
header = T,
sep = "\t"
)
qc_data$phred_seq <- read.delim(
file.path(data_dir, "/fastqc_per_sequence_quality_scores_plot.tsv"),
header = T,
sep = "\t"
)
qc_data$gc_content <- read.delim(
file.path(data_dir, "/fastqc_per_sequence_gc_content_plot.tsv"),
header = T,
sep = "\t"
)
qc_data$seq_duplication <- read.delim(
file.path(data_dir, "/fastqc_sequence_duplication_levels_plot.tsv"),
header = T,
sep = "\t"
)
qc_data$adapter_content <- read.delim(
file.path(data_dir, "/fastqc_adapter_content_plot.tsv"),
header = T,
sep = "\t"
)
qc_data$known_junction <- read.delim(
file.path(data_dir, "/rseqc_known_junction_saturation_plot.tsv"),
header = T,
sep = "\t"
)
qc_data$novel_junction <- read.delim(
file.path(data_dir, "/rseqc_novel_junction_saturation_plot.tsv"),
header = T,
sep = "\t"
)
# Formatting for sequence duplication data
qc_data$seq_duplication <- t(qc_data$seq_duplication)
colnames(qc_data$seq_duplication) <- paste("level", qc_data$seq_duplication[1, ], sep = "_")
qc_data$seq_duplication <- qc_data$seq_duplication[-1, ]
if (any(is.na(qc_data$seq_duplication))) {
qc_data$seq_duplication[is.na(qc_data$seq_duplication)] <- 0
}
tmp_ids <- gsub(rownames(qc_data$seq_duplication), pattern = "^X", replacement = "", perl = T)
tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
qc_data$seq_duplication <- as.data.frame(qc_data$seq_duplication, stringAsFactors = F)
rownames(qc_data$seq_duplication) <- tmp_ids
# Formatting for per sequence PHRED score data
qc_data$phred_seq <- t(qc_data$phred_seq)
colnames(qc_data$phred_seq) <- paste("PHRED", qc_data$phred_seq[1, ], sep = "_")
qc_data$phred_seq <- qc_data$phred_seq[-1, ]
if (any(is.na(qc_data$phred_seq))) {
qc_data$phred_seq[is.na(qc_data$phred_seq)] <- 0
}
tmp_ids <- gsub(rownames(qc_data$phred_seq), pattern = "^X", replacement = "", perl = T)
tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
qc_data$phred_seq <- as.data.frame(qc_data$phred_seq, stringAsFactors = F)
rownames(qc_data$phred_seq) <- tmp_ids
# Formatting for per bp PHRED score data
qc_data$phred_bp <- t(qc_data$phred_bp)
colnames(qc_data$phred_bp) <- paste("BP", qc_data$phred_bp[1, ], sep = "_")
qc_data$phred_bp <- qc_data$phred_bp[-1, ]
tmp_ids <- gsub(rownames(qc_data$phred_bp), pattern = "^X", replacement = "", perl = T)
tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
qc_data$phred_bp <- as.data.frame(qc_data$phred_bp, stringAsFactors = F)
rownames(qc_data$phred_bp) <- tmp_ids
# Formatting for adapter content data
qc_data$adapter_content <- t(qc_data$adapter_content)
colnames(qc_data$adapter_content) <- paste("BP", qc_data$adapter_content[1, ], sep = "_")
qc_data$adapter_content <- qc_data$adapter_content[-1, ]
tmp_ids <- gsub(rownames(qc_data$adapter_content), pattern = "^X", replacement = "", perl = T)
tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
qc_data$adapter_content <- as.data.frame(qc_data$adapter_content, stringAsFactors = F)
qc_data$adapter_content$adapter <- sapply(
strsplit(tmp_ids, split = " | ", fixed = T),
function(id_pieces) {
id_pieces[length(id_pieces)]
},
simplify = T
)
rownames(qc_data$adapter_content) <- tmp_ids
# Formatting for GC content data
qc_data$gc_content <- t(qc_data$gc_content)
colnames(qc_data$gc_content) <- paste("GC", qc_data$gc_content[1, ], sep = "_")
qc_data$gc_content <- qc_data$gc_content[-1, ]
tmp_ids <- gsub(rownames(qc_data$gc_content), pattern = "^X", replacement = "", perl = T)
tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
qc_data$gc_content <- as.data.frame(qc_data$gc_content, stringAsFactors = F)
rownames(qc_data$gc_content) <- tmp_ids
# Formatting for fragment length data
qc_data$frag_length <- t(qc_data$frag_length)
colnames(qc_data$frag_length) <- paste("BP", qc_data$frag_length[1, ], sep = "_")
qc_data$frag_length <- qc_data$frag_length[-1, ]
tmp_ids <- gsub(
rownames(qc_data$frag_length),
pattern = "libParams...",
replacement = "",
perl = T
)
tmp_ids <- gsub(tmp_ids, pattern = "...", replacement = " | ", fixed = T)
qc_data$frag_length <- as.data.frame(qc_data$frag_length, stringAsFactors = F)
rownames(qc_data$frag_length) <- tmp_ids
# Formatting for HISAT2 data
qc_data$hisat2$id <- gsub(qc_data$hisat2$Sample, pattern = "\\|.+$", replacement = "", perl = T)
qc_data$hisat2$id <- trimws(qc_data$hisat2$id)
qc_data$hisat2 <- sapply(
unique(qc_data$hisat2$id),
function(tmp_id) {
tmp_subset <- qc_data$hisat2[which(qc_data$hisat2$id == tmp_id), ]
paired_total <- sum(tmp_subset$paired_total)
paired_aligned_none <- sum(tmp_subset$paired_aligned_none)
paired_aligned_one <- sum(tmp_subset$paired_aligned_one)
paired_aligned_multi <- sum(tmp_subset$paired_aligned_multi)
paired_aligned_discord_one <- sum(tmp_subset$paired_aligned_discord_one)
unpaired_total <- sum(tmp_subset$unpaired_total)
unpaired_aligned_none <- sum(tmp_subset$unpaired_aligned_none)
unpaired_aligned_one <- sum(tmp_subset$unpaired_aligned_one)
unpaired_aligned_multi <- sum(tmp_subset$unpaired_aligned_multi)
paired_sum <- 2 * (paired_aligned_one + paired_aligned_multi + paired_aligned_discord_one)
unpaired_sum <- unpaired_aligned_one + unpaired_aligned_multi
overall_alignment_rate <- (paired_sum + unpaired_sum) / (2 * paired_total)
revised_paired_sum <- 2 * (paired_aligned_one + paired_aligned_multi)
revised_alignment_rate <- revised_paired_sum / ((2 * paired_total) - unpaired_total)
merged_subset <- c(
tmp_id,
paired_total,
paired_aligned_none,
paired_aligned_one,
paired_aligned_multi,
paired_aligned_discord_one,
unpaired_total,
unpaired_aligned_none,
unpaired_aligned_one,
unpaired_aligned_multi,
overall_alignment_rate,
revised_alignment_rate
)
names(merged_subset) <- c(
"id",
"paired_total",
"paired_aligned_none",
"paired_aligned_one",
"paired_aligned_multi",
"paired_aligned_discord_one",
"unpaired_total",
"unpaired_aligned_none",
"unpaired_aligned_one",
"unpaired_aligned_multi",
"overall_alignment_rate",
"revised_alignment_rate"
)
return(merged_subset)
}
)
qc_data$hisat2 <- as.data.frame(t(qc_data$hisat2))
rownames(qc_data$hisat2) <- qc_data$hisat2$id
# Formatting for splice junction data
qc_data$known_junction <- t(qc_data$known_junction)
colnames(qc_data$known_junction) <- paste("percent", qc_data$known_junction[1, ], sep = "_")
qc_data$known_junction <- qc_data$known_junction[-1, ]
if (any(is.na(qc_data$known_junction))) {
qc_data$known_junction[is.na(qc_data$known_junction)] <- 0
}
tmp_ids <- gsub(rownames(qc_data$known_junction), pattern = "^X", replacement = "", perl = T)
qc_data$known_junction <- as.data.frame(qc_data$known_junction, stringsAsFactors = F)
rownames(qc_data$known_junction) <- tmp_ids
qc_data$novel_junction <- t(qc_data$novel_junction)
colnames(qc_data$novel_junction) <- paste("percent", qc_data$novel_junction[1, ], sep = "_")
qc_data$novel_junction <- qc_data$novel_junction[-1, ]
if (any(is.na(qc_data$novel_junction))) {
qc_data$novel_junction[is.na(qc_data$novel_junction)] <- 0
}
tmp_ids <- gsub(rownames(qc_data$novel_junction), pattern = "^X", replacement = "", perl = T)
qc_data$novel_junction <- as.data.frame(qc_data$novel_junction, stringsAsFactors = F)
rownames(qc_data$novel_junction) <- tmp_ids
# Formatting for read pair inner distance data
qc_data$inner_dist <- t(qc_data$inner_dist)
colnames(qc_data$inner_dist) <- paste("BP", qc_data$inner_dist[1, ], sep = "_")
qc_data$inner_dist <- qc_data$inner_dist[-1, ]
tmp_ids <- gsub(rownames(qc_data$inner_dist), pattern = "^X", replacement = "", perl = T)
qc_data$inner_dist <- as.data.frame(qc_data$inner_dist, stringsAsFactors = F)
rownames(qc_data$inner_dist) <- tmp_ids
# Formatting for RSeQC read alignment categories
keeper_col_ids <- c(
"Sample",
"other_intergenic_tag_pct",
"tes_down_10kb_tag_pct",
"tss_up_10kb_tag_pct",
"introns_tag_pct",
"X3_utr_exons_tag_pct",
"X5_utr_exons_tag_pct",
"cds_exons_tag_pct"
)
qc_data$rseqc_alignment_category <- qc_data$rseqc_alignment_category[, keeper_col_ids]
colnames(qc_data$rseqc_alignment_category) <- c(
"id",
"Other_intergenic",
"Downstream_10kb",
"Upstream_10kb",
"Intron",
"UTR_3",
"UTR_5",
"CDS"
)
rownames(qc_data$rseqc_alignment_category) <- qc_data$rseqc_alignment_category$id
rownames(qc_data$fastqc) <- qc_data$fastqc$Sample
rownames(qc_data$rseqc_bam) <- qc_data$rseqc_bam$Sample
rownames(qc_data$salmon) <- qc_data$salmon$Sample
rownames(qc_data$trimmomatic) <- qc_data$trimmomatic$Sample
return(qc_data)
}
#' Partition rows by trimming status.
#'
#' Partition rows of a data frame by trimming status using string matching on row names.
#'
#' Parses a quality metric table that has data from multiple stages of the RNA-seq data processing
#' workflow into separate tables for pre-Trimmomatic and post-Trimmomatic data. If the data are
#' from paired-end sequencing, additional partitions can be made that separate post-Trimmomatic
#' preserved read pairs from post-Trimmomatic unpaired reads, and read 1 (R1) and read 2 (R2). This
#' function is intended for use with auto-generated QC metrics tables from the RNA-seq data
#' processing workflow and tables exported from MultiQC HTML reports.
#'
#' @param data A data frame with rows corresponding to samples with names that differentiate
#' different stages of sample processing from the RNA-seq data processing workflow (pre vs.
#' post-Trimmomatic and "R1" and "R2" for read pairs in paired-end sequencing).
#' @param str_query A string in the row names for `data` that distinguish pre-Trimmomatic from
#' post-Trimmomatic records.
#' @param paired_end A logical. Is the data from paired-end sequencing? If so, additional
#' partitions will be made that separate the forward and reverse read pairs.
#' @param r1_query A string in the row names for `data` that distinguish R1 reads from others in
#' paired-end sequencing data.
#' @param r2_query A string in the row names for `data` that distinguish R2 reads from others in
#' paired-end sequencing data.
#' @param unpaired_query A string in the row names for `data` that distinguish unpaired reads from
#' preserved pairs in paired-end sequencing data.
#' @return For single-end sequencing data, a list with the elements `untrimmed` and `trimmed` to
#' denote data frames with pre-Trimmomatic and post-Trimmomatic data respectively. For
#' paired-end sequencing data, a list with the following elements:
#' * `untrimmed_r1`: Data frame with pre-Trimmomatic R1 reads.
#' * `untrimmed_r2`: Data frame with pre-Trimmomatic R2 reads.
#' * `trimmed_r1`: Data frame with post-Trimmomatic R1 reads from preserved pairs.
#' * `trimmed_r2`: Data frame with post-Trimmomatic R2 reads from preserved pairs.
#' * `unpaired_r1`: Data frame with post-Trimmomatic R1 reads without a read pair.
#' * `unpaired_r2`: Data frame with post-Trimmomatic R2 reads without a read pair.
#' @export
parse_by_trim_status <- function(data,
str_query = "trimmed",
paired_end = F,
r1_query = "R1",
r2_query = "R2",
unpaired_query = "unpaired") {
data <- as.data.frame(data, stringsAsFactors = F)
all_ids <- rownames(data)
data_untrimmed <- data[grep(x = all_ids, str_query, invert = T), ]
data_trimmed <- data[grep(x = all_ids, str_query), ]
if (!paired_end) {
partitions <- list(untrimmed = data_untrimmed, trimmed = data_trimmed)
if (nrow(partitions$untrimmed) != nrow(partitions$trimmed)) {
warning("Differing number of pre- and post-Trimmomatic records")
}
return(partitions)
}
ids_untrimmed <- rownames(data_untrimmed)
data_untrimmed_r1 <- data_untrimmed[grep(x = ids_untrimmed, r1_query), ]
data_untrimmed_r2 <- data_untrimmed[grep(x = ids_untrimmed, r2_query), ]
ids_trimmed <- rownames(data_trimmed)
data_trimmed_paired <- data_trimmed[grep(x = ids_trimmed, unpaired_query, invert = T), ]
data_trimmed_unpaired <- data_trimmed[grep(x = ids_trimmed, unpaired_query), ]
ids_trimmed_paired <- rownames(data_trimmed_paired)
data_trimmed_paired_r1 <- data_trimmed_paired[grep(x = ids_trimmed_paired, r1_query), ]
data_trimmed_paired_r2 <- data_trimmed_paired[grep(x = ids_trimmed_paired, r2_query), ]
ids_trimmed_unpaired <- rownames(data_trimmed_unpaired)
data_trimmed_unpaired_r1 <- data_trimmed_unpaired[grep(x = ids_trimmed_unpaired, r1_query), ]
data_trimmed_unpaired_r2 <- data_trimmed_unpaired[grep(x = ids_trimmed_unpaired, r2_query), ]
partitions <- list(
untrimmed_r1 = data_untrimmed_r1,
untrimmed_r2 = data_untrimmed_r2,
trimmed_r1 = data_trimmed_paired_r1,
trimmed_r2 = data_trimmed_paired_r2,
unpaired_r1 = data_trimmed_unpaired_r1,
unpaired_r2 = data_trimmed_unpaired_r2
)
if (nrow(partitions$untrimmed_r1) != nrow(partitions$untrimmed_r2)) {
warning("Differing number of pre-Trimmomatic R1 and R2 records")
}
if (nrow(partitions$trimmed_r1) != nrow(partitions$trimmed_r2)) {
warning("Differing number of post-Trimmomatic preserved pair R1 and R2 records")
}
if (nrow(partitions$unpaired_r1) != nrow(partitions$unpaired_r2)) {
warning("Differing number of unpaired R1 and R2 records")
}
if (nrow(partitions$untrimmed_r1) != nrow(partitions$trimmed_r1)) {
warning("Differing number of pre- and post-Trimmomatic R1 records")
}
if (nrow(partitions$untrimmed_r2) != nrow(partitions$trimmed_r2)) {
warning("Differing number of pre- and post-Trimmomatic R2 records")
}
return(partitions)
}
#' Sequencing depth per sample scatterplot
#'
#' Create a sequencing depth by sample scatterplot.
#'
#' @param data A data frame with sequencing depth data in the same format as `fastqc` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param point_size A numeric. The size for data points.
#' @param fill A string. The fill color for data points.
#' @param alpha A numeric. The alpha level for data points.
#' @param sort A logical. Should the x-axis be ordered by sequencing depth?
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_seq_depth <- function(data,
point_size = 4,
fill = "red4",
alpha = 0.75,
sort = T,
ids = NULL,
invert = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
if (sort) {
data <- data[order(data$Total.Sequences), ]
}
ggout <- ggplot(data, aes(x = seq_len(nrow(data)), y = Total.Sequences / 1000000)) +
geom_point(shape = 21, size = point_size, fill = fill, col = "white", alpha = alpha) +
labs(y = "Reads (millions)", x = "Sample index") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
return(ggout)
}
#' Sequencing depth histogram
#'
#' Create a sequencing depth histogram.
#'
#' @param data A data frame with sequencing depth data in the same format as `fastqc` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param binsize A numeric value for the histogram bin widths.
#' @param fill A string. The fill color for histogram bars.
#' @param alpha A numeric. The alpha level for histogram bars.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_seq_depth_hist <- function(data,
binsize = NULL,
fill = "red4",
alpha = 0.75,
ids = NULL,
invert = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
if (is.null(binsize)) {
binsize <- diff(range(data$Total.Sequences / 1000000)) / 25
}
ggout <- ggplot(data, aes(x = Total.Sequences / 1000000)) +
geom_histogram(binwidth = binsize, fill = fill, color = "white", alpha = alpha) +
labs(y = "Number of Samples", x = "Reads (millions)") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
return(ggout)
}
#' Per read PHRED frequency plot
#'
#' Create a line plot with per read PHRED frequencies.
#'
#' @param data A list of data frames with data in the same format as `phred_seq` returned
#' by \code{\link{load_paired_end_qc_data}}. Each data frame is intended to represent a different
#' category of data points (e.g., R1 vs. R2 reads in paired-end data).
#' @param labels A vector of strings for legend labels corresponding to the data frames in `data`.
#' @param line_colors A vector of strings. Line colors corresponding to the data frames in `data`.
#' @param alpha A numeric. The alpha level for lines.
#' @param line_width A numeric value for the line width.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_phred_per_read <- function(data,
labels = NULL,
line_colors = rep("red4", length(data)),
alpha = 0.25,
line_width = 1.5,
ids = NULL,
invert = F) {
plot_df_list <- sapply(
seq_len(length(data)),
function(i) {
df <- data[[i]]
if (!is.null(ids)) {
if (invert) {
df <- df[!rownames(df) %in% ids, , drop = F]
} else {
df <- df[rownames(df) %in% ids, , drop = F]
}
}
df$id <- rownames(df)
new_df <- melt(
df,
id.vars = "id",
variable.name = "PHRED",
value.name = "count",
factorsAsStrings = T
)
new_df$phred <- gsub(
as.character(new_df$PHRED),
pattern = "PHRED_",
replacement = "",
fixed = T
)
new_df$phred <- as.numeric(new_df$phred)
new_df$id <- reorder(new_df$id, new_df$phred)
if (is.null(labels)) {
new_df$group <- names(data)[i]
} else {
new_df$group <- labels[i]
}
return(new_df)
},
simplify = F
)
plot_data <- do.call(rbind, plot_df_list)
ggout <- ggplot(plot_data, aes(x = phred, y = log10(count + 1), group = id, col = group)) +
geom_line(lwd = line_width, alpha = alpha) +
scale_color_manual(values = line_colors) +
xlim(0, 40) +
labs(x = "Mean PHRED score", y = "Read frequency (log10)", col = "") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1),
legend.title = element_text(size = 16),
legend.text = element_text(size = 16)
)
return(ggout)
}
#' Per base pair PHRED score plot
#'
#' Create a line plot with per base pair mean PHRED scores.
#'
#' @param data A list of data frames with data in the same format as `phred_bp` returned
#' by \code{\link{load_paired_end_qc_data}}. Each data frame is intended to represent a different
#' category of data points (e.g., R1 vs. R2 reads in paired-end data).
#' @inheritParams plot_phred_per_read
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_phred_per_bp <- function(data,
labels = NULL,
line_colors = rep("red4", length(data)),
alpha = 0.25,
line_width = 1.5,
ids = NULL,
invert = F) {
plot_df_list <- sapply(
seq_len(length(data)),
function(i) {
df <- data[[i]]
if (!is.null(ids)) {
if (invert) {
df <- df[!rownames(df) %in% ids, , drop = F]
} else {
df <- df[rownames(df) %in% ids, , drop = F]
}
}
df$id <- rownames(df)
new_df <- melt(
df,
id.vars = "id",
variable.name = "BP",
value.name = "score",
factorsAsStrings = T
)
new_df$bp <- gsub(
as.character(new_df$BP),
pattern = "BP_",
replacement = "",
fixed = T
)
new_df$bp <- as.numeric(new_df$bp)
new_df$id <- reorder(new_df$id, new_df$bp)
if (is.null(labels)) {
new_df$group <- names(data)[i]
} else {
new_df$group <- labels[i]
}
return(new_df)
},
simplify = F
)
plot_data <- do.call(rbind, plot_df_list)
ggout <- ggplot(plot_data, aes(x = bp, y = score, group = id, col = group)) +
geom_line(lwd = line_width, alpha = alpha) +
scale_color_manual(values = line_colors) +
ylim(0, 40) +
labs(y = "Mean PHRED score", x = "Read position", col = "") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1),
legend.title = element_text(size = 16),
legend.text = element_text(size = 16)
)
return(ggout)
}
#' Plot per sample mean of per read mean PHRED scores
#'
#' Create a histogram and boxplot of per sample mean of per read mean PHRED scores. The two plots
#' are intended to be used as a single column, two row figure panel.
#'
#' @param data A data frame with data in the same format as `phred_seq` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match rownames
#' in `data`.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}} \code{\link{hist_boxplot2}}
#' @export
plot_phred_mean <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "Mean of mean PHRED score",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
phred_mean <- apply(
data,
1,
function(phred_freq) {
phred_scores <- gsub(names(phred_freq), pattern = "PHRED_", replacement = "", fixed = T)
phred_scores <- as.numeric(phred_scores)
phred_freq <- as.numeric(phred_freq)
phred_mean <- sum(phred_freq * phred_scores, na.rm = T) / sum(phred_freq, na.rm = T)
return(phred_mean)
}
)
phred_mean <- data.frame(values = phred_mean)
rownames(phred_mean) <- rownames(data)
plot_data <- merge(x = phred_mean, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
data = plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' GC content distribution line plot
#'
#' Create a line plot with read GC content distributions.
#'
#' @param data A list of data frames with data in the same format as `gc_content` returned
#' by \code{\link{load_paired_end_qc_data}}. Each data frame is intended to represent a different
#' category of data points (e.g., R1 vs. R2 reads in paired-end data).
#' @inheritParams plot_phred_per_read
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_gc_content <- function(data,
labels = NULL,
line_colors = rep("red4", length(data)),
alpha = 0.25,
line_width = 1.5,
ids = NULL,
invert = F) {
plot_df_list <- sapply(
seq_len(length(data)),
function(i) {
df <- data[[i]]
if (!is.null(ids)) {
if (invert) {
df <- df[!rownames(df) %in% ids, , drop = F]
} else {
df <- df[rownames(df) %in% ids, , drop = F]
}
}
df$id <- rownames(df)
new_df <- melt(
df,
id.vars = "id",
variable.name = "GC",
value.name = "percent",
factorsAsStrings = T
)
new_df$gc <- gsub(
as.character(new_df$GC),
pattern = "GC_",
replacement = "",
fixed = T
)
new_df$gc <- as.numeric(new_df$gc)
new_df$id <- reorder(new_df$id, new_df$gc)
if (is.null(labels)) {
new_df$group <- names(data)[i]
} else {
new_df$group <- labels[i]
}
return(new_df)
},
simplify = F
)
plot_data <- do.call(rbind, plot_df_list)
ggout <- ggplot(plot_data, aes(x = gc, y = percent, group = id, col = group)) +
geom_line(lwd = line_width, alpha = alpha) +
scale_color_manual(values = line_colors) +
labs(y = "% reads", x = "Mean % GC", col = "") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1),
legend.title = element_text(size = 16),
legend.text = element_text(size = 16)
)
return(ggout)
}
#' GC content histogram
#'
#' Create a GC content by sample histogram.
#'
#' @param data A list of data frames with data in the same format as `fastqc` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param labels A vector of strings for legend labels corresponding to the data frames in `data`.
#' @inheritParams plot_seq_depth_hist
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_gc_content_hist <- function(data,
binsize = 2.5,
labels = NULL,
fill = rep("red4", length(data)),
alpha = 0.25,
ids = NULL,
invert = F) {
plot_df_list <- sapply(
seq_len(length(data)),
function(i) {
df <- data[[i]]
if (!is.null(ids)) {
if (invert) {
df <- df[!rownames(df) %in% ids, , drop = F]
} else {
df <- df[rownames(df) %in% ids, , drop = F]
}
}
df$id <- rownames(df)
new_df <- df[, c("id", "X.GC")]
colnames(new_df) <- c("id", "gc")
new_df$gc <- as.numeric(new_df$gc)
if (is.null(labels)) {
new_df$group <- names(data)[i]
} else {
new_df$group <- labels[i]
}
return(new_df)
},
simplify = F
)
plot_data <- do.call(rbind, plot_df_list)
ggout <- ggplot(plot_data, aes(x = gc, fill = group)) +
geom_histogram(
position = "identity",
binwidth = binsize,
color = "white",
alpha = alpha
) +
scale_fill_manual(values = fill) +
labs(y = "Number of Samples", x = "Mean % GC", fill = "") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1),
legend.title = element_text(size = 16),
legend.text = element_text(size = 16)
)
return(ggout)
}
#' Plot per sample mean read GC content
#'
#' Create a histogram and boxplot of per sample mean read GC content. The two plots
#' are intended to be used as a single column, two row figure panel.
#'
#' @param data A data frame with data in the same format as `gc_content` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @inheritParams plot_phred_mean
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean GC content and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}} \code{\link{hist_boxplot2}}
#' @export
plot_gc_mean <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "Mean read GC content (%)",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
gc_mean <- apply(
data,
1,
function(gc_freq) {
gc_scores <- gsub(names(gc_freq), pattern = "GC_", replacement = "", fixed = T)
gc_scores <- as.numeric(gc_scores)
gc_freq <- as.numeric(gc_freq)
gc_mean <- sum(gc_freq * gc_scores, na.rm = T) / sum(gc_freq, na.rm = T)
return(gc_mean)
}
)
gc_mean <- data.frame(values = gc_mean)
rownames(gc_mean) <- rownames(data)
plot_data <- merge(x = gc_mean, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
data = plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Adapter content distribution line plot
#'
#' Create a line plot with per position adapter frequency.
#'
#' @param data A list of data frames with data in the same format as `adapter_content` returned
#' by \code{\link{load_paired_end_qc_data}}. Each data frame is intended to represent a different
#' category of data points (e.g., R1 vs. R2 reads in paired-end data).
#' @inheritParams plot_phred_per_read
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_adapter_content <- function(data,
labels = NULL,
line_colors = rep("red4", length(data)),
alpha = 0.25,
line_width = 1.5,
ids = NULL,
invert = F) {
plot_df_list <- sapply(
seq_len(length(data)),
function(i) {
df <- data[[i]]
if (!is.null(ids)) {
if (invert) {
df <- df[!rownames(df) %in% ids, , drop = F]
} else {
df <- df[rownames(df) %in% ids, , drop = F]
}
}
df$id <- rownames(df)
new_df <- melt(
df,
id.vars = c("id", "adapter"),
variable.name = "BP",
value.name = "percent",
factorsAsStrings = T
)
new_df$bp <- gsub(
as.character(new_df$BP),
pattern = "BP_",
replacement = "",
fixed = T
)
new_df$bp <- as.numeric(new_df$bp)
new_df$id <- reorder(new_df$id, new_df$bp)
if (is.null(labels)) {
new_df$group <- names(data)[i]
} else {
new_df$group <- labels[i]
}
return(new_df)
},
simplify = F
)
plot_data <- do.call(rbind, plot_df_list)
ggout <- ggplot(plot_data, aes(x = bp, y = percent, group = id, col = group)) +
geom_line(lwd = line_width, alpha = alpha) +
scale_color_manual(values = line_colors) +
labs(y = "Cumulative % reads", x = "Read position", col = "") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1),
legend.title = element_text(size = 16),
legend.text = element_text(size = 16)
)
return(ggout)
}
#' Unique read percentage per sample scatterplot
#'
#' Create a unique read percentage by sample scatterplot.
#'
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match rownames
#' in `data`.
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams plot_seq_depth
#' @return A ggplot object.
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' unique read percentage and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_unique_read_pct <- function(data,
group = NULL,
point_size = 4,
fill = NULL,
alpha = 0.75,
sort = T,
ids = NULL,
invert = F,
return_data = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
plot_data <- data.frame(values = data$total_deduplicated_percentage)
rownames(plot_data) <- rownames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (sort) {
plot_data <- plot_data[order(plot_data$values), ]
}
if (return_data) {
return(plot_data)
}
ggout <- ggplot(plot_data, aes(x = seq_len(nrow(plot_data)), y = values, fill = group)) +
geom_point(shape = 21, size = point_size, col = "white", alpha = alpha) +
labs(y = "% unique reads", x = "Sample index", fill = "") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
if (is.null(group0)) {
ggout <- ggout +
theme(legend.position = "none")
}
if (is.null(fill)) {
ggout <- ggout + scale_fill_brewer(palette = "Dark2")
} else {
ggout <- ggout + scale_fill_manual(values = fill)
}
return(ggout)
}
#' Sequencing depth vs. library complexity scatterplot
#'
#' Create a sequencing depth vs. unique read percentage scatterplot.
#'
#' @param data A data frame with sequencing depth and unique read percentage data in the same
#' format as `fastqc` returned by \code{\link{load_paired_end_qc_data}}.
#' @param point_size A numeric. The size for data points.
#' @param fill A string. The fill color for data points.
#' @param alpha A numeric. The alpha level for data points.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_seq_depth_vs_lib_complexity <- function(data,
point_size = 4,
fill = "red4",
alpha = 0.75,
ids = NULL,
invert = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
ggout <- ggplot(data, aes(x = total_deduplicated_percentage, y = Total.Sequences / 1000000)) +
geom_point(shape = 21, size = point_size, fill = fill, col = "white", alpha = alpha) +
xlim(0, 100) +
labs(y = "Reads (millions)", x = "% unique reads") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
return(ggout)
}
#' Sequence duplication line plot
#'
#' Create a line plot with per sample sequence duplication levels.
#'
#' @param data A list of data frames with data in the same format as `seq_duplication` returned
#' by \code{\link{load_paired_end_qc_data}}. Each data frame is intended to represent a different
#' category of data points (e.g., R1 vs. R2 reads in paired-end data).
#' @inheritParams plot_phred_per_read
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_seq_duplication <- function(data,
labels = NULL,
line_colors = rep("red4", length(data)),
alpha = 0.25,
line_width = 1.5,
ids = NULL,
invert = F) {
plot_df_list <- sapply(
seq_len(length(data)),
function(i) {
df <- data[[i]]
if (!is.null(ids)) {
if (invert) {
df <- df[!rownames(df) %in% ids, , drop = F]
} else {
df <- df[rownames(df) %in% ids, , drop = F]
}
}
df$id <- rownames(df)
new_df <- melt(
df,
id.vars = "id",
variable.name = "Level",
value.name = "percent",
factorsAsStrings = T
)
new_df$level <- gsub(
as.character(new_df$Level),
pattern = "level_",
replacement = "",
fixed = T
)
new_df$level <- gsub(
as.character(new_df$level),
pattern = ">",
replacement = "",
fixed = T
)
new_df$level <- gsub(
as.character(new_df$level),
pattern = "(k.)|(k)",
replacement = "000",
perl = T
)
new_df$level <- as.numeric(new_df$level)
new_df$percent <- as.numeric(new_df$percent)
new_df$id <- reorder(new_df$id, new_df$level)
if (is.null(labels)) {
new_df$group <- names(data)[i]
} else {
new_df$group <- labels[i]
}
return(new_df)
},
simplify = F
)
plot_data <- do.call(rbind, plot_df_list)
x_names <- c(
"1", "2", "3", "4", "5",
"6", "7", "8", "9", ">10",
">50", ">100", ">500", ">1K",
">5K", ">10K"
)
dup_level_names <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500, 1000, 5000, 10000)
dup_levels <- seq_len(length(dup_level_names))
names(dup_levels) <- dup_level_names
plot_data$level <- dup_levels[as.character(plot_data$level)]
ggout <- ggplot(plot_data, aes(x = level, y = percent, group = id, col = group)) +
geom_line(lwd = line_width, alpha = alpha) +
scale_color_manual(values = line_colors) +
labs(y = "% reads", x = "Duplication level", col = "") +
scale_x_continuous(breaks = dup_levels, labels = x_names) +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1),
legend.title = element_text(size = 16),
legend.text = element_text(size = 16)
)
return(ggout)
}
#' Trimmomatic statistics histograms
#'
#' Create a histograms for read retention and filtering rates.
#'
#' Uses output generated by MultiQC on Trimmomatic output for paired-end sequencing data to
#' produce sample frequency histograms of read retention and filtering rates. Note: this function
#' only works with Trimmomatic output for paired-end sequencing data.
#'
#' @param data A data frame with sequencing depth data in the same format as `trimmomatic` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param binsize A 2-item numeric vector that denotes the bin width for histogram bars. The
#' first value is for the `excluded` plot, and the other value corresponds to the `retained`
#' plot.
#' @param fill A 3-item vector of strings that denotes the colors for histogram bars. The first
#' color is for the `excluded` plot, and the other colors correspond to R1 and R2 reads in the
#' `retained` plot.
#' @inheritParams plot_seq_depth_hist
#' @return A list with two ggplot objects, `excluded` and `retained` that correspond to ggplot
#' objects for reads excluded and retained after Trimmomatic respectively.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_trimmomatic_paired <- function(data,
binsize = c(2.5, 0.5),
fill = c("gray10", "goldenrod2", "steelblue4"),
alpha = 0.5,
ids = NULL,
invert = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
ggout_list <- list()
ggout_list$excluded <- ggplot(data, aes(x = dropped_pct)) +
geom_histogram(binwidth = binsize[1], fill = fill[1], color = "white", alpha = alpha) +
labs(y = "Number of Samples", x = "% reads excluded") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
plot_data <- melt(
data[, c("forward_only_surviving_pct", "reverse_only_surviving_pct")],
measure.vars = c("forward_only_surviving_pct", "reverse_only_surviving_pct"),
variable.name = "group",
value.name = "percent"
)
plot_data$percent <- plot_data$percent
plot_data$group <- factor(
x = plot_data$group,
levels = c("forward_only_surviving_pct", "reverse_only_surviving_pct"),
labels = c("R1", "R2")
)
ggout_list$retained <- ggplot(plot_data, aes(x = percent, fill = group)) +
geom_histogram(position = "identity", binwidth = binsize[2], color = "white", alpha = alpha) +
scale_fill_manual(values = fill[2:3]) +
labs(y = "Number of Samples", x = "% unpaired reads retained", fill = "") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1),
legend.title = element_text(size = 16),
legend.text = element_text(size = 16)
)
return(ggout_list)
}
#' Fragment length frequency line plot
#'
#' Create a fragment length frequency line plot.
#'
#' @param data A data frame with fragment length data in the same format as `frag_length` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param line_width A numeric. The line size for plot lines.
#' @param line_color A string. The color for plot lines.
#' @param alpha A numeric. The alpha level for data points.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_frag_length <- function(data,
line_width = 1.5,
line_color = "red4",
alpha = 0.5,
ids = NULL,
invert = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
data$id <- rownames(data)
data <- melt(
data,
id.vars = "id",
variable.name = "BP",
value.name = "percent",
factorsAsStrings = T
)
data$percent <- data$percent * 100
data$bp <- gsub(
as.character(data$BP),
pattern = "BP_",
replacement = "",
fixed = T
)
data$bp <- as.numeric(data$bp)
data$id <- reorder(data$id, data$bp)
ggout <- ggplot(data, aes(x = bp, y = percent, group = id)) +
geom_line(lwd = line_width, alpha = alpha, color = line_color) +
labs(y = "% RNA fragments", x = "Fragment length") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
return(ggout)
}
#' Plot Salmon mapping statistics as a scatterplot
#'
#' Create a read count total vs. reads mapped scatterplot.
#'
#' @param data A data frame Salmon mapping data in the same format as `salmon` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @inheritParams plot_seq_depth_vs_lib_complexity
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_salmon_stats <- function(data,
point_size = 4,
fill = "red4",
alpha = 0.75,
ids = NULL,
invert = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
axis_min <- min(data$num_mapped / 1000000, data$num_processed / 1000000, na.rm = T) * 0.85
axis_max <- max(data$num_mapped / 1000000, data$num_processed / 1000000, na.rm = T) * 1.15
ggout <- ggplot(data, aes(x = num_mapped / 1000000, y = num_processed / 1000000)) +
geom_point(shape = 21, size = point_size, fill = fill, col = "white", alpha = alpha) +
geom_abline(intercept = 0, slope = 1, size = 1, lty = 3, color = "gray60") +
xlim(axis_min, axis_max) +
ylim(axis_min, axis_max) +
labs(y = "Reads input (millions)", x = "Reads mapped (millions)") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
return(ggout)
}
#' Plot Salmon mapping percentage
#'
#' Create a Salmon mapping percentage histogram and boxplot. The two plots are intended to be used
#' as a single column, two row figure panel.
#'
#' @param data A data frame with Salmon mapping data in the same format as `salmon` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match rownames
#' in `data`.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}} \code{\link{hist_boxplot2}}
#' @export
plot_salmon_mapping_pct <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "% reads mapped",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
plot_data <- data.frame(values = data$percent_mapped)
rownames(plot_data) <- rownames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
data = plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Plot Salmon mapped read count
#'
#' Create a Salmon mapped read count histogram and boxplot. The two plots are intended to be used
#' as a single column, two row figure panel.
#'
#' @inheritParams plot_salmon_mapping_pct
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_salmon_mapped_reads <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "Reads mapped (millions)",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
plot_data <- data.frame(values = data$num_mapped / 1000000)
rownames(plot_data) <- rownames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Plot Salmon percent decoy fragments
#'
#' Create a Salmon decoy mapping rate histogram and boxplot. The two plots are intended to be used
#' as a single column, two row figure panel.
#'
#' @inheritParams plot_salmon_mapping_pct
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso `\link{load_paired_end_qc_data}`
#' @export
plot_decoy_mapping_rate <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "% mapped to decoys",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
plot_data <- data.frame(values = (data$num_decoy_fragments / data$num_processed) * 100)
rownames(plot_data) <- rownames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Plot Salmon percent dovetail fragments
#'
#' Create a Salmon dovetail fragment percentage histogram and boxplot. The two plots are intended
#' to be used as a single column, two row figure panel.
#'
#' @inheritParams plot_salmon_mapping_pct
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso `\link{load_paired_end_qc_data}`
#' @export
plot_dovetail_mapping_rate <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "% dovetail fragments",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
plot_data <- data.frame(values = (data$num_dovetail_fragments / data$num_processed) * 100)
rownames(plot_data) <- rownames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Plot Salmon percent score filtered fragments
#'
#' Create a Salmon score-based filtering rate histogram and boxplot. The two plots are intended to
#' be used as a single column, two row figure panel.
#'
#' @inheritParams plot_salmon_mapping_pct
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso `\link{load_paired_end_qc_data}`
#' @export
plot_salmon_score_filtered_rate <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "% low-score filtered fragments",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
plot_data <- data.frame(values = (data$num_fragments_filtered_vm / data$num_processed) * 100)
rownames(plot_data) <- rownames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Plot Salmon failed mapping count
#'
#' Create a Salmon failed mapping histogram and boxplot. The two plots are intended to be used as
#' a single column, two row figure panel.
#'
#' @inheritParams plot_salmon_mapping_pct
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' mean of mean PHRED scores and grouping variable.
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso `\link{load_paired_end_qc_data}`
#' @export
plot_failed_mapping_count <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = paste0(
c(
"Number of failed mappings",
"for mapped fragments (millions)"
),
collapse = "\n"
),
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
failed_mappings <- data$num_alignments_below_threshold_for_mapped_fragments_vm
plot_data <- data.frame(values = failed_mappings / 1000000)
rownames(plot_data) <- rownames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Plot Salmon mapping category percentages
#'
#' Create barplots with percentages for types of Salmon mappings.
#'
#' Create barplots with the percentage of reads/read pairs input into Salmon that were
#' mapped to the transcriptome, decoy fragments, as dovetails, and as low-score filtered mappings.
#' Categories are not mutually exclusive. X-axes are ordered the same across plots.
#'
#' @param data A data frame with Salmon mapping data in the same format as `salmon` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param fill A string vector with 4 colors corresponding to bars for the barplots for the
#' transcriptome, decoys, dovetails, and filtered mappings respectively.
#' @param sort A logical. Should the x-axis be ordered by transcriptome mapping rate?
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A list with four ggplot objects `transcriptome`, `decoy`, `dovetail`, and `filtered`
#' that correspond to the plots for the transcriptome, decoys, dovetails, and filtered mappings
#' respectively.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_salmon_categories <- function(data,
fill = brewer.pal(n = 4, name = "Spectral"),
sort = F,
ids = NULL,
invert = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
plot_data <- data.frame(id = rownames(data))
plot_data$transcriptome <- data$num_mapped / data$num_processed * 100
plot_data$decoy <- data$num_decoy_fragments / data$num_processed * 100
plot_data$dovetail <- data$num_dovetail_fragments / data$num_processed * 100
plot_data$filtered <- data$num_fragments_filtered_vm / data$num_processed * 100
if (sort) {
plot_data$id <- reorder(plot_data$id, plot_data$transcriptome)
}
ggout_list <- list()
ggout_list$transcriptome <- ggplot(plot_data, aes(x = id, y = transcriptome)) +
geom_bar(position = "dodge", stat = "identity", color = "black", lwd = 0.5, fill = fill[1]) +
ylim(0, 100) +
labs(x = "Samples", y = "% of fragments", title = "Transcriptome mappings") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
legend.title = element_blank(),
legend.text = element_blank(),
axis.text = element_text(size = 18),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
ggout_list$decoy <- ggplot(plot_data, aes(x = id, y = decoy)) +
geom_bar(position = "dodge", stat = "identity", color = "black", lwd = 0.5, fill = fill[2]) +
ylim(0, 100) +
labs(x = "Samples", y = "% of fragments", title = "Decoy mappings") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
legend.title = element_blank(),
legend.text = element_blank(),
axis.text = element_text(size = 18),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
ggout_list$dovetail <- ggplot(plot_data, aes(x = id, y = dovetail)) +
geom_bar(position = "dodge", stat = "identity", color = "black", lwd = 0.5, fill = fill[3]) +
ylim(0, 100) +
labs(x = "Samples", y = "% of fragments", title = "Dovetail mappings") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
legend.title = element_blank(),
legend.text = element_blank(),
axis.text = element_text(size = 18),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
ggout_list$filtered <- ggplot(plot_data, aes(x = id, y = filtered)) +
geom_bar(position = "dodge", stat = "identity", color = "black", lwd = 0.5, fill = fill[4]) +
ylim(0, 100) +
labs(x = "Samples", y = "% of fragments", title = "Low-score filtered mappings") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
legend.title = element_blank(),
legend.text = element_blank(),
axis.text = element_text(size = 18),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
return(ggout_list)
}
#' Plot HISAT2 mapping statistics as a scatterplot
#'
#' Create a concordant alignment vs. overall alignment rate scatterplot. Only applicable for
#' paired-end sequencing data.
#'
#' @param data A data frame HISAT2 mapping data in the same format as `hisat2` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @inheritParams plot_seq_depth_vs_lib_complexity
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_hisat2_stats <- function(data,
point_size = 4,
fill = "red4",
alpha = 0.75,
ids = NULL,
invert = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
data$overall_alignment_rate <- as.numeric(data$overall_alignment_rate)
data$revised_alignment_rate <- as.numeric(data$revised_alignment_rate)
axis_min <- min(data$overall_alignment_rate, data$revised_alignment_rate, na.rm = T) * 0.85
axis_min <- max(axis_min, 0, na.rm = T)
axis_max <- max(data$overall_alignment_rate, data$revised_alignment_rate, na.rm = T) * 1.15
axis_max <- min(axis_max, 1, na.rm = T)
ggout <- ggplot(data, aes(x = overall_alignment_rate, y = revised_alignment_rate)) +
geom_point(shape = 21, size = point_size, fill = fill, col = "white", alpha = alpha) +
geom_abline(intercept = 0, slope = 1, size = 1, lty = 3, color = "gray60") +
xlim(axis_min, axis_max) +
ylim(axis_min, axis_max) +
labs(y = "Concordant alignment rate", x = "Overall alignment rate") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
return(ggout)
}
#' Plot HISAT2 mapping percentage
#'
#' Create a HISAT2 concordant alignment rate histogram and boxplot. The two plots are intended to
#' be used as a single column, two row figure panel. Only applicable for paired-end sequencing
#' data.
#'
#' @param data A data frame with HISAT2 mapping data in the same format as `hisat2` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @inheritParams hist_boxplot
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_hisat2_mapping_rate <- function(data,
ids = NULL,
invert = F,
binsize = NULL,
hist_fill = "gray10",
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_color = "gray30",
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "Concordant alignment rate",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
data$revised_alignment_rate <- as.numeric(data$revised_alignment_rate)
if (is.null(binsize)) {
binsize <- diff(range(data$revised_alignment_rate)) / 25
}
ggout_list <- hist_boxplot(
data[, "revised_alignment_rate", drop = F],
binsize = binsize,
hist_fill = hist_fill,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_color = jitter_color,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
return(ggout_list)
}
#' Plot HISAT2 mapped read count
#'
#' Create a HISAT2 mapped read count histogram and boxplot. The two plots are intended to be used
#' as a single column, two row figure panel. Only applicable for paired-end sequencing data.
#'
#' @param data A data frame with HISAT2 mapping data in the same format as `hisat2` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @inheritParams plot_hisat2_mapping_rate
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_hisat2_mapped_reads <- function(data,
ids = NULL,
invert = F,
binsize = NULL,
hist_fill = "gray10",
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_color = "gray30",
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "Reads pairs aligned (millions)",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
data$revised_alignment_rate <- as.numeric(data$revised_alignment_rate)
data$paired_total <- as.numeric(data$paired_total)
data$aligned_reads_count <- (data$paired_total * data$revised_alignment_rate) / 1000000
if (is.null(binsize)) {
binsize <- diff(range(data$aligned_reads_count)) / 25
}
plot_data <- data[, "aligned_reads_count", drop = F]
ggout_list <- hist_boxplot(
plot_data,
binsize = binsize,
hist_fill = hist_fill,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_color = jitter_color,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
return(ggout_list)
}
#' Plot HISAT2 vs. Salmon mapping rates
#'
#' Create a HISAT2 vs Salmon mapping rate scatterplot. Only applicable for
#' paired-end sequencing data.
#'
#' Creates a scatterplot comparing mapping rates between HISAT2 and Salmon for samples that have
#' matchable IDs between data from the two tools. IDs are formatted for matching based on the
#' assumption that sample names are equivalent up until the first `|` delimiter (if any) in the
#' name.
#'
#' @param hisat_data A data frame of HISAT2 mapping data in the same format as `hisat2` returned by
#' \code{\link{load_paired_end_qc_data}}.
#' @param salmon_data A data frame of Salmon mapping data in the same format as `hisat2` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param ids A vector of rownames for subsetting `hisat_data` and `salmon_data` for plotting.
#' @inheritParams plot_seq_depth_vs_lib_complexity
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_hisat2_vs_salmon <- function(hisat_data,
salmon_data,
point_size = 4,
fill = "red4",
alpha = 0.75,
ids = NULL,
invert = F) {
if (!is.null(ids)) {
if (invert) {
hisat_data <- hisat_data[!rownames(hisat_data) %in% ids, , drop = F]
salmon_data <- salmon_data[!rownames(salmon_data) %in% ids, , drop = F]
} else {
hisat_data <- hisat_data[rownames(hisat_data) %in% ids, , drop = F]
salmon_data <- salmon_data[rownames(salmon_data) %in% ids, , drop = F]
}
}
hisat_data$revised_alignment_rate <- as.numeric(hisat_data$revised_alignment_rate)
hisat_data$id <- rownames(hisat_data)
hisat_data$id <- gsub(hisat_data$id, pattern = " \\|.+$", replacement = "", perl = T)
hisat_data <- hisat_data[, c("id", "revised_alignment_rate")]
salmon_data$id <- rownames(salmon_data)
salmon_data$id <- gsub(salmon_data$id, pattern = " \\|.+$", replacement = "", perl = T)
salmon_data <- salmon_data[, c("id", "percent_mapped")]
plot_data <- merge(x = hisat_data, y = salmon_data, by = "id")
plot_data$salmon_rate <- plot_data$percent_mapped / 100
axis_min <- min(plot_data$revised_alignment_rate, plot_data$salmon_rate, na.rm = T) * 0.85
axis_min <- max(axis_min, 0, na.rm = T)
axis_max <- max(plot_data$revised_alignment_rate, plot_data$salmon_rate, na.rm = T) * 1.15
axis_max <- min(axis_max, 1, na.rm = T)
ggout <- ggplot(plot_data, aes(x = salmon_rate, y = revised_alignment_rate)) +
geom_point(shape = 21, size = point_size, fill = fill, col = "white", alpha = alpha) +
geom_abline(intercept = 0, slope = 1, size = 1, lty = 3, color = "gray60") +
xlim(axis_min, axis_max) +
ylim(axis_min, axis_max) +
labs(x = "Salmon mapping rate", y = "HISAT2 mapping rate") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
return(ggout)
}
#' Splice junction saturation plot
#'
#' Create a splice junction saturation line plot.
#'
#' @param data A data frame with splice junction data in the same format as `known_junction`
#' or `novel_junction` returned by \code{\link{load_paired_end_qc_data}}.
#' @inheritParams plot_frag_length
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_junction_saturation <- function(data,
line_width = 1.5,
line_color = "red4",
alpha = 0.5,
ids = NULL,
invert = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
data$id <- rownames(data)
data <- melt(
data,
id.vars = "id",
variable.name = "percent",
value.name = "count",
factorsAsStrings = F
)
data$percent <- gsub(
data$percent,
pattern = "percent_",
replacement = "",
fixed = T
)
data$percent <- as.numeric(data$percent)
data$count <- data$count / 1000
data$id <- reorder(data$id, data$percent)
ggout <- ggplot(data, aes(x = percent, y = count, group = id)) +
geom_line(lwd = line_width, alpha = alpha, color = line_color) +
labs(y = "Recovered splice junctions\n(thousands)", x = "% of reads") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
return(ggout)
}
#' Read pair inner distance distribution
#'
#' Create a paired-end read inner distance line plot. Applicable only to paired-end sequencing
#' data.
#'
#' @param data A data frame with fragment length data in the same format as `inner_dist` returned
#' by \code{\link{load_paired_end_qc_data}}.
#' @param line_width A numeric. The line size for plot lines.
#' @param line_color A string. The color for plot lines.
#' @param alpha A numeric. The alpha level for data points.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @return A ggplot object.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_inner_dist <- function(data,
line_width = 1.5,
line_color = "red4",
alpha = 0.5,
ids = NULL,
invert = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
data$id <- rownames(data)
data <- melt(
data,
id.vars = "id",
variable.name = "distance",
value.name = "count",
factorsAsStrings = F
)
data$distance <- gsub(
data$distance,
pattern = "BP_.",
replacement = "-",
fixed = T
)
data$distance <- gsub(
data$distance,
pattern = "BP_",
replacement = "",
fixed = T
)
data$distance <- as.numeric(data$distance)
data$count <- data$count / 1000
data$id <- reorder(data$id, data$distance)
ggout <- ggplot(data, aes(x = distance, y = count, group = id)) +
geom_line(lwd = line_width, alpha = alpha, color = line_color) +
labs(y = "Alignment count (thousands)", x = "Distance between read pairs") +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1)
)
return(ggout)
}
#' Plot annotation category percentages for alignments
#'
#' Create stacked barplots with percentages for types of genomic annotations.
#'
#' Create barplots with the percentage of alignments that map to a genomic region with annotations
#' of intergenic, proximally down and upstream of genes, intronic, UTRs or coding sequence.
#' Includes the option to consolidate categories into more general categories for intergenic,
#' intronic, and exonic regions.
#'
#' @param data A data frame with annotation data in the same format as `rseqc_alignment_category`
#' returned by \code{\link{load_paired_end_qc_data}}.
#' @param fill A string vector with 3 or 6 colors corresponding to cateogires for the barplot.
#' @param sort A logical. Should the x-axis be ordered by coding sequence/exonic alignment
#' percentage?
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param consolidate A logical. Should mapping categories be consolidated into the three more
#' general categories of exonic, intronic, and intergenic?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @return A ggplot object. If `return_data` is `TRUE`, then a data frame with the plot data
#' values for each genomic annotation category as columns.
#' @seealso \code{\link{load_paired_end_qc_data}}
#' @export
plot_mapping_categories <- function(data,
fill = NULL,
sort = F,
ids = NULL,
invert = F,
consolidate = F,
return_data = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
if (sort) {
data$id <- reorder(data$id, data$CDS)
}
if (consolidate) {
intergenic_col_ids <- c("Other_intergenic", "Downstream_10kb", "Upstream_10kb")
intronic_col_ids <- c("Intron")
exonic_col_ids <- c("UTR_3", "UTR_5", "CDS")
intergenic_index <- which(colnames(data) %in% intergenic_col_ids)
intronic_index <- which(colnames(data) %in% intronic_col_ids)
exonic_index <- which(colnames(data) %in% exonic_col_ids)
sample_index <- which(colnames(data) == "id")
data <- t(
apply(
data,
1,
function(x) {
new_row <- data.frame(
id = x[sample_index],
Intergenic = sum(as.numeric(x[intergenic_index])),
Intronic = sum(as.numeric(x[intronic_index])),
Exonic = sum(as.numeric(x[exonic_index]))
)
return(new_row)
}
)
)
data <- as.data.frame(do.call("rbind", data))
if (sort) {
data$id <- reorder(data$id, data$Exonic)
}
}
if (return_data) {
rownames(data) <- data$id
data <- data[, -1 * which(colnames(data) == "id")]
return(data)
}
plot_data <- melt(
data,
id.vars = "id",
variable.name = "type",
value.name = "percent",
factorsAsStrings = T
)
ggout <- ggplot(plot_data, aes(y = percent, x = id, fill = type)) +
geom_bar(position = "stack", stat = "identity", color = NA, lwd = 0) +
labs(x = "Samples", y = "% primary alignments", fill = "") +
geom_hline(
yintercept = c(12.5, 25, 37.5, 50, 62.5, 75, 87.5),
lty = 3,
lwd = 1,
color = "white",
alpha = 0.5
) +
theme(
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "cm"),
title = element_text(size = 16),
axis.text = element_text(size = 18),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title = element_text(size = 16),
axis.title.y = element_text(vjust = 3),
axis.title.x = element_text(vjust = -1),
legend.title = element_text(size = 16),
legend.text = element_text(size = 16)
)
if (is.null(fill)) {
if (consolidate) {
ggout <- ggout + scale_fill_manual(values = c("#3b7854", "#d49b00", "#30485e"))
} else {
ggout <- ggout + scale_fill_brewer(palette = "Spectral")
}
} else {
ggout <- ggout + scale_fill_manual(values = fill)
}
return(ggout)
}
#' ggplot2-based PCA plot
#'
#' Create an eigenvectors scatterplot from principal component analysis.
#'
#' Conducts a principal component analysis (PCA) then produces a scatterplot using the
#' user-specified eigenvectors (ie., principal components or PCs) from the eigenvector matrix. Data
#' points can be optionally color-coded based on a user-specified variable.
#'
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @inheritParams ggpca
#' @return A ggplot object unless `return_data` is `TRUE`, then a data frame with the
#' user-specified PCs, grouping variable, and an attribute for the percent variace explained for
#' each user-specified PC.
#' @seealso \code{\link{prcomp}}, \code{\link{SummarizedExperiment}}, \code{\link{assay}},
#' \code{\link{colData}}
#' @export
plot_pca <- function(data,
ids = NULL,
invert = F,
group_var = NULL,
pc_x = 1,
pc_y = 2,
ntop = nrow(data),
center = T,
scale = T,
equal_axes = F,
point_size = 3,
alpha = 0.75,
color = "red4",
palette = "Paired",
return_data = F) {
if (!is.null(ids)) {
if (invert) {
data <- data[, !colnames(data) %in% ids, drop = F]
} else {
data <- data[, colnames(data) %in% ids, drop = F]
}
}
ggout <- ggpca(
data = data,
group_var = group_var,
pc_x = pc_x,
pc_y = pc_y,
ntop = ntop,
center = center,
scale = scale,
equal_axes = equal_axes,
point_size = point_size,
alpha = alpha,
color = color,
palette = palette,
return_data = return_data
)
return(ggout)
}
#' Plot feature set expression fraction
#'
#' Create a feature set fraction histogram and boxplot.
#'
#' Calculates the fraction of feature (e.g., gene) expression represented by a user-specified set
#' of features and plots this as a histogram and boxplot. The two plots are intended to be used as
#' a single column, two row figure panel.
#'
#' @param data A feature (row) by sample (column) data frame or matrix with expression count
#' values.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match
#' rownames in `data`.
#' @param feature_ids A vector of row names for subsetting `data` for plotting.
#' @param sample_ids A vector of column names for subsetting `data` for plotting.
#' @param invert_rows A logical. Should `feature_ids` be used for excluding rows from plotting
#' instead?
#' @param invert_cols A logical. Should `sample_ids` be used for excluding columns from plotting
#' instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}} \code{\link{hist_boxplot2}}
#' @export
plot_expression_fraction <- function(data,
group = NULL,
feature_ids = NULL,
sample_ids = NULL,
invert_rows = F,
invert_cols = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "% of total expression",
y_title = "Sample count") {
if (!is.null(sample_ids)) {
if (invert_cols) {
data <- data[, !colnames(data) %in% sample_ids, drop = F]
} else {
data <- data[, colnames(data) %in% sample_ids, drop = F]
}
}
total_expr <- colSums(data, na.rm = F)
if (!is.null(feature_ids)) {
if (invert_rows) {
data <- data[!rownames(data) %in% feature_ids, ]
} else {
data <- data[rownames(data) %in% feature_ids, ]
}
}
subset_expr <- colSums(data, na.rm = F)
expr_pct <- (subset_expr / total_expr) * 100
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, ncol(data))))
rownames(group) <- colnames(data)
}
plot_data <- data.frame(values = expr_pct)
rownames(plot_data) <- colnames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (ncol(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Plot shannon diversity index
#'
#' Create a shannon index histogram and boxplot.
#'
#' Calculates the shannon index for a user-specified set of expression features (e.g., genes) and
#' plots this as a histogram and boxplot. The two plots are intended to be used as a single column,
#' two row figure panel.
#'
#' @param min_value The minimum value (i.e., expression level) in `data` for which greater values
#' are considered in the shannon index calculation.
#' @inheritParams plot_expression_fraction
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{load_paired_end_qc_data}} \code{\link{hist_boxplot2}}
#' @export
plot_shannon_index <- function(data,
group = NULL,
min_value = 0,
feature_ids = NULL,
sample_ids = NULL,
invert_rows = F,
invert_cols = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "Shannon index",
y_title = "Sample count") {
if (!is.null(sample_ids)) {
if (invert_cols) {
data <- data[, !colnames(data) %in% sample_ids, drop = F]
} else {
data <- data[, colnames(data) %in% sample_ids, drop = F]
}
}
if (!is.null(feature_ids)) {
if (invert_rows) {
data <- data[!rownames(data) %in% feature_ids, ]
} else {
data <- data[rownames(data) %in% feature_ids, ]
}
}
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, ncol(data))))
rownames(group) <- colnames(data)
}
diversity <- apply(
data,
2,
function(sample_data) {
sample_data <- sample_data[which(sample_data > min_value)]
p <- sample_data / sum(sample_data, na.rm = T)
shannon_index <- -1 * sum(p * log(p))
return(shannon_index)
}
)
plot_data <- data.frame(values = diversity, row.names = colnames(data))
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (ncol(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Plot partitioned mean expression
#'
#' Create a histogram and boxplot with per sample mean expression levels partitioned by a
#' categorical variable.
#'
#' Calculates the mean across a user-specified set of expression features (e.g., genes) for each
#' sample and plots this as a histogram and boxplot. The two plots are intended to be used as a
#' single column, two row figure panel. Data points in the plots are partitioned by a categorical
#' variable.
#'
#' @param data A SummarizedExperiment-like object. Must be compatible with `assay()` and
#' `colData()`.
#' @param group_var A string for the categorical variable to use for partitioning.
#' @param feature_ids A vector of row names for subsetting `data` for plotting.
#' @param sample_ids A vector of column names for subsetting `data` for plotting.
#' @param invert_rows A logical. Should `feature_ids` be used for excluding rows from plotting
#' instead?
#' @param invert_cols A logical. Should `sample_ids` be used for excluding columns from plotting
#' instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{hist_boxplot2}}
#' @export
plot_partitioned_mean_expression <- function(data,
group_var,
feature_ids = NULL,
sample_ids = NULL,
invert_rows = F,
invert_cols = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "gray50",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.5,
jitter_size = 1.75,
x_title = "Mean expression",
y_title = "Sample count") {
if (!is.null(sample_ids)) {
if (invert_cols) {
data <- data[, !colnames(data) %in% sample_ids, drop = F]
} else {
data <- data[, colnames(data) %in% sample_ids, drop = F]
}
}
if (!is.null(feature_ids)) {
if (invert_rows) {
data <- data[!rownames(data) %in% feature_ids, ]
} else {
data <- data[rownames(data) %in% feature_ids, ]
}
}
plot_data <- data.frame(
values = colMeans(assay(data)),
group = colData(data)[, group_var, drop = T]
)
rownames(plot_data) <- colnames(data)
if (return_data) {
return(plot_data)
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
ggout_list <- hist_boxplot2(
data = plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
return(ggout_list)
}
#' Plot partitioned principal component
#'
#' Create a histogram and boxplot with per sample principal component values partitioned by a
#' categorical variable.
#'
#' Runs PCA using a user-specified set of expression features (e.g., genes) and plots a
#' user-specified PC as a histogram and boxplot. The two plots are intended to be used as a
#' single column, two row figure panel. Data points in the plots are partitioned by a categorical
#' variable.
#'
#' @param pc An integer for the principal component (ranked from highest to lowest variance
#' contribution) to use for plotting.
#' @param center A logical. Should the data be zero-centered prior to PCA?
#' @param scale A logical. Should the data be scaled to unit variance prior to PCA?
#' @inheritParams plot_partitioned_mean_expression
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{hist_boxplot2}} \code{\link{prcomp}}
#' @export
plot_partitioned_pc <- function(data,
pc = 1,
group_var,
center = T,
scale = T,
feature_ids = NULL,
sample_ids = NULL,
invert_rows = F,
invert_cols = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "gray50",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.5,
jitter_size = 1.75,
x_title = NULL,
y_title = "Sample count") {
if (!is.null(sample_ids)) {
if (invert_cols) {
data <- data[, !colnames(data) %in% sample_ids, drop = F]
} else {
data <- data[, colnames(data) %in% sample_ids, drop = F]
}
}
if (!is.null(feature_ids)) {
if (invert_rows) {
data <- data[!rownames(data) %in% feature_ids, ]
} else {
data <- data[rownames(data) %in% feature_ids, ]
}
}
pc_name <- paste0("PC", pc)
pca <- prcomp(t(SummarizedExperiment::assay(data)), center = center, scale. = scale)
pct_var <- pca$sdev[pc]^2 / sum(pca$sdev^2)
plot_data <- data.frame(
values = pca$x[, pc],
group = SummarizedExperiment::colData(data)[, group_var, drop = T]
)
rownames(plot_data) <- colnames(data)
if (return_data) {
return(plot_data)
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (is.null(x_title)) {
x_title <- paste0(pc_name, " (", round(pct_var * 100), "% variance)")
}
ggout_list <- hist_boxplot2(
data = plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
return(ggout_list)
}
#' Plot RIN scores
#'
#' Create a histogram and boxplot of RIN scores. The two plots are intended to be used as a single
#' column, two row figure panel.
#'
#' @param data A single column data frame with RIN score values and sample IDs as rownames.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match
#' rownames in `data`.
#' @param ids A vector of rownames for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{hist_boxplot2}}
#' @export
plot_rin <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "RIN",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, , drop = F]
} else {
data <- data[rownames(data) %in% ids, , drop = F]
}
}
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
plot_data <- data.frame(values = data[, 1])
rownames(plot_data) <- rownames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Plot intergenic-genic ratio
#'
#' Create an intergenic-genic mapping proportion histogram and boxplot.
#'
#' Calculates the ratio between intergenic and genic mapping rates and plots this as a histogram
#' and boxplot. The two plots are intended to be used as a single column, two row figure panel. The
#' intergenic and genic mapping rates can be calculated uing
#' \code{\link{plot_mapping_categories}}.
#'
#' @param data A three column data frame with intergenic, intronic, and exonic mapping rates as
#' the first, second, and third column, respectively.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match
#' rownames in `data`.
#' @param ids A vector of row names for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{plot_mapping_categories}} \code{\link{hist_boxplot2}}
#' @export
plot_intergenic_genic_ratio <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "Intergenic-genic mappping ratio",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, ]
} else {
data <- data[rownames(data) %in% ids, ]
}
}
colnames(data) <- c("intergenic", "intronic", "exonic")
contamination_ratio <- data$intergenic / (data$intronic + data$exonic)
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
plot_data <- data.frame(values = contamination_ratio)
rownames(plot_data) <- rownames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Plot DNA contamination ratio
#'
#' Create an intergenic-intronic mapping proportion histogram and boxplot.
#'
#' Calculates the ratio between intergenic and intronic mapping rates and plots this as a histogram
#' and boxplot. The two plots are intended to be used as a single column, two row figure panel. The
#' intergenic and intronic mapping rates can be calculated uing
#' \code{\link{plot_mapping_categories}}.
#'
#' @param data A two column data frame with intergenic mapping rates as the first column and
#' intronic mapping rates as the second column.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match
#' rownames in `data`.
#' @param ids A vector of row names for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{plot_mapping_categories}} \code{\link{hist_boxplot2}}
#' @export
plot_dna_contamination_ratio <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "Intergenic-intronic mappping ratio",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, ]
} else {
data <- data[rownames(data) %in% ids, ]
}
}
colnames(data) <- c("intergenic", "intronic")
mapping_ratio <- data$intergenic / data$intronic
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
plot_data <- data.frame(values = mapping_ratio)
rownames(plot_data) <- rownames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
#' Plot gene mapping rate
#'
#' Create a gene mapping rate histogram and boxplot.
#'
#' Calculates the gene mapping rate (combined exonic and intronic mapping rates) and plots this as
#' a histogram and boxplot. The two plots are intended to be used as a single column, two row
#' figure panel. The exonic and intronic mapping rates can be calculated uing
#' \code{\link{plot_mapping_categories}}.
#'
#' @param data A two column data frame with exonic mapping rates as the first column and
#' intronic mapping rates as the second column.
#' @param group A single column data frame denoting categorical variable values to use to
#' partition the data points into separate distributions/boxplots. Row names should match
#' rownames in `data`.
#' @param ids A vector of row names for subsetting `data` for plotting.
#' @param invert A logical. Should `ids` be used for excluding rows from plotting instead?
#' @param return_data A logical. Should plot data be returned instead of a ggplot object?
#' @inheritParams hist_boxplot2
#' @return A list with two ggplot objects `hist` and `boxplot` corresponding to a histogram and
#' jittered boxplot respectively. If `return_data` is `TRUE`, then a data frame with the plot
#' data values and grouping variable.
#' @seealso \code{\link{plot_mapping_categories}} \code{\link{hist_boxplot2}}
#' @export
plot_gene_mapping_rate <- function(data,
group = NULL,
ids = NULL,
invert = F,
return_data = F,
binsize = NULL,
colors = NULL,
hist_alpha = 0.75,
box_fill = "goldenrod",
box_alpha = 0.5,
box_lwd = 1,
jitter_alpha = 0.75,
jitter_size = 1.75,
x_title = "Gene mappping %",
y_title = "Sample count") {
if (!is.null(ids)) {
if (invert) {
data <- data[!rownames(data) %in% ids, ]
} else {
data <- data[rownames(data) %in% ids, ]
}
}
colnames(data) <- c("exonic", "intronic")
mapping_pct <- (data$exonic + data$intronic)
group0 <- group
if (is.null(group)) {
group <- data.frame(group = factor(rep(1, nrow(data))))
rownames(group) <- rownames(data)
}
plot_data <- data.frame(values = mapping_pct)
rownames(plot_data) <- rownames(data)
plot_data <- merge(x = plot_data, y = group, by = 0, all = F)
rownames(plot_data) <- plot_data[, 1]
plot_data <- plot_data[, -1] # remove new row name column
plot_data$values <- as.numeric(plot_data$values)
if (nrow(data) > nrow(plot_data)) {
warning("Not all samples from 'data' have a value in 'group'")
}
if (is.null(binsize)) {
binsize <- diff(range(plot_data$values)) / 25
}
if (return_data) {
return(plot_data)
}
ggout_list <- hist_boxplot2(
plot_data,
binsize = binsize,
colors = colors,
hist_alpha = hist_alpha,
box_fill = box_fill,
box_alpha = box_alpha,
box_lwd = box_lwd,
jitter_alpha = jitter_alpha,
jitter_size = jitter_size,
x_title = x_title,
y_title = y_title
)
if (is.null(group0)) {
ggout_list$hist <- ggout_list$hist +
theme(legend.position = "none")
ggout_list$boxplot <- ggout_list$boxplot +
theme(legend.position = "none")
}
return(ggout_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.