#!/usr/bin/env Rscript
#
# Copyright 2013 - 2022 Michael K. Schuster
#
# Biomedical Sequencing Facility (BSF), part of the genomics core facility of
# the Research Center for Molecular Medicine (CeMM) of the Austrian Academy of
# Sciences and the Medical University of Vienna (MUW).
#
#
# This file is part of BSF R.
#
# BSF R is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# BSF R is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with BSF R. If not, see <http://www.gnu.org/licenses/>.
# Description -------------------------------------------------------------
# BSF R script to summarise a variant calling analysis. Picard Duplication
# Metrics, Picard Alignment Summary Metrics and Picard Hybrid Selection Metrics
# reports are read for each sample and plotted at the read group or sample
# level.
# Option Parsing ----------------------------------------------------------
suppressPackageStartupMessages(expr = library(package = "optparse"))
argument_list <-
optparse::parse_args(object = optparse::OptionParser(
option_list = list(
optparse::make_option(
opt_str = "--verbose",
action = "store_true",
default = TRUE,
help = "Print extra output [default]",
type = "logical"
),
optparse::make_option(
opt_str = "--quiet",
action = "store_false",
default = FALSE,
dest = "verbose",
help = "Print little output",
type = "logical"
),
optparse::make_option(
opt_str = "--prefix",
dest = "prefix",
help = "File name prefix",
type = "character"
),
optparse::make_option(
opt_str = "--plot-width",
default = 7.0,
dest = "plot_width",
help = "Plot width in inches [7.0]",
type = "double"
),
optparse::make_option(
opt_str = "--plot-height",
default = 7.0,
dest = "plot_height",
help = "Plot height in inches [7.0]",
type = "double"
)
)
))
# Library Import ----------------------------------------------------------
# CRAN r-lib
suppressPackageStartupMessages(expr = library(package = "sessioninfo"))
# CRAN Tidyverse
suppressPackageStartupMessages(expr = library(package = "ggplot2"))
suppressPackageStartupMessages(expr = library(package = "tidyr"))
# Save plots in the following formats.
graphics_formats <- c("pdf" = "pdf", "png" = "png")
# Maximum size for the PNG device in inches.
graphics_maximum_size_png <- 100.0
# Assign a file prefix.
prefix_summary <- "variant_calling_summary"
if (is.null(x = argument_list$prefix)) {
# If a prefix was not provided, try to get it from a cohort-level file.
file_names <-
base::list.files(pattern = "^variant_calling_process_cohort_.*_genotyped_raw_snp_raw_indel.vcf.gz$")
for (file_name in file_names) {
cohort_name <-
base::gsub(pattern = "^variant_calling_process_cohort_(.*?)_genotyped_raw_snp_raw_indel.vcf.gz$",
replacement = "\\1",
x = file_name)
message("Cohort name: ", cohort_name)
prefix_summary <-
paste("variant_calling_summary", cohort_name, sep = "_")
rm(cohort_name)
}
rm(file_names)
} else {
prefix_summary <- argument_list$prefix
}
# Picard Duplication Metrics ----------------------------------------------
# Process Picard Duplication Metrics reports.
message("Processing Picard Duplication Metrics reports for sample:")
combined_metrics_sample_frame <- NULL
file_names <-
base::list.files(pattern = "^variant_calling_process_sample_.*_duplicate_metrics.tsv$")
for (file_name in file_names) {
sample_name <-
base::gsub(pattern = "^variant_calling_process_sample_(.*?)_duplicate_metrics.tsv$",
replacement = "\\1",
x = file_name)
message(" ", sample_name)
# Picard Tools added a histogram section that needs excluding from parsing.
# Find the lines starting with "## METRICS CLASS" and "## HISTOGRAM" and read
# that many lines.
metrics_lines <- readLines(con = file_name)
metrics_line <-
which(x = grepl(pattern = "## METRICS CLASS", x = metrics_lines))
histogram_line <-
which(x = grepl(pattern = "## HISTOGRAM", x = metrics_lines))
if (length(x = histogram_line)) {
# Set the number of rows to read excluding 3 more lines,
# the "## HISTOGRAM" line, the blank line and the header line.
number_read <- histogram_line[1L] - metrics_line[1L] - 3L
number_skip <- metrics_line[1L]
} else {
number_read <- -1L
number_skip <- metrics_line[1L]
}
picard_metrics_sample_frame <-
utils::read.table(
file = file_name,
header = TRUE,
sep = "\t",
nrows = number_read,
skip = number_skip,
fill = TRUE,
comment.char = "",
stringsAsFactors = FALSE
)
rm(number_read,
number_skip,
histogram_line,
metrics_line,
metrics_lines)
# Add the sample name, which is not part of the Picard report.
picard_metrics_sample_frame$SAMPLE <-
as.character(x = sample_name)
# The Picard Duplication Metrics report has changed format through versions.
# Column SECONDARY_OR_SUPPLEMENTARY_RDS was added at a later stage.
if (is.null(x = picard_metrics_sample_frame$SECONDARY_OR_SUPPLEMENTARY_RDS)) {
picard_metrics_sample_frame$SECONDARY_OR_SUPPLEMENTARY_RDS <- 0L
}
combined_metrics_sample_frame <-
base::rbind(combined_metrics_sample_frame,
picard_metrics_sample_frame)
rm(picard_metrics_sample_frame)
}
rm(file_name, file_names)
if (!is.null(x = combined_metrics_sample_frame)) {
# Order the sample frame by SAMPLE.
combined_metrics_sample_frame <-
combined_metrics_sample_frame[order(combined_metrics_sample_frame$SAMPLE),]
# Convert the SAMPLE column into factors, which come more handy for plotting.
combined_metrics_sample_frame$SAMPLE <-
as.factor(x = combined_metrics_sample_frame$SAMPLE)
# Add additional percentages into the table.
combined_metrics_sample_frame$PERCENT_UNPAIRED_READ_DUPLICATION <-
combined_metrics_sample_frame$UNPAIRED_READ_DUPLICATES / combined_metrics_sample_frame$UNPAIRED_READS_EXAMINED
combined_metrics_sample_frame$PERCENT_READ_PAIR_DUPLICATION <-
combined_metrics_sample_frame$READ_PAIR_DUPLICATES / combined_metrics_sample_frame$READ_PAIRS_EXAMINED
combined_metrics_sample_frame$PERCENT_READ_PAIR_OPTICAL_DUPLICATION <-
combined_metrics_sample_frame$READ_PAIR_OPTICAL_DUPLICATES / combined_metrics_sample_frame$READ_PAIRS_EXAMINED
utils::write.table(
x = combined_metrics_sample_frame,
file = paste(prefix_summary, "duplication_metrics_sample.tsv", sep = "_"),
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
# Adjust the plot width according to batches of 24 samples or read groups.
plot_width_sample <-
argument_list$plot_width + (ceiling(x = nlevels(x = combined_metrics_sample_frame$SAMPLE) / 24L) - 1L) * argument_list$plot_width * 0.3
# message("Plot width sample: ", plot_width_sample)
# Plot Duplication Fraction per Sample ----------------------------------
message("Plotting the duplication fraction per sample")
ggplot_object <-
ggplot2::ggplot(data = combined_metrics_sample_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(x = .data$SAMPLE, y = .data$PERCENT_DUPLICATION))
ggplot_object <-
ggplot_object + ggplot2::labs(x = "Sample", y = "Duplication Fraction", title = "Duplication Fraction per Sample")
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("duplication_percentage_sample", graphics_format, sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot Duplication Classes per Sample -----------------------------------
# Plot PERCENT_UNPAIRED_READ_DUPLICATION, PERCENT_READ_PAIR_DUPLICATION,
# PERCENT_READ_PAIR_OPTICAL_DUPLICATION and PERCENT_DUPLICATION per sample.
message("Plotting the duplication levels per sample")
ggplot_object <- ggplot2::ggplot(
data = tidyr::pivot_longer(
data = combined_metrics_sample_frame,
cols = c(
"PERCENT_UNPAIRED_READ_DUPLICATION",
"PERCENT_READ_PAIR_DUPLICATION",
"PERCENT_READ_PAIR_OPTICAL_DUPLICATION",
"PERCENT_DUPLICATION"
),
names_to = "DUPLICATION",
values_to = "fraction"
)
)
ggplot_object <-
ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(
x = .data$SAMPLE,
y = .data$fraction,
colour = .data$DUPLICATION
))
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Sample",
y = "Fraction",
colour = "Duplication Level",
title = "Duplication Levels per Sample"
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("duplication_levels_sample", graphics_format, sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
rm(plot_width_sample)
}
rm(combined_metrics_sample_frame)
# Picard Alignment Summary Metrics ----------------------------------------
# Process Picard Alignment Summary Metrics reports.
message("Processing Picard Alignment Summary Metrics reports for sample:")
combined_metrics_sample_frame <- NULL
combined_metrics_read_group_frame <- NULL
file_names <-
base::list.files(pattern = "^variant_calling_process_sample_.*_alignment_summary_metrics.tsv$")
for (file_name in file_names) {
sample_name <-
base::gsub(pattern = "^variant_calling_process_sample_(.*?)_alignment_summary_metrics.tsv$",
replacement = "\\1",
x = file_name)
message(" ", sample_name)
# Since the Illumina2bam tools BamIndexDecoder uses a hash character (#) in
# the read group component to separate platform unit and sample name, the
# Picard reports need special parsing. Find the ## METRICS CLASS line and
# parse without allowing further comments.
metrics_lines <- readLines(con = file_name)
metrics_line <-
which(x = grepl(pattern = "## METRICS CLASS", x = metrics_lines))
picard_metrics_total_frame <-
utils::read.table(
file = file_name,
header = TRUE,
sep = "\t",
skip = metrics_line[1L],
fill = TRUE,
comment.char = "",
stringsAsFactors = FALSE
)
rm(metrics_line, metrics_lines)
# To support numeric sample names the read.table(stringsAsFactors = FALSE) is
# turned off. Convert SAMPLE, LIBRARY and READ_GROUP into character vectors.
picard_metrics_total_frame$SAMPLE <-
as.character(x = picard_metrics_total_frame$SAMPLE)
picard_metrics_total_frame$LIBRARY <-
as.character(x = picard_metrics_total_frame$LIBRARY)
picard_metrics_total_frame$READ_GROUP <-
as.character(x = picard_metrics_total_frame$READ_GROUP)
# The Picard Alignment Metrics report has changed format through versions.
# Columns PF_READS_IMPROPER_PAIRS and PCT_PF_READS_IMPROPER_PAIRS were added
# at a later stage.
if (is.null(x = picard_metrics_total_frame$PF_READS_IMPROPER_PAIRS)) {
picard_metrics_total_frame$PF_READS_IMPROPER_PAIRS <- 0L
}
if (is.null(x = picard_metrics_total_frame$PCT_PF_READS_IMPROPER_PAIRS)) {
picard_metrics_total_frame$PCT_PF_READS_IMPROPER_PAIRS <- 0.0
}
# Select only rows showing the SAMPLE summary, i.e. showing SAMPLE, but no
# LIBRARY and READ_GROUP information.
picard_metrics_sample_frame <-
picard_metrics_total_frame[(!is.na(x = picard_metrics_total_frame$SAMPLE)) &
(picard_metrics_total_frame$SAMPLE != "") &
(picard_metrics_total_frame$LIBRARY == "") &
(picard_metrics_total_frame$READ_GROUP == ""), ]
combined_metrics_sample_frame <-
base::rbind(combined_metrics_sample_frame,
picard_metrics_sample_frame)
rm(picard_metrics_sample_frame)
# Select only rows showing READ_GROUP summary, i.e. showing READ_GROUP
# information.
picard_metrics_read_group <-
picard_metrics_total_frame[(picard_metrics_total_frame$READ_GROUP != ""), ]
combined_metrics_read_group_frame <-
base::rbind(combined_metrics_read_group_frame,
picard_metrics_read_group)
rm(picard_metrics_read_group)
rm(sample_name, picard_metrics_total_frame)
}
rm(file_name, file_names)
if (!is.null(x = combined_metrics_sample_frame)) {
# Order the data frame by SAMPLE.
combined_metrics_sample_frame <-
combined_metrics_sample_frame[order(combined_metrics_sample_frame$SAMPLE),]
# Manually convert CATEGORY and SAMPLE columns into factors, which are handy
# for plotting.
combined_metrics_sample_frame$CATEGORY <-
as.factor(x = combined_metrics_sample_frame$CATEGORY)
combined_metrics_sample_frame$SAMPLE <-
as.factor(x = combined_metrics_sample_frame$SAMPLE)
# Add an additional LABEL factor column defined as a concatenation of SAMPLE
# and CATEGORY.
combined_metrics_sample_frame$LABEL <-
as.factor(
x = paste(
combined_metrics_sample_frame$SAMPLE,
combined_metrics_sample_frame$CATEGORY,
sep =
"_"
)
)
utils::write.table(
x = combined_metrics_sample_frame,
file = paste(prefix_summary, "alignment_metrics_sample.tsv", sep = "_"),
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
# Order the data frame by READ_GROUP
combined_metrics_read_group_frame <-
combined_metrics_read_group_frame[order(combined_metrics_read_group_frame$READ_GROUP),]
# Manually convert CATEGORY and READ_GROUP columns into factors, which are
# handy for plotting.
combined_metrics_read_group_frame$CATEGORY <-
as.factor(x = combined_metrics_read_group_frame$CATEGORY)
combined_metrics_read_group_frame$READ_GROUP <-
as.factor(x = combined_metrics_read_group_frame$READ_GROUP)
# Add an additional LABEL factor column defined as a concatenation of
# READ_GROUP and CATEGORY.
combined_metrics_read_group_frame$LABEL <-
as.factor(
x = paste(
combined_metrics_read_group_frame$READ_GROUP,
combined_metrics_read_group_frame$CATEGORY,
sep =
"_"
)
)
utils::write.table(
x = combined_metrics_read_group_frame,
file = paste(prefix_summary, "alignment_metrics_read_group.tsv", sep = "_"),
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
# Adjust the plot width according to batches of 24 samples or read groups.
plot_width_sample <-
argument_list$plot_width + (ceiling(x = nlevels(x = combined_metrics_sample_frame$SAMPLE) / 24L) - 1L) * argument_list$plot_width * 0.25
plot_width_read_group <-
argument_list$plot_width + (ceiling(x = nlevels(x = combined_metrics_read_group_frame$READ_GROUP) / 24L) - 1L) * argument_list$plot_width * 0.35
# message("Plot width sample: ", plot_width_sample)
# message("Plot width read group: ", plot_width_read_group)
# Plot the aligned pass-filter reads number per sample ------------------
message("Plotting the aligned pass-filter reads number per sample")
ggplot_object <-
ggplot2::ggplot(data = combined_metrics_sample_frame[, c("CATEGORY", "SAMPLE", "PF_READS_ALIGNED"), drop = FALSE])
ggplot_object <-
ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(
x = .data$SAMPLE,
y = .data$PF_READS_ALIGNED,
colour = .data$CATEGORY
))
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Sample",
y = "Number",
colour = "Category",
title = "Aligned Pass-Filter Reads Number per Sample"
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("alignment_absolute_sample", graphics_format, sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot the aligned pass-filter reads number per read group --------------
message("Plotting the aligned pass-filter reads number per read group")
ggplot_object <-
ggplot2::ggplot(data = combined_metrics_read_group_frame[, c("CATEGORY", "READ_GROUP", "PF_READS_ALIGNED"), drop = FALSE])
ggplot_object <-
ggplot_object + ggplot2::geom_point(
mapping = ggplot2::aes(
x = .data$READ_GROUP,
y = .data$PF_READS_ALIGNED,
colour = .data$CATEGORY
)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Read Group",
y = "Number",
colour = "Category",
title = "Aligned Pass-Filter Reads Number per Read Group"
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("alignment_absolute_read_group", graphics_format, sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_read_group > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_read_group,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot the aligned pass-filter reads fraction per sample ----------------
message("Plotting the aligned pass-filter reads fraction per sample")
ggplot_object <-
ggplot2::ggplot(data = combined_metrics_sample_frame[, c("CATEGORY", "SAMPLE", "PCT_PF_READS_ALIGNED"), drop = FALSE])
ggplot_object <-
ggplot_object + ggplot2::geom_point(
mapping = ggplot2::aes(
x = .data$SAMPLE,
y = .data$PCT_PF_READS_ALIGNED,
colour = .data$CATEGORY
)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Sample",
y = "Fraction",
colour = "Category",
title = "Aligned Pass-Filter Reads Fraction per Sample"
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("alignment_percentage_sample", graphics_format, sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot the aligned pass-filter reads fraction per read group ------------
message("Plotting the aligned pass-filter reads fraction per read group")
ggplot_object <-
ggplot2::ggplot(data = combined_metrics_read_group_frame[, c("CATEGORY", "READ_GROUP", "PCT_PF_READS_ALIGNED"), drop = FALSE])
ggplot_object <-
ggplot_object + ggplot2::geom_point(
mapping = ggplot2::aes(
x = .data$READ_GROUP,
y = .data$PCT_PF_READS_ALIGNED,
colour = .data$CATEGORY
)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Read Group",
y = "Fraction",
colour = "Category",
title = "Aligned Pass-Filter Reads Fraction per Read Group"
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("alignment_percentage_read_group", graphics_format, sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_read_group > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_read_group,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot the strand balance of aligned pass-filter reads per sample -------
message("Plotting the strand balance of aligned pass-filter reads per sample")
ggplot_object <-
ggplot2::ggplot(data = combined_metrics_sample_frame[, c("CATEGORY", "SAMPLE", "STRAND_BALANCE"), drop = FALSE])
ggplot_object <-
ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(
x = .data$SAMPLE,
y = .data$STRAND_BALANCE,
colour = .data$CATEGORY
))
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Sample",
y = "Fraction",
colour = "Category",
title = "Strand Balance of Aligned Pass-Filter Reads per Sample"
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("alignment_strand_balance_sample", graphics_format, sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot the strand balance of aligned pass-filter reads per read group ----
message("Plotting the strand balance of aligned pass-filter reads per read group")
ggplot_object <-
ggplot2::ggplot(data = combined_metrics_read_group_frame[, c("CATEGORY", "READ_GROUP", "STRAND_BALANCE"), drop = FALSE])
ggplot_object <-
ggplot_object + ggplot2::geom_point(
mapping = ggplot2::aes(
x = .data$READ_GROUP,
y = .data$STRAND_BALANCE,
colour = .data$CATEGORY
)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Read Group",
y = "Fraction",
colour = "Category",
title = "Strand Balance of Aligned Pass-Filter Reads per Read Group"
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste(
"alignment_strand_balance_read_group",
graphics_format,
sep = "."
),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_read_group > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_read_group,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
rm(plot_width_read_group, plot_width_sample)
}
rm(combined_metrics_read_group_frame,
combined_metrics_sample_frame)
# Picard Hybrid Selection Metrics -----------------------------------------
# Process Picard Hybrid Selection Metrics reports.
message("Processing Picard Hybrid Selection Metrics reports for sample:")
combined_metrics_sample_frame <- NULL
combined_metrics_read_group_frame <- NULL
file_names <-
base::list.files(pattern = "^variant_calling_diagnose_sample_.*_hybrid_selection_metrics.tsv$")
for (file_name in file_names) {
sample_name <-
base::gsub(pattern = "^variant_calling_diagnose_sample_(.*?)_hybrid_selection_metrics.tsv$",
replacement = "\\1",
x = file_name)
message(" ", sample_name)
# Picard Tools added a histogram section that needs excluding from parsing.
# Find the lines starting with "## METRICS CLASS" and "## HISTOGRAM" and read
# that many lines.
metrics_lines <- readLines(con = file_name)
metrics_line <-
which(x = grepl(pattern = "## METRICS CLASS", x = metrics_lines))
histogram_line <-
which(x = grepl(pattern = "## HISTOGRAM", x = metrics_lines))
if (length(x = histogram_line)) {
number_read <- histogram_line[1L] - metrics_line[1L] - 3L
number_skip <- metrics_line[1L]
} else {
number_read <- -1L
number_skip <- metrics_line[1L]
}
picard_metrics_total_frame <-
utils::read.table(
file = file_name,
header = TRUE,
sep = "\t",
# Set the number of rows excluding 3 more lines,
# the "## HISTOGRAM" line, the blank line and the header line.
nrows = number_read,
skip = number_skip,
fill = TRUE,
comment.char = "",
stringsAsFactors = FALSE
)
rm(number_read,
number_skip,
histogram_line,
metrics_line,
metrics_lines)
# To support numeric sample names the read.table(stringsAsFactors = FALSE) is
# turned off. Convert SAMPLE, LIBRARY and READ_GROUP into character vectors.
picard_metrics_total_frame$SAMPLE <-
as.character(x = picard_metrics_total_frame$SAMPLE)
picard_metrics_total_frame$LIBRARY <-
as.character(x = picard_metrics_total_frame$LIBRARY)
picard_metrics_total_frame$READ_GROUP <-
as.character(x = picard_metrics_total_frame$READ_GROUP)
# The Picard Hybrid Selection Metrics report has changed format through versions.
# Column PCT_TARGET_BASES_1X was added at a later stage.
if (!"PCT_TARGET_BASES_1X" %in% base::names(x = picard_metrics_total_frame)) {
picard_metrics_total_frame$PCT_TARGET_BASES_1X <- 0.0
}
if (!"MAX_TARGET_COVERAGE" %in% base::names(x = picard_metrics_total_frame)) {
picard_metrics_total_frame$MAX_TARGET_COVERAGE <- 0L
}
if (!"PCT_EXC_ADAPTER" %in% base::names(x = picard_metrics_total_frame)) {
picard_metrics_total_frame$PCT_EXC_ADAPTER <- 0.0
}
if (!"PF_BASES" %in% base::names(x = picard_metrics_total_frame)) {
picard_metrics_total_frame$PF_BASES <- 0L
}
# Select only rows showing the SAMPLE summary, i.e. showing SAMPLE, but no
# LIBRARY and READ_GROUP information.
picard_metrics_sample_frame <-
picard_metrics_total_frame[(!is.na(x = picard_metrics_total_frame$SAMPLE)) &
(picard_metrics_total_frame$SAMPLE != "") &
(picard_metrics_total_frame$LIBRARY == "") &
(picard_metrics_total_frame$READ_GROUP == ""), ]
combined_metrics_sample_frame <-
base::rbind(combined_metrics_sample_frame,
picard_metrics_sample_frame)
rm(picard_metrics_sample_frame)
# Select only rows showing READ_GROUP summary, i.e. showing READ_GROUP
# information.
picard_metrics_read_group <-
picard_metrics_total_frame[(picard_metrics_total_frame$READ_GROUP != ""), ]
combined_metrics_read_group_frame <-
base::rbind(combined_metrics_read_group_frame,
picard_metrics_read_group)
rm(picard_metrics_read_group)
rm(sample_name, picard_metrics_total_frame)
}
rm(file_name, file_names)
# The Picard Hybrid Selection Metrics is currently optional.
if (!is.null(x = combined_metrics_sample_frame)) {
# Sort the data frame by SAMPLE.
combined_metrics_sample_frame <-
combined_metrics_sample_frame[order(combined_metrics_sample_frame$SAMPLE), ]
# Manually convert BAIT_SET and SAMPLE columns into factors, which are handy
# for plotting.
combined_metrics_sample_frame$BAIT_SET <-
as.factor(x = combined_metrics_sample_frame$BAIT_SET)
combined_metrics_sample_frame$SAMPLE <-
as.factor(x = combined_metrics_sample_frame$SAMPLE)
utils::write.table(
x = combined_metrics_sample_frame,
file = paste(prefix_summary, "hybrid_metrics_sample.tsv", sep = "_"),
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
# Sort the data frame by READ_GROUP.
combined_metrics_read_group_frame <-
combined_metrics_read_group_frame[order(combined_metrics_read_group_frame$READ_GROUP), ]
# Manually convert BAIT_SET and READ_GROUP columns into factors, which are
# handy for plotting.
combined_metrics_read_group_frame$BAIT_SET <-
as.factor(x = combined_metrics_read_group_frame$BAIT_SET)
combined_metrics_read_group_frame$READ_GROUP <-
as.factor(x = combined_metrics_read_group_frame$READ_GROUP)
utils::write.table(
x = combined_metrics_read_group_frame,
file = paste(prefix_summary, "hybrid_metrics_read_group.tsv", sep = "_"),
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
# Adjust the plot width according to batches of 24 samples or read groups.
plot_width_sample <- argument_list$plot_width + (ceiling(x = (
nlevels(x = combined_metrics_sample_frame$SAMPLE) / 24L
)) - 1L) * argument_list$plot_width * 0.25
plot_width_read_group <- argument_list$plot_width + (ceiling(x = (
nlevels(x = combined_metrics_read_group_frame$READ_GROUP) / 24L
)) - 1L) * argument_list$plot_width * 0.25
# message("Plot width sample: ", plot_width_sample)
# message("Plot width read group: ", plot_width_read_group)
# Plot the percentage of unique pass-filter reads per sample ------------
message("Plotting the percentage of unique pass-filter reads per sample")
ggplot_object <-
ggplot2::ggplot(data = combined_metrics_sample_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(x = .data$SAMPLE, y = .data$PCT_PF_UQ_READS))
ggplot_object <-
ggplot_object + ggplot2::labs(x = "Sample" , y = "Fraction PF Unique", title = "Unique Pass-Filter Reads per Sample")
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("hybrid_unique_percentage_sample", graphics_format, sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot the percentage of unique pass-filter reads per read group --------
message("Plotting the percentage of unique pass-filter reads per read group")
ggplot_object <-
ggplot2::ggplot(data = combined_metrics_read_group_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_point(
mapping = ggplot2::aes(
x = .data$READ_GROUP,
y = .data$PCT_PF_UQ_READS,
shape = .data$BAIT_SET
)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Read Group",
y = "PF Unique Fraction",
shape = "Bait Set",
title = "Unique Pass-Filter Reads per Read Group"
)
# For more than six shapes (scale_shape()), a manual scale
# (scale_shape_manual()) needs setting up.
# https://ggplot2.tidyverse.org/reference/scale_shape.html
ggplot_object <-
ggplot_object + ggplot2::scale_shape_manual(values = seq_len(
length.out = nlevels(x = combined_metrics_read_group_frame$BAIT_SET)
))
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.5),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste(
"hybrid_unique_percentage_read_group",
graphics_format,
sep = "."
),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_read_group > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_read_group,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot the mean target coverage per sample ------------------------------
message("Plotting the mean target coverage per sample")
ggplot_object <-
ggplot2::ggplot(data = combined_metrics_sample_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(x = .data$SAMPLE, y = .data$MEAN_TARGET_COVERAGE))
ggplot_object <-
ggplot_object + ggplot2::labs(x = "Sample", y = "Mean Target Coverage", title = "Mean Target Coverage per Sample")
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("hybrid_target_coverage_sample", graphics_format, sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot the mean target coverage per read group --------------------------
message("Plotting the mean target coverage per read group")
ggplot_object <-
ggplot2::ggplot(data = combined_metrics_read_group_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_point(
mapping = ggplot2::aes(
x = .data$READ_GROUP,
y = .data$MEAN_TARGET_COVERAGE,
shape = .data$BAIT_SET
)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Read Group",
y = "Mean Target Coverage",
shape = "Bait Set",
title = "Mean Target Coverage per Read Group"
)
# For more than six shapes (scale_shape()), a manual scale
# (scale_shape_manual()) needs setting up.
# https://ggplot2.tidyverse.org/reference/scale_shape.html
ggplot_object <-
ggplot_object + ggplot2::scale_shape_manual(values = seq_len(
length.out = nlevels(x = combined_metrics_read_group_frame$BAIT_SET)
))
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.5),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste(
"hybrid_target_coverage_read_group",
graphics_format,
sep = "."
),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_read_group > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_read_group,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot the percentage of exluded bases per sample -----------------------
message("Plotting the percentage of excluded bases per sample")
ggplot_object <- ggplot2::ggplot(
data = tidyr::pivot_longer(
data = combined_metrics_sample_frame,
cols = c(
"PCT_EXC_DUPE",
"PCT_EXC_MAPQ",
"PCT_EXC_BASEQ",
"PCT_EXC_OVERLAP",
"PCT_EXC_OFF_TARGET"
),
names_to = "EXCLUDED",
values_to = "fraction"
)
)
ggplot_object <-
ggplot_object + ggplot2::geom_col(
mapping = ggplot2::aes(
x = .data$SAMPLE,
y = .data$fraction,
fill = .data$EXCLUDED
),
alpha = I(1 / 3)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Sample",
y = "Fraction",
fill = "Excluded",
title = "Excluded Bases per Sample"
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("hybrid_excluded_bases_sample",
graphics_format,
sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot the percentage of excluded bases per read group ------------------
message("Plotting the percentage of excluded bases per read group")
ggplot_object <- ggplot2::ggplot(
data = tidyr::pivot_longer(
data = combined_metrics_read_group_frame,
cols = c(
"PCT_EXC_DUPE",
"PCT_EXC_MAPQ",
"PCT_EXC_BASEQ",
"PCT_EXC_OVERLAP",
"PCT_EXC_OFF_TARGET"
),
names_to = "EXCLUDED",
values_to = "fraction"
)
)
ggplot_object <-
ggplot_object + ggplot2::geom_col(
mapping = ggplot2::aes(
x = .data$READ_GROUP,
y = .data$fraction,
fill = .data$EXCLUDED,
colour = .data$BAIT_SET
),
alpha = I(1 / 3)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Read Group",
y = "Fraction",
fill = "Excluded",
colour = "Bait Set",
title = "Excluded Bases per Read Group"
)
ggplot_object <-
ggplot_object + ggplot2::guides(
fill = ggplot2::guide_legend(order = 1L),
colour = ggplot2::guide_legend(order = 2L)
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.5),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste(
"hybrid_excluded_bases_read_group",
graphics_format,
sep = "."
),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot target coverage levels per sample --------------------------------
# Plot PCT_TARGET_BASES_1X, PCT_TARGET_BASES_2X, PCT_TARGET_BASES_10X,
# PCT_TARGET_BASES_20X, PCT_TARGET_BASES_30X, PCT_TARGET_BASES_40X,
# PCT_TARGET_BASES_50X, PCT_TARGET_BASES_100X per sample.
message("Plotting the coverage levels per sample")
ggplot_object <- ggplot2::ggplot(
data = tidyr::pivot_longer(
data = combined_metrics_sample_frame,
cols = c(
"PCT_TARGET_BASES_1X",
"PCT_TARGET_BASES_2X",
"PCT_TARGET_BASES_10X",
"PCT_TARGET_BASES_20X",
"PCT_TARGET_BASES_30X",
"PCT_TARGET_BASES_40X",
"PCT_TARGET_BASES_50X",
"PCT_TARGET_BASES_100X"
),
names_to = "COVERAGE",
values_to = "fraction"
)
)
ggplot_object <-
ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(
x = .data$SAMPLE,
y = .data$fraction,
colour = .data$COVERAGE
))
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Sample",
y = "Fraction",
colour = "Coverage Level",
title = "Coverage Levels per Sample"
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste(
"hybrid_target_coverage_levels_sample",
graphics_format,
sep = "."
),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot target coverage levels per read group ----------------------------
# Plot PCT_TARGET_BASES_1X, PCT_TARGET_BASES_2X, PCT_TARGET_BASES_10X,
# PCT_TARGET_BASES_20X, PCT_TARGET_BASES_30X, PCT_TARGET_BASES_40X,
# PCT_TARGET_BASES_50X, PCT_TARGET_BASES_100X per read group.
message("Plotting the coverage levels per read group")
ggplot_object <- ggplot2::ggplot(
data = tidyr::pivot_longer(
data = combined_metrics_read_group_frame,
cols = c(
"PCT_TARGET_BASES_1X",
"PCT_TARGET_BASES_2X",
"PCT_TARGET_BASES_10X",
"PCT_TARGET_BASES_20X",
"PCT_TARGET_BASES_30X",
"PCT_TARGET_BASES_40X",
"PCT_TARGET_BASES_50X",
"PCT_TARGET_BASES_100X"
),
names_to = "COVERAGE",
values_to = "fraction"
)
)
ggplot_object <-
ggplot_object + ggplot2::geom_point(
mapping = ggplot2::aes(
x = .data$READ_GROUP,
y = .data$fraction,
colour = .data$COVERAGE,
shape = .data$BAIT_SET
)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Read Group",
y = "Fraction",
colour = "Coverage",
shape = "Bait Set",
title = "Coverage Levels per Read Group"
)
# For more than six shapes (scale_shape()), a manual scale
# (scale_shape_manual()) needs setting up.
# https://ggplot2.tidyverse.org/reference/scale_shape.html
ggplot_object <-
ggplot_object + ggplot2::scale_shape_manual(values = seq_len(
length.out = nlevels(x = combined_metrics_read_group_frame$BAIT_SET)
))
ggplot_object <-
ggplot_object + ggplot2::guides(
colour = ggplot2::guide_legend(order = 1L),
shape = ggplot2::guide_legend(order = 2L)
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.5),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste(
"hybrid_target_coverage_levels_read_group",
graphics_format,
sep = "."
),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object)
# Plot the nominal coverage per sample ----------------------------------
# Plot the nominal coverage (i.e. PF_BASES_ALIGNED / TARGET_TERRITORY) per
# sample.
message("Plotting the nominal coverage per sample")
plotting_frame <-
combined_metrics_sample_frame[, c("SAMPLE",
"READ_GROUP",
"BAIT_SET",
"PF_BASES_ALIGNED",
"TARGET_TERRITORY")]
plotting_frame$NOMINAL_COVERAGE <-
plotting_frame$PF_BASES_ALIGNED / plotting_frame$TARGET_TERRITORY
ggplot_object <- ggplot2::ggplot(data = plotting_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(x = .data$SAMPLE,
y = .data$NOMINAL_COVERAGE))
ggplot_object <-
ggplot_object + ggplot2::labs(x = "Sample", y = "Nominal Coverage", title = "Nominal Coverage per Sample")
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("hybrid_nominal_coverage_sample",
graphics_format,
sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object, plotting_frame)
# Plot the nominal coverage per read group ------------------------------
# Plot the nominal coverage (i.e. PF_BASES_ALIGNED / TARGET_TERRITORY) per
# read group.
message("Plotting the nominal coverage per read group")
plotting_frame <-
combined_metrics_read_group_frame[, c("SAMPLE",
"READ_GROUP",
"BAIT_SET",
"PF_BASES_ALIGNED",
"TARGET_TERRITORY")]
plotting_frame$NOMINAL_COVERAGE <-
plotting_frame$PF_BASES_ALIGNED / plotting_frame$TARGET_TERRITORY
ggplot_object <- ggplot2::ggplot(data = plotting_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_point(
mapping = ggplot2::aes(
x = .data$READ_GROUP,
y = .data$NOMINAL_COVERAGE,
shape = .data$BAIT_SET
)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Read Group",
y = "Nominal Coverage",
shape = "Bait Set",
title = "Nominal Coverage per Read Group"
)
# For more than six shapes (scale_shape()), a manual scale
# (scale_shape_manual()) needs setting up.
# https://ggplot2.tidyverse.org/reference/scale_shape.html
ggplot_object <-
ggplot_object + ggplot2::scale_shape_manual(values = seq_len(
length.out = nlevels(x = combined_metrics_read_group_frame$BAIT_SET)
))
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.5),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste(
"hybrid_nominal_coverage_read_group",
graphics_format,
sep = "."
),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_read_group > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_read_group,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object, plotting_frame)
rm(plot_width_read_group, plot_width_sample)
}
rm(combined_metrics_read_group_frame,
combined_metrics_sample_frame)
# Non-callable Summary Reports --------------------------------------------
# Process non-callable summary reports.
message("Processing non-callable summary reports for sample:")
# Initialise a data frame with all possible columns and rows at once.
combined_metrics_sample_frame <- data.frame(
row.names = base::list.files(pattern = "^variant_calling_diagnose_sample_.*_non_callable_summary.tsv$")
)
combined_metrics_sample_frame$file_name <-
base::row.names(x = combined_metrics_sample_frame)
combined_metrics_sample_frame$exon_path <-
character(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$exon_number_raw <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$exon_width_raw <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$exon_number <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$exon_width <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$exon_width_flank <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$transcribed_number <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$transcribed_width <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$target_path <-
character(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$target_number_raw <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$target_width_raw <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$target_width_flank <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$constrained_number <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$constrained_width <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$callable_loci_path <-
character(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$sample_name <-
character(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$non_callable_number_raw <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame$non_callable_width_raw <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
for (level in c(
"REF_N",
# "PASS" according to documentation, but should be "CALLABLE" in practice.
"NO_COVERAGE",
"LOW_COVERAGE",
"EXCESSIVE_COVERAGE",
"POOR_MAPPING_QUALITY"
)) {
combined_metrics_sample_frame[, paste("non_callable_number_constrained", level, sep = ".")] <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
combined_metrics_sample_frame[, paste("non_callable_width_constrained", level, sep = ".")] <-
integer(length = base::nrow(x = combined_metrics_sample_frame))
}
rm(level)
for (i in seq_len(length.out = base::nrow(x = combined_metrics_sample_frame))) {
sample_name <-
base::gsub(pattern = "^variant_calling_diagnose_sample_(.*?)_non_callable_summary.tsv$",
replacement = "\\1",
x = combined_metrics_sample_frame$file_name[i])
message(" ", sample_name)
non_callable_metrics_sample_frame <-
utils::read.table(
file = combined_metrics_sample_frame$file_name[i],
header = TRUE,
colClasses = c(
"exon_path" = "character",
"exon_number_raw" = "integer",
"exon_width_raw" = "integer",
"exon_number" = "integer",
"exon_width" = "integer",
"exon_width_flank" = "integer",
"transcribed_number" = "integer",
"transcribed_width" = "integer",
"target_path" = "character",
"target_number_raw" = "integer",
"target_width_raw" = "integer",
"target_width_flank" = "integer",
"constrained_number" = "integer",
"constrained_width" = "integer",
"callable_loci_path" = "character",
"sample_name" = "character",
"non_callable_number_raw" = "integer",
"non_callable_width_raw" = "integer",
"non_callable_number_constrained.TOTAL" = "integer",
"non_callable_width_constrained.TOTAL" = "integer",
"non_callable_number_constrained.REF_N" = "integer",
"non_callable_width_constrained.REF_N" = "integer",
# "non_callable_number_constrained.CALLABLE" = "integer",
# "non_callable_width_constrained.CALLABLE" = "integer",
"non_callable_number_constrained.NO_COVERAGE" = "integer",
"non_callable_width_constrained.NO_COVERAGE" = "integer",
"non_callable_number_constrained.LOW_COVERAGE" = "integer",
"non_callable_width_constrained.LOW_COVERAGE" = "integer",
"non_callable_number_constrained.EXCESSIVE_COVERAGE" = "integer",
"non_callable_width_constrained.EXCESSIVE_COVERAGE" = "integer",
"non_callable_number_constrained.POOR_MAPPING_QUALITY" = "integer",
"non_callable_width_constrained.POOR_MAPPING_QUALITY" = "integer"
),
fill = TRUE
)
for (column_name in base::names(x = combined_metrics_sample_frame)) {
if (column_name %in% base::names(x = non_callable_metrics_sample_frame)) {
combined_metrics_sample_frame[i, column_name] <-
non_callable_metrics_sample_frame[[1, column_name]]
} else {
# With the exception of the "file_name" component, set all components
# undefined in the sample-specific data frame to NA in the combined data
# frame.
if (column_name != "file_name") {
combined_metrics_sample_frame[i, column_name] <- NA
}
}
}
rm(column_name,
non_callable_metrics_sample_frame,
sample_name)
}
rm(i)
if (base::nrow(x = combined_metrics_sample_frame) > 0L) {
# Sort the data frame by sample_name.
combined_metrics_sample_frame <-
combined_metrics_sample_frame[order(combined_metrics_sample_frame$sample_name), ]
# Convert the sample_name column into factors, which come more handy for
# plotting.
combined_metrics_sample_frame$sample_name <-
as.factor(x = combined_metrics_sample_frame$sample_name)
# Adjust the plot width according to batches of 24 samples or read groups.
plot_width_sample <- argument_list$plot_width + (ceiling(x = (
nlevels(x = combined_metrics_sample_frame$sample_name) / 24L
)) - 1L) * argument_list$plot_width * 0.25
# message("Plot width sample: ", plot_width_sample)
utils::write.table(
x = combined_metrics_sample_frame,
file = paste(prefix_summary, "non_callable_metrics_sample.tsv", sep = "_"),
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
# Plot the number of non-callable loci per sample -----------------------
message("Plotting the number of non-callable loci per sample")
plotting_frame <- data.frame(
"sample_name" = combined_metrics_sample_frame$sample_name,
"constrained_number" = combined_metrics_sample_frame$constrained_number
)
# Only columns that begin with "^non_callable_number_constrained\." are
# required, the remainder is the mapping status.
column_names <- base::names(x = combined_metrics_sample_frame)
mapping_status <-
base::gsub(pattern = "^non_callable_number_constrained\\.(.*)$",
replacement = "\\1",
x = column_names)
# Extract only those columns which had a match and set the mapping status as
# their new name.
for (i in which(x = grepl(pattern = "^non_callable_number_constrained\\.", x = column_names))) {
plotting_frame[, mapping_status[i]] <-
combined_metrics_sample_frame[, column_names[i]]
}
rm(i, mapping_status, column_names)
# Now, pivot the data frame, but keep sample_name and constrained_width
# as identifiers.
plotting_frame <- tidyr::pivot_longer(
data = plotting_frame,
cols = -c("sample_name", "constrained_number"),
names_to = "mapping_status",
values_to = "number"
)
# Calculate the fractions on the basis of the constrained target number.
plotting_frame$fraction <-
plotting_frame$number / plotting_frame$constrained_number
# For the moment, remove lines with "TOTAL".
plotting_frame <-
plotting_frame[plotting_frame$mapping_status != "TOTAL", ]
ggplot_object <- ggplot2::ggplot(data = plotting_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_point(
mapping = ggplot2::aes(
x = .data$sample_name,
y = .data$number,
colour = .data$mapping_status
)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Sample",
y = "Number",
colour = "Non-Callable",
title = "Number of Non-Callable Loci per Sample"
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("non_callable_absolute_sample", graphics_format, sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object, plotting_frame)
# Plot the fraction of non-callable loci per sample ---------------------
message("Plotting the fraction of non-callable loci per sample")
# Reorganise the combined_metrics_sample_frame data frame for plotting non-callable
# target widths.
plotting_frame <- data.frame(
"sample_name" = combined_metrics_sample_frame$sample_name,
"constrained_width" = combined_metrics_sample_frame$constrained_width
)
# Only columns that begin with "^non_callable_width_constrained\." are
# required, the remainder is the mapping status.
column_names <- base::names(x = combined_metrics_sample_frame)
mapping_status <-
base::gsub(pattern = "^non_callable_width_constrained\\.(.*)$",
replacement = "\\1",
x = column_names)
# Extract only those columns which had a match and set the mapping status as
# their new name.
for (i in which(x = grepl(pattern = "^non_callable_width_constrained\\.", x = column_names))) {
plotting_frame[, mapping_status[i]] <-
combined_metrics_sample_frame[, column_names[i]]
}
rm(i, mapping_status, column_names)
# Now, pivot the data frame, but keep sample_name and constrained_width
# as identifiers.
plotting_frame <- tidyr::pivot_longer(
data = plotting_frame,
cols = -c("sample_name", "constrained_width"),
names_to = "mapping_status",
values_to = "width"
)
# Calculate the fractions on the basis of the constrained target width.
plotting_frame$fraction <-
plotting_frame$width / plotting_frame$constrained_width
# For the moment, remove lines with "TOTAL".
plotting_frame <-
plotting_frame[plotting_frame$mapping_status != "TOTAL", ]
ggplot_object <- ggplot2::ggplot(data = plotting_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_point(
mapping = ggplot2::aes(
x = .data$sample_name,
y = .data$fraction,
colour = .data$mapping_status
)
)
ggplot_object <-
ggplot_object + ggplot2::labs(
x = "Sample",
y = "Fraction",
colour = "Non-Callable",
title = "Fraction of Non-Callable Loci per Sample"
)
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
for (graphics_format in graphics_formats) {
ggplot2::ggsave(
filename = paste(
prefix_summary,
paste("non_callable_percentage_sample", graphics_format, sep = "."),
sep = "_"
),
plot = ggplot_object,
width = if (graphics_format == "png" &&
plot_width_sample > graphics_maximum_size_png)
graphics_maximum_size_png
else
plot_width_sample,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(graphics_format, ggplot_object, plotting_frame)
rm(plot_width_sample)
}
rm(combined_metrics_sample_frame)
rm(prefix_summary,
argument_list,
graphics_maximum_size_png,
graphics_formats)
message("All done")
# Finally, print all objects that have not been removed from the environment.
if (length(x = ls())) {
print(x = ls())
}
print(x = sessioninfo::session_info())
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.