#!/usr/bin/env Rscript
#
# Copyright 2013 - 2022 Michael K. Schuster
#
# Biomedical Sequencing Facility (BSF), part of the genomics core facility of
# the Research Center for Molecular Medicine (CeMM) of the Austrian Academy of
# Sciences and the Medical University of Vienna (MUW).
#
#
# This file is part of BSF R.
#
# BSF R is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# BSF R is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with BSF R. If not, see <http://www.gnu.org/licenses/>.
# Description -------------------------------------------------------------
# BSF R script to post-processes Cuffdiff output utilising the cummeRbund
# package. This script creates a cummeRbund SQLite database, writes a set of QC
# plots in PDF and PNG format, splits the large and unwieldy differential data
# tables into pairwise comparisons and creates symbolic links to the original
# Cuffdiff differential tables.
# Option Parsing ----------------------------------------------------------
suppressPackageStartupMessages(expr = library(package = "optparse"))
argument_list <-
optparse::parse_args(object = optparse::OptionParser(
option_list = list(
optparse::make_option(
opt_str = "--verbose",
action = "store_true",
default = TRUE,
help = "Print extra output [default]",
type = "logical"
),
optparse::make_option(
opt_str = "--quiet",
action = "store_false",
default = FALSE,
dest = "verbose",
help = "Print little output",
type = "logical"
),
optparse::make_option(
opt_str = "--comparison-name",
dest = "comparison_name",
help = "Comparison name",
type = "character"
),
optparse::make_option(
opt_str = "--gtf-assembly",
default = NULL,
dest = "gtf_assembly",
help = "GTF file specifying an assembled and merged transcriptome",
type = "character"
),
optparse::make_option(
opt_str = "--gtf-reference",
default = NULL,
dest = "gtf_reference",
help = "GTF file specifying a reference transcriptome",
type = "character"
),
optparse::make_option(
opt_str = "--genome-version",
default = NULL,
dest = "genome_version",
help = "Genome version",
type = "character"
),
optparse::make_option(
opt_str = "--genome-directory",
default = ".",
dest = "genome_directory",
help = "Genome directory path [.]",
type = "character"
),
optparse::make_option(
opt_str = "--output-directory",
default = ".",
dest = "output_directory",
help = "Output directory path [.]",
type = "character"
),
optparse::make_option(
opt_str = "--plot-width",
default = 7.0,
dest = "plot_width",
help = "Plot width in inches [7.0]",
type = "double"
),
optparse::make_option(
opt_str = "--plot-height",
default = 7.0,
dest = "plot_height",
help = "Plot height in inches [7.0]",
type = "double"
)
)
))
if (is.null(x = argument_list$comparison_name)) {
stop("Missing --comparison_name option")
}
if (is.null(x = argument_list$gtf_assembly)) {
stop("Missing --gtf_assembly option")
}
if (is.null(x = argument_list$gtf_reference)) {
stop("Missing --gtf_reference option")
}
if (is.null(x = argument_list$genome_version)) {
stop("Missing --genome_version option")
}
# Library Import ----------------------------------------------------------
# CRAN r-lib
suppressPackageStartupMessages(expr = library(package = "sessioninfo"))
# CRAN Tidyverse
suppressPackageStartupMessages(expr = library(package = "ggplot2"))
# Bioconductor
suppressPackageStartupMessages(expr = library(package = "BiocVersion"))
suppressPackageStartupMessages(expr = library(package = "Biostrings"))
suppressPackageStartupMessages(expr = library(package = "cummeRbund"))
suppressPackageStartupMessages(expr = library(package = "rtracklayer"))
#' Convert character vector into a comma-separated value (CSV).
#' Apply unique() and sort(), before collapsing a character vector
#' into a single element, comma-separated value.
#'
#' @param x: Character vector
#' @return: Single element character vector of comma-sepatated values
character_to_csv <- function(x) {
return(paste(sort(x = base::unique(x = x)), collapse = ","))
}
# Save plots in the following formats.
graphics_formats <- c("pdf" = "pdf", "png" = "png")
# Define Cuffdiff and output directory names relative to the
# working directory.
cuffdiff_directory <-
file.path(
argument_list$genome_directory,
paste("rnaseq", "cuffdiff", argument_list$comparison_name, sep = "_")
)
prefix <-
paste("rnaseq",
"process",
"cuffdiff",
argument_list$comparison_name,
sep = "_")
# Create a new sub-directory for plots if it does not exist.
output_directory <-
file.path(argument_list$output_directory, prefix)
if (!file.exists(output_directory)) {
dir.create(path = output_directory,
showWarnings = TRUE,
recursive = FALSE)
}
# Create a cummeRbund database --------------------------------------------
# Read and index Cuffdiff output and create a CuffSet object.
# Load also the GTF file assembled by Cuffmerge, which fulfills foreign
# key constraints between the "features", "genes" and "isoforms" SQL tables.
# The CuffSet object has slots "genes", "isoforms", "TSS" and "CDS" that each
# are instances of the CuffData class. By default, CuffData accessor methods
# applied to a CuffSet class will operate on the "genes" slot.
message("Creating or loading a cummeRbund database")
cuff_set <-
cummeRbund::readCufflinks(
dir = cuffdiff_directory,
gtfFile = argument_list$gtf_assembly,
genome = argument_list$genome_version
)
# Process Cuffdiff run information ----------------------------------------
frame_path <-
file.path(output_directory, paste0(prefix, "_run_information.tsv"))
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message("Skipping a run information table")
} else {
message("Creating a run information table")
utils::write.table(
x = cummeRbund::runInfo(object = cuff_set),
file = frame_path,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
}
rm(frame_path)
# Process Cuffdiff sample information -------------------------------------
samples_frame <- cummeRbund::samples(object = cuff_set)
# Get the number of samples.
sample_number <- base::nrow(x = samples_frame)
# Write the samples_frame table.
frame_path <-
file.path(output_directory, paste0(prefix, "_samples.tsv"))
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message("Skipping a sample table")
} else {
message("Creating a sample table")
utils::write.table(
x = samples_frame,
file = frame_path,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
}
rm(frame_path)
# Define an array of all possible pairwise sample comparisons or sample pairs.
sample_pairs <- combn(x = samples_frame$sample_name, m = 2L)
# Write a table of sample pair information by transposing the sample pairs
# array. This table allows the Python web code to link in pairwise plots.
frame_path <-
file.path(output_directory, paste0(prefix, "_sample_pairs.tsv"))
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message("Skipping a sample pairs table")
} else {
message("Creating a sample pairs table")
utils::write.table(
x = aperm(a = sample_pairs),
file = frame_path,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
}
rm(frame_path, samples_frame)
# Process Cuffdiff replicate information ----------------------------------
replicates_frame <- cummeRbund::replicates(object = cuff_set)
# Get the number of replicates.
replicate_number <- base::nrow(x = replicates_frame)
# Some plots require replicates. Check whether at least one row has a
# replicate column value greater than 0.
have_replicates <-
(base::nrow(x = replicates_frame[replicates_frame$replicate > 0L,]) > 0L)
# Write the replicates frame.
frame_path <-
file.path(output_directory, paste0(prefix, "_replicates.tsv"))
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message("Skipping a replicate table")
} else {
message("Creating a replicate table")
utils::write.table(
x = replicates_frame,
file = frame_path,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
}
rm(frame_path)
# Plot the log10(internal_scale) of the replicates_frame to
# visualise outliers.
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "replicate", "scale", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a library scale plot on replicates")
} else {
message("Creating a library scale plot on replicates")
ggplot_object <- ggplot2::ggplot(data = replicates_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(x = .data$rep_name, y = log10(.data$internal_scale)))
ggplot_object <-
ggplot_object + ggplot2::labs(x = "replicate", y = "log10(internal_scale)")
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
)
)
plot_width <-
argument_list$plot_width + (ceiling(x = replicate_number / 24L) - 1L) * argument_list$plot_width * 0.25
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, plot_width, ggplot_object)
}
rm(plot_paths, replicates_frame)
# Starting QC plotting ----------------------------------------------------
message("Starting QC plotting")
# Dispersion Plot on Genes ------------------------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "genes", "dispersion", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Dispersion Plot on Genes")
} else {
message("Creating a Dispersion Plot on Genes")
ggplot_object <-
cummeRbund::dispersionPlot(object = cummeRbund::genes(object = cuff_set))
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
# Dispersion Plot on Isoforms ---------------------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "isoforms", "dispersion", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Dispersion Plot on Isoforms")
} else {
message("Creating a Dispersion Plot on Isoforms")
base::tryCatch(
expr = {
ggplot_object <-
cummeRbund::dispersionPlot(object = cummeRbund::isoforms(object = cuff_set))
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
},
error = function(cond) {
message("Dispersion Plot on Isoforms failed with message:")
message(cond, appendLF = TRUE)
return(NULL)
}
)
}
rm(plot_paths)
# Squared Coefficient of Variation (SCV) Plot on Genes --------------------
plot_paths <-
file.path(output_directory, paste(paste(prefix, "genes", "scv", sep = "_"), graphics_formats, sep = "."))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a SCV Plot on Genes")
} else {
# The plot requires replicates.
if (have_replicates) {
message("Creating a SCV Plot on Genes")
ggplot_object <-
cummeRbund::fpkmSCVPlot(object = cummeRbund::genes(object = cuff_set))
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
} else {
message("Skipping a SCV Plot on Genes in lack of replicates")
}
}
rm(plot_paths)
# Squared Coefficient of Variation (SCV) Plot on Isoforms -----------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "isoforms", "scv", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a SCV Plot on Isoforms")
} else {
# The plot requires replicates.
if (have_replicates) {
message("Creating a SCV Plot on Isoforms")
ggplot_object <-
cummeRbund::fpkmSCVPlot(object = cummeRbund::isoforms(object = cuff_set))
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
} else {
message("Skipping a SCV Plot on Isoforms in lack of replicates")
}
}
rm(plot_paths)
# Density Plot on Genes without replicates --------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "genes", "density", "wo", "replicates", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Density Plot on Genes without replicates")
} else {
message("Creating a Density Plot on Genes without replicates")
ggplot_object <-
cummeRbund::csDensity(object = cummeRbund::genes(object = cuff_set),
replicates = FALSE)
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
# Density Plot on Genes with replicates -----------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "genes", "density", "w", "replicates", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Density Plot on Genes with replicates")
} else {
message("Creating a Density Plot on Genes with replicates")
ggplot_object <-
cummeRbund::csDensity(object = cummeRbund::genes(object = cuff_set),
replicates = TRUE)
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
# Density Plot on Isoforms without replicates -----------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "isoforms", "density", "wo", "replicates", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Density Plot on Isoforms without replicates")
} else {
message("Creating a Density Plot on Isoforms without replicates")
ggplot_object <-
cummeRbund::csDensity(object = cummeRbund::isoforms(object = cuff_set),
replicates = FALSE)
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
# Density Plot on Isoforms with replicates --------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "isoforms", "density", "w", "replicates", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Density Plot on Isoforms with replicates")
} else {
message("Creating a Density Plot on Isoforms with replicates")
ggplot_object <-
cummeRbund::csDensity(object = cummeRbund::isoforms(object = cuff_set),
replicates = TRUE)
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
# Box Plot on Genes with replicates ---------------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "genes", "box", "w", "replicates", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Box Plot on Genes with replicates")
} else {
message("Creating a Box Plot on Genes with replicates")
# By default, the csBoxplot function adds a pseudocount of 1e-04 for log10
# transformed fpkms. In case of a large number of missing values in
# shallowly sequenced samples, this affects the plot.
# Hence reproduce the plot here.
# ggplot_object <-
# cummeRbund::csBoxplot(object = cummeRbund::genes(object = cuff_set),
# replicates = TRUE)
genes_fpkm_frame <-
cummeRbund::repFpkm(object = cummeRbund::genes(object = cuff_set))
# Rename the "rep_name" column into "condition".
base::names(x = genes_fpkm_frame)[base::names(x = genes_fpkm_frame) == "rep_name"] <-
"condition"
ggplot_object <- ggplot2::ggplot(data = genes_fpkm_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_boxplot(
mapping = ggplot2::aes(
x = .data$condition,
y = log10(.data$fpkm),
fill = .data$condition
),
size = 0.3,
alpha = I(1 / 3)
)
ggplot_object <-
ggplot_object + ggplot2::labs(x = "condition", y = "log10(FPKM)", fill = "condition")
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
),
legend.text = ggplot2::element_text(size = ggplot2::rel(x = 0.8))
)
ggplot_object <-
ggplot_object + ggplot2::scale_fill_hue(l = 50, h.start = 200)
# Arrange a maximum of 24 replicates in each guide column.
# ggplot_object <-
# ggplot_object + ggplot2::guides(fill = ggplot2::guide_legend(nrow = 24))
# Use the base plot_width and add a quarter of the width for each additional
# guide legend column.
plot_width <-
argument_list$plot_width + (ceiling(x = replicate_number / 24L) - 1L) * argument_list$plot_width * 0.25
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, plot_width, ggplot_object, genes_fpkm_frame)
}
rm(plot_paths)
# Box Plot on Genes without replicates ------------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "genes", "box", "wo", "replicates", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Box Plot on Genes without replicates")
} else {
message("Creating a Box Plot on Genes without replicates")
# ggplot_object <-
# cummeRbund::csBoxplot(object = cummeRbund::genes(object = cuff_set),
# replicates = FALSE)
genes_fpkm_frame <-
cummeRbund::fpkm(object = cummeRbund::genes(object = cuff_set))
# Rename the "sample_name" column into "condition".
base::names(x = genes_fpkm_frame)[base::names(x = genes_fpkm_frame) == "sample_name"] <-
"condition"
ggplot_object <- ggplot2::ggplot(data = genes_fpkm_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_boxplot(
mapping = ggplot2::aes(
x = .data$condition,
y = log10(.data$fpkm),
fill = .data$condition
),
size = 0.3,
alpha = I(1 / 3)
)
ggplot_object <-
ggplot_object + ggplot2::labs(x = "condition", y = "log10(FPKM)", fill = "condition")
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
),
legend.text = ggplot2::element_text(size = ggplot2::rel(x = 0.8))
)
ggplot_object <-
ggplot_object + ggplot2::scale_fill_hue(l = 50, h.start = 200)
# Arrange a maximum of 24 samples in each guide column.
# ggplot_object <-
# ggplot_object + ggplot2::guides(fill = ggplot2::guide_legend(nrow = 24))
# Use the base plot_witdh and add a quarter of the width for each additional
# guide legend column.
plot_width <-
argument_list$plot_width + (ceiling(x = sample_number / 24L) - 1L) * argument_list$plot_width * 0.25
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, plot_width, ggplot_object, genes_fpkm_frame)
}
rm(plot_paths)
# Box Plot on Isoforms with replicates ------------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "isoforms", "box", "w", "replicates", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Box Plot on Isoforms with replicates")
} else {
message("Creating a Box Plot on Isoforms with replicates")
# By default, the csBoxplot function adds a pseudocount of 1e-04 for log10
# transformed fpkms. In case of a large number of missing values in
# shallowly sequenced samples, this affects the plot.
# Hence reproduce the plot here.
# ggplot_object <-
# cummeRbund::csBoxplot(object = cummeRbund::isoforms(object = cuff_set),
# replicates = TRUE)
isoforms_fpkm_frame <-
cummeRbund::repFpkm(object = cummeRbund::isoforms(object = cuff_set))
# Rename the "rep_name" column into "condition".
base::names(x = isoforms_fpkm_frame)[base::names(x = isoforms_fpkm_frame) == "rep_name"] <-
"condition"
ggplot_object <- ggplot2::ggplot(data = isoforms_fpkm_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_boxplot(
mapping = ggplot2::aes(
x = .data$condition,
y = log10(.data$fpkm),
fill = .data$condition
),
size = 0.3,
alpha = I(1 / 3)
)
ggplot_object <-
ggplot_object + ggplot2::labs(x = "condition", y = "log10(FPKM)", fill = "condition")
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
),
legend.text = ggplot2::element_text(size = ggplot2::rel(x = 0.8))
)
ggplot_object <-
ggplot_object + ggplot2::scale_fill_hue(l = 50, h.start = 200)
# Arrange a maximum of 24 replicates in each guide column.
# ggplot_object <-
# ggplot_object + ggplot2::guides(fill = ggplot2::guide_legend(nrow = 24))
# Use the base plot_witdh and add a quarter of the width for each additional
# guide legend column.
plot_width <-
argument_list$plot_width + (ceiling(x = replicate_number / 24L) - 1L) * argument_list$plot_width * 0.25
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, plot_width, ggplot_object, isoforms_fpkm_frame)
}
rm(plot_paths)
# Box Plot on Isoforms without replicates ---------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "isoforms", "box", "wo", "replicates", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Box Plot on Isoforms without replicates")
} else {
message("Creating a Box Plot on Isoforms without replicates")
# ggplot_object <-
# cummeRbund::csBoxplot(object = cummeRbund::isoforms(object = cuff_set),
# replicates = FALSE)
isoforms_fpkm_frame <-
cummeRbund::fpkm(object = cummeRbund::isoforms(object = cuff_set))
# Rename the "sample_name" column into "condition".
base::names(x = isoforms_fpkm_frame)[base::names(x = isoforms_fpkm_frame) == "sample_name"] <-
"condition"
ggplot_object <- ggplot2::ggplot(data = isoforms_fpkm_frame)
ggplot_object <-
ggplot_object + ggplot2::geom_boxplot(
mapping = ggplot2::aes(
x = .data$condition,
y = log10(.data$fpkm),
fill = .data$condition
),
size = 0.3,
alpha = I(1 / 3)
)
ggplot_object <-
ggplot_object + ggplot2::labs(x = "condition", y = "log10(FPKM)", fill = "condition")
ggplot_object <-
ggplot_object + ggplot2::theme(
axis.text.x = ggplot2::element_text(
size = ggplot2::rel(x = 0.8),
hjust = 0.0,
vjust = 0.5,
angle = 90.0
),
legend.text = ggplot2::element_text(size = ggplot2::rel(x = 0.8))
)
ggplot_object <-
ggplot_object + ggplot2::scale_fill_hue(l = 50, h.start = 200)
# Arrange a maximum of 24 replicates in each guide column.
# ggplot_object <-
# ggplot_object + ggplot2::guides(fill = ggplot2::guide_legend(nrow = 24))
# Use the base plot_witdh and add a quarter of the with for each additional
# guide legend column.
plot_width <-
argument_list$plot_width + (ceiling(x = sample_number / 24L) - 1L) * argument_list$plot_width * 0.25
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, plot_width, ggplot_object, isoforms_fpkm_frame)
}
rm(plot_paths)
# Scatter Matrix Plot on Genes --------------------------------------------
# Only include the plot for less than or equal to 20 samples.
if (sample_number <= 20L) {
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "genes", "scatter", "matrix", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Scatter Matrix Plot on Genes")
} else {
message("Creating a Scatter Matrix Plot on Genes")
ggplot_object <-
cummeRbund::csScatterMatrix(object = cummeRbund::genes(object = cuff_set))
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
}
# Scatter Matrix Plot on Isoforms -----------------------------------------
if (sample_number <= 20L) {
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "isoforms", "scatter", "matrix", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Scatter Matrix Plot on Isoforms")
} else {
message("Creating a Scatter Matrix Plot on Isoforms")
ggplot_object <-
cummeRbund::csScatterMatrix(object = cummeRbund::isoforms(object = cuff_set))
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
}
# Scatter Plot on Genes for each sample pair ------------------------------
for (i in seq_along(along.with = sample_pairs[1L,])) {
plot_paths <-
file.path(output_directory, paste(
paste(
prefix,
sample_pairs[1L, i],
sample_pairs[2L, i],
"genes",
"scatter",
sep = "_"
),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Scatter Plot on Genes for ",
sample_pairs[1L, i],
" versus ",
sample_pairs[2L, i])
} else {
message("Creating a Scatter Plot on Genes for ",
sample_pairs[1L, i],
" versus ",
sample_pairs[2L, i])
# Unfortunately, the standard cummeRbund csScatter() function
# does not allow colouring by significance.
# ggplot_object <-
# cummeRbund::csScatter(
# object = cummeRbund::genes(object = cuff_set),
# x = sample_pairs[1L, i],
# y = sample_pairs[2L, i],
# colorByStatus = TRUE
# )
# Re-implement scatter plots here.
#
# Retrieve a data frame with differntial expression data for genes, but
# remove the first "gene_id" column, which is duplicated in the second
# column as a consequence of a SQL table join between the "genes" and
# "geneExpDiffData" tables.
genes_diff_data_frame <-
cummeRbund::diffData(
object = cummeRbund::genes(object = cuff_set),
x = sample_pairs[1L, i],
y = sample_pairs[2L, i],
features = FALSE
)[,-1L]
ggplot_object <-
ggplot2::ggplot(data = genes_diff_data_frame,
mapping = ggplot2::aes(x = .data$value_1, y = .data$value_2))
ggplot_object <- ggplot_object + ggplot2::theme_light()
ggplot_object <-
ggplot_object + ggplot2::labs(x = sample_pairs[1L, i], y = sample_pairs[2L, i])
if (TRUE) {
# Plot the non-significant genes with ggplot2::geom_hex(),
# which performs much better with typical numbers of genes.
ggplot_object <-
ggplot_object + ggplot2::geom_hex(
data = genes_diff_data_frame[genes_diff_data_frame$significant == "no",],
alpha = I(1 / 3),
show.legend = TRUE,
bins = 50
)
# Manually set scale colours.
ggplot_object <-
ggplot_object +
ggplot2::scale_fill_continuous(low = "#e6f0ff", high = "#0066ff")
# Plot the significant genes with ggplot2::geom_point() in red.
ggplot_object <-
ggplot_object + ggplot2::geom_point(
data = genes_diff_data_frame[genes_diff_data_frame$significant == "yes",],
colour = "red",
size = 1.2,
alpha = I(1 / 3)
)
} else {
# Plot significant and non-significant genes with ggplot2::geom_point().
ggplot_object <-
ggplot_object + ggplot2::geom_point(
mapping = ggplot2::aes(colour = .data$significant),
size = 1.2,
alpha = I(1 / 3)
)
# Manually set scale colours.
ggplot_object <-
ggplot_object + ggplot2::scale_colour_manual(values = c("black", "red"))
}
ggplot_object <-
ggplot_object + ggplot2::geom_abline(intercept = 0,
slope = 1,
linetype = 2)
ggplot_object <-
ggplot_object + ggplot2::geom_rug(size = 0.8, alpha = 0.01)
# ggplot_object <-
# ggplot_object + ggplot2::stat_smooth(method = "lm", fill = "blue", alpha = 0.2)
ggplot_object <-
ggplot_object + ggplot2::scale_y_log10() + ggplot2::scale_x_log10()
# Annotate the plot with the (Pearson) correlation coefficient and the
# number of significantly up and downregulated genes.
# For defining the data range for label placement,
# eliminate rows with value 0.0.
range_value_1 <-
range(genes_diff_data_frame[genes_diff_data_frame$value_1 > 0.0,]$value_1)
range_value_2 <-
range(genes_diff_data_frame[genes_diff_data_frame$value_2 > 0.0,]$value_2)
# Calculate the (Pearson) correlation coefficient and place it on the plot
# in the lower right corner at 95% x and 5% y.
ggplot_object <-
ggplot_object + ggplot2::annotate(
geom = "text",
x = 10L ^ (log10(x = range_value_1[1L]) + (
log10(x = range_value_1[2L]) - log10(x = range_value_1[1L])
) * 0.95),
y = 10L ^ (log10(x = range_value_2[1L]) + (
log10(x = range_value_2[2L]) - log10(x = range_value_2[1L])
) * 0.05),
label = paste0("r = ", round(
x = cor(x = genes_diff_data_frame$value_1, y = genes_diff_data_frame$value_2),
digits = 3L
))
)
# Calculate the numbers of up and down regulated genes and place them
# on the plot in the upper left corner at 5%x and 95% y.
ggplot_object <-
ggplot_object + ggplot2::annotate(
geom = "text",
x = 10L ^ (log10(x = range_value_1[1L]) + (
log10(x = range_value_1[2L]) - log10(x = range_value_1[1L])
) * 0.05),
y = 10L ^ (log10(x = range_value_2[1L]) + (
log10(x = range_value_2[2L]) - log10(x = range_value_2[1L])
) * 0.95),
label = paste0(
"Up: ",
base::nrow(x = genes_diff_data_frame[genes_diff_data_frame$log2_fold_change > 0.0 &
genes_diff_data_frame$significant == "yes",]),
"\n",
"Down: ",
base::nrow(x = genes_diff_data_frame[genes_diff_data_frame$log2_fold_change < 0.0 &
genes_diff_data_frame$significant == "yes",])
),
colour = "red",
hjust = 0.0
)
rm(range_value_1, range_value_2)
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object, genes_diff_data_frame)
}
rm(plot_paths)
}
# Dendrogram Plot on Genes ------------------------------------------------
# The csDendro function returns a dendrogram object that cannot be saved with
# the ggplot2::ggsave() function.
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "genes", "dendrogram", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Dendrogram Plot on Genes")
} else {
message("Creating a Dendrogram Plot on Genes")
grDevices::pdf(
file = plot_paths[1L],
width = argument_list$plot_width,
height = argument_list$plot_height
)
cummeRbund::csDendro(object = cummeRbund::genes(object = cuff_set))
base::invisible(x = grDevices::dev.off())
grDevices::png(
filename = plot_paths[2L],
width = argument_list$plot_width,
height = argument_list$plot_height,
units = "in",
res = 300L
)
cummeRbund::csDendro(object = cummeRbund::genes(object = cuff_set))
base::invisible(x = grDevices::dev.off())
}
rm(plot_paths)
# MA Plot on Genes for each sample pair based on FPKM values --------------
for (i in seq_along(along.with = sample_pairs[1L,])) {
plot_paths <-
file.path(output_directory, paste(
paste(prefix, sample_pairs[1L, i], sample_pairs[2L, i], "genes", "ma", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a MAplot on Genes for ",
sample_pairs[1L, i],
" versus ",
sample_pairs[2L, i])
} else {
message("Creating a MAplot on Genes for ",
sample_pairs[1L, i],
" versus ",
sample_pairs[2L, i])
ggplot_object <-
cummeRbund::MAplot(
object = cummeRbund::genes(object = cuff_set),
x = sample_pairs[1L, i],
y = sample_pairs[2L, i]
)
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
}
# TODO: Create a MAplot on genes for each sample pair based on count data.
# Volcano Matrix Plot on Genes --------------------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "genes", "volcano", "matrix", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Volcano Matrix Plot on Genes")
} else {
message("Creating a Volcano Matrix Plot on Genes")
ggplot_object <-
cummeRbund::csVolcanoMatrix(object = cummeRbund::genes(object = cuff_set))
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
# Volcano Plot on Genes for each sample pair ------------------------------
for (i in seq_along(along.with = sample_pairs[1L,])) {
plot_paths <-
file.path(output_directory,
paste(
paste(
prefix,
sample_pairs[1L, i],
sample_pairs[2L, i],
"genes",
"volcano",
sep = "_"
),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Volcano Plot on Genes for ",
sample_pairs[1L, i],
" versus ",
sample_pairs[2L, i])
} else {
message("Creating a Volcano Plot on Genes for ",
sample_pairs[1L, i],
" versus ",
sample_pairs[2L, i])
# FIXME: The definition of the generic function "csVolcano" does not
# include the "alpha" and "showSignificant" options, although the function
# definition contains them. It does not seem that the option defaults are
# used.
# In R/methods-CuffData.R:
# .volcano <-
# function(object, x, y, alpha = 0.05, showSignificant = TRUE,
# features = FALSE, xlimits = c(-20, 20), ...)
# setMethod("csVolcano", signature(object = "CuffData"), .volcano)
# In R/AllGenerics.R:
# setGeneric("csVolcano",
# function(object, x, y, features = F, ...) standardGeneric("csVolcano"))
ggplot_object <-
cummeRbund::csVolcano(
object = cummeRbund::genes(object = cuff_set),
x = sample_pairs[1L, i],
y = sample_pairs[2L, i],
alpha = 0.05,
showSignificant = TRUE
)
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
}
# Multidimensional Scaling (MDS) Plot on Genes ----------------------------
# Plot only, if the CuffData object contains more than two replicates.
if (replicate_number > 2L) {
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "genes", "mds", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Multidimensional Scaling Plot on Genes")
} else {
# if (have_replicates) {
message("Creating a Multidimensional Scaling Plot on Genes")
# Nothing ever is simple. If the set has too many replicates, the
# standard cummeRbund MDSplot() falls down.
if (replicate_number <= 24L) {
ggplot_object <-
cummeRbund::MDSplot(object = cummeRbund::genes(object = cuff_set),
replicates = TRUE)
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
} else {
# The standard MDSplot has too many replicates.
gene_rep_fit <-
stats::cmdscale(
d = cummeRbund::JSdist(mat = cummeRbund::makeprobs(
a = cummeRbund::repFpkmMatrix(object = cummeRbund::genes(object = cuff_set))
)),
eig = TRUE,
k = 2
)
gene_rep_res <-
data.frame(
"names" = base::rownames(x = gene_rep_fit$points),
"M1" = gene_rep_fit$points[, 1L],
"M2" = gene_rep_fit$points[, 2L],
stringsAsFactors = FALSE
)
ggplot_object <- ggplot2::ggplot(data = gene_rep_res)
ggplot_object <-
ggplot_object + ggplot2::theme_bw() # Add theme black and white.
ggplot_object <-
ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(
x = .data$M1,
y = .data$M2,
colour = .data$names
)) # Draw points in any case.
if (replicate_number <= 24L) {
# Only add text for a sensible number of replicates i.e. less than or
# equal to 24.
ggplot_object <-
ggplot_object + ggplot2::geom_text(
mapping = ggplot2::aes(
x = .data$M1,
y = .data$M2,
label = .data$names,
colour = .data$names
),
size = 4
)
}
# Arrange a maximum of 24 replicates in each guide column.
ggplot_object <-
ggplot_object + ggplot2::guides(colour = ggplot2::guide_legend(nrow = 24))
# Use the base plot_witdh and add a quarter of the width for each
# additional guide legend column.
plot_width <-
argument_list$plot_width + (ceiling(x = replicate_number / 24L) - 1L) * argument_list$plot_width * 0.25
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path,
plot_width,
ggplot_object,
gene_rep_res,
gene_rep_fit)
}
}
rm(plot_paths)
} else {
message("Skipping Multidimensional Scaling Plot on genes in lack of sufficient replicates")
}
# Principal Component Analysis (PCA) Plot on Genes ------------------------
# TODO: Add also other principal components or even better,
# use plots of the PCA package?
plot_paths <-
file.path(output_directory, paste(paste(prefix, "genes", "pca", sep = "_"), graphics_formats, sep = "."))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a Principal Component Analysis Plot (PCA) on Genes")
} else {
message("Creating a Principal Component Analysis Plot (PCA) on Genes")
ggplot_object <-
cummeRbund::PCAplot(
object = cummeRbund::genes(object = cuff_set),
x = "PC1",
y = "PC2",
replicates = TRUE
)
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
# Finishing QC plotting ---------------------------------------------------
message("Finishing QC plotting")
# Starting data table splitting -------------------------------------------
# TODO: Process gene and isoform sets and lists. Feature-level information can
# be accessed directly from a CuffData object using the fpkm, repFpkm, count,
# diffData, or annotation methods.
# Split the large and unwieldy differential data tables into
# pairwise comparisons, but read or assemble gene and isoform
# annotation first.
# Gene and Isoform Annotation ---------------------------------------------
genes_annotation_frame <- NULL
isoforms_annotation_frame <- NULL
frame_path_genes <-
file.path(output_directory, paste0(prefix, "_genes_annotation.tsv"))
frame_path_isoforms <-
file.path(output_directory,
paste0(prefix, "_isoforms_annotation.tsv"))
if (file.exists(frame_path_genes) &&
file.info(frame_path_genes)$size > 0L) {
message("Reading gene annotation from file")
genes_annotation_frame <-
utils::read.table(file = frame_path_genes,
header = TRUE,
sep = "\t")
isoforms_annotation_frame <-
utils::read.table(file = frame_path_isoforms,
header = TRUE,
sep = "\t")
} else {
# Unfortunately, cummeRbund does not seem to offer a convenient way to
# correlate Cufflinks (XLOC) gene identifiers with the original
# Ensembl (ENSG) gene identifiers.
# Thus, the following steps are neccessary:
#
# 1. Read the reference GTF to correlate Ensembl (ENSG) gene and
# (ENST) transcript identifiers, as well as official gene symbols.
# Read only features of type "transcript".
message("Reading reference transcriptome annotation")
reference_granges <- rtracklayer::import(
con = argument_list$gtf_reference,
format = "gtf",
genome = argument_list$genome_version,
feature.type = "transcript"
)
# Selecting via S4Vectors::mcols()[] returns a S4Vectors::DataFrame object.
reference_frame <- base::unique.data.frame(
x = data.frame(
"ensembl_gene_id" = S4Vectors::mcols(x = reference_granges)$gene_id,
"ensembl_transcript_id" = S4Vectors::mcols(x = reference_granges)$transcript_id,
stringsAsFactors = TRUE
)
)
rm(reference_granges)
# 2. Read the assembly GTF, which specifies Cufflinks (XLOC) gene and
# Cufflinks (TCONS) transcript identifiers, but does no longer provide
# information about Ensembl (ENSG) gene identifiers. Reference transcriptome
# loci may have been merged or split by Cuffmerge.
# The Cuffmerge GTF file has only features of type "exon".
message("Reading assembled transcriptome annotation")
assembly_granges <- rtracklayer::import(
con = argument_list$gtf_assembly,
format = "gtf",
genome = argument_list$genome_version,
feature.type = "exon"
)
if ("nearest_ref" %in% S4Vectors::colnames(x = S4Vectors::mcols(x = assembly_granges))) {
# If a "nearest_ref" variable is defined, the GTF is a Cuffmerge assembly.
#
# Example: gene_id "XLOC_000001"; transcript_id "TCONS_00000001"; exon_number "1";
# gene_name "DDX11L1"; oId "CUFF.1.2"; nearest_ref "ENST00000450305";
# class_code "="; tss_id "TSS1";
#
# Selecting via S4Vectors::mcols()[] returns a S4Vectors::DataFrame object.
assembly_frame <- base::unique.data.frame(
x = data.frame(
"gene_id" = S4Vectors::mcols(x = assembly_granges)$gene_id,
"transcript_id" = S4Vectors::mcols(x = assembly_granges)$transcript_id,
"gene_name" = S4Vectors::mcols(x = assembly_granges)$gene_name,
"ensembl_transcript_id" = S4Vectors::mcols(x = assembly_granges)$nearest_ref,
stringsAsFactors = TRUE
)
)
# 3. Join the reference and assembly data frames to correlate
# Cufflinks (XLOC) gene identifiers with Ensembl (ENSG) gene identifiers
# via Ensembl (ENST) transcript identifiers.
message("Merging reference and assembled transcriptome annotation")
ensembl_frame <-
base::merge.data.frame(
x = assembly_frame,
y = reference_frame,
by = "ensembl_transcript_id",
# Keep all XLOC entries.
all.x = TRUE
)
rm(reference_frame, assembly_frame)
# 4. Create a new, normalised Ensembl annotation data frame that correlates
# each Cufflinks (XLOC) gene identifier with comma-separated lists of one or
# more Ensembl (ENSG) gene and Ensembl (ENST) transcript identifiers.
message("Aggregating Ensembl annotation by gene_id")
aggregate_frame <-
base::aggregate.data.frame(
x = ensembl_frame,
by = list("gene_id" = ensembl_frame$gene_id),
FUN = paste
)
rm(ensembl_frame)
# The aggregate frame consists of list objects of character vectors
# with one or more elements aggregated in each group.
# For each character vector, the character_to_csv() function finds unique
# elements and sorts them, before collapsing them into a single,
# comma-separated value.
message("Collapsing aggregated Ensembl annotation")
ensembl_annotation_frame <-
data.frame(
"gene_id" = unlist(x = lapply(
X = aggregate_frame$gene_id, FUN = character_to_csv
)),
"transcript_ids" = unlist(
x = lapply(X = aggregate_frame$transcript_id, FUN = character_to_csv)
),
# "gene_names" = unlist(x = lapply(X = aggregate_frame$gene_name, FUN = character_to_csv)),
"ensembl_gene_ids" = unlist(
x = lapply(X = aggregate_frame$ensembl_gene_id, FUN = character_to_csv)
),
"ensembl_transcript_ids" = unlist(
x = lapply(X = aggregate_frame$ensembl_transcript_id, FUN = character_to_csv)
),
stringsAsFactors = FALSE
)
rm(aggregate_frame)
} else {
# If a "nearest_ref" variable is missing,
# the GTF is the original reference without de-novo transcript assembly.
#
# Example: gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2";
# exon_number "1"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene";
# havana_gene "OTTHUMG00000000961"; havana_gene_version "2";
# transcript_name "DDX11L1-002"; transcript_source "havana"; transcript_biotype "processed_transcript";
# havana_transcript "OTTHUMT00000362751"; havana_transcript_version "1";
# exon_id "ENSE00002234944"; exon_version "1";
# tag "basic"; transcript_support_level "1";
#
# 3. Joining reference and assembly frames is not required.
# 4. Create a new, normalised Ensembl annotation data frame.
ensembl_annotation_frame <- data.frame(
"gene_id" = reference_frame$ensembl_gene_id,
"transcript_ids" = reference_frame$ensembl_transcript_id,
"ensembl_gene_ids" = reference_frame$ensembl_gene_id,
"ensembl_transcript_ids" = reference_frame$ensembl_transcript_id,
stringsAsFactors = FALSE
)
rm(reference_frame)
}
rm(assembly_granges)
# 5. Create a gene annotation frame, ready for enriching
# cummeRbund gene information, by merging the "gene_id", "gene_short_name"
# and "locus" variables of the gene annotation frame with the "transcript_ids",
# "ensembl_gene_ids" and "ensembl_transcript_ids" of the
# Ensembl annotation frame.
message("Creating gene annotation information")
cufflinks_annotation_frame <-
cummeRbund::annotation(object = cummeRbund::genes(object = cuff_set))
genes_annotation_frame <- base::merge.data.frame(
# Merge with a simplified cummeRbund gene annotation data frame, since
# variables "class_code", "nearest_ref_id", "length" and "coverage" seem
# empty by design. Remove hidden exon information by finding unique
# rows only.
x = base::unique.data.frame(x = cufflinks_annotation_frame[, c("gene_id", "gene_short_name", "locus"), drop = FALSE]),
# Merge with the Ensembl annotation data frame that correlates gene (XLOC)
# identifiers with comma-separated lists of Ensembl Gene and Transcript
# identifiers.
y = ensembl_annotation_frame,
# Merge on gene (XLOC) identifiers, but also keep all rows of the
# cufflinks_annotation_frame, or else, locus information would be lost.
by = "gene_id",
all.x = TRUE
)
rm(ensembl_annotation_frame, cufflinks_annotation_frame)
# 6. Create an isoform annotation data frame, ready for enriching
# cummeRbund isoform information.
# Simplify the cummeRbund isoform annotation data frame,
# since variables "CDS_id" and "coverage" seem empty by design.
# Remove hidden exon information by finding unique rows only.
message("Creating isoform annotation information")
cufflinks_annotation_frame <-
cummeRbund::annotation(object = cummeRbund::isoforms(object = cuff_set))
isoforms_annotation_frame <-
base::unique.data.frame(x = cufflinks_annotation_frame[, c(
"isoform_id",
"gene_id",
"gene_short_name",
"TSS_group_id",
"class_code",
"nearest_ref_id",
"locus",
"length"
), drop = FALSE])
rm(cufflinks_annotation_frame)
utils::write.table(
x = genes_annotation_frame,
file = frame_path_genes,
sep = "\t",
row.names = FALSE
)
utils::write.table(
x = isoforms_annotation_frame,
file = frame_path_isoforms,
sep = "\t",
row.names = FALSE
)
}
rm(frame_path_genes, frame_path_isoforms)
# Update the cummeRbund SQLite database -----------------------------------
# Push the aggregated Ensembl gene annotation back into the SQLite database.
if ("ensembl_gene_ids" %in% base::names(x = cummeRbund::annotation(object = cummeRbund::genes(object = cuff_set)))) {
message("Skipping Ensembl annotation for the SQLite database")
} else {
message("Creating Ensembl annotation for the SQLite database")
cummeRbund::addFeatures(
object = cuff_set,
features = data.frame(
"gene_id" = genes_annotation_frame$gene_id,
"ensembl_gene_ids" = genes_annotation_frame$ensembl_gene_ids,
"ensembl_transcript_ids" = genes_annotation_frame$ensembl_transcript_ids,
stringsAsFactors = FALSE
),
level = "genes"
)
}
# Differential Genes per sample pair --------------------------------------
# Create an annotated differential genes data frame for each sample pair.
for (i in seq_along(along.with = sample_pairs[1L,])) {
frame_path <-
file.path(
output_directory,
paste(prefix, sample_pairs[1L, i], sample_pairs[2L, i], "genes_diff.tsv", sep = "_")
)
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message(
"Skipping a differential data frame on Genes for ",
sample_pairs[1L, i],
" versus ",
sample_pairs[2L, i]
)
} else {
message(
"Creating a differential data frame on Genes for ",
sample_pairs[1L, i],
" versus ",
sample_pairs[2L, i]
)
# The cummeRbund::diffData() function allows automatic merging with feature
# annotation, but that includes some empty columns. For cleaner result
# tables, merge with the smaller genes_annotation_frame established above.
#
# Retrieve a data frame with differential expression data for genes, but
# remove the first "gene_id" column, which is duplicated in the second
# column as a consequence of a SQL table join between the "genes" and
# "geneExpDiffData" tables.
genes_diff_data_frame <-
cummeRbund::diffData(
object = cummeRbund::genes(object = cuff_set),
x = sample_pairs[1L, i],
y = sample_pairs[2L, i],
features = FALSE
)[,-1L]
# Calculate ranks for the effect size (log2_fold_change), absolute level
# and statistical significance (q_value).
genes_diff_data_frame$rank_log2_fold_change <-
rank(
x = -abs(x = genes_diff_data_frame$log2_fold_change),
ties.method = c("min")
)
genes_diff_data_frame$rank_value <-
rank(
x = -abs(x = genes_diff_data_frame$value_2 - genes_diff_data_frame$value_1),
ties.method = c("min")
)
genes_diff_data_frame$rank_q_value <-
rank(x = genes_diff_data_frame$q_value, ties.method = c("min"))
# Calculate the rank sum for the three ranks.
# genes_diff_data_frame$rank_sum <-
# genes_diff_data_frame$rank_log2_fold_change + genes_diff_data_frame$rank_value +
# genes_diff_data_frame$rank_q_value
# Calculate the maximum rank of value and q-value, as
# rank_log2_fold_change is dominated by +/- infinity resulting from genes
# measured only once.
genes_diff_data_frame$max_rank <-
pmax(genes_diff_data_frame$rank_value,
genes_diff_data_frame$rank_q_value)
utils::write.table(
x = base::merge.data.frame(
x = genes_annotation_frame,
y = genes_diff_data_frame,
by = "gene_id",
all = TRUE,
sort = TRUE
),
file = frame_path,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
rm(genes_diff_data_frame)
}
rm(frame_path)
}
# Differential Isoforms per sample pair -----------------------------------
# Create an annotated differential isoforms data frame for each sample pair.
for (i in seq_along(along.with = sample_pairs[1L,])) {
frame_path <-
file.path(
output_directory,
paste(
prefix,
sample_pairs[1L, i],
sample_pairs[2L, i],
"isoforms_diff.tsv",
sep = "_"
)
)
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message(
"Skipping a differential data frame on Isoforms for ",
sample_pairs[1L, i],
" versus ",
sample_pairs[2L, i]
)
} else {
message(
"Creating a differential data frame on Isoforms for ",
sample_pairs[1L, i],
" versus ",
sample_pairs[2L, i]
)
# The cummeRbund::diffData() function allows automatic merging with feature
# annotation, but that includes some empty columns. For cleaner result
# tables, merge with the smaller isoforms_annotation_frame established above.
#
# Retrieve a data frame with differential expression data for genes, but
# remove the first "isoform_id" column, which is duplicated in the second
# column as a consequence of a SQL table join between the "isoforms" and
# "isoformExpDiffData" tables.
isoforms_diff_data_frame <-
cummeRbund::diffData(
object = cummeRbund::isoforms(object = cuff_set),
x = sample_pairs[1L, i],
y = sample_pairs[2L, i],
features = FALSE
)[,-1L]
# Calculate ranks for the effect size (log2_fold_change), absolute level
# and statistical significance (q_value).
isoforms_diff_data_frame$rank_log2_fold_change <-
rank(
x = -abs(x = isoforms_diff_data_frame$log2_fold_change),
ties.method = c("min")
)
isoforms_diff_data_frame$rank_value <-
rank(
x = -abs(x = isoforms_diff_data_frame$value_2 - isoforms_diff_data_frame$value_1),
ties.method = c("min")
)
isoforms_diff_data_frame$rank_q_value <-
rank(x = isoforms_diff_data_frame$q_value, ties.method = c("min"))
# Calculate the rank sum for the three ranks.
# isoforms_diff_data_frame$rank_sum <-
# isoforms_diff_data_frame$rank_log2_fold_change + isoforms_diff_data_frame$rank_value +
# isoforms_diff_data_frame$rank_q_value
# Calculate the maximum rank of value and q-value, as rank_log2_fold_change
# is dominated by +/- infinity resulting from genes measured only once.
isoforms_diff_data_frame$max_rank <-
pmax(isoforms_diff_data_frame$rank_value,
isoforms_diff_data_frame$rank_q_value)
utils::write.table(
x = base::merge.data.frame(
x = isoforms_annotation_frame,
y = isoforms_diff_data_frame,
by = "isoform_id",
all = TRUE,
sort = TRUE
),
file = frame_path,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
rm(isoforms_diff_data_frame)
}
rm(frame_path)
}
# Aggregate status information --------------------------------------------
# Aggregate the "status" variable of differential data frames to inform about
# the number of NOTEST, HIDATA, LOWDATA and FAIL states.
# It is silly to redo this outside of the loops splitting the results, but if
# files above were partially written, the status frame would not be complete.
# Status frame to aggregate the test status columns in pairwise comparisons of genes.
frame_path <-
file.path(output_directory,
paste0(prefix, "_genes_status.tsv"))
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message("Skipping aggregated status information on Genes")
} else {
message("Creating aggregated status information on Genes")
status_frame <-
data.frame(
# The sample_pairs character matrix contains pairs in columns,
# so that the apply function needs to process by columns (i.e. MARGIN = 2L).
"PAIR" = apply(
X = sample_pairs,
MARGIN = 2L,
FUN = function(x) {
paste(x, collapse = "__")
}
),
"OK" = integer(length = length(x = sample_pairs[1L,])),
"NOTEST" = integer(length = length(x = sample_pairs[1L,])),
"HIDATA" = integer(length = length(x = sample_pairs[1L,])),
"LOWDATA" = integer(length = length(x = sample_pairs[1L,])),
"FAIL" = integer(length = length(x = sample_pairs[1L,])),
"SUM" = integer(length = length(x = sample_pairs[1L,])),
row.names = apply(
X = sample_pairs,
MARGIN = 2L,
FUN = function(x) {
paste(x, collapse = "__")
}
)
)
for (i in seq_along(along.with = sample_pairs[1L,])) {
genes_diff_data_frame <-
cummeRbund::diffData(
object = cummeRbund::genes(object = cuff_set),
x = sample_pairs[1L, i],
y = sample_pairs[2L, i],
features = FALSE
)[,-1L]
# Aggregate the test Status column.
status_integer <- table(genes_diff_data_frame$status)
for (status in base::names(x = status_frame)) {
if (!is.na(x = status_integer[status])) {
status_frame[i, status] <- status_integer[status]
}
}
rm(status, status_integer, genes_diff_data_frame)
status_frame$SUM[i] <- sum(status_frame[i, 2L:6L])
}
utils::write.table(
x = status_frame,
file = frame_path,
row.names = FALSE,
sep = "\t"
)
rm(status_frame)
}
rm(frame_path)
# Status frame to aggregate the test status columns in pairwise comparisons of isoforms.
frame_path <-
file.path(output_directory,
paste0(prefix, "_isoforms_status.tsv"))
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message("Skipping aggregated status information on Isoforms")
} else {
message("Creating aggregated status information on Isoforms")
status_frame <-
data.frame(
# The sample_pairs character matrix contains pairs in columns,
# so that the apply function needs to process by columns (i.e. MARGIN = 2L).
"PAIR" = apply(
X = sample_pairs,
MARGIN = 2L,
FUN = function(x) {
paste(x, collapse = "__")
}
),
"OK" = integer(length = length(x = sample_pairs[1L,])),
"NOTEST" = integer(length = length(x = sample_pairs[1L,])),
"HIDATA" = integer(length = length(x = sample_pairs[1L,])),
"LOWDATA" = integer(length = length(x = sample_pairs[1L,])),
"FAIL" = integer(length = length(x = sample_pairs[1L,])),
"SUM" = integer(length = length(x = sample_pairs[1L,])),
row.names = apply(
X = sample_pairs,
MARGIN = 2L,
FUN = function(x) {
paste(x, collapse = "__")
}
)
)
for (i in seq_along(along.with = sample_pairs[1L,])) {
isoforms_diff_data_frame <-
cummeRbund::diffData(
object = cummeRbund::isoforms(object = cuff_set),
x = sample_pairs[1L, i],
y = sample_pairs[2L, i],
features = FALSE
)[,-1L]
# Aggregate the test Status column.
status_integer <- table(isoforms_diff_data_frame$status)
for (status in base::names(x = status_frame)) {
if (!is.na(x = status_integer[status])) {
status_frame[i, status] <- status_integer[status]
}
}
rm(status, status_integer, isoforms_diff_data_frame)
status_frame$SUM[i] <- sum(status_frame[i, 2L:6L])
}
utils::write.table(
x = status_frame,
file = frame_path,
row.names = FALSE,
sep = "\t"
)
rm(status_frame)
}
rm(frame_path)
# Matrix of FPKM values per replicate on Genes ----------------------------
frame_path <-
file.path(output_directory,
paste0(prefix, "_genes_fpkm_replicates.tsv"))
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message("Skipping a matrix of FPKM values per replicates on genes")
} else {
message("Creating a matrix of FPKM values per replicates on genes")
genes_fpkm_frame <-
cummeRbund::repFpkmMatrix(object = cummeRbund::genes(object = cuff_set))
utils::write.table(
x = base::merge.data.frame(
x = genes_annotation_frame,
y = genes_fpkm_frame,
by.x = "gene_id",
by.y = "row.names",
all = TRUE,
sort = TRUE
),
file = frame_path,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
rm(genes_fpkm_frame)
}
rm(frame_path)
# Matrix of count values per replicate on genes.
frame_path <-
file.path(output_directory,
paste0(prefix, "_genes_counts_replicates.tsv"))
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message("Skipping a matrix of count values per replicates on genes")
} else {
message("Creating a matrix of count values per replicates on genes")
genes_count_frame <-
cummeRbund::repCountMatrix(object = cummeRbund::genes(object = cuff_set))
utils::write.table(
x = base::merge.data.frame(
x = genes_annotation_frame,
y = genes_count_frame,
by.x = "gene_id",
by.y = "row.names",
all = TRUE,
sort = TRUE
),
file = frame_path,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
rm(genes_count_frame)
}
rm(frame_path)
# Matrix of FPKM values per replicate on Isoforms -------------------------
frame_path <-
file.path(output_directory,
paste0(prefix, "_isoforms_fpkm_replicates.tsv"))
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message("Skipping a matrix of FPKM per replicates on isoforms")
} else {
message("Creating a matrix of FPKM per replicates on isoforms")
isoforms_fpkm_frame <-
cummeRbund::repFpkmMatrix(object = cummeRbund::isoforms(object = cuff_set))
utils::write.table(
x = base::merge.data.frame(
x = isoforms_annotation_frame,
y = isoforms_fpkm_frame,
by.x = "isoform_id",
by.y = "row.names",
all = TRUE,
sort = TRUE
),
file = frame_path,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
rm(isoforms_fpkm_frame)
}
rm(frame_path)
# Matrix of count values per replicate on Isoforms ------------------------
frame_path <-
file.path(output_directory,
paste0(prefix, "_isoforms_counts_replicates.tsv"))
if (file.exists(frame_path) && file.info(frame_path)$size > 0L) {
message("Skipping a matrix of count values per replicates on isoforms")
} else {
message("Creating a matrix of count values per replicates on isoforms")
isoforms_count_frame <-
cummeRbund::repCountMatrix(object = cummeRbund::isoforms(object = cuff_set))
utils::write.table(
x = base::merge.data.frame(
x = isoforms_annotation_frame,
y = isoforms_count_frame,
by.x = "isoform_id",
by.y = "row.names",
all = TRUE,
sort = TRUE
),
file = frame_path,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
rm(isoforms_count_frame)
}
rm(frame_path)
# Significance Matrix on Genes --------------------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "genes", "significance", "matrix", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a significance matrix plot on Genes")
} else {
message("Creating a significance matrix plot on Genes")
ggplot_object <- sigMatrix(object = cuff_set, level = "genes")
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
# Significance Matrix on Isoforms -----------------------------------------
plot_paths <-
file.path(output_directory, paste(
paste(prefix, "isoforms", "significance", "matrix", sep = "_"),
graphics_formats,
sep = "."
))
if (all(file.exists(plot_paths)) &&
all(file.info(plot_paths)$size > 0L)) {
message("Skipping a significance matrix plot on Isoforms")
} else {
message("Creating a significance matrix plot on Isoforms")
ggplot_object <- sigMatrix(object = cuff_set, level = "isoforms")
for (plot_path in plot_paths) {
ggplot2::ggsave(
filename = plot_path,
plot = ggplot_object,
width = argument_list$plot_width,
height = argument_list$plot_height,
limitsize = FALSE
)
}
rm(plot_path, ggplot_object)
}
rm(plot_paths)
rm(genes_annotation_frame, isoforms_annotation_frame)
# Get a CuffFeatureSet or CuffGeneSet of significant genes.
# TODO: Comment-out for the moment, as the data is currently not used.
# significant_gene_ids <-
# cummeRbund::getSig(object = cuff_set, level = "genes")
# significant_genes <-
# cummeRbund::getGenes(object = cuff_set, geneIdList = significant_gene_ids)
# The csHeatmap plot does not seem to be a sensible option for larger sets
# of significant genes.
# for (i in seq_along(along.with = sample_pairs[1L,])) {
# significant_genes_diff <-
# }
# Starting symbolic linking -----------------------------------------------
# Finally, create comparison-specific relative symbolic links for cuffdiff
# results in the rnaseq_process_cuffdiff_* directory to avoid identical file
# names between comparisons.
message("Starting symbolic linking to cuffdiff results")
link_path <-
file.path(output_directory, paste0(prefix, "_cds_exp_diff.tsv"))
if (!file.exists(link_path)) {
if (!file.symlink(from = file.path("..", cuffdiff_directory, "cds_exp.diff"),
to = link_path)) {
warning("Encountered an error linking the cds_exp.diff file.")
}
}
rm(link_path)
link_path <-
file.path(output_directory, paste0(prefix, "_genes_exp_diff.tsv"))
if (!file.exists(link_path)) {
if (!file.symlink(from = file.path("..", cuffdiff_directory, "gene_exp.diff"),
to = link_path)) {
warning("Encountered an error linking the gene_exp.diff file.")
}
}
rm(link_path)
link_path <-
file.path(output_directory, paste0(prefix, "_isoforms_exp_diff.tsv"))
if (!file.exists(link_path)) {
if (!file.symlink(from = file.path("..", cuffdiff_directory, "isoform_exp.diff"),
to = link_path)) {
warning("Encountered an error linking the isoform_exp.diff file.")
}
}
rm(link_path)
link_path <-
file.path(output_directory, paste0(prefix, "_promoters_diff.tsv"))
if (!file.exists(link_path)) {
if (!file.symlink(from = file.path("..", cuffdiff_directory, "promoters.diff"),
to = link_path)) {
warning("Encountered an error linking the promoters.diff file.")
}
}
rm(link_path)
link_path <-
file.path(output_directory, paste0(prefix, "_splicing_diff.tsv"))
if (!file.exists(link_path)) {
if (!file.symlink(from = file.path("..", cuffdiff_directory, "splicing.diff"),
to = link_path)) {
warning("Encountered an error linking the splicing.diff file.")
}
}
rm(link_path)
link_path <-
file.path(output_directory, paste0(prefix, "_tss_group_exp_diff.tsv"))
if (!file.exists(link_path)) {
if (!file.symlink(from = file.path("..", cuffdiff_directory, "tss_group_exp.diff"),
to = link_path)) {
warning("Encountered an error linking the tss_group_exp.diff file.")
}
}
rm(link_path)
# Finishing symbolic linking ----------------------------------------------
message("Finishing symbolic linking to cuffdiff results")
rm(
cuff_set,
output_directory,
cuffdiff_directory,
sample_pairs,
sample_number,
replicate_number,
have_replicates,
prefix,
i,
graphics_formats,
argument_list,
character_to_csv
)
message("All done")
# Finally, print all objects that have not been removed from the environment.
if (length(x = ls())) {
print(x = ls())
}
print(x = sessioninfo::session_info())
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.