#' HCL Colors Wrapper
#'
#' Generates a set number of colors, for use by plot functions such as
#' jellyfish_plot which can re-use colors for different subsets of the data.
#'
#' @param n Number of colors to generate
#' @return n colors, generated by hcl, from R base's grDevices package.
#' @references
#' [1] R Core Team (2017). R: A language and environment for statistical
#' computing. R Foundation for Statistical Computing, Vienna, Austria.
#' URL https://www.R-project.org/.
#' # TODO: citations for other packages involved elsewhere
get_colors <- function(n) {
grDevices::hcl(h = (seq(15, 375, length = n)), c = 100, l = 65)
}
#' Jellyfish Plot
#'
#' Generates a plot showing the log cumulative isoform abundance
#' distribution for one or more genes. Normalized read count is along the
#' x-axis, and log cumulative isoform count is along the y-axis. Each gene is
#' represented by one line, allowing for comparison between the skewness of
#' relative abundance of isoforms between multiple genes.
#'
#' @param database A compiled Database object.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param dot_size Size of points to plot. Default is to scale this value by
#' the number of genes being plotted, but can be manually set for appearance.
#' @param stabilize_colors Logical. Set to TRUE if you would like to plot a
#' subset of genes, and have the colors match the colors used when all genes
#' are plotted. Default is FALSE, and colors are generated according to the
#' number of genes currently being plotted.
#' @param use_ORFs Logical. Set to TRUE to use abundances from OrfDB instead
#' of abundances from TranscriptDB. Note that OrfDB collapses isoforms with
#' non-unique transcripts, so abundances may differ significantly.
#' @param insert_title String to customize the title of the plot.
#' @return Jellyfish plot (ggplot object).
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1"), Name = c("Gene1"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' jellyfish <- jellyfish_plot(DB)
#' }
#' @seealso Database
#' @import ggplot2
#' @export
jellyfish_plot <- function(database,
genes_to_include = unique(database$GeneDB$Name),
dot_size = -1, stabilize_colors = F, use_ORFs = F,
insert_title = NULL) {
if (!requireNamespace("scales", quietly = T)) {
stop("Error: missing the scales package.")
return(NULL)
}
if (use_ORFs) {
db <- database$OrfDB
if (is.null(insert_title)) insert_title <- "ORF Jellyfish Plot"
} else {
db <- database$TranscriptDB
if (is.null(insert_title)) insert_title <- "Isoform Jellyfish Plot"
}
# recommended dot_size is 1.5 for a few genes, default 0.75 for many genes
if (dot_size == -1) {
dot_size <- ifelse(length(genes_to_include) >= 6, 0.75, 1.5)
}
genes_sorted <- sort(unique(db$Gene))
all_colors <- get_colors(length(genes_sorted))
# for overriding R's color recalculation by # of genes
included_colors <- all_colors[sapply(genes_to_include,
function(x) {which(genes_sorted == x)})]
# reformat as lists, add 0 (one isoform exists until 0% cutoff)
data_reformat <- lapply(genes_to_include, function(name) {
c(0,sort(db$CumPercAbundance[db$Gene == name]))
})
data_reformat <- lapply(data_reformat, function(x) {
cbind(Counts = seq_along(x), Percents = x)
})
# get number of isoforms for each gene
rowcounts <- sapply(data_reformat, nrow)
data_reformat <- as.data.frame(do.call("rbind", data_reformat))
# add column for gene names in order to group genes in plot
data_reformat$Genes <- rep(genes_to_include, rowcounts)
yaxis <- c(2)
while (yaxis[length(yaxis)] < max(rowcounts)) {
yaxis <- c(yaxis, yaxis[length(yaxis)] * 4)
}
jellyfish <- ggplot(data_reformat,
aes_string(x = "Percents", y = "Counts",
colour = "Genes")) +
scale_y_continuous(trans = scales::log2_trans(),
breaks = yaxis + 1,
labels = yaxis,
limits = c(1,max(yaxis)),
expand = c(0, 0)) +
geom_line(size = 0.4) + geom_point(size = dot_size) +
labs(title = insert_title) +
xlab("Percent of Gene's Total Reads") +
ylab("Log Isoform Count") + theme_classic()
if (stabilize_colors) {
jellyfish <- jellyfish + scale_color_manual(values = included_colors)
}
return(jellyfish)
}
#' Exon-Abundance Distribution Plots
#'
#' Generates a bar plot showing the abundances of normalized exon
#' counts for one or more genes. If each transcript is represented as the
#' fraction of exons it contains out of the maximum number of exons found in a
#' gene, this plot is merely a histogram of those representations, weighted by
#' the read count for each transcript. Normalized exon percent is along the
#' x-axis, and abundance is along the y-axis. If only one gene name is given, a
#' second plot is generated where the x-axis is not normalized, instead showing
#' the exon count of each transcript individually, and the y-axis is
#' log-transformed.
#'
#' @param database A compiled Database object.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param sum_dist Logical. If TRUE, the result is a histogram-like bar plot,
#' where the x-axis is binned. Otherwise, individual isoforms are plotted
#' as points, and the y-axis is log-transformed (single gene only).
#' @param bin_width The histogram bin width, used only when multiple genes
#' are input and the x-axis is the fraction of total exons per gene.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1", "PB.2"),
#' Name = c("Gene1", "Gene2"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' # one plot, x-axis normalized
#' plot_exon_dist(DB)
#' # two plots, x-axis not normalized
#' plot_exon_dist(DB, genes_to_include = c("Gene1"))
#' }
#' @seealso Database
#' @export
plot_exon_dist <- function(database,
genes_to_include = unique(database$GeneDB$Name),
sum_dist = T, bin_width = 0.02,
insert_title = "Exon-Abundance Distribution") {
transcriptDB <- database$TranscriptDB
if (length(genes_to_include) == 1) {
# plot just one gene's exon distribution
gene <- genes_to_include[1]
db <- transcriptDB[transcriptDB$Gene == gene, ]
if (insert_title == "Exon-Abundance Distribution") { # default
insert_title <- paste(insert_title, gene, sep = ": ")
}
if (sum_dist) {
gene_max_exons <- max(database$GeneDB$Exons[database$GeneDB$Name == gene])
exon_abundances <- sapply(1:gene_max_exons, function(i) {
sum(db$FL_reads[db$ExonCount == i])
})
df <- data.frame(Exons = 1:gene_max_exons, Reads = exon_abundances)
ggplot(df, aes_string(x = "Exons", y = "Reads")) +
geom_bar(stat = "identity", fill = "blue", color = "blue4") +
scale_y_continuous(expand = c(0, 0)) + theme_classic() +
labs(title = insert_title)
} else {
df <- data.frame(xaxis = db$ExonCount, yaxis = db$FL_reads)
ggplot(df, aes_string(x = "xaxis", y = "yaxis")) +
geom_jitter(width = 0.15, size = 3, col = grDevices::rgb(0, 0, 1, 0.4)) +
scale_y_continuous(trans = "log2") +
xlab("Exons") + ylab("Log Reads per Transcript") +
labs(title = insert_title) + theme_classic()
}
} else {
db <- transcriptDB[transcriptDB$Gene %in% genes_to_include, ]
norm_exon_counts <- c()
for (i in 1:nrow(db)) {
gene_max <- database$GeneDB$Exons[database$GeneDB$Name == db$Gene[i]]
norm_count <- db$ExonCount[i] / gene_max
norm_exon_counts <- c(norm_exon_counts, rep(norm_count, db$FL_reads[i]))
}
ggplot(data.frame(Counts = norm_exon_counts), aes_string(x = "Counts")) +
geom_histogram(binwidth = bin_width, fill = "blue", color = "blue4") +
scale_y_continuous(expand = c(0, 0)) + theme_classic() +
xlab("Fraction of Gene's Total Exons") +
ylab("Reads") +
labs(title = insert_title)
}
}
#' Unique Isoforms Barplot
#'
#' Generates a bar plot showing the number of unique isoform
#' transcripts and unique ORFs for each gene.
#'
#' @param database A compiled Database object.
#' @param use_log Logical. If true, y-axis is plotted on a log scale (base 2).
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param use_counts One of both of the strings "Isoforms" and "ORFs",
#' indicating which whould be included in the plot. Default is both and to
#' give a warning if no ORF information is in the database.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1"), Name = c("Gene1"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' plot_isoform_orf_counts(DB)
#' }
#' @seealso Database
#' @export
plot_counts <- function(database, use_log = F,
genes_to_include = unique(database$GeneDB$Name),
use_counts = c("Isoforms", "ORFs"),
insert_title = NULL) {
# TODO: standardize filtering by gene
df <- database$GeneDB[database$GeneDB$Name %in% genes_to_include, ]
if ("Isoforms" %in% use_counts) {
if ("ORFs" %in% use_counts & "UniqueORFs" %in% colnames(df)) {
df <- df[order(df$UniqueORFs),]
df <- df[, c("Name", "Isoforms", "UniqueORFs")]
cols <- c("blue4", "cornflowerblue")
if (is.null(insert_title)) {
insert_title <- "Unique Isoforms & ORFs Per Gene"
ylabel <- "Counts"
}
} else {
if ("ORFs" %in% use_counts) {
warning("The Database is missing ORF counts,
so none will be plotted.")
}
df <- df[order(df$Isoforms),]
df <- df[, c("Name", "Isoforms")]
cols <- c("blue4")
insert_title <- "Unique Isoforms Per Gene"
ylabel <- "Isoforms"
}
} else {
if ("ORFs" %in% use_counts & "UniqueORFs" %in% colnames(df)) {
df <- df[order(df$UniqueORFs),]
df <- df[, c("Name", "UniqueORFs")]
cols <- c("cornflowerblue")
insert_title <- "Unique ORFs Per Gene"
ylabel <- "ORFs"
} else {
stop("Ensure that the use_counts argument is correctly set,
and that if ORF coutns are included, the database has an OrfDB.")
return(NULL)
}
}
melted <- reshape::melt(df, id.vars = 1)
colnames(melted) <- c("Gene", "Statistic", "Count")
melted$Statistic <- paste(" ", melted$Statistic) # fix legend spacing
if (use_log) {
upper_lim <- max(melted$Count) * 2
} else {
upper_lim <- max(melted$Count) * 1.1
}
ggplot(melted, aes_string(fill = "Statistic", x = "Gene", y = "Count")) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = insert_title) + xlab("Gene") +
ylab(ifelse(use_log, paste("Log", ylabel, sep = " "), ylabel)) +
scale_fill_manual(values = cols) +
scale_y_continuous(expand = c(0, 0), limits = c(NA, upper_lim),
trans = ifelse(use_log, "log2", "identity")) +
theme(axis.text.x = element_text(angle = 60, hjust = 1),
axis.ticks = element_blank(), legend.title = element_blank(),
legend.position = ifelse("Counts" %in% ylabel, "right", "none"),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.2)) +
geom_text(aes_string(group = "Statistic", label = "Count"), vjust = -0.5,
color = "black", size = 2, position = position_dodge(width = 0.9))
}
#' Gene N50/N75 Barplot
#'
#' Generates a bar plot showing the N50 and N75 statistics for each
#' gene (transcripts or ORFs).
#'
#' @param database A compiled Database object.
#' @param use_ORFs Logical. Set to TRUE to use abundances from OrfDB instead
#' of abundances from TranscriptDB. Note that OrfDB collapses isoforms with
#' non-unique transcripts, so abundances may differ significantly.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1", "PB.2"),
#' Name = c("Gene1", "Gene2"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' plot_N50_N75(DB)
#' }
#' @seealso Database
#' @import ggplot2
#' @export
plot_N50_N75 <- function(database, use_ORFs = F,
genes_to_include = unique(database$GeneDB$Name),
insert_title = NULL) {
if (!(requireNamespace("reshape", quietly = T) |
requireNamespace("reshape2", quietly = T))) {
stop("Error: reshape/reshape2 package required.")
return(NULL)
} else {
if (use_ORFs) {
select_cols <- c("Name", "ORFN50", "ORFN75")
y_axis_label <- "Unique ORFs"
if (is.null(insert_title)) {
insert_title <- "Gene ORF Statistics"
}
} else {
select_cols <- c("Name", "N50", "N75")
y_axis_label <- "Isoforms"
if (is.null(insert_title)) {
insert_title <- "Gene Isoform Statistics"
}
}
select_genes <- sapply(database$GeneDB$Name, function(gene) {
return(gene %in% genes_to_include)
})
if (requireNamespace("reshape", quietly = T)) {
melted <- reshape::melt(database$GeneDB[select_genes, select_cols],
id.vars = 1)
} else {
melted <- reshape2::melt(database$GeneDB[select_genes, select_cols],
id.vars = 1)
}
colnames(melted) <- c("Gene", "Statistic", "Count")
melted$Statistic <- paste(" ", melted$Statistic) # fixes legend spacing
ggplot(melted, aes_string(x = "Gene", y = "Count")) +
geom_bar(aes_string(fill = "Statistic"),
stat = "identity", position = "dodge") +
geom_text(aes_string(fill = "Statistic", label = "Count"), vjust = -0.5,
color = "black", size = 2, position = position_dodge(width = 0.9)) +
xlab("Gene") + ylab(y_axis_label) + labs(title = insert_title) +
scale_y_continuous(expand = c(0, 0)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1),
axis.ticks = element_blank(), legend.title = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.2))
}
}
#' Treemap Plot
#'
#' Generates a treemap plot showing how individual transcripts/ORFs and genes
#'account for abundance within the dataset as a whole.
#'
#' @param database A compiled Database object.
#' @param use_ORFs Logical. Set to TRUE to use abundances from OrfDB instead
#' of abundances from TranscriptDB. Note that OrfDB collapses isoforms with
#' non-unique transcripts, so abundances may differ significantly.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1", "PB.2"),
#' Name = c("Gene1", "Gene2"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' plot_length_dist(DB)
#' }
#' @seealso Database
#' @import ggplot2
#' @export
plot_treemap <- function(database, use_ORFs = F,
genes_to_include = unique(database$GeneDB$Name),
insert_title = NULL) {
if (!(requireNamespace("treemapify", quietly = T))) {
stop("Error: treemapify packages required.")
return(NULL)
}
if (use_ORFs) {
db <- database$OrfDB[, c("PBID", "FL_reads", "Gene")]
db <- db[db$Gene %in% genes_to_include, ]
db$PBID <- as.character(seq_len(nrow(db)))
if (is.null(insert_title)) insert_title <- "ORF Distributions"
} else {
db <- database$TranscriptDB[, c("PBID", "FL_reads", "Gene")]
db <- db[db$Gene %in% genes_to_include, ]
if (is.null(insert_title)) insert_title <- "Isoform Distributions"
}
ggplot(db, aes_string(area = "FL_reads", group = "Gene",
fill = "Gene", subgroup = "Gene",
label = "PBID", subgroup2 = "PBID")) +
treemapify::geom_treemap() +
treemapify::geom_treemap_subgroup2_border(colour = "black", size = 0.25) +
treemapify::geom_treemap_subgroup_border(colour = "black", size = 0.75) +
treemapify::geom_treemap_subgroup_text(colour = "white",
place = "center",
fontface = "italic") +
theme(legend.position = "none") + labs(title = insert_title)
}
#' Shannon Index Plot
#'
#' Generates a plot showing the Shannon Index for isoform/ORF diversity on
#' the x-axis, and each gene on the y-axis.
#'
#' @param database A compiled Database object.
#' @param use_ORFs Logical. Set to TRUE to use abundances from OrfDB instead
#' of abundances from TranscriptDB. Note that OrfDB collapses isoforms with
#' non-unique transcripts, so abundances may differ significantly.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1", "PB.2"),
#' Name = c("Gene1", "Gene2"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' plot_Shannon_index(DB)
#' }
#' @seealso Database
#' @import ggplot2
#' @export
plot_Shannon_index <- function(database, use_ORFs = F,
genes_to_include = unique(database$GeneDB$Name),
insert_title = NULL) {
if (use_ORFs) {
if (!("ORFShannon" %in% colnames(database$GeneDB))) {
stop("Error: ORF Shannon index not calculated in this database.")
return(NULL)
}
db <- database$GeneDB[, c("Name", "ORFShannon")]
db <- db[db$Name %in% genes_to_include, ]
db$PBID <- as.character(seq_len(nrow(db)))
if (is.null(insert_title)) insert_title <- "ORF Shannon Index"
} else {
if (!("Shannon" %in% colnames(database$GeneDB))) {
stop("Error: isoform Shannon index not calculated in this database.")
return(NULL)
}
db <- database$GeneDB[, c("Name", "Shannon")]
db <- db[db$Name %in% genes_to_include, ]
if (is.null(insert_title)) insert_title <- "Isoform Shannon Index"
}
names(db) <- c("Gene", "Shannon_Index")
buffer <- (max(db$Shannon_Index) - min(db$Shannon_Index)) * 0.025
db$Max <- rep(max(db$Shannon_Index) + buffer, nrow(db))
db$Min <- rep(min(db$Shannon_Index) - buffer, nrow(db))
db <- db[order(db$Shannon_Index), ]
db$Gene <- factor(db$Gene, levels = db$Gene)
ggplot(db, aes_string(x = "Gene",
y = "Shannon_Index",
colour = "Shannon_Index")) +
geom_segment(aes_string(x = "Gene", xend = "Gene",
y = "Min", yend = "Max"),
linetype = "dashed", size = 0.1, colour = "black") +
geom_point(size = 3) + scale_y_continuous(expand = c(0, 0)) +
scale_colour_gradientn(colours = c("blue", "red")) +
ylab("Shannon Index") +
labs(title = insert_title) + coord_flip() + theme_classic() +
theme(axis.ticks.y = element_blank(), legend.position = "none")
}
#' Isoform Length Distribution Plot
#'
#' Generates a dot plot showing how transcripts/ORFs are distributed in length
#' for each gene in the database.
#'
#' @param database A compiled Database object.
#' @param use_ORFs Logical. Set to TRUE to use abundances from OrfDB instead
#' of abundances from TranscriptDB. Note that OrfDB collapses isoforms with
#' non-unique transcripts, so abundances may differ significantly.
#' @param bins The number of bins to use in segmenting the range of lengths
#' over the entire dataset. This parameter determines vertical dot spread.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param horiz_spread Numeric. This parameter determines horizontal dot
#' spread across all the genes visualized.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1", "PB.2"),
#' Name = c("Gene1", "Gene2"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' plot_length_dist(DB)
#' }
#' @seealso Database
#' @import ggplot2
#' @export
plot_length_dist <- function(database, use_ORFs = F, bins = 200,
genes_to_include = unique(database$GeneDB$Name),
horiz_spread = 0.3, insert_title = NULL) {
if (use_ORFs) {
if (is.null(database$OrfDB)) {
stop("Error: ORFs not included in this database.")
return(NULL)
}
db <- database$OrfDB[, c("Gene", "ORFLength")]
db <- db[db$Gene %in% genes_to_include, ]
if (is.null(insert_title)) insert_title <- "Unique ORF Length Distributions"
} else {
db <- database$TranscriptDB[, c("Gene", "TranscriptLength")]
db <- db[db$Gene %in% genes_to_include, ]
if (is.null(insert_title)) insert_title <- "Transcript Length Distributions"
}
names(db) <- c("Gene", "Length")
len_range <- max(db$Length) - min(db$Length)
binwidth <- ceiling(len_range / bins)
ggplot(db, aes_string(x = "Gene", y = "Length")) +
geom_dotplot(binaxis = 'y', dotsize = 1, binwidth = binwidth,
method = "histodot", stackdir = 'center',
stackratio = horiz_spread) +
labs(title = insert_title) + theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.