#' Make a ggplot graph of library sizes.
#'
#' It is often useful to have a quick view of which samples have more/fewer
#' reads. This does that and maintains one's favorite color scheme and tries to
#' make it pretty!
#'
#' @param data Expt, dataframe, or expressionset of samples.
#' @param condition Vector of sample condition names.
#' @param colors Color scheme if the data is not an expt.
#' @param text Add the numeric values inside the top of the bars of the plot?
#' @param order Explicitly set the order of samples in the plot?
#' @param plot_title Title for the plot.
#' @param yscale Whether or not to log10 the y-axis.
#' @param expt_names Design column or manually selected names for printing sample names.
#' @param label_chars Maximum number of characters before abbreviating sample names.
#' @param ... More parameters for your good time!
#' @return a ggplot2 bar plot of every sample's size
#' @seealso [ggplot2] [prettyNum] [plot_sample_bars()]
#' @examples
#' \dontrun{
#' libsize_plot <- plot_libsize(expt = expt)
#' libsize_plot ## ooo pretty bargraph
#' }
#' @export
plot_libsize <- function(data, condition = NULL, colors = NULL,
text = TRUE, order = NULL, plot_title = NULL, yscale = NULL,
expt_names = NULL, label_chars = 10,
...) {
arglist <- list(...)
if (is.null(text)) {
text <- TRUE
}
## In response to Keith's recent comment when there are more than 8 factors
chosen_palette <- "Dark2"
if (!is.null(arglist[["palette"]])) {
chosen_palette <- arglist[["palette"]]
}
mtrx <- as.matrix(data)
if (is.null(colors)) {
pal <- RColorBrewer::brewer.pal(ncol(mtrx), chosen_palette)
colors <- grDevices::colorRampPalette(pal)(ncol(mtrx))
}
## Get conditions
if (is.null(condition)) {
stop("Missing condition label vector.")
}
values <- as.numeric(mtrx)
integerp <- all.equal(values, as.integer(values))
colors <- as.character(colors)
sum <- NULL
if (!is.null(expt_names) && class(expt_names) == "character") {
if (length(expt_names) == 1) {
colnames(mtrx) <- make.names(design[[expt_names]], unique = TRUE)
} else {
colnames(mtrx) <- expt_names
}
}
if (!is.null(label_chars) && is.numeric(label_chars)) {
colnames(mtrx) <- abbreviate(colnames(mtrx), minlength = label_chars)
}
libsize_df <- data.frame("id" = as.character(colnames(mtrx)),
"sum" = colSums(mtrx),
"condition" = condition,
"colors" = as.character(colors))
summary_df <- data.table::setDT(libsize_df)[, list(
"min" = min(sum),
"1st" = quantile(x = sum, probs = 0.25),
"median" = median(x = sum),
"mean" = mean(sum),
"3rd" = quantile(x = sum, probs = 0.75),
"max" = max(sum)),
by = "condition"]
libsize_plot <- plot_sample_bars(
libsize_df, condition = condition, colors = colors,
text = text, order = order, plot_title = plot_title, integerp = integerp,
yscale = yscale, ...)
retlist <- list(
"plot" = libsize_plot,
"table" = libsize_df,
"summary" = summary_df)
class(retlist) <- "libsize_plot"
return(retlist)
}
setGeneric("plot_libsize")
#' Visualize genes observed before/after filtering.
#'
#' Thanks to Sandra Correia for this! This function attempts to represent the
#' change in the number of genes which are well/poorly represented in the data
#' before and after performing a low-count filter.
#'
#' @param expt Input expressionset.
#' @param low_limit Threshold to define 'low-representation.'
#' @param filter Method used to low-count filter the data.
#' @param ... Extra arbitrary arguments to pass to normalize_expt()
#' @return Bar plot showing the number of genes below the low_limit before and
#' after filtering the data.
#' @seealso [plot_libsize()] [filter_counts()]
#' @export
plot_libsize_prepost <- function(expt, low_limit = 2, filter = TRUE, ...) {
start <- plot_libsize(expt, text = FALSE)
norm <- sm(normalize_expt(expt, filter = filter,
...))
end <- plot_libsize(norm)
## Gather the number of genes which are <= the low limit, before and after filtering
lt_min_start <- colSums(exprs(expt) <= low_limit)
lt_min_end <- colSums(exprs(norm) <= low_limit)
start_tab <- as.data.frame(start[["table"]])
end_tab <- as.data.frame(end[["table"]])
## Get the number of counts in each samples.
start_tab[["sum"]] <- as.numeric(start_tab[["sum"]])
start_tab[["colors"]] <- as.character(start_tab[["colors"]])
start_tab[["alpha"]] <- ggplot2::alpha(start_tab[["colors"]], 0.75)
## Get the number of low genes in each sample.
start_tab[["low"]] <- lt_min_start
start_tab[["sub_low"]] <- 0
start_tab[["subtraction"]] <- 0
start_tab[["subtraction_string"]] <- ""
## Get the number of counts after filtering.
end_tab[["sum"]] <- as.numeric(end_tab[["sum"]])
end_tab[["colors"]] <- as.character(end_tab[["colors"]])
end_tab[["alpha"]] <- ggplot2::alpha(end_tab[["colors"]], 1.0)
## Find how many counts were lost from filtering.
subtract_count_sums <- start_tab[["sum"]] - end_tab[["sum"]]
end_tab[["subtraction"]] <- subtract_count_sums
## Get the number of genes after filtering.
end_tab[["low"]] <- lt_min_end
end_tab[["sub_low"]] <- 0
end_tab[["subtraction_string"]] <- ""
## Get the number of genes lost from filtering.
subtract_gene_sums <- start_tab[["low"]] - end_tab[["low"]]
undef <- is.na(subtract_gene_sums)
## When this comes up NA, just use the original number.
subtract_gene_sums[undef] <- start_tab[undef, "low"]
start_tab[["sub_low"]] <- subtract_gene_sums
start_tab[["subtraction_string"]] <- paste0(subtract_count_sums, " counts from ",
subtract_gene_sums, " genes.")
all_tab <- rbind(start_tab, end_tab)
count_title <- glue("Counts remaining after filtering less than {low_limit} reads,
labeled by counts/genes removed.")
## Suppressing 'Using alpha for a discrete variable is not advised.'
count_columns <- ggplot(all_tab,
aes(x = .data[["id"]], y = .data[["sum"]])) +
ggplot2::geom_col(position = "identity", color = "black",
aes(fill = .data[["colors"]])) +
ggplot2::scale_fill_manual(values = c(levels(as.factor(all_tab[["colors"]])))) +
ggplot2::geom_text(parse = FALSE, angle = 90, size = 4, color = "white", hjust = 1.2,
aes(x = .data[["id"]],
## label='as.character(all_tab$subtraction)')) +
label = .data[["subtraction_string"]])) +
ggplot2::theme(axis.text = ggplot2::element_text(size = 10, colour = "black"),
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
legend.position = "none") +
ggplot2::scale_y_continuous(trans = "log10", labels = scales::scientific) +
ggplot2::ggtitle(count_title)
low_title <- glue("Genes with less than {low_limit} reads, labeled by delta.")
low_columns <- ggplot(all_tab, aes(x = .data[["id"]], y = .data[["low"]])) +
ggplot2::geom_col(position = "identity", color = "black",
aes(alpha = .data[["alpha"]], fill = .data[["colors"]])) +
ggplot2::scale_fill_manual(values = c(levels(as.factor(all_tab[["colors"]])))) +
ggplot2::geom_text(parse = FALSE, angle = 90, size = 4, color = "black", hjust = 1.2,
aes(x = .data[["id"]], label = .data[["sub_low"]])) +
ggplot2::theme(axis.text = ggplot2::element_text(size = 10, colour = "black"),
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
legend.position = "none") +
ggplot2::ggtitle(low_title)
retlist <- list(
"start" = start,
"end" = end,
"table" = all_tab,
"count_plot" = count_columns,
"lowgene_plot" = low_columns)
class(retlist) <- "prepost_filter"
return(retlist)
}
#' Make a ggplot graph of the percentage/number of reads kept/removed.
#'
#' The function expt_exclude_genes() removes some portion of the original reads.
#' This function will make it possible to see what is left.
#'
#' @param data Dataframe of the material remaining, usually expt$summary_table
#' @param row Row name to plot.
#' @param condition Vector of sample condition names.
#' @param colors Color scheme if the data is not an expt.
#' @param names Alternate names for the x-axis.
#' @param text Add the numeric values inside the top of the bars of the plot?
#' @param plot_title Title for the plot.
#' @param yscale Whether or not to log10 the y-axis.
#' @param ... More parameters for your good time!
#' @return a ggplot2 bar plot of every sample's size
#' @seealso [plot_sample_bars()]
#' @examples
#' \dontrun{
#' kept_plot <- plot_pct_kept(expt_removed)
#' kept_plot ## ooo pretty bargraph
#' }
#' @export
plot_pct_kept <- function(data, row = "pct_kept", condition = NULL, colors = NULL,
names = NULL, text = TRUE, plot_title = NULL, yscale = NULL, ...) {
arglist <- list(...)
table <- data
if (class(data) == "expt" || class(data) == "SummarizedExperiment") {
table <- data[["summary_table"]]
}
## In response to Keith's recent comment when there are more than 8 factors
chosen_palette <- "Dark2"
if (!is.null(arglist[["palette"]])) {
chosen_palette <- arglist[["palette"]]
}
data_class <- class(data)[1]
if (data_class == "expt" || data_class == "SummarizedExperiment") {
condition <- pData(data)[["condition"]]
colors <- data[["colors"]]
names <- data[["names"]]
data <- exprs(data) ## Why does this need the simplifying
## method of extracting an element? (eg. data['expressionset'] does not work)
## that is _really_ weird!
} else if (data_class == "ExpressionSet") {
data <- exprs(data)
} else if (data_class == "matrix" || data_class == "data.frame") {
data <- as.data.frame(data)
} else {
stop("This function understands types: expt, ExpressionSet, data.frame, and matrix.")
}
if (is.null(colors)) {
colors <- grDevices::colorRampPalette(
RColorBrewer::brewer.pal(ncol(data),
chosen_palette))(ncol(data))
}
if (is.null(condition)) {
stop("Missing condition label vector.")
}
colors <- as.character(colors)
kept_df <- data.frame("id" = colnames(data),
"sum" = table[row, ],
"condition" = condition,
"colors" = as.character(colors))
kept_plot <- plot_sample_bars(kept_df, condition = condition, colors = colors,
names = names, text = text,
plot_title = plot_title, yscale = yscale, ...)
return(kept_plot)
}
#' The actual library size plotter.
#'
#' This makes a ggplot2 plot of library sizes.
#'
#' @param sample_df Expt, dataframe, or expressionset of samples.
#' @param condition Vector of sample condition names.
#' @param colors Color scheme if the data is not an expt.
#' @param integerp Is this comprised of integer values?
#' @param order Explicitly set the order of samples in the plot?
#' @param text Add the numeric values inside the top of the bars of the plot?
#' @param plot_title Title for the plot.
#' @param yscale Whether or not to log10 the y-axis.
#' @param ... Used to catch random arguments which are unused here.
plot_sample_bars <- function(sample_df, condition = NULL, colors = NULL,
integerp = FALSE, order = NULL,
text = TRUE, plot_title = NULL, yscale = NULL, ...) {
arglist <- list(...)
y_label <- "Library size in pseudocounts."
if (isTRUE(integerp)) {
y_label <- "Library size in counts."
}
if (!is.null(arglist[["y_label"]])) {
y_label <- arglist[["y_label"]]
}
sample_df[["order"]] <- factor(sample_df[["id"]], as.character(sample_df[["id"]]))
if (is.null(order)) {
## Order it by sample names lexically
lexical <- order(levels(sample_df[["order"]]))
sample_df <- sample_df[lexical, ]
## The next two lines are an attempt to ensure that the factor
## levels make sense and are in the same order as the samples.
new_order <- as.character(sample_df[["order"]])
sample_df[["order"]] <- as.factor(new_order)
} else {
new_df <- data.frame()
for (o in order) {
matches <- grep(pattern = o, x = sample_df[["order"]])
adders <- sample_df[matches, ]
new_df <- rbind(new_df, adders)
}
sample_df <- new_df
sample_df[["order"]] <- factor(sample_df[["order"]], as.character(sample_df[["order"]]))
}
color_listing <- sample_df[, c("condition", "colors")]
color_listing <- unique(color_listing)
color_list <- as.character(color_listing[["colors"]])
names(color_list) <- as.character(color_listing[["condition"]])
## Set a cutoff of the sum of the rgbs of the bar colors, if the color is > than that cutoff
## Set the text color to black.
## I reversed the logic for these colors!
## sample_df[["text_color"]] <- "#ffffff"
sample_df[["text_color"]] <- "#FFFFFF"
text_lst <- color_int(sample_df[["colors"]])
for (r in seq_len(nrow(sample_df))) {
## Some recommendations for color cutoffs from:
## https://stackoverflow.com/questions/3942878/how-to-decide-font-color-in-white-or-black-depending-on-background-color
## Thanks to Theresa
color_sum <- (text_lst[["red"]][[r]] * 0.299) +
(text_lst[["green"]][[r]] * 0.587) +
(text_lst[["blue"]][[r]] * 0.114)
if (color_sum >= 186) {
## Hahah I am a doofus and reversed the logic!
sample_df[r, "text_color"] <- "#000000"
}
}
text_colors <- sample_df[["text_color"]]
sample_plot <- ggplot(data = sample_df,
colour = colors,
aes(x = .data[["order"]], y = .data[["sum"]])) +
ggplot2::geom_bar(stat = "identity",
colour = "black",
fill = sample_df[["colors"]],
aes(x = .data[["order"]])) +
ggplot2::xlab("Sample ID") +
ggplot2::ylab(y_label) +
ggplot2::theme_bw(base_size = base_size) +
ggplot2::theme(axis.text = ggplot2::element_text(size = base_size, colour = "black"),
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5))
if (isTRUE(text)) {
if (!isTRUE(integerp)) {
sample_df[["sum"]] <- sprintf("%.2f", round(as.numeric(sample_df[["sum"]]), 2))
}
sample_plot <- sample_plot +
ggplot2::geom_text(
parse = FALSE, angle = 90, size = 4, hjust = 1.2, colour = text_colors,
aes(x = .data[["order"]],
label = prettyNum(as.character(sample_df[["sum"]]), big.mark = ",")))
}
if (!is.null(plot_title)) {
sample_plot <- sample_plot + ggplot2::ggtitle(plot_title)
}
if (is.null(yscale)) {
scale_difference <- max(as.numeric(sample_df[["sum"]])) /
min(as.numeric(sample_df[["sum"]]))
if (scale_difference > 10.0) {
mesg("The scale difference between the smallest and largest
libraries is > 10. Assuming a log10 scale is better, set scale = FALSE if not.")
yscale <- TRUE
} else {
yscale <- FALSE
}
}
if (isTRUE(yscale)) {
sample_plot <- sample_plot +
ggplot2::scale_y_continuous(trans = "log10",
labels = scales::scientific)
} else if (yscale == "log10") {
sample_plot <- sample_plot +
ggplot2::scale_y_continuous(trans = "log10",
labels = scales::scientific)
} else if (yscale == "log2") {
sample_plot <- sample_plot +
ggplot2::scale_y_continuous(trans = "log2",
labels = scales::scientific)
} else {
sample_plot <- sample_plot +
ggplot2::scale_y_continuous(labels = scales::scientific)
}
return(sample_plot)
}
#' Make relatively pretty bar plots of coverage in a genome.
#'
#' This was written for ribosome profiling coverage / gene.
#' It should however, work for any data with little or no modification, it was
#' also written when I was first learning R and when I look at it now I see a
#' few obvious places which can use improvement.
#'
#' @param input Coverage / position filename.
#' @param workdir Where to put the resulting images.
#' @param output Output image filename.
#' @param name Gene name to print at the bottom of the plot.
#' @param start Relative to 0, where is the gene's start codon.
#' @param end Relative to 0, where is the gene's stop codon.
#' @param strand Is this on the + or - strand? (+1/-1)
#' @param padding How much space to provide on the sides?
#' @return coverage plot surrounging the ORF of interest
#' @seealso [ggplot2]
plot_rpm <- function(input, workdir = "images", output = "01.svg", name = "LmjF.01.0010",
start = 1000, end = 2000, strand = 1, padding = 100) {
mychr <- gsub(pattern = "\\.\\d+$", replacement = "", x = name, perl = TRUE)
plotted_start <- start - padding
plotted_end <- end + padding
my_start <- start
my_end <- end
region_idx <- input[["chromosome"]] == mychr &
input[["position"]] >= plotted_start &
input[["position"]] <= plotted_end
rpm_region <- rpm_region[region_idx, ]
rpm_region <- rpm_region[, -1]
rpm_region[["log"]] <- log2(rpm_region[["rpm"]] + 1)
## pre_start = subset(rpm_region, position < my_start)
start_idx <- rpm_region[["position"]] < my_start
pre_start <- rpm_region[start_idx, ]
## post_stop = subset(rpm_region, position > my_end)
stop_idx <- rpm_region[["position"]] > my_end
post_stop <- rpm_region[stop_idx, ]
## cds = subset(rpm_region, position >= my_start & position < my_end)
cds_idx <- rpm_region[["position"]] >= my_start & rpm_region[["position"]] < my_end
cds <- rpm_region[cds_idx, ]
eval(substitute(
expr = {
stupid <- aes(y = 0, yend = 0, x = my_start, xend = my_end)
},
env <- list(my_start = my_start, my_end = my_end)))
if (strand == "+") {
gene_arrow <- grid::arrow(type = "closed", ends = "last")
} else {
gene_arrow <- grid::arrow(type = "closed", ends = "first")
}
xlabel_string <- glue("{name}: {my_start} to {my_end}")
my_plot <- ggplot(rpm_region, aes(x = .data[["position"]], y = .data[["log"]])) +
ggplot2::xlab(xlabel_string) +
ggplot2::ylab("Log2(RPM) reads") +
ggplot2::geom_bar(data = rpm_region, stat = "identity", fill = "black", colour = "black") +
ggplot2::geom_bar(data = pre_start, stat = "identity", fill = "red", colour = "red") +
ggplot2::geom_bar(data = post_stop, stat = "identity", fill = "red", colour = "red") +
ggplot2::geom_segment(data = rpm_region, mapping = stupid,
arrow = gene_arrow, size = 2, color = "blue") +
ggplot2::theme_bw(base_size = base_size)
return(my_plot)
}
#' Plot significant genes by contrast with different colors for significance levels.
#'
#' This is my attempt to recapitulate some plots made in Laura and Najib's mbio
#' paper. The goal of the plot is to show a few ranges of significance as
#' differently colored and stacked bars. The colors are nice because Najib and
#' Laura chose them.
#'
#' @param ups Set of up-regulated genes.
#' @param downs Set of down-regulated genes.
#' @param maximum Maximum/minimum number of genes to display.
#' @param text Add text at the ends of the bars describing the number of genes >/< 0 fc.
#' @param color_list Set of colors to use for the bars.
#' @param color_names Categories associated with aforementioned colors.
#' @return weird significance bar plots
#' @seealso [ggplot2] [extract_significant_genes()]
#' @export
plot_significant_bar <- function(ups, downs, maximum = NULL, text = TRUE,
color_list = c("lightcyan", "lightskyblue", "dodgerblue",
"plum1", "orchid", "purple4"),
color_names = c("a_up_inner", "b_up_middle", "c_up_outer",
"a_down_inner", "b_down_middle", "c_down_outer")) {
choose_max <- function(u, d) {
## m is the maximum found in the ups/downs
m <- 0
## which is extracted from ups and downs
um <- max(as.numeric(u))
dm <- max(as.numeric(d))
## choose the maximum by which is biggest!
if (um >= dm) {
m <- um
} else {
m <- dm
}
## Figure out the number of digits in the number
digits <- nchar(as.character(m))
## And the number of zeroes in it.
num_zeroes <- digits - 1.0
## Add 1 to the first digit
first_digit <- as.numeric(strsplit(x = as.character(m), split = "")[[1]][[1]]) + 1.0
## And set maximum to that number * 10 to the number of zeroes.
maximum <- first_digit * (10 ^ num_zeroes)
return(maximum)
}
up_sums <- list()
down_sums <- list()
comp_names <- ups[ups[["variable"]] == "a_up_inner", ][["comparisons"]]
for (comp in seq_along(comp_names)) {
comp_name <- comp_names[[comp]]
idx <- ups[["comparisons"]] == comp_name
up_sums[[comp_name]] <- sum(as.numeric(ups[idx, ][["value"]]))
idx <- downs[["comparisons"]] == comp_name
down_sums[[comp_name]] <- sum(as.numeric(downs[idx, ][["value"]])) * -1.0
}
if (is.null(maximum)) {
maximum <- choose_max(up_sums, down_sums)
}
## Try to ensure that ggplot orders my colors and bars in the specific order I want.
## holy ass crackers this is annoying and difficult to get correct, as the
## ordering is (to my eyes) arbitrary.
names(color_list) <- color_names
levels(ups[["variable"]]) <- c("c_up_outer", "b_up_middle", "a_up_inner")
levels(downs[["variable"]]) <- c("c_down_outer", "b_down_middle", "a_down_inner")
sigbar_plot <- ggplot() +
ggplot2::geom_col(
data = ups,
aes(x = .data[["comparisons"]], y = .data[["value"]],
fill = .data[["variable"]])) +
ggplot2::geom_col(
data = downs,
aes(x = .data[["comparisons"]], y = .data[["value"]],
fill = .data[["variable"]])) +
ggplot2::scale_fill_manual(values = color_list) +
ggplot2::coord_flip() +
ggplot2::theme_bw(base_size = base_size) +
ggplot2::theme(panel.grid.minor = ggplot2::element_blank(),
legend.position = "none")
if (isTRUE(text)) {
for (comp in seq_along(comp_names)) {
comp_name <- comp_names[[comp]]
upstring <- as.character(up_sums[[comp_name]])
downstring <- as.character(down_sums[[comp_name]])
sigbar_plot <- sigbar_plot +
ggplot2::annotate("text", x = comp, y = maximum, label = upstring, angle = -90) +
ggplot2::annotate("text", x = comp, y = maximum * -1, label = downstring, angle = 90)
}
}
return(sigbar_plot)
}
## EOF
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.