# Plotting functions for Gumshoe
# Functions ----
by_plot <- function(sleuth_obj) {
est_count_matrix <- sleuth:::spread_abundance_by(sleuth_obj$obs_norm_filt, "est_counts", sleuth_obj$sample_to_covariates$sample)
pca <- prcomp(t(est_count_matrix))
plot_vector_length <- sqrt((pca$rotation[, 1]^2) + (pca$rotation[, 2]^2))
# Highlight the ten longest
keep <- names(tail(sort(plot_vector_length), 10))
n <- rownames(pca$rotation)
pca$rotation[!(rownames(pca$rotation) %in% keep), ] <- c(0)
# Add gene name to the loadings viz of available
if (!is.null(sleuth_obj$target_mapping)) {
t2g <- sleuth_obj$target_mapping
rownames(t2g) <- t2g$target_id
rownames(pca$rotation) <- lapply(n, function(x) {
ifelse(x %in% keep, paste0(x, "-", t2g[x, 2]), ".")
})
}
autoplot(pca, data = t(est_count_matrix), label = TRUE, shape = FALSE, loadings = TRUE, loadings.label = TRUE, loadings.label.size = 3, loadings.label.vjust = 1.7)
}
#' Generate a volcano plot from a data frame containing a gene name, q-value, and fold change with user-selected cutoffs and title.
#'
#' @param df A data frame or list containing ensemble transcript id's
#' @param label_column_name The column name of the data point labels. The default is external_gene_name, which is the column name from the output of the ensembl_to_id function.
#' @param x_axis_column_name The column name for the intended x-axis values. The default is "b", which is the name of the column corresponding to the FC after Sleuth analysis.
#' @param y_axis_column_name The column name for the intended y-axis values. The default is "qval", which is the name of the column corresponding to the Benjamini-Hochberg multiple testing corrected p-value after Sleuth analysis.
#' @param x_cutoff X-axis cutoff, which corresponds to the fold change. Default value is 2.
#' @param y_cutoff Y-axis cutoff, which corresponds to the q-value. Default value is -log10(.05).
#' @param graph_name Name of the graph. Default is "Sample Graph Name".
#'
#' @export
#' @examples
#' # Create a volcano plot from a data frame with gene symbols, q-value, and FC.
#' volc_plot(wald_sig_results, graph_name = "Treated v. WT - Sex Factor")
volc_plot <- function(df, label_column_name = "external_gene_name", x_axis_column_name = "b", y_axis_column_name = "qval", x_cutoff = 2, y_cutoff = -log10(.05), graph_name = "Sample Graph Name") {
if (!"ggrepel" %in% (.packages())) {
cat("Function requires the ggrepel package. If the package is not installed, refer to the gumshoe wiki, otherwise, run the following command to load the package:\n")
cat("\n")
cat('library("ggrepel")')
} else {
df <- df[, c(label_column_name, y_axis_column_name, x_axis_column_name)]
colnames(df) <- c("gene_symbol", "Y_axis", "log2FC")
# Set colours and labels
df$diff_expressed <- "Not sig."
df$diff_expressed[df$log2FC > x_cutoff & df$Y_axis < y_cutoff] <- "Up-regulated"
df$diff_expressed[df$log2FC < -x_cutoff & df$Y_axis < y_cutoff] <- "Down-regulated"
df$label <- NA
df$label[df$diff_expressed != "Not sig."] <- df$gene_symbol[df$diff_expressed != "Not sig."]
if (any("Not sig." %in% df$diff_expressed)) {
ggplot(df, aes(x = log2FC, y = -log10(Y_axis), col = diff_expressed, label = label)) +
geom_point(alpha = 0.5) +
theme_minimal() +
geom_text_repel(show.legend = FALSE, min.segment.length = 0.5) +
scale_color_manual(name = "Expression", values = c("blue", "black", "red")) +
geom_vline(xintercept = c(-x_cutoff, x_cutoff), col = "black", linetype = "dashed") +
geom_hline(yintercept = y_cutoff, col = "black", linetype = "dashed") +
labs(title = graph_name) +
xlab(expression(Log[2] ~ (Fold ~ Change))) +
ylab(expression(-Log[10] ~ (Adj. ~ p - Value)))
} else {
ggplot(df, aes(x = log2FC, y = -log10(Y_axis), col = diff_expressed, label = label)) +
geom_point(alpha = 0.5) +
theme_minimal() +
geom_text_repel(show.legend = FALSE, min.segment.length = 0.5) +
scale_color_manual(name = "Expression", values = c("blue", "red")) +
geom_vline(xintercept = c(-x_cutoff, x_cutoff), col = "black", linetype = "dashed") +
geom_hline(yintercept = y_cutoff, col = "black", linetype = "dashed") +
labs(title = graph_name) +
xlab(expression(Log[2] ~ FC)) +
ylab(expression(-Log[10] ~ (Adj. ~ p - Value)))
}
}
}
#' Spread abundance by a column. Take a data.frame from a sleuth object
#' (e.g. \code{obs_raw}) and cast it into a matrix where the rows are the
#' target_ids and the columns are the sample ids. The values are the variable
#' you are "spreading" on.
#'
#' @param abund the abundance \code{data.frame} from a \code{sleuth} object
#' @param var a character array of length one. The variable for which to get "spread" on (e.g. "est_counts").
#'
#' @export
spread_abundance_by <- function(abund, var, which_order) {
# var <- lazyeval::lazy(var)
abund <- data.table::as.data.table(abund)
var_spread <- data.table::dcast(abund, target_id ~ sample, value.var = var)
# there is a discrepancy between data table's sorting of character vectors
# and how tidyr previously (or the order function) sorts character vectors
# so next step is needed to make sure the order is correct
var_spread <- var_spread[order(var_spread$target_id), ]
var_spread <- as.data.frame(var_spread, stringsAsFactors = FALSE)
rownames(var_spread) <- var_spread$target_id
var_spread["target_id"] <- NULL
result <- as.matrix(var_spread)
result[, which_order, drop = FALSE]
}
#' Generate a PCA plot from a sleuth object with user-selected colour_by and title.
#'
#' @param obj A sleuth object.
#' @param pc_x Integer denoting the principle component to use for the x-axis.
#' @param pc_y Integer denoting the principle component to use for the y-axis.
#' @param units Either 'est_counts' ('scaled_reads_per_base' for gene_mode) or 'tpm'.
#' @param color_by Column name to colour the samples by. Default is NULL.
#' @param show_legend Boolean value if the legend should be shown in the figure out not.
#' @param graph_name Name of the graph. Default is "Sample Plot Name".
#' @param auto_save If TRUE, the plot will be saved automatically. Defaults to FALSE.
#' @param scaled Only works if auto_save is TRUE. The plot will be automatically saved and scaled to the amount of variation per axis. Scale factor is 1:5.
#'
#' @export
#' @examples
#' # Create a PCA plot from a sleuth object.
#' self_plot_pca(so, colour_by = "tissue", graph_name = "All Samples")
self_plot_pca <- function(obj, pc_x = 1L, pc_y = 2L, units = "est_counts",
color_by = NULL, show_legend = TRUE, graph_name = "Sample Plot Name", auto_save = FALSE, scaled = TRUE) {
if (!is.null(color_by)){
color_by <- sym(color_by)
}
mat <- spread_abundance_by(
obj$obs_norm_filt, units,
obj$sample_to_covariates$sample
)
pca_res <- prcomp(t(mat))
df <- as.data.frame(pca_res$x[, c(pc_x, pc_y)])
df$sample <- rownames(df)
rownames(df) <- NULL
ppv <- plot_pc_variance(obj)
PCpc <- ppv$data$var
pc_1 <- paste0("PC1 ", "(", round(PCpc[1], 2), "%)")
pc_2 <- paste0("PC2 ", "(", round(PCpc[2], 2), "%)")
df <- dplyr::left_join(df, obj$sample_to_covariates,
by = "sample"
)
print(df)
if (show_legend){
plt <- ggplot(df, aes(PC1, PC2, colour = !!color_by, label = sample)) +
theme_bw() +
geom_point(alpha = 0.5) +
geom_text_repel(show.legend = FALSE) +
scale_color_manual(name = color_by, values = c("black", "blue", "red", "green")) +
labs(title = graph_name) +
xlab(pc_1) +
ylab(pc_2)
}
else {
plt <- ggplot(df, aes(PC1, PC2, colour = !!color_by, label = sample)) +
theme_bw() +
geom_point(alpha = 0.5) +
theme(legend.position = "none") +
geom_text_repel(show.legend = FALSE) +
scale_color_manual(name = color_by, values = c("black", "blue", "red", "green")) +
labs(title = graph_name) +
xlab(pc_1) +
ylab(pc_2)
}
if (auto_save){
if (scaled) {
ggsave("figure1.png",
dpi = 300, dev = "png", height = PCpc[2] / 5,
width = PCpc[1] / 5,
units = "in"
)
} else {
ggsave("figure1.png",
dpi = 300, dev = "png", height = 4.5,
width = 6,
units = "in"
)
}
}
else{
plt
}
}
#' Generate a box plot from kruskal-wallis results with user-selected title.
#'
#' @param kruskal_result A sleuth object.
#' @param graph_name Name of the graph. Default is "Sample Plot Name".
#'
#' @export
#' @examples
#' # Create a box plot from a kruskal-wallis result retrived using sleuth_kruskal_wallis().
#' isoform_boxplot(kw_result, graph_name = "All Isoforms")
isoform_boxplot <- function(kruskal_result, graph_name = "Sample Plot Name") {
ggplot(kruskal_result, aes(x = target_id, y = est_counts, col = target_id)) +
theme_minimal() +
geom_boxplot() +
guides(color = "none") +
geom_jitter() +
scale_fill_brewer(palette = "Dark2") +
ylab("Estimated Counts") +
xlab("Target ID") +
labs(title = graph_name) +
rotate_x_text(90)
}
#' Generate a isoform level heat map. Can only show a maximum of three genes at the same time.
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep() and fit by sleuth_fit(). The transcript name column must be target_id and the gene name column must be 'ext_gene'. Also, the sleuth_obj must have been prepared in non-gene aggregation mode, but with target mapping enabled.
#' @param genes A vector containing gene names whose transcripts should be plotted.
#' @param q_max The maximum q-value that should be used as a cutoff to determine significant transcripts. If set to FALSE, an FDR cutoff will not be applied. Defaults to .05.
#' @param iqf A numeric value between 0 and 1 (inclusive) to denote the quantile of data to display. Defaults to 0, which displays all data.
#' @param test The sleuth test to be analyzed using sleuth_results, can be either "wt" or "lrt". Defaults to "wt".
#' @param sig_marker_values A vector of values that should be used as a cutoff for significance labelling. Defaults to .05, .01, and .001.
#' @param sig_markers Markers of signifigance. Defaults to \*, \*\*, \*\*\*.
#' @param grouping_colours A list of groups and the associated level-colour pairs. See example for syntax. Defaults to empty list.
#' @param boxplot A Boolean value that indicates whether a box plot of scaled transcripts should be displayed.
#'
#' @export
#' @examples
#' # Create a isoform heat map from a sleuth_object.
#' heatmap_plot(so, genes = c("Tp53", "Tnf", "Egfr", "Egr1"), q_max = .01, test = "lrt", grouping_colours = list(sex = c("F" = "thistle1", "M" = "turquoise1"), treatment = c("drug_A" = "indianred1", "drug_B" = "olivedrab1")), boxplot = FALSE)
heatmap_plot <- function(sleuth_obj, genes, q_max = .05, test = "wt", iqf = 0, sig_marker_values = c(.05, .01, .001), sig_markers = c("*", "**", "***"),
grouping_colours = list(), plot_title = "Sleuth Heatmap", boxplot = TRUE) {
# Retrieve the scaled transcript counts for the set of genes passed to the function
scaled_trancript_counts <- filtered_scaled_transcript_counts(sleuth_obj = sleuth_obj, genes = genes)
# Check to see if any genes have been retrieved
if (dim(scaled_trancript_counts)[1] == 0) {
return("Confirm that the gene name is correct.")
}
if (sleuth_obj$gene_mode == TRUE){
scaled_trancript_counts <- aggregate(est_counts ~ ext_gene + sample, scaled_trancript_counts, mean)
# Dcast the scaled_trancript_counts to make the columns the sample names, rows the transcript ids, and the cells the est_count values
scaled_trancript_counts <- dcast(scaled_trancript_counts, ext_gene ~ sample, value.var = "est_counts")
rownames(scaled_trancript_counts) <- scaled_trancript_counts$ext_gene
}
else {
scaled_trancript_counts$target_id <- paste(scaled_trancript_counts$ext_gene, scaled_trancript_counts$target_id, sep = " - ")
# Dcast the scaled_trancript_counts to make the columns the sample names, rows the transcript ids, and the cells the est_count values
scaled_trancript_counts <- dcast(scaled_trancript_counts, target_id ~ sample, value.var = "est_counts")
rownames(scaled_trancript_counts) <- scaled_trancript_counts$target_id
}
# Remove an extra column created from the dcast that contains the transcript ids,
# take the log2 of the est_counts to plot them in more a representative way,
# remove the infinite values that occur when taking the log of a zero value,
# and convert from a df to a matrix.
scaled_trancript_counts <- scaled_trancript_counts[-1]
scaled_trancript_counts <- log2(scaled_trancript_counts)
scaled_trancript_counts[sapply(scaled_trancript_counts, is.infinite)] <- NA
assign("stc1", scaled_trancript_counts, envir = .GlobalEnv)
# Calculate the quantile value based on all the est_counts for every transcript for the selected gene with a probability of the user selected iqf parameter
transcript_mean <- apply(scaled_trancript_counts, 1, function(v) mean(as.numeric(v), na.rm = TRUE))
quant_val <- quantile(scaled_trancript_counts, probs = iqf, na.rm = TRUE)
# Remove all the transcripts that do not have an average est_count that is greater than the quant_val
scaled_trancript_counts <- scaled_trancript_counts[rownames(scaled_trancript_counts) %in% c(names(transcript_mean[transcript_mean > quant_val])), ]
assign("stc2", scaled_trancript_counts, envir = .GlobalEnv)
scaled_trancript_count_matrix <- as.matrix(scaled_trancript_counts)
# Gather the differentially expressed transcripts after running a wt or lrt
all_results <- sleuth_object_result(sleuth_obj = sleuth_obj, all_data = FALSE, sig_data = FALSE, single_df = TRUE, retrived_from_model = TRUE, test = test)
all_results <- all_results[all_results$ext_gene %in% genes, ]
if (sleuth_obj$gene_mode == TRUE){
all_results$target_id <- paste(all_results$ext_gene)
}
else {
all_results$target_id <- paste(all_results$ext_gene, all_results$target_id, sep = " - ")
}
if (is.numeric(q_max)) {
all_results <- fdr_cutoff(all_results, q_cutoff = q_max)
}
# Check to see if any genes have been retrieved
if (dim(all_results)[1] == 0) {
return("No transcript of the genes chosen have a significant difference.")
}
# Obtain the q-value per transcript.
transcript_p_val <- dcast(all_results, target_id ~ models, value.var = "qval", fun.aggregate = sum)
rownames(transcript_p_val) <- transcript_p_val$target_id
transcript_p_val <- transcript_p_val[-1]
# Merge the transcript q-values and all transcripts
transcript_p_val <- merge(transcript_p_val, scaled_trancript_counts, by = "row.names", all.y = TRUE)
rownames(transcript_p_val) <- transcript_p_val$Row.names
transcript_p_val <- transcript_p_val[2:(length(unique(all_results$models)) + 1)]
transcript_p_val[transcript_p_val == 0] <- NA
# Check the following 3 lines of code, with gene_agg FALSE
transcript_p_val <- transcript_p_val[rowSums(is.na(transcript_p_val)) != ncol(transcript_p_val),]
scaled_trancript_counts <- scaled_trancript_counts[(rownames(scaled_trancript_counts) %in% rownames(transcript_p_val)),]
scaled_trancript_count_matrix <- as.matrix(scaled_trancript_counts)
# Heatmap Annotation Construction
ht_opt(legend_border = "black", heatmap_border = TRUE, annotation_border = TRUE)
top_anno_df <- sleuth_obj$sample_to_covariates[, 1:ncol(sleuth_obj$sample_to_covariates)]
top_anno_df <- top_anno_df[order(top_anno_df[,1]),]
top_anno_df <- top_anno_df[-1]
if (length(grouping_colours) == 0) {
ha_top <- HeatmapAnnotation(df = top_anno_df)
} else {
ha_top <- HeatmapAnnotation(df = top_anno_df, col = grouping_colours)
}
# Generate gene significance heatmap annotation with asterisk.
qvalue_col_fun <- colorRamp2(c(0, 1), c("white", "white"))
model_names <- letters[1:length(colnames(transcript_p_val))]
transcript_p_val[transcript_p_val > max(attributes(qvalue_col_fun)$breaks)] <- NA
# Determine what symbols should be assigned to the significant markers
pch <- transcript_p_val
annotation_symbol <- letters[1:length(sig_marker_values)]
sig_marker_values <- sort(sig_marker_values)
for (cutoff in sig_marker_values) {
pch[pch < as.numeric(cutoff)] <- annotation_symbol[match(cutoff, sig_marker_values)]
}
annotations <- na.exclude(as.character(unique(unlist(pch))))
annotations <- sort(annotations)
sig_markers <- sort(sig_markers, decreasing = TRUE)
for (anno in annotations) {
pch[pch == anno] <- sig_markers[match(anno, annotations)]
}
for (model in colnames(transcript_p_val)) {
if (model != "(Intercept)") {
letter_index <- match(model, colnames(transcript_p_val))
letter <- model_names[letter_index]
ha_temp <- HeatmapAnnotation(model = anno_simple(transcript_p_val[model], col = qvalue_col_fun, pch = as.matrix(pch[model]), pt_size = unit(1, "snpc") * 0.7, width = max_text_width(sig_markers) * 1.2, na_col = "white", border = TRUE), annotation_label = model, which = "row")
names(ha_temp@anno_list) <- letter
ha_temp@anno_list[[letter]]@name <- letter
if (!exists("ha_right")) {
ha_right <- ha_temp
} else {
ha_right <- c(ha_right, ha_temp)
}
}
}
# Create a legend for the significant markers
legend_label <- paste("<", sig_marker_values, sep = " ")
lgd_sig <- Legend(labels = legend_label, border = TRUE, type = "points", pch = sig_markers, title = "q-Value Markers", grid_width = max_text_width(sig_markers) * 1.2, background = qvalue_col_fun(c(sig_marker_values)))
if (boxplot) {
boxplot_vals <- apply(scaled_trancript_count_matrix, 1, function(v) median(as.numeric(v), na.rm = TRUE))
boxplot_col <- colorRamp2(c(min(boxplot_vals), max(boxplot_vals)), c("blue", "red"))
if (sleuth_obj$gene_mode == TRUE){
ha_left <- rowAnnotation("Avg. Scaled Reads Per Base" = anno_boxplot(scaled_trancript_count_matrix, gp = gpar(fill = boxplot_col(boxplot_vals))))
ha_c <- Heatmap(scaled_trancript_count_matrix,
name = "Scaled Reads Per Base", rect_gp = gpar(col = "white", lwd = 1), border_gp = gpar(col = "black"), clustering_distance_rows = "pearson", clustering_distance_columns = "pearson",
row_title = "Genes", row_title_rot = 0, row_names_max_width = max_text_width(rownames(scaled_trancript_count_matrix), gp = gpar(fontsize = 12)),
show_column_names = FALSE,
na_col = "black",
top_annotation = ha_top, left_annotation = ha_left, right_annotation = ha_right
)
}
else {
ha_left <- rowAnnotation("Scaled Transcript Est Counts" = anno_boxplot(scaled_trancript_count_matrix, gp = gpar(fill = boxplot_col(boxplot_vals))))
ha_c <- Heatmap(scaled_trancript_count_matrix,
name = "Scaled Counts", rect_gp = gpar(col = "white", lwd = 1), border_gp = gpar(col = "black"), clustering_distance_rows = "pearson", clustering_distance_columns = "pearson",
row_title = "Transcripts", row_title_rot = 0, row_names_max_width = max_text_width(rownames(scaled_trancript_count_matrix), gp = gpar(fontsize = 12)),
show_column_names = FALSE,
na_col = "black",
top_annotation = ha_top, left_annotation = ha_left, right_annotation = ha_right
)
}
}
else {
if (sleuth_obj$gene_mode == TRUE){
ha_c <- Heatmap(scaled_trancript_count_matrix,
name = "Scaled Reads Per Base", rect_gp = gpar(col = "white", lwd = 1), border_gp = gpar(col = "black"), clustering_distance_rows = "pearson", clustering_distance_columns = "pearson",
row_title = "Genes", row_title_rot = 0, row_names_max_width = max_text_width(rownames(scaled_trancript_count_matrix), gp = gpar(fontsize = 12)),
show_column_names = FALSE,
na_col = "black",
top_annotation = ha_top, left_annotation = ha_left, right_annotation = ha_right
)
}
else {
ha_c <- Heatmap(scaled_trancript_count_matrix,
name = "Scaled Counts", rect_gp = gpar(col = "white", lwd = 1), border_gp = gpar(col = "black"), clustering_distance_rows = "pearson", clustering_distance_columns = "pearson",
row_title = "Transcripts", row_title_rot = 0, row_names_max_width = max_text_width(rownames(scaled_trancript_count_matrix), gp = gpar(fontsize = 12)),
show_column_names = FALSE,
na_col = "black",
top_annotation = ha_top, right_annotation = ha_right
)
}
}
draw(ha_c, annotation_legend_list = list(lgd_sig), merge_legend = TRUE, column_title = plot_title, padding = unit(c(2, 20, 2, 2), "mm"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.