#' volcano plot for DEA results, split into 4 panels; with/without labels and with/without thresholding foldchange-outliers
#'
#' If there are multiple contrasts in your DEA results you should either use the 'wrapper function' plot_volcano_allcontrast(), OR, first subset the DEA result table for 1 contrast before calling this plot function (i.e. don't plot multiple contrasts into 1 volcano plot).
#'
#' @param stats_de typically these are the output generated by the MS-DAP dea() function. If you use the entire pipeline, i.e. analysis_quickstart(), the resulting dataset objects holds these in the dataset$de_proteins property. Include a column "gene_symbols_or_id" to plot gene symbols instead of protein_id, see examples below
#' @param log2foldchange_threshold threshold for significance of log2 foldchanges. Set to NA to disregard (default) or provide a single numeric value (cutoff will be applied symetrically for both up- and down-regulated)
#' @param qvalue_threshold Q-value threshold for significant hits. Set to NA to disregard (default), otherwise it is assumed the input data table contains a boolean column 'signif'
#' @param mtitle optionally, a title for the plot
#' @param label_mode which class of proteins should be labeled in the plot. Options: topn_pvalue (top N smallest p-value, default), signif (all significant proteins), protein_id (a provided set of protein_id). Defaults to topN for consistency and clarity; if the upstream analysis yielded hundreds of hits the labels will be unreadable
#' @param label_target further specification of the label_mode parameter. For instance, if 'topn_pvalue' is set, here you can set the number of proteins that should be labeled. Analogously, if label_mode='protein_id' is set you can here provide an array of protein_id values (that are available in the stats_de data table)
#' @param label_avoid_overlap use the ggrepel R package to try and place labels with minimal overlap (only works when the number of labeled proteins is relatively low and sparse, e.g. for topN 25). Options: TRUE, FALSE
#' @param show_plots boolean parameter; should each plot be printed/shown immediately? If `FALSE` (default) you'll have to extract the ggplot objects from the resulting list in order to print the plots
#' @return returns a named list that contains a list, with properties 'ggplot' and 'ggplot_data', for each unique 'dea_algorithm' in the input stats_de table
#'
#' @examples
#' ### Exampes. Note that these assume that prior, the MS-DAP pipeline was successfully run
#' # using `dataset = analysis_quickstart(...)`.
#' # If your dataset contains multiple contrasts, follow example 5
#'
#' ## example 1: add protein-metadata to the DEA results (dataset$de_proteins), plotting the
#' # top 10 'best pvalue' hits while hardcoding the cutoffs for foldchange and Q-value
#' \dontrun{
#' plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins),
#' log2foldchange_threshold = 1, qvalue_threshold = 0.01,
#' mtitle = "volcano, label top 10", label_mode = "topn_pvalue", label_target = 10,
#' label_avoid_overlap = TRUE, show_plots = TRUE
#' )
#' }
#'
#' ## example 2: analogous, but now show all significant proteins and disable "repelled labels"
#' # (instead, print protein labels just below each data point)
#' \dontrun{
#' plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins),
#' log2foldchange_threshold = 1, qvalue_threshold = 0.01,
#' mtitle = "volcano, label all significant", label_mode = "signif",
#' label_avoid_overlap = FALSE, show_plots = TRUE
#' )
#' }
#'
#' ## example 3: show labels for some set of protein IDs. First line selects all proteins where symbol
#' # starts with GRIA or DLG (arbitrary example, either adapt the regex or use other filters/criteria
#' # to define a subset of protein_id from your dataset). Second line shows how to specify protein_id
#' # to be used as a label
#' \dontrun{
#' pid_label = dataset$proteins %>%
#' filter(grepl("^(GRIA|DLG)", gene_symbols_or_id, ignore.case=T)) %>% pull(protein_id)
#' plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins),
#' log2foldchange_threshold = 1, qvalue_threshold = 0.01,
#' mtitle = "volcano, label selected proteins", label_mode = "protein_id",
#' label_target = pid_label, label_avoid_overlap = FALSE, show_plots = TRUE
#' )
#' }
#'
#' ## example 4: plot all significant labels as before, then add custom labels for some subset of
#' # proteins (we here regex select some labels in the plot, you should adapt the regex to match
#' # some proteins in your dataset to make this work, but you can also further filter by other
#' # properties in the 'ggplot_data' tibble like the y-coordinate aka qvalue)
#' \dontrun{
#' plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins),
#' log2foldchange_threshold = 1, qvalue_threshold = 0.01, mtitle = "volcano,
#' label all significant + custom labels", label_mode = "signif",
#' label_avoid_overlap = FALSE, show_plots = FALSE
#' )
#' l = plot_list[[1]]
#' l$ggplot + ggrepel::geom_text_repel(
#' alpha=1, color="green", data = l$ggplot_data %>%
#' filter(plottype %in% c("asis_lab", "lim_lab") & grepl("^(GRIA|DLG)", label, ignore.case=T)),
#' segment.alpha = 0.3, min.segment.length = unit(0.25, 'lines'),
#' vjust = 0.6, show.legend = FALSE, size = 2
#' )
#' }
#'
#' # example 5: iterate over contrasts before calling plot_volcano().
#' # this is essentially the same as using helper function plot_volcano_allcontrast()
#' \dontrun{
#' contrasts = unique(dataset$de_proteins$contrast)
#' for(contr in contrasts) {
#' # subset the DEA results for the current contrast
#' tib_volcano = dataset$de_proteins %>% filter(contrast==contr) %>% left_join(dataset$proteins)
#'
#' # volcano plot function (compared to above example, now include the contrast in the title)
#' plot_list = msdap::plot_volcano(tib_volcano, log2foldchange_threshold = 1,
#' qvalue_threshold = 0.01, mtitle = paste(contr, "volcano, label top 10"),
#' label_mode = "topn_pvalue", label_target = 10, label_avoid_overlap = TRUE,
#' show_plots = TRUE
#' )
#' }
#' }
#'
#' @importFrom ggrepel geom_text_repel
#' @export
plot_volcano = function(stats_de, log2foldchange_threshold = NA, qvalue_threshold = NA, mtitle = "", label_mode = "topn_pvalue", label_target = 25, label_avoid_overlap = TRUE, show_plots = FALSE) {
if(!is.data.frame(stats_de) || nrow(stats_de) == 0) {
return(list())
}
# input validation
stopifnot(length(label_mode) == 1 && label_mode %in% c("topn_pvalue", "signif", "protein_id"))
stopifnot(!(label_mode == "topn_pvalue" && !is.finite(label_target))) # if labeling 'top N best pvalue', must specify an integer amount
stopifnot(!(label_mode == "protein_id" && !all(is.character(label_target))) ) # if labeling protein_id, these must all be strings
# note; for label_mode="signif" setting the label_target parameter is ignored so we don't have to input validate it
# replace invalid input with NA (0 is invalid input, as it doesn't filter/discard anything)
if(length(log2foldchange_threshold) != 1 || !is.finite(log2foldchange_threshold) || log2foldchange_threshold == 0) {
log2foldchange_threshold = NA
}
if(length(qvalue_threshold) != 1 || !is.finite(qvalue_threshold) || qvalue_threshold == 0) {
qvalue_threshold = NA
}
# iterating over contrasts should be done upstream
if("contrast" %in% names(stats_de) && n_distinct(stats_de$contrast) != 1) {
append_log("stats_de parameter contains more than 1 unique contrast. If your DEA results contain multiple contrasts, iterative over these and call this volcano plot function for clean subsets of data that only contain stats for 1 contrast", type = "error")
}
########### format input data
stats_de$pvalue[!is.finite(stats_de$pvalue)] = NA
stats_de$qvalue[!is.finite(stats_de$qvalue)] = NA
# optionally, if the user wants to re-define the qvalue cutoff for this plot, do that first. Next, optionally take all significant hits and add filtering by foldchange
if(!is.na(qvalue_threshold)) {
stats_de = stats_de %>% mutate(signif = is.finite(qvalue) & qvalue <= qvalue_threshold)
}
if(!"signif" %in% colnames(stats_de) || !all(stats_de$signif %in% c(T,F))) {
append_log("stats_de parameter must contain a column 'signif', or provide a numeric value for parameter 'qvalue_threshold'", type = "error")
}
if(!is.na(log2foldchange_threshold)) {
log2foldchange_threshold = abs(log2foldchange_threshold)
stats_de = stats_de %>% mutate(signif = signif & abs(foldchange.log2) >= log2foldchange_threshold)
}
if(!"dea_algorithm" %in% colnames(stats_de)) {
stats_de$dea_algorithm = "statistics"
}
## pretty-print labels
stats_de = add_protein_prettyprint_label(stats_de)
########### create volcano plots
result = list()
for (algo_name in unique(stats_de$dea_algorithm)) { #algo_name = "ebayes"
# prepare data for current contrast
tib = stats_de %>%
filter(dea_algorithm == algo_name) %>%
drop_na(foldchange.log2, pvalue, qvalue) %>%
# minlog10 conversion must be performed within this loop! scales zero's to max-value, ONLY valid within same statistical test
mutate(minlog10qval = minlog10(qvalue),
foldchange.log2_abs = abs(foldchange.log2)) %>%
# order by p-value so we draw the most significant proteins last (thus their symbols/PCH are on top)
arrange(desc(pvalue)) %>%
# reduce tibble size
select(protein_id, label, foldchange.log2, foldchange.log2_abs, minlog10qval, signif)
if(nrow(tib) == 0) {
next
}
# which proteins should get a text label?
tib$flag_plot_label = FALSE
lbl_style = ""
if(label_mode == "signif") { # all significant proteins
tib$flag_plot_label = tib$signif == TRUE
lbl_style = "significant"
}
if(label_mode == "topn_pvalue") { # topN 'best' pvalue
tmp = min(nrow(tib), label_target)
tib$flag_plot_label = rep(c(F,T), c(nrow(tib) - tmp, tmp)) # set last N rows to TRUE, this works because we just sorted `tib` by pvalue descending
lbl_style = paste(tmp, "best qvalue")
}
if(label_mode == "protein_id") { # user-specified protein(group) IDs
tib$flag_plot_label = tib$protein_id %in% label_target
lbl_style = "selected proteins"
}
# classify up/down regulated
tib$updown = ifelse(tib$foldchange.log2 < 0, "down", "up")
tib$updown[tib$signif != TRUE] = "unchanged"
### find outliers, values that are so far away that they may skew the plot's appearance, and classify data points accordingly
xmax_nooutlier = c(-1,1) * max(tib$foldchange.log2_abs, na.rm = T)
ymax_nooutlier = c(0, max(2, tib$minlog10qval, na.rm=T)) # hardcoded limit; y-axis data goes to 10^-2 at least
xmax = c(-1,1) * max(abs(quantile(tib$foldchange.log2, probs = c(.005, .995), na.rm = T)))
ymax = c(0, max(2, quantile(tib$minlog10qval, probs = .995, na.rm = T))) # hardcoded limit; y-axis data goes to 10^-2 at least
tib$isoutlier_x_low = tib$foldchange.log2 < xmax[1]
tib$isoutlier_x_high = tib$foldchange.log2 > xmax[2]
tib$isoutlier_y = tib$minlog10qval > ymax[2]
tib$x_outlier = tib$foldchange.log2
tib$y_outlier = tib$minlog10qval
tib$x_outlier[tib$isoutlier_x_low] = xmax[1]
tib$x_outlier[tib$isoutlier_x_high] = xmax[2]
tib$y_outlier[tib$isoutlier_y] = ymax[2]
tib$updown_outlier = tib$updown
tib$updown_outlier[tib$updown == "unchanged" & (tib$isoutlier_x_low | tib$isoutlier_x_high | tib$isoutlier_y)] = "unchanged_outlier"
tib$updown_outlier[tib$updown == "down" & (tib$isoutlier_x_low | tib$isoutlier_y)] = "down_outlier"
tib$updown_outlier[tib$updown == "up" & (tib$isoutlier_x_high | tib$isoutlier_y)] = "up_outlier"
### construct facets
plottype_labels = c(asis = "data as-is, no labels",
asis_lab = paste("data as-is, label", lbl_style),
lim = "limited x- and y-axis, no labels",
lim_lab = paste("limited x- and y-axis, label", lbl_style))
tib_facets = bind_rows(tib %>% select(label, x=foldchange.log2, y=minlog10qval, pch=updown, flag_plot_label) %>% mutate(flag_plot_label=FALSE) %>% add_column(plottype = "asis"),
tib %>% select(label, x=foldchange.log2, y=minlog10qval, pch=updown, flag_plot_label) %>% add_column(plottype = "asis_lab"),
tib %>% select(label, x=x_outlier, y=y_outlier, pch=updown_outlier, flag_plot_label) %>% mutate(flag_plot_label=FALSE) %>% add_column(plottype = "lim"),
tib %>% select(label, x=x_outlier, y=y_outlier, pch=updown_outlier, flag_plot_label) %>% add_column(plottype = "lim_lab"))
tib_facets$pch = factor(tib_facets$pch, levels = c("up", "down", "up_outlier", "down_outlier", "unchanged", "unchanged_outlier"))
# some mock data to enforce symmetric x-axis and expand the y-limit a bit to make room for the labels (this is a workaround because we cannot hardcode separate y-axis limits per facet)
blank_data = bind_rows(tibble(x=xmax_nooutlier, y=ymax_nooutlier[2] * 1.2, plottype = "asis"),
tibble(x=xmax_nooutlier, y=ymax_nooutlier[2] * 1.2, plottype = "asis_lab"),
tibble(x=xmax, y=ymax[2] * 1.2, plottype = "lim"),
tibble(x=xmax, y=ymax[2] * 1.2, plottype = "lim_lab") ) %>%
add_column(pch="unchanged", label="")
### volcano plot
p = ggplot(tib_facets, aes(x, y, colour = pch, fill = pch, shape = pch, label = label)) +
geom_point(na.rm = TRUE, show.legend = TRUE) + # show legend is needed since ggplot 3.5.0 , i.e. `drop=FALSE` in the scale is now ignored... https://github.com/tidyverse/ggplot2/issues/5728
geom_blank(mapping = aes(x, y), data = blank_data, inherit.aes = FALSE, show.legend = FALSE) +
scale_discrete_manual(
aesthetics = c("colour", "fill"),
values = c(up = "#d55e00aa", down = "#56b4e9aa", up_outlier = "#d55e00aa", down_outlier = "#56b4e9aa", unchanged = "#22222299", unchanged_outlier = "#22222299"),
labels = c(up = "up regulated", down = "down regulated", up_outlier = "up regulated & outside plot limits", down_outlier = "down regulated & outside plot limits",
unchanged = "not significant", unchanged_outlier = "not significant & outside plot limits"),
breaks = c("up", "down", "up_outlier", "down_outlier", "unchanged", "unchanged_outlier"),
drop = F
) +
scale_shape_manual(
values = c(up = 24, down = 25, up_outlier = 2, down_outlier = 6, unchanged = 19, unchanged_outlier = 1),
labels = c(up = "up regulated", down = "down regulated", up_outlier = "up regulated & outside plot limits", down_outlier = "down regulated & outside plot limits",
unchanged = "not significant", unchanged_outlier = "not significant & outside plot limits"),
breaks = c("up", "down", "up_outlier", "down_outlier", "unchanged", "unchanged_outlier"),
drop = F
) +
guides(colour = guide_legend(byrow = FALSE, override.aes = list(alpha = 1))) + # don't have to repease the guide_legend for fill and shape
facet_wrap(~plottype, nrow = 2, ncol = 2, scales = "free", labeller = labeller(plottype=plottype_labels)) +
labs(x = "log2 fold-change", y = "-log10 FDR adjusted p-value", title = paste(algo_name, "@", mtitle)) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, size=8),
legend.position = "bottom",
legend.title = element_blank(),
legend.text = element_text(size=8))
if(any(tib_facets$flag_plot_label)) {
if(label_avoid_overlap) {
p = p + ggrepel::geom_text_repel(data = tib_facets %>% filter(flag_plot_label == TRUE), alpha=1, # vjust = 0.6,
show.legend = FALSE, size = 2, segment.alpha = .3, min.segment.length = 0, na.rm = TRUE, max.time = 1, max.iter = 1e5, max.overlaps = Inf, point.padding = 0, box.padding = 0.2, seed = 123) # min.segment.length = unit(0.2, 'lines')
} else {
p = p + geom_text(alpha=1, data = tib_facets %>% filter(flag_plot_label == TRUE), vjust = 1.1, show.legend = FALSE, size = 2)
}
}
if(!is.na(log2foldchange_threshold)) {
p = p + geom_vline(xintercept = c(-1,1) * log2foldchange_threshold, colour = "darkgrey", linetype = "dashed")
}
if(!is.na(qvalue_threshold)) {
p = p + geom_hline(yintercept = -log10(qvalue_threshold), colour = "darkgrey", linetype = "dashed")
}
result[[algo_name]] = list(ggplot = p, ggplot_data = tib_facets)
if(show_plots) {
print(p)
}
}
return(invisible(result))
}
#' a simple wrapper around plot_volcano() that iterates all contrasts
#'
#' returns the plot_volcano() output within a list (1 element per contrast @ stats_de$contrast column)
#'
#' @examples \dontrun{
#' # refer to the examples for msdap::plot_volcano()
#' # the same syntax can be applied here
#' }
#' @param stats_de input data.frame, see `plot_volcano()`
#' @param mtitle plot title, see `plot_volcano()`
#' @param ... parameters passed to `plot_volcano()`, check that function's documentation and examples
#' @export
plot_volcano_allcontrast = function(stats_de, mtitle, ...) {
if(!is.data.frame(stats_de) || nrow(stats_de) == 0) {
return(list())
}
result = list()
if(!"contrast" %in% colnames(stats_de)) {
stats_de$contrast = "default_contrast"
}
if(any(is.na(stats_de$contrast) | !is.character(stats_de$contrast) | stats_de$contrast == "")) {
append_log('parameter stats_de column "contrast" must contain string values that are not NA and not "" (empty string)', type = "error")
}
for(contr in unique(stats_de$contrast)) {
result[[contr]] = plot_volcano(stats_de %>% filter(contrast == contr), mtitle = paste(contr, mtitle), ...)
}
return(invisible(result))
}
#' placeholder title
#' @param tib pre-define color_code column
#' @param mtitle todo
plot_foldchanges = function(tib, mtitle="") {
param_density_bandwidth = "sj"
param_density_adjust = 1.0 # alternatively, 0.9
if(!is.data.frame(tib) || nrow(tib) == 0) {
return(NULL)
}
ggplot(tib, aes(foldchange.log2, colour = color_code)) +
geom_vline(xintercept=0) +
# same density function as default in base R's stats::density()
geom_line(stat = "density", bw = param_density_bandwidth, adjust = param_density_adjust, na.rm = T) +
facet_wrap( ~ dea_algorithm, nrow = ceiling(sqrt(n_distinct(tib$color_code))), scales = "free_y") +
coord_cartesian(xlim = c(-1,1) * max(abs(quantile(tib$foldchange.log2, probs = c(0.005, 0.995), na.rm=T)))) +
labs(x="log2 foldchange", colour = "", title=mtitle) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, size=8),
legend.position = "none") # bottom
}
#' placeholder title
#' @param tib todo
#' @param mtitle todo
plot_pvalue_histogram = function(tib, mtitle="") {
if(!is.data.frame(tib) || nrow(tib) == 0) {
return(NULL)
}
# debug_tib_pvalue_hist <<- tib
binwidth = 0.05
tib_summ = tib %>% filter(is.finite(pvalue)) %>% group_by(dea_algorithm) %>% summarise(yintercept_line = binwidth * n())
ggplot(tib, aes(pvalue)) +
geom_histogram(binwidth = binwidth, boundary = 0, colour = "white", fill="darkgrey", na.rm=T) +
geom_hline(data = tib_summ, aes(yintercept = yintercept_line), colour = "darkblue") +
scale_x_continuous(breaks = (0:10)/10, minor_breaks = (0:5)/5) +
facet_wrap( ~ dea_algorithm, nrow = 2, scales = "fixed") +
labs(x="p-value", y="number of proteins", colour = "", title=mtitle) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, size=8),
legend.position = "none")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.