R/plot_volcano.R

Defines functions plot_pvalue_histogram plot_foldchanges plot_volcano_allcontrast plot_volcano

Documented in plot_foldchanges plot_pvalue_histogram plot_volcano plot_volcano_allcontrast

#' volcano plot for DEA results, split into 4 panels; with/without labels and with/without thresholding foldchange-outliers
#'
#' If there are multiple contrasts in your DEA results you should either use the 'wrapper function' plot_volcano_allcontrast(), OR, first subset the DEA result table for 1 contrast before calling this plot function (i.e. don't plot multiple contrasts into 1 volcano plot).
#'
#' @param stats_de typically these are the output generated by the MS-DAP dea() function. If you use the entire pipeline, i.e. analysis_quickstart(), the resulting dataset objects holds these in the dataset$de_proteins property. Include a column "gene_symbols_or_id" to plot gene symbols instead of protein_id, see examples below
#' @param log2foldchange_threshold threshold for significance of log2 foldchanges. Set to NA to disregard (default) or provide a single numeric value (cutoff will be applied symetrically for both up- and down-regulated)
#' @param qvalue_threshold Q-value threshold for significant hits. Set to NA to disregard (default), otherwise it is assumed the input data table contains a boolean column 'signif'
#' @param mtitle optionally, a title for the plot
#' @param label_mode which class of proteins should be labeled in the plot. Options: topn_pvalue (top N smallest p-value, default), signif (all significant proteins), protein_id (a provided set of protein_id). Defaults to topN for consistency and clarity; if the upstream analysis yielded hundreds of hits the labels will be unreadable
#' @param label_target further specification of the label_mode parameter. For instance, if 'topn_pvalue' is set, here you can set the number of proteins that should be labeled. Analogously, if label_mode='protein_id' is set you can here provide an array of protein_id values (that are available in the stats_de data table)
#' @param label_avoid_overlap use the ggrepel R package to try and place labels with minimal overlap (only works when the number of labeled proteins is relatively low and sparse, e.g. for topN 25). Options: TRUE, FALSE
#' @param show_plots boolean parameter; should each plot be printed/shown immediately? If `FALSE` (default) you'll have to extract the ggplot objects from the resulting list in order to print the plots
#' @return returns a named list that contains a list, with properties 'ggplot' and 'ggplot_data', for each unique 'dea_algorithm' in the input stats_de table
#'
#' @examples
#' ### Exampes. Note that these assume that prior, the MS-DAP pipeline was successfully run
#' # using `dataset = analysis_quickstart(...)`.
#' # If your dataset contains multiple contrasts, follow example 5
#'
#' ## example 1: add protein-metadata to the DEA results (dataset$de_proteins), plotting the
#' # top 10 'best pvalue' hits while hardcoding the cutoffs for foldchange and Q-value
#' \dontrun{
#'   plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins),
#'     log2foldchange_threshold = 1, qvalue_threshold = 0.01,
#'     mtitle = "volcano, label top 10", label_mode = "topn_pvalue", label_target = 10,
#'     label_avoid_overlap = TRUE, show_plots = TRUE
#'   )
#' }
#'
#' ## example 2: analogous, but now show all significant proteins and disable "repelled labels"
#' # (instead, print protein labels just below each data point)
#' \dontrun{
#'   plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins),
#'     log2foldchange_threshold = 1, qvalue_threshold = 0.01,
#'     mtitle = "volcano, label all significant", label_mode = "signif",
#'     label_avoid_overlap = FALSE, show_plots = TRUE
#'   )
#' }
#'
#' ## example 3: show labels for some set of protein IDs. First line selects all proteins where symbol
#' # starts with GRIA or DLG (arbitrary example, either adapt the regex or use other filters/criteria
#' # to define a subset of protein_id from your dataset). Second line shows how to specify protein_id
#' # to be used as a label
#' \dontrun{
#'   pid_label = dataset$proteins %>%
#'     filter(grepl("^(GRIA|DLG)", gene_symbols_or_id, ignore.case=T)) %>% pull(protein_id)
#'   plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins),
#'     log2foldchange_threshold = 1, qvalue_threshold = 0.01,
#'     mtitle = "volcano, label selected proteins", label_mode = "protein_id",
#'     label_target = pid_label, label_avoid_overlap = FALSE, show_plots = TRUE
#'   )
#' }
#'
#' ## example 4: plot all significant labels as before, then add custom labels for some subset of
#' # proteins (we here regex select some labels in the plot, you should adapt the regex to match
#' # some proteins in your dataset to make this work, but you can also further filter by other
#' # properties in the 'ggplot_data' tibble like the y-coordinate aka qvalue)
#' \dontrun{
#'   plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins),
#'     log2foldchange_threshold = 1, qvalue_threshold = 0.01, mtitle = "volcano,
#'     label all significant + custom labels", label_mode = "signif",
#'     label_avoid_overlap = FALSE, show_plots = FALSE
#'   )
#'   l = plot_list[[1]]
#'   l$ggplot + ggrepel::geom_text_repel(
#'     alpha=1, color="green", data = l$ggplot_data %>%
#'       filter(plottype %in% c("asis_lab", "lim_lab") & grepl("^(GRIA|DLG)", label, ignore.case=T)),
#'     segment.alpha = 0.3, min.segment.length = unit(0.25, 'lines'),
#'     vjust = 0.6, show.legend = FALSE, size = 2
#'   )
#' }
#'
#' # example 5: iterate over contrasts before calling plot_volcano().
#' # this is essentially the same as using helper function plot_volcano_allcontrast()
#' \dontrun{
#'   contrasts = unique(dataset$de_proteins$contrast)
#'   for(contr in contrasts) {
#'     # subset the DEA results for the current contrast
#'     tib_volcano = dataset$de_proteins %>% filter(contrast==contr) %>% left_join(dataset$proteins)
#'
#'     # volcano plot function (compared to above example, now include the contrast in the title)
#'     plot_list = msdap::plot_volcano(tib_volcano, log2foldchange_threshold = 1,
#'       qvalue_threshold = 0.01, mtitle = paste(contr, "volcano, label top 10"),
#'       label_mode = "topn_pvalue", label_target = 10, label_avoid_overlap = TRUE,
#'       show_plots = TRUE
#'     )
#'   }
#' }
#'
#' @importFrom ggrepel geom_text_repel
#' @export
plot_volcano = function(stats_de, log2foldchange_threshold = NA, qvalue_threshold = NA, mtitle = "", label_mode = "topn_pvalue", label_target = 25, label_avoid_overlap = TRUE, show_plots = FALSE) {
  if(!is.data.frame(stats_de) || nrow(stats_de) == 0) {
    return(list())
  }

  # input validation
  stopifnot(length(label_mode) == 1 && label_mode %in% c("topn_pvalue", "signif", "protein_id"))
  stopifnot(!(label_mode == "topn_pvalue" && !is.finite(label_target))) # if labeling 'top N best pvalue', must specify an integer amount
  stopifnot(!(label_mode == "protein_id" && !all(is.character(label_target))) ) # if labeling protein_id, these must all be strings
  # note; for label_mode="signif" setting the label_target parameter is ignored so we don't have to input validate it

  # replace invalid input with NA (0 is invalid input, as it doesn't filter/discard anything)
  if(length(log2foldchange_threshold) != 1 || !is.finite(log2foldchange_threshold) || log2foldchange_threshold == 0) {
    log2foldchange_threshold = NA
  }
  if(length(qvalue_threshold) != 1 || !is.finite(qvalue_threshold) || qvalue_threshold == 0) {
    qvalue_threshold = NA
  }

  # iterating over contrasts should be done upstream
  if("contrast" %in% names(stats_de) && n_distinct(stats_de$contrast) != 1) {
    append_log("stats_de parameter contains more than 1 unique contrast. If your DEA results contain multiple contrasts, iterative over these and call this volcano plot function for clean subsets of data that only contain stats for 1 contrast", type = "error")
  }


  ########### format input data
  stats_de$pvalue[!is.finite(stats_de$pvalue)] = NA
  stats_de$qvalue[!is.finite(stats_de$qvalue)] = NA

  # optionally, if the user wants to re-define the qvalue cutoff for this plot, do that first. Next, optionally take all significant hits and add filtering by foldchange
  if(!is.na(qvalue_threshold)) {
    stats_de = stats_de %>% mutate(signif = is.finite(qvalue) & qvalue <= qvalue_threshold)
  }
  if(!"signif" %in% colnames(stats_de) || !all(stats_de$signif %in% c(T,F))) {
    append_log("stats_de parameter must contain a column 'signif', or provide a numeric value for parameter 'qvalue_threshold'", type = "error")
  }
  if(!is.na(log2foldchange_threshold)) {
    log2foldchange_threshold = abs(log2foldchange_threshold)
    stats_de = stats_de %>% mutate(signif = signif & abs(foldchange.log2) >= log2foldchange_threshold)
  }

  if(!"dea_algorithm" %in% colnames(stats_de)) {
    stats_de$dea_algorithm = "statistics"
  }

  ## pretty-print labels
  stats_de = add_protein_prettyprint_label(stats_de)


  ########### create volcano plots

  result = list()
  for (algo_name in unique(stats_de$dea_algorithm)) { #algo_name = "ebayes"
    # prepare data for current contrast
    tib = stats_de %>%
      filter(dea_algorithm == algo_name) %>%
      drop_na(foldchange.log2, pvalue, qvalue) %>%
      # minlog10 conversion must be performed within this loop! scales zero's to max-value, ONLY valid within same statistical test
      mutate(minlog10qval = minlog10(qvalue),
             foldchange.log2_abs = abs(foldchange.log2)) %>%
      # order by p-value so we draw the most significant proteins last (thus their symbols/PCH are on top)
      arrange(desc(pvalue)) %>%
      # reduce tibble size
      select(protein_id, label, foldchange.log2, foldchange.log2_abs, minlog10qval, signif)

    if(nrow(tib) == 0) {
      next
    }

    # which proteins should get a text label?
    tib$flag_plot_label = FALSE
    lbl_style = ""
    if(label_mode == "signif") { # all significant proteins
      tib$flag_plot_label = tib$signif == TRUE
      lbl_style = "significant"
    }
    if(label_mode == "topn_pvalue") { # topN 'best' pvalue
      tmp = min(nrow(tib), label_target)
      tib$flag_plot_label = rep(c(F,T), c(nrow(tib) - tmp, tmp)) # set last N rows to TRUE, this works because we just sorted `tib` by pvalue descending
      lbl_style = paste(tmp, "best qvalue")
    }
    if(label_mode == "protein_id") { # user-specified protein(group) IDs
      tib$flag_plot_label = tib$protein_id %in% label_target
      lbl_style = "selected proteins"
    }

    # classify up/down regulated
    tib$updown = ifelse(tib$foldchange.log2 < 0, "down", "up")
    tib$updown[tib$signif != TRUE] = "unchanged"

    ### find outliers, values that are so far away that they may skew the plot's appearance, and classify data points accordingly
    xmax_nooutlier = c(-1,1) * max(tib$foldchange.log2_abs, na.rm = T)
    ymax_nooutlier = c(0, max(2, tib$minlog10qval, na.rm=T)) # hardcoded limit; y-axis data goes to 10^-2 at least
    xmax = c(-1,1) * max(abs(quantile(tib$foldchange.log2, probs = c(.005, .995), na.rm = T)))
    ymax = c(0, max(2, quantile(tib$minlog10qval, probs = .995, na.rm = T))) # hardcoded limit; y-axis data goes to 10^-2 at least

    tib$isoutlier_x_low = tib$foldchange.log2 < xmax[1]
    tib$isoutlier_x_high = tib$foldchange.log2 > xmax[2]
    tib$isoutlier_y = tib$minlog10qval > ymax[2]

    tib$x_outlier = tib$foldchange.log2
    tib$y_outlier = tib$minlog10qval
    tib$x_outlier[tib$isoutlier_x_low] = xmax[1]
    tib$x_outlier[tib$isoutlier_x_high] = xmax[2]
    tib$y_outlier[tib$isoutlier_y] = ymax[2]

    tib$updown_outlier = tib$updown
    tib$updown_outlier[tib$updown == "unchanged" & (tib$isoutlier_x_low | tib$isoutlier_x_high | tib$isoutlier_y)] = "unchanged_outlier"
    tib$updown_outlier[tib$updown == "down" & (tib$isoutlier_x_low | tib$isoutlier_y)] = "down_outlier"
    tib$updown_outlier[tib$updown == "up" & (tib$isoutlier_x_high | tib$isoutlier_y)] = "up_outlier"

    ### construct facets

    plottype_labels = c(asis = "data as-is, no labels",
                        asis_lab = paste("data as-is, label", lbl_style),
                        lim = "limited x- and y-axis, no labels",
                        lim_lab = paste("limited x- and y-axis, label", lbl_style))

    tib_facets = bind_rows(tib %>% select(label, x=foldchange.log2, y=minlog10qval, pch=updown, flag_plot_label) %>% mutate(flag_plot_label=FALSE) %>% add_column(plottype = "asis"),
                           tib %>% select(label, x=foldchange.log2, y=minlog10qval, pch=updown, flag_plot_label)                                   %>% add_column(plottype = "asis_lab"),
                           tib %>% select(label, x=x_outlier, y=y_outlier, pch=updown_outlier, flag_plot_label) %>% mutate(flag_plot_label=FALSE)  %>% add_column(plottype = "lim"),
                           tib %>% select(label, x=x_outlier, y=y_outlier, pch=updown_outlier, flag_plot_label)                                    %>% add_column(plottype = "lim_lab"))
    tib_facets$pch = factor(tib_facets$pch, levels = c("up", "down", "up_outlier", "down_outlier", "unchanged", "unchanged_outlier"))

    # some mock data to enforce symmetric x-axis and expand the y-limit a bit to make room for the labels (this is a workaround because we cannot hardcode separate y-axis limits per facet)
    blank_data = bind_rows(tibble(x=xmax_nooutlier, y=ymax_nooutlier[2] * 1.2, plottype = "asis"),
                           tibble(x=xmax_nooutlier, y=ymax_nooutlier[2] * 1.2, plottype = "asis_lab"),
                           tibble(x=xmax, y=ymax[2] * 1.2, plottype = "lim"),
                           tibble(x=xmax, y=ymax[2] * 1.2, plottype = "lim_lab") ) %>%
      add_column(pch="unchanged", label="")

    ### volcano plot
    p = ggplot(tib_facets, aes(x, y, colour = pch, fill = pch, shape = pch, label = label)) +
      geom_point(na.rm = TRUE, show.legend = TRUE) + # show legend is needed since ggplot 3.5.0 , i.e. `drop=FALSE` in the scale is now ignored... https://github.com/tidyverse/ggplot2/issues/5728
      geom_blank(mapping = aes(x, y), data = blank_data, inherit.aes = FALSE, show.legend = FALSE) +
      scale_discrete_manual(
        aesthetics = c("colour", "fill"),
        values = c(up = "#d55e00aa", down = "#56b4e9aa", up_outlier = "#d55e00aa", down_outlier = "#56b4e9aa", unchanged = "#22222299", unchanged_outlier = "#22222299"),
        labels = c(up = "up regulated", down = "down regulated", up_outlier = "up regulated & outside plot limits", down_outlier = "down regulated & outside plot limits",
                   unchanged = "not significant", unchanged_outlier = "not significant & outside plot limits"),
        breaks = c("up", "down", "up_outlier", "down_outlier", "unchanged", "unchanged_outlier"),
        drop = F
      ) +
      scale_shape_manual(
        values = c(up = 24, down = 25, up_outlier = 2, down_outlier = 6, unchanged = 19, unchanged_outlier = 1),
        labels = c(up = "up regulated", down = "down regulated", up_outlier = "up regulated & outside plot limits", down_outlier = "down regulated & outside plot limits",
                   unchanged = "not significant", unchanged_outlier = "not significant & outside plot limits"),
        breaks = c("up", "down", "up_outlier", "down_outlier", "unchanged", "unchanged_outlier"),
        drop = F
      ) +
      guides(colour = guide_legend(byrow = FALSE, override.aes = list(alpha = 1))) + # don't have to repease the guide_legend for fill and shape
      facet_wrap(~plottype, nrow = 2, ncol = 2, scales = "free", labeller = labeller(plottype=plottype_labels)) +
      labs(x = "log2 fold-change", y = "-log10 FDR adjusted p-value", title = paste(algo_name, "@", mtitle)) +
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5, size=8),
            legend.position = "bottom",
            legend.title = element_blank(),
            legend.text = element_text(size=8))

    if(any(tib_facets$flag_plot_label)) {
      if(label_avoid_overlap) {
        p = p + ggrepel::geom_text_repel(data = tib_facets %>% filter(flag_plot_label == TRUE), alpha=1, # vjust = 0.6,
                                         show.legend = FALSE, size = 2, segment.alpha = .3, min.segment.length = 0, na.rm = TRUE, max.time = 1, max.iter = 1e5, max.overlaps = Inf, point.padding = 0, box.padding = 0.2, seed = 123) # min.segment.length = unit(0.2, 'lines')
      } else {
        p = p + geom_text(alpha=1, data = tib_facets %>% filter(flag_plot_label == TRUE), vjust = 1.1, show.legend = FALSE, size = 2)
      }
    }

    if(!is.na(log2foldchange_threshold)) {
      p = p + geom_vline(xintercept = c(-1,1) * log2foldchange_threshold, colour = "darkgrey", linetype = "dashed")
    }
    if(!is.na(qvalue_threshold)) {
      p = p + geom_hline(yintercept = -log10(qvalue_threshold), colour = "darkgrey", linetype = "dashed")
    }

    result[[algo_name]] = list(ggplot = p, ggplot_data = tib_facets)

    if(show_plots) {
      print(p)
    }
  }

  return(invisible(result))
}



#' a simple wrapper around plot_volcano() that iterates all contrasts
#'
#' returns the plot_volcano() output within a list (1 element per contrast @ stats_de$contrast column)
#'
#' @examples \dontrun{
#'   # refer to the examples for msdap::plot_volcano()
#'   # the same syntax can be applied here
#' }
#' @param stats_de input data.frame, see `plot_volcano()`
#' @param mtitle plot title, see `plot_volcano()`
#' @param ... parameters passed to `plot_volcano()`, check that function's documentation and examples
#' @export
plot_volcano_allcontrast = function(stats_de, mtitle, ...) {
  if(!is.data.frame(stats_de) || nrow(stats_de) == 0) {
    return(list())
  }
  result = list()
  if(!"contrast" %in% colnames(stats_de)) {
    stats_de$contrast = "default_contrast"
  }
  if(any(is.na(stats_de$contrast) | !is.character(stats_de$contrast) | stats_de$contrast == "")) {
    append_log('parameter stats_de column "contrast" must contain string values that are not NA and not "" (empty string)', type = "error")
  }
  for(contr in unique(stats_de$contrast)) {
    result[[contr]] = plot_volcano(stats_de %>% filter(contrast == contr), mtitle = paste(contr, mtitle), ...)
  }
  return(invisible(result))
}



#' placeholder title
#' @param tib pre-define color_code column
#' @param mtitle todo
plot_foldchanges = function(tib, mtitle="") {
  param_density_bandwidth = "sj"
  param_density_adjust = 1.0 # alternatively, 0.9

  if(!is.data.frame(tib) || nrow(tib) == 0) {
    return(NULL)
  }

  ggplot(tib, aes(foldchange.log2, colour = color_code)) +
    geom_vline(xintercept=0) +
    # same density function as default in base R's stats::density()
    geom_line(stat = "density", bw = param_density_bandwidth, adjust = param_density_adjust, na.rm = T) +
    facet_wrap( ~ dea_algorithm, nrow = ceiling(sqrt(n_distinct(tib$color_code))), scales = "free_y") +
    coord_cartesian(xlim = c(-1,1) * max(abs(quantile(tib$foldchange.log2, probs = c(0.005, 0.995), na.rm=T)))) +
    labs(x="log2 foldchange", colour = "", title=mtitle) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5, size=8),
          legend.position = "none") # bottom
}



#' placeholder title
#' @param tib todo
#' @param mtitle todo
plot_pvalue_histogram = function(tib, mtitle="") {
  if(!is.data.frame(tib) || nrow(tib) == 0) {
    return(NULL)
  }
  # debug_tib_pvalue_hist <<- tib
  binwidth = 0.05
  tib_summ = tib %>% filter(is.finite(pvalue)) %>% group_by(dea_algorithm) %>% summarise(yintercept_line = binwidth * n())

  ggplot(tib, aes(pvalue)) +
    geom_histogram(binwidth = binwidth, boundary = 0, colour = "white", fill="darkgrey", na.rm=T) +
    geom_hline(data = tib_summ, aes(yintercept = yintercept_line), colour = "darkblue") +
    scale_x_continuous(breaks = (0:10)/10, minor_breaks = (0:5)/5) +
    facet_wrap( ~ dea_algorithm, nrow = 2, scales = "fixed") +
    labs(x="p-value", y="number of proteins", colour = "", title=mtitle) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5, size=8),
          legend.position = "none")
}
ftwkoopmans/msdap documentation built on March 5, 2025, 12:15 a.m.