enrichr: Compute Protein Enrichment

Documented in psmplot zplot

#' Visualize control PSMs versus test PSMs on a log-log plot.
#'
#' Graphs control PSMs versus test PSMs on a log-log plot. A pseudocount of 1 is adding to all PSMs for log-log visualization. Colored by Z-test threshold. See README for file formats and example usage on the \href{https://github.com/rachaelcox/enrichr}{enrichr GitHub Repository}.
#'
#' @param data A data frame produced by `enrich()`.
#' @param ylab Text for y-axis label; will correspond to test PSMs.
#' @param xlab Text for x-axis label; will correspond to control PSMs.
#' @param threshold Optional. FDR threshold to color points by. One of c(90, 95, 99). Default = 5% FDR (fdr_bh <= 0.05).
#' @param num_labs Optional. Number of annotation labels to display. Requires input to have an annotation column named `gene_names_primary`. Default = 0.
#' @param  label_file Optional. Comma-separated file containing names of genes to be labeled on the plot; requires input to have an annotation column named `gene_names_primary` and labels must match format in the column.
#' @param outfile_prefix Optional. File prefix for output `.png` and `.pdf` files. Default = "output."
#' @return PSM log-log plots in `.png` and `.pdf` format.
#' @export
psmplot <- function(data, outfile_prefix, threshold, num_labs,
                    label_file, ylab, xlab, point_color){

  theme_set(cowplot::theme_cowplot())
  palette_pretty <- c("#0072B2","#E69F00","#009E24",
                      "#FF0000", "#979797","#5530AA", "#1E1E1E")

  log10_minor_break <- function (...){
    function(x) {
      minx         = floor(min(log10(x), na.rm=T))-1;
      maxx         = ceiling(max(log10(x), na.rm=T))+1;
      n_major      = maxx-minx+1;
      major_breaks = seq(minx, maxx, by=1)
      minor_breaks =
        rep(log10(seq(1, 9, by=1)), times = n_major)+
        rep(major_breaks, each = 9)
      return(10^(minor_breaks))
    }
  }

  if(missing(outfile_prefix)){
    outfile_prefix = "output"
  }

  if(missing(num_labs)){
    num_labs = 10
  }

  if(missing(threshold)){
    threshold = 99
  }

  if(missing(point_color)){
    point_color = palette_pretty[4]
  }

  xcol <- grep('ctrl_PSMs$', names(data), value = TRUE)
  ycol <- grep('exp_PSMs$', names(data), value = TRUE)
  zcol <- grep('_zscore$', names(data), value = TRUE)
  acol <- grep('gene_names_primary$', names(data), value = TRUE)

  # calculate thresholds
  data_conf <- data %>%
    dplyr::mutate(conf_90 = dplyr::case_when(fdr_bh <= 0.10 ~ TRUE,
                                      TRUE ~ FALSE),
                  conf_95 = dplyr::case_when(fdr_bh <= 0.05 ~ TRUE,
                                      TRUE ~ FALSE),
                  conf_99 = dplyr::case_when(fdr_bh <= 0.01 ~ TRUE,
                                      TRUE ~ FALSE))

  # add 1 to all PSMs for log-log visualization
  print('Adding pseudocounts to PSMs...')

  data_conf[ycol] <- data_conf[ycol]+1
  data_conf[xcol] <- data_conf[xcol]+1

  data_conf %>%
    dplyr::select(accession, xcol, ycol) %>%
    print()

  if(missing(label_file)){
    label_subset <- data_conf %>%
      dplyr::arrange(desc(zcol)) %>%
      dplyr::slice_head(n = num_labs)
  } else {
    labels <- readr::read_csv(label_file, col_names = FALSE)
    label_subset <- data_conf %>%
      dplyr::filter(grepl(paste(labels$X1, collapse = "|"), get(acol),
                   ignore.case = TRUE))
  }

  print(sprintf('Labeled plot points for "%s_PSMloglog.png"', outfile_prefix))
  label_subset %>%
    dplyr::select(accession, zcol, acol) %>%
    print()

  if(threshold == 90){
    p1 <- ggplot(data_conf, aes(x = get(xcol), y = get(ycol),
                                color = conf_90,  label = get(acol))) +
      geom_point(alpha = 0.75, size = 1.5) +
      annotate(geom = "text", x = Inf, y = 1, vjust = 1, hjust = 1,
               label = "10% FDR", color = point_color, size = 4)

  } else if(threshold == 95) {
    p1 <- ggplot(data_conf, aes(x = get(xcol), y = get(ycol),
                                color = conf_95, label = get(acol))) +
      geom_point(alpha = 0.75, size = 1.5) +
      annotate(geom = "text", x = Inf, y = 1, vjust = 1, hjust = 1,
               label = "5% FDR", color = point_color, size = 4)

  } else {
    p1 <- ggplot(data_conf, aes(x = get(xcol), y = get(ycol),
                                color = conf_99, label = get(acol))) +
      geom_point(alpha = 0.75, size = 1.5) +
      annotate(geom = "text", x = Inf, y = 1, vjust = 1, hjust = 1,
               label = "1% FDR", color = point_color, size = 4)

  }

  p1
  p2 <- p1 + ggrepel::geom_text_repel(data = label_subset,
                             size = 6.5/.pt, # font size
                             fontface = "bold",
                             alpha = 1.0,
                             hjust = 0.2,
                             point.padding = unit(0.5, "lines"),
                             box.padding = unit(0.5, "lines"),
                             nudge_y = 0.3,
                             nudge_x = -0.3,
                             colour = palette_pretty[7],
                             set.seed(3333)) +
    scale_x_log10(minor_breaks = log10_minor_break()) +
    scale_y_log10(minor_breaks = log10_minor_break()) +
    scale_color_manual(values = rev(c(point_color, palette_pretty[5]))) +
    theme(legend.position = "none") +
    theme(panel.grid.major = element_line(colour = palette_pretty[5], size = 0.1, linetype = "dashed"),
          panel.grid.minor = element_line(colour = palette_pretty[5], size = 0.1, linetype = "dashed")) +
    xlab(xlab) +
    ylab(ylab)

  p2

  ggsave(sprintf("%s_PSMloglog.pdf", outfile_prefix), plot = p2,
         device = "pdf", width = 8, height = 6, units = "in")

  ggsave(sprintf("%s_PSMloglog.png", outfile_prefix), plot = p2,
         device = "png", width = 8, height = 6, units = "in")

  return(p2)
}

#' Plot Z-scores across biological replicates.
#'
#' Colored by a multiple hypothesis corrected false discovery rate (FDR) computed on a weighted Z-score. See README for file formats and example usage on the \href{https://github.com/rachaelcox/enrichr}{enrichr GitHub Repository}.
#'
#' @param data A data frame produced by `combine_reps()`.
#' @param ycol Name of column containing y-axis data; should correspond to replicate #1 Z-scores.
#' @param xcol Name of column containing x-axis data; should correspond to replicate #1 Z-scores.
#' @param ylab Text for y-axis label; should correspond to replicate #1 Z-scores.
#' @param xlab Text for x-axis label; should correspond to replicate #2 Z-scores.
#' @param threshold Optional. FDR threshold to color points by. One of c(90, 95, 99). Default = 5% FDR (fdr_bh <= 0.05).
#' @param num_labs Optional. Number of annotation labels to display. Requires input to have an annotation column named `gene_names_primary`. Default = 0.
#' @param  label_file Optional. Comma-separated file containing names of genes to be labeled on the plot; requires input to have an annotation column named `gene_names_primary` and labels must match format in the column.
#' @param outfile_prefix Optional. File prefix for output `.png` and `.pdf` files. Default = "output."
#' @return Plot depicting replicate 1 z-score vs replicate 2 z-scores in `.png` and `.pdf` format. Colored by FDR.
#' @export
zplot <- function(data, outfile_prefix, ylab, xlab, ycol, xcol,
                      threshold, label_file, num_labels){

  theme_set(cowplot::theme_cowplot())
  palette_pretty <- c("#1E1E1E", "#FF0000", "#0072B2",
                      "#E69F00", "#009E24", "#979797","#5530AA", "#1E1E1E")

  if(missing(outfile_prefix)){
    outfile_prefix = "output"
  }

  if(missing(num_labels)){
    num_labels = 10
  }

  if(missing(threshold)){
    threshold = 95
  }

  acol <- grep('gene_names_primary$', names(data), value = TRUE)
  zcol <- grep('joint_zscore', names(data), value = TRUE)

  if(length(zcol) > 1){
    zcol <- zcol[2]
  }

  if(missing(label_file)){   # take top n labels if no label file
    label_subset <- data %>%
      dplyr::arrange(desc(zcol)) %>%
      dplyr::slice_head(n = num_labels)

  } else {
    labels <- readr::read_csv(label_file, col_names = FALSE)
    label_subset <- data %>%
      dplyr::filter(grepl(paste(labels$X1, collapse = "|"), get(acol), ignore.case = TRUE))
  }

  print(sprintf('Labeled plot points for "%s_zscore_comparison.png & %s_zscore_comparison.pdf:"', outfile_prefix, outfile_prefix))
  label_subset %>%
    dplyr::select(accession, matches("zscore"), matches("gene_names_primary")) %>%
    print()

  # z-scores vs z-scores plot

  if(threshold == 90){
    p_base <- ggplot(data, aes(x = get(xcol), y = get(ycol),
                               color = conf_90, label = get(acol))) +
      annotate(geom = "text",
               x = max(data[xcol]),
               y = min(data[ycol]),
               vjust = 1,
               label = "10% FDR",
               color = palette_pretty[2], size = 4) +
      geom_hline(yintercept = 1.282, alpha = 0.6) +
      geom_hline(yintercept = -1.282, alpha = 0.6) +
      geom_vline(xintercept = 1.282, alpha = 0.6) +
      geom_vline(xintercept = -1.282, alpha = 0.6) +
      scale_x_continuous(breaks = c(-5, -1.282, 0, 1.282, 5, 10),
                         labels = c(-5, -1.282, 0, 1.282, 5, 10)) +
      scale_y_continuous(breaks = c(-5, -1.282, 0, 1.282, 5, 10),
                         labels = c(-5, -1.282, 0, 1.282, 5, 10))

  } else if(threshold == 95) {
    p_base <- ggplot(data, aes(x = get(xcol), y = get(ycol),
                                      color = conf_95, label = get(acol))) +
      #labs(color = "95% Confidence\n") +
      annotate(geom = "text",
               x = max(data[xcol]),
               y = min(data[ycol]),
               vjust = 1,
               label = "5% FDR",
               color = palette_pretty[2], size = 4) +
      geom_hline(yintercept = 1.645, alpha = 0.6) +
      geom_hline(yintercept = -1.645, alpha = 0.6) +
      geom_vline(xintercept = 1.645, alpha = 0.6) +
      geom_vline(xintercept = -1.645, alpha = 0.6) +
      scale_x_continuous(breaks = c(-5, -1.645, 0, 1.645, 5, 10),
                         labels = c(-5, -1.645, 0, 1.645, 5, 10)) +
      scale_y_continuous(breaks = c(-5, -1.645, 0, 1.645, 5, 10),
                         labels = c(-5, -1.645, 0, 1.645, 5, 10))

  } else {
    p_base <- ggplot(data, aes(x = get(xcol), y = get(ycol),
                                      color = conf_99, label = get(acol))) +
      annotate(geom = "text",
               x = max(data[xcol]),
               y = min(data[ycol]),
               vjust = 1,
               label = "1% FDR",
               color = palette_pretty[2], size = 4) +
      geom_hline(yintercept = 2.33, alpha = 0.6) +
      geom_hline(yintercept = -2.33, alpha = 0.6) +
      geom_vline(xintercept = 2.33, alpha = 0.6) +
      geom_vline(xintercept = -2.33, alpha = 0.6) +
      scale_x_continuous(breaks = c(-5, -2.33, 0, 2.33, 5, 10),
                         labels = c(-5, -2.33, 0, 2.33, 5, 10)) +
      scale_y_continuous(breaks = c(-5, -2.33, 0, 2.33, 5, 10),
                         labels = c(-5, -2.33, 0, 2.33, 5, 10))
  }


  pz <- p_base +
    geom_point(alpha = 0.75, size = 1.5) +
    ggrepel::geom_text_repel(data = label_subset,
                    size = 6.5/.pt, # font size
                    fontface = "bold",
                    alpha = 1.0,
                    hjust = 0.2,
                    point.padding = unit(0.5, "lines"),
                    box.padding = unit(0.5, "lines"),
                    nudge_y = -0.3,
                    nudge_x = 0.8,
                    colour = palette_pretty[1],
                    set.seed(3333)) +
    scale_color_manual(values = palette_pretty) +
    theme(legend.position = "none") +
    #theme(legend.position = "bottom") +
    #theme(legend.position = c(0.7, 0.15),
    #      legend.title = element_blank()) +
    #guides(color = guide_legend(ncol = 1, label.theme = element_text(size = 9))) +
    theme(panel.grid.major = element_line(colour = palette_pretty[6],
                                          size = 0.1, linetype = "dashed"),
          panel.grid.minor = element_line(colour = palette_pretty[6],
                                          size = 0.1, linetype = "dashed")) +
    xlab(xlab) +
    ylab(ylab)

  ggsave(sprintf("%s_zscore_comparison.pdf", outfile_prefix), plot = pz,
         width = 8, height = 6, units = "in")

  ggsave(sprintf("%s_zscore_comparison.png", outfile_prefix), plot = pz,
         width = 8, height = 6, units = "in")

  print(pz)
  return(pz)
}