R/diff_expression.R
In RNAseqQC: Quality Control for RNA-Seq Data

Documented in plot_gene plot_ma plot_sample_MAs plot_within_level_sample_MAs

#' MA-plot of a differential testing result
#'
#' @param de_res An object returned by `DESeq2::results()` or `DESeq2::lfcShrink()`
#' @param dds The DESeqDataSet that was used to build the ´de_res´ object. This is needed for gene name annotation.
#' @param annotate_top_n Annotate the top n significant genes by fold change (up- and down-regulated)
#' @param highlight_genes Vector of gene names or gene IDs to highlight on the plot (overwrites top_n annotation)
#'
#' @return A ggplot object of the ggplot2 package that contains the MA-plot. The plot shows three classes of points:
#' Light gray points are genes with low counts that are removed from the analysis by independent filtering. Darker gray points
#' are not significant genes that show a density map to visualize where the majority of non-significant points are located. Finally,
#' red point show significant genes.
#'
#'
#' @examples
#' library("DESeq2")
#' set.seed(1)
#' dds <- makeExampleDESeqDataSet(n=1500, m=6, betaSD=.3, interceptMean=6)
#' rowData(dds)$gene_name <- rownames(dds)
#' dds <- DESeq(dds)
#' de_res <- results(dds)
#' plot_ma(de_res, dds)
#'
#' @export
plot_ma <- function(de_res, dds, annotate_top_n = 5, highlight_genes = NULL) {

  fdr <- de_res@metadata$alpha

  if(!identical(rownames(dds),rownames(de_res))){
    stop("Rownames of de_res and dds do not match.")
  }

  # add gene names from rowdata
  de_res <- de_res %>%
    as_tibble(rownames = "gene_id") %>%
    left_join(
      rowData(dds) %>%
        as_tibble(rownames = "gene_id") %>%
        select(gene_id, gene_name),
      by = "gene_id"
    )

  # treat special case if de_res resulted from lfcShrink()
  # with a method that produced svalues
  if ("svalue" %in% colnames(de_res)) {
    # remove differential testing columns in case
    # lfcShrink() was provided with a DESeqResults object
    # in the 'res' argument
    de_res$pvalue <- NULL
    de_res$svalue <- NULL
    colnames(de_res)[colnames(de_res)=="svalue"] <- "padj"
  }

  de_res <- de_res %>%
    mutate(significant = padj < fdr)

  # top n significant genes
  annotate_top_n_genes <- de_res %>%
    filter(significant) %>%
    filter(
      (rank(log2FoldChange) <= annotate_top_n) |
        (rank(-log2FoldChange) <= annotate_top_n)
    ) %>%
    pull(gene_id)

  # either regular MA-plot with top n genes annotated or
  # with highlighted genes
  if (is.null(highlight_genes)) {
    p <- de_res %>%
      filter(!significant) %>%
      ggplot(aes(baseMean, log2FoldChange, label = gene_name)) +
      # not passing independent filtering
      geom_point(data = filter(de_res, is.na(padj)), color = "gray85") +
      # not significant
      ggpointdensity::geom_pointdensity()

      # if there are no significant genes, geom_point will throw an error,
      # so this case needs to be treated separately
      if(any(de_res$significant, na.rm=T)==TRUE){
        # significant
        p <- p +
          geom_point(data = filter(de_res, significant), aes(fill = "significant"), color = "red", alpha = .7) +
          # highlight top significant
          ggrepel::geom_text_repel(
            data = de_res %>%
              filter(gene_id %in% annotate_top_n_genes),
            color = "red", max.overlaps = Inf
          )
      }
      p + scale_color_gradient(low = "gray70", high = "gray10") +
      guides(
        color = guide_colorbar(order = 1),
        fill = guide_legend(order = 2)
      ) +
      scale_x_log10() +
      labs(x = "mean normalized count", y = "log2-fold change", fill = NULL) +
      cowplot::theme_cowplot()
  } else {
    de_res %>%
      arrange(significant) %>%
      ggplot(aes(baseMean, log2FoldChange, color = significant)) +
      geom_point() +
      gghighlight::gghighlight(gene_name %in% highlight_genes | gene_id %in% highlight_genes,
        label_key = gene_name, use_group_by = F,
        label_params = list(fill = NA, label.size = NA, fontface = "bold")
      ) +
      scale_color_manual(values = c("red", "black"), breaks = c(TRUE, FALSE)) +
      scale_x_log10() +
      labs(x = "mean normalized count", y = "log2-fold change") +
      cowplot::theme_cowplot()
  }
}

#' Plot correlations of samples within a level of a group
#'
#' For the given level, the gene-wise median over all samples is computed
#' to obtain a reference sample. Then, each sample is plotted against the
#' reference as MA-plot.
#' @param vsd An object generated by `DESeq2::vst()`
#' @param group A grouping variable, must be a column of `colData(vsd)`
#' @param level A level of the grouping variable
#' @param y_lim Y-axis limits, the axis will run from `-y_lim` to `y_lim`
#' @param rasterise Whether to rasterise the points using ggrastr.
#' @param ... Other parameters passed on to ggrastr::rasterise
#'
#' @return A list of ggplot objects of the ggplot2 package that contains for
#' each sample of the specified level the the sample vs reference MA-plot.
#'
#' @examples
#' library("DESeq2")
#' set.seed(1)
#' dds <- makeExampleDESeqDataSet(n=1000, m=4, interceptMean=10)
#' colData(dds)$type <- c("A","A","B","B")
#' vsd <- vst(dds)
#' plot_within_level_sample_MAs(vsd, group="type", level="A")
#'
#' @export
plot_within_level_sample_MAs <- function(vsd, group, level, y_lim = 4, rasterise = FALSE, ...) {

  # prevent 'no visible binding for global variable' package warnings
  reference <- NULL

  # samples of the level
  level_samples <- colnames(vsd)[vsd[[group]] == level]

  # gene-wise median
  median_sample <- rowMedians(assay(vsd)[, level_samples, drop = F], na.rm=T)

  if(rasterise == TRUE){
    raster_fun <- function(x) {ggrastr::rasterise(x, ...)}
  } else {
    raster_fun <- identity
  }

  map(level_samples, function(s) {
    plot <- tibble(sample = median_sample, reference = assay(vsd)[, s]) %>%
      ggplot(aes(x = (sample + reference) / 2, y = sample - reference)) +
        raster_fun(ggpointdensity::geom_pointdensity()) +
        scale_color_viridis_c() +
        ylim(-y_lim, y_lim) +
        labs(x = "mean log2(norm. count)", y = paste0("log2-fold change\nsample vs. reference"), title = paste0(s, " vs. ", level)) +
        geom_hline(yintercept = 0, linetype = "55") +
        cowplot::theme_cowplot()
  })
}

#' MA plots of samples
#'
#' For each level of the grouping variable, the gene-wise median over all samples is computed
#' to obtain a reference sample. Then, each sample is plotted against the reference.
#' @param vsd An object generated by `DESeq2::vst()`
#' @param group A grouping variable, must be a column of `colData(vsd)`
#' @param y_lim Y-axis limits, the axis will run from `-y_lim` to `y_lim`
#' @param rasterise Whether to rasterise the points using ggrastr.
#' @param ... Other parameters passed on to ggrastr::rasterise
#'
#' @examples
#' library("DESeq2")
#' set.seed(1)
#' dds <- makeExampleDESeqDataSet(n=1000, m=4, interceptMean=10)
#' colData(dds)$type <- c("A","A","B","B")
#' vsd <- vst(dds)
#' plot_sample_MAs(vsd, group="type")
#'
#' @return A list of ggplot objects of the ggplot2 package, with each element corresponding to one MA-plot.
#' @export
plot_sample_MAs <- function(vsd, group, y_lim = 3, rasterise = FALSE, ...) {
  if (!group %in% colnames(colData(vsd))) {
    stop("Variable 'group' must be a column of colData(vsd).")
  }

  vsd[[group]] %>%
    unique() %>%
    sort() %>%
    map(~ plot_within_level_sample_MAs(vsd, group, level = .x, y_lim = y_lim, rasterise = rasterise, ...)) %>%
    purrr::flatten()
}

#' Plot a gene
#'
#' @param gene A gene ID or gene name, i.e. an element of `rownames(dds)` or of `rowData(dds)$gene_name`
#' @param dds a DESeqDataSet
#' @param x_var Variable to plot on the x-axis. If NULL, then each sample is plotted separately.
#' @param color_by Variable (column in `colData(dds)`) to color points by.
#' @param point_alpha alpha value of `geom_point()`
#' @param point_rel_size relative size of `geom_point()`
#' @param show_plot Whether to show the plot or not
#'
#' @return The function displays the plot and returns invisible the data frame of
#' expression values and colData annotation for the gene.
#'
#' @examples
#' library("DESeq2")
#' set.seed(1)
#' dds <- makeExampleDESeqDataSet()
#' colData(dds)$type <- c("A","A","A","B","B","B")
#' colData(dds)$patient <- c("1","1","2","2","3","3")
#' dds <- estimateSizeFactors(dds)
#' plot_gene("gene1", dds)
#' plot_gene("gene1", dds, x_var="patient", color_by="type")
#'
#' @export
plot_gene <- function(gene, dds, x_var = NULL, color_by = NULL, point_alpha = .7, point_rel_size = 2, show_plot = TRUE) {

  if (gene %in% rownames(dds)) {
    gene_id <- gene
    gene_name <- rowData(dds)$gene_name[rownames(dds) == gene]
  } else if (gene %in% rowData(dds)$gene_name) {
    if( length(get_gene_id(gene, dds)) > 1) {
      stop(str_c("Gene ", gene, "appears multiple times in the dataset. Use 'get_gene_id()' and specify a gene ID."))
    }
    gene_id <- rownames(dds)[rowData(dds)$gene_name == gene]
    gene_name <- gene
  } else {
    stop("Variable 'gene' must be either an element of rownames(dds) or of rowData(dds)$gene_name.")
  }

  gene_data <- log2(1 + counts(dds, normalized = T)[gene_id, ]) %>%
    enframe(name = "sample_id", value = "count") %>%
    bind_cols(colData(dds) %>% as_tibble() %>% select(-tidyselect::any_of("sample_id")))

  if (is.null(x_var)) {
    x_var <- "sample_id"
  }

  plot <- gene_data %>%
    ggplot(aes(x = .data[[x_var]], y = count, color = if (!is.null(color_by)){.data[[color_by]]} else {NULL})) +
      geom_point(size = rel(point_rel_size), alpha = point_alpha) +
      labs(y = "log2(1 + norm. count)", title = gene_name, color = color_by) +
      cowplot::theme_cowplot()

  if (x_var == "sample_id") {
    plot <- plot + theme(axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1))
  }

  if (show_plot) {
    print(plot)
  }

  invisible(list("plot" = plot, "data" = gene_data))
}