R/gs_summaries.R

Defines functions gs_summary_heat gs_horizon gs_summary_overview_pair gs_summary_overview

Documented in gs_horizon gs_summary_heat gs_summary_overview gs_summary_overview_pair

#' Plots a summary of enrichment results
#'
#' Plots a summary of enrichment results for one set
#'
#' @param res_enrich A `data.frame` object, storing the result of the functional
#' enrichment analysis. See more in the main function, [GeneTonic()], to check the
#' formatting requirements (a minimal set of columns should be present).
#' @param n_gs Integer value, corresponding to the maximal number of gene sets to
#' be displayed
#' @param p_value_column Character string, specifying the column of `res_enrich`
#' where the p-value to be represented is specified. Defaults to `gs_pvalue`
#' (it could have other values, in case more than one p-value - or an adjusted
#' p-value - have been specified).
#' @param color_by Character, specifying the column of `res_enrich` to be used
#' for coloring the plotted gene sets. Defaults sensibly to `z_score`.
#'
#' @return A `ggplot` object
#'
#' @seealso [gs_summary_overview_pair()], [gs_horizon()]
#' @export
#'
#' @examples
#'
#' library("macrophage")
#' library("DESeq2")
#' library("org.Hs.eg.db")
#' library("AnnotationDbi")
#'
#' # dds object
#' data("gse", package = "macrophage")
#' dds_macrophage <- DESeqDataSet(gse, design = ~line + condition)
#' rownames(dds_macrophage) <- substr(rownames(dds_macrophage), 1, 15)
#' dds_macrophage <- estimateSizeFactors(dds_macrophage)
#'
#' # annotation object
#' anno_df <- data.frame(
#'   gene_id = rownames(dds_macrophage),
#'   gene_name = mapIds(org.Hs.eg.db,
#'                      keys = rownames(dds_macrophage),
#'                      column = "SYMBOL",
#'                      keytype = "ENSEMBL"),
#'   stringsAsFactors = FALSE,
#'   row.names = rownames(dds_macrophage)
#' )
#'
#' # res object
#' data(res_de_macrophage, package = "GeneTonic")
#' res_de <- res_macrophage_IFNg_vs_naive
#'
#'
#' # res_enrich object
#' data(res_enrich_macrophage, package = "GeneTonic")
#' res_enrich <- shake_topGOtableResult(topgoDE_macrophage_IFNg_vs_naive)
#' res_enrich <- get_aggrscores(res_enrich, res_de, anno_df)
#'
#' gs_summary_overview(res_enrich)
#'
gs_summary_overview <- function(res_enrich,
                                n_gs = 20,
                                p_value_column = "gs_pvalue",
                                color_by = "z_score"
                                # , size_by = "gs_de_count"
                                ) {
  if (!(color_by %in% colnames(res_enrich))) {
    stop("Your res_enrich object does not contain the ",
         color_by,
         " column.\n",
         "Compute this first or select another column to use for the color.")
  }

  re <- res_enrich
  re$logp10 <- -log10(res_enrich[[p_value_column]])
  re <- re[seq_len(n_gs), ]

  re_sorted <- re %>%
    arrange(.data$logp10) %>%
    mutate(gs_description = factor(.data$gs_description, .data$gs_description))
  p <- ggplot(re_sorted, (aes_string(x = "gs_description", y = "logp10"))) +
    geom_segment(aes_string(x = "gs_description", xend = "gs_description", y = 0, yend = "logp10"), color = "grey") +
    geom_point(aes(col = .data[[color_by]]), size = 4) +
    # geom_point(aes(col = .data[[color_by]], size = .data[[size_by]])) +
    scale_color_gradient2(low = "#313695", mid = "#FFFFE5", high = "#A50026") +
    coord_flip() +
    labs(x = "Gene set description",
         y = "-log10 p-value",
         col = color_by) +
    theme_minimal()

  return(p)
}

#' Plots a summary of enrichment results
#'
#' Plots a summary of enrichment results - for two sets of results
#'
#' @param res_enrich A `data.frame` object, storing the result of the functional
#' enrichment analysis. See more in the main function, [GeneTonic()], to check the
#' formatting requirements (a minimal set of columns should be present).
#' @param res_enrich2 As `res_enrich`, the result of functional enrichment analysis,
#' in a scenario/contrast different than the first set.
#' @param n_gs Integer value, corresponding to the maximal number of gene sets to
#' be displayed
#' @param p_value_column Character string, specifying the column of `res_enrich`
#' where the p-value to be represented is specified. Defaults to `gs_pvalue`
#' (it could have other values, in case more than one p-value - or an adjusted
#' p-value - have been specified).
#' @param color_by Character, specifying the column of `res_enrich` to be used
#' for coloring the plotted gene sets. Defaults sensibly to `z_score`.
#' @param alpha_set2 Numeric value, between 0 and 1, which specified the alpha
#' transparency used for plotting the points for gene set 2.
#'
#' @return A `ggplot` object
#'
#' @seealso [gs_summary_overview()], [gs_horizon()]
#'
#' @export
#'
#' @examples
#'
#' library("macrophage")
#' library("DESeq2")
#' library("org.Hs.eg.db")
#' library("AnnotationDbi")
#'
#' # dds object
#' data("gse", package = "macrophage")
#' dds_macrophage <- DESeqDataSet(gse, design = ~line + condition)
#' rownames(dds_macrophage) <- substr(rownames(dds_macrophage), 1, 15)
#' dds_macrophage <- estimateSizeFactors(dds_macrophage)
#'
#' # annotation object
#' anno_df <- data.frame(
#'   gene_id = rownames(dds_macrophage),
#'   gene_name = mapIds(org.Hs.eg.db,
#'                      keys = rownames(dds_macrophage),
#'                      column = "SYMBOL",
#'                      keytype = "ENSEMBL"),
#'   stringsAsFactors = FALSE,
#'   row.names = rownames(dds_macrophage)
#' )
#'
#' # res object
#' data(res_de_macrophage, package = "GeneTonic")
#' res_de <- res_macrophage_IFNg_vs_naive
#'
#' # res_enrich object
#' data(res_enrich_macrophage, package = "GeneTonic")
#' res_enrich <- shake_topGOtableResult(topgoDE_macrophage_IFNg_vs_naive)
#' res_enrich <- get_aggrscores(res_enrich, res_de, anno_df)
#'
#' res_enrich2 <- res_enrich[1:42, ]
#' set.seed(42)
#' shuffled_ones <- sample(seq_len(42)) # to generate permuted p-values
#' res_enrich2$gs_pvalue <- res_enrich2$gs_pvalue[shuffled_ones]
#' res_enrich2$z_score <- res_enrich2$z_score[shuffled_ones]
#' res_enrich2$aggr_score <- res_enrich2$aggr_score[shuffled_ones]
#' # ideally, I would also permute the z scores and aggregated scores
#' gs_summary_overview_pair(res_enrich = res_enrich,
#'                          res_enrich2 = res_enrich2)
gs_summary_overview_pair <- function(res_enrich,
                                     res_enrich2,
                                     n_gs = 20,
                                     p_value_column = "gs_pvalue",
                                     color_by = "z_score",
                                     alpha_set2 = 1) {
  if (!(color_by %in% colnames(res_enrich))) {
    stop("Your res_enrich object does not contain the ",
         color_by,
         " column.\n",
         "Compute this first or select another column to use for the color.")
  }
  # same for set2
  if (!(color_by %in% colnames(res_enrich2))) {
    stop("Your res_enrich object does not contain the ",
         color_by,
         " column.\n",
         "Compute this first or select another column to use for the color.")
  }

  gs_set1 <- res_enrich$gs_id
  gs_set2 <- res_enrich2$gs_id
  gs_common <- intersect(gs_set1, gs_set2)

  if (length(gs_common) == 0) {
    stop("No gene sets have been found in common to the two enrichment results")
  }

  # restrict to the top common n_gs
  gs_common <- gs_common[seq_len(min(n_gs, length(gs_common)))]

  common_re1 <- res_enrich[gs_common, ]
  common_re2 <- res_enrich2[gs_common, ]

  common_re1$logp10 <- -log10(common_re1[[p_value_column]])
  common_re2$logp10 <- -log10(common_re2[[p_value_column]])

  re_both <- common_re1
  re_both[["logp10_2"]] <- common_re2$logp10
  re_both[[color_by]] <- common_re1[[color_by]]
  re_both[[paste0(color_by, "_2")]] <- common_re2[[color_by]]

  re_both_sorted <- re_both %>%
    arrange(.data$logp10) %>%
    mutate(gs_description = factor(.data$gs_description, .data$gs_description))

  p <- ggplot(re_both_sorted, aes_string(x = "gs_description", y = "logp10")) +
    geom_segment(aes_string(x = "gs_description", xend = "gs_description", y = "logp10_2", yend = "logp10"), color = "grey") +
    geom_point(aes(fill = .data[[color_by]]), size = 4, pch = 21) +
    geom_point(aes_string(y = "logp10_2", col = paste0(color_by, "_2")),
               size = 4, alpha = alpha_set2) +
    scale_color_gradient2(low = "#313695", mid = "#FFFFE5", high = "#A50026") +
    scale_fill_gradient2(low = "#313695", mid = "#FFFFE5", high = "#A50026", guide = FALSE) +
    coord_flip() +
    labs(x = "Gene set description",
         y = "-log10 p-value",
         col = color_by) +
    ylim(0, NA) +
    theme_minimal()

  return(p)
}


#' Plots a summary of enrichment results
#'
#' Plots a summary of enrichment results - horizon plot to compare one or more
#' sets of results
#'
#' @details It makes sense to have the results in `res_enrich` sorted by
#' increasing `gs_pvalue`, to make sure the top results are first sorted by the
#' significance (when selecting the common gene sets across the `res_enrich`
#' elements provided in `compared_res_enrich_list`)
#'
#' The gene sets included are a subset of the ones in common to all different
#' scenarios included in `res_enrich` and the elements of `compared_res_enrich_list`.
#'
#' @param res_enrich A `data.frame` object, storing the result of the functional
#' enrichment analysis. See more in the main function, [GeneTonic()], to check the
#' formatting requirements (a minimal set of columns should be present).
#' @param compared_res_enrich_list A named list, where each element is a `data.frame`
#' formatted like the standard `res_enrich` objects used by `GeneTonic`. The
#' names of the list are the names of the scenarios.
#' @param n_gs Integer value, corresponding to the maximal number of gene sets to
#' be displayed
#' @param p_value_column Character string, specifying the column of `res_enrich`
#' where the p-value to be represented is specified. Defaults to `gs_pvalue`
#' (it could have other values, in case more than one p-value - or an adjusted
#' p-value - have been specified).
#' @param color_by Character, specifying the column of `res_enrich` to be used
#' for coloring the plotted gene sets. Defaults sensibly to `z_score`.
#' @param ref_name Character, defining the name of the scenario to compare
#' against (the one in `res_enrich`) - defaults to "ref_scenario".
#' @param sort_by Character string, either "clustered", or "first_set". This
#' controls the sorting order of the included terms in the final plot.
#' "clustered" presents the terms grouped by the scenario where they assume the
#' highest values. "first_set" sorts the terms by the significance value in the
#' reference scenario.
#'
#' @return A `ggplot` object
#'
#' @seealso [gs_summary_overview()], [gs_summary_overview_pair()]
#'
#' @export
#'
#' @examples
#'
#' library("macrophage")
#' library("DESeq2")
#' library("org.Hs.eg.db")
#' library("AnnotationDbi")
#'
#' # dds object
#' data("gse", package = "macrophage")
#' dds_macrophage <- DESeqDataSet(gse, design = ~line + condition)
#' rownames(dds_macrophage) <- substr(rownames(dds_macrophage), 1, 15)
#' dds_macrophage <- estimateSizeFactors(dds_macrophage)
#'
#' # annotation object
#' anno_df <- data.frame(
#'   gene_id = rownames(dds_macrophage),
#'   gene_name = mapIds(org.Hs.eg.db,
#'                      keys = rownames(dds_macrophage),
#'                      column = "SYMBOL",
#'                      keytype = "ENSEMBL"),
#'   stringsAsFactors = FALSE,
#'   row.names = rownames(dds_macrophage)
#' )
#'
#' # res object
#' data(res_de_macrophage, package = "GeneTonic")
#' res_de <- res_macrophage_IFNg_vs_naive
#'
#' # res_enrich object
#' data(res_enrich_macrophage, package = "GeneTonic")
#' res_enrich <- shake_topGOtableResult(topgoDE_macrophage_IFNg_vs_naive)
#' res_enrich <- get_aggrscores(res_enrich, res_de, anno_df)
#'
#' res_enrich2 <- res_enrich[1:42, ]
#' res_enrich3 <- res_enrich[1:42, ]
#' res_enrich4 <- res_enrich[1:42, ]
#'
#' set.seed(2*42)
#' shuffled_ones_2 <- sample(seq_len(42)) # to generate permuted p-values
#' res_enrich2$gs_pvalue <- res_enrich2$gs_pvalue[shuffled_ones_2]
#' res_enrich2$z_score <- res_enrich2$z_score[shuffled_ones_2]
#' res_enrich2$aggr_score <- res_enrich2$aggr_score[shuffled_ones_2]
#'
#' set.seed(3*42)
#' shuffled_ones_3 <- sample(seq_len(42)) # to generate permuted p-values
#' res_enrich3$gs_pvalue <- res_enrich3$gs_pvalue[shuffled_ones_3]
#' res_enrich3$z_score <- res_enrich3$z_score[shuffled_ones_3]
#' res_enrich3$aggr_score <- res_enrich3$aggr_score[shuffled_ones_3]
#'
#' set.seed(4*42)
#' shuffled_ones_4 <- sample(seq_len(42)) # to generate permuted p-values
#' res_enrich4$gs_pvalue <- res_enrich4$gs_pvalue[shuffled_ones_4]
#' res_enrich4$z_score <- res_enrich4$z_score[shuffled_ones_4]
#' res_enrich4$aggr_score <- res_enrich4$aggr_score[shuffled_ones_4]
#'
#' compa_list <- list(
#'   scenario2 = res_enrich2,
#'   scenario3 = res_enrich3,
#'   scenario4 = res_enrich4
#' )
#'
#' gs_horizon(res_enrich,
#'            compared_res_enrich_list = compa_list,
#'            n_gs = 50,
#'            sort_by = "clustered")
#' gs_horizon(res_enrich,
#'            compared_res_enrich_list = compa_list,
#'            n_gs = 20,
#'            sort_by = "first_set")
gs_horizon <- function(res_enrich,
                       compared_res_enrich_list,
                       n_gs = 20,
                       p_value_column = "gs_pvalue",
                       color_by = "z_score",
                       ref_name = "ref_scenario",
                       sort_by = c("clustered", "first_set")) {
  if (!(color_by %in% colnames(res_enrich))) {
    stop("Your res_enrich object does not contain the ",
         color_by,
         " column.\n",
         "Compute this first or select another column to use for the color.")
  }

  if (!n_gs > 0) {
    stop("Please select a value for `n_gs` greater than 0")
  }

  if (is.null(names(compared_res_enrich_list))) {
    message("You provided a list for comparison without specifying names, adding some defaults")
    names(compared_res_enrich_list) <-
      paste0("other_", seq_len(length(compared_res_enrich_list)))
  }

  if (!is(compared_res_enrich_list, "list")) {
    stop("You need to provide a list for comparison (even versus one scenario)")
  }

  colnames_res_enrich <- c("gs_id",
                           "gs_description",
                           "gs_pvalue",
                           "gs_genes",
                           "gs_de_count",
                           "gs_bg_count")
  for (i in seq_len(length(compared_res_enrich_list))) {
    this_re <- compared_res_enrich_list[[i]]

    if (!all(colnames_res_enrich %in% colnames(this_re)))
      stop("One of the provided `res_enrich` objects does not respect the format ",
           "required to use in GeneTonic\n",
           "e.g. all required column names have to be present.\n",
           "You might want to use one of the `shake_*` functions to convert it.\n",
           "Required columns: ", paste(colnames_res_enrich, collapse = ", "),
           "\nThis occurred at the element ", i, " in your `compared_res_enrich_list`")

    if (!p_value_column %in% colnames(this_re))
      stop("Required column (p-value) `", p_value_column, "` not found in a component of ",
           "`compared_res_enrich_list` object.",
           "\nThis occurred at the element ", i, " in your `compared_res_enrich_list`")
    if (!color_by %in% colnames(this_re))
      stop("Required column (for coloring) `", color_by, "` not found in a component of ",
           "`compared_res_enrich_list` object.",
           "\nThis occurred at the element ", i, " in your `compared_res_enrich_list`")
  }

  sort_by <- match.arg(sort_by, c("clustered", "first_set"))

  # compared_res_enrich_list
  # append original ref
  all_res_enrichs <- compared_res_enrich_list
  all_res_enrichs[[ref_name]] <- res_enrich

  all_gsids <- lapply(all_res_enrichs, function(arg) arg[["gs_id"]])

  gs_common <- Reduce(intersect, all_gsids)

  if (length(gs_common) == 0) {
    stop("No gene sets have been found in common to the two enrichment results")
  }

  # restrict to the top common n_gs
  gs_common <- gs_common[seq_len(min(n_gs, length(gs_common)))]

  # append scenario info
  res_enrich[["scenario"]] <- ref_name
  compared_res_enrich_list <- lapply(seq_len(length(compared_res_enrich_list)),
                                     function(arg) {
                                       re <- compared_res_enrich_list[[arg]]
                                       re[["scenario"]] <- names(compared_res_enrich_list)[arg]
                                       return(re)
                                     })

  # reduce to common sets
  re_ref <- res_enrich[gs_common, ]
  re_comp <- lapply(seq_len(length(compared_res_enrich_list)),
                    function(arg) {
                      re <- compared_res_enrich_list[[arg]]
                      re <- re[gs_common, ]
                      return(re)
                    })

  merged_res_enh <- rbind(
    re_ref,
    do.call(rbind, re_comp)
  )
  merged_res_enh$logp10 <- -log10(merged_res_enh$gs_pvalue)

  if (sort_by == "first_set") {
    # sorted by category in scenario1
    p <- merged_res_enh %>%
      mutate(gs_description = factor(.data$gs_description, rev(unique(.data$gs_description)))) %>%
      arrange((.data$logp10)) %>%
      ggplot(aes_string(x = "gs_description", y = "logp10")) +
      geom_line(aes_string(group = "scenario", col = "scenario"), size = 3, alpha = 0.7) +
      geom_point(aes_string(fill = "z_score"), size = 4, pch = 21) +
      scale_color_brewer(palette = "Set2") +
      scale_fill_gradient2(low = "#313695", mid = "#FFFFE5", high = "#A50026") +
      ylim(c(0, NA)) +
      coord_flip() +
      theme_minimal()
  } else if (sort_by == "clustered") {
    # with a nicer sorting - "grouped" by scenario
    nicerorder_terms <- merged_res_enh %>%
      group_by(.data$gs_description) %>%
      mutate(main_category = .data$scenario[which.max(.data$logp10)],
             max_value = max(.data$logp10)) %>%
      arrange(.data$main_category, desc(.data$max_value)) %>%
      dplyr::pull(.data$gs_description)

    p <- merged_res_enh %>%
      # mutate(gs_description=factor(gs_description, unique(gs_description))) %>%
      mutate(gs_description = factor(.data$gs_description, rev(unique(nicerorder_terms)))) %>%
      arrange(desc(.data$logp10)) %>%
      ggplot(aes_string(x = "gs_description", y = "logp10")) +
      geom_line(aes_string(group = "scenario", col = "scenario"), size = 3, alpha = 0.7) +
      scale_color_brewer(palette = "Set2") +
      geom_point(aes_string(fill = "z_score"), size = 4, pch = 21) +
      scale_fill_gradient2(low = "#313695", mid = "#FFFFE5", high = "#A50026") +
      ylim(c(0, NA)) +
      coord_flip() +
      theme_minimal()
  }

  p <- p + labs(x = "Gene set description",
                y = "-log10 p-value",
                col = color_by)

  return(p)
}


#' Plots a heatmap for genes and genesets
#'
#' Plots a heatmap for genes and genesets, useful to spot out intersections across
#' genesets and an overview of them
#'
#' @param res_enrich A `data.frame` object, storing the result of the functional
#' enrichment analysis. See more in the main function, [GeneTonic()], to check the
#' formatting requirements (a minimal set of columns should be present).
#' @param res_de A `DESeqResults` object.
#' @param annotation_obj A `data.frame` object with the feature annotation
#' information, with at least two columns, `gene_id` and `gene_name`.
#' @param n_gs Integer value, corresponding to the maximal number of gene sets to
#' be displayed
#'
#' @return A `ggplot` object
#' @export
#'
#' @examples
#'
#' library("macrophage")
#' library("DESeq2")
#' library("org.Hs.eg.db")
#' library("AnnotationDbi")
#'
#' # dds object
#' data("gse", package = "macrophage")
#' dds_macrophage <- DESeqDataSet(gse, design = ~line + condition)
#' rownames(dds_macrophage) <- substr(rownames(dds_macrophage), 1, 15)
#' dds_macrophage <- estimateSizeFactors(dds_macrophage)
#'
#' # annotation object
#' anno_df <- data.frame(
#'   gene_id = rownames(dds_macrophage),
#'   gene_name = mapIds(org.Hs.eg.db,
#'                      keys = rownames(dds_macrophage),
#'                      column = "SYMBOL",
#'                      keytype = "ENSEMBL"),
#'   stringsAsFactors = FALSE,
#'   row.names = rownames(dds_macrophage)
#' )
#'
#' # res object
#' data(res_de_macrophage, package = "GeneTonic")
#' res_de <- res_macrophage_IFNg_vs_naive
#'
#' # res_enrich object
#' data(res_enrich_macrophage, package = "GeneTonic")
#' res_enrich <- shake_topGOtableResult(topgoDE_macrophage_IFNg_vs_naive)
#' res_enrich <- get_aggrscores(res_enrich, res_de, anno_df)
#'
#' gs_summary_heat(res_enrich = res_enrich,
#'                 res_de = res_de,
#'                 annotation_obj = anno_df,
#'                 n_gs = 20)

gs_summary_heat <- function(res_enrich,
                            res_de,
                            annotation_obj,
                            n_gs = 80) {

  res_enrich2 <- res_enrich[seq_len(n_gs), ]

  # enriched_gsids <- res_enrich2[["gs_id"]]
  # enriched_gsnames <- res_enrich2[["gs_description"]]
  # enriched_gsdescs <- vapply(enriched_gsids, function(arg) Definition(GOTERM[[arg]]), character(1))

  # rownames(res_enrich) <- enriched_gsids

  gs_expanded <- tidyr::separate_rows(res_enrich2, "gs_genes", sep = ",")
  gs_expanded$log2FoldChange <-
    res_de[annotation_obj$gene_id[match(gs_expanded$gs_genes, annotation_obj$gene_name)], ]$log2FoldChange

  # keep them as factor to prevent rearrangement!
  gs_expanded[["gs_id"]] <- factor(gs_expanded[["gs_id"]], levels = res_enrich2[["gs_id"]])
  gs_expanded[["gs_description"]] <- factor(gs_expanded[["gs_description"]], levels = res_enrich2[["gs_description"]])
  gs_expanded[["gs_genes"]] <- factor(gs_expanded[["gs_genes"]], levels = unique(gs_expanded[["gs_genes"]]))

  p <- ggplot(gs_expanded,
              aes_string(x = "gs_genes", y = "gs_description")) +
    geom_tile(aes_string(fill = "log2FoldChange"),
              color = "white") +
    scale_fill_gradient2(low = muted("deepskyblue"),
                         mid = "lightyellow",
                         high = muted("firebrick"),
                         name = "log2FoldChange") +
    xlab(NULL) + ylab(NULL) + theme_minimal() +
    theme(panel.grid.major = element_blank(),
          axis.text.x = element_text(angle = 75, hjust = 1))

  return(p)
}

Try the GeneTonic package in your browser

Any scripts or data that you put into this service are public.

GeneTonic documentation built on Nov. 8, 2020, 5:27 p.m.