R/gs_mds.R

Defines functions gs_mds

Documented in gs_mds

#' Multi Dimensional Scaling plot for gene sets
#'
#' Multi Dimensional Scaling plot for gene sets, extracted from a `res_enrich`
#' object
#'
#' @param res_enrich A `data.frame` object, storing the result of the functional
#' enrichment analysis. See more in the main function, [GeneTonic()], to check the
#' formatting requirements (a minimal set of columns should be present).
#' @param res_de A `DESeqResults` object.
#' @param annotation_obj A `data.frame` object with the feature annotation
#' information, with at least two columns, `gene_id` and `gene_name`.
#' @param gtl A `GeneTonic`-list object, containing in its slots the arguments
#' specified above: `dds`, `res_de`, `res_enrich`, and `annotation_obj` - the names
#' of the list _must_ be specified following the content they are expecting
#' @param n_gs Integer value, corresponding to the maximal number of gene sets to
#' be included (from the top ranked ones). Defaults to the number of rows of
#' `res_enrich`
#' @param gs_ids Character vector, containing a subset of `gs_id` as they are
#' available in `res_enrich`. Lists the gene sets to be included, additionally to
#' the ones specified via `n_gs`. Defaults to NULL.
#' @param similarity_measure Character, currently defaults to `kappa_matrix`, to
#' specify how to compute the similarity measure between gene sets
#' @param mds_k Integer value, number of dimensions to compute in the multi
#' dimensional scaling procedure
#' @param mds_labels Integer, defines the number of labels to be plotted on top
#' of the scatter plot for the provided gene sets.
#' @param mds_colorby Character specifying the column of `res_enrich` to be used
#' for coloring the plotted gene sets. Defaults sensibly to `z_score`.
#' @param gs_labels Character vector, containing a subset of `gs_id` as they are
#' available in `res_enrich`. Lists the gene sets to be labeled.
#' @param plot_title Character string, used as title for the plot. If left `NULL`,
#' it defaults to a general description of the plot and of the DE contrast
#' @param return_data Logical, whether the function should just return the
#' data.frame of the MDS coordinates, related to the original `res_enrich`
#' object. Defaults to FALSE.
#'
#' @return A `ggplot` object
#'
#' @seealso [create_kappa_matrix()] is used to calculate the similarity between
#' gene sets
#'
#' @export
#'
#' @examples
#'
#' library("macrophage")
#' library("DESeq2")
#' library("org.Hs.eg.db")
#' library("AnnotationDbi")
#'
#' # dds object
#' data("gse", package = "macrophage")
#' dds_macrophage <- DESeqDataSet(gse, design = ~ line + condition)
#' rownames(dds_macrophage) <- substr(rownames(dds_macrophage), 1, 15)
#' dds_macrophage <- estimateSizeFactors(dds_macrophage)
#'
#' # annotation object
#' anno_df <- data.frame(
#'   gene_id = rownames(dds_macrophage),
#'   gene_name = mapIds(org.Hs.eg.db,
#'     keys = rownames(dds_macrophage),
#'     column = "SYMBOL",
#'     keytype = "ENSEMBL"
#'   ),
#'   stringsAsFactors = FALSE,
#'   row.names = rownames(dds_macrophage)
#' )
#'
#' # res object
#' data(res_de_macrophage, package = "GeneTonic")
#' res_de <- res_macrophage_IFNg_vs_naive
#'
#' # res_enrich object
#' data(res_enrich_macrophage, package = "GeneTonic")
#' res_enrich <- shake_topGOtableResult(topgoDE_macrophage_IFNg_vs_naive)
#' res_enrich <- get_aggrscores(res_enrich, res_de, anno_df)
#'
#' gs_mds(res_enrich,
#'   res_de,
#'   anno_df,
#'   n_gs = 200,
#'   mds_labels = 10
#' )
gs_mds <- function(res_enrich,
                   res_de,
                   annotation_obj,
                   gtl = NULL,
                   n_gs = nrow(res_enrich),
                   gs_ids = NULL,
                   similarity_measure = "kappa_matrix",
                   mds_k = 2,
                   mds_labels = 0,
                   mds_colorby = "z_score",
                   gs_labels = NULL,
                   plot_title = NULL,
                   return_data = FALSE) { # or aggr_score

  if (!is.null(gtl)) {
    checkup_gtl(gtl)
    dds <- gtl$dds
    res_de <- gtl$res_de
    res_enrich <- gtl$res_enrich
    annotation_obj <- gtl$annotation_obj
  }

  similarity_measure <- match.arg(
    similarity_measure,
    c("kappa_matrix", "overlap_matrix")
  )

  # require res_enrich to have aggregated scores and so
  if (!("z_score" %in% colnames(res_enrich))) {
    res_enrich <- get_aggrscores(res_enrich,
      res_de,
      annotation_obj = annotation_obj
    )
  }

  n_gs <- min(n_gs, nrow(res_enrich))

  gs_to_use <- unique(
    c(
      res_enrich$gs_id[seq_len(n_gs)], # the ones from the top
      gs_ids[gs_ids %in% res_enrich$gs_id] # the ones specified from the custom list
    )
  )


  if (similarity_measure == "kappa_matrix") {
    my_simmat <- create_kappa_matrix(res_enrich, n_gs = n_gs, gs_ids = gs_ids)
  } else if (similarity_measure == "overlap_matrix") {
    my_simmat <- create_jaccard_matrix(res_enrich, n_gs = n_gs, gs_ids = gs_ids, return_sym = TRUE)
  } # else ...

  # subset here, internally
  res_enrich <- res_enrich[gs_to_use, ]
  mds_labels <- min(mds_labels, nrow(res_enrich))

  mysets <- res_enrich[["gs_id"]]
  mysets_names <- res_enrich[["gs_description"]]


  mds_gs <- cmdscale((1 - my_simmat), eig = TRUE, k = mds_k)

  mds_gs_df <- data.frame(
    dim1 = mds_gs$points[, 1],
    dim2 = mds_gs$points[, 2],
    gs_id = mysets,
    gs_name = mysets_names,
    gs_DEcount = res_enrich$DE_count,
    gs_colby = res_enrich[[mds_colorby]],
    gs_text = paste0(mysets, ": ", mysets_names),
    stringsAsFactors = FALSE
  )

  if (return_data) {
    return(mds_gs_df)
  }

  max_z <- max(abs(range(mds_gs_df$gs_colby)))
  limit <- max_z * c(-1, 1)

  this_contrast <- (sub(".*p-value: (.*)", "\\1", mcols(res_de, use.names = TRUE)["pvalue", "description"]))

  p <- ggplot(mds_gs_df, aes(
    x = .data$dim1,
    y = .data$dim2,
    text = .data$gs_text
  )) +
    geom_point(aes(
      color = .data$gs_colby,
      size = .data$gs_DEcount
    )) +
    scale_color_gradient2(
      limit = limit,
      low = muted("deepskyblue"),
      mid = "lightyellow",
      high = muted("firebrick")
    ) +
    labs(
      x = "Dimension 1",
      y = "Dimension 2",
      col = mds_colorby,
      size = "Geneset \nmembers"
    ) +
    theme_bw()

  if (mds_labels > 0) {
    label_these <- mds_gs_df$gs_id[seq_len(mds_labels)]
  } else {
    label_these <- NULL
  }

  if (!is.null(gs_labels)) {
    if (!all(gs_labels %in% res_enrich$gs_id)) {
      warning("Not all specified geneset ids were found in the `res_enrich` object")
    }
    label_those <- mds_gs_df[mds_gs_df$gs_id %in% gs_labels, "gs_id"]
  } else {
    label_those <- NULL
  }

  df_gs_labels <- mds_gs_df[mds_gs_df$gs_id %in% unique(c(label_these, label_those)), ]

  p <- p + geom_label_repel(
    aes(label = .data$gs_name),
    data = df_gs_labels,
    size = 3,
    min.segment.length = 0
  )

  # handling the title
  if (is.null(plot_title)) {
    p <- p + ggtitle(paste0("Geneset MDS plot - ", this_contrast))
  } else {
    p <- p + ggtitle(plot_title)
  }

  return(p)
}
federicomarini/GeneTonic documentation built on March 27, 2024, 4:19 p.m.