R/simplify_similarity.R

Defines functions plot_network plot_lollipop plot_heatmap geneset_similarity_matrix genesets_as_matrix cluster_genesets reduce_genesets

Documented in cluster_genesets plot_heatmap plot_lollipop plot_network reduce_genesets

#' Reduce the set of significant genesets to a minimum
#'
#' Analyses are performed independently per 'source' of genesets. The result of this function is the geneset table with a newly appended column 'signif_and_reduced'
#'
#' @param clusters results from `cluster_genesets()`
#' @param simscore_threshold similarity score (0~1) that is required to consider one geneset to be a "parent term" of another.
#' Setting a lower value will yield fewer genesets / stronger summarization.
#' Typical settings for this parameter are 0.8~0.99 (0.9 is default)
#' @param universe_fraction discard genesets that cover more than X fraction of all genes in the universe (unique set of genes covered by all significant genesets).
#' Setting this to 0.25 will deprioritize genesets that cover 25% of all genes (in significant genesets).
#' This prevents very generic GO terms like "protein-containing complex" to be included in results.
#' Typical settings for this parameter are 0.1~0.5  (0.25 is default)
#' @param signifgenes_fraction the minimum fraction of "foreground genes" ('genes_signif' column) found across all significant genesets that should be covered by the reduced geneset collection.
#' This parameter doesn't do anything if there are fewer than 5 "foreground genes" alltogether.
#' Typical settings for this parameter are 0.75~0.95 (0.9 is default)
#' @return the genesets table from the `clusters` parameter, with results in column "signif_and_reduced"
#' @export
reduce_genesets = function(clusters, simscore_threshold = 0.9, universe_fraction = 0.25, signifgenes_fraction = 0.9) {
  ngenes_signif = genes_signif = rescue_count = ngenes = rescue_score = ngenes_input = NULL # fix invisible bindings R package NOTE
  # init result column
  clusters$genesets = clusters$genesets |> mutate(signif_and_reduced = signif)

  for(src in names(clusters$similarity)) { # src = "GO_CC"
    src_rows = clusters$genesets$signif %in% TRUE & clusters$genesets$source == src
    result_subset = clusters$genesets |> filter(src_rows)
    if(nrow(result_subset) < 2) next # nothing to do
    x = clusters$similarity[[src]] # geneset*geneset similarity matrix where scores 0~1 indicate if col j is a parent of row i
    diag(x) = 0 # remove self-similarities
    universe = n_distinct(unlist(result_subset$genes)) # count unique genes covered by all significant genesets

    # goal; ignore terms that are parents of more than X% of the universe
    ids_ignore = NULL
    # iterate columns; values on rows indicate if j is a parent of <row>
    for(j in 1:ncol(x)) {
      # find all children of j, but disregard those that are already on the ignore list.
      # this prevents scenarios where 3 highly overlapping terms, that cover a large part of the universe,
      # are all removed under the assumption that one of the other terms is a child term that'll be retained
      j_children = x[ ! rownames(x) %in% ids_ignore, j] >= simscore_threshold
      if(sum(j_children) > 0) {
        # from the geneset table obtained through test_genesets(), collect the list of genes for genesets that are in geneset j or any of its child terms
        g = result_subset$genes[ result_subset$id %in% colnames(x)[c(j, which(j_children))] ]
        n = n_distinct(unlist(g)) # count unique genes
        if(n / universe > universe_fraction) {
          ids_ignore = c(ids_ignore, colnames(x)[j])
        }
      }
    }

    # subset the similarity matrix
    X = x[ ! rownames(x) %in% ids_ignore, ! colnames(x) %in% ids_ignore, drop = FALSE]
    # find genesets that do not have parents -->> we retain these as representatives of all genesets
    X = X > simscore_threshold
    ids_retain = rownames(X)[rowSums(X) == 0]

    # deal with edge-cases where after pruning terms, less than X% of the universe is covered.
    # here recursively reinstate ids_ignore that cover the most 'missing genes'
    foreground_universe = unique(unlist(result_subset$genes_signif)) # all signif genes
    if(length(foreground_universe) >= 5) {
      # subset of signif genes present in "reduced gene sets"
      foreground_covered = unique(unlist(result_subset$genes_signif[result_subset$id %in% ids_retain]))
      while(length(foreground_covered) / length(foreground_universe) < signifgenes_fraction) {
        # subset of singif genes not represented by current "reduced gene sets"
        foreground_universe_missing = setdiff(foreground_universe, foreground_covered)
        # order the ignore IDs by the fraction of genelist they cover (i.e. include terms in order of 'purity')
        tmp = result_subset |>
          # update; dropped the requirement for 'at least N signif genes' (for genesets that should be added to results)
          filter( ! id %in% ids_retain & ngenes_signif >= 1) |>
          # rescue count represents the number of signif genes we'll be adding to the resulting "reduced gene sets"
          mutate(rescue_count = lengths(lapply(genes_signif, intersect, foreground_universe_missing)), rescue_score = rescue_count / ngenes) |>
          arrange(desc(rescue_score), ngenes_input)
        # something's wrong if we cannot add any 'missing foreground genes' to the results; stop looping
        if(tmp$rescue_score[1] == 0) {
          break
        }
        # update results
        ids_retain = c(ids_retain, tmp$id[1])
        foreground_covered = unique(unlist(result_subset$genes_signif[result_subset$id %in% ids_retain]))
      }
    }

    # finally, store results by updating clusters$genesets
    clusters$genesets$signif_and_reduced[src_rows] = clusters$genesets$id[src_rows] %in% ids_retain
  }

  return(clusters$genesets)
}



#' cluster significant genesets from `test_genesets()` by geneset similarity (separately for each 'geneset source')
#'
#' @param x results from `test_genesets()`
#' @param genelist should be the same as provided to `test_genesets()`
#' @param hclust_method hierarchical clustering method, any of; 'ward.D', 'ward.D2' (default), 'single', 'complete', 'average'
#' @return a list with elements genesets (param `x`), similarity, hc_row, hc_col
#' @export
cluster_genesets = function(x, genelist, hclust_method = "ward.D2") {
  x = validate_genesets(x)
  genelist = validate_genelist(genelist)
  # in addition to standard geneset validation, also check that this is a 'genelist' result; we need to know which are significant
  stopifnot("geneset table should contain a 'signif' column with boolean values (i.e. the result from test_genesets() )" = (
    "signif" %in% colnames(x) && is.logical(x$signif) # allow NA values @ signif
  ))
  stopifnot("'hclust_method' parameter must be any of; 'ward.D', 'ward.D2', 'single', 'complete', 'average'" =
              length(hclust_method) == 1 && hclust_method %in% c('ward.D', 'ward.D2', 'single', 'complete', 'average'))


  score_type = unique(x$score_type)
  # default to a distance matrix based on pvalue-based gene scores
  if(length(score_type) == 0) {
    score_type = "pvalue"
  }

  stopifnot("score_type column in input table can only contain these values; 'effectsize', 'effectsize_up', 'effectsize_down', 'effectsize_abs', 'effectsize_3way', 'pvalue', 'pvalue_effectsize'" = (
    length(score_type) > 0 && all(score_type %in% c("effectsize", "effectsize_up", "effectsize_down", "effectsize_abs", "effectsize_3way", "pvalue", "pvalue_effectsize"))
  ))

  ### genelist scores, analogous to test_genesets_goat()
  # matrix where columns are 'score types' (e.g. "effectsize_up" and "effectsize_down") and rows are genes (aligned with genelist table)
  genelist_scores = goat_testgene_score_matrix(genelist, score_type)

  # init result variables
  rows = x$signif %in% TRUE
  similarity = hc_row = hc_col = list()

  for(src in sort(unique(x$source))) { # src = "GO_CC"
    rows = x$source == src & x$signif %in% TRUE # use %in% to guard against NA values / 'not tested'
    N = sum(rows)
    # nothing to do, go next
    if(N == 0) next
    # don't compute distance matrix nor clustering if there is only 1 geneset
    if(N == 1) {
      similarity[[src]] = matrix(1, nrow = 1, ncol = 1, dimnames = list(x$id[rows], x$id[rows]))
      next
    }

    # compute distance matrix
    y = x |> filter(rows)
    m = genesets_as_matrix(y)
    sim = geneset_similarity_matrix(mat = m$mat, mat_row_geneid = m$rowid, mat_col_gsid = m$colid, genelist_geneid = genelist$gene, genelist_scores = genelist_scores, weighted = TRUE)
    # if(symmetric_similarity_matrix) {
    #   sim = (t(sim) + sim) / 2
    # }
    similarity[[src]] = sim

    hc_row[[src]] = stats::hclust(stats::dist(sim), method = hclust_method)
    hc_col[[src]] = stats::hclust(stats::dist(t(sim)), method = hclust_method)
  }

  # settings as string
  settings = sprintf("simplify_genesets(hclust_method='%s')", hclust_method)
  attr(x, "settings") <- c(attr(x, "settings"), settings)

  return(list(genesets = x, similarity = similarity, hc_row = hc_row, hc_col = hc_col))
}



#' utility function that converts a genesets table into a sparse gene*geneset identity matrix
#'
#' @param genesets genesets tibble. Must contain columns; "id", "genes", "ngenes"
#' @noRd
genesets_as_matrix = function(genesets) {
  stopifnot(c("id", "genes", "ngenes") %in% colnames(genesets))
  # unlist the geneset table (row = geneset, genes are packed in a list column)
  ul_id = rep(genesets$id, genesets$ngenes)
  ul_gene = unlist(genesets$genes, recursive = FALSE, use.names = FALSE)
  # ul_score = genelist$score_pval[match(ul_gene, genelist$gene)]  # this strategy is much slower on huge datasets
  # tmp = aggregate(ul_score, by = list(ul_id), FUN=sum)             # this strategy is much slower on huge datasets
  ugene = unique(ul_gene)
  # return a sparse gene*geneset identity matrix
  return( list(mat = Matrix::sparseMatrix(i = match(ul_gene, ugene),
                                          j = match(ul_id, genesets$id)),
               rowid = ugene,
               colid = genesets$id) )
}



#' geneset similarity matrix computation
#'
#' importantly, if multiple genelist_scores are provided, we use the gene score type for geneset j that yields the highest sum score
#' @param mat numeric matrix
#' @param mat_row_geneid for mat rows; gene identifiers
#' @param mat_col_gsid for mat columns; geneset identifiers
#' @param genelist_geneid same identifiers as `mat_row_geneid`
#' @param genelist_scores a matrix where columns are various gene score types
#' @param weighted boolean value
#' @noRd
geneset_similarity_matrix = function(mat, mat_row_geneid, mat_col_gsid, genelist_geneid, genelist_scores, weighted = TRUE) {

  ### 1) find the score for each row in the gene*geneset matrix via lookup in the genelist table
  # geneset score; gene score vector along all rows * identity matrix = matrix with gene score values in respective genesets -->> sum of gene scores / ngenes = geneset scores
  i = match(mat_row_geneid, genelist_geneid)
  stopifnot("all genes in the genelist table must be present in the geneset matrix" = !anyNA(i))

  ### 2) asymmetric distance matrix;
  # gene identity vector for geneset g * identity matrix = matrix with 'gene on row x is present in both genesets'
  Ngs = ncol(mat)
  d = matrix(0, nrow = Ngs, ncol = Ngs, dimnames = list(mat_col_gsid, mat_col_gsid))
  j_scoretypes = rep(0, ncol(genelist_scores))
  # iterate columns in the genes*genesets identity matrix
  for(j in seq_len(Ngs)) {
    if(weighted) {
      ## A) for current geneset j, compute the 'gene score type' (e.g. rank transformed p-values or effectsizes)
      ##    that yields the highest geneset score  =  score type most relevant for this geneset -->> use that for similarity computation downstream
      for(index_scoretype in 1L:ncol(genelist_scores)) {
        j_scoretypes[index_scoretype] = sum(mat[,j] * genelist_scores[i, index_scoretype])
      }
      index_scoretype = which.max(j_scoretypes)

      ## B) weighted overlap between geneset j (current loop iteration) and all genesets (columns in 'mat')
      j_score = mat[,j] * genelist_scores[i, index_scoretype]    # genes without a score in geneset j are set to zero score through multiplication with identity matrix 'mat'
      j_overlap_score = Matrix::colSums(mat * j_score)            # sum of gene scores in overlapping set of genes present in geneset j and all other genesets
      d[j,] = j_overlap_score / j_overlap_score[j]                # divide by the total sum of gene scores in geneset j; result = proportion of gene scores in geneset j identified in other genesets
    } else {
      # not weighted = simple case; count number of overlapping genes
      j_overlap_score = Matrix::colSums(mat * mat[,j])
      d[j,] = j_overlap_score / j_overlap_score[j] # overlap count for geneset_ij, divided by respective geneset size = proportion of overlap
    }
  }

  return(d)
}



#' plot the geneset similarity matrix as a heatmap
#'
#' @examples \donttest{
#' # note; this example downloads data when first run, and typically takes ~60seconds
#'
#' # store the downloaded files in the following directory. Here, the temporary file
#' # directory is used. Alternatively, consider storing this data in a more permanent location.
#' # e.g. output_dir="~/data/goat" on unix systems or output_dir="C:/data/goat" on Windows
#' output_dir = tempdir()
#'
#' ## first run the default example from test_genesets() to obtain geneset results
#' datasets = download_goat_manuscript_data(output_dir)
#' genelist = datasets$`Wingo 2020:mass-spec:PMID32424284`
#' genesets_asis = download_genesets_goatrepo(output_dir)
#' genesets_filtered = filter_genesets(genesets_asis, genelist)
#' result = test_genesets(genesets_filtered, genelist, method = "goat",
#'   score_type = "effectsize", padj_method = "bonferroni", padj_cutoff = 0.05)
#'
#' # prior to running this function, cluster the genesets
#' clusters = cluster_genesets(result, genelist)
#'
#' # use the plot heatmap function and try various color palettes
#' plot_heatmap(clusters, output_dir, colors = hcl.colors(100, "Viridis", rev = FALSE))
#' plot_heatmap(clusters, output_dir, colors = hcl.colors(100, "Inferno", rev = FALSE))
#' plot_heatmap(clusters, output_dir, colors = hcl.colors(100, "Lajolla", rev = TRUE))
#' plot_heatmap(clusters, output_dir, colors = hcl.colors(100, "Mako", rev = FALSE))
#' plot_heatmap(clusters, output_dir, colors = hcl.colors(100, "Turku", rev = TRUE))
#' plot_heatmap(clusters, output_dir, colors = hcl.colors(100, "Grays", rev = TRUE))
#' }
#' @param x result from `cluster_genesets()`
#' @param output_dir set to NA to directly show the figures instead of writing them to file.
#' Otherwise, this is the full path to the directory where the downloaded files should be stored. Directory is created if it does not exist.
#' e.g. `output_dir="~/data"` on unix systems, `output_dir="C:/data"` on Windows, or set to `output_dir=getwd()` to write output to the current working directory
#' @param colors a vector of 100 colors to be used for the heatmap (101 breaks are computed between 0 and the max value in the distance matrix)
#' @param fontsize parameter sent to pheatmap::pheatmap(); control the size of labels in the plot, defaults to 10. Note that you can also change the plot device size, see examples
#' @return does not return a value, plots are printed to device or files depending on `output_dir` parameter
#' @export
plot_heatmap = function(x, output_dir, colors = grDevices::hcl.colors(100, "Viridis", rev = FALSE), fontsize = 10) {
  check_dependency("pheatmap", "plotting heatmaps")
  stopifnot("parameter output_dir must be a full path for the output files, this directory must already exists" =
              length(output_dir) == 1 && (is.na(output_dir) || (is.character(output_dir) && nchar(output_dir) > 4 && dir.exists(output_dir)) ) )
  stopifnot("colors parameter must be a vector of 100 html color-codes" = length(colors) == 100 && is.character(colors) && all(grepl("^#[0-9a-zA-Z]{6}$", colors)))
  stopifnot("fontsize parameter must be a positive number" = length(fontsize) == 1 && is.numeric(fontsize) && is.finite(fontsize) && fontsize > 0)
  if(length(x) == 0 || !is.list(x) || !"similarity" %in% names(x) || length(names(x$similarity)) == 0) {
    warning("empty data provided to heatmap plot function")
    return()
  }
  stopifnot("parameter x must be a result from the cluster_genesets() function; cannot find a geneset table" =
              "genesets" %in% names(x) && is.data.frame(x$genesets) && all(c("id", "source", "name", "ngenes") %in% colnames(x$genesets)))

  fontsize = as.integer(round(fontsize))

  score_types = NULL
  if("score_type" %in% colnames(x$genesets)) {
    score_types = sort(stats::na.omit(unique(x$genesets$score_type)))
  }

  for(src in names(x$similarity)) {
    hm_data = x$similarity[[src]]
    if(length(hm_data) == 0 || !is.matrix(hm_data)) {
      next
    }
    if(!all(is.finite(hm_data))) {
      stop(paste("invalid similarity object; distance matrix contains non-finite values @", src))
    }
    mat_names = rownames(hm_data)
    if(length(mat_names) != nrow(hm_data)) {
      stop(paste("invalid similarity object; missing rownames in the distance matrix @", src))
    }
    i = match(mat_names, x$genesets$id)
    if(anyNA(i)) {
      stop(paste("invalid similarity object; some rownames in the distance matrix don't match the geneset table @", src))
    }
    mat_ngenes = x$genesets$ngenes[i]
    mat_labels = string_trunc_right(x$genesets$name[i], width = 50)
    mat_names_scoretype = rep("", length(i))

    if(length(score_types) > 0) {
      mat_names_scoretype = x$genesets$score_type[i]
    }


    hc_row = hc_col = NULL
    if("hc_row" %in% names(x) && is.list(x$hc_row) && src %in% names(x$hc_row)) {
      hc_row = x$hc_row[[src]]
    }
    if("hc_col" %in% names(x) && is.list(x$hc_col) && src %in% names(x$hc_col)) {
      hc_col = x$hc_col[[src]]
    }

    is_symmetric = isSymmetric(hm_data)
    do_cluster = nrow(hm_data) > 2 && length(hc_row) > 0
    if(!do_cluster) {
      hc_row = hc_col = FALSE
    }


    ## manually bin the geneset size data into 10 distinct values and assign colors (i.e. don't use pheatmap defaults)
    # breaks from 0 to max value
    # nbreaks = 11
    # mat_ngenes_breaks = seq(from = min(c(10, mat_ngenes)), to = max(c(mat_ngenes, 50)), length.out = nbreaks)
    mat_ngenes_breaks = pretty(c(min(c(10, mat_ngenes)), max(c(mat_ngenes, 50))), n = 8)
    nbreaks = length(mat_ngenes_breaks)
    mat_ngenes_colors = stats::setNames(
      grDevices::hcl.colors(nbreaks - 1, palette = "Grays", rev = TRUE),
      mat_ngenes_breaks[-1]
    )
    # bin / color index (goes to nbreaks-1 max)
    mat_ngenes_binned = rep(1L, length(mat_ngenes))
    for(i in 1:(nbreaks-1)) {
      mat_ngenes_binned[mat_ngenes > mat_ngenes_breaks[i]] = i
    }

    annot_row = data.frame(
      `gene count` = mat_ngenes_breaks[mat_ngenes_binned],
      row.names = rownames(hm_data),
      check.names = FALSE
    )
    annot_clr = list(
      `gene count` = mat_ngenes_colors
    )
    if(length(score_types) > 0 && any(mat_names_scoretype != "")) {
      annot_row$score_type = as.character(mat_names_scoretype)
      annot_clr$score_type = stats::setNames(grDevices::rainbow(length(score_types)), score_types)
    }

    heatmap_breaks = seq(from = 0, to = 1, length.out = 101)

    f = NA
    if(!is.na(output_dir)) {
      f = sprintf("%s/goat_heatmap_%s.pdf", output_dir, gsub("[^a-zA-Z0-9 _-]+", "", src))
      if(file.exists(f)) {
        file.remove(f)
        if(file.exists(f)) {
          stop(paste("failed to overwrite existing file when creating a heatmap plot:", f))
        }
      }
    }

    p = pheatmap::pheatmap(
      hm_data,
      cluster_rows = hc_row,
      cluster_cols = hc_col,
      labels_row = mat_labels,
      labels_col = mat_labels,
      fontsize = fontsize - 3L,
      fontsize_col = fontsize,
      fontsize_row = fontsize,
      annotation_row = annot_row,
      annotation_colors = annot_clr,
      annotation_legend = TRUE,
      breaks = heatmap_breaks,
      color = colors,
      cellwidth = 10,
      cellheight = 10,
      filename = f,
      main = sprintf(
        "%s - %d genesets %s",
        src,
        nrow(hm_data),
        ifelse(is_symmetric,
               " - symmetric similarity matrix\ncolor-coded by gene overlap (weighted by gene scores)",
               " - asymmetric similarity matrix\nscores reflect if columns are a supersets of rows")
        # at score=1, the superset/parent is shown in the column and child as row
        # High values imply respective columns contain most genes from geneset @ row
      )
    )
  }
}



#' Lollipop chart or barplot visualization of geneset enrichment testing results
#'
#' @examples \donttest{
#' # note; this example downloads data when first run, and typically takes ~60seconds
#'
#' # store the downloaded files in the following directory. Here, the temporary file
#' # directory is used. Alternatively, consider storing this data in a more permanent location.
#' # e.g. output_dir="~/data/go" on unix systems or output_dir="C:/data/go" on Windows
#' output_dir = tempdir()
#'
#' ## first run the default example from test_genesets() to obtain geneset results
#' datasets = download_goat_manuscript_data(output_dir)
#' genelist = datasets$`Wingo 2020:mass-spec:PMID32424284`
#' genesets_asis = download_genesets_goatrepo(output_dir)
#' genesets_filtered = filter_genesets(genesets_asis, genelist)
#' result = test_genesets(genesets_filtered, genelist, method = "goat",
#'   score_type = "effectsize", padj_method = "bonferroni", padj_cutoff = 0.05)
#'
#' # generate lollipop charts for each GO domain (CC/BP/MF), with geneset -log10
#' # adjusted p-value on the x-axis and color-coding by geneset up/down-regulation
#' plot_lollipop(
#'   result, output_dir, plot_type = "lollipop", topn = 50,
#'   score_xaxis = "minlogp", score_color = "updown"
#' )
#'
#' # alternatively, as a barplot
#' plot_lollipop(
#'   result, output_dir, plot_type = "barplot", topn = 50,
#'   score_xaxis = "minlogp", score_color = "updown"
#' )
#'
#' # alternatively, color-code genesets by enrichment of significant genes using
#' # parameter `score_color="oddsratio"` . See further `score_geneset_oddsratio`
#' # function documentation for definition/computation of this score.
#' plot_lollipop(
#'   result, output_dir, plot_type = "lollipop", topn = 50,
#'   score_xaxis = "minlogp", score_color = "oddsratio"
#' )
#' }
#' @param x results from function `test_genesets`
#' @param output_dir set to NA to directly show the figures instead of writing them to file.
#' Otherwise, this is the full path to the directory where the downloaded files should be stored. Directory is created if it does not exist.
#' e.g. `output_dir="~/data"` on unix systems, `output_dir="C:/data"` on Windows, or set to `output_dir=getwd()` to write output to the current working directory
#' @param only_reduced only show the reduced/summarized set of significant genesets. This requires that you first applied the `reduce_genesets` function
#' @param plot_type Options: "barplot", "lollipop" (default)
#' @param show_pvalue boolean parameter that indicates whether adjusted p-values should be shown next to each data point
#' @param score_xaxis type of score to show on the x-axis. Options: "minlogp" to show -log10(adjusted pvalue), which is default. Use "oddsratio" to show the enrichment of significant genes.
#' For further details on this score and its computation, see the `score_geneset_oddsratio` function documentation.
#' Basically, the genesets in this plot are sorted by their proportion of foreground/significant genes
#' (and this ratio is standardized against the overall ratio of significant genes as to make this statistic comparable across analyses/datasets).
#' @param score_color analogous to `score_xaxis`, here you can specify the data used for color-coding the plot. Options: "minlogp", "oddsratio", "updown".
#' The former 2 options are the same as for `score_xaxis`, the latter enables color-coding by up-/down-regulation as encoded in the "score_type" column.
#' Note that this only works when geneset testing based on "effectsize" was performed and thus the "score_type" column has values "effectsize_up" or "effectsize_down" (encoding directionality).
#' Rows with other values are assumed NA and colored as grey.
#' @param score_color_limits defines the limits for the color scales. options; `score_color_limits = "source"` use the range of values per 'source' to compute colors scales (default). Set to "overall" in order to have a unified color scale across 'source'  (e.g. same color bar across GO_CC/GO_MF/GO_BP). Alternatively, provide a numeric vector of 2 values to manually define lower/upper-limits.
#' @param score_color_updown array of 2 strings that describe the colors for up- and down-regulation (used when `score_color` is set to "updown"). Default color-coding; up = red, down = blue. Use hex color codes, e.g. "#ff0000" for red.
#' @param max_ngenes only plot terms with less than N genes (quick way to get rid of large/unspecific terms)
#' @param topn topn terms to show after sorting genesets by p-value. For example, this makes it easy to plot the top10 GO terms. Set to NA to ignore this filter (default)
#' @param padj_cutoff adjusted pvalue cutoff for terms shown in the plot. If set to NA (default), all significant genesets are used (i.e. 'signif' column in the input geneset table)
#' @return if `output_dir` is `NA`, a list of ggplot2 objects. Otherwise, write plots to file and do not return any value
#' @export
plot_lollipop = function(x, output_dir, only_reduced = FALSE, plot_type = "lollipop", show_pvalue = FALSE, score_xaxis = "minlogp", score_color = ifelse(is.data.frame(x) && "score_type" %in% colnames(x) && is.character(x$score_type) && any(x$score_type %in% c("effectsize_up","effectsize_down")), "updown", "minlogp"), score_color_limits = "source", score_color_updown = c("#E57373", "#5E7ABC"), max_ngenes = NA, topn = NA, padj_cutoff = NA) {
  signif_and_reduced = ngenes = pvalue_adjust = pvalue = rank__ = name = score_oddsratio = score_clr = xvalue = clrvalue = NULL # fix invisible bindings R package NOTE
  check_dependency("ggplot2", "plot lollipop chart")
  x = validate_genesets(x)
  stopifnot("parameter output_dir must be a full path for the output files, this directory must already exists" =
              length(output_dir) == 1 && (is.na(output_dir) || (is.character(output_dir) && nchar(output_dir) > 4 && dir.exists(output_dir)) ) )
  stopifnot("parameter plot_type must be any of; 'barplot', 'lollipop'" = length(plot_type) == 1 && plot_type %in% c("barplot", "lollipop"))
  stopifnot("parameter score_xaxis must be any of; 'minlogp', 'oddsratio'" = length(score_xaxis) == 1 && score_xaxis %in% c("minlogp", "oddsratio"))
  stopifnot("parameter score_color must be any of; 'minlogp', 'oddsratio', 'updown'" = length(score_xaxis) == 1 && score_xaxis %in% c("minlogp", "oddsratio", "updown"))
  stopifnot("parameter score_color_limits a single string designating the color-scale scope (value must be either 'source' or 'overall'), or 2 numeric values that hardcode the color-scale limits" =
              (length(score_color_limits) == 1 && score_color_limits %in% c("source", "overall")) || (length(score_color_limits) == 2 && all(is.numeric(score_color_limits) & is.finite(score_color_limits)))  )
  stopifnot("parameter max_ngenes must be a single positive number (or NA to disable)" = length(max_ngenes) == 1 && (is.na(max_ngenes) || (is.numeric(max_ngenes) & is.finite(max_ngenes) & max_ngenes > 0)))
  stopifnot("parameter topn must be a single positive number (or NA to disable)" = length(topn) == 1 && (is.na(topn) || (is.numeric(topn) & is.finite(topn) & topn > 0)))
  stopifnot("parameter padj_cutoff must be a single positive number (or NA to disable)" = length(padj_cutoff) == 1 && (is.na(padj_cutoff) || (is.numeric(padj_cutoff) & is.finite(padj_cutoff) & padj_cutoff > 0)))
  stopifnot("parameter only_reduced must be a single boolean value" = length(only_reduced) == 1 && only_reduced %in% c(TRUE, FALSE))
  stopifnot("parameter only_reduced should not be used in conjunction with other filtering parameters like 'max_ngenes', 'topn', 'padj_cutoff'" = (only_reduced == FALSE) || (only_reduced && is.na(max_ngenes) && is.na(topn) && is.na(padj_cutoff)))

  if(score_xaxis == "oddsratio") {
    stopifnot("parameter score_xaxis was set to 'oddsratio' so we expect column 'score_oddsratio' in the provided data table (with finite numeric values)" = "score_oddsratio" %in% colnames(x) && all(is.finite(x$score_oddsratio)) )
  }
  if(score_color == "oddsratio") {
    stopifnot("parameter score_color was set to 'oddsratio' so we expect column 'score_oddsratio' in the provided data table (with finite numeric values)" = "score_oddsratio" %in% colnames(x) && all(is.finite(x$score_oddsratio)))
  }
  if(score_color == "updown") {
    stopifnot("parameter score_color was set to 'updown' so we expect column 'score_type' in the provided data table AND it should contain 'effectsize_up' and/or 'effectsize_down' values (i.e. when using GOAT with parameter score_type='effectsize')" = "score_type" %in% colnames(x) && all(is.character(x$score_type)) )
    if(!any(x$score_type %in% c("effectsize_up","effectsize_down"))) {
      warning("parameter score_color was set to 'updown' but we did not find any 'effectsize_up' or 'effectsize_down' values in column 'score_type' and thus all elements will be color-coded in grey (NA). Try another color-coding, or use GOAT with parameter score_type='effectsize'")
    }
  }
  if(only_reduced) {
    stopifnot("'signif_and_reduced' column is not present in the data table, did you forget to use the reduce_genesets() function ?" = "signif_and_reduced" %in% colnames(x))
    x = x |> filter(signif_and_reduced %in% TRUE)
  }


  # optional user-provided filter criteria for the genesets to plot
  if(is.finite(max_ngenes)) {
    x = x |> filter(ngenes <= max_ngenes)
  }
  if(is.finite(padj_cutoff)) {
    x = x |> filter(pvalue_adjust <= padj_cutoff)
  } else {
    x = x |> filter(signif == TRUE)
  }
  if(is.finite(topn)) {
    x = x |>
      arrange(pvalue) |>
      group_by(source) |>
      mutate(rank__ = 1L:n()) |>
      ungroup() |>
      filter(rank__ <= topn) |>
      select(-rank__)
  }

  # return if there is no data to plot
  if(nrow(x) == 0) {
    warning("zero genesets match your filtering criteria, cannot generate plots")
    return()
  }

  # truncate geneset names
  label_nchar = 60
  x = x |> mutate(name = string_trunc_right(name, width = label_nchar))


  # define x-axis values
  xlab_title = color_title = "" # init
  x$xvalue = 0 # init column
  if(score_xaxis == "oddsratio") {
    xlab_title = "Significant gene odds ratio"
    x$xvalue = x$score_oddsratio
  } else {
    xlab_title = "-log10 adjusted p-value"
    x$xvalue = minlog10_fixzero(x$pvalue_adjust)
  }

  # analogous for color values
  clr_break_limits = NA # init val
  clr_break_limits_minmax = NA # sane default that defines minima/maxima
  x$clrvalue = 0 # init column
  if(score_color == "updown") {
    color_title = "Direction"
    x$clrvalue = "NA"
    x$clrvalue[x$score_type %in% "effectsize_up"] = "up"
    x$clrvalue[x$score_type %in% "effectsize_down"] = "down"
  } else if(score_color == "oddsratio") {
    color_title = "Odds ratio"
    x$clrvalue = x$score_oddsratio
    clr_break_limits_minmax = c(0.5, 2)
  } else {
    color_title = "-log10 p.adjust"
    clr_break_limits_minmax = c(0, 4)
    x$clrvalue = minlog10_fixzero(x$pvalue_adjust)
  }


  plotlist = list()

  # generate a plot for all 'source' elements
  for(src in sort(unique(x$source))) {
    # data subset for figure @ current source
    x_src = x |> filter(source == src)
    if(nrow(x_src) > 100) {
      warning(paste(src, "has more than 100 significant genesets, not creating plot. You can run this function and explicitly request to plot only the topn genesets, see documentation for 'topn' parameter"))
      next
    }

    # color scale
    if(length(score_color_limits) == 2) { # use hardcoded color scale limits if provided
      clr_break_limits = score_color_limits
    } else {
      if(score_color == "updown") {
        clr_break_limits = intersect(c("up", "down", "NA"), unique(x_src$clrvalue))
      } else {
        if(score_color_limits == "source") {
          clr_break_limits = c(min(clr_break_limits_minmax[1], min(x_src$clrvalue)), max(clr_break_limits_minmax[2], max(x_src$clrvalue)) )
        } else if(score_color_limits == "overall") {
          clr_break_limits = c(min(clr_break_limits_minmax[1], min(x$clrvalue)), max(clr_break_limits_minmax[2], max(x$clrvalue)) )
        } else {
          stop("unknown parameter for 'score_color_limits'")
        }
      }
    }

    # threshold colors to plot range
    if(length(clr_break_limits) == 2 && all(is.numeric(clr_break_limits))) {
      x_src$clrvalue[x_src$clrvalue < clr_break_limits[1]] = clr_break_limits[1]
      x_src$clrvalue[x_src$clrvalue > clr_break_limits[2]] = clr_break_limits[2]
    }

    # define y-axis plot order
    x_src = x_src |>
      arrange(desc(xvalue)) |>
      mutate(id = factor(id, levels = rev(unique(id))))

    # hack to always have same plot area for labels; pre-pad whitespace to fixed-width strings of N characters.
    # one could use stringi::stri_pad_left() or a wrapper thereof like stringr::str_pad(),
    # but instead we here use some naive vanilla R code to prevent an additional package dependency just for this.
    rows_pad = nchar(x_src$name) < label_nchar
    while(any(rows_pad)) {
      x_src$name[rows_pad] = paste0(" ", x_src$name[rows_pad])
      rows_pad = nchar(x_src$name) < label_nchar
    }

    # better implementation is available through labeling::extended() function, which is not included here to keep external dependencies to a minimum
    myxlim = c(0, max(pretty(c(0, max(1, max(x_src$xvalue))), n = 4)))
    size_breaks = c(10, 30, 100, 300, 1000)
    size_breaks = size_breaks[size_breaks <= max(x_src$ngenes)] # don't show "geneset size" legend elements that are larger than the actual data
    has_few_rows = nrow(x_src) < 10

    # create a very compact figure, reducing whitespace where possible
    p = NULL # declare val
    if(plot_type == "barplot") { # barplot
      p = ggplot2::ggplot(x_src, ggplot2::aes(x = xvalue, y = id, fill = clrvalue)) +
        ggplot2::geom_col(fill = "white") +
        ggplot2::geom_col()
    } else { # lollipop
      p = ggplot2::ggplot(x_src, ggplot2::aes(x = xvalue, y = id, size = ngenes, colour = clrvalue)) +
        ggplot2::geom_segment(ggplot2::aes(x=0, xend=xvalue, y=id, yend=id), colour = "darkgrey", size = 0.5) +
        ggplot2::geom_point(colour = "white", show.legend = FALSE) + # to plot alpha-colors as-is, first plot as white dot to overwrite panel + segment
        ggplot2::geom_point() +
        ggplot2::scale_size(range = c(2,6), limits = c(min(10, x_src$ngenes), max(1000, x_src$ngenes)), breaks = size_breaks, trans = "log10", name = "Count") +
        ggplot2::guides(size = ggplot2::guide_legend(keywidth = ggplot2::unit(0.1, "lines"), keyheight = ggplot2::unit(0.1, "lines"), order = 0))
    }

    p = p +
      ggplot2::coord_cartesian(clip = "off") +
      ggplot2::scale_x_continuous(limits = myxlim, expand = ggplot2::expansion(mult = 0)) +
      ggplot2::scale_y_discrete(breaks = levels(x_src$id), labels = x_src$name[match(levels(x_src$id), x_src$id)]) +
      ggplot2::labs(x = xlab_title, y = "") +
      ggplot2::theme_bw()

    if(score_color == "updown") {
      p = p +
        # ifelse(plot_type == "barplot", color_title, "") # remove title for "up/down" to reduce required space
        ggplot2::scale_colour_manual(values = c("up"=score_color_updown[1], "down"=score_color_updown[2], "NA"="#7F7F7F"), breaks = clr_break_limits, name = color_title, aesthetics = c("colour", "fill")) +
        ggplot2::guides(colour = ggplot2::guide_legend(order = 1, override.aes = list(size = 3)))
    } else {
      p = p + ggplot2::scale_colour_viridis_c(direction = 1, alpha = 0.7, limits = clr_break_limits, name = color_title, aesthetics = c("colour", "fill"))

      # deal with legends for very small plots
      # there seems to be a bug in ggplot2 that causes the legend keys to change based on the data / legend-element-count (first size then color legend, or vice versa)
      # so we hardcode the guides() here to always flow in the same ordering and use the 'order' parameter
      if(has_few_rows) {
        p = p + ggplot2::guides(
          colour = ggplot2::guide_colourbar(barwidth = ggplot2::unit(4, "lines"), order = 1, title.vjust = 1, label.theme = ggplot2::element_text(angle = -90, vjust = 0.5, size = 9))
        )
      } else {
        p = p + ggplot2::guides(colour = ggplot2::guide_colourbar(barheight = ggplot2::unit(4, "lines"), order = 1))
      }
    }

    p = p +
      ggplot2::theme(
        panel.grid.major.y = ggplot2::element_blank(),
        panel.grid.minor.y = ggplot2::element_blank(),
        panel.border = ggplot2::element_blank(),
        axis.line.x.bottom = ggplot2::element_line(size=0.5, colour = "black"),
        axis.ticks.y.left = ggplot2::element_blank(),
        axis.title.x.bottom = ggplot2::element_text(size = 11),
        legend.spacing.y = ggplot2::unit(0.1, "lines"), # reduce spacing between legend keys and items
        legend.title = ggplot2::element_text(size = 11),
        plot.margin = ggplot2::unit(c(0.5,1,0.5,0.5), "lines") # add some extra whitespace on the right side of the plot
      )

    if(has_few_rows) {
      p = p + ggplot2::theme(
        legend.position = "bottom",
        legend.margin = ggplot2::margin(2, 0, 0, 0), # reduce space between legend groups
      )
      if(plot_type == "lollipop") {
        p = p + ggplot2::theme(legend.justification = c(1, 0.5))
      }
    }

    if(show_pvalue) {
      p = p + ggplot2::geom_text(ggplot2::aes(label = sprintf("%.1e", pvalue_adjust)), size = 2.5, colour = "black", hjust = 0, nudge_x = max(x_src$xvalue)/15, show.legend = FALSE)
    }

    plotlist[[src]] = p

    # fiddle parameters to compute plot height;
    # <space needed for axis + vertical whitespace = independent of data>  +  <number of rows in the plot * X>  +  <optional padding if the legend is shown at the bottom>
    if(!is.na(output_dir)) {
      f = sprintf(
        "%s/goat_%s_%s%s%s%s%s_x=%s_clr=%s.pdf",
        output_dir,
        plot_type,
        gsub("[^a-zA-Z0-9 _-]+", "", src),
        ifelse(only_reduced, "_reduced", ""),
        ifelse(is.finite(max_ngenes), paste0("_", max_ngenes, "max_ngenes"), ""),
        ifelse(is.finite(topn), paste0("_", topn, "topn"), ""),
        ifelse(is.finite(padj_cutoff), paste0("_", padj_cutoff, "padj_cutoff"), ""),
        score_xaxis,
        score_color
      )

      if(file.exists(f)) {
        file.remove(f)
        if(file.exists(f)) {
          stop(paste("failed to overwrite existing file when creating a new plot:", f))
        }
      }

      ggplot2::ggsave(filename = f, plot = p, width = 7, height = 0.75 + nrow(x_src) * 0.14 + (has_few_rows * 0.7))
    }
  }

  if(is.na(output_dir)) {
    return(plotlist)
  }
}



#' plot geneset distance matrix as a network
#'
#'
#' @param clusters result from `cluster_genesets`
#' @param src source property (e.g. "GO_CC")
#' @param show_clusters boolean value
#' @param show_text boolean value
#' @param topn_edges topN edges to retain per geneset (typically 5~8)
#' @param clr_default default color for the network, used only when `show_clusters` is set to `FALSE`
#' @param node_color_palette function with 1 parameter, N, that returns N colors (default is goat function `gg_color_hue`)
#' @return a ggplot2 object
#' @export
plot_network = function(clusters, src, show_clusters = TRUE, show_text = FALSE, topn_edges = 5, clr_default = "#29b6f6", node_color_palette = goat::gg_color_hue) {
  source = cl = weight = genecount = cluster = label = NULL
  check_dependency("igraph", "creating network plots")
  check_dependency("ggraph", "creating network plots")
  ## input validation
  # TODO

  ## prep data
  plot_title = src
  gs = clusters$genesets |> filter(source == src)
  mat = clusters$similarity[[src]]
  diag(mat) = 0

  ## from matrix to edgelist
  el = NULL
  for(i in 1:nrow(mat)) {
    i_cols = utils::head(order(mat[i,], decreasing = TRUE), topn_edges)
    el = bind_rows(el, data.frame(from = rownames(mat)[i], to = colnames(mat)[i_cols], weight = mat[i,i_cols]) )
  }
  el = el |> filter(weight > 0.1)
  # transform weights to increase importance
  el$weight = el$weight^2

  ## create graph
  g = igraph::graph_from_data_frame(el)
  igraph::V(g)$genecount = clusters$genesets$ngenes[match(igraph::V(g)$name, clusters$genesets$id)]
  igraph::V(g)$label = clusters$genesets$name[match(igraph::V(g)$name, clusters$genesets$id)]

  ## layout
  gl = igraph::layout_with_graphopt(g, charge = 0.05, spring.constant = 10)
  # gl = igraph::layout_with_fr(g, niter = 1000)

  ## cluster
  g_clust = igraph::cluster_optimal(g) # slow but reliable
  # g_clust = igraph::cluster_infomap(g)
  tmp = igraph::membership(g_clust)
  # cluster is assumed to be a continuous set of integers from 1 to N
  igraph::V(g)$cluster = as.integer(tmp[match(igraph::V(g)$name, names(tmp))])

  ## color
  df_clusters = data.frame(cl = unique(igraph::V(g)$cluster)) |> arrange(cl)
  df_clusters$clr = df_clusters$fill = node_color_palette(nrow(df_clusters))
  df_clusters$fill = lighten_color(df_clusters$fill, 0.1)
  df_clusters$clr = darken_color(df_clusters$clr, 0.1)
  # df_clusters$clr = node_color_palette(nrow(df_clusters))
  # df_clusters$fill = lighten_color(df_clusters$clr, 0.25)


  p = ggraph::ggraph(g, layout = gl) +
    ggraph::geom_edge_link0(ggplot2::aes(width = I(0.5 + weight / 10),
                                         colour = I(grDevices::colorRampPalette(c("#ECEFF1", "#CFD8DC"))(10)[ cut(weight, breaks = (0:10)/10, include.lowest = TRUE, labels = FALSE) ]) ),
                            show.legend = FALSE)

  if(show_clusters) {
    if(show_text) {
      p = p + ggraph::geom_node_point(ggplot2::aes(size = genecount, colour = I(df_clusters$fill[cluster])), show.legend = FALSE)
    } else {
      p = p + ggraph::geom_node_point(ggplot2::aes(size = genecount, colour = I(df_clusters$clr[cluster]), fill = I(df_clusters$fill[cluster])), shape = 21, show.legend = FALSE)
    }
  } else {
    p = p + ggraph::geom_node_point(ggplot2::aes(size = genecount), colour = clr_default, show.legend = FALSE)
  }

  p = p +
    ggplot2::scale_size_continuous(trans = "log10", range = c(2, 6)) + # range = min/max node size
    ggplot2::guides(size = "none") +
    ggplot2::labs(title = plot_title) +
    ggplot2::theme_void() + # note; ggraph::theme_graph() is bugged
    ggplot2::theme(legend.position = "bottom",
                   legend.title = ggplot2::element_blank())

  if(show_text) {
    p = p + ggraph::geom_node_text(ggplot2::aes(label = label), size = 2)
  }

  return(p)
}

Try the goat package in your browser

Any scripts or data that you put into this service are public.

goat documentation built on April 3, 2025, 6:05 p.m.