R/save_genesets.R
In goat: Gene Set Analysis Using the Gene Set Ordinal Association Test

Documented in save_genesets

#' Write a geneset table to file.
#'
#'
#' @description
#'  Works for any filtered geneset table generated by this package, e.g. results from any of these functions;
#'  `filter_genesets()`, `test_genesets()` or `simplify_genesets()`.
#'
#' The genelist table is required to 1) lookup gene symbols and 2) determine the ordering of genes within-geneset.
#' The latter makes it such that the output table shows the most important genes (in context of user's data) first.
#'
#' All output columns that list gene identifiers or symbols are trimmed to 25000 characters. Depending on your input
#' gene identifier type and dataset filtering settings (e.g. max geneset size), this might lead to some truncation.
#' For example, 15 characters per ensembl gene ID (if these are used in provided genesets instead of default NCBI
#' Entrez) and 1 char as delimiter, only the top ~1500 ensembl gene identifiers are included. If your geneset
#' filtering allows for larger genesets and you want to check if any gene identifiers are lost upon saving,
#' keep an eye out for the trailing ellipsis ('...'). Because genes in each geneset of the output table are sorted
#' by order of importance as indicated in the genelist table, this shouldn't be an issue in typical use-cases
#' where the output table is to check the topN most important/significant genes of a geneset of interest.
#'
#' @param x result from `filter_genesets()`, `test_genesets()` or `simplify_genesets()`. When saving the latter, output will include clusters assigned to each significant geneset
#' @param genelist same as provided for `test_genesets()`. A column named 'symbol' is required, it'll be used to pretty-print gene symbols per geneset in the output table
#' @param filename full path to the output file. Supported file extensions; csv, tsv, xlsx. Optionally, set to NA to not write to disk and return the result table instead
#' @param arrange_genes set to TRUE (default) to arrange the genelist table by best p-value on top (if column 'pvalue' exists), alternatively by descending absolute effectsize (if no pvalue but effectize is available). Set FALSE to use sorting of the genelist table as-is
#' @return if `filename` is `NA`, returns the validated and formatted geneset result table. Otherwise, writes the table to file and does not return a value
#' @export
save_genesets = function(x, genelist, filename, arrange_genes = TRUE) {
  pvalue = effectsize = genes = genes_signif = NULL # fix invisible bindings R package NOTE
  stopifnot("arrange_genes parameter must be TRUE or FALSE" = length(arrange_genes) == 1 && arrange_genes %in% c(TRUE, FALSE))
  stopifnot("filename parameter must be a non-empty string, or NA to return the result table and not write to file at all" = length(filename) == 1 && is.na(filename) || (is.character(filename) && nchar(filename) > 4))
  # check output dir & right to delete/overwrite output file if it already exists
  filename_log = NA
  if(!is.na(filename)) {
    if(!dir.exists(dirname(filename))) {
      stop(paste0("'", filename, "' points to a directory that does not exist"))
    }
    if(file.exists(filename) && !file.remove(filename)) {
      stop(paste0("'", filename, "' is an existing file, but unable to overwrite. Is it currently opened?"))
    }
    filename_log = paste0(filename, ".log")
    if(file.exists(filename_log) && !file.remove(filename_log)) {
      stop(paste0("'", filename_log, "' is an existing file, but unable to overwrite. Is it currently opened?"))
    }
  }

  is_xlsx = grepl("\\.xlsx$", filename, ignore.case = TRUE)
  is_tsv = grepl("\\.tsv$", filename, ignore.case = TRUE)
  is_csv = grepl("\\.csv$", filename, ignore.case = TRUE)
  stopifnot("unknown file extension; filename should should end with '.xslx' or '.tsv' or '.csv'" = is.na(filename) || is_xlsx || is_tsv || is_csv)

  # note that we again repeat input validation of genesets and genelist to enforce integer ID type -> otherwise, matching/lookup might fail
  genelist = validate_genelist(genelist)
  # additional requirement; also require a 'symbol' column @ genelist
  stopifnot("genelist table should contain a character column with symbols (no NA or empty strings allowed)" = "symbol" %in% colnames(genelist) && is.character(genelist$symbol) && !anyNA(genelist$symbol))
  # validate genesets parameter
  stopifnot("parameter x must be a geneset table or a list that contains one as x$genesets" = is.data.frame(x) || (is.list(x) && "genesets" %in% names(x) && is.data.frame(x$genesets)))
  geneset_test_result = x
  if(!is.data.frame(x)) {
    geneset_test_result = x$genesets
  }
  geneset_test_result = validate_genesets(geneset_test_result)
  settings = attributes(geneset_test_result)$settings


  # reference genelist table, genes are sorted in order of importance
  # respective genes per geneset will be stored in the same order (i.e. this is the reference table)
  ref = genelist
  if(arrange_genes) {
    if("pvalue" %in% colnames(ref)) {
      ref = ref |> arrange(pvalue)
    } else {
      if("effectsize" %in% colnames(ref)) {
        ref = ref |> arrange(desc(abs(effectsize)))
      } else {
        warning("save_genesets(); arrange_genes = TRUE, but genelist table has no pvalue nor effectsize column so no sorting is applied")
      }
    }
  }


  # order genes per geneset in same order as reference table
  result = geneset_test_result |>
    mutate(
      genes = lapply(genes, function(x) x[order(match(x, ref$gene))]),
      genes_signif = lapply(genes_signif, function(x) x[order(match(x, ref$gene))])
    ) |>
    tibble::add_column(genes_symbol = "", genes_signif_symbol = "", .after = "genes_signif")

  # efficiently gene symbols by unlisting, vectorized matching, and relisting
  gene_to_symbol = function(l, ref) {
    ul = unlist(l, recursive = TRUE, use.names = FALSE)
    i = match(ul, ref$gene)
    stopifnot("some genes in the genesets table could not be matched to the genelist table" = !anyNA(i))
    ul = ref$symbol[i]
    result = utils::relist(ul, skeleton = utils::as.relistable(l))
    unlist(lapply(result, paste, collapse = ","), recursive = FALSE, use.names = FALSE)
  }

  result$genes_symbol = gene_to_symbol(result$genes, ref)
  result$genes_signif_symbol = gene_to_symbol(result$genes_signif, ref)
  # hardcoded trimming of all gene columns to 25k characters. Note that the Excel limit is 32767 per cell
  nchar_limit = 25000
  result$genes_symbol = string_trunc_right(result$genes_symbol, width = nchar_limit)
  result$genes_signif_symbol = string_trunc_right(result$genes_signif_symbol, width = nchar_limit)
  # collapse gene identifiers as well + truncate
  result$genes = string_trunc_right(unlist(lapply(result$genes, paste, collapse = ","), recursive = FALSE, use.names = FALSE), width = nchar_limit)
  result$genes_signif = string_trunc_right(unlist(lapply(result$genes_signif, paste, collapse = ","), recursive = FALSE, use.names = FALSE), width = nchar_limit)

  result = result |>
    # remove any remaining list-type columns that the user may have added
    select( - tidyselect::where(is.list)) |>
    # arrange by source, then pvalue
    arrange(source, pvalue)


  if(is.na(filename)) {
    return(result)
  } else {
    # write geneset table to file
    if(is_xlsx) {
      writexl::write_xlsx(result, path = filename, col_names = TRUE, format_headers = TRUE)
    }
    if(is_tsv) {
      utils::write.table(result, file = filename, quote = TRUE, sep = "\t", na = "", row.names = FALSE, col.names = TRUE)
    }
    if(is_csv) {
      utils::write.table(result, file = filename, quote = TRUE, sep = ",", na = "", row.names = FALSE, col.names = TRUE)
    }

    # if object has settings, write to file
    if(length(settings) > 0) {
      writeLines(
        paste0(
          "\n", goat_logo(),
          "\nGOAT R package version: ", goat_version(),
          "\n\nAnalysis date: ", format(Sys.time(), format = "%Y-%m-%d %H:%M"),
          "\n\nFilter settings:\n", paste(settings, collapse = "\n"),
          "\n\nMaterials & Methods:\n\n", methods_text(x, genelist, settings)
        ),
        filename_log
      )
    }

    # in addition to the user-friendly csv/tsv/xlsx output table, write objects to RData file
    # (so users can easily load previous results, from possibly another GOAT version, into R and apply plotting functions without re-running GOAT)
    result = x
    save(result, genelist, file = sub("\\.[^.]+$", ".rda", filename, ignore.case = TRUE), compress = TRUE)
  }
}



#' create M&M text
#'
#' @param x see `save_genesets()`
#' @param genelist see `save_genesets()`
#' @param settings settings attribute attached to the genesets result table
#' @noRd
methods_text = function(x, genelist, settings) {
  usource = unique(x$source)
  settings = paste(settings, collapse=" ")
  # extract parameters from settings string
  is_genesets_go = grepl("load_genesets_go", settings, ignore.case = TRUE)
  is_genesets_syngo = grepl("syngo", settings, ignore.case = TRUE)

  get_setting = function(attr, s, default = " *** unknown, check code/logfile *** ") {
    x = gsub(paste0(".*", attr, " *= *'*([^' ,;]+).*"), "\\1", s)
    if(x == s) {
      x = default
    }
    return(x)
  }

  method = get_setting("test_genesets *\\( *method", settings)
  padj_method = get_setting("padj_method", settings)
  padj_sources = get_setting("padj_sources", settings)
  padj_cutoff = get_setting("padj_cutoff", settings)
  score_type = get_setting("score_type", settings, "")
  min_overlap = get_setting("min_overlap", settings)
  max_overlap = get_setting("max_overlap", settings)
  max_overlap_fraction = get_setting("max_overlap_fraction", settings)

  if(score_type == "") {
    score_type = paste0(' genes and was')
  } else {
    score_type = paste0(' genes and its ', score_type, '-derived gene scores were')
  }


  # cite R package
  result = paste0(
    'Gene set analyses\n',
    'The GOAT R package [PMID:38898151] (version ', goat_version(),
    ', https://github.com/ftwkoopmans/goat ) was used to perform gene set enrichment analyses',
    # algorithm / method
    ' with the ', toupper(method), ' algorithm'
  )

  # genesets
  if (is_genesets_go) {
    result = paste0(result, ' using gene sets from the Gene Ontology database [PMID:10802651] (', x$source_version[1], '). ')
  } else if (is_genesets_syngo) {
    result = paste0(result, ' using gene sets from the SynGO knowledgebase [PMID:31171447] (', x$source_version[1], '). ')
  } else {
    result = paste0(result, ' using gene sets from "', x$source_version[1], '". ')
  }

  # filtering settings
  paste0(
    result, 'The input gene list contained ', nrow(genelist),
    score_type,
    ' used to test for enriched gene sets that contained at least ',
    min_overlap , ' and at most ' , max_overlap , ' genes (or ',
    round(as.numeric(max_overlap_fraction)*100),'% of the gene list, whichever was smaller) that overlapped with the input gene list. ',
    # multiple testing correction
    'Multiple testing correction was ',
    ifelse(length(usource) > 1,
           paste0('independently applied per gene set "source" (i.e. ', paste(usource, collapse=", "), ')'),
           'applied'),
    ifelse(tolower(padj_method) == "bonferroni",
           ' using Bonferroni adjustment',
           ' using the Benjamini-Hochberg procedure (FDR)'),
    ifelse(padj_sources == "TRUE" && length(usource) > 1,
           paste0(' and subsequently, all p-values were adjusted (again) using Bonferroni adjustment to account for ',
                  length(usource), ' separate tests across "sources". '),
           '. '),
    'The significance threshold for adjusted p-values was set to ', padj_cutoff, '.\n\n'
  )
}