Nothing
#' Write a geneset table to file.
#'
#'
#' @description
#' Works for any filtered geneset table generated by this package, e.g. results from any of these functions;
#' `filter_genesets()`, `test_genesets()` or `simplify_genesets()`.
#'
#' The genelist table is required to 1) lookup gene symbols and 2) determine the ordering of genes within-geneset.
#' The latter makes it such that the output table shows the most important genes (in context of user's data) first.
#'
#' All output columns that list gene identifiers or symbols are trimmed to 25000 characters. Depending on your input
#' gene identifier type and dataset filtering settings (e.g. max geneset size), this might lead to some truncation.
#' For example, 15 characters per ensembl gene ID (if these are used in provided genesets instead of default NCBI
#' Entrez) and 1 char as delimiter, only the top ~1500 ensembl gene identifiers are included. If your geneset
#' filtering allows for larger genesets and you want to check if any gene identifiers are lost upon saving,
#' keep an eye out for the trailing ellipsis ('...'). Because genes in each geneset of the output table are sorted
#' by order of importance as indicated in the genelist table, this shouldn't be an issue in typical use-cases
#' where the output table is to check the topN most important/significant genes of a geneset of interest.
#'
#' @param x result from `filter_genesets()`, `test_genesets()` or `simplify_genesets()`. When saving the latter, output will include clusters assigned to each significant geneset
#' @param genelist same as provided for `test_genesets()`. A column named 'symbol' is required, it'll be used to pretty-print gene symbols per geneset in the output table
#' @param filename full path to the output file. Supported file extensions; csv, tsv, xlsx. Optionally, set to NA to not write to disk and return the result table instead
#' @param arrange_genes set to TRUE (default) to arrange the genelist table by best p-value on top (if column 'pvalue' exists), alternatively by descending absolute effectsize (if no pvalue but effectize is available). Set FALSE to use sorting of the genelist table as-is
#' @return if `filename` is `NA`, returns the validated and formatted geneset result table. Otherwise, writes the table to file and does not return a value
#' @export
save_genesets = function(x, genelist, filename, arrange_genes = TRUE) {
pvalue = effectsize = genes = genes_signif = NULL # fix invisible bindings R package NOTE
stopifnot("arrange_genes parameter must be TRUE or FALSE" = length(arrange_genes) == 1 && arrange_genes %in% c(TRUE, FALSE))
stopifnot("filename parameter must be a non-empty string, or NA to return the result table and not write to file at all" = length(filename) == 1 && is.na(filename) || (is.character(filename) && nchar(filename) > 4))
# check output dir & right to delete/overwrite output file if it already exists
filename_log = NA
if(!is.na(filename)) {
if(!dir.exists(dirname(filename))) {
stop(paste0("'", filename, "' points to a directory that does not exist"))
}
if(file.exists(filename) && !file.remove(filename)) {
stop(paste0("'", filename, "' is an existing file, but unable to overwrite. Is it currently opened?"))
}
filename_log = paste0(filename, ".log")
if(file.exists(filename_log) && !file.remove(filename_log)) {
stop(paste0("'", filename_log, "' is an existing file, but unable to overwrite. Is it currently opened?"))
}
}
is_xlsx = grepl("\\.xlsx$", filename, ignore.case = TRUE)
is_tsv = grepl("\\.tsv$", filename, ignore.case = TRUE)
is_csv = grepl("\\.csv$", filename, ignore.case = TRUE)
stopifnot("unknown file extension; filename should should end with '.xslx' or '.tsv' or '.csv'" = is.na(filename) || is_xlsx || is_tsv || is_csv)
# note that we again repeat input validation of genesets and genelist to enforce integer ID type -> otherwise, matching/lookup might fail
genelist = validate_genelist(genelist)
# additional requirement; also require a 'symbol' column @ genelist
stopifnot("genelist table should contain a character column with symbols (no NA or empty strings allowed)" = "symbol" %in% colnames(genelist) && is.character(genelist$symbol) && !anyNA(genelist$symbol))
# validate genesets parameter
stopifnot("parameter x must be a geneset table or a list that contains one as x$genesets" = is.data.frame(x) || (is.list(x) && "genesets" %in% names(x) && is.data.frame(x$genesets)))
geneset_test_result = x
if(!is.data.frame(x)) {
geneset_test_result = x$genesets
}
geneset_test_result = validate_genesets(geneset_test_result)
settings = attributes(geneset_test_result)$settings
# reference genelist table, genes are sorted in order of importance
# respective genes per geneset will be stored in the same order (i.e. this is the reference table)
ref = genelist
if(arrange_genes) {
if("pvalue" %in% colnames(ref)) {
ref = ref |> arrange(pvalue)
} else {
if("effectsize" %in% colnames(ref)) {
ref = ref |> arrange(desc(abs(effectsize)))
} else {
warning("save_genesets(); arrange_genes = TRUE, but genelist table has no pvalue nor effectsize column so no sorting is applied")
}
}
}
# order genes per geneset in same order as reference table
result = geneset_test_result |>
mutate(
genes = lapply(genes, function(x) x[order(match(x, ref$gene))]),
genes_signif = lapply(genes_signif, function(x) x[order(match(x, ref$gene))])
) |>
tibble::add_column(genes_symbol = "", genes_signif_symbol = "", .after = "genes_signif")
# efficiently gene symbols by unlisting, vectorized matching, and relisting
gene_to_symbol = function(l, ref) {
ul = unlist(l, recursive = TRUE, use.names = FALSE)
i = match(ul, ref$gene)
stopifnot("some genes in the genesets table could not be matched to the genelist table" = !anyNA(i))
ul = ref$symbol[i]
result = utils::relist(ul, skeleton = utils::as.relistable(l))
unlist(lapply(result, paste, collapse = ","), recursive = FALSE, use.names = FALSE)
}
result$genes_symbol = gene_to_symbol(result$genes, ref)
result$genes_signif_symbol = gene_to_symbol(result$genes_signif, ref)
# hardcoded trimming of all gene columns to 25k characters. Note that the Excel limit is 32767 per cell
nchar_limit = 25000
result$genes_symbol = string_trunc_right(result$genes_symbol, width = nchar_limit)
result$genes_signif_symbol = string_trunc_right(result$genes_signif_symbol, width = nchar_limit)
# collapse gene identifiers as well + truncate
result$genes = string_trunc_right(unlist(lapply(result$genes, paste, collapse = ","), recursive = FALSE, use.names = FALSE), width = nchar_limit)
result$genes_signif = string_trunc_right(unlist(lapply(result$genes_signif, paste, collapse = ","), recursive = FALSE, use.names = FALSE), width = nchar_limit)
result = result |>
# remove any remaining list-type columns that the user may have added
select( - tidyselect::where(is.list)) |>
# arrange by source, then pvalue
arrange(source, pvalue)
if(is.na(filename)) {
return(result)
} else {
# write geneset table to file
if(is_xlsx) {
writexl::write_xlsx(result, path = filename, col_names = TRUE, format_headers = TRUE)
}
if(is_tsv) {
utils::write.table(result, file = filename, quote = TRUE, sep = "\t", na = "", row.names = FALSE, col.names = TRUE)
}
if(is_csv) {
utils::write.table(result, file = filename, quote = TRUE, sep = ",", na = "", row.names = FALSE, col.names = TRUE)
}
# if object has settings, write to file
if(length(settings) > 0) {
writeLines(
paste0(
"\n", goat_logo(),
"\nGOAT R package version: ", goat_version(),
"\n\nAnalysis date: ", format(Sys.time(), format = "%Y-%m-%d %H:%M"),
"\n\nFilter settings:\n", paste(settings, collapse = "\n"),
"\n\nMaterials & Methods:\n\n", methods_text(x, genelist, settings)
),
filename_log
)
}
# in addition to the user-friendly csv/tsv/xlsx output table, write objects to RData file
# (so users can easily load previous results, from possibly another GOAT version, into R and apply plotting functions without re-running GOAT)
result = x
save(result, genelist, file = sub("\\.[^.]+$", ".rda", filename, ignore.case = TRUE), compress = TRUE)
}
}
#' create M&M text
#'
#' @param x see `save_genesets()`
#' @param genelist see `save_genesets()`
#' @param settings settings attribute attached to the genesets result table
#' @noRd
methods_text = function(x, genelist, settings) {
usource = unique(x$source)
settings = paste(settings, collapse=" ")
# extract parameters from settings string
is_genesets_go = grepl("load_genesets_go", settings, ignore.case = TRUE)
is_genesets_syngo = grepl("syngo", settings, ignore.case = TRUE)
get_setting = function(attr, s, default = " *** unknown, check code/logfile *** ") {
x = gsub(paste0(".*", attr, " *= *'*([^' ,;]+).*"), "\\1", s)
if(x == s) {
x = default
}
return(x)
}
method = get_setting("test_genesets *\\( *method", settings)
padj_method = get_setting("padj_method", settings)
padj_sources = get_setting("padj_sources", settings)
padj_cutoff = get_setting("padj_cutoff", settings)
score_type = get_setting("score_type", settings, "")
min_overlap = get_setting("min_overlap", settings)
max_overlap = get_setting("max_overlap", settings)
max_overlap_fraction = get_setting("max_overlap_fraction", settings)
if(score_type == "") {
score_type = paste0(' genes and was')
} else {
score_type = paste0(' genes and its ', score_type, '-derived gene scores were')
}
# cite R package
result = paste0(
'Gene set analyses\n',
'The GOAT R package [PMID:38898151] (version ', goat_version(),
', https://github.com/ftwkoopmans/goat ) was used to perform gene set enrichment analyses',
# algorithm / method
' with the ', toupper(method), ' algorithm'
)
# genesets
if (is_genesets_go) {
result = paste0(result, ' using gene sets from the Gene Ontology database [PMID:10802651] (', x$source_version[1], '). ')
} else if (is_genesets_syngo) {
result = paste0(result, ' using gene sets from the SynGO knowledgebase [PMID:31171447] (', x$source_version[1], '). ')
} else {
result = paste0(result, ' using gene sets from "', x$source_version[1], '". ')
}
# filtering settings
paste0(
result, 'The input gene list contained ', nrow(genelist),
score_type,
' used to test for enriched gene sets that contained at least ',
min_overlap , ' and at most ' , max_overlap , ' genes (or ',
round(as.numeric(max_overlap_fraction)*100),'% of the gene list, whichever was smaller) that overlapped with the input gene list. ',
# multiple testing correction
'Multiple testing correction was ',
ifelse(length(usource) > 1,
paste0('independently applied per gene set "source" (i.e. ', paste(usource, collapse=", "), ')'),
'applied'),
ifelse(tolower(padj_method) == "bonferroni",
' using Bonferroni adjustment',
' using the Benjamini-Hochberg procedure (FDR)'),
ifelse(padj_sources == "TRUE" && length(usource) > 1,
paste0(' and subsequently, all p-values were adjusted (again) using Bonferroni adjustment to account for ',
length(usource), ' separate tests across "sources". '),
'. '),
'The significance threshold for adjusted p-values was set to ', padj_cutoff, '.\n\n'
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.