#' Visually enhances a functional enrichment result table
#'
#' Creates a visual summary for the results of a functional enrichment analysis,
#' by displaying also the components of each gene set and their expression change
#' in the contrast of interest
#'
#' @param res_enrich A `data.frame` object, storing the result of the functional
#' enrichment analysis. See more in the main function, [GeneTonic()], to check the
#' formatting requirements (a minimal set of columns should be present).
#' @param res_de A `DESeqResults` object.
#' @param annotation_obj A `data.frame` object with the feature annotation.
#' information, with at least two columns, `gene_id` and `gene_name`.
#' @param gtl A `GeneTonic`-list object, containing in its slots the arguments
#' specified above: `dds`, `res_de`, `res_enrich`, and `annotation_obj` - the names
#' of the list _must_ be specified following the content they are expecting
#' @param n_gs Integer value, corresponding to the maximal number of gene sets to
#' be displayed.
#' @param gs_ids Character vector, containing a subset of `gs_id` as they are
#' available in `res_enrich`. Lists the gene sets to be displayed.
#' @param chars_limit Integer, number of characters to be displayed for each
#' geneset name.
#' @param plot_style Character value, one of "point" or "ridgeline". Defines the
#' style of the plot to summarize visually the table.
#' @param ridge_color Character value, one of "gs_id" or "gs_score", controls the
#' fill color of the ridge lines. If selecting "gs_score", the `z_score` column
#' must be present in the enrichment results table - see `get_aggrscores()` to do
#' that.
#' @param plot_title Character string, used as title for the plot. If left `NULL`,
#' it defaults to a general description of the plot and of the DE contrast.
#'
#' @return A `ggplot` object
#' @export
#'
#' @examples
#'
#' library("macrophage")
#' library("DESeq2")
#' library("org.Hs.eg.db")
#' library("AnnotationDbi")
#'
#' # dds object
#' data("gse", package = "macrophage")
#' dds_macrophage <- DESeqDataSet(gse, design = ~ line + condition)
#' rownames(dds_macrophage) <- substr(rownames(dds_macrophage), 1, 15)
#' dds_macrophage <- estimateSizeFactors(dds_macrophage)
#'
#' # annotation object
#' anno_df <- data.frame(
#' gene_id = rownames(dds_macrophage),
#' gene_name = mapIds(org.Hs.eg.db,
#' keys = rownames(dds_macrophage),
#' column = "SYMBOL",
#' keytype = "ENSEMBL"
#' ),
#' stringsAsFactors = FALSE,
#' row.names = rownames(dds_macrophage)
#' )
#'
#' # res object
#' data(res_de_macrophage, package = "GeneTonic")
#' res_de <- res_macrophage_IFNg_vs_naive
#'
#' # res_enrich object
#' data(res_enrich_macrophage, package = "GeneTonic")
#' res_enrich <- shake_topGOtableResult(topgoDE_macrophage_IFNg_vs_naive)
#' res_enrich <- get_aggrscores(res_enrich, res_de, anno_df)
#' enhance_table(res_enrich,
#' res_de,
#' anno_df,
#' n_gs = 10
#' )
#'
#' # using the ridge line as a style, also coloring by the Z score
#' res_enrich_withscores <- get_aggrscores(
#' res_enrich,
#' res_de,
#' anno_df
#' )
#' enhance_table(res_enrich_withscores,
#' res_de,
#' anno_df,
#' n_gs = 10,
#' plot_style = "ridgeline",
#' ridge_color = "gs_score"
#' )
enhance_table <- function(res_enrich,
res_de,
annotation_obj,
gtl = NULL,
n_gs = 50,
gs_ids = NULL,
chars_limit = 70,
plot_style = c("point", "ridgeline"),
ridge_color = c("gs_id", "gs_score"),
plot_title = NULL) {
if (!is.null(gtl)) {
checkup_gtl(gtl)
dds <- gtl$dds
res_de <- gtl$res_de
res_enrich <- gtl$res_enrich
annotation_obj <- gtl$annotation_obj
}
plot_style <- match.arg(plot_style, c("point", "ridgeline"))
ridge_color <- match.arg(ridge_color, c("gs_id", "gs_score"))
n_gs <- min(n_gs, nrow(res_enrich))
gs_to_use <- unique(
c(
res_enrich$gs_id[seq_len(n_gs)], # the ones from the top
gs_ids[gs_ids %in% res_enrich$gs_id] # the ones specified from the custom list
)
)
gs_fulllist <- lapply(gs_to_use, function(gs) {
genes_thisset <- res_enrich[gs, "gs_genes"]
genes_thisset <- unlist(strsplit(genes_thisset, ","))
genesid_thisset <- annotation_obj$gene_id[match(genes_thisset, annotation_obj$gene_name)]
# removing the genes not finding a match in the annotation
no_anno_match <- is.na(genesid_thisset)
genes_thisset_anno <- genes_thisset[!no_anno_match]
genesid_thisset_anno <- genesid_thisset[!no_anno_match]
# ... and informing on which genes might be troublesome
if (any(no_anno_match)) {
message("Could not find a match in the annotation for some genes. ",
"Please inspect your results in detail for geneset ",
gs,
" the gene(s) named: ",
paste0(genes_thisset[no_anno_match], collapse = ", "))
}
res_thissubset <- res_de[genesid_thisset_anno, ]
res_thissubset <- as.data.frame(res_thissubset)
res_thissubset$gene_name <- genes_thisset_anno
res_thissubset$gs_desc <- as.factor(res_enrich[gs, "gs_description"])
res_thissubset$gs_id <- res_enrich[gs, "gs_id"]
# return(as.data.frame(res_thissubset))
return(res_thissubset)
})
gs_fulllist <- do.call(rbind, gs_fulllist)
# message(dim(gs_fulllist)[1])
this_contrast <- (sub(".*p-value: (.*)", "\\1", mcols(res_de, use.names = TRUE)["pvalue", "description"]))
# to have first rows viewed on top
gs_fulllist <- gs_fulllist[rev(seq_len(nrow(gs_fulllist))), ]
gs_fulllist$gs_desc <- factor(gs_fulllist$gs_desc, levels = rev(levels(gs_fulllist$gs_desc)))
max_lfc <- max(abs(range(gs_fulllist$log2FoldChange)))
# common elements here
gs_labels <- paste0(
substr(as.character(unique(gs_fulllist$gs_desc)), 1, chars_limit),
" | ", unique(gs_fulllist$gs_id)
)
if (plot_style == "point") {
p <- ggplot(
gs_fulllist, aes(
x = .data$log2FoldChange,
y = .data$gs_desc,
fill = .data$gs_id,
text = .data$gene_name
)
) +
scale_x_continuous(limits = c(-max_lfc, max_lfc)) +
geom_point(alpha = 0.7, shape = 21, size = 2) +
theme_minimal() +
geom_vline(aes(xintercept = 0), col = "steelblue", alpha = 0.4) +
theme(legend.position = "none") +
scale_y_discrete(
name = "",
labels = gs_labels
) +
labs(x = "log2 Fold Change")
} else if (plot_style == "ridgeline") {
if (ridge_color == "gs_score" & is.null(res_enrich$z_score)) {
message("Fallback to plotting the ridgelines according to geneset id (Z score required)")
ridge_color <- "gs_id"
}
if (ridge_color == "gs_score") {
gs_fulllist$gs_zscore <- res_enrich$z_score[match(gs_fulllist$gs_id, res_enrich$gs_id)]
p <- ggplot(
gs_fulllist, aes(
x = .data$log2FoldChange,
y = .data$gs_desc,
fill = .data$gs_zscore
)
) +
scale_x_continuous(limits = c(-max_lfc, max_lfc)) +
scale_fill_gradient2(low = "#313695", mid = "#FFFFE5", high = "#A50026") +
ggridges::geom_density_ridges(
aes(group = .data$gs_id),
point_color = "#00000066",
jittered_points = TRUE, scale = .95, rel_min_height = .01,
point_shape = "|", point_size = 3, linewidth = 0.25,
position = ggridges::position_points_jitter(height = 0)) +
theme_minimal() +
geom_vline(aes(xintercept = 0), col = "steelblue", alpha = 0.4) +
scale_y_discrete(
name = "",
labels = gs_labels
) +
labs(x = "log2 Fold Change")
}
else if (ridge_color == "gs_id") {
p <- ggplot(
gs_fulllist, aes(
x = .data$log2FoldChange,
y = .data$gs_desc,
fill = .data$gs_id
)
) +
scale_x_continuous(limits = c(-max_lfc, max_lfc)) +
ggridges::geom_density_ridges(
aes(group = .data$gs_id),
point_color = "#00000066",
jittered_points = TRUE, scale = .95, rel_min_height = .01,
point_shape = "|", point_size = 3, linewidth = 0.25,
position = ggridges::position_points_jitter(height = 0)) +
theme_minimal() +
geom_vline(aes(xintercept = 0), col = "steelblue", alpha = 0.4) +
theme(legend.position = "none") +
scale_y_discrete(
name = "",
labels = gs_labels
) +
labs(x = "log2 Fold Change")
}
}
if (is.null(plot_title)) {
p <- p + ggtitle(paste0("Enrichment overview - ", this_contrast))
} else {
p <- p + ggtitle(plot_title)
}
return(p)
}
#' Compute aggregated scores for gene sets
#'
#' Computes for each gene set in the `res_enrich` object a Z score and an aggregated
#' score (using the log2FoldChange values, provided in the `res_de`)
#'
#' @param res_enrich A `data.frame` object, storing the result of the functional
#' enrichment analysis. See more in the main function, [GeneTonic()], to check the
#' formatting requirements (a minimal set of columns should be present).
#' @param res_de A `DESeqResults` object.
#' @param annotation_obj A `data.frame` object with the feature annotation
#' information, with at least two columns, `gene_id` and `gene_name`.
#' @param gtl A `GeneTonic`-list object, containing in its slots the arguments
#' specified above: `dds`, `res_de`, `res_enrich`, and `annotation_obj` - the names
#' of the list _must_ be specified following the content they are expecting
#' @param aggrfun Specifies the function to use for aggregating the scores for
#' each term. Common values could be `mean` or `median`.
#'
#' @return A `data.frame` with the same columns as provided in the input, with
#' additional information on the `z_score` and the `aggr_score` for each gene set.
#' This information is used by other functions such as [gs_volcano()] or
#' [enrichment_map()]
#'
#' @seealso [gs_volcano()] and [enrichment_map()] make efficient use of the computed
#' aggregated scores
#'
#' @export
#'
#' @examples
#'
#' library("macrophage")
#' library("DESeq2")
#' library("org.Hs.eg.db")
#' library("AnnotationDbi")
#'
#' # dds object
#' data("gse", package = "macrophage")
#' dds_macrophage <- DESeqDataSet(gse, design = ~ line + condition)
#' rownames(dds_macrophage) <- substr(rownames(dds_macrophage), 1, 15)
#' dds_macrophage <- estimateSizeFactors(dds_macrophage)
#'
#' # annotation object
#' anno_df <- data.frame(
#' gene_id = rownames(dds_macrophage),
#' gene_name = mapIds(org.Hs.eg.db,
#' keys = rownames(dds_macrophage),
#' column = "SYMBOL",
#' keytype = "ENSEMBL"
#' ),
#' stringsAsFactors = FALSE,
#' row.names = rownames(dds_macrophage)
#' )
#'
#' # res object
#' data(res_de_macrophage, package = "GeneTonic")
#' res_de <- res_macrophage_IFNg_vs_naive
#'
#' # res_enrich object
#' data(res_enrich_macrophage, package = "GeneTonic")
#' res_enrich <- shake_topGOtableResult(topgoDE_macrophage_IFNg_vs_naive)
#'
#' res_enrich <- get_aggrscores(
#' res_enrich,
#' res_de,
#' anno_df
#' )
get_aggrscores <- function(res_enrich,
res_de,
annotation_obj,
gtl = NULL,
aggrfun = mean) {
if (!is.null(gtl)) {
checkup_gtl(gtl)
dds <- gtl$dds
res_de <- gtl$res_de
res_enrich <- gtl$res_enrich
annotation_obj <- gtl$annotation_obj
}
gs_expanded <- tidyr::separate_rows(res_enrich, "gs_genes", sep = ",")
gs_expanded$log2FoldChange <-
res_de[annotation_obj$gene_id[match(gs_expanded$gs_genes, annotation_obj$gene_name)], ]$log2FoldChange
gs_aggregated <- lapply(seq_len(nrow(res_enrich)), function(i) {
this_gsid <- res_enrich$gs_id[i]
this_subset <- gs_expanded[gs_expanded$gs_id == this_gsid, ]
upgenes <- sum(this_subset$log2FoldChange > 0)
downgenes <- sum(this_subset$log2FoldChange < 0)
z_score <- (upgenes - downgenes) / sqrt(upgenes + downgenes)
aggr_score <- aggrfun(this_subset$log2FoldChange)
return(c(
"DE_count" = nrow(this_subset),
"Z_score" = z_score,
"aggr_score" = aggr_score
))
})
names(gs_aggregated) <- res_enrich$gs_id
res_enrich$DE_count <- vapply(gs_aggregated, "[", 1, FUN.VALUE = numeric(1))
res_enrich$z_score <- vapply(gs_aggregated, "[", 2, FUN.VALUE = numeric(1))
res_enrich$aggr_score <- vapply(gs_aggregated, "[", 3, FUN.VALUE = numeric(1))
return(res_enrich)
}
#' Distill enrichment results
#'
#' Distill the main topics from the enrichment results, based on the graph derived
#' from constructing an enrichment map
#'
#' @param res_enrich A `data.frame` object, storing the result of the functional
#' enrichment analysis.
#' @param res_de A `DESeqResults` object. As for the `dds` parameter, this is
#' also commonly used in the `DESeq2` framework.
#' @param annotation_obj A `data.frame` object, containing two columns, `gene_id`
#' with a set of unambiguous identifiers (e.g. ENSEMBL ids) and `gene_name`,
#' containing e.g. HGNC-based gene symbols.
#' @param gtl A `GeneTonic`-list object, containing in its slots the arguments
#' specified above: `dds`, `res_de`, `res_enrich`, and `annotation_obj` - the names
#' of the list _must_ be specified following the content they are expecting
#' @param n_gs Integer value, corresponding to the maximal number of gene sets to
#' be used.
#' @param cluster_fun Character, referring to the name of the function used for
#' the community detection in the enrichment map graph. Could be one of "cluster_markov",
#' "cluster_louvain", or "cluster_walktrap", as they all return a `communities`
#' object.
#'
#' @return A list containing three objects:
#' - the distilled table of enrichment, `distilled_table`, where the new meta-genesets
#' are identified and defined, specifying e.g. the names of each component, and the
#' genes associated to these.
#' - the distilled graph for the enrichment map, `distilled_em`, with the information
#' on the membership
#' - the original `res_enrich`, augmented with the information of the membership
#' related to the meta-genesets
#'
#' @export
#'
#' @examples
#' library("macrophage")
#' library("DESeq2")
#' library("org.Hs.eg.db")
#' library("AnnotationDbi")
#'
#' # dds object
#' data("gse", package = "macrophage")
#' dds_macrophage <- DESeqDataSet(gse, design = ~ line + condition)
#' rownames(dds_macrophage) <- substr(rownames(dds_macrophage), 1, 15)
#' dds_macrophage <- estimateSizeFactors(dds_macrophage)
#'
#' # annotation object
#' anno_df <- data.frame(
#' gene_id = rownames(dds_macrophage),
#' gene_name = mapIds(org.Hs.eg.db,
#' keys = rownames(dds_macrophage),
#' column = "SYMBOL",
#' keytype = "ENSEMBL"
#' ),
#' stringsAsFactors = FALSE,
#' row.names = rownames(dds_macrophage)
#' )
#'
#' # res object
#' data(res_de_macrophage, package = "GeneTonic")
#' res_de <- res_macrophage_IFNg_vs_naive
#'
#' # res_enrich object
#' data(res_enrich_macrophage, package = "GeneTonic")
#' res_enrich <- shake_topGOtableResult(topgoDE_macrophage_IFNg_vs_naive)
#' res_enrich <- get_aggrscores(res_enrich, res_de, anno_df)
#'
#' distilled <- distill_enrichment(res_enrich,
#' res_de,
#' annotation_obj,
#' n_gs = 100,
#' cluster_fun = "cluster_markov"
#' )
#' colnames(distilled$distilled_table)
#' distilled$distilled_em
distill_enrichment <- function(res_enrich,
res_de,
annotation_obj,
gtl = NULL,
n_gs = nrow(res_enrich),
cluster_fun = "cluster_markov") {
cluster_fun <- match.arg(
cluster_fun, c("cluster_markov", "cluster_louvain", "cluster_walktrap")
)
cluster_fun <- match.fun(cluster_fun)
if (!is.null(gtl)) {
checkup_gtl(gtl)
dds <- gtl$dds
res_de <- gtl$res_de
res_enrich <- gtl$res_enrich
annotation_obj <- gtl$annotation_obj
}
n_gs <- min(n_gs, nrow(res_enrich))
em <- enrichment_map(res_enrich,
res_de,
annotation_obj,
n_gs = n_gs
)
# subset accordingly
res_enrich <- res_enrich[seq_len(n_gs), ]
gs_communities <- cluster_fun(em)
res_enrich$gs_membership <- factor(gs_communities$membership)
V(em)$membership <- gs_communities$membership
V(em)$color <- gs_communities$membership
# aggregate the results according to the defined gs_membership column
distilled_res <- data.frame(
metags_cluster = unique(res_enrich$gs_membership),
metags_n_gs = NA,
metags_genes = NA,
metags_n_genes = NA,
metags_gsidlist = NA,
metags_gsdesclist = NA,
metags_msgs = NA,
metags_mcgs = NA,
stringsAsFactors = FALSE
)
for (i in seq_along(distilled_res$metags_cluster)) {
# message(i)
# message(distilled_res$metags_cluster[i])
subset_enrich <- res_enrich[res_enrich$gs_membership == distilled_res$metags_cluster[i], ]
distilled_res[i, "metags_n_gs"] <- nrow(subset_enrich)
all_genes_singlevec <- unique(strsplit(paste0(subset_enrich$gs_genes, collapse = ","), ",")[[1]])
distilled_res[i, "metags_genes"] <- paste0(all_genes_singlevec, collapse = ",")
distilled_res[i, "metags_n_genes"] <- length(all_genes_singlevec)
distilled_res[i, "metags_gsidlist"] <- paste0(subset_enrich$gs_id, collapse = ",") # nested list or collapsed?
distilled_res[i, "metags_gsdesclist"] <- paste0(subset_enrich$gs_description, collapse = ",")
most_sig <- which.min(subset_enrich$gs_pvalue)
distilled_res[i, "metags_msgs"] <- paste0(
subset_enrich$gs_id[most_sig], "|",
subset_enrich$gs_description[most_sig], "|",
subset_enrich$gs_pvalue[most_sig]
)
mgs_graph <- induced_subgraph(em, subset_enrich$gs_description)
distilled_res[i, "metags_mcgs"] <- names(which.max(strength(mgs_graph)))
}
# later add something maybe even based on NLP/wordcloud or so
# lapply(unique(res_enrich$gs_membership), function(gs_cluster) {
# cur_clus <- gs_cluster
# message(cur_clus)
# clu_n_gs <- res_enrich$gs_membership
# })
return(
list(
distilled_table = distilled_res,
distilled_em = em,
res_enrich = res_enrich
)
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.