#' @title ge_gsea
#'
#' @description Performs Gene Set Enrichment Analysis using gene sets specified by the user.
#'
#' @param annot_res The output of ge_annot. List containing data frames of
#' differential gene expression results between the different groups.
#' @param gmt Path to the gmt file that contain the gene sets of interest.
#' @param gsea_pvalue Numeric value to define the adjusted pvalue cutoff during
#' GSEA. Set to 0.2 by default.
#'
#' @return Returns ggplot objects and a list of gseaResult objects.
#'
#' @export
#'
#' @import dplyr
#' @import clusterProfiler
#' @import GSEAmining
#'
#' @examples
#' results.dds <- ge_diff_exp(counts = sample_counts,
#' genes_id = 'ensembl_gene_id',
#' metadata = sample_metadata,
#' design = 'MSI_status',
#' ref_level = c('MSI_status', 'MSS'),
#' shrink = 'apeglm')
#' annot.res <- ge_annot(results_dds = results_dds,
#' genes_id = 'ensembl_gene_id',
#' biomart = ensembl_biomart_GRCh38_p13)
#' gsea.res <- ge_gsea(annot_res = annot.res,
#' gmt = 'inst/extdata/c2.cp.reactome.v7.5.1.symbols.gmt',
#' gsea_pvalue = 0.2)
#'
ge_gsea <- function(annot_res,
gmt,
gsea_pvalue = 0.2) {
# 0. If samples come from mouse (by the presence of one extra column)
if('human_ortholog' %in% colnames(annot_res[[1]]) == TRUE){
# Iterate over annotated results list
for (i in seq_along(annot_res)){
# By each data frame in annot_res
annot_res[[i]] <- annot_res[[i]] %>%
# Substitute mouse gene symbol with the human homolog symbol
dplyr::mutate(hgnc_symbol = human_ortholog) %>%
# Filter those missing gene symbols
dplyr::filter(hgnc_symbol != '') %>%
# Remove duplicated genes
dplyr::distinct(hgnc_symbol, .keep_all = TRUE)
}
}
# 1. Obtain geneLists
## Create a list to store as many geneList as conditions
geneLists <- list()
## Create two temporal objects
temp_df <- NULL
temp_gs <- NULL
## Create geneLists
### Iterate over the annotated results list
for (i in seq_along(annot_res)) {
# Get the data frame with gene expression and sort by fold change
temp_df <- annot_res[[i]] %>%
#Filter genes whose log2FoldChange == NA
filter(!is.na(.data$log2FoldChange)) %>%
arrange(desc(.data$log2FoldChange))
# Create a vector with the log2FoldChange
temp_gs <- temp_df$log2FoldChange
# Get the gene symbols of each log2FoldChange
names(temp_gs) <- as.character(temp_df$hgnc_symbol)
# Store the geneList
geneLists[[i]] <- temp_gs
names(geneLists)[i] <- paste0('geneList_', names(annot_res)[i])
}
# 2. Perform GSEA
## Create an empty list to store GSEA results
GSEA.res <- list()
temp_gsea <- NULL
## Read the gmt file
gmt <- clusterProfiler::read.gmt(gmt)
## Execute GSEA
### Iterate over the annotated results list
for (i in seq_along(annot_res)) {
temp_gsea <- clusterProfiler::GSEA(geneList = geneLists[[i]],
TERM2GENE = gmt,
pvalueCutoff = gsea_pvalue)
# Delay the next process 0.5 seconds
Sys.sleep(0.5)
# Save the GSEA results
GSEA.res[[i]] <- temp_gsea
names(GSEA.res)[i] <- paste0('GSEA_', names(annot_res)[i])
# Check if GSEA result is empty and print a message
if (nrow(GSEA.res[[i]]@result) == 0) {
print('No gene sets are enriched under specific pvalueCutoff')
} else {
# 2.1. GSEAmining
# Filter gene sets to analyse the top ones
gs.filt <- GSEA.res[[i]]@result %>%
dplyr::arrange(desc(.data$NES)) %>%
dplyr::mutate(group = ifelse(test = .data$NES > 0,
yes = 'Positive',
no = 'Negative')) %>%
dplyr::group_by(.data$group) %>%
dplyr::filter(.data$p.adjust < gsea_pvalue) %>%
dplyr::top_n(., n = 20, wt = abs(.data$NES)) %>%
dplyr::ungroup(.) %>%
dplyr::select(.data$ID,
.data$NES,
.data$p.adjust,
.data$leading_edge,
.data$core_enrichment)
# 2.2. Bubble plot
print(
gs.filt %>%
separate(leading_edge , into= 'tags', sep=',') %>%
separate(tags, into = c('tags', 'core_perc'), sep='=') %>%
separate(core_perc, into = 'core_perc', sep='%') %>%
mutate(core_perc = as.numeric(core_perc)) %>%
dplyr::select(-tags) %>%
ggplot(.,
aes(x= NES,
y=reorder(ID, NES),
size= core_perc,
colour = p.adjust))+
geom_point(alpha=0.5)+
geom_vline(xintercept = 0)+
scale_color_gradient(low = "#FF9900", high = "#FF3300")+
labs(title = names(annot_res)[i],
size='% of genes in\n leading edge', colour = 'p.adjust')+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5, size = 15),
panel.grid = element_blank(),
axis.title.y = element_blank(),
axis.title.x = element_text(size=15),
axis.text.x = element_text(size=15, face = "bold"),
axis.text.y = element_text(size=12, face = "bold"),
legend.title = element_text(face='bold', size =12),
legend.text = element_text(size =12))
)
# 2.3. Cluster gene sets
gs.cl <- GSEAmining::gm_clust(df = gs.filt)
# Plot cluster
GSEAmining::gm_dendplot(df = gs.filt,
hc = gs.cl)
# 2.4. Plot enriched terms in gene sets names
print(GSEAmining::gm_enrichterms(df = gs.filt,
hc = gs.cl))
# 2.5. Plot enriched cores (leading edge analysis)
print(GSEAmining::gm_enrichcores(df = gs.filt,
hc = gs.cl))
}
}
return(GSEA.res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.