#' Obtains TERM2GENE object for corGSEA
#'
#' Wrapper for the msigdbr::msigdbr() function.
#'
#' @param Species Species to obtain gene names for.
#' Either 'hsapiens' or 'mmusculus'
#'
#' @param GSEA_Type Which pathway annotations should be considered? Options listed in
#' correlationAnalyzeR::pathwayCategories -- See details below for more info.
#'
#' @param sampler If TRUE, will only return 100,000 randomly sampled rows
#' (gene set / gene pairs) from the selected TERM2GENE. Useful for reducing
#' GSEA computational burden.
#'
#' @param listReturn If TRUE, will return annotations as a list object.
#'
#' @return A tbl object with columns "gs_name" and "gene_symbol"
#'
#' @details GSEA_Type category names and their MSIGDB description:
#'
#' \strong{Hallmark} (a.k.a "H" in MSIGDB): "Hallmark gene sets summarize and represent
#' specific well-defined biological states or processes and display coherent expression.
#' These gene sets were generated by a computational methodology based on identifying
#' overlaps between gene sets in other MSigDB collections and retaining genes that display
#' coordinate expression."
#'
#' \strong{Cytogenic bands} (a.k.a "C1" in MSIGDB): "Gene sets corresponding to each human
#' chromosome and each cytogenetic band that has at least one gene."
#'
#' \strong{Perturbations} (a.k.a. "C2:CGP" in MSIGDB): "Gene sets represent
#' expression signatures of genetic and chemical perturbations. A number of these gene
#' sets come in pairs: xxx_UP (and xxx_DN) gene set representing genes induced
#' (and repressed) by the perturbation."
#'
#' \strong{Canonical pathways} (a.k.a. "C2:CP" in MSIGDB): "Gene sets from pathway databases.
#' Usually, these gene sets are canonical representations of a biological process
#' compiled by domain experts."
#'
#' \strong{BioCarta} (a.k.a. "C2:CP:BIOCARTA" in MSIGDB): "Gene sets derived from the
#' BioCarta pathway database."
#'
#' \strong{KEGG} (a.k.a. "C2:CP:KEGG" in MSIGDB): "Gene sets derived from the
#' KEGG pathway database."
#'
#' \strong{PID} (a.k.a. "C2:CP:PID" in MSIGDB): "Gene sets derived from the
#' PID pathway database."
#'
#' \strong{Reactome} (a.k.a. "C2:CP:REACTOME" in MSIGDB): "Gene sets derived from the
#' Reactome pathway database."
#'
#' \strong{miRNA targets} (a.k.a. "C3:MIR" in MSIGDB): "Gene sets that contain genes
#' sharing putative target sites (seed matches) of human mature miRNA in their 3'-UTRs."
#'
#' \strong{TF targets} (a.k.a. "C3:TFT" in MSIGDB): "Gene sets that share upstream
#' cis-regulatory motifs which can function as potential transcription factor
#' binding sites. Based on work by Xie et al. 2005"
#'
#' \strong{Cancer gene neighborhoods} (a.k.a. "C4:CGN" in MSIGDB): "Gene sets defined
#' by expression neighborhoods centered on 380 cancer-associated genes. This collection
#' is described in Subramanian, Tamayo et al. 2005"
#'
#' \strong{Cancer modules} (a.k.a. "C4:CM" in MSIGDB): "Gene sets defined by Segal
#' et al. 2004. Briefly, the authors compiled gene sets ('modules') from a variety of
#' resources such as KEGG, GO, and others. By mining a large compendium of cancer-related
#' microarray data, they identified 456 such modules as significantly changed in a variety
#' of cancer conditions."
#'
#' \strong{GO:BP} (a.k.a. "C5:BP" in MSIGDB): "Gene sets derived from the GO Biological Process Ontology."
#'
#' \strong{GO:CC} (a.k.a. "C5:CC" in MSIGDB): "Gene sets derived from the GO Cellular Component Ontology."
#'
#' \strong{GO:MF} (a.k.a. "C5:MF" in MSIGDB): "Gene sets derived from the GO Molecular Function Ontology."
#'
#' \strong{Oncogenic signatures} (a.k.a. "C6" in MSIGDB): "Gene sets that represent signatures of cellular
#' pathways which are often dis-regulated in cancer. The majority of signatures were generated directly
#' from microarray data from NCBI GEO or from internal unpublished profiling experiments involving perturbation
#' of known cancer genes."
#'
#' \strong{Immunological signatures} (a.k.a. "C7" in MSIGDB): "Gene sets that represent cell states and
#' perturbations within the immune system. The signatures were generated by manual curation of
#' published studies in human and mouse immunology."
#'
#' \strong{Cell Type signatures} (a.k.a. "C8" in MSIGDB): "Gene sets that contain curated cluster
#' markers for cell types identified in single-cell sequencing studies of human tissue."
#'
#' \strong{simple}: This is the combination of "Hallmark", "Perturbations",
#' "BioCarta", "GO:BP", "GO:CC", "GO:MF", "KEGG", "Canonical pathways", "PID", and "Reactome"
#'
#' \strong{complex}: This includes all possible gene sets.
#'
#' @examples
#' TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = "simple")
#' TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = c("Hallmark", "KEGG"))
#'
#' @importFrom rlang .data
#' @import dplyr
#' @import tibble
#' @import tidyr
#'
#' @export
getTERM2GENE <- function(GSEA_Type = c("simple"),
                         Species = c("hsapiens", "mmusculus"),
                         sampler = FALSE, listReturn = FALSE) {
  # Build a TERM2GENE table (gs_name / gene_symbol) from MSigDB via msigdbr.
  #
  # GSEA_Type:  character vector of category names, and/or "simple"/"complex".
  # Species:    "hsapiens" or "mmusculus"; only the first element is used, so
  #             the default vector resolves to human.
  # sampler:    if TRUE, down-sample the result to at most 100,000 rows.
  # listReturn: if TRUE, ignore GSEA_Type and return every annotation as a
  #             list of tables split by "<gs_cat>:<gs_subcat>".

  # Translate the package's species key into the name msigdbr expects.
  # An unrecognised value is rejected instead of silently meaning mouse.
  speciesMap <- c(hsapiens = "Homo sapiens", mmusculus = "Mus musculus")
  speciesKey <- as.character(Species)[1]
  if (is.na(speciesKey) || !speciesKey %in% names(speciesMap)) {
    stop("\nPlease enter a valid Species: 'hsapiens' or 'mmusculus'.\n")
  }
  msigSpec <- speciesMap[[speciesKey]]

  # Get data object
  MDFraw <- msigdbr::msigdbr(species = msigSpec)

  if (listReturn) {
    # Work from MDFraw: nothing else has been derived at this point.
    # Columns are picked by name so msigdbr's column order does not matter.
    MDFThin <- MDFraw[, c("gs_name", "gene_symbol")]
    cats <- paste0(MDFraw$gs_cat, ":", MDFraw$gs_subcat)
    MDFL <- split(MDFThin, f = cats)
    # Categories without a sub-category end in ":"; drop that trailing colon.
    names(MDFL) <- gsub(names(MDFL), pattern = "(.+):$", replacement = "\\1")
    return(MDFL)
  }

  # Collapse gs_cat + gs_subcat into one human-readable category label.
  MDF <- MDFraw
  MDF$gs_subcat <- gsub(MDF$gs_subcat, pattern = "CP:", replacement = "",
                        perl = TRUE)
  MDF$gs_cat <- paste0(MDF$gs_cat, ":", MDF$gs_subcat)

  # Ordered rewrite rules (pattern = replacement). They are applied one after
  # another, so ORDER MATTERS: e.g. "C2:" must be stripped before the bare
  # "CP" / "CGP" labels are expanded.
  renameRules <- c(
    ":$"           = "",
    "C1"           = "Cytogenic bands",
    "C6"           = "Oncogenic signatures",
    "C7"           = "Immunological signatures",
    "C8"           = "Cell Type signatures",
    "C2:"          = "",
    "C5:"          = "",
    "H$"           = "Hallmark",
    "CP"           = "Canonical pathways",
    "CGP"          = "Perturbations",
    "C4:CGN"       = "Cancer gene neighborhoods",
    "C4:CM"        = "Cancer modules",
    "C3:MIR:MIRDB" = "miRNA targets",
    "C3:TFT:GTRD"  = "TF targets",
    "BIOCARTA"     = "BioCarta",
    "REACTOME"     = "Reactome"
  )
  for (i in seq_along(renameRules)) {
    MDF$gs_cat <- gsub(MDF$gs_cat,
                       pattern = names(renameRules)[i],
                       replacement = renameRules[[i]],
                       perl = TRUE)
  }

  # Filter for pathways of interest
  optionsNow <- c("simple", "complex", unique(MDF$gs_cat))
  if (! all(GSEA_Type %in% optionsNow)) {
    stop("\nPlease enter a valid GSEA_Type. Use ?getTERM2GENE to see available options.\n")
  }
  categories <- character()
  if ("simple" %in% GSEA_Type) {
    categories <- c(categories, "Hallmark", "Perturbations", "BioCarta",
                    "GO:BP", "KEGG", "Canonical pathways", "Reactome",
                    "GO:MF", "GO:CC", "PID")
  }
  if ("complex" %in% GSEA_Type) {
    # "complex" means every category present in the data.
    categories <- c(categories, optionsNow)
  }
  categories <- unique(c(categories, GSEA_Type))

  # Column names are passed to select() as strings; `.data$col` inside a
  # selection is deprecated in recent tidyselect releases.
  TERM2GENE <- MDF %>%
    filter(.data$gs_cat %in% categories) %>%
    select("gs_name", "gene_symbol")

  if (sampler) {
    print("Using sampler!")
    # NOTE: this resets the session-wide RNG state; kept so the sampled rows
    # stay reproducible across calls.
    set.seed(1)
    # Cap the sample size at the number of rows available: sampling without
    # replacement errors when asked for more rows than exist.
    nKeep <- min(100000, nrow(TERM2GENE))
    TERM2GENE <- TERM2GENE[sample(nrow(TERM2GENE), size = nKeep), ]
  }
  TERM2GENE
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.