# R/getTERM2GENE.R

# Defines functions getTERM2GENE

# Documented in getTERM2GENE

#' Obtains TERM2GENE object for corGSEA
#'
#' Wrapper for msigdbr::msigdbr() function
#'
#' @param Species Species to obtain gene names for.
#' Either 'hsapiens' or 'mmusculus'. Only the first element is used; any value
#' other than 'hsapiens' is treated as 'mmusculus'.
#'
#' @param GSEA_Type Which pathway annotations should be considered? Options listed in
#' correlationAnalyzeR::pathwayCategories -- See details below for more info.
#'
#' @param sampler If TRUE, will return at most 100,000 randomly sampled rows
#' (gene set / gene pairs) of the selected TERM2GENE. Useful for reducing GSEA
#' computational burden. Sampling is made reproducible by calling
#' \code{set.seed(1)}, which resets the session's random number generator.
#'
#' @param listReturn If TRUE, will return annotations as a list object: the
#' unfiltered annotations split by MSIGDB "category:subcategory". In this case
#' \code{GSEA_Type} and \code{sampler} are ignored.
#'
#' @return A tbl object with columns "gs_name" and "gene_symbol". If
#' \code{listReturn = TRUE}, a named list of such tbl objects instead.
#'
#' @details GSEA_Type category names and their MSIGDB description:
#'
#' \strong{Hallmark} (a.k.a "H" in MSIGDB): "Hallmark gene sets summarize and represent
#' specific well-defined biological states or processes and display coherent expression.
#' These gene sets were generated by a computational methodology based on identifying
#' overlaps between gene sets in other MSigDB collections and retaining genes that display
#' coordinate expression."
#'
#' \strong{Cytogenic bands} (a.k.a "C1" in MSIGDB): "Gene sets corresponding to each human
#' chromosome and each cytogenetic band that has at least one gene."
#'
#' \strong{Perturbations} (a.k.a. "C2:CGP" in MSIGDB): "Gene sets represent
#' expression signatures of genetic and chemical perturbations. A number of these gene
#' sets come in pairs: xxx_UP (and xxx_DN) gene set representing genes induced
#' (and repressed) by the perturbation."
#'
#' \strong{Canonical pathways} (a.k.a. "C2:CP" in MSIGDB): "Gene sets from pathway databases.
#' Usually, these gene sets are canonical representations of a biological process
#' compiled by domain experts."
#'
#' \strong{BioCarta} (a.k.a. "C2:CP:BIOCARTA" in MSIGDB): "Gene sets derived from the
#' BioCarta pathway database."
#'
#' \strong{KEGG} (a.k.a. "C2:CP:KEGG" in MSIGDB): "Gene sets derived from the
#' KEGG pathway database."
#'
#' \strong{PID} (a.k.a. "C2:CP:PID" in MSIGDB): "Gene sets derived from the
#' PID pathway database."
#'
#' \strong{Reactome} (a.k.a. "C2:CP:REACTOME" in MSIGDB): "Gene sets derived from the
#' Reactome pathway database."
#'
#' \strong{miRNA targets} (a.k.a. "C3:MIR" in MSIGDB): "Gene sets that contain genes
#' sharing putative target sites (seed matches) of human mature miRNA in their 3'-UTRs."
#'
#' \strong{TF targets} (a.k.a. "C3:TFT" in MSIGDB): "Gene sets that share upstream
#' cis-regulatory motifs which can function as potential transcription factor
#' binding sites. Based on work by Xie et al. 2005"
#'
#' \strong{Cancer gene neighborhoods} (a.k.a. "C4:CGN" in MSIGDB): "Gene sets defined
#' by expression neighborhoods centered on 380 cancer-associated genes. This collection
#' is described in Subramanian, Tamayo et al. 2005"
#'
#' \strong{Cancer modules} (a.k.a. "C4:CM" in MSIGDB): "Gene sets defined by Segal
#' et al. 2004. Briefly, the authors compiled gene sets ('modules') from a variety of
#' resources such as KEGG, GO, and others. By mining a large compendium of cancer-related
#' microarray data, they identified 456 such modules as significantly changed in a variety
#' of cancer conditions."
#'
#' \strong{GO:BP} (a.k.a. "C5:BP" in MSIGDB): "Gene sets derived from the GO Biological Process Ontology."
#'
#' \strong{GO:CC} (a.k.a. "C5:CC" in MSIGDB): "Gene sets derived from the GO Cellular Component Ontology."
#'
#' \strong{GO:MF} (a.k.a. "C5:MF" in MSIGDB): "Gene sets derived from the GO Molecular Function Ontology."
#'
#' \strong{Oncogenic signatures} (a.k.a. "C6" in MSIGDB): "Gene sets that represent signatures of cellular
#' pathways which are often dis-regulated in cancer. The majority of signatures were generated directly
#' from microarray data from NCBI GEO or from internal unpublished profiling experiments involving perturbation
#' of known cancer genes."
#'
#' \strong{Immunological signatures} (a.k.a. "C7" in MSIGDB): "Gene sets that represent cell states and
#' perturbations within the immune system. The signatures were generated by manual curation of
#' published studies in human and mouse immunology."
#'
#' \strong{Cell Type signatures} (a.k.a. "C8" in MSIGDB): "Gene sets that contain curated cluster
#' markers for cell types identified in single-cell sequencing studies of human tissue."
#'
#' \strong{simple}: This is the combination of "Hallmark", "Perturbations",
#' "BioCarta", "GO:BP", "GO:CC", "GO:MF", "KEGG", "Canonical pathways", "PID", and "Reactome"
#'
#' \strong{complex}: This includes all possible gene sets.
#'
#' @examples
#' TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = "simple")
#' TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = c("Hallmark", "KEGG"))
#'
#' @importFrom rlang .data
#' @import dplyr
#' @import tibble
#' @import tidyr
#'
#' @export
getTERM2GENE <- function(GSEA_Type = c("simple"),
                         Species = c("hsapiens", "mmusculus"),
                         sampler = FALSE, listReturn = FALSE) {

  # Translate the package's species code into the name msigdbr expects.
  # Anything other than "hsapiens" falls through to mouse.
  if (Species[1] == "hsapiens") {
    msigSpec <- "Homo sapiens"
  } else {
    msigSpec <- "Mus musculus"
  }

  # Get data object
  MDFraw <- msigdbr::msigdbr(species = msigSpec)

  if (listReturn) {
    # Work on the raw table: the relabelled copy is only built further down,
    # so it must not be referenced here. Columns are picked by name rather
    # than position so the result does not depend on msigdbr's column order.
    MDFThin <- MDFraw[, c("gs_name", "gene_symbol")]
    cats <- paste0(MDFraw$gs_cat, ":", MDFraw$gs_subcat)
    MDFL <- split(MDFThin, f = cats)
    # Categories without a subcategory end up as "X:"; drop the dangling colon
    names(MDFL) <- gsub(names(MDFL), pattern = "(.+):$", replacement = "\\1")
    return(MDFL)
  }

  # Build a single human-readable category label in gs_cat
  MDF <- MDFraw
  MDF$gs_subcat <- gsub(MDF$gs_subcat, pattern = "CP:", replacement = "", perl = TRUE)
  MDF$gs_cat <- paste0(MDF$gs_cat, ":", MDF$gs_subcat)
  MDF$gs_cat <- gsub(MDF$gs_cat, pattern = ":$", replacement = "", perl = TRUE)

  # pattern = replacement. The substitutions are applied one after another,
  # so the ORDER MATTERS (e.g. "C2:" must be stripped before "CP" is expanded).
  relabel <- c(
    "C1" = "Cytogenic bands",
    "C6" = "Oncogenic signatures",
    "C7" = "Immunological signatures",
    "C8" = "Cell Type signatures",
    "C2:" = "",
    "C5:" = "",
    "H$" = "Hallmark",
    "CP" = "Canonical pathways",
    "CGP" = "Perturbations",
    "C4:CGN" = "Cancer gene neighborhoods",
    "C4:CM" = "Cancer modules",
    "C3:MIR:MIRDB" = "miRNA targets",
    "C3:TFT:GTRD" = "TF targets",
    "BIOCARTA" = "BioCarta",
    "REACTOME" = "Reactome"
  )
  for (i in seq_along(relabel)) {
    MDF$gs_cat <- gsub(MDF$gs_cat, pattern = names(relabel)[i],
                       replacement = relabel[[i]], perl = TRUE)
  }

  # Filter for pathways of interest
  optionsNow <- c("simple", "complex", unique(MDF$gs_cat))
  if (! all(GSEA_Type %in% optionsNow)) {
    stop("\nPlease enter a valid GSEA_Type. Use ?getTERM2GENE to see available options.\n")
  }

  categories <- character(0)
  if ("simple" %in% GSEA_Type) {
    categories <- c(categories, "Hallmark", "Perturbations", "BioCarta",
                    "GO:BP", "KEGG", "Canonical pathways", "Reactome", "GO:MF", "GO:CC", "PID")
  }

  if ("complex" %in% GSEA_Type) {
    # "simple"/"complex" themselves are also in optionsNow; they match no
    # gs_cat value, so carrying them along is harmless.
    categories <- c(categories, optionsNow)
  }

  categories <- unique(c(categories, GSEA_Type))
  # Columns are selected by name as strings: using `.data$col` inside select()
  # is deprecated in tidyselect.
  TERM2GENE <- MDF %>%
    filter(.data$gs_cat %in% categories) %>%
    select("gs_name", "gene_symbol")

  if (sampler) {
    print("Using sampler!")
    # Fixed seed so repeated calls return the same subset
    set.seed(1)
    # Cap the sample size at the number of available rows; sample() errors
    # when asked for more draws than rows without replacement.
    nKeep <- min(100000, nrow(TERM2GENE))
    TERM2GENE <- TERM2GENE[sample(nrow(TERM2GENE), size = nKeep),]
  }

  return(TERM2GENE)
}
# Bishop-Laboratory/correlationAnalyzeR documentation built on June 28, 2022, 8:31 p.m.