# data-raw/createMSigDB_SummaryTable.R

library(msigdbr)

# Lookup vector of human-readable descriptions, keyed by MSigDB category
# code. It is indexed by category name when the summary table is built.
cat.descriptions <- c(
  H = "Hallmark gene sets summarize and represent specific well-defined biological states or processes and display coherent expression. These gene sets were generated by a computational methodology based on identifying overlaps between gene sets in other MSigDB collections and retaining genes that display coordinate expression.",
  C1 = "Gene sets corresponding to each human chromosome and each cytogenetic band",
  C2 = "Gene sets in this collection are curated from various sources, including online pathway databases and the biomedical literature. Many sets are also contributed by individual domain experts.",
  C3 = "Gene sets representing potential targets of regulation by transcription factors or microRNAs. The sets consist of genes grouped by elements they share in their non-protein coding regions. The elements represent known or likely cis-regulatory elements in promoters and 3'-UTRs.",
  C4 = "Computational gene sets defined by mining large collections of cancer-oriented microarray data.",
  C5 = "Gene sets that contain genes annotated by the same GO term.",
  C6 = "Gene sets that represent signatures of cellular pathways which are often dis-regulated in cancer. The majority of signatures were generated directly from microarray data from NCBI GEO or from internal unpublished profiling experiments involving perturbation of known cancer genes.",
  C7 = "Gene sets that represent cell states and perturbations within the immune system. The signatures were generated by manual curation of published studies in human and mouse immunology."
)

# Lookup vector of human-readable descriptions, keyed by MSigDB subcategory
# code. Names stay quoted because several of them (e.g. "CP:KEGG", "N/A")
# are not syntactic R names. "N/A" is the key used for gene sets that have
# no subcategory.
# NOTE(review): "currated" in the "CP" entry is misspelled; it is kept as-is
# here so the generated table content does not change.
subcat.descriptions <- c(
  "CGP" = "Chemical and genetic perturbations",
  "CP" = "Additional currated pathways",
  "CP:BIOCARTA" = "Canonical Pathways gene sets derived from the BioCarta pathway database.",
  "CP:KEGG" = "Canonical Pathways gene sets derived from the KEGG pathway database.",
  "CP:PID" = "Canonical Pathways gene sets derived from the Pathway Interaction Database (PID) pathway database.",
  "CP:REACTOME" = "Canonical Pathways gene sets derived from the Reactome pathway database.",
  "MIR" = "All miRNA target prediction gene sets. Combined superset of both miRDB prediction methods and legacy sets.",
  "TFT" = "All transcription factor target prediction gene sets. Combined superset of both GTRD prediction methods and legacy sets.",
  "CGN" = "Gene sets defined by expression neighborhoods centered on 380 cancer-associated genes. This collection is described in Subramanian, Tamayo et al. 2005",
  "CM" = "Gene sets defined by Segal et al. 2004. Briefly, the authors compiled gene sets ('modules') from a variety of resources such as KEGG, GO, and others. By mining a large compendium of cancer-related microarray data, they identified 456 such modules as significantly changed in a variety of cancer conditions.",
  "BP" = "Gene sets derived from the GO Biological Process Ontology.",
  "CC" = "Gene sets derived from the GO Cellular Component Ontology.",
  "MF" = "Gene sets derived from the GO Molecular Function Ontology.",
  "N/A" = "No subcategory available."
)

# Download gene sets and find unique categories/subcategories.
# `u` holds one row per distinct (gs_cat, gs_subcat) pair.
m <- as.data.frame(msigdbr::msigdbr(species = "Homo sapiens"))
u <- unique(m[, c("gs_cat", "gs_subcat")])

# Create unique ids: the category alone when the subcategory is empty,
# otherwise "<category>-<subcategory>".
u.id <- u[, 1]
u.sub.present.ix <- u[, 2] != ""
u.id[u.sub.present.ix] <- paste0(u[u.sub.present.ix, 1], "-",
                                 u[u.sub.present.ix, 2])
# Relabel empty subcategories as "N/A" so they match the "N/A" key of
# subcat.descriptions in the lookup below.
u[!u.sub.present.ix, 2] <- "N/A"

# Create final table. Descriptions are looked up by name in the two
# description vectors; a code with no entry there yields NA.
msigdb_table <- data.frame(ID = u.id,
                           Category = u[, 1],
                           Subcategory = u[, 2],
                           Category_Description = cat.descriptions[u[, 1]],
                           Subcategory_Description = subcat.descriptions[u[, 2]],
                           stringsAsFactors = FALSE)
# Sort rows by ID (first column). This must index `msigdb_table` itself;
# no object named `tab` exists in this script.
msigdb_table <- msigdb_table[order(msigdb_table[, 1]), ]

# Write the summary table to the package's data directory. The relative
# path assumes the working directory is the "data-raw" folder of the
# singleCellTK package.
save(list = "msigdb_table", file = "../data/msigdb_table.rda")
# compbiomed/singleCellTK documentation built on May 8, 2024, 6:58 p.m.