# This script should be more appropriately called `ontology.R` because it deals
# with not only KEGG terms, but GO, Reactome, GSEA as well
#
# Install the latest version of clusterProfiler
# It appears that this is not a problem for R 4.0 according to
# https://github.com/YuLab-SMU/clusterProfiler/issues/245
# However, for R 3.6, I need to:
# remove.packages ('enrichplot')
# remove.packages ('DOSE')
# remove.packages ('fgsea')
# devtools::install_github("ctlab/fgsea")
# devtools::install_github('https://github.com/YuLab-SMU/enrichplot')
# devtools::install_github('https://github.com/YuLab-SMU/DOSE')
# devtools::install_github('https://github.com/YuLab-SMU/clusterProfiler')
#
# The side effect is that the compareClusterResult object will lack the
# `termsim` attribute, which needs to be calculated manually:
# according to https://rdrr.io/github/GuangchuangYu/enrichplot/man/pairwise_termsim.html:
# library (GOSemSim)
# library (enrichplot)
# library (clusterProfiler)
# kk <- compareCluster(gene_list, fun="enrichKEGG", organism="hsa",
# pvalueCutoff=0.05)
# d <- godata('org.Hs.eg.db', ont="BP")
# kk <- pairwise_termsim(kk, method="JC", semData = d)
# A similar issue is reported: https://github.com/YuLab-SMU/clusterProfiler/issues/252
#--------------------------------------------------
#' Obtain the species name for KEGG
#'
#' @param name common name for the species
#' @noRd
get_kegg <- function (name){
org_names <- c('human'='hsa', 'mouse'='mmu', 'marmoset'='cjc')
if (!name %in% names (org_names)){
org <- KEGGREST::keggList ('organism')
kegg_name <- org [grep (name, org [, 'species'],
ignore.case=T)[1], 'organism']
rm (org)
return (kegg_name[1])
}else{
return (org_names [names (org_names) == name ])
}
}
#' Convert gene names into entrez ID
#'
#' @export
gene_name_to_entrez <- function (genename, organism_db){
if (class (organism_db == 'Mart') ){
}else{
entrez <- AnnotationDbi::mapIds(organism_db, as.character (
genename), 'ENTREZID', 'SYMBOL')
}
}
remove_terms <- function (xx_res, AP){
for ( i in AP$remove_keys){
rm_index <- grepl (i, xx_res, ignore.case=T)
xx_res <- xx_res [!rm_index]
}
return (xx_res)
}
#' Remove terms from GO/KEGG analysis that are not related to development
#'
#' @param xx a compareClusterResult object
#' @noRd
clean_terms <- function (xx, AP, attr_name='compareClusterResult',
term_column='Description'){
xx_res <- attr (xx, attr_name)
keep_terms <- remove_terms (xx_res [, term_column], AP)
xx_res <- xx_res [xx_res [, term_column] %in% keep_terms, ]
attr (xx, attr_name) <- xx_res
return (xx)
}
#' Show terms from specific groups
#'
#' @param xx a compareClusterResult object
#' @param subset_class which clusters to show
subset_terms <- function (xx, subset_class, group_col='Cluster',
attr_name='compareClusterResult'){
xx_res <- attr (xx, attr_name)
xx_res <- xx_res [xx_res [, group_col] %in% subset_class, ]
attr (xx, attr_name) <- xx_res
return (xx)
}
simplify_terms <- function (xx, simple_terms, term_col='Description',
attr_name='compareClusterResult'){
dat <- attr (xx, attr_name)
new_terms <- as.character (simple_terms$sub [match (
dat [, term_col], simple_terms$ori) ])
na_terms <- is.na (new_terms)
new_terms [na_terms] <- as.character (dat [na_terms, term_col])
names (new_terms) <- dat [, term_col]
new_terms [!na_terms & new_terms %in% dat [, term_col] ] <- NA
new_terms [!na_terms & duplicated (new_terms) ] <- NA
new_terms <- gsub ('signaling pathway', 'signaling', new_terms)
# Cancer terms are simplified because they are not the focus of this research
cancer_terms <- grep ('cancer', new_terms, ignore.case=T)
new_terms [cancer_terms [1] ] <- 'Cancer'
if (length (cancer_terms)>1) {new_terms [cancer_terms [2:length (cancer_terms)]] <- NA}
return (new_terms)
}
simplify_gsea <- function (xx_df, simple_terms=NULL, term_col='category'){
if (!is.null (simple_terms)){
new_terms <- as.character (simple_terms$sub [match (
xx_df [, term_col], simple_terms$ori) ])
na_terms <- is.na (new_terms)
new_terms [na_terms] <- as.character (xx_df [na_terms, term_col])
names (new_terms) <- xx_df [, term_col]
# simplify signaling pathway in general
new_terms <- gsub ('signaling pathway', 'signaling', new_terms)
# simplify cancer terms
cancer_terms <- grep ('cancer', new_terms, ignore.case=T)
# old version
#new_terms [cancer_terms [1] ] <- 'Cancer'
new_terms [cancer_terms ] <- 'Cancer'
xx_df [, term_col] <- new_terms
}
return (xx_df)
}
append_default_dictionary <- function (new_dict, append_default){
if (append_default){
data (GOsimp, package='TBdev')
if (is.null (new_dict)){new_dict <- GOsimp
}else{
new_dict <- rbind (new_dict, GOsimp)
new_dict <- new_dict [!duplicated (new_dict$ori), ]
}
}
return (new_dict)
}
from_term_to_genes <- function (xx_df, term_index, organism_db){
all_genes <- xx_df$geneID [term_index ]
all_genes <- strsplit (all_genes, '/') [[1]]
return (AnnotationDbi::mapIds(organism_db, as.character (
all_genes), 'SYMBOL', 'ENTREZID'))
}
#' For the results from `compareClusterResult` object
#'
#' @description This function will print all the GO/KEGG/Reactome terms that
#' match the requested keyword or phrases with their p values. It will
#' optionally return a list of all the genes corresponding to the matches terms
#' @param xx a compareClusterResult object
#' @param term a keyword that matches the term of interest
#' @param organism_db which organism database, for example, org.Hs.eg.db
#' @param category_col which column in x that has the terms
#' @param return_val whether to return the gene names
#' @examples
#' KG <- modules::use ('KEGG_path.R')
#' library (GOSemSim)
#' d <- godata('org.Hs.eg.db', ont="BP")
#' kk <- KG$compare_cluster_enrichment (markers, d, enrich_area='KEGG')
#' path_val <- KG$gene_per_term (kk, 'JAK-STAT', org.Hs.eg.db, return_val=T)
#' @export
gene_per_term <- function (xx, term, organism_db, category_col='Description', return_val=F){
if (class (xx) == 'compareClusterResult' ){
xx_df <- xx@compareClusterResult
}else{ xx_df <- xx }
all_index <- grep (term, xx_df [, category_col], ignore.case=T)
all_terms <- list ()
for (i in 1:length (all_index)){
print (paste ('the genes for', xx_df[all_index [i], category_col ] ))
print (paste ('pvalue is', xx_df$p.adjust [all_index [i] ] ) )
gene_entrez <- from_term_to_genes (xx_df, all_index[i], organism_db)
names(gene_entrez) <- NULL
all_terms[[i]] <- gene_entrez
print (paste ('there are', length (gene_entrez), 'genes' ) )
}
if (return_val){
names (all_terms) <- xx_df [all_index, category_col]
return (all_terms)
}
}
gsea_entrez_to_name <- function (all_genes, organism_db){
all_genes <- sapply (all_genes, function (x) {strsplit (x, '\\.')[[1]][3]})
gene_entrez <- AnnotationDbi::mapIds(organism_db, as.character (
all_genes), 'SYMBOL', 'ENTREZID')
names (gene_entrez) <- names (all_genes)
return (gene_entrez)
}
#' Obtain genes of each enriched terms
#'
#' @description For the data frame generated from enrichment analysis, achieves
#' the same function as `gene_per_term`
#' @importFrom magrittr %>%
#' @export
gene_per_enriched_term <- function (xx, term, organism_db, cell_type=NULL,
category_col='category',
cluster_col='cluster'){
if (!is.null (cell_type) ){
xx_df <- xx %>% dplyr::filter (!!as.symbol (cluster_col) == cell_type)
}else{ xx_df <- xx }
all_index <- grep (term, xx_df [, category_col], ignore.case=T)
uniq_terms <- unique ( xx_df [all_index, category_col] )
all_terms <- list ()
for (i in 1:length (uniq_terms) ){
print (paste ('the genes for', uniq_terms [i] ))
xx_df %>% dplyr::filter ( !!as.symbol (category_col) %in% uniq_terms[i] ) %>%
rownames () -> all_genes
gene_entrez <- gsea_entrez_to_name (all_genes, organism_db)
names(gene_entrez) <- NULL
all_terms[[i]] <- gene_entrez
}
return (all_terms)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.