#' Plot for gene enrichment analysis of ORA method
#'
#' Over-representation analysis (ORA) is a simple method for objectively deciding whether a set of variables of
#' known or suspected biological relevance, such as a gene set or pathway, is more prevalent in a set of variables
#' of interest than we expect by chance.
#'
#' @param enrich_df Enrichment analysis `data.frame` result.
#' @param fold_change Fold change or logFC values with gene IDs as names. Used in "heat" and "chord"
#' plot.
#' @param plot_type Choose from "bar", "wego","bubble","dot", "lollipop","geneheat", "genechord",
#' "network","gomap","goheat","gotangram","wordcloud","upset".
#' @param term_metric Pathway term metric from one of 'GeneRatio','Count','FoldEnrich' and
#' 'RichFactor'.
#' @param stats_metric Statistic metric from one of "pvalue", "p.adjust", "qvalue".
#' @param sim_method Method of calculating the similarity between nodes, one of one of "Resnik",
#' "Lin", "Rel", "Jiang" , "Wang" or "JC" (Jaccard’s similarity index). Only "JC" supports KEGG data.
#' Used in "map","goheat","gotangram","wordcloud".
#' @param up_color Color of higher statistical power (e.g. Pvalue 0.01) or higher logFC, default is "red".
#' @param down_color Color of lower statistical power (e.g. Pvalue 1) or lower logFC, default is
#' "blue".
#' @param show_gene Select genes to show. Default is "all". Used in "heat" and "chord" plot.
#' @param xlim_left X-axis left limit, default is 0.
#' @param xlim_right X-axis right limit, default is NA.
#' @param wrap_length Numeric, wrap text if longer than this length. Default is NULL.
#' @param org Organism name from `biocOrg_name`.
#' @param ont One of "BP", "MF", and "CC".
#' @param scale_ratio Numeric, scale of node and line size.
#' @param layout Grapgh layout in "map" plot, e,g, "circle", "dh", "drl", "fr","graphopt", "grid",
#' "lgl", "kk", "mds", "nicely" (default),"randomly", "star".
#' @param n_term Number of terms (used in WEGO plot)
#' @param ... other arguments from `plot_theme` function
#'
#' @importFrom ggplot2 ggplot aes aes_string aes_ facet_grid element_text element_blank labs geom_point
#' geom_segment geom_bar geom_col geom_tile guide_colorbar guides scale_color_continuous
#' scale_size_continuous scale_x_discrete scale_y_discrete scale_x_continuous scale_size
#' scale_fill_discrete scale_fill_continuous scale_y_continuous sec_axis theme xlab ylab xlim
#' @importFrom dplyr arrange mutate group_by top_n ungroup select case_when distinct rename pull
#' @importFrom stringr str_replace str_split
#' @importFrom rlang .data
#' @importFrom stats setNames
#' @importFrom ggraph ggraph geom_node_text geom_edge_link circle geom_node_point
#' geom_node_label
#' @importFrom igraph graph.data.frame delete.edges
#' @importFrom igraph V "V<-"
#' @importFrom igraph E "E<-"
#'
#' @return A ggplot object
#' @export
#' @examples
#' \donttest{
#' ## example data
#' ## More examples please refer to https://www.genekitr.fun/plot-ora-1.html
#' library(ggplot2)
#' data(geneList, package = "genekitr")
#' id <- names(geneList)[abs(geneList) > 1.5]
#' logfc <- geneList[id]
#'
#' gs <- geneset::getGO(org = "human",ont = "bp",data_dir = tempdir())
#' ego <- genORA(id, geneset = gs)
#' ego <- ego[1:10, ]
#'
#' ## example plots
#' plotEnrich(ego, plot_type = "dot")
#'
#' #plotEnrich(ego, plot_type = "bubble", scale_ratio = 0.4)
#'
#' #plotEnrich(ego, plot_type = "bar")
#'
#' }
#'
plotEnrich <- function(enrich_df,
fold_change = NULL,
plot_type = c(
"bar", "wego", "dot", "bubble", "lollipop", "geneheat", "genechord",
"network", "gomap", "goheat", "gotangram", "wordcloud", "upset"
),
term_metric = c("FoldEnrich", "GeneRatio", "Count", "RichFactor"),
stats_metric = c("p.adjust", "pvalue", "qvalue"),
sim_method = c("Resnik", "Lin", "Rel", "Jiang", "Wang", "JC"),
up_color = "#E31A1C",
down_color = "#1F78B4",
show_gene = "all",
xlim_left = 0,
xlim_right = NA,
wrap_length = NULL,
org = NULL,
ont = NULL,
scale_ratio,
layout,
n_term,
...) {
#--- args ---#
lst <- list(...) # store outside arguments in list
plot_type <- match.arg(plot_type)
term_metric <- match.arg(term_metric)
stats_metric <- match.arg(stats_metric)
sim_method <- match.arg(sim_method)
if (missing(layout)) layout <- "nicely"
compare_group <- any(grepl("cluster", colnames(enrich_df), ignore.case = T))
if (compare_group) plot_type <- "dot"
all_go <-any(grepl("ONTOLOGY", colnames(enrich_df), ignore.case = T))
if(all_go){
colnames(enrich_df)[tolower(colnames(enrich_df))%in%'ontology'] = 'ONTOLOGY'
}
if(any(grepl("nes",colnames(enrich_df),ignore.case = T))) term_metric <- "Count"
# if (all_go & !plot_type %in% c("bar", "wego")) {
# warning(paste0(
# 'If you want to plot all ontologies data, please choose "plot_type" from "bar","wego"',
# "\nInstead, bar plot will be plotted..."
# ))
# }
types <- c("GeneRatio", "Count", "FoldEnrich", "RichFactor")
legends <- c("p.adjust", "pvalue", "qvalue")
if (!term_metric %in% colnames(enrich_df)) {
stop(
term_metric, " not included in this dataframe, try: ",
paste(intersect(colnames(enrich_df), types), collapse = " | ")
)
}
if (!stats_metric %in% colnames(enrich_df)) {
stop(
stats_metric, " not included in this dataframe, try: ",
paste(intersect(colnames(enrich_df), legends), collapse = " | ")
)
}
#--- codes ---#
## uppercase first letter of description
tryCatch(
{
enrich_df$Description <- stringr::str_replace(enrich_df$Description, "^\\w{1}", toupper)
},
error = function(e) {
message(paste0(
"We need the 'Description' column which means pathway detailed description", "\n",
"Maybe you should rename the column name..."
))
}
)
## set labels
term_metric_label <- ifelse(term_metric == "FoldEnrich", "Fold Enrichment",
ifelse(term_metric == "GeneRatio", "Gene Ratio",
ifelse(term_metric == "RichFactor", "Rich Factor", "Count")
)
)
stats_metric_label <- ifelse(stats_metric == "pvalue", "Pvalue",
ifelse(stats_metric == "p.adjust", "P.adjust", "FDR")
)
## set pathway as factor
if (!compare_group & !all_go) {
if(any(duplicated(enrich_df$Description))){
dup_term <- enrich_df$Description[duplicated(enrich_df$Description)]
warning(paste('Checked duplicated terms in Description column, such as:',
paste0(utils::head(dup_term,3),collapse = '|'),'\n','please make them unique then plot'))
}
enrich_df <- enrich_df %>%
dplyr::arrange(eval(parse(text = term_metric))) %>%
dplyr::mutate(Description = factor(.$Description, levels = unique(.$Description), ordered = T))
} else if (!compare_group & all_go) {
if(any(duplicated(enrich_df$Description))){
dup_term <- enrich_df$Description[duplicated(enrich_df$Description)]
warning(paste('Checked duplicated terms exist in Description column, such as:',
paste0(utils::head(dup_term,3),collapse = '|','\n','please make them unique then plot')))
}
enrich_df <- enrich_df %>%
dplyr::mutate(Description = factor(.$Description, levels = unique(.$Description), ordered = T)) %>%
dplyr::group_by(ONTOLOGY) %>%
dplyr::arrange(eval(parse(text = term_metric)), .by_group = T)
}
#--- GO/KEGG: dot plot ---#
if (plot_type == "dot") {
## no group
if (missing(scale_ratio)) scale_ratio <- 0.3
if (!compare_group) {
p <- ggplot(enrich_df, aes_string(x = term_metric, y = "Description")) +
geom_point(aes_string(
color = stats_metric,
size = "Count"
)) +
scale_size(range = c(min(enrich_df$Count) / 2, max(enrich_df$Count) / 2) * scale_ratio) +
scale_color_continuous(
low = up_color, high = down_color, name = stats_metric_label,
guide = guide_colorbar(reverse = TRUE),
labels = function(x) format(x, scientific = T)
) +
xlab(term_metric_label) +
labs(color = stats_metric) +
xlim(xlim_left, xlim_right) +
plot_theme(...)
} else {
## compare groups
if(missing(n_term)) n_term = 5
enrich_df <- enrich_df %>%
dplyr::arrange(desc(FoldEnrich)) %>%
dplyr::group_by(Cluster) %>%
dplyr::slice_head(n=n_term)
calc <- enrich_df%>%
dplyr::group_by(Cluster) %>%
dplyr::summarise(sum_count = sum(Count)) %>%
as.data.frame()
xtick_lab <- paste0(as.character(calc[,1]), "\n(", calc[,2], ")")
p <- ggplot(enrich_df, aes_string(x = "Cluster", y = "Description")) +
geom_point(aes_string(
color = stats_metric,
size = term_metric
)) +
scale_size(range = c(min(enrich_df$Count) / 2, max(enrich_df$Count) / 2) * scale_ratio) +
scale_color_continuous(
low = up_color, high = down_color, name = stats_metric_label,
guide = guide_colorbar(reverse = TRUE),
labels = function(x) format(x, scientific = T)
) +
scale_x_discrete(labels = xtick_lab) +
xlab("Group") +
labs(color = stats_metric) +
plot_theme(...) +
theme(axis.text.x = element_text(
angle = 45,
vjust = 0.5, hjust = 0.5
))
}
}
#--- GO/KEGG: bubble plot ---#
## each bubble is a pathway term
## x-axis: stats metric e.g. pvalue/qvalue/p.adjust
## y-axis: fold enrichment
if (plot_type == "bubble") {
if (missing(scale_ratio)) scale_ratio <- 0.3
# default main and lengend text size
if (!"main_text_size" %in% names(lst)) lst$main_text_size <- 8
mapping <- aes(
x = -log10(eval(parse(text = stats_metric))),
y = FoldEnrich
)
p <- ggplot(enrich_df, mapping) +
geom_point(aes(size = Count, fill = Description), alpha = 2, shape = 21) +
scale_size(range = c(min(enrich_df$Count) / 2, max(enrich_df$Count) / 2) * scale_ratio) +
scale_fill_discrete(guide = "none") +
ggrepel::geom_text_repel(aes(label = Description),
size = lst$main_text_size / 2.8,
segment.color = "black"
) +
scale_x_continuous(
limits = c(0, ceiling(max(-log10(enrich_df[stats_metric]))) + 3),
breaks = seq(0, ceiling(max(-log10(enrich_df[stats_metric]))) + 3, by = 3)
) +
xlab(paste0("-log10(", stats_metric_label, ")")) +
ylab("Fold Enrichment") +
plot_theme( ...)
}
#--- GO/KEGG: bar plot ---#
if (plot_type == "bar") {
p <- ggplot(data = enrich_df, aes_string(x = term_metric, y = "Description", fill = stats_metric)) +
geom_bar(stat = "identity") +
scale_fill_continuous(
low = up_color, high = down_color, name = stats_metric_label,
guide = guide_colorbar(reverse = TRUE),
labels = function(x) format(x, scientific = T)
) +
xlab(term_metric_label) +
labs(color = stats_metric) +
xlim(xlim_left, xlim_right) +
plot_theme(...)
if (all_go) {
p <- p + ggplot2::facet_grid(ONTOLOGY ~ ., scales = "free") +
plot_theme(...)
}
}
#--- GO/KEGG: lollipop plot ---#
if (plot_type == "lollipop") {
if (missing(scale_ratio)) scale_ratio <- 0.3
p <- ggplot(
data = enrich_df,
aes(
eval(parse(text = term_metric)),
forcats::fct_reorder(Description, eval(parse(text = term_metric)))
)
) +
geom_segment(aes_string(
xend = 0, yend = "Description",
colour = stats_metric, size = 2 * scale_ratio
), show.legend = F) +
geom_point(aes_string(color = stats_metric, size = "Count")) +
theme(legend.key = element_rect(fill = "transparent")) +
scale_size_continuous(name = "Count", range = c(min(enrich_df$Count) / 2, max(enrich_df$Count) / 2) * scale_ratio) +
scale_color_continuous(
low = up_color, high = down_color, name = stats_metric_label,
guide = guide_colorbar(reverse = TRUE),
labels = function(x) format(x, scientific = T)
) +
xlab(term_metric_label) +
ylab(NULL) +
labs(color = stats_metric) +
plot_theme(...)
}
#--- GO/KEGG: geneheat plot ---#
## heatmap to show interaction between go term and gene id
if (plot_type == "geneheat") {
id <- enrich_df %>%
dplyr::pull(geneID) %>%
stringr::str_split("\\/") %>%
unlist()
id_symbol <- enrich_df %>%
dplyr::pull(geneID_symbol) %>%
stringr::str_split("\\/") %>%
unlist()
id_df <- data.frame(geneID = id, geneID_symbol = id_symbol) %>%
dplyr::distinct()
if (all(show_gene == "all")) {
plot_df <- enrich_df %>%
dplyr::select(Description, geneID_symbol) %>%
tidyr::separate_rows(geneID_symbol, sep = "\\/") %>%
dplyr::rename(geneID = geneID_symbol)
} else {
# if show_gene is not symbol, first extract matching symbol
if (all(show_gene %in% id)) {
show_gene <- id_df %>%
dplyr::filter(geneID %in% show_gene) %>%
dplyr::pull(geneID_symbol)
}
plot_df <- enrich_df %>%
dplyr::select(Description, geneID_symbol) %>%
tidyr::separate_rows(geneID_symbol, sep = "\\/") %>%
dplyr::filter(geneID_symbol %in% show_gene) %>%
dplyr::rename(geneID = geneID_symbol)
}
if (is.null(fold_change)) {
p <- ggplot(plot_df, aes_(~geneID, ~Description)) +
geom_tile(color = "white") +
xlab(NULL) +
ylab(NULL) +
plot_theme(border_thick = 0, ...) +
theme(
panel.grid.major = element_blank(),
axis.text.x = element_text(angle = 50, hjust = 1)
)
} else {
# add logfc
fold_change <- data.frame(geneID = names(fold_change), logfc = fold_change)
if (all(id_df$geneID %in% fold_change$geneID)) {
m1 <- merge(id_df, fold_change, by = "geneID")
plot_df <- merge(plot_df, m1, by.x = "geneID", by.y = "geneID_symbol") %>%
dplyr::select(-geneID.y)
} else {
plot_df <- merge(plot_df, fold_change, by.x = "geneID")
}
p <- ggplot(plot_df, aes_(~geneID, ~Description)) +
geom_tile(aes_(fill = ~logfc), color = "white") +
xlab(NULL) +
ylab(NULL) +
plot_theme(border_thick = 0, ...) +
scale_fill_continuous(
low = down_color, high = up_color,
name = "logFC"
) +
theme(
panel.grid.major = element_blank(),
axis.text.x = element_text(angle = 50, hjust = 1)
)
}
}
#--- GO/KEGG: genechord plot ---#
## chord plot to show interaction between go term and gene id
if (plot_type == "genechord") {
# if (!requireNamespace("GOplot", quietly = TRUE)) {
# utils::install.packages("GOplot")
# }
id <- enrich_df %>%
dplyr::pull(geneID) %>%
stringr::str_split("\\/") %>%
unlist()
id_symbol <- enrich_df %>%
dplyr::pull(geneID_symbol) %>%
stringr::str_split("\\/") %>%
unlist()
id_df <- data.frame(geneID = id, geneID_symbol = id_symbol) %>%
dplyr::distinct()
# if show_gene is not symbol, first extract matching symbol
if(all(show_gene == 'all')) stop('Please specify gene name to "show_gene"...')
if (length(show_gene %in% id) > length(show_gene %in% id_symbol)) {
show_gene <- id_df %>%
dplyr::filter(geneID %in% show_gene) %>%
dplyr::pull(geneID_symbol)
}else{
show_gene <- id_df %>%
dplyr::filter(geneID_symbol %in% show_gene) %>%
dplyr::pull(geneID_symbol)
}
plot_df <- enrich_df %>%
dplyr::select(Description, geneID_symbol) %>%
tidyr::separate_rows(geneID_symbol, sep = "\\/") %>%
dplyr::filter(geneID_symbol %in% show_gene) %>%
dplyr::rename(geneID = geneID_symbol)
# define color
my_cols <- c(
"#B2DF8A", "#FB9A99", "#E31A1C", "#B15928", "#6A3D9A", "#CAB2D6",
"#A6CEE3", "#1F78B4", "#FDBF6F", "#999999", "#FF7F00"
)
backup_cols <- c(
"#223D6C", "#D20A13", "#FFD121", "#088247", "#11AA4D", "#58CDD9",
"#7A142C", "#5D90BA", "#029149", "#431A3D", "#91612D", "#6E568C",
"#E0367A", "#D8D155", "#64495D", "#7CC767"
)
term <- unique(plot_df$Description)
if (length(my_cols) < length(term)) {
my_cols <- c(my_cols, backup_cols)
cols <- sample(my_cols, length(term), replace = F)
}
cols <- my_cols[1:length(term)]
# prepare chord data
id <- unique(plot_df$geneID)
dat <- sapply(id, function(x) {
check <- plot_df %>%
dplyr::filter(geneID %in% x) %>%
dplyr::pull(Description) %>%
unique()
ifelse(term %in% check, 1, 0)
}) %>%
t() %>%
as.data.frame() %>%
stats::setNames(term)
dat = dat[show_gene,]
# default main and lengend text size
if (!"main_text_size" %in% names(lst)) lst$main_text_size <- 3
if (!"legend_text_size" %in% names(lst)) lst$legend_text_size <- 8
if(!"gene_space" %in% names(lst)) lst$gene_space <- 0.3
if (is.null(fold_change)) {
p <- suppressWarnings(
GOplot::GOChord(dat,
space = 0.02,
gene.space = lst$gene_space,
gene.order = "none",
gene.size = lst$main_text_size,
process.label = lst$legend_text_size,
border.size = 0.1,
ribbon.col = cols
)
)
} else {
# add logfc
fold_change <- data.frame(geneID = names(fold_change), logfc = fold_change)
if (all(rownames(dat) %in% fold_change$geneID)) {
m1 <- fold_change %>%
dplyr::filter(geneID %in% rownames(dat)) %>%
dplyr::arrange(match(geneID_symbol, rownames(dat))) %>%
dplyr::pull(logfc)
} else {
m1 <- merge(id_df, fold_change, by.x = "geneID") %>%
dplyr::filter(geneID_symbol %in% rownames(dat)) %>%
dplyr::arrange(match(geneID_symbol, rownames(dat))) %>%
dplyr::pull(logfc)
}
dat <- dat %>% dplyr::mutate(logFC = m1)
p <- suppressWarnings(
GOplot::GOChord(dat,
space = 0.02,
gene.space = lst$gene_space,
gene.order = "logFC",
gene.size = lst$main_text_size,
process.label = lst$legend_text_size,
border.size = 0.1,
ribbon.col = cols,
lfc.col = c( up_color,"grey50",down_color )
)
)
}
# p <- p+ theme(legend.title=element_text(size=lst$legend_text_size))
}
#--- GO/KEGG: network plot ---#
## plot enriched terms in network with edges connecting overlapping gene sets,
## mutually overlapping gene sets are tend to cluster together
if (plot_type == "network") {
if (missing(scale_ratio)) scale_ratio <- 1
# pkgs <- c("ggnewscale", "ggraph", "igraph")
# invisible(sapply(pkgs, function(x) {
# if (!requireNamespace(x, quietly = TRUE)) {
# utils::install.packages(x)
# }
# }))
id <- enrich_df[,grepl("^id",colnames(enrich_df),ignore.case = T)]
enrichGenes <- strsplit(enrich_df$geneID, "\\/") %>% setNames(id)
# kegg result just use JC method
if (!any(grepl("GO:", id))) sim_method <- "JC"
if (sim_method == "JC") {
m <- get_JC_data(enrich_df)
} else {
m <- get_sim_data(enrich_df, org = NULL, ont = NULL, sim_method)[["m"]]
rownames(m) <- colnames(m) <- enrich_df %>%
dplyr::filter(.[[1]] %in% colnames(m)) %>%
dplyr::pull(Description)
}
mm <- reshape2::melt(m)
mm <- mm[mm[, 1] != mm[, 2], ]
mm <- mm[!is.na(mm[, 3]), ]
# construct igraph
g <- graph.data.frame(mm[, -3], directed = FALSE)
E(g)$width <- sqrt(mm[, 3] * 5) * scale_ratio
E(g)$weight <- mm[, 3]
g <- delete.edges(g, E(g)[mm[, 3] < 0.2])
id_order <- unlist(sapply(V(g)$name, function(x) which(x == enrich_df$Description)))
id_genes <- sapply(enrichGenes[id_order], length)
V(g)$size <- id_genes
V(g)$color <- enrich_df[id_order, stats_metric]
# default main and lengend text size
if (!"main_text_size" %in% names(lst)) lst$main_text_size <- 3
if (!"legend_text_size" %in% names(lst)) lst$legend_text_size <- 8
# igraph to ggplot
p <- ggraph(g, layout) +
geom_edge_link(
alpha = .8, aes_(width = ~ I(width)),
colour = "darkgrey"
) +
ggnewscale::new_scale_fill() +
geom_point(shape = 21, aes_(
x = ~x, y = ~y,
fill = ~color,
size = ~size
)) +
scale_size_continuous(
name = "Number of genes",
guide = "legend",
range = c(min(V(g)$size) / 2, max(V(g)$size) / 2) * scale_ratio
) +
scale_fill_continuous(
low = up_color, high = down_color,
name = stats_metric_label
) +
theme(panel.background = element_blank()) +
geom_node_text(aes_(label = ~name),
data = NULL,
size = lst$main_text_size,
bg.color = "white",
repel = TRUE, segment.size = 0.2
) +
guides(fill = guide_colorbar(reverse = TRUE)) +
plot_theme(
remove_border = T, remove_main_text = T,
border_thick = 0, ...
)
}
#--- GO/KEGG: wordcloud ---#
if (plot_type == 'wordcloud'){
# if (!requireNamespace("wordcloud", quietly = TRUE)) {
# utils::install.packages("wordcloud")
# }
mypal <- RColorBrewer::brewer.pal(9, "Paired")
dat <- data.frame(term = enrich_df$Description)
x <- tm::Corpus(tm::VectorSource(dat$term))
tdm <- tm::TermDocumentMatrix(x, control = list(removePunctuation = TRUE,
stopwords = TRUE))
m <- as.matrix(tdm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
p <- invisible(wordcloud::wordcloud(d$word, d$freq, min.freq = 0,colors = mypal))
}
#--- GO/KEGG: upset plot ---#
if (plot_type == "upset") {
if (!"main_text_size" %in% names(lst)) lst$main_text_size <- 10
if (!"legend_text_size" %in% names(lst)) lst$legend_text_size <- 8
if (!"legend_position" %in% names(lst)) lst$legend_position <- 'left'
plot_df <- enrich_df %>% as.upset() %>%
do.call(cbind,.) %>% as.data.frame()
p <- plotVenn(enrich_df, use_venn = FALSE,...)
}
#--- GO: wego plot ---#
## WEGO plot show all ontologies
if (plot_type == "wego") {
if (!"ontology" %in% tolower(colnames(enrich_df))) {
stop("WEGO plot needs a column named 'ontology' which infers ontology type of 'bp', 'cc' or 'mf'...")
}else{
colnames(enrich_df)[grep("ontology", colnames(enrich_df), ignore.case = T)] <- "Ontology"
}
if(missing(n_term)) n_term = 5
if(is.null(wrap_length)) wrap_length = 30
if (!"main_text_size" %in% names(lst)) lst$main_text_size <- 8
wego <- enrich_df %>%
dplyr::select(1,'Description', "Count", "GeneRatio", "Ontology") %>%
dplyr::mutate(GeneRatio = GeneRatio * 100) %>%
dplyr::group_by(Ontology) %>%
dplyr::top_n(n_term, GeneRatio) %>%
dplyr::ungroup() %>%
dplyr::arrange(Ontology, GeneRatio) %>%
dplyr::mutate(Position = dplyr::n():1) %>%
dplyr::mutate(Ontology = dplyr::case_when(
tolower(Ontology) == "bp" ~ "Biological Process",
tolower(Ontology) == "cc" ~ "Cellular Component",
tolower(Ontology) == "mf" ~ "Molecular Function"
))
normalizer <- max(wego$Count) / max(wego$GeneRatio)
p <- ggplot(data = wego, aes(
x = forcats::fct_reorder(Description, sort(Position)),
y = GeneRatio, fill = Ontology
)) +
ggsci::scale_fill_nejm() +
geom_col(data = wego, aes(
x = forcats::fct_reorder(Description, sort(Position)),
y = Count / normalizer
)) +
scale_y_continuous(sec.axis = sec_axis(
trans = ~ . * normalizer, name = "Number of genes",
labels = function(b) {
round(b, 0)
}
)) +
plot_theme(remove_legend = T, ...) +
scale_x_discrete(labels = text_wraper(wrap_length)) +
xlab(NULL) +
ylab("Gene Ratio(%)") +
facet_grid(. ~ Ontology, scales = "free") +
theme(
axis.text.x = element_text(angle = 70, hjust = 1),
strip.text.x = element_text(size = lst$main_text_size)
)
}
#--- GO: term map plot ---#
## show enriched terms structure (its parents and children terms)
if (plot_type == "gomap") {
if (missing(scale_ratio)) scale_ratio <- 1
# if (!requireNamespace("ggraph", quietly = TRUE)) {
# utils::install.packages("ggraph")
# }
# if (!requireNamespace("igraph", quietly = TRUE)) {
# utils::install.packages("igraph")
# }
requireNamespace("ggraph", quietly = T)
requireNamespace("igraph", quietly = T)
rownames(enrich_df) = enrich_df[,1]
id <- enrich_df[, 1]
enrichGenes <- strsplit(enrich_df$geneID, "\\/") %>% setNames(id)
# get all id ancestors
GOANCESTOR <- get_ancestor_data(enrich_df, ont = NULL)
id_anc <- AnnotationDbi::mget(id, GOANCESTOR)
anc1 <- id_anc[[1]]
for (i in 2:length(id_anc)) {
anc1 <- intersect(anc1, id_anc[[i]])
}
uanc <- unique(unlist(id_anc))
uanc <- uanc[!uanc %in% anc1]
# get plot edge and nodes data
gotbl <- get_gosim_data()
edge <- gotbl[gotbl$go_id %in% unique(c(id, uanc)), ] %>%
dplyr::select(c(5, 1, 4))
node <- gotbl %>%
dplyr::filter(go_id %in% unique(c(edge[, 1], edge[, 2]))) %>%
dplyr::select(1:3) %>%
dplyr::distinct() %>%
dplyr::mutate(
color = enrich_df[go_id, stats_metric],
size = sapply(enrichGenes[go_id], length)
)
rm(gotbl, envir = .genekitrEnv)
# only show top nodes/id-related parent and child nodes
show_node_top <- table(edge$parent) %>%
sort() %>%
utils::tail(3) %>%
names()
show_node_parent <- edge %>%
dplyr::filter(go_id %in% id) %>%
dplyr::pull(parent)
show_node_child <- edge %>%
dplyr::filter(parent %in% id) %>%
dplyr::pull(go_id)
show_nodes <- unique(c(id, show_node_top, show_node_parent, show_node_child))
sub_node <- node %>%
dplyr::mutate(Term = ifelse(go_id %in% show_nodes,
stringr::str_replace(Term, "^\\w{1}", toupper), NA
)) %>%
dplyr::mutate(Term = ifelse(is.na(Term), NA,
sapply(strwrap(Term, width = wrap_length, simplify = FALSE),
paste,
collapse = "\n"
)
))
g <- igraph::graph.data.frame(edge, directed = TRUE, vertices = sub_node)
E(g)$Relationship <- edge[, 3]
# default main and lengend text size
if (!"main_text_size" %in% names(lst)) lst$main_text_size <- 3
if (!"legend_text_size" %in% names(lst)) lst$legend_text_size <- 8
p <- ggraph(g, layout = "sugiyama") +
geom_edge_link(aes_(linetype = ~Relationship),
arrow = grid::arrow(length = unit(1, "mm")),
end_cap = circle(1, "mm"),
colour = "darkgrey"
) +
geom_node_point(size = 3 * scale_ratio, aes_(color = ~color)) +
scale_color_continuous(
low = up_color, high = down_color, name = stats_metric_label,
guide = guide_colorbar(reverse = TRUE)
) +
geom_node_label(aes_(label = ~Term, color = ~color),
size = lst$main_text_size,
repel = TRUE, segment.size = 0.2,
max.overlaps = 16
) +
scale_fill_continuous(
low = up_color, high = down_color, name = stats_metric_label,
guide = guide_colorbar(reverse = TRUE), na.value = "white"
) +
plot_theme(border_thick = 0, remove_main_text = T, ...)
}
#--- GO: term heatmap---#
if(plot_type == "goheat"){
if (!sim_method %in% c("Resnik", "Lin", "Rel", "Jiang", "Wang")) {
stop('Please choose "sim_method" from: "Resnik", "Lin", "Rel", "Jiang" , "Wang"!')
}
l <- get_sim_data(enrich_df, org = NULL, ont = NULL, sim_method)
simMatrix <- l[["m"]]
reducedTerms <- l[["r"]]
if (!"main_text_size" %in% names(lst)) lst$main_text_size <- 10
if (!"legend_text_size" %in% names(lst)) lst$legend_text_size <- 8
if (!"remove_legend" %in% names(lst)) lst$remove_legend <- FALSE
my_cols <- c(
"#B2DF8A", "#FB9A99", "#E31A1C", "#B15928", "#6A3D9A", "#CAB2D6",
"#A6CEE3", "#1F78B4", "#FDBF6F", "#999999", "#FF7F00",
"#223D6C", "#D20A13", "#FFD121", "#088247", "#11AA4D", "#58CDD9",
"#7A142C", "#5D90BA", "#029149", "#431A3D", "#91612D", "#6E568C",
"#E0367A", "#D8D155", "#64495D", "#7CC767"
)
anno <- data.frame(ParentTerm = factor(reducedTerms[match(rownames(simMatrix),
reducedTerms$go), "parentTerm"]),
row.names = rownames(simMatrix))
anno_col <- list(ParentTerm = my_cols[1:length(unique(anno$ParentTerm))])
anno$ParentTerm <- factor(stringr::str_replace(anno$ParentTerm, "^\\w{1}", toupper))
names(anno_col$ParentTerm) <- levels(anno$ParentTerm)
p <- pheatmap::pheatmap(simMatrix,
# color = colorRampPalette(c("#1F78B4", "white", "#E31A1C"))(50),
annotation_row = anno,
annotation_colors = anno_col,
annotation_names_row = F,
show_colnames = F,
treeheight_col = 0,
fontsize = lst$legend_text_size,
fontsize_row = lst$main_text_size,
legend = !lst$remove_legend
# treeheight_row = 0
)
}
#--- GO: term tangram ---#
if (plot_type == "gotangram") {
if (!sim_method %in% c("Resnik", "Lin", "Rel", "Jiang", "Wang")) {
stop('Please choose "sim_method" from: "Resnik", "Lin", "Rel", "Jiang" , "Wang"!')
}
my_cols <- c(
"#B2DF8A", "#FB9A99", "#E31A1C", "#B15928", "#6A3D9A", "#CAB2D6",
"#A6CEE3", "#1F78B4", "#FDBF6F", "#999999", "#FF7F00",
"#223D6C", "#D20A13", "#FFD121", "#088247", "#11AA4D", "#58CDD9",
"#7A142C", "#5D90BA", "#029149", "#431A3D", "#91612D", "#6E568C",
"#E0367A", "#D8D155", "#64495D", "#7CC767"
)
l <- get_sim_data(enrich_df, org = NULL, ont = NULL, sim_method)
simMatrix <- l[["m"]]
reducedTerms <- l[["r"]]
if (!"main_text_size" %in% names(lst)) lst$main_text_size <- 8
p <- treemap::treemap(reducedTerms,
index = c("parentTerm", "term"),
vSize = "score", type = "index", title = "",
palette = RColorBrewer::brewer.pal(length(unique(reducedTerms$parent)),'Set2'),
fontsize.labels = lst$main_text_size,
fontcolor.labels = c("#FFFFFFDD", "#00000080"),
bg.labels = 0,
border.col = "#00000080")
}
#--- keggpath ---#
# if(plot_type == 'keggpath'){
# # if (!requireNamespace("pathview", quietly = TRUE)) {
# # warning('Depends on pathview package. Installing...')
# # utils::install.packages('pathview')
# # }
#
# if(is.null(fold_change))
# stop('Please give fold change or logFC values with gene IDs as names!')
#
# org = substr(enrich_df[1,1],1,3) #e.g. hsa
# ids = enrich_df$ID
# if(!all(grepl("^[a-z]{3}.*",ids))) stop('Please give a kegg result...')
#
# max_fc <- max(abs(fold_change))
# bins <- ceiling(max_fc) * 2
# p <- lapply(ids, function(i) {
# print(paste0('Now plotting ',which(ids%in%i),'/',length(ids),': ',i))
# suppressPackageStartupMessages(pathview::pathview(gene.data=fold_change,
# pathway.id = i,
# species = org,
# limit = list(gene=max_fc, cpd=1),
# bins = list(gene=bins, cpd=10),
# low = list(gene="blue", cpd="blue"),
# high = list(gene="red", cpd="yellow"),
# out.suffix='genekitr',
# kegg.native=TRUE,
# new.signature=FALSE))
# })
# invisible(p)
# }
# wrap long text
if (!is.null(wrap_length) & is.numeric(wrap_length) & plot_type != 'wego') {
p <- p + scale_y_discrete(labels = text_wraper(wrap_length))
}
if(plot_type != "gotangram") return(p)
}
#--- sub-function---#
text_wraper <- function(width) {
function(x) {
lapply(strwrap(x, width = width, simplify = FALSE), paste, collapse = "\n")
}
}
ratio_intersect <- function(n, m) {
n <- unlist(n)
m <- unlist(m)
length(intersect(n, m)) / length(unique(c(n, m)))
}
get_JC_data <- function(enrich_df) {
id <- enrich_df[, 1]
enrichGenes <- strsplit(enrich_df$geneID, "\\/") %>% setNames(id)
n <- nrow(enrich_df)
m <- matrix(NA, nrow = n, ncol = n)
colnames(m) <- rownames(m) <- enrich_df$Description
for (i in seq_len(n - 1)) {
for (x in (i + 1):n) {
m[i, x] <- ratio_intersect(enrichGenes[id[i]], enrichGenes[id[x]])
}
}
return(m)
}
get_sim_data <- function(enrich_df, org = NULL, ont = NULL, sim_method) {
org_name <- genekitr::biocOrg_name$short_name
if (is.null(org)) {
tryCatch(
{
nm <- strsplit(colnames(enrich_df)[1], "_") %>% unlist()
org <- nm[1]
},
error = function(e) {
message("Please rename the ID column with organism short name from genekitr::biocOrg_name!\nOR you can use 'JC' method")
}
)
} else if (!tolower(org) %in% org_name) {
stop("Please rename the ID column with organism short name from genekitr::biocOrg_name!\nOR you can use 'JC' method.")
}
if (is.null(ont)) {
tryCatch(
{
nm <- strsplit(colnames(enrich_df)[1], "_") %>% unlist()
ont <- nm[2]
},
error = function(e) {
message('Please rename the ID column with ontology of "BP", "CC" and "MF"!\nOR you can use "JC" method.')
}
)
} else if (!tolower(ont) %in% c("bp", "cc", "mf")) {
stop('Please rename the ID column with ontology of "BP", "CC" and "MF"!\nOR you can use "JC" method.')
}
ont <- toupper(ont)
# if(all(tolower(nm) == 'id')){
# stop(paste0('Please give organism name to "org" such as "Hs", "Mm"...\n',
# 'also give ontology to "ont" from "BP","CC" and "MF"'))
# }else{
# org = nm[1]; ont = nm[2]
# }
if(tolower(org) == 'at'){
orgdb <- 'org.At.tair.db'
} else if (tolower(org) == 'sc') {
orgdb <- 'org.Sc.sgd.db'
}else{
orgdb <- paste0("org.", stringr::str_to_title(org), ".eg.db")
}
id <- enrich_df[, 1]
# save godata is saving time
data_dir <- tools::R_user_dir("genekitr", which = "data")
data_dir <- paste0(data_dir, "/godata")
if (!dir.exists(data_dir)) dir.create(data_dir, recursive = TRUE)
destfile <- paste0(data_dir, "/", org, "_", ont, "_godata.rda")
if (!file.exists(destfile)) {
orgdb <- loadOrgdb(orgdb)
semdata <- GOSemSim::godata(orgdb, ont = ont)
save(semdata, file = destfile)
} else {
load(destfile)
}
# calculate simMatrix(m)
id <- unique(id)
found <- id %in% names(semdata@IC)
getAncestors <- utils::getFromNamespace("getAncestors", "GOSemSim")
hasAncestor <- !is.na(sapply(id, function(x) {
tryCatch(getAncestors(ont)[x],
error = function(e) NA
)
}))
id <- id[found & hasAncestor]
m <- suppressMessages(matrix(GOSemSim::goSim(id, id, semData = semdata, measure = sim_method),
ncol = length(id), dimnames = list(id, id)
))
out <- apply(m, 2, function(x) all(is.na(x)))
m <- m[!out, !out]
# reduce redundant terms
if(all(enrich_df$qvalue%in%NA)){
scores <- setNames(-log10(enrich_df$p.adjust), enrich_df[, 1])
}else{
scores <- setNames(-log10(enrich_df$qvalue), enrich_df[, 1])
}
r <- rrvgo::reduceSimMatrix(m, scores, orgdb = orgdb)
return(list(m = m, r = r))
}
get_ancestor_data <- function(enrich_df, ont = NULL) {
nm <- strsplit(colnames(enrich_df)[1], "_") %>% unlist()
if (all(tolower(nm) == "id")) {
stop(paste0('Please give ontology to "ont" from "BP","CC" and "MF"'))
} else {
ont <- nm[2]
}
# save ancestor_data is saving time
data_dir <- tools::R_user_dir("genekitr", which = "data")
data_dir <- paste0(data_dir, "/ancestor_data")
if (!dir.exists(data_dir)) dir.create(data_dir, recursive = TRUE)
destfile <- paste0(data_dir, "/", ont, "_ancestor.rda")
getAncestors <- utils::getFromNamespace("getAncestors", "GOSemSim")
GOANCESTOR <- getAncestors(ont)
return(GOANCESTOR)
}
loadOrgdb <- function(orgdb) {
if (!requireNamespace(orgdb, quietly = TRUE)) {
stop("Bioconductor orgdb for ", orgdb, " not found. You should install first.",
call. = FALSE
)
}
eval(parse(text = paste0(orgdb, "::", orgdb)))
}
get_gosim_data <- function(){
.initial()
utils::data("gotbl", package = "GOSemSim",envir = .genekitrEnv)
gotbl <- get("gotbl", envir = .genekitrEnv)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.