# Chunk defaults for the rendered document: show the code, never cache
# results, and keep warnings and messages out of the output.
knitr::opts_chunk$set(
  echo = TRUE,
  cache = FALSE,
  warning = FALSE,
  message = FALSE
)

# Packages attached for the rest of the document.
library(tidyverse)
library(knitr)
# Gene-set tables to import; each file is read from
# inst/extdata/pathwayDB/data inside the project.
databaseFiles <- c(
  "BioCarta_2016.csv",
  "KEGG_2019_Human.csv",
  "Reactome_2016.csv",
  "WikiPathways_2019_Human.csv"
)

# Build one long table with a row per (pathway, gene) pair plus a DB column
# holding the source file name without its extension.
pathwayDB <- lapply(databaseFiles, function(pathwayName) {
  cat("Processing: ", pathwayName, fill = TRUE)

  # Files are read without a header: the first column (V1) is used as the
  # pathway name and every other column as a member gene.
  dat <- read.csv(
    here::here("inst", "extdata", "pathwayDB", "data", pathwayName),
    header = FALSE
  )

  # Empty cells (presumably padding for pathways with fewer genes) become NA
  # so they can be dropped after reshaping.
  dat[dat == ""] <- NA

  dat %>%
    # NOTE(review): gather() is superseded by pivot_longer(); it is kept here
    # so that the row order of the saved table does not change.
    gather(Members, Genes, -V1) %>%
    filter(!is.na(Genes)) %>%
    rename(Pathways = V1) %>%
    dplyr::select(Pathways, Genes) %>%
    # Strip the extension literally. The previous gsub(".csv", "", ...) used
    # "." as a regex wildcard and was not anchored to the end of the name.
    mutate(DB = tools::file_path_sans_ext(pathwayName))
}) %>%
  # Stack the per-database tables in the order of databaseFiles.
  dplyr::bind_rows()
# Bar chart of the number of distinct pathways in each database, with the
# databases ordered from most to fewest pathways.
ggplot(
  data = pathwayDB %>%
    dplyr::select(DB, Pathways) %>%
    group_by(DB) %>%
    summarise(n = n_distinct(Pathways)),
  mapping = aes(x = reorder(DB, -n), y = n)
) +
  geom_col() +
  labs(x = "DB", y = "Number of pathways per DB") +
  theme_classic()
Reactome has the most genesets, whereas BioCarta has the fewest.
# Bar chart of the number of distinct genes covered by each database, with
# the databases ordered from most to fewest genes.
ggplot(
  data = pathwayDB %>%
    group_by(DB) %>%
    summarise(n = n_distinct(Genes)),
  mapping = aes(x = reorder(DB, -n), y = n)
) +
  geom_col() +
  # The plotted value is n_distinct(Genes), so the axis is labelled as unique
  # genes (the previous label, "Number of genesets", described a different
  # quantity).
  labs(x = "DB", y = "Number of unique genes") +
  theme_classic()
BioCarta captures the fewest unique genes, whereas the remaining three each capture >5K genes.
# Number of genes in every pathway of every database.
# summarise() drops only the innermost grouping (Pathways) by default, so
# pathwayTally stays grouped by DB; later per-database summaries depend on
# that, so it is deliberately not ungrouped here.
pathwayTally <- pathwayDB %>%
  group_by(DB, Pathways) %>%
  summarise(n = n())

# Distribution of geneset sizes, one panel per database with free axes and a
# log10 count axis.
ggplot(pathwayTally, aes(x = n)) +
  geom_histogram() +
  facet_wrap(vars(DB), scales = "free") +
  scale_y_log10() +
  labs(
    x = "Number of genes",
    y = "Frequency of genesets with a given number of genes"
  ) +
  theme_classic()
r pathwayTally %>%
  # Largest genesets first; pathwayTally is expected to still be grouped by
  # DB, in which case slice(1) keeps the top row of each database.
  arrange(desc(n)) %>%
  slice(1) %>%
  kable("markdown")
r pathwayTally %>%
  # Smallest genesets first; pathwayTally is expected to still be grouped by
  # DB, in which case slice(1) keeps the top row of each database.
  arrange(n) %>%
  slice(1) %>%
  kable("markdown")
# Store the combined pathway table as package data via usethis::use_data();
# overwrite = TRUE replaces any previously saved copy.
usethis::use_data(pathwayDB, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.