# Global knitr chunk options: echo the code of every chunk, disable chunk
# caching, and keep warnings and messages out of the rendered output.
knitr::opts_chunk$set(echo = TRUE, cache = FALSE, warning = FALSE, message = FALSE)

library(tidyverse); library(knitr);

Steps to reproduce pathwayDB

# Build `pathwayDB`: a long table with one row per (pathway, gene) pair plus a
# `DB` column naming the source file the row came from. Each CSV is read
# without a header; the first column (V1) becomes `Pathways` and every other
# column is stacked into `Genes`. Empty cells are dropped.
databaseFiles <- c(
  "BioCarta_2016.csv",
  "KEGG_2019_Human.csv",
  "Reactome_2016.csv",
  "WikiPathways_2019_Human.csv"
)
pathwayDB <- lapply(databaseFiles, function(pathwayName) {
  cat("Processing: ", pathwayName, fill = TRUE)
  csvPath <- here::here(
    "inst", "extdata", "dataCleaning", "pathwayDB", "data", pathwayName
  )
  # Be explicit about strings: before R 4.0 read.csv() defaulted to factors,
  # which made the column types of the result depend on the R version.
  dat <- read.csv(csvPath, header = FALSE, stringsAsFactors = FALSE)
  # Empty cells become NA so they can be filtered out after reshaping.
  dat[dat == ""] <- NA
  dat %>%
    # gather() is kept (instead of pivot_longer()) so the row order of the
    # resulting dataset does not change.
    gather(Members, Genes, -V1) %>%
    filter(!is.na(Genes)) %>%
    dplyr::select(Pathways = V1, Genes) %>%
    # Strip only a literal, trailing ".csv"; the previous pattern ".csv" was an
    # unanchored regex in which "." matched any character.
    mutate(DB = sub("\\.csv$", "", pathwayName))
}) %>%
  dplyr::bind_rows()

Pathway DB characteristics

Which DB has the most genesets?

# Bar chart of the number of distinct pathways in each database, with the
# databases ordered from the largest to the smallest count.
pathwayDB %>%
  distinct(DB, Pathways) %>%
  count(DB) %>%
  ggplot(aes(x = reorder(DB, -n), y = n)) +
  geom_col() +
  labs(x = "DB", y = "Number of pathways per DB") +
  theme_classic()

Reactome has the most genesets, whereas BioCarta has the fewest genesets.

Which DB captures the largest number of unique genes?

# Bar chart of the number of unique genes covered by each database, with the
# databases ordered from the largest to the smallest count.
pathwayDB %>%
  group_by(DB) %>%
  summarise(n = n_distinct(Genes)) %>%
  ggplot(aes(x = reorder(DB, -n), y = n)) +
  geom_bar(stat = "identity") +
  # The plotted value is n_distinct(Genes), i.e. genes rather than genesets.
  ylab("Number of unique genes") +
  xlab("DB") +
  theme_classic()

BioCarta captures the fewest unique genes, whereas the remaining three capture >5K genes.

Which pathway is the largest per DB?

# Number of rows (gene members) per pathway within each database. The result
# stays grouped by `DB`, so later per-group operations act once per database.
pathwayTally <- pathwayDB %>%
  group_by(DB) %>%
  count(Pathways)

# Histogram of geneset sizes, one panel per database with free axes and a
# log10-scaled count axis.
ggplot(pathwayTally, aes(x = n)) +
  geom_histogram() +
  facet_wrap(~ DB, scales = "free") +
  scale_y_log10() +
  labs(
    x = "Number of genes",
    y = "Frequency of genesets with a given number of genes"
  ) +
  theme_classic()

Genesets with the largest number of gene members (n)

r kable(pathwayTally %>% arrange(desc(n)) %>% slice(1), "markdown")

Genesets with the smallest number of gene members (n)

r kable(pathwayTally %>% arrange(n) %>% slice(1), "markdown")

Save package data

# Save `pathwayDB` as package data via usethis, replacing any existing copy
# (overwrite = TRUE).
usethis::use_data(pathwayDB, overwrite = TRUE)


singha53/omicsCentralDatasets documentation built on March 7, 2020, 2:02 a.m.