knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.align = "center"
)
# run before class:
# devtools::install_github('rnabioco/practical-data-analysis')
# BiocManager::install("biomaRt")

library(pbda)
library(tidyverse)
library(cowplot)
library(gprofiler2)

Did experiment, ran through analysis pipeline, what now?

Handle gene lists

Try to connect prior knowledge to the results (genes you study, GO terms, or something like yeast_prot_prop)

Using some basic R and tibble manipulation commands

Retrieve gene lists from RNA-seq or scRNA-seq

Recall how data is saved in objects

Or whatever file you already saved containing lists of gene names

# from DESeq2 output, get a tibble back
# res <- results(dds)
# saveRDS(res, "res_class4.rds") # save any object from environment for next time
loc <- system.file("extdata", "res_class4.rds", package = 'pbda')
res <- readRDS(loc) # loading a DESeq2 object automatically loads DESeq2

head(res) # basically a dataframe

res_tibble <- as_tibble(res, rownames = "gene")

res_tibble

# res_tibble contains calculations for all genes, pick out "interesting" genes
gene_tibble <- res_tibble %>% 
  filter(!is.na(padj)) %>% # NA values from padj are from tossing low expressing genes
  filter(padj <= 0.01) %>% # padj <= 0.05 is common, or 0.01, sometimes even 0.1 is acceptable
  filter(log2FoldChange >= 1 | log2FoldChange <= -1) %>%
  arrange(desc(log2FoldChange))

gene_tibble

# get top 10 down-regulated genes
gene_tibble_down10 <- gene_tibble %>%
  filter(log2FoldChange < 0) %>% 
  arrange(log2FoldChange) %>% 
  dplyr::slice(1:10) 

# get a vector of gene names
gene_vector <- gene_tibble$gene
gene_vector <- gene_tibble %>% pull(gene) # at least 5 more ways to do this
write_csv(gene_tibble, "gene_tibble.csv")

# or load already saved file on disk
loc <- system.file("extdata", "gene_tibble.csv", package = 'pbda')
gene_tibble2 <- read_csv(loc) # reads out as tibble

# or write as txt file, without col name
write_csv(gene_tibble %>% select(gene), "gene_tibble.txt", col_names = FALSE) # write from tibble
write_lines(gene_vector, "gene_tibble_v.txt") # write for vector

Gene set enrichment: feed gene lists to g:Profiler, get enrichment score back

g:Profiler https://biit.cs.ut.ee/gprofiler/

Here we consider the simple case of single, unordered lists of significantly differentially expressed human genes. g:Profiler can also handle ordered lists, other species, multiple lists in the same query, arbitry thresholds, several different multiple testing correction methods, arbitrary source selection, and much more.

# generate lists of singificantly up/downregulated genes from the RNA-seq example
upregulated_genes <- gene_tibble %>%
  filter(log2FoldChange >= 1, padj <= 0.01) %>%
  pull(gene)

downregulated_genes <- gene_tibble %>%
  filter(log2FoldChange <= -1, padj <= 0.01) %>%
  pull(gene)

# gost: gene list functional enrichment (for unordered human lists, only the list is required)
gost_up <- gost(upregulated_genes,
                organism = "hsapiens", 
                ordered_query = FALSE
                )

gost_down <- gost(downregulated_genes,
                organism = "hsapiens",
                ordered_query = FALSE)

# inspect the resulting gost output:
# meta slot contains information about the query, results and gene mapping
# All three print pretty unweildy lists, but can be very useful for troubleshooting
# gost_up$meta$query_metadata
# gost_up$meta$result_metadata
# gost_up$meta$genes_metadata 

# result slot contains the enrichment information
as_tibble(gost_up$result)

# for a quick look a the results, we can plot the -log10 of top 20 entries in both lists
gost_log <- bind_rows(
  as_tibble(gost_up$result) %>%
    mutate(list = "up"),
  as_tibble(gost_down$result) %>%
    mutate(list = "down")
) %>%
  mutate(minus_log10 = -log10(p_value),
         list = factor(list, levels = c("up", "down"))) %>%
  group_by(list) %>%
  arrange(desc(minus_log10)) %>%
  dplyr::slice(1:20)

gost_log %>% select(list, term_name, minus_log10)

ggplot(gost_log, aes(x = reorder(term_name, minus_log10), y = minus_log10)) + #reorder Term by minus_log10
  geom_bar(stat = "identity") + # otherwise ggplot tries to count
  xlab("Term Name") +
  facet_wrap(~list, scales = "free") +
  coord_flip() +
  theme(axis.text.y = element_text(size = 7))

# adding source information for the enriched terms can also help with interpretation:
gost_log <- gost_log %>% mutate(term_name = paste(source,term_name, sep = ": ")) %>% select(list, term_name, minus_log10)

ggplot(gost_log, aes(x = reorder(term_name, minus_log10), y = minus_log10)) + #reorder Term by minus_log10
  geom_bar(stat = "identity") + # otherwise ggplot tries to count
  xlab("Term Name") +
  facet_wrap(~list, scales = "free") +
  coord_flip() +
  theme(axis.text.y = element_text(size = 7))

other options:

Panther http://www.geneontology.org/page/go-enrichment-analysis

GSEA (requires ranked list, for instance by fold-change, and requires full list of genes instead of just "significant") http://software.broadinstitute.org/gsea/index.jsp

or, GO and GSEA right in R (will require additional packages):

topGO: https://bioconductor.org/packages/release/bioc/html/topGO.html

fgsea: http://bioconductor.org/packages/release/bioc/html/fgsea.html

Sometimes transformation of names is needed - stringr package, part of tidyverse

Depending on the transcriptome and analysis options used in RNA-seq/scRNA-seq, the list of names can be messy.

loc <- system.file("extdata", "ensg_tibble.csv", package = 'pbda')
ensg_tibble <- read_csv(loc)

ensg_tibble # those version numbers are often not recognized by software

str_replace: find and replace (run ?str_replace for documentation)

ensg_tibble_rm <- ensg_tibble %>% mutate(gene = str_replace(gene, 
                                                            pattern = "ENSG",
                                                            replacement = "#"))

ensg_tibble_rm

# trimming of gene/transcript version
ensg_tibble2 <- ensg_tibble %>% mutate(gene_new = str_replace(gene, 
                                                          pattern = "\\..*", # . and everything after
                                                          replacement = "")) # empty string
# regular expressions are useful and worth the time to learn, see ?base::regex

ensg_tibble2

# `separate` works too, if you want to keep the info
ensg_tibble3 <- ensg_tibble %>% separate(gene, into = c("gene", "version"), sep = "\\.")
# revisit the gprofiler output plot, clean up "term_name" column, stripping motif match info
gost_log2 <- gost_log %>% mutate(term_name = str_replace(term_name,
                                                         pattern = ";.*", # everything after and including ;
                                                         replacement = "")) %>%
  group_by(list, term_name) %>% # we now have duplicated term_names
  arrange(desc(minus_log10)) %>% # we arranges them by minus_log10
  dplyr::slice(1) # and take the most significant for each term

ggplot(gost_log2, aes(x = reorder(term_name, minus_log10), y = minus_log10)) + #reorder term_name by minus_log10
  geom_bar(stat = "identity") + # otherwise ggplot tries to count
  xlab("Term Name") +
  facet_wrap(~list, scales = "free") +
  coord_flip() +
  theme(axis.text.y = element_text(size = 7))
# to save plot
ggsave("gprofiler.png", plot = last_plot(), dpi = 300)

str_detect: spits out TRUE or FALSE

# "-AS" denotes antisense genes, we can find all those
str_detect(gene_tibble$gene, "-AS")[1:50]

# use in `filter`
gene_tibble_as <- gene_tibble %>% filter(str_detect(gene, "-AS"))

gene_tibble_as

str_c: concatenate strings

# also add annotations of our own
gene_tibble_as2 <- gene_tibble_as %>% mutate(gene = str_c("HUMAN-", gene))

gene_tibble_as2

str_sub: extract substrings

str_sub("HUMAN-ELOVL2-AS1", 2, 4)

gene_tibble_as3 <- gene_tibble_as2 %>% mutate(gene = str_sub(gene, 2, 4))

gene_tibble_as3

# to remove annotations of fixed length
gene_tibble_as4 <- gene_tibble_as2 %>% mutate(gene = str_sub(gene, 7, -5)) # extract just the gene name from "HUMAN-ELOVL2-AS1"

gene_tibble_as4

Various conversions in R - BioMart

BioMart can be used outside of R too: https://www.ensembl.org/biomart/martview

We will be using biomaRt

library(biomaRt)
# biomaRt contains many useful datasets
listDatasets(useMart("ENSEMBL_MART_ENSEMBL", host = "https://www.ensembl.org", )) %>% as_tibble() # hundreds of organisms

listAttributes(useMart("ensembl", dataset = "hsapiens_gene_ensembl")) %>% as_tibble() # thousands of rows/attributes

listFilters(useMart("ensembl", dataset = "hsapiens_gene_ensembl")) %>% as_tibble() # ways to filter the data

common usage 1: id to gene symbol conversion

human_mart <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", mirror = 'uswest') # assign the dataset to use

ensg_sym2 <- getBM(attributes = c("hgnc_symbol", "ensembl_gene_id"), # columns you want back
                 filters = "ensembl_gene_id",                       # type of info you are providing
                 mart = human_mart,                                 # dataset of certain species
                 values = ensg_tibble2$gene) %>% as_tibble()        # values of the filter

ensg_sym2

# try running this with ensg_tibble (version # intact), can't find matches

common usage 2: functional GO term annotation

listAttributes(useMart("ensembl", dataset = "hsapiens_gene_ensembl")) %>% 
  as_tibble() %>% 
  filter(str_detect(description, "GO")) # look at attributes related to GO 
drerio_mart <- useEnsembl("ensembl", dataset = "drerio_gene_ensembl", mirror = 'uswest')

loc <- system.file("extdata", "genes_drerio.csv", package = 'pbda')
genes_drerio <- read_csv(loc)

genes_drerio

genesGO <- getBM(attributes = c("zfin_id_symbol", "go_id", "name_1006", "namespace_1003"), 
                 filters = "zfin_id_symbol", 
                 mart = drerio_mart, 
                 values = genes_drerio$gene, TRUE) %>% as_tibble()

genesGO

genesGO <- genesGO %>% filter(go_id != "") # note that there are lines with empty go terms, remember to remove those

# join GO dataframe with original RNA-seq/scRNA-seq data if needed
genes_drerio_joinedGO <- left_join(genes_drerio, genesGO, 
                                   by = c("gene" = "zfin_id_symbol"))

genes_drerio_joinedGO
# or, get all genes with GO term of interest, by providing multiple filters
genesGO_filtered <- getBM(attributes = c("zfin_id_symbol"), 
                  filters = c("zfin_id_symbol", "go"), 
                  mart = drerio_mart, 
                  values = list(genes_drerio$gene, c("GO:0003677")))

genesGO_filtered

common usage 3: ortholog conversion between species

# example of zebrafish to human gene symbol conversion
human_mart <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", mirror = 'uswest')
drerio_mart <- useEnsembl("ensembl", dataset = "drerio_gene_ensembl", mirror = 'uswest')

genes_human <- getLDS(attributes = c("hgnc_symbol"), # "linked datasets"
                      filters = "hgnc_symbol", 
                      mart = human_mart, 
                      attributesL = c("zfin_id_symbol"), 
                      martL = drerio_mart, 
                      values = genes_drerio$gene) %>% as_tibble()

genes_human

Exercise

  1. Delete one argument from code above used to generate genesGO_filtered to obtain a list of all genes with GO id of "GO:0003677".

  2. Take a look at genesGO. Describe in pseudo-code, how you could add GO-CC back into the brauer_gene_exp.

Additional resources/reading

More sophisticated analysis than simply gene lists:



IDPT7810/practical-data-analysis documentation built on July 8, 2022, 9:32 p.m.