knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.align = "center" )
# run before class: # devtools::install_github('rnabioco/practical-data-analysis') # BiocManager::install("biomaRt") library(pbda) library(tidyverse) library(cowplot) library(gprofiler2)
Handle gene lists
Try to connect prior knowledge to the results (genes you study, GO terms, or something like yeast_prot_prop
)
Using some basic R and tibble manipulation commands
Recall how data is saved in objects
Or whatever file you already saved containing lists of gene names
# from DESeq2 output, get a tibble back # res <- results(dds) # saveRDS(res, "res_class4.rds") # save any object from environment for next time loc <- system.file("extdata", "res_class4.rds", package = 'pbda') res <- readRDS(loc) # loading a DESeq2 object automatically loads DESeq2 head(res) # basically a dataframe res_tibble <- as_tibble(res, rownames = "gene") res_tibble # res_tibble contains calculations for all genes, pick out "interesting" genes gene_tibble <- res_tibble %>% filter(!is.na(padj)) %>% # NA values from padj are from tossing low expressing genes filter(padj <= 0.01) %>% # padj <= 0.05 is common, or 0.01, sometimes even 0.1 is acceptable filter(log2FoldChange >= 1 | log2FoldChange <= -1) %>% arrange(desc(log2FoldChange)) gene_tibble # get top 10 down-regulated genes gene_tibble_down10 <- gene_tibble %>% filter(log2FoldChange < 0) %>% arrange(log2FoldChange) %>% dplyr::slice(1:10) # get a vector of gene names gene_vector <- gene_tibble$gene gene_vector <- gene_tibble %>% pull(gene) # at least 5 more ways to do this
write_csv(gene_tibble, "gene_tibble.csv") # or load already saved file on disk loc <- system.file("extdata", "gene_tibble.csv", package = 'pbda') gene_tibble2 <- read_csv(loc) # reads out as tibble # or write as txt file, without col name write_csv(gene_tibble %>% select(gene), "gene_tibble.txt", col_names = FALSE) # write from tibble write_lines(gene_vector, "gene_tibble_v.txt") # write for vector
g:Profiler https://biit.cs.ut.ee/gprofiler/
Here we consider the simple case of single, unordered lists of significantly differentially expressed human genes. g:Profiler can also handle ordered lists, other species, multiple lists in the same query, arbitry thresholds, several different multiple testing correction methods, arbitrary source selection, and much more.
# generate lists of singificantly up/downregulated genes from the RNA-seq example upregulated_genes <- gene_tibble %>% filter(log2FoldChange >= 1, padj <= 0.01) %>% pull(gene) downregulated_genes <- gene_tibble %>% filter(log2FoldChange <= -1, padj <= 0.01) %>% pull(gene) # gost: gene list functional enrichment (for unordered human lists, only the list is required) gost_up <- gost(upregulated_genes, organism = "hsapiens", ordered_query = FALSE ) gost_down <- gost(downregulated_genes, organism = "hsapiens", ordered_query = FALSE) # inspect the resulting gost output: # meta slot contains information about the query, results and gene mapping # All three print pretty unweildy lists, but can be very useful for troubleshooting # gost_up$meta$query_metadata # gost_up$meta$result_metadata # gost_up$meta$genes_metadata # result slot contains the enrichment information as_tibble(gost_up$result) # for a quick look a the results, we can plot the -log10 of top 20 entries in both lists gost_log <- bind_rows( as_tibble(gost_up$result) %>% mutate(list = "up"), as_tibble(gost_down$result) %>% mutate(list = "down") ) %>% mutate(minus_log10 = -log10(p_value), list = factor(list, levels = c("up", "down"))) %>% group_by(list) %>% arrange(desc(minus_log10)) %>% dplyr::slice(1:20) gost_log %>% select(list, term_name, minus_log10) ggplot(gost_log, aes(x = reorder(term_name, minus_log10), y = minus_log10)) + #reorder Term by minus_log10 geom_bar(stat = "identity") + # otherwise ggplot tries to count xlab("Term Name") + facet_wrap(~list, scales = "free") + coord_flip() + theme(axis.text.y = element_text(size = 7)) # adding source information for the enriched terms can also help with interpretation: gost_log <- gost_log %>% mutate(term_name = paste(source,term_name, sep = ": ")) %>% select(list, term_name, minus_log10) ggplot(gost_log, aes(x = reorder(term_name, minus_log10), y = minus_log10)) + #reorder Term by minus_log10 geom_bar(stat = "identity") + # otherwise ggplot tries to count xlab("Term Name") + facet_wrap(~list, scales = "free") + coord_flip() + theme(axis.text.y = element_text(size = 7))
other options:
Panther http://www.geneontology.org/page/go-enrichment-analysis
GSEA (requires ranked list, for instance by fold-change, and requires full list of genes instead of just "significant") http://software.broadinstitute.org/gsea/index.jsp
or, GO and GSEA right in R (will require additional packages):
topGO: https://bioconductor.org/packages/release/bioc/html/topGO.html
fgsea: http://bioconductor.org/packages/release/bioc/html/fgsea.html
stringr
package, part of tidyverseDepending on the transcriptome and analysis options used in RNA-seq/scRNA-seq, the list of names can be messy.
loc <- system.file("extdata", "ensg_tibble.csv", package = 'pbda') ensg_tibble <- read_csv(loc) ensg_tibble # those version numbers are often not recognized by software
str_replace
: find and replace
(run ?str_replace
for documentation)
ensg_tibble_rm <- ensg_tibble %>% mutate(gene = str_replace(gene, pattern = "ENSG", replacement = "#")) ensg_tibble_rm # trimming of gene/transcript version ensg_tibble2 <- ensg_tibble %>% mutate(gene_new = str_replace(gene, pattern = "\\..*", # . and everything after replacement = "")) # empty string # regular expressions are useful and worth the time to learn, see ?base::regex ensg_tibble2 # `separate` works too, if you want to keep the info ensg_tibble3 <- ensg_tibble %>% separate(gene, into = c("gene", "version"), sep = "\\.")
# revisit the gprofiler output plot, clean up "term_name" column, stripping motif match info gost_log2 <- gost_log %>% mutate(term_name = str_replace(term_name, pattern = ";.*", # everything after and including ; replacement = "")) %>% group_by(list, term_name) %>% # we now have duplicated term_names arrange(desc(minus_log10)) %>% # we arranges them by minus_log10 dplyr::slice(1) # and take the most significant for each term ggplot(gost_log2, aes(x = reorder(term_name, minus_log10), y = minus_log10)) + #reorder term_name by minus_log10 geom_bar(stat = "identity") + # otherwise ggplot tries to count xlab("Term Name") + facet_wrap(~list, scales = "free") + coord_flip() + theme(axis.text.y = element_text(size = 7)) # to save plot ggsave("gprofiler.png", plot = last_plot(), dpi = 300)
str_detect
: spits out TRUE
or FALSE
# "-AS" denotes antisense genes, we can find all those str_detect(gene_tibble$gene, "-AS")[1:50] # use in `filter` gene_tibble_as <- gene_tibble %>% filter(str_detect(gene, "-AS")) gene_tibble_as
str_c
: concatenate strings
# also add annotations of our own gene_tibble_as2 <- gene_tibble_as %>% mutate(gene = str_c("HUMAN-", gene)) gene_tibble_as2
str_sub
: extract substrings
str_sub("HUMAN-ELOVL2-AS1", 2, 4) gene_tibble_as3 <- gene_tibble_as2 %>% mutate(gene = str_sub(gene, 2, 4)) gene_tibble_as3 # to remove annotations of fixed length gene_tibble_as4 <- gene_tibble_as2 %>% mutate(gene = str_sub(gene, 7, -5)) # extract just the gene name from "HUMAN-ELOVL2-AS1" gene_tibble_as4
BioMart can be used outside of R too: https://www.ensembl.org/biomart/martview
We will be using biomaRt
library(biomaRt) # biomaRt contains many useful datasets listDatasets(useMart("ENSEMBL_MART_ENSEMBL", host = "https://www.ensembl.org", )) %>% as_tibble() # hundreds of organisms listAttributes(useMart("ensembl", dataset = "hsapiens_gene_ensembl")) %>% as_tibble() # thousands of rows/attributes listFilters(useMart("ensembl", dataset = "hsapiens_gene_ensembl")) %>% as_tibble() # ways to filter the data
common usage 1: id to gene symbol conversion
human_mart <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", mirror = 'uswest') # assign the dataset to use ensg_sym2 <- getBM(attributes = c("hgnc_symbol", "ensembl_gene_id"), # columns you want back filters = "ensembl_gene_id", # type of info you are providing mart = human_mart, # dataset of certain species values = ensg_tibble2$gene) %>% as_tibble() # values of the filter ensg_sym2 # try running this with ensg_tibble (version # intact), can't find matches
common usage 2: functional GO term annotation
listAttributes(useMart("ensembl", dataset = "hsapiens_gene_ensembl")) %>% as_tibble() %>% filter(str_detect(description, "GO")) # look at attributes related to GO
drerio_mart <- useEnsembl("ensembl", dataset = "drerio_gene_ensembl", mirror = 'uswest') loc <- system.file("extdata", "genes_drerio.csv", package = 'pbda') genes_drerio <- read_csv(loc) genes_drerio genesGO <- getBM(attributes = c("zfin_id_symbol", "go_id", "name_1006", "namespace_1003"), filters = "zfin_id_symbol", mart = drerio_mart, values = genes_drerio$gene, TRUE) %>% as_tibble() genesGO genesGO <- genesGO %>% filter(go_id != "") # note that there are lines with empty go terms, remember to remove those # join GO dataframe with original RNA-seq/scRNA-seq data if needed genes_drerio_joinedGO <- left_join(genes_drerio, genesGO, by = c("gene" = "zfin_id_symbol")) genes_drerio_joinedGO
# or, get all genes with GO term of interest, by providing multiple filters genesGO_filtered <- getBM(attributes = c("zfin_id_symbol"), filters = c("zfin_id_symbol", "go"), mart = drerio_mart, values = list(genes_drerio$gene, c("GO:0003677"))) genesGO_filtered
common usage 3: ortholog conversion between species
# example of zebrafish to human gene symbol conversion human_mart <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", mirror = 'uswest') drerio_mart <- useEnsembl("ensembl", dataset = "drerio_gene_ensembl", mirror = 'uswest') genes_human <- getLDS(attributes = c("hgnc_symbol"), # "linked datasets" filters = "hgnc_symbol", mart = human_mart, attributesL = c("zfin_id_symbol"), martL = drerio_mart, values = genes_drerio$gene) %>% as_tibble() genes_human
Delete one argument from code above used to generate genesGO_filtered
to obtain a list of all genes with GO id of "GO:0003677".
Take a look at genesGO
. Describe in pseudo-code, how you could add GO-CC back into the brauer_gene_exp.
More sophisticated analysis than simply gene lists:
Find enriched sequence elements in promoters, UTRs, etc. See https://bedtools.readthedocs.io/en/latest/, http://meme-suite.org/, etc
WGCNA: weighted correlation network analysis
SCENIC: infer gene regulatory networks and cell types from single-cell RNA-seq data
Come to RBI office hours (Thursday 1-2:30)!
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.