knitr::opts_chunk$set(

)

In this vignette, we demonstrate how to use the bbcRNA package to run an RNA-seq analysis starting from gene read count files produced by the STAR aligner.

Install bbcRNA if not installed

if (!requireNamespace("devtools", quietly = TRUE))
  install.packages("devtools")
if (!requireNamespace("bbcRNA", quietly = TRUE))
  devtools::install_github("vari-bbc/bbcRNA")

Load packages

library(bbcRNA)
library(SummarizedExperiment)
library(org.Mm.eg.db) # load the org.db for your organism
library(enrichplot)
library(ggplot2)

Get counts

ext_data_dir <- system.file("extdata/tcell", package = "bbcRNA")

tcell <- star_to_mat(dir = ext_data_dir, 
                     rgx = "^[^_]+_[^_]+", column = 2)

head(tcell)

make GRanges

This step is slow (several minutes to finish). It can be skipped if the genomic intervals of the genes are not of interest.

# txdb <- GenomicFeatures::makeTxDbFromGFF("~/Desktop/gencode.vM13.annotation.gtf",
#                                         format="gtf")
# granges <- GenomicFeatures::genes(txdb)

Make BbcSE object

The granges parameter for the BbcSE constructor is optional. If not provided, dummy GRanges data will be used. SummarizedExperiment can be coerced to RangedSummarizedExperiment without GRanges data, but DEFormat (package for converting between SummarizedExperiment, DGEList and DESeq2Data) does not work with RangedSummarizedExperiment objects with missing data for rowRanges.

# bbc_obj <- BbcSE(counts = tcell, granges = granges)
bbc_obj <- BbcSE(counts = tcell)

# show more information about the BbcSE class
getClassDef(class(bbc_obj))

bbc_obj

Get sample meta data

col_meta <- read_col_meta(paste0(ext_data_dir, "/meta.txt"))

# Add column meta data. May implement a function to add column meta data
# eventually
colData(bbc_obj)[, colnames(col_meta)] <- col_meta[colnames(bbc_obj), ]

colData(bbc_obj)

Get mapping rates

Use read_star_aln_metrics to import 'Log.final.out' files from STAR aligner. Counts for total input reads, uniqly aligning reads and multi-mapping reads are parsed from the STAR output. Overall alignment and uniquely mapping rates are calculated. Metrics are stored in the 'aln_metrics' slot.

aln_metrics <- read_star_aln_metrics(dir = ext_data_dir, 
                                 rgx = "^[^_]+_[^_]+")

# store the mapping metrics in the BbcSE object
aln_metrics(bbc_obj) <- aln_metrics

# aln_metrics is the name of both the setter and the getter function for
# aln_metrics
aln_metrics(bbc_obj)

Change the column (sample) names

Here we demonstrate how to change the sample names from the default SRA IDs to something more meaningful.

# add new column in colData by concatenating two genotype and rep columns
colData(bbc_obj)$new_sample <- paste0(colData(bbc_obj)$genotype, 
                                      colData(bbc_obj)$rep)

# store a copy of the original column names
colData(bbc_obj)$orig_sample <- colnames(bbc_obj)

# change the colnames to new_sample
colnames(bbc_obj) <- colData(bbc_obj)$new_sample

# check whether BbcSE object is still valid. colnames() setter does not check
# for unique names, but valid BbcSE objects are required to have unique row and
# col names. Could write a BbcSE method that does check for that at some point
validObject(bbc_obj)

Plot alignment rates

Here we demonstrate the plot_aln_metrics function and show that the output is a ggplot object that can be manipulated as usual, removing the legend in this case.

plot_aln_metrics(x = bbc_obj, type = "uniq_aln_reads", facet_by="genotype", fill="genotype") + ggplot2::theme(legend.position = "none")

plot_aln_metrics(x = bbc_obj, type = "uniq_map_rate", facet_by="genotype", fill="genotype") + ggplot2::theme(legend.position = "none")

Add gene symbols

ens2sym can be used to import gene symbols from an Org.Db object. Output is stored as row data and can be accessed using rowData().

bbc_obj <- ens2sym(bbc_obj, org.Mm.eg.db)

# BioMart is also supported. In mouse, Biomart seems to give more matches and
# the more appropriate symbol. However, for most protein-coding genes, the
# results should be the same between orgdb and biomart. Biomart is slower and
# can take 30 seconds-1min to finish. Another disadvantage is that while orgdb
# is a package, Biomart data needs to be downloaded each time and thus is less
# reproducible.
# bbc_obj <- ens2sym(bbc_obj, BMdataset = "mmusculus_gene_ensembl")

rowData(bbc_obj)

Get Entrez IDs for gene set analyses after DE genes are identified

Similarly, ens2entrez can be used to import Entrez IDs from an Org.Db object. Output is again stored as row data and can be accessed using rowData(). ens2sym was written with visualization in mind, so 1:1 matches are required. In contrast, ens2entrez was written to facilitate downstream gene set enrichment analyses, so we require only that each Entrez ID maps to only one Ensembl ID, but will take the first match if there are multiple possible Entrez IDs for a particular Ensembl ID. See the documentation for each function for more details.

bbc_obj <- ens2entrez(bbc_obj, org.Mm.eg.db)

# BioMart is also supported. This is handy for certain species that for some
# reason don't have Ensembl IDs in their orgdb (pig as of Nov 2019 for example)
# bbc_obj <- ens2sym(bbc_obj, BMdataset = "mmusculus_gene_ensembl")

rowData(bbc_obj)

make DGEList object

Use the makeDGEList function to make a DGEList that is stored in a BbcEdgeR object in the edger slot of the BbcSE object. By default, lowly expressed genes are removed, normalization factors are calculated and normalized counts are calculated and stored in the norm_cts slot of the BbcEdgeR object as a SummarizedExperiment. In the same SummarizedExperiment, mean normalized counts for each group specified in the DGEList object are also calculated and stored as row data.

# by default, low expression genes filtered out, normalization factors
# calculated, normalized counts calculated
bbc_obj <- makeDGEList(bbc_obj, group="genotype")

# what's in the edger slot now?
str(edger(bbc_obj))

# Number of rows in the SE
nrow(bbc_obj)

# Number of genes in edger$DGEList
nrow(dgelist(edger(bbc_obj)))

PCA

plot_PCA runs a PCA and also tests for significant clusters using vegan::adonis(). Output is a list containing: 1. ggplot object for the PCA plot 2. ggplot object for the scree plot 3. a list of results from adonis().

set.seed(100) # adonis uses permutations. Set seed to make reproducible.
pca_plots <- plot_PCA(bbc_obj, color_by="genotype", shape_by="genotype", adonis_by=c("genotype","rep"))
pca_plots[[1]]
pca_plots[[2]]

edgeR DE workflow

Use findDEGs to fit a model and test for DE genes for various contrasts. For the edgeR workflow, either glmQLFTest or glmTreat can be selected using the test argument.

bbc_obj <- findDEGs(bbc_obj, 
                    de_pkg = "edger",
                    test = "glmQLFTest", 
                    design = "~0+genotype",
                    contrasts = list(c("genotype", "mut", "WT"),
                                     c("genotype", "WT", "mut")))

# plot the P-value distribution
plot_pval_distrib(bbc_obj)

Volcano plot

# uses EnhancedVolcano package
plot_volcano(bbc_obj)

Extract the DE results as a dataframe

de_res <- get_de_table(bbc_obj)

str(de_res)

plot heatmap

Here we demonstrate the plot_heatmap function. Column names from rowData(BbcSE_object) and colData(BbcSE_object) can be passed to annotate or split the heatmap. In this example, we demonstrate how to create a new row data column to represent up-regulated versus down-regulated genes and pass it to plot_heatmap.

# get the most DE genes (by adjust pval)
top_genes <- rownames(edgeR::topTags(
  de_results(edger(bbc_obj))[[2]],
  n=20)
)

# get the sign of the LFC and store in rowData
## get the de_results slot of the BbcEdgeR object
bbc_de_res <- de_results(edger(bbc_obj))
## get contrast results
de_table <- bbc_de_res$`genotype_mut_-_WT`$table
## get the LFC sign
mut_vs_WT_sign <- ifelse(de_table$logFC > 0, "up", "down")
names(mut_vs_WT_sign) <- rownames(de_table)
## store LFC sign in rowData. Genes with no matches will be NA.
rowData(bbc_obj)$mut_vs_WT_sign <- 
  mut_vs_WT_sign[match(rownames(bbc_obj), names(mut_vs_WT_sign))]
## make rep a factor
colData(bbc_obj)$rep <- factor(colData(bbc_obj)$rep, levels=c("1", "2", "3", "4"))

# plot Zscores of normalized log2 cpms
plot_heatmap(x = bbc_obj, genes = top_genes, zscores=TRUE, 
             rowdata_annot = "mut_vs_WT_sign", coldata_annot = c("rep", "new_sample", "genotype"),
             rowdata_split = "mut_vs_WT_sign", coldata_split="genotype", 
             gene_labels = "uniq_syms")

# plot average normalized log2 cpms
plot_heatmap(x = bbc_obj, genes = top_genes, zscores=FALSE, 
             rowdata_split = "mut_vs_WT_sign",
             gene_labels = "uniq_syms", grouped=TRUE)

GSEA -- Hallmark msigdb

run_gsea is a wrapper for clusterProfiler and can be used to run GSEA for various gene sets. The output are gseaResult objects that can be visualized using enrichplot methods or processed further with other clusterProfiler methods.

# set seed to make GSEA deterministic
set.seed(42)

H_gsea_results_list <- run_gsea(x = bbc_obj,
                                de_pkg = "edger",
                                gene_set = "H",
                                orgDb = org.Mm.eg.db,
                                organism = "Mus musculus")

names(H_gsea_results_list)

# to get the missing genes
missing_genes <- find_missing_in_gseaResult(H_gsea_results_list, bbc_obj)
enrichplot::dotplot(H_gsea_results_list[[1]], showCategory=10, 
                    title="Top gene sets", split=".sign") + 
  facet_grid(.~.sign)

GSEA -- KEGG

kegg_gsea_results_list <- run_gsea(x = bbc_obj,
                                de_pkg = "edger",
                                gene_set = "kegg",
                                orgDb = org.Mm.eg.db,
                                organism = "mmu",
                                minGSSize = 30,
                                use_internal_data = FALSE)

names(kegg_gsea_results_list)
enrichplot::dotplot(kegg_gsea_results_list[[1]], showCategory=10, 
                    title="Top gene sets", split=".sign") + 
  facet_grid(.~.sign)

# shorten the descriptions to 30 characters or less
enrichplot::dotplot(shorten_desc(kegg_gsea_results_list[[1]], 30), showCategory=10, 
                    title="Top gene sets", split=".sign") + 
  facet_grid(.~.sign)

#gseaplot(kegg_gsea_results_list[[1]], geneSetID = "mmu05200", by="runningScore")

Hypergeometric test -- KEGG

kegg_hyper_results_list <- run_hypergeometric(x = bbc_obj,
                                de_pkg = "edger",
                                gene_set = "kegg",
                                orgDb = org.Mm.eg.db,
                                organism = "mmu")

names(kegg_hyper_results_list)
res_idx <- 1
enrichplot::dotplot(kegg_hyper_results_list[[res_idx]], showCategory=10, 
                    title=paste0("Top gene sets -- ", names(kegg_hyper_results_list)[res_idx]))

#gseaplot(kegg_gsea_results_list[[1]], geneSetID = "mmu05200", by="runningScore")

Session info

sessionInfo()


vari-bbc/bbcRNA documentation built on Nov. 10, 2020, 11:09 p.m.