hciR: RNA-seq workflows at HCI

DESeq analysis of mouse liver samples

August 24, 2021

This guide follows the Bioconductor RNA-Seq workflow to find differentially expressed genes in GSE132056 using DESeq2 version 1.30.1. For more details about the statistics, check the original paper or online tutorials like the one from Harvard.

Load the sample table with treatment and diet in the extdata directory.

library(tidyverse)
extdata <- system.file("extdata", package="hciR")
samples <- read_tsv(paste(extdata, "liver_samples.tsv", sep="/"))
samples
#  # A tibble: 16 x 4
#     id       name      trt      diet 
#     <chr>    <chr>     <chr>    <chr>
#   1 15089X1  194-Liver Control  NCD  
#   2 15089X2  198-Liver Control  NCD  
#   3 15089X3  209-Liver Control  NCD  
#   4 15089X4  220-Liver Control  NCD  
#   5 15089X5  179-Liver Degs1_KO NCD  
#   6 15089X6  185-Liver Degs1_KO NCD  
#   7 15089X7  186-Liver Degs1_KO NCD  
#   8 15089X8  187-Liver Degs1_KO NCD  
#   9 15089X9  61-Liver  Control  HFD  
#  10 15089X10 70-Liver  Control  HFD  
#  11 15089X11 71-Liver  Control  HFD  
#  12 15089X12 76-Liver  Control  HFD  
#  13 15089X13 82-Liver  Degs1_KO HFD  
#  14 15089X14 89-Liver  Degs1_KO HFD  
#  15 15089X15 90-Liver  Degs1_KO HFD  
#  16 15089X16 92-Liver  Degs1_KO HFD

Load the combined featureCounts matrix.

counts <- read_tsv(paste(extdata, "liver_counts.tsv", sep="/"))
counts[, 1:8]
#  # A tibble: 53,801 x 8
#     geneid             `15089X1` `15089X2` `15089X3` `15089X4` `15089X5` `15089X6` `15089X7`
#     <chr>                  <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
#   1 ENSMUSG00000000001      1941      1745      2768      2910      1124      2577      2265
#   2 ENSMUSG00000000003         0         0         0         0         0         0         1
#   3 ENSMUSG00000000028        21        23        27        27        14        28        17
#   4 ENSMUSG00000000031         3         3         1         5         3         3        10
#   5 ENSMUSG00000000037         0         0         0         0         0         0         4
#   6 ENSMUSG00000000049     28341     30970     50511     49518     22421     33501     31659
#   7 ENSMUSG00000000056       748       728      1665      1570       716      1002       694
#   8 ENSMUSG00000000058        53        84        94        75        28       110       121
#   9 ENSMUSG00000000078       249       187       200       355       149       180       221
#  10 ENSMUSG00000000085       102        84        93       121        89        77        90
#  # … with 53,791 more rows

Remove 17597 features with zero counts and 16431 features with 5 or fewer reads in every sample to create a final count matrix with 19773 rows.

library(hciR)
counts <- filter_counts(counts, n=5)
#  Removed 17597 features with 0 reads
#  Removed 16431 features with <=5 maximum reads

Load the mouse annotations from Ensembl 92 in hciRdata and check genes with the highest number of assigned reads.

library(hciRdata)
n1 <- rowMeans(as_matrix(counts))
inner_join( dplyr::select(mouse92, 1:4,8),
 tibble(id= names(n1), mean_count = n1)) %>%
 mutate(description=trimws(substr(description, 1, 40))) %>%
 arrange(desc(mean_count))
#  Joining, by = "id"
#  # A tibble: 19,773 x 6
#     id                 gene_name biotype        chromosome description                               mean_count
#     <chr>              <chr>     <chr>          <chr>      <chr>                                          <dbl>
#   1 ENSMUSG00000029368 Alb       protein_coding 5          "Serum albumin"                             1179072.
#   2 ENSMUSG00000064339 mt-Rnr2   Mt_rRNA        MT         "mitochondrially encoded 16S rRNA"           445705.
#   3 ENSMUSG00000020609 Apob      protein_coding 12         "Apolipoprotein B-100 Apolipoprotein B-4…    363466.
#   4 ENSMUSG00000002985 Apoe      protein_coding 7          "Apolipoprotein E"                           277628.
#   5 ENSMUSG00000058207 Serpina3k protein_coding 12         "serine (or cysteine) peptidase inhibito…    198378.
#   6 ENSMUSG00000064351 mt-Co1    protein_coding MT         "mitochondrially encoded cytochrome c ox…    194757.
#   7 ENSMUSG00000037071 Scd1      protein_coding 19         "stearoyl-Coenzyme A desaturase 1"           145974.
#   8 ENSMUSG00000066154 Mup3      protein_coding 4          ""                                           129677.
#   9 ENSMUSG00000024164 C3        protein_coding 17         "Complement C3 Complement C3 beta chain …    121210.
#  10 ENSMUSG00000025479 Cyp2e1    protein_coding 7          "Cytochrome P450 2E1"                        114009.
#  # … with 19,763 more rows

Drop the two MT rRNAs.

counts <- semi_join(counts, filter(mouse92, biotype!="Mt_rRNA"), by=c(geneid="id"))

Following the DESeq2 vignette on interactions, there are a few ways to model the data.

Combine trt and diet into a single column and select the pairwise comparisons of interest, for example Degs1 KO_NCD vs Control_NCD.
Test interactions using \~ trt * diet in the design formula
Analyze a subset of samples like those from NCD. See the DEseq2 FAQ for more details on when to split the analysis into pairs of groups.

Combine treatment and diet into a new column and order the factor levels. NOTE: Starting in hciR version 1.5, control groups should be listed first and pairwise combinations are selected in reverse order.

samples <- mutate(samples, trt_diet = gsub("Degs1_", "", paste(trt, diet, sep="_")))
samples$trt_diet <- factor(samples$trt_diet,
     levels = c("Control_NCD", "Control_HFD", "KO_NCD", "KO_HFD"))

Run DESeq using the new trt_diet column in the design formula and get the regularized log (rlog) counts for sample visualizations. These values are similar to the log2 normalized counts except the variance in low count genes is reduced.

dds1 <- deseq_from_tibble(counts, samples, design = ~ trt_diet )
#  estimating size factors
#  estimating dispersions
#  gene-wise dispersion estimates
#  mean-dispersion relationship
#  final dispersion estimates
#  fitting model and testing
rld1 <-  r_log(dds1)

Plot the first two principal components using the rlog values from the top 500 variable genes.

# plot_pca(rld1, "trt_diet", tooltip=c("id", "name", "diet") , width=700)
plot_pca(rld1, "trt_diet", ggplot=TRUE, label="id")

Cluster all the rlog values using the R function dist to calculate the Euclidean distance between samples.

plot_dist(rld1, c("trt", "diet"), na_col="white")

Run check_contrasts to list the pairwise comparisons.

data.frame(vs=check_contrasts(dds1$trt_diet))
#  6 contrasts:
#                             vs
#  1           KO_HFD vs. KO_NCD
#  2      KO_HFD vs. Control_HFD
#  3      KO_HFD vs. Control_NCD
#  4      KO_NCD vs. Control_HFD
#  5      KO_NCD vs. Control_NCD
#  6 Control_HFD vs. Control_NCD

Use the subset option to skip the 3rd and 4th contrasts and compare the remaining rows using a 5% false discovery rate (FDR).

res <- results_all(dds1, mouse92, subset=c(2,5,6,1))
#  Using adjusted p-value < 0.05
#  Adding shrunken fold changes to log2FoldChange
#  1. KO_HFD vs. Control_HFD:      267 up and 415 down regulated
#  2. KO_NCD vs. Control_NCD:      35 up and 17 down regulated
#  3. Control_HFD vs. Control_NCD: 494 up and 278 down regulated
#  4. KO_HFD vs. KO_NCD:           1150 up and 1115 down regulated

Plot fold changes and p-values from high fat KO vs. Control in the first contrast in a volcano plot.

plot_volcano(res[[1]], pvalue=3)

Plot the mean normalized count and fold change in an MA-plot.

plot_ma(res[[1]])

Cluster the rlog values from all 682 significant genes and scale by rows, so values represent the number of standard deviations from the mean rlog value.

x <- top_counts(res[[1]], rld1, top=1000)
nrow(x)
#  [1] 682
plot_genes(x, c("trt", "diet"), scale ="row", annotation_names_col=FALSE,
 show_rownames=FALSE)

Optionally, drop the normal chow samples.

x <- filter_top_counts(x, diet == "HFD")
plot_genes(x, "trt", scale ="row", annotation_names_col=FALSE,
 show_rownames=FALSE)

Find genes in the PPAR Signaling Pathway using the MSigDB pathways in hciRdata. Note the mouse annotations include human homologs from MGI.

p1 <- filter(res[[1]], human_homolog %in% msig_pathways$KEGG[["PPAR Signaling Pathway"]])
dplyr::select(p1, 1:7,12)
#  # A tibble: 70 x 8
#     id        gene_name biotype   chromosome description                         human_homolog baseMean    padj
#     <chr>     <chr>     <chr>     <chr>      <chr>                               <chr>            <dbl>   <dbl>
#   1 ENSMUSG0… Pparg     protein_… 6          peroxisome proliferator activated … PPARG           219.    0.565 
#   2 ENSMUSG0… Nr1h3     protein_… 2          Mus musculus nuclear receptor subf… NR1H3          1721.    0.113 
#   3 ENSMUSG0… Ppard     protein_… 17         Peroxisome proliferator-activated … PPARD            52.9   0.207 
#   4 ENSMUSG0… Angptl4   protein_… 17         Angiopoietin-related protein 4      ANGPTL4        1921.    0.959 
#   5 ENSMUSG0… Cd36      protein_… 5          Platelet glycoprotein 4             CD36           1889.    0.505 
#   6 ENSMUSG0… Apoa2     protein_… 1          Apolipoprotein A-II Proapolipoprot… APOA2         88339.    0.0723
#   7 ENSMUSG0… Cpt1c     protein_… 7          Mus musculus carnitine palmitoyltr… CPT1C             5.15 NA     
#   8 ENSMUSG0… Ubc       protein_… 5          ubiquitin C                         UBC             871.    0.498 
#   9 ENSMUSG0… Acaa1b    protein_… 9          acetyl-Coenzyme A acyltransferase … ACAA1         11724.    0.914 
#  10 ENSMUSG0… Lpl       protein_… 8          lipoprotein lipase                  LPL             640.    0.0284
#  # … with 60 more rows

Cluster the PPAR genes in a heatmap. There are 70 expressed genes but only 3 are significant - this will plot 37 genes with an FDR < 50%.

x <- top_counts( filter(p1, padj < 0.5), rld1, filter=FALSE)
nrow(x)
#  [1] 37
x <- filter_top_counts(x, diet == "HFD")
plot_genes(x, "trt", fontsize_row=8, scale = "row")

Run DESeq using \~ trt * diet in the design formula.

dds2 <- deseq_from_tibble(counts, samples, design = ~ trt * diet)
#  estimating size factors
#  estimating dispersions
#  gene-wise dispersion estimates
#  mean-dispersion relationship
#  final dispersion estimates
#  fitting model and testing
rld2 <- r_log(dds2)

Check if the treatment effect differs across diets using a 5% false discovery rate (FDR). There are 105 signfiicant interactions.

DESeq2::resultsNames(dds2)
#  [1] "Intercept"               "trt_Degs1_KO_vs_Control" "diet_NCD_vs_HFD"         "trtDegs1_KO.dietNCD"
int <- DESeq2::results(dds2, name = "trtDegs1_KO.dietNCD", alpha = 0.05)
DESeq2::summary(int)
#  
#  out of 19771 with nonzero total read count
#  adjusted p-value < 0.05
#  LFC > 0 (up)       : 63, 0.32%
#  LFC < 0 (down)     : 42, 0.21%
#  outliers [1]       : 64, 0.32%
#  low counts [2]     : 6470, 33%
#  (mean count < 11)
#  [1] see 'cooksCutoff' argument of ?results
#  [2] see 'independentFiltering' argument of ?results

Add gene names and biotypes to the results.

int <- annotate_results(int, mouse92)

Create an interaction plot using the scaled rlog values from the top 25 genes sorted by adjusted p-value.

x <- top_counts( int, rld1, top=25)
plot_interactions(x, c( "diet", "trt"), ylab="Z-score") + theme_bw()

Save the DESeq and interaction results, raw counts, normalized counts, regularized log counts and mouse annotations to a single Excel file in DESeq.xlsx and R objects to a binary data file to load into a new session.

res_all <- c(res, list(Interactions=int))
write_deseq(res_all, dds1, rld1, mouse92)
save(res, int, dds1, rld1, dds2, rld2, file="dds.rda")

There are a number of options for pathway analysis and most can be divided into one of two groups based on the input dataset. Gene set enrichment methods like Broad’s GSEA require all expressed genes sorted by fold change and calculate a running sum statistic to find pathways that are enriched with either up- or down-regulated genes.

Over representation methods require a smaller subset of significant genes and use a Fisher’s test to identify significant pathways. There are many online tools like Enrichr that accept a list of significant genes as input and return enriched sets. To get a list of genes, just sort the DESeq results in the Excel file by adjusted p-value and copy and paste the gene names into the search box.

The fgsea package (fast gene set enrichment analysis) is similar to Broad’s GSEA and finds pathways that are enriched with either up- or down-regulated human genes. Load the KEGG pathways from MSigDB and run fgsea using a 10% FDR.

set.seed(77)
k1 <- fgsea_all(res, msig_pathways$KEGG)
#  1. KO_HFD vs. Control_HFD:       114 enriched sets (68 positive, 46 negative)
#  2. KO_NCD vs. Control_NCD:       38 enriched sets (3 positive, 35 negative)
#  3. Control_HFD vs. Control_NCD:  28 enriched sets (4 positive, 24 negative)
#  4. KO_HFD vs. KO_NCD:            65 enriched sets (36 positive, 29 negative)

Print the top pathways from KO_HFD vs. KO_NCD and check the GSEA user guide for details about the statistics.

group_by(k1[[1]][, -8], enriched) %>% top_n(4, abs(NES)) %>% ungroup()
#  # A tibble: 8 x 8
#    pathway                                  pval    padj     ES   NES nMoreExtreme  size enriched
#    <chr>                                   <dbl>   <dbl>  <dbl> <dbl>        <dbl> <int> <chr>   
#  1 Ribosome                             0.000353 0.00168 -0.830 -3.68            0    83 negative
#  2 Oxidative Phosphorylation            0.000387 0.00171 -0.779 -3.60            0   107 negative
#  3 Parkinsons Disease                   0.000381 0.00171 -0.748 -3.45            0   104 negative
#  4 Alzheimers Disease                   0.000458 0.00189 -0.618 -3.01            0   149 negative
#  5 Cell Adhesion Molecules Cams         0.000135 0.00126  0.594  2.40            0   106 positive
#  6 ECM Receptor Interaction             0.000140 0.00126  0.620  2.40            0    78 positive
#  7 Leishmania Infection                 0.000145 0.00126  0.642  2.38            0    60 positive
#  8 Toll Like Receptor Signaling Pathway 0.000139 0.00126  0.603  2.36            0    84 positive

Get the fold change vector and create an enrichment plot for Ribosome.

library(fgsea)
fc <- write_gsea_rnk(res, write=FALSE)
head(fc[[1]])
#    COL1A1   PHLDA3    GPNMB     GBP6     CD8A    GDF15 
#  2.024805 1.840050 1.437978 1.422271 1.381950 1.347245
plotEnrichment(msig_pathways$KEGG[["Ribosome"]],  fc[[1]]) +
ggplot2::labs(title="Ribosome")

Compare to ECM Receptor Interaction with mostly up-regulated genes.

plotEnrichment(msig_pathways$KEGG[["ECM Receptor Interaction"]],  fc[[1]]) +
ggplot2::labs(title="ECM Receptor Interaction")

Plot NES scores from significant pathways in two or more contrasts.

plot_fgsea(k1, fontsize_row=7, sets =2)
#  82 total sets

Save the enriched pathways to an Excel file.

openxlsx::write.xlsx(k1, file = "KEGG_pathways.xlsx")

The genes from MSigDB are saved as a list of vectors and include hallmark, pathways, go, motifs, cancer, immunologic and oncogenic sets.

lapply(msig_hallmark[1:3], head, 7)
#  $`Tnfa Signaling Via Nfkb`
#  [1] "ABCA1"   "ACKR3"   "AREG"    "ATF3"    "ATP2B1"  "B4GALT1" "B4GALT5"
#  
#  $Hypoxia
#  [1] "ACKR3"   "ADM"     "ADORA2B" "AK4"     "AKAP12"  "ALDOA"   "ALDOB"  
#  
#  $`Cholesterol Homeostasis`
#  [1] "ABCA2" "ACAT2" "ACSS2" "ACTG1" "ADH4"  "ALCAM" "ALDOC"

Four datasets are a list of lists and include two or more groups, so select a list element like msig_pathways$REACTOME to return the sets.

names(msig_pathways)
#   [1] "BIOCARTA" "KEGG"     "NABA"     "PID"      "REACTOME" "SA"       "SIG"      "ST"       "WNT"     
#  [10] "WP"
names(msig_go)
#  [1] "BP" "MF" "CC"
names(msig_motifs)
#  [1] "TFT" "MIR"
names(msig_cancer)
#  [1] "CAR"    "GCM"    "GNF2"   "MODULE" "MORF"

HuntsmanCancerInstitute/hciR documentation built on June 9, 2025, 12:47 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com