Suggested answers to the workshop questions are below. You might have some different code e.g. to customise the volcano plot as you like. Feel free to comment on any of these solutions in the workshop website as described here.

# load libraries
library(airway)

# tidyverse core packages
library(tibble)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)
library(purrr)
library(ggplot2)

# tidyverse-friendly packages
library(plotly)
library(ggrepel)
library(tidyHeatmap)
library(tidybulk)

Part 1 Bulk RNA-seq Core

Airway dataset

# Set up data
data("airway")

counts_tt <- 
  airway %>%
    tidybulk()

What fraction of variance is explained by PC3?

counts_tt %>% 
  scale_abundance() %>%
  reduce_dimensions(method="PCA", .dims=3)

How many differentially expressed transcripts are there for FDR < 0.05 if we did not include cell line in the formula?

counts_tt %>% 
  keep_abundant(factor_of_interest=dex) %>%
    test_differential_abundance(
      .formula = ~ 0 + dex,
      .contrasts = c("dextrt - dexuntrt"),
      omit_contrast_in_colnames = TRUE
    ) %>%
  filter(FDR < 0.05) %>%
  summarise(num_de = n_distinct(feature))

Pasilla dataset

# load data
data("pasilla", package = "rpharma2020tidytranscriptomics")

# create tidybulk tibble
counts_tt <-
    pasilla %>%
    tidybulk() %>%
    mutate(symbol = AnnotationDbi::mapIds(org.Dm.eg.db::org.Dm.eg.db, keys=as.character(feature), keytype = "FLYBASE", column="SYMBOL", multiVals = "first"))

# filter counts
counts_filtered <- counts_tt %>% keep_abundant(factor_of_interest = condition)

# scale counts
counts_scaled <- counts_filtered %>% scale_abundance()

What is the Fraction of Variance for PC1?

counts_scal_PCA <-
  counts_scaled %>%
  reduce_dimensions(method="PCA")

How many differentially expressed genes are there for treated vs untreated (FDR < 0.05)?

counts_de <-
  counts_tt %>%
  test_differential_abundance(.formula = ~ 0 + condition + type, 
                              .contrasts = c("conditiontreated - conditionuntreated"), 
                              omit_contrast_in_colnames = TRUE)
counts_de %>% 
  filter(FDR < 0.05) %>% 
  summarise(num_de = n_distinct(feature))

What is the FBgn id of the 10th most differentially expressed gene (by smallest P value)?

topgenes <- counts_de %>%
    pivot_transcript() %>%
  arrange(PValue) %>%
  head(10)

topgenes

Extra

Question 1.4

What code can generate a heatmap of variable genes (starting from count_scaled)?

counts_scaled %>% 

    # extract 500 most variable genes
    keep_variable( .abundance = counts_scaled, top = 500) %>%

    # create heatmap
    heatmap(
          .column = sample,
          .row = feature,
          .value = counts_scaled,
          annotation = c(condition, type),
          transform = log1p 
      )

Question 1.5

What code can you use to visualise expression of the pasilla gene (gene id: FBgn0261552)

counts_scaled %>%

    # extract counts for pasilla gene
    filter(feature == "FBgn0261552") %>%

    # make stripchart
    ggplot(aes(x = condition, y = counts_scaled + 1, fill =condition, label = sample)) +
    geom_boxplot() +
    geom_jitter() +
    scale_y_log10()+
    theme_bw()

Question 1.6

What code can generate an interactive volcano plot that has gene ids showing on hover?

p <- counts_de %>%
    pivot_transcript() %>%

  # Subset data
    mutate(significant = FDR<0.05 & abs(logFC) >=2) %>%

  # Plot
    ggplot(aes(x = logFC, y = PValue, label=feature)) +
    geom_point(aes(color = significant, size = significant, alpha=significant)) +
    geom_text_repel() +

    # Custom scales
    scale_y_continuous(trans = "log10_reverse") +
    scale_color_manual(values=c("black", "#e11f28")) +
    scale_size_discrete(range = c(0, 2)) +
    theme_bw()

ggplotly(p, tooltip = c("text"))

Tip: You can use "text" instead of "label" if you don't want the column name to show up in the hover e.g. above will give "FBgn0261552" rather than "feature:FBgn0261552".

Question 1.7

What code can generate a heatmap of the top 100 DE genes?

top100 <- 
    counts_de %>%
    pivot_transcript() %>%
    arrange(PValue) %>%
    head(100)

counts_scaled %>% 
  filter(feature %in% top100$feature) %>%
    heatmap(
          .column = sample,
          .row = feature,
          .value = counts_scaled,
          annotation = c(condition, type),
          transform = log1p 
      )

Part 2 Bulk RNA-seq Extended

Comparison of methods

Which method detects the most differentially abundant transcripts, p value adjusted for multiple testing < 0.05 (FDR, adj.P.Val, padj)?

# Set up data
pasilla_de <- 
  rpharma2020tidytranscriptomics::pasilla %>% 

  # Convert SE object to tibble
  tidybulk %>%

   # Scale abundance for plotting
  identify_abundant(factor_of_interest=condition) 

de_all <- 

  pasilla_de %>%

  # edgeR QLT
  test_differential_abundance(
    ~ condition + type, 
    method = "edger_quasi_likelihood",
    prefix = "edgerQLT_"
  )  %>%

  # edgeR LRT
  test_differential_abundance(
    ~ condition + type, 
    method = "edger_likelihood_ratio",
    prefix = "edgerLR_"
  )  %>%

  # limma-voom
  test_differential_abundance(
    ~ condition + type, 
    method = "limma_voom",
    prefix = "voom_"
  ) %>%

  # DESeq2
  test_differential_abundance(
    ~ condition + type, 
    method = "deseq2",
    prefix = "deseq2_"
  ) 
de_all %>%

    # Subset transcript information
    pivot_transcript() %>%

    # Reshape for nesting
    pivot_longer(
        cols = -c(feature, .abundant),
        names_sep = "_", 
        names_to = c("method", "statistic"), 
        values_to = "value"
    ) %>%

    # Filter statistic
    filter(statistic %in% c("FDR", "adj.P.Val", "padj")) %>%
    filter(value < 0.05) %>%

    # Nesting
    dplyr::count(method)

Cell type composition

What is the most abundant cell type overall in BRCA samples?

BRCA_cell_type_long %>% 
    group_by(cell_type) %>% 
    summarise(m = median(proportion)) %>% 
    dplyr::arrange(dplyr::desc(m))


stemangiola/rpharma2020_tidytranscriptomics documentation built on Oct. 9, 2020, 9:41 p.m.