Manuscript code - differential transcript abundance

This decument includes the code used for the manuscript, for the differential transcript abundance.

library(knitr)
#library(kableExtra)
knitr::opts_chunk$set(cache = TRUE, warning = FALSE,
                      message = FALSE, cache.lazy = FALSE)
#options(width = 120)
options(pillar.min_title_chars = Inf)

library(magrittr)
library(tibble)
library(dplyr)
library(magrittr)
library(tidyr)
library(ggplot2)
library(rlang)
library(purrr)
library(ggrepel)
library(tidyHeatmap)
library(pasilla)
library(tidybulk)

my_theme =  
    theme_bw() +
    theme(
        panel.border = element_blank(),
        axis.line = element_line(),
        panel.grid.major = element_line(size = 0.2),
        panel.grid.minor = element_line(size = 0.1),
        text = element_text(size=12),
        legend.position="bottom",
        aspect.ratio=1,
        strip.background = element_blank(),
        axis.title.x  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
        axis.title.y  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10))
    )
pasCts = system.file("extdata",
                                         "pasilla_gene_counts.tsv",
                                         package = "pasilla",
                                         mustWork = TRUE)
pasAnno = system.file(
    "extdata",
    "pasilla_sample_annotation.csv",
    package = "pasilla",
    mustWork = TRUE
)
cts = as.matrix(read.csv(pasCts, sep = "\t", row.names = "gene_id"))
coldata = read.csv(pasAnno, row.names = 1)
coldata = coldata[, c("condition", "type")]

# Create tidybulk object
counts =
    cts %>%
    as_tibble(rownames = "transcript") %>%
    pivot_longer(names_to = "sample",
                             values_to = "count",
                             cols = -transcript) %>%
    left_join(
        coldata %>%
        as_tibble(rownames = "sample") %>%
        mutate(sample = gsub("fb", "", sample))
    ) %>%
    mutate_if(is.character, as.factor)
# Create a tt object with unique raw and normalised counts
tt_scaled <- 
    tidybulk(counts, sample, transcript, count) %>%
    aggregate_duplicates() %>%
    identify_abundant() %>%
    scale_abundance()

# Plot count densities
tt_scaled %>%
    pivot_longer(
        c(count, count_scaled),
        values_to = "count", 
        names_to = "Normalisation"
    ) %>%
    ggplot(aes(count + 1, group=sample, color=type)) +
    facet_grid(~Normalisation) +
    geom_density() +
    scale_x_log10()
# Reduce data dimensionality with arbitrary number of dimensions
tt_mds <- tt_scaled %>% reduce_dimensions(method="MDS", .dims = 3)

# Plot all-vs-all MDS dimensions 
tt_mds %>%
    pivot_sample() %>%
    GGally::ggpairs(columns = 7:9, ggplot2::aes(colour=condition))
# Adjust for visualisation
tt_adj <- tt_mds %>% adjust_abundance(~ condition + type)

# Visualise the association between reduced dimensions and factors
tt_mds_adj_mds <- 
    tt_adj %>%
    filter( count_scaled_adjusted %>% is.na %>% `!`) %>%

    # Calculate reduced dimensions on the adjusted counts as well
    reduce_dimensions(
       .abundance = count_scaled_adjusted, 
       method="MDS", .dim = 3
    ) 
# Data manipulation and visualisation 
tt_mds_adj_mds %>%
    pivot_sample() %>%

    # First level reshaping
    pivot_longer(contains("Dim"), names_to = "Dim", values_to = ".value")   %>%
    separate(Dim, c("Dim", "Adj"), sep="\\.") %>%
    mutate(Adj = ifelse(Adj == "y", "non", "adj") %>% factor(c("scaled", "adj"))) %>%

    # Second level reshaping
    pivot_longer(c(type, condition), names_to = "covar", values_to = "which") %>%

    # Visualise the integrative plot
    ggplot(aes(y = .value, x = covar, fill = `which`)) +
    geom_boxplot() +
    facet_grid(Adj ~ Dim)
tt_test <- tt_adj %>% test_differential_abundance(~ condition + type)

# MA plot
tt_test %>%
        keep_abundant() %>%
      pivot_transcript() %>%

    # Subset data
    mutate(significant = FDR<0.05 & abs(logFC) >=2) %>%
    mutate(transcript = ifelse(significant, as.character(transcript), NA)) %>%

    # Plot
    ggplot(aes(x = logCPM, y = logFC, label=transcript)) +
    geom_point(aes(color = significant, size = significant, alpha=significant)) +
    geom_text_repel() +
    scale_color_manual(values=c("black", "#e11f28")) +
    scale_size_discrete(range = c(0, 2)) 
tt_test %>%

    # Select top genes and reshape data
    inner_join( arrange((.), PValue) %>% distinct(transcript) %>% head(6)) %>%

    # High level reshaping of the data. 
    # All three count columns are shaped as two columns: 
    # (i) the columns name and (ii) the value of those columns
    pivot_longer(
        c(count, count_scaled, count_scaled_adjusted), 
        names_to = "Stage", values_to = "count"
    ) %>%

    # This allows the faceted plot
    ggplot(aes(x = Stage, y = count + 1, fill = condition)) +
        geom_boxplot() +
    facet_wrap(~transcript) +
    scale_y_log10() 
# Heatmap
tt_test %>%

    # Select differentially abundant
    filter(FDR < 0.05 & abs(logFC) > 2) %>%

    # Plot
        as_tibble() %>%
    heatmap( transcript, sample, count_scaled_adjusted) %>% 
    add_tile(condition) %>%
    add_tile(type)


Try the tidybulk package in your browser

Any scripts or data that you put into this service are public.

tidybulk documentation built on April 7, 2021, 6 p.m.