knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.align = "center",
  out.width = '100%',
  fig.wide = TRUE,
  cache= T
)
options(width=100)

# path to extra figures
figpath <- "../inst/cdoc/RAPToR-DEcorrection_figs/"
transp <- function(col, a=.5){
  # Make a color transparent
  colr <- col2rgb(col)
  return(rgb(colr[1,], colr[2,], colr[3,], a*255, maxColorValue = 255))
}

twoTicks <- function(side = c(1,2), col = NA, 
                     col.ticks = 1, las = c(1,2), ...){
  # Generate 2 (edge) ticks on given axis of current plot
  # col = NA & col.ticks = 1 makes the axis line dissapear, but keeps ticks
  for(i in seq_along(side)){
    axt <- axTicks(side[i])
    axis(side[i], at = axt[c(1, length(axt))], col = col, 
         col.ticks = col.ticks, las = las[i], ...)
  }
}
gg_logFC <- function(x, y=NULL, rg = range(xy, na.rm = T)*1.02,
                     l.breaks = log1p(c(0, 1, 5, 10, 50, 100, Inf)),
                     l.labels = c('1', '5', '10', '50', '100', '100+'),
                     nbins = 200,
                     xlab = "log2(FC) in x",
                     ylab = "log2(FC) in y",
                     main = "", get.r =T, add.vd = T,
                     DEgsel = NULL,
                     ...){
  # Make a 2d binned hexplot for showing logFC comparison
  require(ggplot2)
  require(viridisLite)

  if(is.null(y))
    xy <- x
  else
    xy <- cbind(x, y)
  xy <- as.data.frame(xy)

  colnames(xy) <- c("x", "y")

  g <- ggplot(data = xy, mapping = aes(x=x, y=y)) + 
    stat_bin_hex(aes(fill = after_stat(cut(log1p(after_stat(count)), breaks = l.breaks, 
                                 labels = F, right = T, include.lowest = T))), 
             bins=nbins) +
    scale_fill_gradientn(colors = viridisLite::inferno(length(l.breaks)), 
                         name = 'count', labels = l.labels) + 
    theme_classic() + xlim(rg) + ylim(rg) +
    xlab(xlab) + ylab(ylab) + ggtitle(main) + coord_fixed()
    if(get.r){
    if(any(is.na(xy))){
      message("Warning: removed NA values to compute correlation coefs")
      rmsel <- apply(xy, 1, function(r) any(is.na(r)))
      xy <- xy[!rmsel,]
      if(!is.null(DEgsel))
        DEgsel <- DEgsel[!rmsel]
    }

    cc <- cor(xy)[1,2]
    cc2 <- cor(xy, method='spearman')[1,2]
    cctxt <- paste0("r = ", round(cc, 3), '\nrho = ', round(cc2, 3))
    if(add.vd)
      cctxt <- paste0(cctxt, "\n", round(100*(cc^2), 1), "% VarDev")
    if(!is.null(DEgsel)){
      ccDE <- cor(xy[DEgsel,])[1,2]
      cctxt <- paste0(cctxt, "\nr(DE) = ", round(ccDE, 3))
      if(add.vd)
        cctxt <- paste0(cctxt, "\n", round(100*(ccDE^2), 1), "% VarDev(DE)")
    }

    g <- g + annotate(geom="text", hjust=1, vjust=0,
                      x = rg[2],
                      y = rg[1],
                      label = cctxt)
  }

  return(g)
}
run_DESeq2_ref <- function(X, p, formula, ref, 
                           ae_values=NULL, window.extend=1, ns.df=3){
  # Run DEseq2 wt vs. mutant correcting for development with ref. data.

  # Do no not specify age in the formula, it is added directly by the function.
  # Age estimates should either be an 'ae' column of p or given as 'ae_values'.

  require(DESeq2)

  if(!any(colnames(p)=='ae') & is.null(ae_values)){
    stop("Age estimates should either be a column of p, or given to ae_values.")
  }
  if(!is.null(ae_values)){
    p$ae <- ae_values
  }

  ## Extract reference expression profiles in sample time window
  w.rg <- range(p$ae) + c(-window.extend, window.extend)
  w.idx <- seq(
    max(c(1, which.min(abs(w.rg[1]-ref$time))-1 )),
    min(c(length(ref$time), which.min(abs(w.rg[2]-ref$time))+1))
    )
  # ref window time values
  w.ts <- ref$time[w.idx]
  # ref window expression values
  w.GE <- ref_2counts(ref = ref, ae_values = w.ts)

  ## Join ref & sample data
  nX <- ncol(X)
  nR <- ncol(w.GE)

  # get overlapping genes & join expression data
  ovl <- RAPToR::format_to_ref(samp = X, refdata = w.GE, verbose = F)
  Xj <- as.matrix(cbind(ovl$refdata, ovl$samp))

  # get relevant fields from p 
  f0 <- as.formula(formula)
  p2 <- p[, attr(terms(f0), "term.labels"), drop=F]
  # get 1st level of each variable
  lev0 <- lapply(p2, function(col) levels(col)[1]) 

  # join p data and add Time and ref/sample Batch
  pj <- cbind(
    Time = c(w.ts, p$ae),
    Batch = factor(rep(c('r', 's'), c(nR, nX))),
    rbind(do.call(cbind, lapply(lev0, rep, times=nR)), p2) # other terms
    ) 
  rownames(pj) <- colnames(Xj)


  # Estimate dispersions with samples only
  s0 <- pj$Batch=="s"
  dds0 <- DESeqDataSetFromMatrix(countData = Xj[,s0],
                                 colData = pj[s0,],
                                 design = f0)
  dds0 <- estimateSizeFactors(dds0)
  dds0 <- estimateDispersions(dds0, fitType = "local") 
  dd <- dispersions(dds0) # store dispersions
  dd[is.na(dd)] <- 0 # remove NAs

  ## Build full DE model with reference
  # Add Time and ref/sample batch to model formula
  f1 <- update.formula(f0, substitute(
    ~ splines::ns(Time, df = ns.df) + Batch + ., 
    list(ns.df=ns.df)
    ))
  dds <-  DESeqDataSetFromMatrix(countData = Xj,
                                 colData = pj,
                                 design = f1)
  dds <- estimateSizeFactors(dds)
  # inject dispersions from sample-only model
  dispersions(dds) <- dd

  dds <- nbinomWaldTest(dds)

  return(dds)
}

Introduction

The developmental speed of fast-growing organisms such as C. elegans is influenced by many different factors -- including experimental conditions themselves -- making it difficult or impossible to collect age-matched individuals between conditions. For example, if a mutant has delayed development but controls and mutants are collected at the same chronological (and therefore different physiological) time, the perturbation of interest will be completely confounded with development (a). Because of this, purely developmental gene expression changes can be mislabeled as the effect of a variable of interest (b).

knitr::include_graphics(file.path(figpath,"dev_shift.png"))

When sample groups have age differences but still overlap, the developmental effect can simply be accounted for by including age as a covariate in the Differential Expression (DE) analysis. If there is no age overlap however, it becomes impossible to know whether an observed effect is due to the perturbation or age since both variables are completely confounded.

Using RAPToR reference data, we can bridge the gap between non-overlapping sample groups and rescue otherwise impossible DE analyses.


Quickstart

load("../inst/extdata/sc4_qsdata.RData")

Quantifying age-driven expression changes with ref_compare()

# load libraries
library(RAPToR)
library(wormRef)

Given transcript per million (TPM) expression profiles from 3 control and 3 mutant samples.

head(qs$tpm)
print(qs$pdat)

First, infer sample age with RAPToR.

# load reference 
r_cel <- prepare_refdata("Cel_larv_YA", "wormRef", 600)

# estimate sample age
ae_qs <- ae(qs$tpm, r_cel)
plot(ae_qs, group = qs$pdat$strain, g.line=3, lmar=5)

Run ref_compare() to estimate developmental expression changes between groups.

qs_rc <- ref_compare(
    X      = qs$tpm,        # sample data, log(TPM+1)
    ref    = r_cel,         # ref object
    ae_obj = ae_qs,         # ae object
    group  = qs$pdat$strain # factor defining compared groups (wt/mut)
)

Plot sample log2 fold-changes (logFCs) vs. expected developmental logFCs inferred from the reference.

qs_lfc <- get_logFC(qs_rc)

plot(qs_lfc, cex=.2, asp=1,
     xlab = "ctr vs. mut log2FCs", xlim = c(-10,10),
     ylab = "Developmental log2FCs", ylim = c(-10, 10))

Print the correlation between sample and reference logFCs, as well as the average age difference between groups.

print(qs_rc)

In this example, ctr vs. mut logFCs have a correlation of $r round(cor(qs_lfc[,1], qs_lfc[,2]),3)$ with the expected developmental logFCs computed from the reference data. Using $r^2$, this corresponds to approximately $r round(100*(cor(qs_lfc[,1], qs_lfc[,2])^2),2) \%$ of variance explained by development. Such a large developmental effect between the control and mutant groups is expected as they don't have any age overlap.

For an explanation of the analysis, see How it works.

A custom ggplot function to plot logFCs is also provided in Plotting functions, below.

gg_logFC(qs_lfc, main = "Developmental effect",
         xlab = "ctr vs. mut log2FC", ylab = "Developmental log2FC")

Correcting for age effects in differential expression analysis

# load libraries
library(RAPToR)
library(wormRef)

library(DESeq2)
library(splines)

Given raw counts and transcript per million (TPM) expression profiles from 3 control and 3 mutant samples.

head(qs$count)
head(qs$tpm)
print(qs$pdat)

First, infer sample age with RAPToR.


plot(ae_qs, group = qs$pdat$strain, g.line=3, lmar=5)

Define a reference window to integrate.

r.window <- range(ae_qs$age.estimates[,1]) + c(1,-1) # +1h margin
r.idx <- get_refTP(r_cel, ae_values = r.window) # indices in the ref object
r.idx <- r.idx[1]:r.idx[2]

# extract time and expression values
r.time <- r_cel$time[r.idx]
r.tpm <- r_cel$interpGE[,r.idx]

Find the optimal spline degree-of-freedom (df) to model expression dynamics in the selected window.

dfs <- 1:8
ssq <- sapply(dfs, function(i){
  # compute SSQ of linear model on expression data
  sum(residuals(lm( t(r.tpm) ~ splines::ns(r.time, df=i) ))^2)
})

plot(dfs, ssq, type='b', lwd=2) 
r.df <- 3 # select df=3

Transform reference TPMs into artificial counts using an arbitrary library size.

# get gene lengths
g.l <- wormRef::Cel_genes[match(rownames(r.tpm), wormRef::Cel_genes[,"wb_id"]),
                          "transcript_length"]
libsize <- 25e6

# convert tpm to artificial counts
r.count <- t( (t(exp(r.tpm)-1)/1e6) * (libsize/median(g.l))) * g.l
r.count[r.count < 0] <- 0
r.count <- round(r.count)

Combine sample and reference count matrices and pdata.

# filter low expressed genes (at least >5 in one sample)
qs$count <- qs$count[apply(qs$count, 1, max)>5, ]

# keep overlapping sample and ref genes, and merge in one matrix
comb.count <- do.call(cbind, format_to_ref(qs$count, r.count)[1:2])

# combine pdata
comb.p <- data.frame(
  time = c(ae_qs$age.estimates[,1], r.time),
  strain = c(qs$pdat$strain, factor(rep('ctr', ncol(r.count)), 
                                    levels = c('ctr', 'mut'))), # ref is ctr
  batch = factor(rep(c('samp', 'ref'), c(ncol(qs$count), ncol(r.count))),
                 levels = c("samp", "ref"))
)

Infer gene dispersions by fitting a model on sample data only.

# build DESeq object
dd0 <- DESeqDataSetFromMatrix(countData = comb.count[ ,1:6],
                              colData = comb.p[1:6, ],
                              design = ~strain) # no age yet
# compute gene dispersions
dd0 <- estimateSizeFactors(dd0)
dd0 <- estimateDispersions(dd0, fitType = "local") 
d0 <- dispersions(dd0) # store dispersions
d0[is.na(d0)] <- 0 # remove NAs

Build a model with combined reference and sample data, injecting previously estimated gene dispersions.

dd1 <-  DESeqDataSetFromMatrix(
  countData = comb.count,
  colData = comb.p,
  design = ~ splines::ns(time, df=3)+batch+strain # use df found above
)
dd1 <- estimateSizeFactors(dd1)
# inject dispersions from sample-only model
dispersions(dd1) <- d0

dd1 <- nbinomWaldTest(dd1)

Extract DE results.

res <- results(dd1)
summary(res)
DESeq2::plotMA(dd1)

Results are a standard DESeq2 output and can be manipulated as such.

For an explanation of the analysis, see How it works.

Custom functions to transform reference data into artificial counts and run DESeq2 as described above are given in DE functions, below (please note they may require tweaking). The following is equivalent to the code above.

dd1 <- run_DESeq2_ref(
  X = qs$count,         # sample counts
  p = qs$pdat,          # sample pdata
  formula = "~ strain", # formula (without age)
  ref = r_cel,          # ref. object
  ae_values = ae_qs$age.estimates[,1], # sample age estimates
  window.extend = 1,    # ref. window extension
  ns.df = 3             # spline df for ref. window
)

How it works

Quantifying age-driven expression changes

Between two conditions 'A' and 'B' where the sample groups have developmental differences, the expression changes (or log-fold changes, logFCs) observed between the groups will be a combination of perturbation and developmental effects (a).

We can determine the expression changes expected only from the difference in development using reference expression profiles matching the samples (b). Any correlation between observed (sample) and expected (reference) logFCs will then correspond to the developmental effect, with uncorrelated logFCs corresponding to the perturbation effect (c).

knitr::include_graphics(file.path(figpath,"refcompare_method.png"))

The ref_compare() function inputs:


The resulting sample and reference (log2) logFCs can be extracted with get_logFC() and plotted. When there are more than 2 compared conditions, you can specify the factor levels to compare to the control (with l) when extracting the logFCs.

A custom ggplot2 function for plotting logFCs can be found in the Plotting functions section below.


The $r^2$ between sample and reference logFCs is a rough estimate of the percentage of variance explained by development (VarDev, in the bottom right). In this case, over $75\%$ of expression changes can be explained by development, which is expected given the large age difference between the sample groups.

The average age difference between groups and correlation between sample and reference are also given directly in the output of ref_compare().

print(qs_rc)

Correcting for age effects by integrating reference data in DE analysis

When there is no developmental overlap between the sample groups to compare, integrating reference data in the DE analysis makes it possible to recover truly DE genes by bridging the gap, as shown in the cartoon below.

knitr::include_graphics(file.path(figpath,"ref_integration.png"))

Most DE analysis tools input raw counts for their particular statistical properties, so the interpolated reference data must be converted from TPM to (artificial) counts assuming an arbitrary fixed library size ($25\times 10^6$ counts). Then, because gene dispersions needed for statistical testing cannot be estimated from the noiseless artificial reference, they are inferred from a model without reference data and injected into the model with reference data.

Thus, resulting model coefficients (logFCs) between sample groups are corrected for development by the reference, and their respective statistical tests use dispersion values inferred only from samples. This approach improves upon what is presented in the RAPToR article (@bulteau2022real), generating valid p-values.

DISCLAIMER: we provide the functions find_df(), run_DESeq2_ref() and sub-functions needed for age correction DE analysis only in this vignette and not as part of the package because (1) they go beyond the scope of RAPToR, and (2) they might need to be tweaked by the user to adapt to their needs and experimental designs (unlike ref_compare()). This approach should also work with tools other than DEseq2 (e.g. edgeR).

Demonstration of DE correction in a controlled case

Data and strategy

@miki2017two profiled time-series of C. elegans wild-type (WT) and xrn-2 mutants (GSE97775). Code to download this data and generate the dsmiki2017 object can be found at the end of this vignette

By selecting specific samples of matching development from both time-series, we define a gold-standard of truly DE genes. Then, by sliding the window of WT samples we can evaluate the effect of increasing developmental shifts between the groups on the DE analysis, as well as measure the advantages of integrating reference data in the model.

library(RAPToR)
library(wormRef)

library(DESeq2)
library(splines)
library(ROCR)

# for plotting
library(ggplot2)
library(ggpubr)
library(viridis)

col.palette1 <- c('grey20', 'firebrick', 'royalblue', 'forestgreen')
col.palette2 <- viridisLite::viridis(5)
load("../inst/extdata/dsmiki2017.RData") 

Defining age-matched and shifted sample sets

We start by infering sample age.

# load reference 
r_cel <- prepare_refdata("Cel_larv_YA", "wormRef", 600)

# estimate sample age
ae_miki <- ae(dsmiki2017$g, r_cel)
dsmiki2017$p$ae <- ae_miki$age.estimates[, 1]
plot(ae_miki, main = "Miki et al. (2017) samples", 
     group = dsmiki2017$p$strain_long, show.boot_estimates = F,
     color = col.palette1[dsmiki2017$p$strain], lmar = 10, g.line= 1)

We then select a set of 3 WT and 3 mutant samples as the age-matched gold-standard, and define sets of 3 WT samples shifted by 1, 2, 3, 5, and 7 time points compared to the mutants.

GS_wt <- 8:10 # gold standard WT samples
GS_mut <- 19:21 # GS mutant samples

shifts <- -c(1,2,3,5,7) # number of timepoints to shift
subs <- c(list(gold.standard = c(GS_wt, GS_mut)),
          lapply(shifts, function(s) c(GS_wt + s, GS_mut)))
names(subs)[-1] <- paste0('s', abs(shifts))
par(mfrow = c(1,1), pty = "s", xpd=T)
plot(ae~age, data = dsmiki2017$p, 
     xlab = "Chronological age (h past L4 stage, 25°C)",
     ylab = "RAPToR age estimates (h post-hatching, 20°C)",
     col = col.palette1[dsmiki2017$p$strain], lwd = 2,
     xaxt = "n", yaxt = "n", bty = "l") ; twoTicks()
invisible(sapply(levels(dsmiki2017$p$strain), function(l){
  points(ae~age, data = dsmiki2017$p[dsmiki2017$p$strain==l,], type = "l", lty = 2,
         col = col.palette1[which(levels(dsmiki2017$p$strain)==l)])
}))
xyoffsets <- cbind(x=c(.7, .9, 1.1, 1.3, 1.5),
                   y= 0 )#c(.6,.5,.75,.75,.75))
invisible(lapply(seq_along(subs), function(i){
  s <- subs[[i]]
  if(names(subs)[i] == 'gold.standard'){
    ym <- dsmiki2017$p$ae[s[4:6]]
    xm <- (ym - 43)/1.7 
    yw <- dsmiki2017$p$ae[s[1:3]]
    xw <- (yw - 43)/1.7 

    points(xm-.5, ym, type = 'l', lwd = 3, col = col.palette1[2])
    text(mean(xm-.5), mean(ym), pos = 2, labels = "xrn-2\nsubset", adj = 1, 
         col = col.palette1[2], font = 2, offset = 1)
    segments(range(xm-.5), range(ym), range(dsmiki2017$p$age[s[4:6]]), range(ym), col = col.palette1[2], lty = 3)

    points(xw+.5, yw, type = 'l', lwd = 3, col = col.palette1[1])
    text(mean(xw+.75), mean(yw), pos = 4, labels = "WT gold\nstandard", adj = 0, 
         col = col.palette1[1], font = 2, offset = 1)
    segments(range(xw+.5), range(yw), range(dsmiki2017$p$age[s[1:3]]), range(yw), col = col.palette1[1], lty = 3)
  } else {
    y <- dsmiki2017$p$ae[s[1:3]]
    x <- (y - 43)/1.7

    points(x+xyoffsets[i-1,1], y-xyoffsets[i-1,2], type = 'l', lwd = 3, col = col.palette2[i-1])
    text(mean(x+xyoffsets[i-1,1]), mean(y-xyoffsets[i-1,2]), 
         pos = 4, labels = paste0("WT ", shifts[i-1]), adj = 0, 
         col = col.palette2[i-1], font = 2, offset = 1)
    segments(range(x+xyoffsets[i-1,1]), range(y-xyoffsets[i-1,2]), 
             range(dsmiki2017$p$age[s[1:3]]), range(y-xyoffsets[i-1,2]), col = col.palette2[i-1], lty = 3)

  }

}))

legend("bottomright", bty = "n", lty = 2, pch = 1, lwd = 2,
       col = col.palette1[1:2], legend = c("wild-type", "xrn-2(xe31)"), text.col = col.palette1[1:2])


Quantifying age-driven expression changes in the shifted sets

With ref_compare(), we quantify the effect of development in all the sample sets defined above, and plot the reference vs. sample logFCs.

rcs <- lapply(subs, function(s){
    RAPToR::ref_compare(
      X = dsmiki2017$g[, s],  
      ref = r_cel,            
      ae_values = ae_miki$age.estimates[s,1], 
      group = dsmiki2017$p$strain[s]          
      )
})
ggs <- lapply(1:6, function(i){
  logFCs <- RAPToR::get_logFC(rcs[[i]])
  gg_logFC(logFCs, rg = c(-10, 10), 
           xlab = "Sample logFC", ylab = "Reference logFC",
           main = c('Gold-Standard', paste("WT", shifts))[i], 
           get.r = T, add.vd = T)
})
ggarrange(plotlist = ggs, ncol=3, nrow = 2, common.legend = T, legend = "right") 

Unsurprisingly, age-matched reference logFCs (i.e development) account for increasing proportions of expression changes with larger age differences between groups.

par(mfrow = c(1,1), bty='l', mar = c(5,6,3,5))
ae_dif <- sapply(rcs, function(rci) diff(attr(rci, 'group.stats')$ae_avg))
r_coef <- sapply(rcs, function(rci) cor(rci$coefs$samp[,2], rci$coefs$ref[,2], method = 'pearson'))
plot(ae_dif, r_coef, type='b', ylim = c(-0.05,1.0),
     ylab = "Sample and reference logFCs\ncorrelation (r)", 
     xlab = "Average age difference between groups (h)", lwd=2, las=1)
points(ae_dif, r_coef, col = c(1, col.palette2), lwd=5)
segments(x0 = ae_dif[4], x1 = ae_dif[6], y0 = 0.75, lwd=4, col = 'grey60')
text(sum(ae_dif[4], ae_dif[6])/2, 0.75, pos=1, label="No overlap between groups",
     col = 'grey40', font = 2, cex=.8)
arrows(x0 = .5, x1 = 1, y0 = r_coef[1], lwd=3, col = 'grey20', code = 1, length = .05)
text(1, r_coef[1], pos=4, label="WT gold-standard",
     col = 'grey20', cex=.8)
text(ae_dif[-1], r_coef[-1], labels = paste0('WT', shifts), cex = .8, font = 2, col = col.palette2, pos=3)


Effect of developmental shifts on DE analysis performance

Gold-standard DE

Using the age-matched mutant and WT samples defined as gold-standard, we find a set truly DE genes using a standard DESeq2 workflow.

We filter genes to keep those overlapping with the reference, and with at least 5 counts in any sample.

g.filt <- dsmiki2017$g.raw[apply(dsmiki2017$g.raw, 1, function(r) max(r)>5),]
g.filt <- RAPToR::format_to_ref(g.filt, r_cel$interpGE)$samp

We then build a DESeq model including age and strain and test for significant differences between mutant and WT groups. We define a run_DESeq2_age() function to do this (see DE functions below for code).

run_DESeq2_age <- function(X, p){
  # Run DEseq2 wt vs. mutant (with age covariate)
  require(DESeq2)
  rownames(p) <- colnames(X)
  p$aes <- scale(p$ae) # scale age value
  dds <- DESeqDataSetFromMatrix(countData = X,
                                colData = p,
                                design = ~aes+strain)
  dds <- DESeq(dds, test = "Wald", fitType = "local")
  return(dds)
}

get_DEres <- function(dds, coefname="strain_xrn2_vs_wt"){
  # Get results table from deseq output, managing NAs

  res <- results(dds, name=coefname)
  # manage NAs
  res$padj[is.na(res$padj)] <- 1
  res$log2FoldChange[is.na(res$log2FoldChange)] <- 0

  return(res)
}
# run_DESeq2 <- function(X, p){
#   # Run DEseq2 wt vs. mutant (without age)
#   require(DESeq2)
#   rownames(p) <- colnames(X)
#   dds <- DESeqDataSetFromMatrix(countData = X,
#                                 colData = p,
#                                 design = ~strain)
#   dds <- DESeq(dds, test="Wald", fitType = "local")
#   return(dds)
# }
DE.GS <- run_DESeq2_age(g.filt[, subs$gold.standard], 
                        dsmiki2017$p[subs$gold.standard, ])
res.GS <- get_DEres(DE.GS, coefname = "strain_xrn2_vs_wt") # get results table

We consider genes as DE with the thresholds below.

# Define DE (with p < thr AND |logFC| > thr)
thr.p <- 0.01
thr.logFC <- 1.0

# get true DE genes from gold-standard
truth <- (res.GS$padj < thr.p) & abs(res.GS$log2FoldChange) >= thr.logFC
table(truth)

With thresholds of $p.value < r thr.p$ and $|logFC| > r thr.logFC$, we find r table(truth)[2] DE genes in the absence of developmental effects ("true" DE genes).

DE with developmental shifts

We now apply the same model as above on the 5 sample subsets with shifted WT.

DE.shifts <- lapply(subs[-1], function(s){ 
  run_DESeq2_age(g.filt[, s], dsmiki2017$p[s, ])
  })
res.shifts <- lapply(DE.shifts, get_DEres)

Precision and recall (P/R) metrics allow us to measure the performance the DE analysis (p-value and logFC) in discriminating truly DE genes (i.e. genes found in the gold-standard DE above) from non-DE genes. An area under the P/R curve of 1 is a perfect classifier.

# compute precision-recall curves
rocs_initial_age <- lapply(seq_along(subs[-1]), function(i){
  v <- res.shifts[[i]]$padj
  v[abs(res.shifts[[i]]$log2FoldChange)<=thr.logFC] <- 1
  ROCR::prediction(predictions = -v, labels = truth)
})
layout(matrix(0:3, byrow = 2, nrow = 1), widths = c(.1,.7,.3,.1))
par(pty="s")
# ROCR measures to plot
xmes <- "rec"
ymes <- "prec"

# plot initial
invisible(sapply(seq_along(rocs_initial_age), function(i){
  plot(performance(rocs_initial_age[[i]], measure = ymes, x.measure = xmes),
       lwd = 2, col = transp(col.palette2[i], a=1),
       lty = 1, add=ifelse(i==1, F, T), ylim=c(0,1), xlim = c(0,1),
       main = "DE performance (no correction)")
}))
mtext(text = paste0('WT ', shifts), 
      line = 1, side = 4, padj = 0, font = 2, cex = .9,
      las = 2, col = col.palette2, at = c(.9, .8, .7, .6, .5))
par(pty='m')
init_auc <- do.call(rbind, lapply(rocs_initial_age, function(rr) performance(rr, "aucpr")@y.values[[1]]))
barplot(t(init_auc), beside = T, col = transp( col.palette2, a=c(.9)), las = 2, 
        border = c(NA), names = paste0('WT',shifts), ylab = "", ylim = c(0,1), cex.names = .8, 
        yaxs = 'i', yaxt='n', main = "Area under curve");twoTicks(2);
axis(2, lwd.ticks = 0, labels = F, line=0) 
title(ylab = "P/R curve AUC", line = 0.5)

We see a sharp decline in the DE model performance in detecting true DE genes as the developmental shift increases, particularly once there is no more developmental overlap between the compared groups (starting at WT-3).

If we select DE genes with the same thresholds as the gold-standard, this drop in performance translates to both a decrease of true positives and an increase of false positives, as shown below.

# get number of true & false DE genes :
nDEgen <- do.call(rbind, lapply(res.shifts, function(r){
  isDE <- (r$padj < thr.p) & (abs(r$log2FoldChange) >= thr.logFC)
  cbind(DE.in.truth=sum(isDE & truth), DE.notin.truth=sum(isDE & !truth))
}))
nDEgen <- rbind(g=c(sum(truth), 0), nDEgen)
rownames(nDEgen) <- c('GS', paste0("WT", shifts))
bpnde <- function(nDE, main="", cpal = c('red', col.palette2), names= rownames(nDE), show_in_gs=F, las=1, ...){
  # to plot results 
  n <- nrow(nDE)
  barplot(t(nDE), ylab = '', las = las, names = names, col = 0, 
          border = NA, cex.names = .8, yaxs = 'i', yaxt='n', main=main, ...);twoTicks(2);
  axis(2, lwd.ticks = 0, labels = F, line=0) ; title(ylab = 'Nb. DE genes', line = 0.5)
  sapply(seq_len(n), function(i){
    x <- nDE
    x[-i,] <- NA
    rownames(x)[-i] <- NA
    par(lwd=2)
    barplot(t(x), col=transp(cpal[i], a=c(1,0.5)), border = "white", 
            add=T, axes=F, names=rep("",n))
    par(lwd=1)
  })
  if(show_in_gs){
    par(xpd=T)
    text(x = par("usr")[2], y = c(nDE[n,1]/2, nDE[n,1]+nDE[n,2]/2), 
         labels = c("True positives", "False positives"), col = transp(c(1,1), a=c(1, 0.5)),
         font = 2, srt = -90, cex=.8)
    par(xpd=F)
  }
}
par(mar=c(4,4,1,1))
bpnde(nDEgen, las=2, show_in_gs = T)



DE analysis integrating reference data to correct age effect

Selecting reference windows

find_df <- function(ref, w, dfs=2:8){
  # Compute ssq of spline fits to find optimal df

  w.idx <- seq(max(c(1L, which.min(abs(w[1]-ref$time))-1 )),
               min(c(length(ref$time), which.min(abs(w[2]-ref$time))+1)))
  # get time values of window
  ts <- ref$time[w.idx]
  # get PCs of reference window
  pc <- summary(stats::prcomp(t(ref$interpGE[,w.idx]), scale=F, center=T))
  # keep enough components for 99% var
  spc <- sum(pc$importance[3, ] < 0.99)+1 
  pcfit <- pc$x[, 1L:spc]
  w <- pc$importance[2, 1L:spc]
  # compute ssq of fit residuals with different dfs
  # for each df, compute weighted ssq of spline fit on 
  ssqs <- cbind(sapply(dfs, function(dfi){
    ssq <- sum(w * colSums( 
      stats::residuals(stats::lm(pcfit~splines::ns(ts, df=dfi)))^2
    ))
  }))/(length(ts))
  return(ssqs)
}


log1ptpm_2rawcounts <- function(X, glengths, nreadbygl){
  # Transform log1p(tpm) to (artificial) raw counts
  # note : nreadbygl = colSums(rawcounts/genelengths)
  if(length(nreadbygl) != ncol(X))
    stop("nreadbygl != ncol(X)")
  if(length(glengths) != nrow(X))
    stop("glengths must be of length nrow(X)")
  X <- t( (t(exp(X) - 1)/1e6) * nreadbygl ) * glengths
  X[X<0] <- 0
  return(round(X))

}


ref_2counts <- function(ref, ae_values, 
                        gltable = wormRef::Cel_genes[,c("wb_id", 
                                                        "transcript_length")], 
                        avg_librarysize = 25e6){
  # Get expression profiles of given age from a RAPToR reference as 
  # (artificial) count data.
  # note : gltable must have WBids as col 1 and gene length as col 2.

  # ref expression profiles at given timepoints :
  rX <- RAPToR::get_refTP(ref, ae_values=ae_values, return.idx = F)

  # transform to counts
  gl <- gltable[match(rownames(rX), gltable[,1]), 2]
  rX <- log1ptpm_2rawcounts(rX, gl, nreadbygl = rep(avg_librarysize, 
                                                    ncol(rX))/median(gl))

  return(rX)
}

We select a reference window spanning the sample age range with an added margin of 1 hour on either side for each subset, and find the optimal spline degree-of-freedom (df) to model expression dynamics covered by the reference window.

We define a find_df() function which computes the residual sum of squares for a range of spline dfs and choose the optimal df values where a plateau is reached (see DE functions below for code).

dfs <- 2:8
dfs_ssq <- lapply(subs, function(s){
  find_df(r_cel, # ref object
          w = range(ae_miki$age.estimates[s,1]) + c(-1, 1), # time window
          dfs = dfs) # df range to test
})
# optimal df for each subset based on plot below
df_opti <- c(3,4,4,4,5,5) 
par(pty='s', mfrow = c(1,1), bty='l')
plot(range(dfs), range(dfs_ssq)+c(0,12), 
     type = 'n', xaxt = 'n', yaxt = 'n', 
     xlab = '', ylab = "Residual SSQ", bty = 'l'); twoTicks()
title(xlab = 'spline df', line = 2)
invisible(sapply(seq_along(dfs_ssq), function(i) {
  padd. <- 2*i # added padding to distinguish between curves
  points(dfs, padd.+dfs_ssq[[i]], type = 'b', col = c('red',col.palette2)[i], lty=3, lwd=2)
  points(df_opti[i], padd.+dfs_ssq[[i]][which(dfs==df_opti[i])], cex = 1.75, pch=1)
}))
legend('topright', legend = c( "GS", paste("WT", shifts), 'Chosen df'), text.font = c(rep(2, 6),1), 
       text.col = c('red', col.palette2, 1), bty = "n", lwd=c(rep(3, 6), 1), lty=c(rep(3, 6), NA), pch=1, 
       col = c('red', col.palette2, 1), pt.cex = c(rep(1, 6), 1.75))

The selected df increases with the developmental shift, which is expected since the reference window to include gets larger and may thus contain more complex dynamics.

DE models with reference data

Having defined the appropriate fit parameters for the reference, we can integrate in the DE model. We define run_DESeq2_ref() to do this (see DE functions below for code).

# run DESeq2 with reference data inclusion on shifted subsets
DEref.shifts <- lapply(seq_along(subs), function(i){
  s <-subs[[i]]  # select sample subset
  run_DESeq2_ref(X = g.filt[, s],                # sample counts
                 p = dsmiki2017$p[s, ],          # sample pdata
                 ae_values = dsmiki2017$p$ae[s], # age estimates
                 formula = ~strain,   # formula (age is added by the function)
                 ref = r_cel,         # ref object
                 ns.df = df_opti[i],  # df for spline fit
                 window.extend = 1)   # reference window extension
})
res_ref.shifts <- lapply(DEref.shifts, get_DEres) # get results table

As done above, let's assess the performance of these models in detecting truly DE genes.

# compute age-corrected precision-recall curves
rocs_corrected_age <- lapply(seq_along(subs), function(i){
  v <- res_ref.shifts[[i]]$padj
  v[abs(res_ref.shifts[[i]]$log2FoldChange)<=thr.logFC] <- 1
  ROCR::prediction(predictions = -v, labels = truth)
})
layout(matrix(0:3, byrow = 2, nrow = 1), widths = c(.1,.7,.4,.1))
par(pty="s")
# ROCR measures to plot
xmes <- "rec"
ymes <- "prec"

# plot initial curves in background for comparison
invisible(sapply(seq_along(rocs_initial_age), function(i){
  plot(performance(rocs_initial_age[[i]], measure = ymes, x.measure = xmes),
       lwd = 2, col = transp(col.palette2[i], a=.3),
       lty = 1, add=ifelse(i==1, F, T), ylim=c(0,1), xlim = c(0,1),
       main = "DE performance (age-corrected)")
}))
# add corrected model curves
invisible(sapply(seq_along(rocs_corrected_age)[-1], function(i){
  plot(performance(rocs_corrected_age[[i]], measure = ymes, x.measure = xmes),
       lwd = 2, col = col.palette2[i-1],
       lty = 1, add=T)
}))
# add GS curve
plot(performance(rocs_corrected_age[[1]], measure = ymes, x.measure = xmes),
     lwd = 2, col = 'red',
     lty = 2, add=T)

mtext(text = paste0('WT ', shifts), 
      line = 1, side = 4, padj = 0, font = 2, cex = .9,
      las = 2, col = col.palette2, at = c(.9, .8, .7, .6, .5))
par(pty='m')
aucs <- cbind(
  init_auc = c(1, init_auc),
  corr_auc = do.call(rbind, lapply(rocs_corrected_age, function(rr) performance(rr, "aucpr")@y.values[[1]]))
  )

barplot(t(aucs), beside = T, col = transp(rep(c('red', col.palette2), e=2), a=c(.5,1)), las = 2, 
        border = c(NA), names = c("GS", paste0('WT',shifts)), ylab = "", ylim = c(0,1), cex.names = .8, 
        yaxs = 'i', yaxt='n', main = "Area under curve");twoTicks(2);
axis(2, lwd.ticks = 0, labels = F, line=0) 
title(ylab = "P/R curve AUC", line = 0.5)
legend('topright', legend = c("no correction", "age-corrected"),
       pt.bg = c(transp(c(1,1), a=c(0.3, 1))), pch = 22, col = 0, bty = "n", 
       pt.cex = 3, y.intersp = 1.1, inset = c(0, 0))

Compared to the non-corrected models we strongly rescue the performance of the analysis when there is no overlap between the compared sample groups (starting at WT-3).

In less shifted subsets or in the gold-standard, we see that the performance of the age-corrected DE analysis to find truly DE genes is comparable to applying no correction. This is expected since developmental dynamics can be properly modeled without reference data as long as there is overlap between the compared groups, nullifying the advantage of adding it to the model.

Functions and code

Code to generate dsmiki2017

Required libraries and variables:

data_folder <- "../inst/extdata/"

library("GEOquery") # bioconductor package

requireNamespace("wormRef", quietly = T)
requireNamespace("utils", quietly = T)

Note : set the data_folder variable to an existing path on your system where you want to store the objects.




Code to generate quickstart data

Given the dsmiki2017 data generated above:

sel <- c(5:7, 19:21) # select 3 WT and 3 mut

qs <- list(
  tpm = dsmiki2017$g[, sel],
  count = dsmiki2017$g.raw[, sel],
  pdat = data.frame( 
    sname = paste0(rep(c('ctr', 'mut'), e=3), rep(1:3,2)),
    strain = factor(rep(c('ctr', 'mut'), e=3)),
    stringsAsFactors = F)
)
colnames(qs$tpm) <- qs$pdat$sname
colnames(qs$count) <- qs$pdat$sname

save(qs, file = file.path(data_folder, 'sc4_qsdata.RData'), compress = "xz")


Plotting functions

Plotting a logFC comparison as binned heatmap (gg_logFC())


Make a color transparent (transp()) and add an axis with 2 tick marks twoTicks()


DE functions

Running a standard DESeq2 workflow (run_DESeq2_age()) and extracting a results table (get_DEres()).



Finding optimal df to fit a reference window (find_df()) and converting reference data to counts (log1ptpm_2rawcounts(), ref_2counts()).



Integrating reference data in a DESeq model (run_DESeq2_ref()).



References {.unnumbered}


SessionInfo {.unnumbered}

options(width = 80)
sessionInfo()


LBMC/wormAge documentation built on April 6, 2023, 3:52 a.m.