In LBMC/wormAge: Real Age Prediction from Transcriptome staging On Reference

knitr::opts_knit$set(out.format = "html", header = "")
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.align = "center",
  out.width = '100%',
  cache=T
)
options(width=100)

transp <- function(col, a=.5){
  colr <- col2rgb(col)
  return(rgb(colr[1,], colr[2,], colr[3,], a*255, maxColorValue = 255))
}

Including estimated age as a covariate in Differential Expression (DE) analysis can substantially reduce previously unexplained variation between samples. In this example, we show that even when chronological age of different time points is known, using estimated age results in better model fits and consequently better power to detect DE genes.

Data and strategy

@lehrbach2012post profiled C. elegans wild-type (wt) and pash-1(mj100) mutants (mut) at 4 time points (0, 6, 12, and 24 hours past the L4 stage), each in triplicate (Accession: E-MTAB-1333, dslehrbach2012).

Given the time-series experimental design, searching for Differentially Expresses (DE) genes between strains requires including development in the model. While true in any time series context, it is particularly important here as late-larval development of C. elegans is known for drastic changes in gene expression within short time frames (@snoek2014rapid).

Using identical DE models with either chronological age or RAPToR age estimates as covariates, we can determine which of the two is the best predictor and thus gives more power to detect differential expression.

Code to generate the dslehrbach2012 object can be found at the end of this section.

library(RAPToR)
library(wormRef)

library(stats)
library(parallel)
library(limma)

load("../inst/extdata/dslerhbach2012.RData")

Estimating sample age

We start by applying a quantile-normalization and $log(X+1)$ transformation to the data.

dslehrbach2012$g <- limma::normalizeBetweenArrays(dslehrbach2012$g, 
                                                  method = "quantile")
dslehrbach2012$g <- log1p(dslehrbach2012$g)

Next, we select a reference and stage the samples. Since sample (chronological) age ranges from the fourth larval molt (L4) to 24h past L4, we select Cel_YA_2 from wormRef that covers this range.

# load reference
r_ya <- prepare_refdata("Cel_YA_2", "wormRef", 400)

# estimate sample age
ae_lerhbach <- ae(dslehrbach2012$g, r_ya)

plot(ae_lerhbach, groups = as.factor(dslehrbach2012$p$tpastL4),
     main = "Lerhbach et al. (2012) samples, grouped by time point",
     subset = rev(1:24), lmar = 7, col = c(2,1)[dslehrbach2012$p$strain])

We can see quite a lot of variation in the sample timings, especially at the 0- and 6-hour time points.

Another curious effect can be noted, easier to see when grouping the samples by batch. In the first two replicates, mutants are systematically (and statistically significantly) older than WT, while the opposite effect is true for the third replicate.

# compute age difference between wt & mut at each time point
isWT <- dslehrbach2012$p$strain=="wt"
ae_dif <- ae_lerhbach$age.estimates[isWT,1] - ae_lerhbach$age.estimates[!isWT,1]

# test for effect significance with a linear model
batch <- dslehrbach2012$p$rep[isWT]
summary(lm(ae_dif~batch))

layout(matrix(1:2, nrow=1), widths = c(.7, .3))
par(mar = c(4,5,3,1))
plot(ae_lerhbach, groups = dslehrbach2012$p$rep, 
     main = "Lerhbach et al. (2012) samples, grouped by batch", 
     subset = rev(1:24), lmar=7, col = c(2,1)[dslehrbach2012$p$strain])

plot(ae_dif~batch, 
     ylab="Estimated age difference\nbetween wt and mut samples (h)", 
     xlab = "Batch", boxwex=.4);abline(h=0, lty=2, col='grey')

Impact of development on gene expression

Gene expression depends so strongly on development that correlation between samples is only predicted by their age difference, rather than by their genetic background.

The graph below shows correlation between all possible pairs of samples, with no clear effect of strain difference.

cc <- cor(dslehrbach2012$g, dslehrbach2012$g, method = "spearman")

ut <- upper.tri(cc, diag = F)
ya <- cc[ut]
xa <- abs(ae_lerhbach$age.estimates[col(cc)[ut],1] - ae_lerhbach$age.estimates[row(cc)[ut],1])
cols <- (dslehrbach2012$p$strain[col(cc)[ut]] != dslehrbach2012$p$strain[row(cc)[ut]]) + 1

plot(xa, ya, ylab = "Spearman correlation",
     xlab = "Absolute estimated age difference (h)",
     cex=.5, xlim = range(xa),
     lwd=2, col = cols)
legend("topright", legend = c("same strain", "different strain"), 
       title = "Sample pair of", bty = 'n', inset = .05,
       col = 1:2, pch = 1, lwd = 3, lty = NA, text.font = 2, text.col = 1:2)

Differential Expression analysis

We will use limma to perform a DE analysis with the following model:

$$ X \sim strain \times \mathtt{ns(age, df = 2)} $$ where $\mathtt{age}$ will either be chronological or estimated age, and the spline $\mathtt{ns()}$ will be used to handle non-linear expression dynamics along time.

To find differential expression of genes along development or between strains translates to comparing the following nested models: (2) vs. (1) for development, (3) vs. (1) for strain.

$$ \begin{aligned} Y & = \beta_0 + \beta_1 I_{strain} + (\alpha_1 age_{sp1} + \alpha_2 age_{sp2}) + (\gamma_1 I_{strain} age_{sp1} + \gamma_2 I_{strain} age_{sp2})\ Y & = \beta_0 + \beta_1 I_{strain}\ Y & = \beta_0 + (\alpha_1 age_{sp1} + \alpha_2 age_{sp2}) \end{aligned} $$ Genes will be considered DE with Benjamini-Holm adjusted p-values $< 0.05$ for the corresponding coefficients.

Note: for the purpose of this vignette, we don't consider whether genes are up- or down-regulated, but only if a significant effect is detected.

pred_lmFit <- function(Fit){
# Predictions from a limma model
  tcrossprod(Fit$coefficients, Fit$design)
}

GoF <- function(Fit, X){
# Compute Goodness of Fit
  pred <- pred_lmFit(Fit)
  res <- (X - pred)
  ss <- apply(X, 1, function(ro) sum((ro - mean(ro))^2))
  Rsq <- sapply(seq_len(nrow(X)), function(i){
    1 - sum(res[i,]^2)/ss[i]
  })
  return(Rsq)
}

DGE <- function(X, strain, age, df = 2, name = NULL, return.model = FALSE){
  require(splines)
  if(! length(strain) == ncol(X) | ! length(age) == ncol(X))
    stop("strain and age must be of length ncol(X).")

  # make pdat df
  pdat <- data.frame(strain = factor(strain), 
                     age = as.numeric(age), 
                     row.names = colnames(X))

  # build design matrix
  d <- model.matrix(~ 1 + ns(age, df = df) * strain, data = pdat) 

  # fix colnames
  colnames(d) <- c("b0", paste0(rep("a", df), 1L:df), 
                   "strainmut", paste0(rep("strainmut.a", df), 1L:df))

  # build contrast matrices for mut and age tests
  if(df == 2){
    cm.mut <- makeContrasts(mut = strainmut,
                            mut.i1 = strainmut.a1,
                            mut.i2 = strainmut.a2,
                            levels = d)
    cm.age <- makeContrasts(a1, a2,
                            a.i1 = strainmut.a1,
                            a.i2 = strainmut.a2,
                            levels = d)
  }

  if(df == 3){
    cm.mut <- makeContrasts(mut = strainmut,
                            mut.i1 = strainmut.a1,
                            mut.i2 = strainmut.a2,
                            mut.i3 = strainmut.a3,
                            levels = d)
    cm.age <- makeContrasts(a1, a2, a3,
                            a.i1 = strainmut.a1,
                            a.i2 = strainmut.a2,
                            a.i3 = strainmut.a3,
                            levels = d)
  }

  # fit model
  m.0 <- lmFit(object = X, design = d)
  # get GoF
  gof <- GoF(Fit = m.0, X = X)

  # find DE genes for mut
  m.m <- contrasts.fit(m.0, contrasts = cm.mut)
  m.m <- eBayes(m.m)

  # find DE genes for age
  m.a <- contrasts.fit(m.0, contrasts = cm.age)
  m.a <- eBayes(m.a)

  Tmut <- topTable(m.m, adjust.method = "BH", 
                   number = Inf, 
                   sort.by = "none")[, c("F", "P.Value", "adj.P.Val")]
  Tage <- topTable(m.a, adjust.method = "BH", 
                   number = Inf, 
                   sort.by = "none")[, c("F", "P.Value", "adj.P.Val")]

  res <- list(gof  = gof,
              tmut = Tmut,
              tage = Tage,
              name = name)
  if(return.model)
    res$model = m.0

  rm(m.0, m.m, m.a, Tmut, Tage, d, X, pdat, gof)
  gc(verbose = F)
  return(res)
}

We now run the same DE analysis using either chronological age or RAPToR age estimates as predictor, and compare results to determine if there is an improvement. Code for DGE() can be found at the end of this section.

# format gdata as log2 for limma input
X <- log2(exp(dslehrbach2012$g))

# with chron. age
dge.ca <- DGE(X = X, strain = dslehrbach2012$p$strain, 
              age = dslehrbach2012$p$tpastL4, 
              name = "dge.ca", return.model = T)

# with RAPToR ae
dge.ae <- DGE(X = X, strain = dslehrbach2012$p$strain, 
              age = ae_lerhbach$age.estimates[,1], 
              name = "dge.ae", return.model = T)

plot_DGE_pval_vs <- function(pv.x, pv.y, 
                             xlab = "-log10(pv.x)", ylab = "-log10(pv.y)", 
                             main = "DE genes with pv.x vs pv.y", ...)
{ 
  cols <- rowSums(cbind((pv.x < 0.05), (pv.y < 0.05),
                     (pv.x > 0.05)+1 & (pv.y < 0.05)))+1

  plot(-log10(pv.x), -log10(pv.y), cex=.3, pch=16, 
       xlab = xlab, ylab = ylab, main = main,
       col = cols, ...)

  abline(v=-log10(0.05), h=-log10(0.05), col=2)

  legend('bottomright', legend = paste(table(cols)), text.col = 1:4,
         text.font = 2, horiz = T, bty='n', x.intersp = .25)
  invisible(cols)
}

par(mfrow = c(1,2))
cols <- plot_DGE_pval_vs(dge.ca$tmut$adj.P.Val, dge.ae$tmut$adj.P.Val, main = "Strain effect detection (chron. vs ae)", 
                         xlab = "-log10(pval) with chron. age", ylab = "-log10(pval) with ae")

cols <- plot_DGE_pval_vs(dge.ca$tage$adj.P.Val, dge.ae$tage$adj.P.Val, main = "Development effect detection (chron. vs ae)",
                         xlab = "-log10(pval) with chron. age", ylab = "-log10(pval) with ae")

Estimated age shows a clear increase in performance over chronological age when comparing adjusted p-values of each gene for detection of an effect. Red bars in the plots above correspond to the $0.05$ threshold for significance and color-coded gene counts for each quadrant are indicated in the bottom-right.

# compute nb. of DE genes for mutation
mut_dif.ca <- sum(dge.ca$tmut$adj.P.Val < 0.05)
mut_dif.ae <- sum(dge.ae$tmut$adj.P.Val < 0.05)

# compute nb. of DE genes for development
age_dif.ca <- sum(dge.ca$tage$adj.P.Val < 0.05)
age_dif.ae <- sum(dge.ae$tage$adj.P.Val < 0.05)

# percentage increase of mutation DE genes
100 * (mut_dif.ae - mut_dif.ca) / (mut_dif.ca) # mut pct increase 

# percentage increase of development DE genes
100 * (age_dif.ae - age_dif.ca) / (age_dif.ca) # age pct increase

Indeed, we detect nearly twice as many DE genes for strain using RAPToR age estimates compared to chronological age, and around $8\%$ more genes with development. Furthermore, the large proportion of genes changing with development ($57 - 62\%$) compared to strain ($7-16\%$) also shows how crucial it is to properly model developmental dynamics in DE analyses.

The increase in detected DE genes is partially explained by better model fits, shown below by the goodness-of-fit (GoF) distribution across genes. The goodness-of-fit (GoF) computed is an $R^2 = 1 - \frac{SS_{res}}{SS_{tot}}$ per gene.

d.ca <- density(dge.ca$gof, from = 0, to = 1)
d.ae <- density(dge.ae$gof, from = 0, to = 1)

plot(d.ca$x, d.ca$y, type = "l", lwd = 3, ylim = range(c(d.ca$y, d.ae$y)), 
     xlab = "GoF", col = "red", ylab = "Density", main = paste("Model Goodness of Fit distribution across",nrow(X),"genes"))
points(d.ae$x, d.ae$y, type = 'l', col = "blue", lwd = 3) 
legend("bottom", lwd = 4, col = c("red", "blue"), title = "Model with",
       legend = c("Chronological age", "Estimated age"), title.col = "black",
       text.font = 2, text.col = c("red", "blue"), bty = 'n')

Further support for the increase in performance

If the asynchronicity that RAPToR detects between samples is erroneous, then adding random noise around the chronological age values should yield similar results to using our age estimates in the model. To test this, we simulate age sets with added noise of similar distribution to the age differences observed between chronological and estimated age.

set.seed(10) # for reproducibility

age_diffs <- (dslehrbach2012$p$tpastL4 + 50) - ae_lerhbach$age.estimates[,1]
# Note : we add 50 to shift tpastL4 to the age values and avoid negative 
#        tpastl4 values. This has no impact on the DE analysis.


# estimate density function of age_diffs
d_ad <- density(age_diffs)
# generate 100 age sets of with random age_diffs-like noise 
n <- 100
rd_ages <- lapply(seq_len(n), function(i){
  (dslehrbach2012$p$tpastL4 + 50) + 
    sample(x = d_ad$x, size = nrow(dslehrbach2012$p), 
           prob = d_ad$y, replace = T) 
})

As done above, we run identical DE models and compare results with the model goodness-of-fit (GoF) per gene and look at the number of DE genes found for strain and development (BH-adjusted p-value $< 0.05$).

# setup cluster for parallelization
cl <- parallel::makeCluster(6, "FORK")

# do DGE on all age sets
rd_dges <- parLapply(cl, seq_len(n), function(i){
  cat("\r", i,"/",n)
  DGE(X, dslehrbach2012$p$strain, rd_ages[[i]], name = paste0("rd.",i))
})

stopCluster(cl)
gc()

# get quantiles of GoF for plotting
qts <- seq(0,1, length.out = 100)
quants <- lapply(seq_len(n), function(i){
  quantile(rd_dges[[i]]$gof, probs = qts)
})
quants <- do.call(rbind, quants)

layout(mat = matrix(c(2,3,1,1), ncol = 2), widths = c(.45,.55))

plot(0:1, 0:1, type = "n", ylab = "GoF >= quantile", xlab = "quantile", main = "QQ-plot of Goodness of Fit distribution")
invisible(apply(quants, 1, function(ro) points(qts, ro, type = 'l', col = transp("black", a = .4))))
points(qts, quantile(dge.ca$gof, probs = qts), type = "l", lwd = 4, col = 2)
points(qts, quantile(dge.ae$gof, probs = qts), type = "l", lwd = 4, col = 4)

legend('bottomright', legend = c("Chronological age", "Estimated age", "Chron.Age + noise"), 
       col = c(2,4, 1), lwd = c(4,4,1), bty = "n")


diffgens <- lapply(seq_len(n), function(i){
  data.frame(mut = sum(rd_dges[[i]]$tmut$adj.P.Val < 0.05), age = sum(rd_dges[[i]]$tage$adj.P.Val < 0.05))
})

diffgens <- do.call(rbind, diffgens)

hist(diffgens$mut, breaks = 50, main = "Nb of DE genes for strain", xlim = c(0,3500), xlab = "Nb. genes")
abline(v = sum(dge.ca$tmut$adj.P.Val < 0.05), lwd = 3, col = 2)
abline(v = sum(dge.ae$tmut$adj.P.Val < 0.05), lwd = 3, col = 4)

hist(diffgens$age, breaks = 50, xlim = c(min(diffgens$age), 12000), main = "Nb of DE genes for devpt.", xlab = "Nb. genes")
abline(v = sum(dge.ca$tage$adj.P.Val < 0.05), lwd = 3, col = 2)
abline(v = sum(dge.ae$tage$adj.P.Val < 0.05), lwd = 3, col = 4)

We can see that using estimated age systematically leads to better model fits and increased detection of DE genes both for strain and development.

Gene GoF using estimated age and binned by chronological age GoF quantiles (below, left) also clearly shows that for the same genes, we tend to have better fits with estimated age, while that is not the case when adding noise.

ord <- order(dge.ca$gof)

gofs <- lapply(seq_len(n), function(i){
  rd_dges[[i]]$gof[ord]
})
gofs <- do.call(rbind, gofs)

plot_QQGoFbox <- function(gof, bins = 10, at = seq(0,1,l = bins), boxwex = .5/bins, add = F, plotargs = list(), ...){
  grps <- cut(1:length(gof), bins)

  bxs <- lapply(levels(grps), function(l){
    s <- which(grps == l)
    return(boxplot(gof[s], plot = F))
  })
  if(!add){
    do.call(plot, c(list(x = range(at), y=range(gof), type = "n"), plotargs))
  }

  invisible(lapply(seq_len(bins), function(i){
    bxp(z = bxs[[i]], at = at[i], add = T, frame.plot = F, axes = F, boxwex = boxwex, ...)
  }))
}

par(mfrow = c(1,2), mar = c(4,4,4,1))
plot(0:1, 0:1, type = "n", ylab = "GoF", xlab = "quantile", main = "Goodness of Fit distribution in chron. age quantile order\n(estimated age)")
points(qts, quantile(dge.ca$gof, probs = qts), type = "l", lwd = 4, col = 2)
plot_QQGoFbox(dge.ae$gof[ord], bins = 50, add = T, cex = .2, lwd = 1, boxcol=4, medcol = 4, medlwd=6, boxwex = .8/50)

legend('bottomright', legend = c("Chronological age GoF quantiles"), 
       col = c(2), lwd = c(4), bty = "n")


plot(0:1, 0:1, type = "n", ylab = "GoF >= quantile", xlab = "quantile", main = paste0("Goodness of Fit distribution in chron. age quantile order\n(joined simulated age + noise sets, n = ",n, ")"))
points(qts, quantile(dge.ca$gof, probs = qts), type = "l", lwd = 4, col = 2)
plot_QQGoFbox(c(gofs), bins = 50, add = T, cex = .2, lwd = 1, boxwex = .8/50)

Functions and code

Code to generate objects

data_folder <- "../inst/extdata/"

requireNamespace("wormRef", quietly = T)
requireNamespace("utils", quietly = T)

requireNamespace("biomaRt", quietly = T) # bioconductor
requireNamespace("limma", quietly = T)   # bioconductor
requireNamespace("affy", quietly = T)    # bioconductor
requireNamespace("gcrma", quietly = T)   # bioconductor