In LBMC/RAPToR: Real Age Prediction from Transcriptome staging On Reference

Data

In this example, we use two C. elegans RNAseq time-series:

A high-resolution time series of late larval development published by @hendriks2014extensive, dshendriks2014, used to build the reference. (Accession : GSE52861)
A time series of larval development in 4 different strains published by @aeschimann2017lin41, dsaeschimann2017, used for external validation. (Accession : GSE80157)

The data is the same used in the vignette above, but we flip which dataset is the reference and which is used for external validation. Code to generate dsaeschimann2017 and dshendriks2014 can be found at the end of this section

Normalization & exploration

We start by normalizing the data.

load("../inst/extdata/dsaeschimann2017.RData")
load("../inst/extdata/dshendriks2014.RData")

library(RAPToR)
library(limma)

dshendriks2014$g <- limma::normalizeBetweenArrays(dshendriks2014$g, 
                                                  method = "quantile")
dshendriks2014$g <- log1p(dshendriks2014$g)

dsaeschimann2017$g <- limma::normalizeBetweenArrays(dsaeschimann2017$g, 
                                                    method = "quantile")
dsaeschimann2017$g <- log1p(dsaeschimann2017$g)

Check the contents of the expression matrix and pheno data.

dshendriks2014$g[1:5,1:3]

head(dshendriks2014$p, n = 5)

Correlation between the samples.

cor_dshendriks <- cor(dshendriks2014$g, method = "spearman")
ord <- order(dshendriks2014$p$age)
heatmap(cor_dshendriks[ord, ord], 
        Colv = NA, Rowv = NA, scale = "none", 
        keep.dendro = F, margins = c(1,5),
        labRow = "", labCol = "")
# add time labels
par(xpd = T)
mtext(text = paste0(dshendriks2014$p$age), 
      side = 1, line = 4.1, at = seq(-.17,.86, l = 16), cex = .8)

# add color legend
l.values <- seq(min(cor_dshendriks), max(cor_dshendriks), l = 10)
image(x = c(.95,1), y = seq(0.6,1, l = 10), useRaster = T,
      z = matrix(l.values, ncol = 10),
      col = hcl.colors(12, "YlOrRd", rev = TRUE), add = T)
text(.975, 1, pos = 3, labels = expression(rho), font = 2)
text(1, y = seq(0.6,1, l = 10), pos = 4, 
     labels = round(l.values, 2), cex = .6)

mtext(at = 1.0, line = 4.1, side = 1, text = "(hours)", cex = .8)

boxplot(cor_dshendriks, names = paste0(dshendriks2014$p$age,'h'), 
        boxwex=.5, las=1)

Everything looks good, no outliers.

Now, plotting principal components.

pca_dshendriks <- summary(stats::prcomp(t(dshendriks2014$g), rank = 25,
                                        center = TRUE, scale = FALSE))

par(mfrow = c(2,3), pty='s')
invisible(sapply(1:6, function(i){
  plot(dshendriks2014$p$age, pca_dshendriks$x[,i], lwd = 2, type = 'b',
       xlab = "Chronological age", ylab = "PC", main = paste0("PC", i))
}))

Model fitting

We keep enough components to explain $99\%$ of the variance in the data, since it is very clean.

nc <- sum(pca_dshendriks$importance[3,] < .99) + 1
nc

We then fit a GAM model with a cubic spline on age on PCA components.

m_dshendriks2014 <- ge_im(X = dshendriks2014$g, p = dshendriks2014$p, 
                          formula = "X ~ s(age, bs = 'cr')", nc = nc)

Validation

First, check global model performance.

# global model performance
mp_hendriks <- mperf(dshendriks2014$g, predict(m_dshendriks2014), is.t = T)
print(do.call(cbind, mp_hendriks))

And then per gene.

ng_mp_hendriks <-  mperf(dshendriks2014$g, predict(m_dshendriks2014), 
                         is.t = T, global = F)
# remove NAs (eg. 0 variance genes) and Inf values (/0)
ng_mp_hendriks <- lapply(ng_mp_hendriks, na.omit)
ng_mp_hendriks$aRE <- ng_mp_hendriks$aRE[ng_mp_hendriks$aRE < Inf]

par(mfrow = c(2,2), mar=c(4,4,2,1))
invisible(sapply(names(ng_mp_hendriks), function(idx){
  rg <- range(na.omit(ng_mp_hendriks[[idx]]))
  # estimate density curve
  d <- density(na.omit(ng_mp_hendriks[[idx]]), from = rg[1], to = rg[2])
  plot(d, main = paste0(gsub("a", "", idx, fixed = T), 
                        " density (", length(ng_mp_hendriks[[idx]]), " genes)"), 
       xlab = idx, lwd = 2)
  # add global value
  abline(v = mp_hendriks[[idx]], lty = 2, lwd = 2, col = "firebrick")
  text(mp_hendriks[[idx]], .9*max(d$y), pos = 4, labels = idx, 
       font = 2, col = "firebrick")
}))

Excellent fits.

Then, build the ref object.

r_dshendriks2014 <- make_ref(
  m = m_dshendriks2014,
  n.inter = 100,                  # interpolation resolution
  t.unit = "h past egg-laying, 25C",   # time unit
  metadata = list("organism" = "C. elegans", # any metadata
                  "profiling" = "bulk RNAseq"))

Plot model predictions on components.

par(mfrow = c(2,5), pty='s', mar = c(4,4,3,1))
plot(m_dshendriks2014, r_dshendriks2014, ncs = 1:10)

No overfitting, good fits on the components explaining most of the variance. Some components flattened, but minimal associated variance.

Finally, we stage the samples from the reference, as well as the external validation data (dsaeschimann2017). We exclude the earliest dsaeschimann2017 samples (under 22h) from staging since they are outside the span covered by the reference

ae_dshendriks2014 <- ae(dshendriks2014$g, r_dshendriks2014)

too_young <- dsaeschimann2017$p$age <= 22
ae_dsaeschimann2017 <- ae(dsaeschimann2017$g[,!too_young], r_dshendriks2014)

par(mfrow = c(1,2), mar = c(4,5,3,1))
# dshendriks2014
dshendriks2014$p$age_est <- ae_dshendriks2014$age.estimates[,1]
rg <- range(c(dshendriks2014$p$age_est, dshendriks2014$p$age))
plot(age_est~age, data = dshendriks2014$p, 
     xlab = "Chronological age", ylab = "Estimated age (r_dshendriks2014)", 
     xlim = rg, ylim = rg,
     main = "Staging dshendriks2014 
on r_dshendriks2014", lwd = 2)
points(age_est~age, data = dshendriks2014$p, type = 'l', lty = 2)
abline(a = 0, b = 1, lty = 3, lwd = 2) # x=y
legend("bottomright", legend = "x = y", lwd=3, col=1, lty = 3, bty='n')

# dsaeschimann2017
dsaeschimann2017$p[!too_young, "age_est"] <- 
  ae_dsaeschimann2017$age.estimates[,1]
rg <- range(c(dsaeschimann2017$p$age_est[!too_young], 
              dsaeschimann2017$p$age[!too_young]))
plot(age_est~age, data = dsaeschimann2017$p[!too_young,], 
     xlab = "Chronological age", ylab = "Estimated age (r_dshendriks2014)", 
     xlim = rg, ylim = rg,
     main = "Staging dsaeschimann2017 
on r_dshendriks2014", 
     lwd = 2, col = factor(dsaeschimann2017$p$strain))
invisible(sapply(levels(dsaeschimann2017$p$strain), function(l){
  s <- dsaeschimann2017$p$strain == l & !too_young
  points(age_est~age, data = dsaeschimann2017$p[s,], 
         type = 'l', lty = 2, 
         col = which(l==levels(factor(dsaeschimann2017$p$strain))))
}))
abline(a = 0, b = 1, lty = 3, lwd = 2)
legend("bottomright", bty='n',
       legend = c("let-7", "lin-41", "let-7/lin-41", "N2", "x = y"), 
       lwd=3, col=c(1:4, 1), pch = c(1,1,1,1,NA), lty = c(rep(NA, 4), 3))

Code to generate objects

Required packages and variables:

data_folder <- "../inst/extdata/"

requireNamespace("wormRef", quietly = T)
requireNamespace("utils", quietly = T)
requireNamespace("GEOquery", quietly = T) # bioconductor
requireNamespace("Biobase", quietly = T)  # bioconductor