In LBMC/RAPToR: Real Age Prediction from Transcriptome staging On Reference

Data

This example uses two Danio rerio (zebrafish) embryo development time-series:

A time-series of zebrafish embryonic development (with uneven time sampling) published by @white2017high, dswhite2017, used to build the reference. (Data accessible from the publication)
A high-resolution time-series of embryonic development published by @levin2016mid, dslevin2016zeb, used for external validation. (Accession : GSE60619)

Code to generate dswhite2017 and dslevin2016zeb can be found at the end of this section

The reference data (dswhite2017) has uneven time sampling, as can often be the case to account for differences in dynamic ranges of expression: later time points are more sparse. We can apply a transformation on the time values (using asinh() in this case) so they become more uniform and thus avoid interpolation bias.

Normalization & Quick look

transp <- function(col, a=.5){
  colr <- col2rgb(col)
  return(rgb(colr[1,], colr[2,], colr[3,], a*255, maxColorValue = 255))
}

load("../inst/extdata/dswhite2017.RData")
load("../inst/extdata/dslevin2016zeb.RData")

We start by normalizing the data.

library(RAPToR)
library(limma)

dswhite2017$g <- limma::normalizeBetweenArrays(dswhite2017$g, 
                                               method = "quantile")
dswhite2017$g <- log1p(dswhite2017$g)

dslevin2016zeb$g <- limma::normalizeBetweenArrays(dslevin2016zeb$g, 
                                                  method = "quantile")
dslevin2016zeb$g <- log1p(dslevin2016zeb$g)

Check the contents of the expression matrix and pheno data.

dswhite2017$g[1:5, 1:4]

head(dswhite2017$p, n = 5)

Correlation between samples for outliers.

cor_dswhite2017 <- cor(dswhite2017$g, method = "spearman")
ord <- order(dswhite2017$p$age)
heatmap(cor_dswhite2017[ord, ord], Colv = NA, Rowv = NA, 
        scale = "none", keep.dendro = F, margins = c(1,5),
        RowSideColors = transp(as.numeric(dswhite2017$p$batch[ord])), 
        labRow = "", labCol = "")
par(xpd = T)
mtext(text = unique(dswhite2017$p$age), side = 1, line = c(3.8, 4), 
      at = seq(-.12, .915, l = length(unique(dswhite2017$p$age))), cex = .6)

# color key
l.values <- seq(min(cor_dswhite2017), max(cor_dswhite2017), l = 10)
image(x = c(.95,1), y = seq(0.6,1, l = 10), useRaster = T,
      z = matrix(l.values, ncol = 10),
      col = hcl.colors(12, "YlOrRd", rev = TRUE), add = T)

text(.975, 1, pos = 3, labels = expression(rho), font = 2)
text(1, y = seq(0.6,1, l = 10), pos = 4, 
     labels = round(l.values, 2), cex = .6)

xlp <- 1.025
batch_legend <- as.character(1:5)
text(xlp, .5, labels = "batch", font = 2, cex = .8, adj = .5)
text(xlp, seq(.3,.48, l = 5), labels = batch_legend, adj = 1, pos = 1,
     col = levels(dswhite2017$p$batch), font = 2, cex = .7)

mtext(at = xlp, line = 4, side = 1, text = "(hours)", cex = .8)

boxplot(cor_dswhite2017~interaction(dswhite2017$p$batch, 
                                    dswhite2017$p$age), 
        col = transp(1:5), # see 'Code to generate objects' for transp()
        xaxt = "n", at = seq(1,90, l = 90*(6/5))[c(T,T,T,T,T,F)], boxwex=.5,
        ylab = "Spearman correlation", xlab = "Chronological age (h)", las=1)
#add time labels
axis(side = 1, at = seq(2.5,87.5, l = 90/5), 
     labels = paste0(unique(dswhite2017$p$age), 'h'))
legend('bottom', fill = transp(1:5), title = "Batch", 
       legend = c(1:5), horiz = T, bty = "n")

No outliers.

Plotting principal components.

pca_dswhite2017 <- stats::prcomp(t(dswhite2017$g), rank = 25,
                                 center = TRUE, scale = FALSE)

par(mfrow = c(2,4), pty='s')
invisible(sapply(1:8, function(i){
  plot(dswhite2017$p$age, pca_dswhite2017$x[,i], 
       lwd = 2, col = dswhite2017$p$batch,
       xlab = "Chronological age", ylab = "PC", main = paste0("PC", i))
  # add dotted lines
  sapply(seq_along(levels(dswhite2017$p$batch)), function(l){
    s <- which(dswhite2017$p$batch == levels(dswhite2017$p$batch)[l])
    points(dswhite2017$p$age[s], pca_dswhite2017$x[s,i], col = l, 
           type = 'l', lty = 2)
  })
  # legend
  if(i == 1)
    legend("bottomright", bty = 'n', legend = batch_legend, title = "Batch",
           pch = c(rep(1, 5)), lty = c(rep(NA, 5)), col = c(1:5), lwd = 3)
}))

Sampling is sparser towards the end of the time series, and expression dynamics are also "wider". Because of this, fitting splines along age on these components will poorly fit the earlier time points.

To bypass this, we can use asinh(age), which has a similar effect to a logarithm to stretch the age values closer to a uniform scale. We also add an intercept to avoid overstretching the first few time points. The relationship between age and asinh(1+age) is shown below.

plot(asinh(1+age)~age, data=dswhite2017$p)
# add a grid
abline(v=dswhite2017$p$age, h=asinh(1+dswhite2017$p$age),
       col = 'grey80', lty=3)

We see that values are more evenly spread along the y axis, which is the intended effect. This effect also applies on the component dynamics, which will have better spline fits.

par(mfrow = c(2,4), pty='s')
invisible(sapply(1:8, function(i){
  plot(asinh(1+dswhite2017$p$age), pca_dswhite2017$x[,i], 
       lwd = 2, col = dswhite2017$p$batch,
       xlab = "asinh(1+age)", ylab = "PC", main = paste0("PC", i))
  # add dotted lines
  sapply(seq_along(levels(dswhite2017$p$batch)), function(l){
    s <- which(dswhite2017$p$batch == levels(dswhite2017$p$batch)[l])
    points(asinh(1+dswhite2017$p$age[s]), pca_dswhite2017$x[s,i], col = l, 
           type = 'l', lty = 2)
  })
  # legend
  if(i == 1)
    legend("bottomright", bty = 'n', legend = batch_legend, title = "Batch",
           pch = c(rep(1, 5)), lty = c(rep(NA, 5)), col = c(1:5), lwd = 3)
}))

The transformation will be specified in the model formula so that we can predict new points from time values on the age scale (otherwise, we would need to predict with using transformed values such that they correspond to a uniform time scale when transformed back).

Model fitting

We keep enough components to explain $99\%$ of the variance in the data.

nc <- sum(summary(pca_dswhite2017)$importance[3,] < .99) + 1
nc

We will fit a GAM on PCA components. The type of spline to fit as well as the value of the intercept in asinh() will be determined using cross-validation. Since sample batch is clearly absent from the components, we exclude it for a more parsimonious model.

set.seed(3)
# intercept values to try
intercepts <- c(0,1,2,5)

# list of formulas to test
flist <- as.list(paste0(
  "X ~ s(",
  c("age", # age without transformation
    paste0("asinh(", intercepts, "+age)")), # asinh with intercept
  ", bs = '",
  rep(c("tp", "gp"), e=1+length(intercepts)), # 2 different splines
  "', k=9)")) 
# print formula list
cat(unlist(flist), sep = '\n')

cv_dswhite2017 <- ge_imCV(X = dswhite2017$g, p = dswhite2017$p,
                          formula_list = flist, nc = nc, 
                          cv.n = 10, nb.cores = 6)

par(mar = c(7,4,3,1))
cpal <- c("grey30", "#420A68FF",
          "#932667FF", "#DD513AFF", "#FCA50AFF") # color palette
plot(cv_dswhite2017, to_plot = c("aRE", "MSE"),
     names = paste0("s:", rep(c("tp", "gp"), e=1+length(intercepts)), 
                    "/", c("nT", paste0("i=", intercepts))),
     col = NA, lwd=2, names.arrange = 5, boxwex = .5, outline=F,
     border = cpal, tcol = cpal, swarmargs = list(cex = .5, col = cpal))

We first see that with no asinh() transformation on time (grey, nT boxes), model performance and CV error are worse than on transformed data. Next, we note that than thin plate splines (tp) perform slightly better than Gaussian process splines (gp) from MSE. Finally, it seems that an intercept of 1 minimizes the overall CV error and model MSE.

We select s:tp/i=1, a thin plate spline with transformed age and intercept of 1.

m_dswhite2017 <- ge_im(X = dswhite2017$g, 
                       p = dswhite2017$p, 
                       formula = "X ~ s(asinh(1+age), bs = 'gp', k=9)", 
                       nc = nc)

Validation

First, check global model performance.

# global model performance
mp_white <- mperf(dswhite2017$g, predict(m_dswhite2017), is.t = T)
print(do.call(cbind, mp_white))

And then per gene.

ng_mp_white <-  mperf(dswhite2017$g, predict(m_dswhite2017), 
                     is.t = T, global = F)
# remove NAs (eg. 0 variance genes) and Inf values (/0)
ng_mp_white <- lapply(ng_mp_white, na.omit)
ng_mp_white$aRE <- ng_mp_white$aRE[ng_mp_white$aRE < Inf]

par(mfrow = c(2,2), mar=c(4,4,2,1))
invisible(sapply(names(ng_mp_white), function(idx){
  rg <- range(na.omit(ng_mp_white[[idx]]))
  # estimate density curve
  d <- density(na.omit(ng_mp_white[[idx]]), from = rg[1], to = rg[2])
  plot(d, main = paste0(gsub("a", "", idx, fixed = T), 
                        " density (", length(ng_mp_white[[idx]]), " genes)"), 
       xlab = idx, lwd = 2)
  # add global value
  abline(v = mp_white[[idx]], lty = 2, lwd = 2, col = "firebrick")
  text(mp_white[[idx]], .9*max(d$y), pos = 4, labels = idx, 
       font = 2, col = "firebrick")
}))

Very good fits.

Then, build the ref object.

# make a 'ref' object'
r_dswhite2017 <- make_ref(
  m_dswhite2017, 
  n.inter = 500, 
  t.unit = "h post-fertilization",     # time unit
  metadata = list("organism" = "D. rerio", # any metadata
                  "profiling" = "single-embryo RNAseq",
                  "note" = "asinh(1+age) interpolation"))

Plot component interpolation.

par(mfrow = c(1,4), pty='s')
plot(m_dswhite2017, r_dswhite2017, ncs=1:4, l.pos = 'bottomright')

We can zoom in on the early time points to ensure they are indeed properly fit.

par(mfrow = c(1,4), pty='s') 
plot(m_dswhite2017, r_dswhite2017, ncs=1:4, l.pos = 'bottomright', xlim = c(0,40))

Looks OK.

# stage samples
ae_dswhite2017 <- ae(dswhite2017$g, r_dswhite2017)
ae_dslevin2016zeb <- ae(dslevin2016zeb$g, r_dswhite2017)

par(mfrow = c(1,2), mar = c(4,5,3,1), pty='s')
rg <- range(c(ae_dswhite2017$age.estimates[,1], dswhite2017$p$age))
plot(ae_dswhite2017$age.estimates[,1]~dswhite2017$p$age, 
     xlab = "Chronological age", ylab = "Estimated age (dswhite2017)", 
     xlim = rg, ylim = rg,
     main = "Chron. vs Estimated ages for dswhite2017
(on r_dswhite2017)", 
     lwd = 2)
abline(a = 0, b = 1, lty = 3, lwd = 1)

plot(ae_dslevin2016zeb$age.estimates[,1]~dslevin2016zeb$p$age, 
     xlab = "Chronological age", ylab = "Estimated age (dswhite2017)", 
     xlim = rg, ylim = rg,
     main = "Chron. vs Estimated ages for dslevin2016zeb
(on r_dswhite2017)", lwd = 2)
abline(a = 0, b = 1, lty = 3, lwd = 2)

legend("bottomright", legend = "x = y", lwd=3, col=1, lty = 3, bty='n')

Model fitting without age transformation

We'll build the same model without considering uneven sampling, for comparison. This is purposely not the optimal model and will show interpolation issues to look out for.

m_dswhite2017_nT <- ge_im(X = dswhite2017$g, 
                          p = dswhite2017$p, 
                          formula = "X ~ s(age, bs = 'tp', k=9)", 
                          nc = nc)

mp_dswhite2017_nT <- mperf(scale(dswhite2017$g), predict(m_dswhite2017_nT), is.t = T)
print(do.call(cbind, mp_dswhite2017_nT))

Build a ref object

# make a 'ref' object'
r_dswhite2017_nT <- make_ref(
  m_dswhite2017_nT, 
  n.inter = 500, 
  t.unit = "h post-fertilization",     # time unit
  metadata = list("organism" = "D. rerio", # any metadata
                  "profiling" = "single-embryo RNAseq",
                  "note" = "Not the best model!"))

Plot component interpolation.

par(mfrow = c(1,4), pty='s')
plot(m_dswhite2017_nT, r_dswhite2017_nT, ncs=1:4, l.pos = 'bottomright')

The model has trouble predicting the dynamics accurately: PC3 and PC4 have flattened early dynamics and overfitting around 40hpf.

par(mfrow = c(1,2), pty='s')
plot(m_dswhite2017_nT, r_dswhite2017_nT, ncs=3:4, show.legend = F,
     xlim = c(0,50))

Consequences can be seen on when staging external data (i.e. the validation data), but not necessarily when staging the reference data as we'll see below. Validating references with independent/external data is thus highly recommended.

ae_dswhite2017_nT <- ae(dswhite2017$g, r_dswhite2017_nT)
ae_dslevin2016zeb_nT <- ae(dslevin2016zeb$g, r_dswhite2017_nT)

par(mfrow = c(1,2), mar = c(4,5,3,1), pty='s')
rg <- range(c(ae_dswhite2017_nT$age.estimates[,1], dswhite2017$p$age))
plot(ae_dswhite2017_nT$age.estimates[,1]~dswhite2017$p$age, 
     xlab = "Chronological age", ylab = "Estimated age (r_dswhite2017_nT)", 
     xlim = rg, ylim = rg,
     main = "Staging dswhite2017
(on r_dswhite2017_nT)", 
     lwd = 2)
abline(a = 0, b = 1, lty = 3, lwd = 1)

plot(ae_dslevin2016zeb_nT$age.estimates[,1]~dslevin2016zeb$p$age, 
     xlab = "Chronological age", ylab = "Estimated age (r_dswhite2017_nT)", 
     xlim = rg, ylim = rg,
     main = "Staging dslevin2016zeb
(on r_dswhite2017_nT)", lwd = 2)
abline(a = 0, b = 1, lty = 3, lwd = 2)

legend("bottomright", legend = "x = y", lwd=3, col=1, lty = 3, bty='n')

The "gaps" or "steps" in the validation data estimates are caused by interpolation bias (overfitting and poorly fit dynamics mentioned above). These model errors create local "unlikely/unrealistic" gene expression areas in the interpolation, which do not correlate with the samples of corresponding age.

Gaps will most often be in-between original time points of the reference data, meaning the effect can't be seen by staging reference samples. Validation data of sufficient temporal resolution will however have clear 'blank' ranges of age estimates, around which are clustered the samples of corresponding age.

The sub-optimal model we used had clear red flags on component plots, but interpolation bias can be much more subtle. In such cases, only staging external data will reveal the problem.

Code to generate objects

Required packages and variables:

data_folder <- "../inst/extdata/"

requireNamespace("utils", quietly = T)
requireNamespace("GEOquery", quietly = T) # bioconductor
requireNamespace("Biobase", quietly = T)  # bioconductor
requireNamespace("biomaRt", quietly = T)  # bioconductor