In LBMC/RAPToR: Real Age Prediction from Transcriptome staging On Reference

Data

In this example, we build a reference for Drosophila melanogaster embryo development. We use the following time-series:

A bulk embryo development time-series, part of the modENCODE project and published by @graveley2011developmental, dsgraveley2011, used to build the reference. (Data from fruitfly.org)
A high-resolution time-series of single embryos published by @levin2016mid, dslevin2016dmel, used for external validation. (Accession : GSE60471)

Code to generate dsgraveley2011 and dslevin2016dmel can be found at the end of this section

Normalization & exploration

load("../inst/extdata/dsgraveley2011.RData")
load("../inst/extdata/dslevin2016dmel.RData")

We start by normalizing the data.

library(RAPToR)
library(limma)

dsgraveley2011$g <- limma::normalizeBetweenArrays(dsgraveley2011$g, 
                                                  method = "quantile")
dsgraveley2011$g <- log1p(dsgraveley2011$g)

dslevin2016dmel$g <- limma::normalizeBetweenArrays(dslevin2016dmel$g, 
                                                   method = "quantile")
dslevin2016dmel$g <- log1p(dslevin2016dmel$g)

Check the contents of the expression matrix and pheno data.

dsgraveley2011$g[1:5, 1:5]

head(dsgraveley2011$p, n = 5)

Correlation between the samples for outliers.

cor_dsgraveley2011 <- cor(dsgraveley2011$g, method = "spearman")
ord <- order(dsgraveley2011$p$age)

heatmap(cor_dsgraveley2011[ord, ord], Colv = NA, Rowv = NA, 
        scale = "none", keep.dendro = F, margins = c(2,5),
        labRow = "", labCol = "")
# add time label
par(xpd = T)
mtext(text = dsgraveley2011$p$age, side = 1, 
      line = 4, at = seq(-.16,.85, l = 12))
mtext(at = 1, line = 4, side = 1, text = "(hours)", cex = .8)

# add color legend
l.values <- seq(min(cor_dsgraveley2011), max(cor_dsgraveley2011), l = 9)
image(x = c(.95,1), y = seq(0.6,1, l = 9), useRaster = T,
      z = matrix(l.values, ncol = 9),
      col = hcl.colors(12, "YlOrRd", rev = TRUE), add = T)
text(.975, 1, pos = 3, labels = expression(rho), font = 2)
text(1, y = seq(0.6,1, l = 9)[c(T,F)], pos = 4, 
     labels = round(l.values[c(T,F)], 2), cex = .6)

boxplot(cor_dsgraveley2011, names = paste0(dsgraveley2011$p$age,'h'), 
        boxwex=.5, las=1)

No outliers.

Plotting principal components.

pca_dsgraveley2011 <- summary(stats::prcomp(t(dsgraveley2011$g), rank = 12,
                                            center = TRUE, scale = FALSE))

par(mfrow = c(2,4), pty = 's')
invisible(sapply(seq_len(8), function(i){
  plot(dsgraveley2011$p$age, pca_dsgraveley2011$x[,i], type = 'b', lwd = 2,
       xlab = "Chronological age", ylab = "PC", main = paste0("PC", i))
}))

Model fitting

We keep enough components to explain $99\%$ of the variance in the data.

nc <- sum(pca_dsgraveley2011$importance[3,] < .99) + 1
nc

We then fit a GAM on PCA components with a cubic spline on age.

m_dsgraveley2011 <- ge_im(X = dsgraveley2011$g, 
                          p = dsgraveley2011$p, 
                          formula = "X ~ s(age, bs = 'cr')", nc = nc)

Validation

First, check global model performance.

# global model performance
mp_grav <- mperf(dsgraveley2011$g, predict(m_dsgraveley2011), is.t = T)
print(do.call(cbind, mp_grav))

And then per gene.

ng_mp_grav <-  mperf(dsgraveley2011$g, predict(m_dsgraveley2011), 
                     is.t = T, global = F)
# remove NAs (eg. 0 variance genes) and Inf values (/0)
ng_mp_grav <- lapply(ng_mp_grav, na.omit)
ng_mp_grav$aRE <- ng_mp_grav$aRE[ng_mp_grav$aRE < Inf]

par(mfrow = c(2,2), mar=c(4,4,2,1))
invisible(sapply(names(ng_mp_grav), function(idx){
  rg <- range(na.omit(ng_mp_grav[[idx]]))
  # estimate density curve
  d <- density(na.omit(ng_mp_grav[[idx]]), from = rg[1], to = rg[2])
  plot(d, main = paste0(gsub("a", "", idx, fixed = T), 
                        " density (", length(ng_mp_grav[[idx]]), " genes)"), 
       xlab = idx, lwd = 2)
  # add global value
  abline(v = mp_grav[[idx]], lty = 2, lwd = 2, col = "firebrick")
  text(mp_grav[[idx]], .9*max(d$y), pos = 4, labels = idx, 
       font = 2, col = "firebrick")
}))

Excellent fits.

Then, build the ref object.

r_dsgraveley2011 <- make_ref(
  m = m_dsgraveley2011,
  n.inter = 100,                # interpolation resolution
  t.unit = "h past egg-laying", # time unit
  metadata = list("organism" = "D. melanogaster", # any metadata
                  "profiling" = "bulk RNAseq"))

Plot model predictions on components.

par(mfrow = c(2,4), pty='s', mar = c(4,4,3,1))
plot(m_dsgraveley2011, r_dsgraveley2011, col.i = 'royalblue')

No overfitting, good fits on the components explaining most of the variance. Some components flattened, but minimal associated variance.

Finally, we stage the samples from the reference, as well as the external validation data (dslevin2016dmel).

ae_dsgraveley2011 <- ae(dsgraveley2011$g, r_dsgraveley2011)
ae_dslevin2016dmel <- ae(dslevin2016dmel$g, r_dsgraveley2011)

par(mfrow = c(1,2), mar = c(4,5,3,1))
# dsgraveley2011
dsgraveley2011$p$age_est <- ae_dsgraveley2011$age.estimates[,1]
rg <- range(c(dsgraveley2011$p$age_est, dsgraveley2011$p$age))
plot(age_est~age, data = dsgraveley2011$p, 
     xlab = "Chronological age", ylab = "Estimated age (r_dsgraveley2011)", 
     xlim = rg, ylim = rg, lwd = 2,
     main = "Staging dsgraveley2011
on r_dsgraveley2011")
points(age_est~age, data = dsgraveley2011$p, type = 'l', lty = 2)
abline(a = 0, b = 1, lty = 3, lwd = 2) # x=y
legend("bottomright", legend = "x = y", lwd=3, col=1, lty = 3, bty='n')

# dslevin2016dmel
dslevin2016dmel$p$age_est <- ae_dslevin2016dmel$age.estimates[,1]
rg <- range(c(dslevin2016dmel$p$age_est , dslevin2016dmel$p$age))
plot(age_est~age, data = dslevin2016dmel$p, 
     xlab = "Chronological age", ylab = "Estimated age (r_dsgraveley2011)", 
     xlim = rg, ylim = rg, lwd = 2,
     main = "Staging dslevin2016dmel
on r_dsgraveley2011")
abline(a = 0, b = 1, lty = 3, lwd = 2) #x=y
legend("bottomright", legend = "x = y", lwd=3, col=1, lty = 3, bty='n')

We see that the validation data age estimates vary from their chronological age, particularly in later stages. However, this is not due to errors in the reference or staging, but instead to the fact this data is single-embryo profiling. Indeed, inter-individual variation in developmental speed results in striking differences with expected age, where it would otherwise be averaged out in bulk data.

The dynamics of the dslevin2016dmel data further confirm this.

pca_dslevin2016dmel <- stats::prcomp(t(dslevin2016dmel$g), rank = 20,
                                     center = TRUE, scale = FALSE)

par(mfrow = c(1,5), pty='s')
invisible(sapply(1:5, function(i){
    plot(dslevin2016dmel$p$age, pca_dslevin2016dmel$x[,i], lwd = 2,
         xlab = "Chronological age", ylab = "PC", main = paste0("PC", i))
}))

Chronological age specified for the samples is inaccurate, and does not properly reflect developmental gene expression dynamics, as evidenced by noisy components.

Inferred age however, clearly restores the dynamics cleanly.

par(mfrow = c(1,5), pty='s')
invisible(sapply(1:5, function(i){
    plot(dslevin2016dmel$p$age_est, pca_dslevin2016dmel$x[,i], lwd = 2,
         xlab = "Estimated age", ylab = "PC", main = paste0("PC", i),
         col = 'firebrick')
  box(col = 'firebrick', lwd=2)
}))

This example shows how inter-individual variation in developmental speed makes it difficult to profile high-temporal-resolution time series. Furthermore, the temporal resolution difference between the reference and validation data also demonstrates the effectiveness of gene expression interpolation.

Code to generate objects

Required packages and variables:

data_folder <- "../inst/extdata/"

requireNamespace("utils", quietly = T)
requireNamespace("GEOquery", quietly = T) # bioconductor
requireNamespace("Biobase", quietly = T)  # bioconductor
requireNamespace("biomaRt", quietly = T)  # bioconductor