In this example, we use two C. elegans RNAseq time-series:
dshendriks2014
, used to build the reference. (Accession : GSE52861)dsaeschimann2017
, used for external validation. (Accession : GSE80157)The data is the same used in the vignette above, but we flip which dataset is the reference and which is used for external validation. Code to generate dsaeschimann2017
and dshendriks2014
can be found at the end of this section
We start by normalizing the data.
load("../inst/extdata/dsaeschimann2017.RData") load("../inst/extdata/dshendriks2014.RData")
library(RAPToR) library(limma) dshendriks2014$g <- limma::normalizeBetweenArrays(dshendriks2014$g, method = "quantile") dshendriks2014$g <- log1p(dshendriks2014$g) dsaeschimann2017$g <- limma::normalizeBetweenArrays(dsaeschimann2017$g, method = "quantile") dsaeschimann2017$g <- log1p(dsaeschimann2017$g)
Check the contents of the expression matrix and pheno data.
dshendriks2014$g[1:5,1:3] head(dshendriks2014$p, n = 5)
Correlation between the samples.
cor_dshendriks <- cor(dshendriks2014$g, method = "spearman") ord <- order(dshendriks2014$p$age) heatmap(cor_dshendriks[ord, ord], Colv = NA, Rowv = NA, scale = "none", keep.dendro = F, margins = c(1,5), labRow = "", labCol = "") # add time labels par(xpd = T) mtext(text = paste0(dshendriks2014$p$age), side = 1, line = 4.1, at = seq(-.17,.86, l = 16), cex = .8) # add color legend l.values <- seq(min(cor_dshendriks), max(cor_dshendriks), l = 10) image(x = c(.95,1), y = seq(0.6,1, l = 10), useRaster = T, z = matrix(l.values, ncol = 10), col = hcl.colors(12, "YlOrRd", rev = TRUE), add = T) text(.975, 1, pos = 3, labels = expression(rho), font = 2) text(1, y = seq(0.6,1, l = 10), pos = 4, labels = round(l.values, 2), cex = .6) mtext(at = 1.0, line = 4.1, side = 1, text = "(hours)", cex = .8)
boxplot(cor_dshendriks, names = paste0(dshendriks2014$p$age,'h'), boxwex=.5, las=1)
Everything looks good, no outliers.
Now, plotting principal components.
pca_dshendriks <- summary(stats::prcomp(t(dshendriks2014$g), rank = 25, center = TRUE, scale = FALSE))
par(mfrow = c(2,3), pty='s') invisible(sapply(1:6, function(i){ plot(dshendriks2014$p$age, pca_dshendriks$x[,i], lwd = 2, type = 'b', xlab = "Chronological age", ylab = "PC", main = paste0("PC", i)) }))
We keep enough components to explain $99\%$ of the variance in the data, since it is very clean.
nc <- sum(pca_dshendriks$importance[3,] < .99) + 1 nc
We then fit a GAM model with a cubic spline on age on PCA components.
m_dshendriks2014 <- ge_im(X = dshendriks2014$g, p = dshendriks2014$p, formula = "X ~ s(age, bs = 'cr')", nc = nc)
First, check global model performance.
# global model performance mp_hendriks <- mperf(dshendriks2014$g, predict(m_dshendriks2014), is.t = T) print(do.call(cbind, mp_hendriks))
And then per gene.
ng_mp_hendriks <- mperf(dshendriks2014$g, predict(m_dshendriks2014), is.t = T, global = F) # remove NAs (eg. 0 variance genes) and Inf values (/0) ng_mp_hendriks <- lapply(ng_mp_hendriks, na.omit) ng_mp_hendriks$aRE <- ng_mp_hendriks$aRE[ng_mp_hendriks$aRE < Inf]
par(mfrow = c(2,2), mar=c(4,4,2,1)) invisible(sapply(names(ng_mp_hendriks), function(idx){ rg <- range(na.omit(ng_mp_hendriks[[idx]])) # estimate density curve d <- density(na.omit(ng_mp_hendriks[[idx]]), from = rg[1], to = rg[2]) plot(d, main = paste0(gsub("a", "", idx, fixed = T), " density (", length(ng_mp_hendriks[[idx]]), " genes)"), xlab = idx, lwd = 2) # add global value abline(v = mp_hendriks[[idx]], lty = 2, lwd = 2, col = "firebrick") text(mp_hendriks[[idx]], .9*max(d$y), pos = 4, labels = idx, font = 2, col = "firebrick") }))
Excellent fits.
Then, build the ref
object.
r_dshendriks2014 <- make_ref( m = m_dshendriks2014, n.inter = 100, # interpolation resolution t.unit = "h past egg-laying, 25C", # time unit metadata = list("organism" = "C. elegans", # any metadata "profiling" = "bulk RNAseq"))
Plot model predictions on components.
par(mfrow = c(2,5), pty='s', mar = c(4,4,3,1)) plot(m_dshendriks2014, r_dshendriks2014, ncs = 1:10)
No overfitting, good fits on the components explaining most of the variance. Some components flattened, but minimal associated variance.
Finally, we stage the samples from the reference, as well as the external validation data (dsaeschimann2017
).
We exclude the earliest dsaeschimann2017
samples (under 22h) from staging since they are outside the span covered by the reference
ae_dshendriks2014 <- ae(dshendriks2014$g, r_dshendriks2014) too_young <- dsaeschimann2017$p$age <= 22 ae_dsaeschimann2017 <- ae(dsaeschimann2017$g[,!too_young], r_dshendriks2014)
par(mfrow = c(1,2), mar = c(4,5,3,1)) # dshendriks2014 dshendriks2014$p$age_est <- ae_dshendriks2014$age.estimates[,1] rg <- range(c(dshendriks2014$p$age_est, dshendriks2014$p$age)) plot(age_est~age, data = dshendriks2014$p, xlab = "Chronological age", ylab = "Estimated age (r_dshendriks2014)", xlim = rg, ylim = rg, main = "Staging dshendriks2014 on r_dshendriks2014", lwd = 2) points(age_est~age, data = dshendriks2014$p, type = 'l', lty = 2) abline(a = 0, b = 1, lty = 3, lwd = 2) # x=y legend("bottomright", legend = "x = y", lwd=3, col=1, lty = 3, bty='n') # dsaeschimann2017 dsaeschimann2017$p[!too_young, "age_est"] <- ae_dsaeschimann2017$age.estimates[,1] rg <- range(c(dsaeschimann2017$p$age_est[!too_young], dsaeschimann2017$p$age[!too_young])) plot(age_est~age, data = dsaeschimann2017$p[!too_young,], xlab = "Chronological age", ylab = "Estimated age (r_dshendriks2014)", xlim = rg, ylim = rg, main = "Staging dsaeschimann2017 on r_dshendriks2014", lwd = 2, col = factor(dsaeschimann2017$p$strain)) invisible(sapply(levels(dsaeschimann2017$p$strain), function(l){ s <- dsaeschimann2017$p$strain == l & !too_young points(age_est~age, data = dsaeschimann2017$p[s,], type = 'l', lty = 2, col = which(l==levels(factor(dsaeschimann2017$p$strain)))) })) abline(a = 0, b = 1, lty = 3, lwd = 2) legend("bottomright", bty='n', legend = c("let-7", "lin-41", "let-7/lin-41", "N2", "x = y"), lwd=3, col=c(1:4, 1), pch = c(1,1,1,1,NA), lty = c(rep(NA, 4), 3))
Required packages and variables:
data_folder <- "../inst/extdata/" requireNamespace("wormRef", quietly = T) requireNamespace("utils", quietly = T) requireNamespace("GEOquery", quietly = T) # bioconductor requireNamespace("Biobase", quietly = T) # bioconductor
Note : set data_folder
to an existing path on your system where you want to store the objects.
To build dshendriks2014
, C. elegans late larval development time series from @hendriks2014extensive
To build dsaeschimann2017
, C. elegans larval development time series of 4 strains from @aeschimann2017lin41
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.