knitr::opts_knit$set(out.format = "html", header = "") knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.align = "center", out.width = '100%' ) options(width=100) gen_figs <- T transp <- function(col, a=.5){ colr <- col2rgb(col) return(rgb(colr[1,], colr[2,], colr[3,], a*255, maxColorValue = 255)) }
In some species, tissues can develop at different rates, and these rates can vary between individuals. In C. elegans, @perez2017maternal have shown such developmental heterochrony between soma and germline tissues.
By restricting the genes RAPToR uses for staging to those associated with (or expressed in) specific tissues, it is possible to estimate development specific to these tissues.
@rockman2010selection published microarray profiling of 208 recombinant inbred lines of C. elegans N2 and Hawaii (CB4856) strains (Accession: GSE23857, dsrockman2010
).
These 208 samples were described as "developmentally synchronized" in the original article.
However, @francesconi2014effects later showed there is significant developmental spread between samples, spanning around 20 hours of 20°C late-larval development, essentially making this dataset a very high-resolution time course.
We will start by staging samples with all available genes to get their "global age".
Then, we stage samples a second time, but restricting the input to a germline
set of 2554 genes by combining the germline_intrinsic
, germline_oogenesis_enriched
, and germline_sperm_enriched
categories defined in @perez2017maternal (based on previous work by @reinke2004genome). This will give us a germline age for the samples.
Staging the samples a third time, with a soma
set of 2718 genes from the osc
(molting) gene category defined in @hendriks2014extensive, will give us the soma age.
Finally, we evaluate the results of staging using principal (PCA) or independent (ICA) components, that summarize the expression dynamics of the data. For example, we can expect that components corresponding to particular tissues will have noticeably cleaner dynamics along their respective age estimates (e.g. oscillatory molting dynamics would be less noisy with soma age than germline age).
Code to generate dsrockman2010
(RIL expression data) and gsubset
(germline/soma gene sets) can be found at the end of this section
library(RAPToR) library(wormRef) library(ica) library(stats) library(limma) # for plotting library(vioplot)
load("../inst/extdata/dsrockman2010.RData") load("../inst/extdata/sc3_gsubset.RData")
# Copied from supp data of Francesconi & Lehner (2014) francesconi_time <- data.frame( time = c( 4.862660944, 4.957081545, 5.051502146, 5.145922747, 5.240343348, 5.334763948, 5.429184549, 5.52360515, 5.618025751, 5.712446352, 5.806866953, 5.901287554, 5.995708155, 6.090128755, 6.184549356, 6.278969957, 6.373390558, 6.467811159, 6.56223176, 6.656652361, 6.751072961, 6.845493562, 6.939914163, 7.034334764, 7.128755365, 7.223175966, 7.317596567, 7.412017167, 7.506437768, 7.600858369, 7.69527897, 7.789699571, 7.884120172, 7.978540773, 8.072961373, 8.167381974, 8.261802575, 8.356223176, 8.450643777, 8.545064378, 8.639484979, 8.733905579, 8.82832618, 8.82832618, 8.82832618, 8.875536481, 8.875536481, 8.875536481, 8.875536481, 8.875536481, 8.875536481, 8.875536481, 8.875536481, 8.875536481, 8.969957082, 9.017167382, 9.017167382, 9.064377682, 9.064377682, 9.111587983, 9.206008584, 9.206008584, 9.206008584, 9.300429185, 9.394849785, 9.489270386, 9.489270386, 9.489270386, 9.489270386, 9.489270386, 9.583690987, 9.583690987, 9.583690987, 9.583690987, 9.583690987, 9.583690987, 9.583690987, 9.583690987, 9.630901288, 9.725321888, 9.819742489, 9.819742489, 9.819742489, 9.819742489, 9.819742489, 9.819742489, 9.819742489, 9.91416309, 10.00858369, 10.05579399, 10.05579399, 10.05579399, 10.05579399, 10.10300429, 10.19742489, 10.19742489, 10.29184549, 10.29184549, 10.29184549, 10.38626609, 10.38626609, 10.38626609, 10.43347639, 10.43347639, 10.43347639, 10.43347639, 10.43347639, 10.43347639, 10.43347639, 10.43347639, 10.43347639, 10.43347639, 10.43347639, 10.43347639, 10.527897, 10.6223176, 10.6223176, 10.6223176, 10.6223176, 10.6223176, 10.6223176, 10.6223176, 10.6695279, 10.6695279, 10.6695279, 10.7639485, 10.7639485, 10.7639485, 10.8583691, 10.8583691, 10.9527897, 10.9527897, 10.9527897, 11.0472103, 11.1416309, 11.2360515, 11.2360515, 11.3304721, 11.3304721, 11.3776824, 11.472103, 11.56652361, 11.66094421, 11.75536481, 11.84978541, 11.94420601, 12.03862661, 12.13304721, 12.22746781, 12.32188841, 12.41630901, 12.51072961, 12.60515021, 12.69957082, 12.79399142, 12.88841202, 12.98283262, 13.07725322, 13.17167382, 13.26609442, 13.36051502, 13.45493562, 13.54935622, 13.54935622, 13.54935622, 13.54935622, 13.54935622, 13.59656652, 13.69098712, 13.78540773, 13.78540773, 13.78540773, 13.87982833, 13.97424893, 14.06866953, 14.06866953, 14.06866953, 14.16309013, 14.25751073, 14.35193133, 14.44635193, 14.54077253, 14.63519313, 14.72961373, 14.82403433, 14.82403433, 14.82403433, 14.91845494, 14.96566524, 15.01287554, 15.10729614, 15.20171674, 15.29613734, 15.39055794, 15.48497854, 15.57939914, 15.67381974, 15.76824034, 15.86266094, 15.95708155, 16.05150215, 16.14592275, 16.24034335, 16.33476395, 16.42918455, 16.52360515), geo_accession = c( "GSM588291", "GSM588174", "GSM588110", "GSM588097", "GSM588271", "GSM588203", "GSM588105", "GSM588200", "GSM588123", "GSM588122", "GSM588115", "GSM588100", "GSM588171", "GSM588190", "GSM588229", "GSM588206", "GSM588277", "GSM588129", "GSM588175", "GSM588151", "GSM588273", "GSM588216", "GSM588099", "GSM588117", "GSM588179", "GSM588164", "GSM588184", "GSM588092", "GSM588285", "GSM588272", "GSM588228", "GSM588121", "GSM588170", "GSM588194", "GSM588143", "GSM588149", "GSM588156", "GSM588220", "GSM588212", "GSM588089", "GSM588209", "GSM588253", "GSM588091", "GSM588113", "GSM588130", "GSM588202", "GSM588191", "GSM588244", "GSM588227", "GSM588197", "GSM588233", "GSM588292", "GSM588163", "GSM588196", "GSM588224", "GSM588283", "GSM588267", "GSM588257", "GSM588221", "GSM588274", "GSM588090", "GSM588114", "GSM588195", "GSM588265", "GSM588182", "GSM588093", "GSM588157", "GSM588251", "GSM588177", "GSM588188", "GSM588269", "GSM588145", "GSM588205", "GSM588162", "GSM588210", "GSM588166", "GSM588125", "GSM588252", "GSM588207", "GSM588173", "GSM588102", "GSM588286", "GSM588107", "GSM588238", "GSM588189", "GSM588106", "GSM588295", "GSM588192", "GSM588134", "GSM588183", "GSM588103", "GSM588198", "GSM588293", "GSM588218", "GSM588259", "GSM588234", "GSM588137", "GSM588152", "GSM588133", "GSM588250", "GSM588168", "GSM588235", "GSM588148", "GSM588279", "GSM588140", "GSM588241", "GSM588111", "GSM588231", "GSM588128", "GSM588131", "GSM588101", "GSM588088", "GSM588281", "GSM588159", "GSM588249", "GSM588290", "GSM588118", "GSM588154", "GSM588136", "GSM588268", "GSM588204", "GSM588160", "GSM588135", "GSM588098", "GSM588294", "GSM588225", "GSM588181", "GSM588248", "GSM588096", "GSM588217", "GSM588147", "GSM588176", "GSM588116", "GSM588146", "GSM588127", "GSM588104", "GSM588108", "GSM588262", "GSM588223", "GSM588161", "GSM588237", "GSM588172", "GSM588284", "GSM588256", "GSM588165", "GSM588211", "GSM588242", "GSM588169", "GSM588240", "GSM588264", "GSM588219", "GSM588287", "GSM588124", "GSM588178", "GSM588167", "GSM588258", "GSM588232", "GSM588141", "GSM588112", "GSM588208", "GSM588215", "GSM588132", "GSM588278", "GSM588275", "GSM588155", "GSM588153", "GSM588109", "GSM588185", "GSM588138", "GSM588094", "GSM588226", "GSM588236", "GSM588266", "GSM588119", "GSM588222", "GSM588246", "GSM588150", "GSM588261", "GSM588201", "GSM588247", "GSM588186", "GSM588280", "GSM588255", "GSM588288", "GSM588260", "GSM588139", "GSM588245", "GSM588270", "GSM588276", "GSM588199", "GSM588254", "GSM588120", "GSM588144", "GSM588243", "GSM588214", "GSM588180", "GSM588126", "GSM588282", "GSM588187", "GSM588158", "GSM588193", "GSM588213", "GSM588230", "GSM588263", "GSM588142", "GSM588095"), stringsAsFactors = F) rownames(francesconi_time) <- francesconi_time$geo_accession
We start by applying a quantile-normalization and $log(X+1)$ transformation to the data.
dsrockman2010$g <- limma::normalizeBetweenArrays(dsrockman2010$g, method = "quantile") dsrockman2010$g <- log1p(dsrockman2010$g)
Then, we select an appropriate C. elegans reference (larval to young-adult), and stage the samples.
r_ya <- prepare_refdata("Cel_YA_1", "wormRef", n.inter = 400) ae_dsrockman2010 <- ae(dsrockman2010$g, r_ya)
par(mfrow = c(1,2)) plot(ae_dsrockman2010, cex = .5, lmar = 7, CIbar.width = .05, main = paste0("Samples 1 - ", round((nrow(dsrockman2010$p)/2))), subset=1L:round((nrow(dsrockman2010$p)/2))) plot(ae_dsrockman2010, cex = .5, lmar = 7, CIbar.width = .05, main = paste0("Samples ", 1+round((nrow(dsrockman2010$p)/2)), " - ", nrow(dsrockman2010$p)), subset=round(1+(nrow(dsrockman2010$p)/2)):nrow(dsrockman2010$p))
Our estimates confirm the wide developmental spread and match the previously inferred ages by @francesconi2014effects.
par(pty='s') plot(francesconi_time[dsrockman2010$p$geo_accession,"time"], ae_dsrockman2010$age.estimates[,1], lwd = 2, col = "black", main = "RAPToR vs. Francesconi & Lehner (2014) age", xlab = "Francesconi & Lehner (2014) estimates (h past L4 stage, 25°C)", ylab = "RAPToR (global) estimates (h past egg-laying, 20°C)") text(6.5, 67.5, labels=paste0("r = ",round(cor(na.omit(cbind( francesconi_time[dsrockman2010$p$geo_accession,"time"], ae_dsrockman2010$age.estimates[,1])), method = "pearson")[1,2], 3)))
Expression dynamics in the data can be observed through ICA components, and enrichment in component gene loadings (or contributions) allows us to associate them to specific processes.
We find out how many independent components (IC) to extract from how many principal components are needed to explain $90\%$ of the variance in the data.
pca_rock <- summary(stats::prcomp(t(dsrockman2010$g), rank = 30, center = TRUE, scale = FALSE)) nc <- sum(pca_rock$importance["Cumulative Proportion",] < .90) + 1 nc
Then, we perform an ICA extracting the r nc
components.
ica_rock <- ica::icafast(t(scale(t(dsrockman2010$g), center = T, scale = F)), nc = nc)
We can plot the first few independent components along global estimated age of the samples :
par(mfrow = c(3,5), mar = c(4,4,3,1), pty='s') invisible(sapply(1L:15L, function(i){ plot(ae_dsrockman2010$age.estimates[,1], ica_rock$M[,i], main = paste("IC", i), ylab = "IC", xlab = "Global age", cex = .8) }))
It seems that past IC7, components don't have a definite link to developmental processes, which is expected given the (intended) genetic background heterogeneity in the samples.
We can have a closer look at components clearly capturing development, and the enrichment of their loadings for the gene sets of interest.
dev_comps <- 1:7
oo_g <- which(rownames(dsrockman2010$g) %in% gsubset$germline_oogenesis) sp_g <- which(rownames(dsrockman2010$g) %in% gsubset$germline_sperm) so_g <- which(rownames(dsrockman2010$g) %in% gsubset$soma) gs <- factor(c(rep("All genes", nrow(dsrockman2010$g)), rep("Oogen.", length(oo_g)), rep("Sperm.", length(sp_g)), rep("Soma", length(so_g))), levels = c("All genes", "Oogen.", "Sperm.", "Soma")) cols <- c(1, "royalblue", "royalblue", "firebrick")
layout(matrix(1:14, nrow=2, byrow = T), heights = c(.45,.55)) par(pty='s', mar = c(5,4,2,1)) invisible(sapply(dev_comps, function(i){ plot(ae_dsrockman2010$age.estimates[,1], ica_rock$M[,i], main = paste("IC", i), ylab = "IC", xlab = "Global age") })) par(pty='m') invisible(sapply(dev_comps, function(i){ gl <- ica_rock$S[,i] dat <- data.frame(gs = gs, gl = c(gl, gl[oo_g], gl[sp_g], gl[so_g])) boxplot(gl~gs, data = dat, main = paste("Gene loadings on IC", i), at = c(1,.25+(2:4)), ylab = "Gene loadings", xlab = "", outline = F, boxwex = .4, ylim=c(-6,6), las=2, col = transp(cols, a = .4), border = cols, boxlwd = 2) abline(h = 0, lty = 2, col = "grey80") vioplot(gl~gs, data = dat, add = T, h = .3, at = c(1,.25+(2:4)), col = transp(cols, a = .4), border = cols, rectCol = cols, lineCol = cols, lwd = 2, frame.plot = F) abline(v = 1.625, lty = 2, col = "grey80") }))
From the gene loadings above, we can establish that
Now, we stage the samples using only germline or soma gene subsets.
ae_soma <- ae( dsrockman2010$g[rownames(dsrockman2010$g) %in% gsubset$soma,], r_ya) ae_germline <- ae( dsrockman2010$g[rownames(dsrockman2010$g) %in% gsubset$germline,], r_ya)
par(mfrow = c(1,3), pty='s') rg <- c(40,70) pch <- (seq_len(ncol(dsrockman2010$g)) %in% c(80, 141)) + 1 plot(ae_dsrockman2010$age.estimates[,1], ae_soma$age.estimates[,1], lwd = 2, col = "firebrick", xlab = "Global age", ylab = "Soma age", main = "Global vs. Soma age", pch = pch, xlim = rg, ylim = rg) box(lwd = 2, col = "firebrick") abline(a = 0, b = 1, lty = 2, col = "firebrick") plot(ae_dsrockman2010$age.estimates[,1], ae_germline$age.estimates[,1], lwd = 2, col = "royalblue", xlab = "Global age", ylab = "Germline age", main = "Global vs. Germline age", xlim = rg, ylim = rg) box(lwd = 2, col = "royalblue") abline(a = 0, b = 1, lty = 2, col = "royalblue") plot(ae_soma$age.estimates[,1], ae_germline$age.estimates[,1], lwd = 2, xlim = rg, ylim = rg, pch = pch, xlab = "Soma age", ylab = "Germline age", main = "Soma vs. Germline age") abline(a = 0, b = 1, lty = 2, col = "black")
We notice the soma estimates of two samples (marked with $\small{\triangle}$) are quite off from their global or germline age. This caused by the combination of both the fact we used a small gene set for inferring age, and that very similar expression profiles can occur at different times in oscillatory profiles (which is the case for molting genes).
If we look at the global, soma, or germline correlation profiles of one of these samples, we can see 2 peaks of similar correlation for the soma, which is not the case for germline or global correlation.
par(mfrow = c(1,3)) plot_cor(ae_soma, subset = 80) mtext("Soma", side = 3, line = -2, col = "firebrick", font = 2) box(lwd = 2, col = "firebrick") plot_cor(ae_germline, subset = 80, ) mtext("Germline", side = 3, line = -2, col = "royalblue", font = 2) box(lwd = 2, col = "royalblue") plot_cor(ae_dsrockman2010, subset = 80) mtext("Global", side = 3, line = -2, col = "black", font = 2)
This is the perfect situation to use a prior for staging. We can input the global age as a prior so that the correct peak of the soma correlation profile is favored in the erroneous samples.
ae_soma_prior <- ae( dsrockman2010$g[rownames(dsrockman2010$g)%in%gsubset$soma,], r_ya, prior = ae_dsrockman2010$age.estimates[,1], # gaussian prior values (mean) prior.params = 10 # gaussian prior sd )
Indeed, the estimate is now shifted to the first (correct) peak. Note that the correlation profile itself (central black line) is unchanged (dotted lines can be different as they are derived from bootstrap estimates with random gene sets).
par(mfrow = c(1,2)) plot_cor(ae_soma, subset = 80) mtext("Soma", side = 3, line = -1.5, col = "firebrick", font = 2) box(lwd = 2, col = "firebrick") plot_cor(ae_soma_prior, subset = 80, show.prior = T) mtext("Soma with prior", side = 3, line = -1.5, col = "firebrick", font = 2) box(lwd = 2, col = "firebrick")
At the same time, all the other estimates are essentially unchanged.
# 80 & 141 are the offset samples mean((ae_soma$age.estimates[,1] - ae_soma_prior$age.estimates[,1])[-c(80,141)])
Now, we can observe the expression dynamics in the data along tissue-specific estimates.
par(mfcol = c(2,7), pty='s', mar = c(4,4,2,1)) invisible(sapply(dev_comps, function(i){ # plot(ae_dsrockman2010$age.estimates[,1], ica_rock$M[,i], lwd = 2, col = "black", # xlab = "Global age", ylab = "IC", main = paste0("IC", i, " (global age)")) # plot(ae_soma_prior$age.estimates[,1], ica_rock$M[,i], lwd = 2, col = "firebrick", xlab = "Soma age", ylab = "IC", main = paste0("IC", i, " (soma age)")) box(lwd = 2, col = "firebrick") plot(ae_germline$age.estimates[,1], ica_rock$M[,i], lwd = 2, col = "royalblue", xlab = "Germline age", ylab = "IC", main = paste0("IC", i, " (germline age)")) box(lwd = 2, col = "royalblue") }))
As expected, components IC1, IC4, IC5, and IC7 that we previously associated with soma are much cleaner along the soma age, but very noisy along germline age.
At the same time, germline-associated components IC2 and IC3 appear quite noisy along soma age, but are very clean along germline age estimates.
This effect is the result of soma-germline heterochrony between the samples.
Note: In this vignette, we use Cel_YA_1
to stage the samples, which is different from the RAPToR article (@bulteau2022real) which uses Cel_larv_YA
. The article shows a combination of soma-germline heterochrony between the reference and the samples, as well as among the samples. We shortened the analysis for the vignette to only show heterochrony among RILs (e.g. the ICA does not combine reference and samples, so components are different; however, enrichment of soma or germline genes to specific components/dynamics is still clear).
data_folder <- "../inst/extdata/" requireNamespace("wormRef", quietly = T) requireNamespace("utils", quietly = T) requireNamespace("readxl", quietly = T) requireNamespace("GEOquery", quietly = T) # bioconductor requireNamespace("Biobase", quietly = T) # bioconductor requireNamespace("limma", quietly = T) # bioconductor
Note : set the data_folder
variable to an existing path on your system where you want to store the objects.
To generate dsrockman2010
:
To download the soma and germline gene sets (gsubset
) :
library(readxl) # germline sets from Perez et al. (2017) germline_url <- paste0("https://static-content.springer.com/esm/", "art%3A10.1038%2Fnature25012/MediaObjects/", "41586_2017_BFnature25012_MOESM3_ESM.xlsx") germline_file <- paste0(data_folder, "germline_gset.xlsx") utils::download.file(url = germline_url, destfile = germline_file) germline_set <- read_xlsx(germline_file, sheet = 3, na = "NA")[,c(1, 44:46)] germline_set[is.na(germline_set)] <- FALSE germline_set <- cbind(wb_id = germline_set[,1], germline = apply(germline_set[, 2:4], 1, function(r) any(r)), germline_set[, 2:4]) germline <- germline_set[germline_set$germline,1] germline_intrinsic <- germline_set[germline_set$germline_intrinsic,1] germline_oogenesis <- germline_set[germline_set$germline_oogenesis_enriched,1] germline_sperm <- germline_set[germline_set$germline_sperm_enriched,1] # soma set from Hendriks et al. (2014) soma_url <- paste0("https://ars.els-cdn.com/content/image/", "1-s2.0-S1097276513009039-mmc2.xlsx") soma_file <- paste0(data_folder, "soma_gset.xlsx") utils::download.file(url = soma_url, destfile = soma_file) soma_set <- read_xlsx(soma_file, skip = 3, na = "NA")[,c(1, 4)] soma_set$class <- factor(soma_set$class) soma_set$soma <- soma_set$class == "osc" soma_set <- soma_set[soma_set$soma, 1] # save gene sets gsubset <- list(germline = germline, soma = soma_set$`Gene WB ID`, germline_intrinsic = germline_intrinsic, germline_oogenesis = germline_oogenesis, germline_sperm = germline_sperm) save(gsubset, file = paste0(data_folder, "sc3_gsubset.RData"), compress = "xz") # cleanup file.remove(germline_file) file.remove(soma_file) rm(germline_url, germline_file, germline_set, soma_url, soma_file, soma_set)
@francesconi2014effects sample timings (francesconi_time
):
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.