knitr::opts_knit$set(out.format = "html", header = "") knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.align = "center", out.width = '100%' ) options(width=100) transp <- function(col, a=.5){ colr <- col2rgb(col) return(rgb(colr[1,], colr[2,], colr[3,], a*255, maxColorValue = 255)) }
Your organism of interest may not be well-studied or have an abundance of reference time-series data available. However, RAPToR still works when using a close organism as a reference, thanks to the conserved nature of developmental processes across species (particularly in early development).
Indeed, samples can be staged cross-species using ortholog genes.
@li2014comparison defined a set of orthologs between D. melanogaster and C. elegans (hereafter, glist
), which we will use to stage C. elegans single embryos on a D. melanogaster reference (of note, ensembl ortholog sets also work).
The C. elegans time-series of single-embryos was profiled and published by @levin2016mid (Accession : GSE60755, dslevin2016cel
).
The Drosophila melanogaster embryonic development time-series is part of the modENCODE project and published by @graveley2011developmental (data downloaded from fruitfly.org, dsgraveley2011
)
Code to generate glist
(orthologs), dslevin2016cel
(C. elegans data), and dsgraveley2011
(D. melanogaster data) can be found at the end of this section.
library(RAPToR) library(drosoRef) library(limma) library(stats)
load("../inst/extdata/dslevin2016cel.RData") load("../inst/extdata/dsgraveley2011.RData") load("../inst/extdata/sc2_glist.RData")
We start by applying a quantile-normalization and $log(X+1)$ transformation to the data.
dsgraveley2011$g <- limma::normalizeBetweenArrays(dsgraveley2011$g, method = "quantile") dsgraveley2011$g <- log1p(dsgraveley2011$g) dslevin2016cel$g <- limma::normalizeBetweenArrays(dslevin2016cel$g, method = "quantile") dslevin2016cel$g <- log1p(dslevin2016cel$g)
4 outliers in the C. elegans data are then removed from the analysis (see About the outliers).
# outlier samples (see below) rem <- c(sample_0001 = 53L, sample_0002 = 54L, sample_0003 = 58L, sample_0004 = 59L)
To stage samples we must build a reference from the Drosophila data, which happens to be the Dme_embryo
reference of the drosoRef
package.
It can thus be directly loaded with prepare_refdata()
.
# reference built from dsgraveley2011 r_grav <- prepare_refdata("Dme_embryo", "drosoRef", 500)
We then convert the C. elegans gene IDs to their D. melanogaster orthologs, and stage the C. elegans embryos with the Drosophila reference. In this case, many-to-one orthologs are averaged and one-to-many are assigned to the first ID match.
dslevin2016cel$g_dmel <- format_ids(dslevin2016cel$g, glist, from = "wb_id", to = "fb_id") ae_cel_on_dmel <- ae(dslevin2016cel$g_dmel, r_grav)
par(mar = c(4,4,4,2), pty='s') plot(dslevin2016cel$p$age[-rem], ae_cel_on_dmel$age.estimates[-rem,1], xlab = "C. elegans chronological age (h past 4-cell stage)", ylab = "Age estimates on D. melanogaster reference (h past egg-laying)", main = "Staging C. elegans on D. melanogaster\n(dslevin2016cel on r_grav)", lwd = 2, col = "darkblue", las=1, cex = .8) lm_cd <- lm(ae_cel_on_dmel$age.estimates[-rem,1]~ dslevin2016cel$p$age[-rem]) mtext(text = paste("R² =", round(summary(lm_cd)$adj.r.squared, 3)), side = 3, line = -2, at = mean(par("usr")[1:2]))
The gaps we see in the staging results are likely at timings where there are incompatible expression dynamics between the two species.
By re-building the Drosophila reference on the first 2 components which are broad or monotonic, we can keep only these expression dynamics of development in the reference. This can improve staging because it filters out the incompatible dynamics.
# build same model, but restrict interpolation to 2 components instead of 8 m_grav2 <- ge_im(X = dsgraveley2011$g, p = dsgraveley2011$p, formula = "X ~ s(age, bs = 'cr')", nc = 2) # make adjusted reference object r_grav2 <- make_ref(m_grav2, n.inter = 500, t.unit = "h past egg-laying", metadata = list("organism"="D. melanogaster"))
par(mfrow=c(1,2), mar=c(4,4,3,2)) plot(m_grav2, r_grav2, col=2)
We then stage the C. elegans embryos on this adjusted reference.
ae_cel_on_dmel2 <- ae(dslevin2016cel$g_dmel, r_grav2)
par(mar = c(4,4,4,2), pty='s') plot(dslevin2016cel$p$age[-rem], ae_cel_on_dmel2$age.estimates[-rem,1], xlab = "C. elegans chronological age (h past 4-cell stage)", ylab = "Age estimates on adjusted D. melanogaster reference (h past egg-laying)", main = "Staging C. elegans on D. melanogaster \n(dslevin2016cel on r_grav2)", lwd = 2, col = "darkblue", cex = .8) lm_cd2 <- lm(ae_cel_on_dmel2$age.estimates[-rem,1]~ dslevin2016cel$p$age[-rem]) mtext(text = paste("R² =", round(summary(lm_cd2)$adj.r.squared, 3)), side = 3, line = -2, at = mean(par("usr")[1:2]))
We removed 4 outlier samples from the analyses above because they have an erroneous noted age. This is evidenced by their position on principal components, where they appear to be around 2 hours "older" than their specified age.
pca_cel <- stats::prcomp(t(dslevin2016cel$g), rank = 10, center = TRUE, scale = FALSE)
n <- nrow(dslevin2016cel$p) par(mfrow = c(1,3), pty='s') invisible(sapply(1L:3L, function(i){ plot(dslevin2016cel$p$age, pca_cel$x[,i], col=c("darkblue", "deeppink")[1+(1:n)%in%rem], pch=c(1,4)[1+(1:n)%in%rem], lwd=2, xlab = "Chronological age (h past 4-cell stage)", ylab = paste0("PC",i)) arrows(x0 = dslevin2016cel$p$age[rem]+0.2, x1 = dslevin2016cel$p$age[rem] + 2.60, y0 = pca_cel$x[rem,i], length = .05, lwd=2, col = "orange") }))
This is even further confirmed by their estimated age on the Drosophila reference, which places them as expected.
n <- nrow(dslevin2016cel$p) par(mfrow = c(1,3), pty='s') invisible(sapply(1L:3L, function(i){ plot(ae_cel_on_dmel2$age.estimates[,1], pca_cel$x[,i], col=c("darkblue", "deeppink")[1+(1:n)%in%rem], pch=c(1,4)[1+(1:n)%in%rem], lwd=2, xlab = "Estimated age (h post hatching, D. melanogaster)", ylab = paste0("PC",i)) }))
Required packages and variables:
data_folder <- "../inst/extdata/" requireNamespace("wormRef", quietly = T) requireNamespace("utils", quietly = T) requireNamespace("GEOquery", quietly = T) # bioconductor requireNamespace("Biobase", quietly = T) # bioconductor requireNamespace("biomaRt", quietly = T) # bioconductor
Note : set the data_folder
variable to an existing path on your system where you want to store the objects.
To build glist
, ortholog genes between C. elegans and D. melanogaster from @li2014comparison Supplementary Table 1:
tmp_file <- paste0(data_folder, "dmel_cel_orth.zip") tmp_fold <- paste0(data_folder, "dmel_cel_orth/") f_url <- paste0("https://genome.cshlp.org/content/suppl/2014/05/15/", "gr.170100.113.DC1/Supplemental_Files.zip") utils::download.file(url = f_url, destfile = tmp_file) utils::unzip(tmp_file, exdir = tmp_fold) glist <- read.table( paste0(tmp_fold, "Supplementary\ files/TableS1\ fly-worm\ ortholog\ pairs.txt"), skip = 1, h=T, sep = "\t", as.is = T, quote = "\"" ) colnames(glist) <- c("fb_id", "dmel_name", "cel_id", "cel_name") glist$wb_id <- wormRef::Cel_genes[ match(glist$cel_id, wormRef::Cel_genes$sequence_name), "wb_id"] save(glist, file = paste0(data_folder, "sc2_glist.RData"), compress = "xz") # cleanup file.remove(tmp_file) unlink(tmp_fold, recursive = T) rm(tmp_file, tmp_fold, f_url)
To build dsgraveley2011
, first get drosophila genes from ensembl:
Then, download D. melanogaster data from @graveley2011developmental :
To build dslevin2016cel
, C. elegans single-embryo data from @levin2016mid
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.