knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.align = "center", out.width = '100%', fig.wide = TRUE, cache= T ) options(width=100) # path to extra figures figpath <- "../inst/cdoc/RAPToR-DEcorrection_figs/"
transp <- function(col, a=.5){ # Make a color transparent colr <- col2rgb(col) return(rgb(colr[1,], colr[2,], colr[3,], a*255, maxColorValue = 255)) } twoTicks <- function(side = c(1,2), col = NA, col.ticks = 1, las = c(1,2), ...){ # Generate 2 (edge) ticks on given axis of current plot # col = NA & col.ticks = 1 makes the axis line dissapear, but keeps ticks for(i in seq_along(side)){ axt <- axTicks(side[i]) axis(side[i], at = axt[c(1, length(axt))], col = col, col.ticks = col.ticks, las = las[i], ...) } }
gg_logFC <- function(x, y=NULL, rg = range(xy, na.rm = T)*1.02, l.breaks = log1p(c(0, 1, 5, 10, 50, 100, Inf)), l.labels = c('1', '5', '10', '50', '100', '100+'), nbins = 200, xlab = "log2(FC) in x", ylab = "log2(FC) in y", main = "", get.r =T, add.vd = T, DEgsel = NULL, ...){ # Make a 2d binned hexplot for showing logFC comparison require(ggplot2) require(viridisLite) if(is.null(y)) xy <- x else xy <- cbind(x, y) xy <- as.data.frame(xy) colnames(xy) <- c("x", "y") g <- ggplot(data = xy, mapping = aes(x=x, y=y)) + stat_bin_hex(aes(fill = after_stat(cut(log1p(after_stat(count)), breaks = l.breaks, labels = F, right = T, include.lowest = T))), bins=nbins) + scale_fill_gradientn(colors = viridisLite::inferno(length(l.breaks)), name = 'count', labels = l.labels) + theme_classic() + xlim(rg) + ylim(rg) + xlab(xlab) + ylab(ylab) + ggtitle(main) + coord_fixed() if(get.r){ if(any(is.na(xy))){ message("Warning: removed NA values to compute correlation coefs") rmsel <- apply(xy, 1, function(r) any(is.na(r))) xy <- xy[!rmsel,] if(!is.null(DEgsel)) DEgsel <- DEgsel[!rmsel] } cc <- cor(xy)[1,2] cc2 <- cor(xy, method='spearman')[1,2] cctxt <- paste0("r = ", round(cc, 3), '\nrho = ', round(cc2, 3)) if(add.vd) cctxt <- paste0(cctxt, "\n", round(100*(cc^2), 1), "% VarDev") if(!is.null(DEgsel)){ ccDE <- cor(xy[DEgsel,])[1,2] cctxt <- paste0(cctxt, "\nr(DE) = ", round(ccDE, 3)) if(add.vd) cctxt <- paste0(cctxt, "\n", round(100*(ccDE^2), 1), "% VarDev(DE)") } g <- g + annotate(geom="text", hjust=1, vjust=0, x = rg[2], y = rg[1], label = cctxt) } return(g) }
run_DESeq2_ref <- function(X, p, formula, ref, ae_values=NULL, window.extend=1, ns.df=3){ # Run DEseq2 wt vs. mutant correcting for development with ref. data. # Do no not specify age in the formula, it is added directly by the function. # Age estimates should either be an 'ae' column of p or given as 'ae_values'. require(DESeq2) if(!any(colnames(p)=='ae') & is.null(ae_values)){ stop("Age estimates should either be a column of p, or given to ae_values.") } if(!is.null(ae_values)){ p$ae <- ae_values } ## Extract reference expression profiles in sample time window w.rg <- range(p$ae) + c(-window.extend, window.extend) w.idx <- seq( max(c(1, which.min(abs(w.rg[1]-ref$time))-1 )), min(c(length(ref$time), which.min(abs(w.rg[2]-ref$time))+1)) ) # ref window time values w.ts <- ref$time[w.idx] # ref window expression values w.GE <- ref_2counts(ref = ref, ae_values = w.ts) ## Join ref & sample data nX <- ncol(X) nR <- ncol(w.GE) # get overlapping genes & join expression data ovl <- RAPToR::format_to_ref(samp = X, refdata = w.GE, verbose = F) Xj <- as.matrix(cbind(ovl$refdata, ovl$samp)) # get relevant fields from p f0 <- as.formula(formula) p2 <- p[, attr(terms(f0), "term.labels"), drop=F] # get 1st level of each variable lev0 <- lapply(p2, function(col) levels(col)[1]) # join p data and add Time and ref/sample Batch pj <- cbind( Time = c(w.ts, p$ae), Batch = factor(rep(c('r', 's'), c(nR, nX))), rbind(do.call(cbind, lapply(lev0, rep, times=nR)), p2) # other terms ) rownames(pj) <- colnames(Xj) # Estimate dispersions with samples only s0 <- pj$Batch=="s" dds0 <- DESeqDataSetFromMatrix(countData = Xj[,s0], colData = pj[s0,], design = f0) dds0 <- estimateSizeFactors(dds0) dds0 <- estimateDispersions(dds0, fitType = "local") dd <- dispersions(dds0) # store dispersions dd[is.na(dd)] <- 0 # remove NAs ## Build full DE model with reference # Add Time and ref/sample batch to model formula f1 <- update.formula(f0, substitute( ~ splines::ns(Time, df = ns.df) + Batch + ., list(ns.df=ns.df) )) dds <- DESeqDataSetFromMatrix(countData = Xj, colData = pj, design = f1) dds <- estimateSizeFactors(dds) # inject dispersions from sample-only model dispersions(dds) <- dd dds <- nbinomWaldTest(dds) return(dds) }
The developmental speed of fast-growing organisms such as C. elegans is influenced by many different factors -- including experimental conditions themselves -- making it difficult or impossible to collect age-matched individuals between conditions. For example, if a mutant has delayed development but controls and mutants are collected at the same chronological (and therefore different physiological) time, the perturbation of interest will be completely confounded with development (a). Because of this, purely developmental gene expression changes can be mislabeled as the effect of a variable of interest (b).
knitr::include_graphics(file.path(figpath,"dev_shift.png"))
When sample groups have age differences but still overlap, the developmental effect can simply be accounted for by including age as a covariate in the Differential Expression (DE) analysis. If there is no age overlap however, it becomes impossible to know whether an observed effect is due to the perturbation or age since both variables are completely confounded.
Using RAPToR
reference data, we can bridge the gap between non-overlapping sample groups and rescue otherwise impossible DE analyses.
load("../inst/extdata/sc4_qsdata.RData")
ref_compare()
# load libraries library(RAPToR) library(wormRef)
Given transcript per million (TPM) expression profiles from 3 control and 3 mutant samples.
head(qs$tpm) print(qs$pdat)
First, infer sample age with RAPToR.
# load reference r_cel <- prepare_refdata("Cel_larv_YA", "wormRef", 600) # estimate sample age ae_qs <- ae(qs$tpm, r_cel)
plot(ae_qs, group = qs$pdat$strain, g.line=3, lmar=5)
Run ref_compare()
to estimate developmental expression changes between groups.
qs_rc <- ref_compare( X = qs$tpm, # sample data, log(TPM+1) ref = r_cel, # ref object ae_obj = ae_qs, # ae object group = qs$pdat$strain # factor defining compared groups (wt/mut) )
Plot sample log2 fold-changes (logFCs) vs. expected developmental logFCs inferred from the reference.
qs_lfc <- get_logFC(qs_rc) plot(qs_lfc, cex=.2, asp=1, xlab = "ctr vs. mut log2FCs", xlim = c(-10,10), ylab = "Developmental log2FCs", ylim = c(-10, 10))
Print the correlation between sample and reference logFCs, as well as the average age difference between groups.
print(qs_rc)
In this example, ctr vs. mut logFCs have a correlation of $r round(cor(qs_lfc[,1], qs_lfc[,2]),3)
$ with the expected developmental logFCs computed from the reference data.
Using $r^2$, this corresponds to approximately $r round(100*(cor(qs_lfc[,1], qs_lfc[,2])^2),2)
\%$ of variance explained by development.
Such a large developmental effect between the control and mutant groups is expected as they don't have any age overlap.
For an explanation of the analysis, see How it works.
A custom ggplot
function to plot logFCs is also provided in Plotting functions, below.
gg_logFC(qs_lfc, main = "Developmental effect", xlab = "ctr vs. mut log2FC", ylab = "Developmental log2FC")
# load libraries library(RAPToR) library(wormRef) library(DESeq2) library(splines)
Given raw counts and transcript per million (TPM) expression profiles from 3 control and 3 mutant samples.
head(qs$count) head(qs$tpm) print(qs$pdat)
First, infer sample age with RAPToR.
plot(ae_qs, group = qs$pdat$strain, g.line=3, lmar=5)
Define a reference window to integrate.
r.window <- range(ae_qs$age.estimates[,1]) + c(1,-1) # +1h margin r.idx <- get_refTP(r_cel, ae_values = r.window) # indices in the ref object r.idx <- r.idx[1]:r.idx[2] # extract time and expression values r.time <- r_cel$time[r.idx] r.tpm <- r_cel$interpGE[,r.idx]
Find the optimal spline degree-of-freedom (df) to model expression dynamics in the selected window.
dfs <- 1:8 ssq <- sapply(dfs, function(i){ # compute SSQ of linear model on expression data sum(residuals(lm( t(r.tpm) ~ splines::ns(r.time, df=i) ))^2) }) plot(dfs, ssq, type='b', lwd=2) r.df <- 3 # select df=3
Transform reference TPMs into artificial counts using an arbitrary library size.
# get gene lengths g.l <- wormRef::Cel_genes[match(rownames(r.tpm), wormRef::Cel_genes[,"wb_id"]), "transcript_length"] libsize <- 25e6 # convert tpm to artificial counts r.count <- t( (t(exp(r.tpm)-1)/1e6) * (libsize/median(g.l))) * g.l r.count[r.count < 0] <- 0 r.count <- round(r.count)
Combine sample and reference count matrices and pdata.
# filter low expressed genes (at least >5 in one sample) qs$count <- qs$count[apply(qs$count, 1, max)>5, ] # keep overlapping sample and ref genes, and merge in one matrix comb.count <- do.call(cbind, format_to_ref(qs$count, r.count)[1:2]) # combine pdata comb.p <- data.frame( time = c(ae_qs$age.estimates[,1], r.time), strain = c(qs$pdat$strain, factor(rep('ctr', ncol(r.count)), levels = c('ctr', 'mut'))), # ref is ctr batch = factor(rep(c('samp', 'ref'), c(ncol(qs$count), ncol(r.count))), levels = c("samp", "ref")) )
Infer gene dispersions by fitting a model on sample data only.
# build DESeq object dd0 <- DESeqDataSetFromMatrix(countData = comb.count[ ,1:6], colData = comb.p[1:6, ], design = ~strain) # no age yet # compute gene dispersions dd0 <- estimateSizeFactors(dd0) dd0 <- estimateDispersions(dd0, fitType = "local") d0 <- dispersions(dd0) # store dispersions d0[is.na(d0)] <- 0 # remove NAs
Build a model with combined reference and sample data, injecting previously estimated gene dispersions.
dd1 <- DESeqDataSetFromMatrix( countData = comb.count, colData = comb.p, design = ~ splines::ns(time, df=3)+batch+strain # use df found above ) dd1 <- estimateSizeFactors(dd1) # inject dispersions from sample-only model dispersions(dd1) <- d0 dd1 <- nbinomWaldTest(dd1)
Extract DE results.
res <- results(dd1) summary(res)
DESeq2::plotMA(dd1)
Results are a standard DESeq2 output and can be manipulated as such.
For an explanation of the analysis, see How it works.
Custom functions to transform reference data into artificial counts and run DESeq2 as described above are given in DE functions, below (please note they may require tweaking). The following is equivalent to the code above.
dd1 <- run_DESeq2_ref( X = qs$count, # sample counts p = qs$pdat, # sample pdata formula = "~ strain", # formula (without age) ref = r_cel, # ref. object ae_values = ae_qs$age.estimates[,1], # sample age estimates window.extend = 1, # ref. window extension ns.df = 3 # spline df for ref. window )
Between two conditions 'A' and 'B' where the sample groups have developmental differences, the expression changes (or log-fold changes, logFCs) observed between the groups will be a combination of perturbation and developmental effects (a).
We can determine the expression changes expected only from the difference in development using reference expression profiles matching the samples (b). Any correlation between observed (sample) and expected (reference) logFCs will then correspond to the developmental effect, with uncorrelated logFCs corresponding to the perturbation effect (c).
knitr::include_graphics(file.path(figpath,"refcompare_method.png"))
The ref_compare()
function inputs:
ae
object (or age estimate values for the samples),The resulting sample and reference (log2) logFCs can be extracted with get_logFC()
and plotted.
When there are more than 2 compared conditions, you can specify the factor levels to compare to the control (with l
) when extracting the logFCs.
A custom ggplot2
function for plotting logFCs can be found in the Plotting functions section below.
The $r^2$ between sample and reference logFCs is a rough estimate of the percentage of variance explained by development (VarDev
, in the bottom right).
In this case, over $75\%$ of expression changes can be explained by development, which is expected given the large age difference between the sample groups.
The average age difference between groups and correlation between sample and reference are also given directly in the output of ref_compare()
.
print(qs_rc)
When there is no developmental overlap between the sample groups to compare, integrating reference data in the DE analysis makes it possible to recover truly DE genes by bridging the gap, as shown in the cartoon below.
knitr::include_graphics(file.path(figpath,"ref_integration.png"))
Most DE analysis tools input raw counts for their particular statistical properties, so the interpolated reference data must be converted from TPM to (artificial) counts assuming an arbitrary fixed library size ($25\times 10^6$ counts). Then, because gene dispersions needed for statistical testing cannot be estimated from the noiseless artificial reference, they are inferred from a model without reference data and injected into the model with reference data.
Thus, resulting model coefficients (logFCs) between sample groups are corrected for development by the reference, and their respective statistical tests use dispersion values inferred only from samples. This approach improves upon what is presented in the RAPToR article (@bulteau2022real), generating valid p-values.
DISCLAIMER: we provide the functions find_df()
, run_DESeq2_ref()
and sub-functions needed for age correction DE analysis only in this vignette and not as part of the package because (1) they go beyond the scope of RAPToR, and (2) they might need to be tweaked by the user to adapt to their needs and experimental designs (unlike ref_compare()
). This approach should also work with tools other than DEseq2
(e.g. edgeR
).
@miki2017two profiled time-series of C. elegans wild-type (WT) and xrn-2 mutants (GSE97775). Code to download this data and generate the dsmiki2017
object can be found at the end of this vignette
By selecting specific samples of matching development from both time-series, we define a gold-standard of truly DE genes. Then, by sliding the window of WT samples we can evaluate the effect of increasing developmental shifts between the groups on the DE analysis, as well as measure the advantages of integrating reference data in the model.
library(RAPToR) library(wormRef) library(DESeq2) library(splines) library(ROCR) # for plotting library(ggplot2) library(ggpubr) library(viridis) col.palette1 <- c('grey20', 'firebrick', 'royalblue', 'forestgreen') col.palette2 <- viridisLite::viridis(5)
load("../inst/extdata/dsmiki2017.RData")
We start by infering sample age.
# load reference r_cel <- prepare_refdata("Cel_larv_YA", "wormRef", 600) # estimate sample age ae_miki <- ae(dsmiki2017$g, r_cel) dsmiki2017$p$ae <- ae_miki$age.estimates[, 1]
plot(ae_miki, main = "Miki et al. (2017) samples", group = dsmiki2017$p$strain_long, show.boot_estimates = F, color = col.palette1[dsmiki2017$p$strain], lmar = 10, g.line= 1)
We then select a set of 3 WT and 3 mutant samples as the age-matched gold-standard, and define sets of 3 WT samples shifted by 1, 2, 3, 5, and 7 time points compared to the mutants.
GS_wt <- 8:10 # gold standard WT samples GS_mut <- 19:21 # GS mutant samples shifts <- -c(1,2,3,5,7) # number of timepoints to shift subs <- c(list(gold.standard = c(GS_wt, GS_mut)), lapply(shifts, function(s) c(GS_wt + s, GS_mut))) names(subs)[-1] <- paste0('s', abs(shifts))
par(mfrow = c(1,1), pty = "s", xpd=T) plot(ae~age, data = dsmiki2017$p, xlab = "Chronological age (h past L4 stage, 25°C)", ylab = "RAPToR age estimates (h post-hatching, 20°C)", col = col.palette1[dsmiki2017$p$strain], lwd = 2, xaxt = "n", yaxt = "n", bty = "l") ; twoTicks() invisible(sapply(levels(dsmiki2017$p$strain), function(l){ points(ae~age, data = dsmiki2017$p[dsmiki2017$p$strain==l,], type = "l", lty = 2, col = col.palette1[which(levels(dsmiki2017$p$strain)==l)]) })) xyoffsets <- cbind(x=c(.7, .9, 1.1, 1.3, 1.5), y= 0 )#c(.6,.5,.75,.75,.75)) invisible(lapply(seq_along(subs), function(i){ s <- subs[[i]] if(names(subs)[i] == 'gold.standard'){ ym <- dsmiki2017$p$ae[s[4:6]] xm <- (ym - 43)/1.7 yw <- dsmiki2017$p$ae[s[1:3]] xw <- (yw - 43)/1.7 points(xm-.5, ym, type = 'l', lwd = 3, col = col.palette1[2]) text(mean(xm-.5), mean(ym), pos = 2, labels = "xrn-2\nsubset", adj = 1, col = col.palette1[2], font = 2, offset = 1) segments(range(xm-.5), range(ym), range(dsmiki2017$p$age[s[4:6]]), range(ym), col = col.palette1[2], lty = 3) points(xw+.5, yw, type = 'l', lwd = 3, col = col.palette1[1]) text(mean(xw+.75), mean(yw), pos = 4, labels = "WT gold\nstandard", adj = 0, col = col.palette1[1], font = 2, offset = 1) segments(range(xw+.5), range(yw), range(dsmiki2017$p$age[s[1:3]]), range(yw), col = col.palette1[1], lty = 3) } else { y <- dsmiki2017$p$ae[s[1:3]] x <- (y - 43)/1.7 points(x+xyoffsets[i-1,1], y-xyoffsets[i-1,2], type = 'l', lwd = 3, col = col.palette2[i-1]) text(mean(x+xyoffsets[i-1,1]), mean(y-xyoffsets[i-1,2]), pos = 4, labels = paste0("WT ", shifts[i-1]), adj = 0, col = col.palette2[i-1], font = 2, offset = 1) segments(range(x+xyoffsets[i-1,1]), range(y-xyoffsets[i-1,2]), range(dsmiki2017$p$age[s[1:3]]), range(y-xyoffsets[i-1,2]), col = col.palette2[i-1], lty = 3) } })) legend("bottomright", bty = "n", lty = 2, pch = 1, lwd = 2, col = col.palette1[1:2], legend = c("wild-type", "xrn-2(xe31)"), text.col = col.palette1[1:2])
With ref_compare()
, we quantify the effect of development in all the sample sets defined above, and plot the reference vs. sample logFCs.
rcs <- lapply(subs, function(s){ RAPToR::ref_compare( X = dsmiki2017$g[, s], ref = r_cel, ae_values = ae_miki$age.estimates[s,1], group = dsmiki2017$p$strain[s] ) })
ggs <- lapply(1:6, function(i){ logFCs <- RAPToR::get_logFC(rcs[[i]]) gg_logFC(logFCs, rg = c(-10, 10), xlab = "Sample logFC", ylab = "Reference logFC", main = c('Gold-Standard', paste("WT", shifts))[i], get.r = T, add.vd = T) }) ggarrange(plotlist = ggs, ncol=3, nrow = 2, common.legend = T, legend = "right")
Unsurprisingly, age-matched reference logFCs (i.e development) account for increasing proportions of expression changes with larger age differences between groups.
par(mfrow = c(1,1), bty='l', mar = c(5,6,3,5)) ae_dif <- sapply(rcs, function(rci) diff(attr(rci, 'group.stats')$ae_avg)) r_coef <- sapply(rcs, function(rci) cor(rci$coefs$samp[,2], rci$coefs$ref[,2], method = 'pearson')) plot(ae_dif, r_coef, type='b', ylim = c(-0.05,1.0), ylab = "Sample and reference logFCs\ncorrelation (r)", xlab = "Average age difference between groups (h)", lwd=2, las=1) points(ae_dif, r_coef, col = c(1, col.palette2), lwd=5) segments(x0 = ae_dif[4], x1 = ae_dif[6], y0 = 0.75, lwd=4, col = 'grey60') text(sum(ae_dif[4], ae_dif[6])/2, 0.75, pos=1, label="No overlap between groups", col = 'grey40', font = 2, cex=.8) arrows(x0 = .5, x1 = 1, y0 = r_coef[1], lwd=3, col = 'grey20', code = 1, length = .05) text(1, r_coef[1], pos=4, label="WT gold-standard", col = 'grey20', cex=.8) text(ae_dif[-1], r_coef[-1], labels = paste0('WT', shifts), cex = .8, font = 2, col = col.palette2, pos=3)
Using the age-matched mutant and WT samples defined as gold-standard, we find a set truly DE genes using a standard DESeq2
workflow.
We filter genes to keep those overlapping with the reference, and with at least 5 counts in any sample.
g.filt <- dsmiki2017$g.raw[apply(dsmiki2017$g.raw, 1, function(r) max(r)>5),] g.filt <- RAPToR::format_to_ref(g.filt, r_cel$interpGE)$samp
We then build a DESeq model including age and strain and test for significant differences between mutant and WT groups. We define a run_DESeq2_age()
function to do this (see DE functions below for code).
run_DESeq2_age <- function(X, p){ # Run DEseq2 wt vs. mutant (with age covariate) require(DESeq2) rownames(p) <- colnames(X) p$aes <- scale(p$ae) # scale age value dds <- DESeqDataSetFromMatrix(countData = X, colData = p, design = ~aes+strain) dds <- DESeq(dds, test = "Wald", fitType = "local") return(dds) } get_DEres <- function(dds, coefname="strain_xrn2_vs_wt"){ # Get results table from deseq output, managing NAs res <- results(dds, name=coefname) # manage NAs res$padj[is.na(res$padj)] <- 1 res$log2FoldChange[is.na(res$log2FoldChange)] <- 0 return(res) }
# run_DESeq2 <- function(X, p){ # # Run DEseq2 wt vs. mutant (without age) # require(DESeq2) # rownames(p) <- colnames(X) # dds <- DESeqDataSetFromMatrix(countData = X, # colData = p, # design = ~strain) # dds <- DESeq(dds, test="Wald", fitType = "local") # return(dds) # }
DE.GS <- run_DESeq2_age(g.filt[, subs$gold.standard], dsmiki2017$p[subs$gold.standard, ]) res.GS <- get_DEres(DE.GS, coefname = "strain_xrn2_vs_wt") # get results table
We consider genes as DE with the thresholds below.
# Define DE (with p < thr AND |logFC| > thr) thr.p <- 0.01 thr.logFC <- 1.0 # get true DE genes from gold-standard truth <- (res.GS$padj < thr.p) & abs(res.GS$log2FoldChange) >= thr.logFC table(truth)
With thresholds of $p.value < r thr.p
$ and $|logFC| > r thr.logFC
$, we find r table(truth)[2]
DE genes in the absence of developmental effects ("true" DE genes).
We now apply the same model as above on the 5 sample subsets with shifted WT.
DE.shifts <- lapply(subs[-1], function(s){ run_DESeq2_age(g.filt[, s], dsmiki2017$p[s, ]) }) res.shifts <- lapply(DE.shifts, get_DEres)
Precision and recall (P/R) metrics allow us to measure the performance the DE analysis (p-value and logFC) in discriminating truly DE genes (i.e. genes found in the gold-standard DE above) from non-DE genes. An area under the P/R curve of 1 is a perfect classifier.
# compute precision-recall curves rocs_initial_age <- lapply(seq_along(subs[-1]), function(i){ v <- res.shifts[[i]]$padj v[abs(res.shifts[[i]]$log2FoldChange)<=thr.logFC] <- 1 ROCR::prediction(predictions = -v, labels = truth) })
layout(matrix(0:3, byrow = 2, nrow = 1), widths = c(.1,.7,.3,.1)) par(pty="s") # ROCR measures to plot xmes <- "rec" ymes <- "prec" # plot initial invisible(sapply(seq_along(rocs_initial_age), function(i){ plot(performance(rocs_initial_age[[i]], measure = ymes, x.measure = xmes), lwd = 2, col = transp(col.palette2[i], a=1), lty = 1, add=ifelse(i==1, F, T), ylim=c(0,1), xlim = c(0,1), main = "DE performance (no correction)") })) mtext(text = paste0('WT ', shifts), line = 1, side = 4, padj = 0, font = 2, cex = .9, las = 2, col = col.palette2, at = c(.9, .8, .7, .6, .5)) par(pty='m') init_auc <- do.call(rbind, lapply(rocs_initial_age, function(rr) performance(rr, "aucpr")@y.values[[1]])) barplot(t(init_auc), beside = T, col = transp( col.palette2, a=c(.9)), las = 2, border = c(NA), names = paste0('WT',shifts), ylab = "", ylim = c(0,1), cex.names = .8, yaxs = 'i', yaxt='n', main = "Area under curve");twoTicks(2); axis(2, lwd.ticks = 0, labels = F, line=0) title(ylab = "P/R curve AUC", line = 0.5)
We see a sharp decline in the DE model performance in detecting true DE genes as the developmental shift increases, particularly once there is no more developmental overlap between the compared groups (starting at WT-3).
If we select DE genes with the same thresholds as the gold-standard, this drop in performance translates to both a decrease of true positives and an increase of false positives, as shown below.
# get number of true & false DE genes : nDEgen <- do.call(rbind, lapply(res.shifts, function(r){ isDE <- (r$padj < thr.p) & (abs(r$log2FoldChange) >= thr.logFC) cbind(DE.in.truth=sum(isDE & truth), DE.notin.truth=sum(isDE & !truth)) })) nDEgen <- rbind(g=c(sum(truth), 0), nDEgen) rownames(nDEgen) <- c('GS', paste0("WT", shifts))
bpnde <- function(nDE, main="", cpal = c('red', col.palette2), names= rownames(nDE), show_in_gs=F, las=1, ...){ # to plot results n <- nrow(nDE) barplot(t(nDE), ylab = '', las = las, names = names, col = 0, border = NA, cex.names = .8, yaxs = 'i', yaxt='n', main=main, ...);twoTicks(2); axis(2, lwd.ticks = 0, labels = F, line=0) ; title(ylab = 'Nb. DE genes', line = 0.5) sapply(seq_len(n), function(i){ x <- nDE x[-i,] <- NA rownames(x)[-i] <- NA par(lwd=2) barplot(t(x), col=transp(cpal[i], a=c(1,0.5)), border = "white", add=T, axes=F, names=rep("",n)) par(lwd=1) }) if(show_in_gs){ par(xpd=T) text(x = par("usr")[2], y = c(nDE[n,1]/2, nDE[n,1]+nDE[n,2]/2), labels = c("True positives", "False positives"), col = transp(c(1,1), a=c(1, 0.5)), font = 2, srt = -90, cex=.8) par(xpd=F) } }
par(mar=c(4,4,1,1)) bpnde(nDEgen, las=2, show_in_gs = T)
find_df <- function(ref, w, dfs=2:8){ # Compute ssq of spline fits to find optimal df w.idx <- seq(max(c(1L, which.min(abs(w[1]-ref$time))-1 )), min(c(length(ref$time), which.min(abs(w[2]-ref$time))+1))) # get time values of window ts <- ref$time[w.idx] # get PCs of reference window pc <- summary(stats::prcomp(t(ref$interpGE[,w.idx]), scale=F, center=T)) # keep enough components for 99% var spc <- sum(pc$importance[3, ] < 0.99)+1 pcfit <- pc$x[, 1L:spc] w <- pc$importance[2, 1L:spc] # compute ssq of fit residuals with different dfs # for each df, compute weighted ssq of spline fit on ssqs <- cbind(sapply(dfs, function(dfi){ ssq <- sum(w * colSums( stats::residuals(stats::lm(pcfit~splines::ns(ts, df=dfi)))^2 )) }))/(length(ts)) return(ssqs) } log1ptpm_2rawcounts <- function(X, glengths, nreadbygl){ # Transform log1p(tpm) to (artificial) raw counts # note : nreadbygl = colSums(rawcounts/genelengths) if(length(nreadbygl) != ncol(X)) stop("nreadbygl != ncol(X)") if(length(glengths) != nrow(X)) stop("glengths must be of length nrow(X)") X <- t( (t(exp(X) - 1)/1e6) * nreadbygl ) * glengths X[X<0] <- 0 return(round(X)) } ref_2counts <- function(ref, ae_values, gltable = wormRef::Cel_genes[,c("wb_id", "transcript_length")], avg_librarysize = 25e6){ # Get expression profiles of given age from a RAPToR reference as # (artificial) count data. # note : gltable must have WBids as col 1 and gene length as col 2. # ref expression profiles at given timepoints : rX <- RAPToR::get_refTP(ref, ae_values=ae_values, return.idx = F) # transform to counts gl <- gltable[match(rownames(rX), gltable[,1]), 2] rX <- log1ptpm_2rawcounts(rX, gl, nreadbygl = rep(avg_librarysize, ncol(rX))/median(gl)) return(rX) }
We select a reference window spanning the sample age range with an added margin of 1 hour on either side for each subset, and find the optimal spline degree-of-freedom (df) to model expression dynamics covered by the reference window.
We define a find_df()
function which computes the residual sum of squares for a range of spline dfs and choose the optimal df values where a plateau is reached (see DE functions below for code).
dfs <- 2:8 dfs_ssq <- lapply(subs, function(s){ find_df(r_cel, # ref object w = range(ae_miki$age.estimates[s,1]) + c(-1, 1), # time window dfs = dfs) # df range to test }) # optimal df for each subset based on plot below df_opti <- c(3,4,4,4,5,5)
par(pty='s', mfrow = c(1,1), bty='l') plot(range(dfs), range(dfs_ssq)+c(0,12), type = 'n', xaxt = 'n', yaxt = 'n', xlab = '', ylab = "Residual SSQ", bty = 'l'); twoTicks() title(xlab = 'spline df', line = 2) invisible(sapply(seq_along(dfs_ssq), function(i) { padd. <- 2*i # added padding to distinguish between curves points(dfs, padd.+dfs_ssq[[i]], type = 'b', col = c('red',col.palette2)[i], lty=3, lwd=2) points(df_opti[i], padd.+dfs_ssq[[i]][which(dfs==df_opti[i])], cex = 1.75, pch=1) })) legend('topright', legend = c( "GS", paste("WT", shifts), 'Chosen df'), text.font = c(rep(2, 6),1), text.col = c('red', col.palette2, 1), bty = "n", lwd=c(rep(3, 6), 1), lty=c(rep(3, 6), NA), pch=1, col = c('red', col.palette2, 1), pt.cex = c(rep(1, 6), 1.75))
The selected df increases with the developmental shift, which is expected since the reference window to include gets larger and may thus contain more complex dynamics.
Having defined the appropriate fit parameters for the reference, we can integrate in the DE model. We define
run_DESeq2_ref()
to do this (see DE functions below for code).
# run DESeq2 with reference data inclusion on shifted subsets DEref.shifts <- lapply(seq_along(subs), function(i){ s <-subs[[i]] # select sample subset run_DESeq2_ref(X = g.filt[, s], # sample counts p = dsmiki2017$p[s, ], # sample pdata ae_values = dsmiki2017$p$ae[s], # age estimates formula = ~strain, # formula (age is added by the function) ref = r_cel, # ref object ns.df = df_opti[i], # df for spline fit window.extend = 1) # reference window extension }) res_ref.shifts <- lapply(DEref.shifts, get_DEres) # get results table
As done above, let's assess the performance of these models in detecting truly DE genes.
# compute age-corrected precision-recall curves rocs_corrected_age <- lapply(seq_along(subs), function(i){ v <- res_ref.shifts[[i]]$padj v[abs(res_ref.shifts[[i]]$log2FoldChange)<=thr.logFC] <- 1 ROCR::prediction(predictions = -v, labels = truth) })
layout(matrix(0:3, byrow = 2, nrow = 1), widths = c(.1,.7,.4,.1)) par(pty="s") # ROCR measures to plot xmes <- "rec" ymes <- "prec" # plot initial curves in background for comparison invisible(sapply(seq_along(rocs_initial_age), function(i){ plot(performance(rocs_initial_age[[i]], measure = ymes, x.measure = xmes), lwd = 2, col = transp(col.palette2[i], a=.3), lty = 1, add=ifelse(i==1, F, T), ylim=c(0,1), xlim = c(0,1), main = "DE performance (age-corrected)") })) # add corrected model curves invisible(sapply(seq_along(rocs_corrected_age)[-1], function(i){ plot(performance(rocs_corrected_age[[i]], measure = ymes, x.measure = xmes), lwd = 2, col = col.palette2[i-1], lty = 1, add=T) })) # add GS curve plot(performance(rocs_corrected_age[[1]], measure = ymes, x.measure = xmes), lwd = 2, col = 'red', lty = 2, add=T) mtext(text = paste0('WT ', shifts), line = 1, side = 4, padj = 0, font = 2, cex = .9, las = 2, col = col.palette2, at = c(.9, .8, .7, .6, .5)) par(pty='m') aucs <- cbind( init_auc = c(1, init_auc), corr_auc = do.call(rbind, lapply(rocs_corrected_age, function(rr) performance(rr, "aucpr")@y.values[[1]])) ) barplot(t(aucs), beside = T, col = transp(rep(c('red', col.palette2), e=2), a=c(.5,1)), las = 2, border = c(NA), names = c("GS", paste0('WT',shifts)), ylab = "", ylim = c(0,1), cex.names = .8, yaxs = 'i', yaxt='n', main = "Area under curve");twoTicks(2); axis(2, lwd.ticks = 0, labels = F, line=0) title(ylab = "P/R curve AUC", line = 0.5) legend('topright', legend = c("no correction", "age-corrected"), pt.bg = c(transp(c(1,1), a=c(0.3, 1))), pch = 22, col = 0, bty = "n", pt.cex = 3, y.intersp = 1.1, inset = c(0, 0))
Compared to the non-corrected models we strongly rescue the performance of the analysis when there is no overlap between the compared sample groups (starting at WT-3
).
In less shifted subsets or in the gold-standard, we see that the performance of the age-corrected DE analysis to find truly DE genes is comparable to applying no correction. This is expected since developmental dynamics can be properly modeled without reference data as long as there is overlap between the compared groups, nullifying the advantage of adding it to the model.
dsmiki2017
Required libraries and variables:
data_folder <- "../inst/extdata/" library("GEOquery") # bioconductor package requireNamespace("wormRef", quietly = T) requireNamespace("utils", quietly = T)
Note : set the data_folder
variable to an existing path on your system where you want to store the objects.
Given the dsmiki2017
data generated above:
sel <- c(5:7, 19:21) # select 3 WT and 3 mut qs <- list( tpm = dsmiki2017$g[, sel], count = dsmiki2017$g.raw[, sel], pdat = data.frame( sname = paste0(rep(c('ctr', 'mut'), e=3), rep(1:3,2)), strain = factor(rep(c('ctr', 'mut'), e=3)), stringsAsFactors = F) ) colnames(qs$tpm) <- qs$pdat$sname colnames(qs$count) <- qs$pdat$sname save(qs, file = file.path(data_folder, 'sc4_qsdata.RData'), compress = "xz")
Plotting a logFC comparison as binned heatmap (gg_logFC()
)
Make a color transparent (transp()
) and add an axis with 2 tick marks twoTicks()
Running a standard DESeq2 workflow (run_DESeq2_age()
) and extracting a results table (get_DEres()
).
Finding optimal df to fit a reference window (find_df()
) and converting reference data to counts (log1ptpm_2rawcounts()
, ref_2counts()
).
Integrating reference data in a DESeq model (run_DESeq2_ref()
).
options(width = 80)
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.