# # Execute this block to render the script. # devtools::load_all('../..') devtools::load_all('../../../DeLoreanData') rmarkdown::render('McDavid-DeLorean.Rmd')
library(knitr) library(knitcitations) library(rmarkdown) # # knitr options # opts_chunk$set( fig.path = 'figures/McDavid-', stop_on_error = TRUE, fig.width = 12.5, fig.height = 8) # # Citations # cleanbib() cite_options( # hyperlink = 'to.doc', hyperlink = TRUE, # style = 'html', # citation_format = 'text', citation_format = "pandoc", cite.style = "numeric", check.entries = TRUE) # hyperlink = TRUE) bib <- read.bibtex("DeLorean.bib") if (file.exists("config.R")) { source("config.R") } source(system.file("scripts/shared.R", package="DeLorean"))
# suppressMessages(loadfonts()) library(DeLorean) # # Stylesheet # options(markdown.HTML.stylesheet = system.file(file.path('Rmd', 'foghorn.css'), package="DeLorean")) font.family <- "Verdana" font.theme <- theme_update(text=element_text(family=font.family)) theme_set(font.theme)
r date()
r citet(bib[["mcdavid_modeling_2014"]])
assayed actively cycling single
cells in order to examine the confounding effect of the cell cycle on
single cell expression analyses. They measured the expression of 333 genes
in 930 cells across three cell cycle phases and three cell lines.
McDavid et al.'s data is available in the DeLorean
R package. Not all 333
genes are represented as several are rarely expressed.
library(DeLorean) library(DeLoreanData) library(dplyr) library(ggplot2) data(McDavidDeLorean) seed <- getOption("McDavid.seed", 1) set.seed(seed)
Shift positively expressed genes closer to unexpressed genes to improve fit.
dl <- de.lorean( pmax(mcdavid.expr - .69, 0), mcdavid.gene.meta, mcdavid.cell.meta)
dl <- estimate.cell.sizes(dl) ggplot(dl$cell.sizes %>% left_join(dl$cell.meta), aes(x=S.hat, fill=capture)) + geom_histogram(position='dodge') dl <- estimate.cell.sizes(dl, by.capture=FALSE) ggplot(dl$cell.sizes %>% left_join(dl$cell.meta), aes(x=S.hat, fill=capture)) + geom_histogram(position='dodge')
Examine data for empirical Bayes estimation of hyperparameters.
model.name <- getOption("McDavid.model", 'exact') dl <- estimate.hyper(dl, sigma.tau=.5, length.scale=5, model.name=model.name) if (! dl$opts$model.estimates.cell.sizes) { dl <- adjust.by.cell.sizes(dl) ggplot(dl$cell.sizes %>% left_join(dl$cell.meta), aes(x=capture, y=S.hat)) + geom_boxplot() }
genes.high.rank <- ( dl$gene.meta %>% filter(!is.na(cbRank), cbRank < 201) %>% arrange(cbRank)) levels(dl$cell.meta$cellline) cells.PC3 <- mcdavid.cell.meta %>% filter("PC3" == cellline) pca <- prcomp(t(mcdavid.expr[as.character(genes.high.rank$gene), as.character(cells.PC3$cell)]), .scale=TRUE) # Sqrt of eigenvalues qplot(pca$sdev) # Percentage of variance explained qplot(pca$sdev**2 / sum(pca$sdev**2) * 100) # Get the PC scores pc.scores <- as.data.frame(pca$x) pc.scores$cell <- factor(rownames(pca$x), levels=levels(dl$cell.meta$cell)) pc.scores <- pc.scores %>% left_join(dl$cell.meta) # Plot PC1 against PC2 ggplot(pc.scores, aes(x=PC1, y=PC2, color=capture)) + geom_point() # Plot PC1 against PC3 ggplot(pc.scores, aes(x=PC1, y=PC3, color=capture)) + geom_point()
Choose a few cells from the PC3 cell line.
dl <- filter_cells(dl, cells=cells.PC3$cell) max.cells <- getOption("McDavid.max.cells", 0) if (max.cells != 0) { dl <- filter_cells(dl, number=max.cells) }
Choose genes with low $p$-values from the McDavid et al. differential expression test.
genes.for.stan <- dl$gene.meta %>% filter(pvalue > 10) dl <- filter_genes(dl, genes=genes.for.stan$gene) max.genes <- min(getOption("McDavid.max.genes", 0)) if (max.genes != 0) { dl <- filter_genes(dl, number=max.genes) }
Save expression data and meta data.
saveRDS(list(expr = dl$expr, cell.meta = dl$cell.map, gene.meta=dl$gene.map), file='Data/McDavid-input.rds')
inf.method <- getOption("McDavid.method", "vb") num.inits <- getOption("McDavid.num.inits", default.num.cores())
Define and compile the model, find the best initialisation, and fit the model.
The model name is "r model.name
", the inference method is "r inf.method
"
with r num.inits
initialisations, the seed is r seed
and the data have r
dim(dl)[1]
genes and r dim(dl)[2]
cells.
dl <- prepare.for.stan(dl, period=3, hold.out=3) dl <- compile.model(dl) dl <- find.good.ordering(dl, seriation.find.orderings) plot(dl, type='orderings') dl <- pseudotimes.from.orderings(dl, num.to.keep=num.inits) system.time(dl <- fit.model(dl, method=inf.method, num.inits=num.inits)) if ("vb" == inf.method) { pseudotimes.pair.plot(dl) }
dl <- examine.convergence(dl)
Examine posterior.
dl <- process.posterior(dl) # dl <- optimise.best.sample(dl) dl <- analyse.noise.levels(dl)
Calculate expression profiles.
dl <- make.predictions(dl)
Calculate distance between peak time and capture time. Capture times are as defined in CycleBase. This is the naive method we compare our predicted peaks against.
source(system.file("scripts/McDavid-fns.R", package="DeLorean")) naive.peaks <- get.gene.peaks( melt.expr(dl) %>% left_join(dl$cell.map %>%select(cell, capture, obstime))) %>% left_join(dl$gene.map %>% select(gene, cbPeaktime)) %>% filter(! is.na(cbPeaktime)) %>% mutate(capture.dist=peak.distance(cbPeaktime, tau.to.cbtime(peak.tau))) naive.peaks # Calculate RMSE for naive method capture.RMSE <- calc.rms(naive.peaks$capture.dist) capture.RMSE plots <- list() # Plot the naive peaks plots$capture.peaks <- ( ggplot(naive.peaks, aes(x=capture, y=cbPeaktime)) + geom_boxplot() + scale_y_continuous(name="CycleBase peak time") + scale_x_discrete(name="cell cycle phase") + coord_flip()) print(plots$capture.peaks) ggsave('McDavid-capture-peaks.pdf', plot=plots$capture.peaks, width=text.width, height=text.width) ggsave('McDavid-capture-peaks.png', plot=plots$capture.peaks, width=fig.width*3.5, height=fig.height*3, units="in", dpi=100) # Check peak times are correct with random gene. gene.test <- sample(rownames(dl$expr), 1) gene.test gene.map.test <- filter(dl$gene.map, gene.test == gene) expr.test <- dl$expr[gene.test,] expr.test which.max(expr.test) cell.max <- names(which.max(expr.test)) cell.max cell.map.test <- filter(dl$cell.map, cell.max == cell) cell.map.test ( naive.peaks %>% filter(gene.test == gene) %>% select(cbPeaktime, cell, capture, capture.dist) ) stopifnot(cell.max == filter(naive.peaks, gene.test == gene)$cell)
dl <- within(dl, { # Find the peaks in the predicted profiles peak.vs.max <- ( predictions %>% group_by(iter, g) %>% dplyr::summarise(predicted.peak=tau.to.cbtime(tau[which.max(predictedmean)])) %>% left_join(gene.map %>% select(g, gene, cbRank, cbPeaktime, expPeaktime)) %>% filter(! is.na(cbPeaktime)) %>% mutate(dist=peak.distance(predicted.peak, cbPeaktime)) %>% left_join(naive.peaks %>% select(gene, naive.peak=peak.tau, naive.capture=capture, naive.peak.cell=cell, naive.dist=capture.dist)) ) # Calculate the distance between the peaks and the CycleBase peaktimes peak.max.dist <- ( peak.vs.max %>% group_by(iter) %>% dplyr::summarise(rmse=calc.rms(dist)) ) }) sample_n(dl$peak.max.dist, 15) best.peak.rmse <- with(dl, filter(peak.max.dist, best.sample == iter)$rmse)
The RMSE for our model's peak time estimates was r best.peak.rmse
. The
RMSE for the naive method's peak time estimates was r capture.RMSE
an
increase of r (capture.RMSE - best.peak.rmse) / best.peak.rmse * 100
%.
# Plot distribution plots$RMSE <- ( ggplot(dl$peak.max.dist, aes(x=rmse)) + geom_density() + geom_rug() + geom_vline(xintercept=capture.RMSE, linetype='dotted', color='red') + geom_vline(xintercept=best.peak.rmse, linetype='dashed', color='blue') + scale_x_continuous(name="RMSE")) ggsave('McDavid-rmse.pdf', plot=plots$RMSE, width=text.width, height=text.width) ggsave('McDavid-rmse.png', plot=plots$RMSE, width=fig.width*1.75, height=fig.height*1.5, units="in", dpi=200) print(plots$RMSE) # Plot naive distances against predicted plots$naive.vs.predicted <- ( ggplot(dl$peak.vs.max %>% filter(dl$best.sample == iter), aes(x=dist, y=naive.dist, label=gene)) + geom_text() ) ggsave('McDavid-naive-vs-predicted.pdf', plot=plots$naive.vs.predicted, width=2*text.width, height=2*text.width) # # Add the peak times to a profile plot. add.peak.times <- function(dl, gp) { plot.peaktimes <- ( dl$gene.meta # Only use those genes that are in the plot and have a peak time %>% filter(gene %in% gp$data[['gene']], ! is.na(cbPeaktime)) # Convert the CycleBase peak time to pseudotime %>% mutate(peak.tau=cbtime.to.tau(cbPeaktime)) ) # Add the peak times as vertical lines to the plot ( gp + geom_vline(data=plot.peaktimes, aes(xintercept=peak.tau), linetype='dashed', alpha=.7) ) } # # Plot profiles of worst predicted peaks worst.peaks <- ( dl$peak.vs.max %>% filter(dl$best.sample == iter) %>% arrange(naive.dist - dist) %>% head(12) ) plots$worst.profiles <- plot(dl, type='profiles', genes=worst.peaks$gene) plots$worst.profiles <- add.peak.times(dl, plots$worst.profiles) ggsave('McDavid-worst-profiles.pdf', plot=plots$worst.profiles, width=2*text.width, height=2*text.width) # Mean of RMSE posterior.RMSE.mean <- mean(dl$peak.max.dist$rmse) posterior.RMSE.mean plots$peak.vs.max <- ( ggplot(dl$peak.vs.max %>% filter(dl$best.sample == iter), aes(x=cbPeaktime, y=predicted.peak, label=gene)) + geom_point(alpha=.7, size=2) + scale_x_continuous(name="CycleBase peak time") + scale_y_continuous(name="estimated peak time") ) print(plots$peak.vs.max) ggsave('McDavid-peak-vs-max.pdf', plot=plots$peak.vs.max, width=text.width, height=text.width) ggsave('McDavid-peak-vs-max.png', plot=plots$peak.vs.max, width=fig.width*1.75, height=fig.height*1.5, units="in", dpi=200)
# Test for normality with(dl$peak.max.dist, shapiro.test(rmse)) plots$qq <- ( ggplot(dl$peak.max.dist, aes(sample=rmse)) + stat_qq()) print(plots$qq) ggsave('McDavid-rmse-qq.pdf', plot=plots$qq, width=text.width, height=text.width) # Assuming a normal distribution for the posterior RMSE, what is # the likelihood of observing at least as extreme as the # peak capture RMSE. with(dl$peak.max.dist, 1-pnorm(capture.RMSE, mean=mean(rmse), sd=sd(rmse), lower.tail=F))
Plot the profiles.
genes.for.profiles <- with(dl, samples.l$psi %>% filter(best.sample == iter) %>% left_join(samples.l$omega) %>% arrange(-psi/omega) %>% head(6) %>% left_join(gene.map) ) plots$profiles <- ( plot(dl, type="profiles", genes=genes.for.profiles$gene) + scale_x_continuous(name='Pseudotime', limits=c(0, 3)) ) plots$profiles <- ( add.peak.times(dl, plots$profiles) + scale_x_continuous(name='Pseudotime', breaks=c(0,1,2,3), labels=c('G2/M', 'G0/G1', 'S', 'G2/M')) + theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) ) # Resize points and set transparency plots$profiles$layers[[3]]$aes_params$size <- 0.5 plots$profiles$layers[[3]]$aes_params$alpha <- 0.5 print(plots$profiles) ggsave('McDavid-profiles.svg', plots$profiles + plos.theme + theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)), width=plos.width, height=plos.height, units="in") ggsave('McDavid-profiles.tiff', plots$profiles + plos.theme + theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)), width=plos.width, height=plos.height, units="in", dpi=300) ggsave('McDavid-profiles.png', plot=plots$profiles, width=fig.width*3.5, height=fig.height*3, units="in", dpi=100) plot.save.args <- bioinf.sizes do.call(ggsave, c(list('McDavid-profiles.pdf', plot=plots$profiles + bioinf.config + theme(axis.text.x = element_text(angle = 90, hjust = 1))), plot.save.args)) # ggsave('McDavid-profiles.eps', plot=plots$profiles + plos.theme, # width=single.col.width, height=single.col.width/golden.ratio, # units='mm', device='cairo_ps') save.output <- getOption('McDavid.save.output', TRUE)
# Save DeLorean object without fit component saveRDS({dl2 <- dl; dl2$fit <- NULL; dl2}, "Data/McDavid.rds") saveRDS(plots, "Data/McDavid-plots.rds")
# Not executed # Load DeLorean object without fit component devtools::load_all('../..') library(DeLorean) library(dplyr) library(ggplot2) dl <- readRDS("Data/McDavid.rds") plots <- readRDS("Data/McDavid-plots.rds")
date()
R version and packages used:
devtools::session_info()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.