#
# Execute this block to render the script.
#
devtools::load_all('../..')
devtools::load_all('../../../DeLoreanData')
rmarkdown::render('McDavid-DeLorean.Rmd')
library(knitr)
library(knitcitations)
library(rmarkdown)
#
# knitr options
#
opts_chunk$set(
    fig.path = 'figures/McDavid-',
    stop_on_error = TRUE,
    fig.width = 12.5,
    fig.height = 8)
#
# Citations
#
cleanbib()
cite_options(
    # hyperlink = 'to.doc',
    hyperlink = TRUE,
    # style = 'html',
    # citation_format = 'text',
    citation_format = "pandoc",
    cite.style = "numeric",
    check.entries = TRUE)
    # hyperlink = TRUE)
bib <- read.bibtex("DeLorean.bib")
if (file.exists("config.R")) {
    source("config.R")
}
source(system.file("scripts/shared.R", package="DeLorean"))
# suppressMessages(loadfonts())
library(DeLorean)
#
# Stylesheet
#
options(markdown.HTML.stylesheet = system.file(file.path('Rmd', 'foghorn.css'),
                                               package="DeLorean"))
font.family <- "Verdana"
font.theme <- theme_update(text=element_text(family=font.family))
theme_set(font.theme)

r date()

r citet(bib[["mcdavid_modeling_2014"]]) assayed actively cycling single cells in order to examine the confounding effect of the cell cycle on single cell expression analyses. They measured the expression of 333 genes in 930 cells across three cell cycle phases and three cell lines.

Data

McDavid et al.'s data is available in the DeLorean R package. Not all 333 genes are represented as several are rarely expressed.

library(DeLorean)
library(DeLoreanData)
library(dplyr)
library(ggplot2)
data(McDavidDeLorean)
seed <- getOption("McDavid.seed", 1)
set.seed(seed)

Shift positively expressed genes closer to unexpressed genes to improve fit.

dl <- de.lorean(
    pmax(mcdavid.expr - .69, 0),
    mcdavid.gene.meta,
    mcdavid.cell.meta)

Test cell size methods

dl <- estimate.cell.sizes(dl)
ggplot(dl$cell.sizes %>% left_join(dl$cell.meta), aes(x=S.hat, fill=capture)) + geom_histogram(position='dodge')

dl <- estimate.cell.sizes(dl, by.capture=FALSE)
ggplot(dl$cell.sizes %>% left_join(dl$cell.meta), aes(x=S.hat, fill=capture)) + geom_histogram(position='dodge')

Estimate hyperparameters

Examine data for empirical Bayes estimation of hyperparameters.

model.name <- getOption("McDavid.model", 'exact')
dl <- estimate.hyper(dl, sigma.tau=.5, length.scale=5, model.name=model.name)
if (! dl$opts$model.estimates.cell.sizes) {
    dl <- adjust.by.cell.sizes(dl)
    ggplot(dl$cell.sizes %>% left_join(dl$cell.meta), aes(x=capture, y=S.hat)) + geom_boxplot()
}


PCA analysis

genes.high.rank <- (
    dl$gene.meta
    %>% filter(!is.na(cbRank), cbRank < 201)
    %>% arrange(cbRank))
levels(dl$cell.meta$cellline)
cells.PC3 <- mcdavid.cell.meta %>% filter("PC3" == cellline)
pca <- prcomp(t(mcdavid.expr[as.character(genes.high.rank$gene),
                             as.character(cells.PC3$cell)]),
              .scale=TRUE)
# Sqrt of eigenvalues
qplot(pca$sdev)
# Percentage of variance explained
qplot(pca$sdev**2 / sum(pca$sdev**2) * 100)
# Get the PC scores
pc.scores <- as.data.frame(pca$x)
pc.scores$cell <- factor(rownames(pca$x), levels=levels(dl$cell.meta$cell))
pc.scores <- pc.scores %>% left_join(dl$cell.meta)
# Plot PC1 against PC2
ggplot(pc.scores, aes(x=PC1, y=PC2, color=capture)) + geom_point()
# Plot PC1 against PC3
ggplot(pc.scores, aes(x=PC1, y=PC3, color=capture)) + geom_point()

Choose genes and cells

Choose a few cells from the PC3 cell line.

dl <- filter_cells(dl, cells=cells.PC3$cell)
max.cells <- getOption("McDavid.max.cells", 0)
if (max.cells != 0) {
    dl <- filter_cells(dl, number=max.cells)
}

Choose genes with low $p$-values from the McDavid et al. differential expression test.

genes.for.stan <- dl$gene.meta %>% filter(pvalue > 10)
dl <- filter_genes(dl, genes=genes.for.stan$gene)
max.genes <- min(getOption("McDavid.max.genes", 0))
if (max.genes != 0) {
    dl <- filter_genes(dl, number=max.genes)
}

Save expression data and meta data.

saveRDS(list(expr = dl$expr, cell.meta = dl$cell.map, gene.meta=dl$gene.map),
        file='Data/McDavid-input.rds')

Compile and fit model

inf.method <- getOption("McDavid.method", "vb")
num.inits <- getOption("McDavid.num.inits", default.num.cores())

Define and compile the model, find the best initialisation, and fit the model. The model name is "r model.name", the inference method is "r inf.method" with r num.inits initialisations, the seed is r seed and the data have r dim(dl)[1] genes and r dim(dl)[2] cells.

dl <- prepare.for.stan(dl, period=3, hold.out=3)
dl <- compile.model(dl)
dl <- find.good.ordering(dl, seriation.find.orderings)
plot(dl, type='orderings')
dl <- pseudotimes.from.orderings(dl, num.to.keep=num.inits)
system.time(dl <- fit.model(dl, method=inf.method, num.inits=num.inits))
if ("vb" == inf.method) {
    pseudotimes.pair.plot(dl)
}
dl <- examine.convergence(dl)

Analyse posterior

Examine posterior.

dl <- process.posterior(dl)
# dl <- optimise.best.sample(dl)
dl <- analyse.noise.levels(dl)

Profiles

Calculate expression profiles.

dl <- make.predictions(dl)

Examine peak times

Calculate distance between peak time and capture time. Capture times are as defined in CycleBase. This is the naive method we compare our predicted peaks against.

source(system.file("scripts/McDavid-fns.R", package="DeLorean"))
naive.peaks <-
  get.gene.peaks(
      melt.expr(dl) %>%
      left_join(dl$cell.map %>%select(cell, capture, obstime))) %>%
    left_join(dl$gene.map %>% select(gene, cbPeaktime)) %>%
    filter(! is.na(cbPeaktime)) %>%
    mutate(capture.dist=peak.distance(cbPeaktime, tau.to.cbtime(peak.tau)))
naive.peaks
# Calculate RMSE for naive method
capture.RMSE <- calc.rms(naive.peaks$capture.dist)
capture.RMSE
plots <- list()
# Plot the naive peaks
plots$capture.peaks <- (
    ggplot(naive.peaks, aes(x=capture, y=cbPeaktime))
    + geom_boxplot()
    + scale_y_continuous(name="CycleBase peak time")
    + scale_x_discrete(name="cell cycle phase")
    + coord_flip())
print(plots$capture.peaks)
ggsave('McDavid-capture-peaks.pdf', plot=plots$capture.peaks,
       width=text.width, height=text.width)
ggsave('McDavid-capture-peaks.png', plot=plots$capture.peaks,
       width=fig.width*3.5, height=fig.height*3,
       units="in", dpi=100)
# Check peak times are correct with random gene.
gene.test <- sample(rownames(dl$expr), 1)
gene.test
gene.map.test <- filter(dl$gene.map, gene.test == gene)
expr.test <- dl$expr[gene.test,]
expr.test
which.max(expr.test)
cell.max <- names(which.max(expr.test))
cell.max
cell.map.test <- filter(dl$cell.map, cell.max == cell)
cell.map.test
(
    naive.peaks
    %>% filter(gene.test == gene)
    %>% select(cbPeaktime, cell, capture, capture.dist)
)
stopifnot(cell.max == filter(naive.peaks, gene.test == gene)$cell)
dl <- within(dl, {
    # Find the peaks in the predicted profiles
    peak.vs.max <- (
        predictions
        %>% group_by(iter, g)
        %>% dplyr::summarise(predicted.peak=tau.to.cbtime(tau[which.max(predictedmean)]))
        %>% left_join(gene.map %>% select(g, gene, cbRank, cbPeaktime, expPeaktime))
        %>% filter(! is.na(cbPeaktime))
        %>% mutate(dist=peak.distance(predicted.peak, cbPeaktime))
        %>% left_join(naive.peaks %>% select(gene,
                                             naive.peak=peak.tau,
                                             naive.capture=capture,
                                             naive.peak.cell=cell,
                                             naive.dist=capture.dist))
    )
    # Calculate the distance between the peaks and the CycleBase peaktimes
    peak.max.dist <- (
        peak.vs.max
        %>% group_by(iter)
        %>% dplyr::summarise(rmse=calc.rms(dist))
    )
})
sample_n(dl$peak.max.dist, 15)
best.peak.rmse <- with(dl, filter(peak.max.dist, best.sample == iter)$rmse)

The RMSE for our model's peak time estimates was r best.peak.rmse. The RMSE for the naive method's peak time estimates was r capture.RMSE an increase of r (capture.RMSE - best.peak.rmse) / best.peak.rmse * 100%.

# Plot distribution
plots$RMSE <- (
    ggplot(dl$peak.max.dist, aes(x=rmse))
    + geom_density()
    + geom_rug()
    + geom_vline(xintercept=capture.RMSE, linetype='dotted', color='red')
    + geom_vline(xintercept=best.peak.rmse, linetype='dashed', color='blue')
    + scale_x_continuous(name="RMSE"))
ggsave('McDavid-rmse.pdf', plot=plots$RMSE,
       width=text.width, height=text.width)
ggsave('McDavid-rmse.png', plot=plots$RMSE,
       width=fig.width*1.75, height=fig.height*1.5,
       units="in", dpi=200)
print(plots$RMSE)
# Plot naive distances against predicted
plots$naive.vs.predicted <- (
    ggplot(dl$peak.vs.max %>% filter(dl$best.sample == iter),
           aes(x=dist, y=naive.dist, label=gene))
    + geom_text()
)
ggsave('McDavid-naive-vs-predicted.pdf', plot=plots$naive.vs.predicted,
       width=2*text.width, height=2*text.width)
#
# Add the peak times to a profile plot.
add.peak.times <- function(dl, gp) {
    plot.peaktimes <- (
        dl$gene.meta
        # Only use those genes that are in the plot and have a peak time
        %>% filter(gene %in% gp$data[['gene']], ! is.na(cbPeaktime))
        # Convert the CycleBase peak time to pseudotime
        %>% mutate(peak.tau=cbtime.to.tau(cbPeaktime))
    )
    # Add the peak times as vertical lines to the plot
    (
        gp
        + geom_vline(data=plot.peaktimes,
                     aes(xintercept=peak.tau),
                     linetype='dashed', alpha=.7)
    )
}
#
# Plot profiles of worst predicted peaks
worst.peaks <- (
    dl$peak.vs.max
    %>% filter(dl$best.sample == iter)
    %>% arrange(naive.dist - dist)
    %>% head(12)
)
plots$worst.profiles <- plot(dl, type='profiles', genes=worst.peaks$gene)
plots$worst.profiles <- add.peak.times(dl, plots$worst.profiles)
ggsave('McDavid-worst-profiles.pdf', plot=plots$worst.profiles,
       width=2*text.width, height=2*text.width)
# Mean of RMSE
posterior.RMSE.mean <- mean(dl$peak.max.dist$rmse)
posterior.RMSE.mean
plots$peak.vs.max <- (
    ggplot(dl$peak.vs.max %>% filter(dl$best.sample == iter),
           aes(x=cbPeaktime, y=predicted.peak, label=gene))
    + geom_point(alpha=.7, size=2)
    + scale_x_continuous(name="CycleBase peak time")
    + scale_y_continuous(name="estimated peak time")
)
print(plots$peak.vs.max)
ggsave('McDavid-peak-vs-max.pdf', plot=plots$peak.vs.max,
       width=text.width, height=text.width)
ggsave('McDavid-peak-vs-max.png', plot=plots$peak.vs.max,
       width=fig.width*1.75, height=fig.height*1.5,
       units="in", dpi=200)
# Test for normality
with(dl$peak.max.dist, shapiro.test(rmse))
plots$qq <- (
    ggplot(dl$peak.max.dist, aes(sample=rmse))
    + stat_qq())
print(plots$qq)
ggsave('McDavid-rmse-qq.pdf', plot=plots$qq,
       width=text.width, height=text.width)
# Assuming a normal distribution for the posterior RMSE, what is
# the likelihood of observing at least as extreme as the
# peak capture RMSE.
with(dl$peak.max.dist,
     1-pnorm(capture.RMSE, mean=mean(rmse), sd=sd(rmse), lower.tail=F))

Plot the profiles.

genes.for.profiles <- with(dl,
    samples.l$psi
    %>% filter(best.sample == iter)
    %>% left_join(samples.l$omega)
    %>% arrange(-psi/omega)
    %>% head(6)
    %>% left_join(gene.map)
)
plots$profiles <- (
    plot(dl, type="profiles", genes=genes.for.profiles$gene)
    + scale_x_continuous(name='Pseudotime', limits=c(0, 3))
)
plots$profiles <- (
    add.peak.times(dl, plots$profiles)
    + scale_x_continuous(name='Pseudotime',
                         breaks=c(0,1,2,3),
                         labels=c('G2/M', 'G0/G1', 'S', 'G2/M'))
    + theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))
)
# Resize points and set transparency
plots$profiles$layers[[3]]$aes_params$size <- 0.5
plots$profiles$layers[[3]]$aes_params$alpha <- 0.5
print(plots$profiles)
ggsave('McDavid-profiles.svg', plots$profiles + plos.theme
       + theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)),
       width=plos.width, height=plos.height, units="in")
ggsave('McDavid-profiles.tiff', plots$profiles + plos.theme
       + theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)),
       width=plos.width, height=plos.height, units="in", dpi=300)
ggsave('McDavid-profiles.png', plot=plots$profiles,
       width=fig.width*3.5, height=fig.height*3,
       units="in", dpi=100)
plot.save.args <- bioinf.sizes
do.call(ggsave,
    c(list('McDavid-profiles.pdf',
           plot=plots$profiles + bioinf.config +
             theme(axis.text.x = element_text(angle = 90, hjust = 1))),
      plot.save.args))
# ggsave('McDavid-profiles.eps', plot=plots$profiles + plos.theme,
       # width=single.col.width, height=single.col.width/golden.ratio,
       # units='mm', device='cairo_ps')
save.output <- getOption('McDavid.save.output', TRUE)
# Save DeLorean object without fit component
saveRDS({dl2 <- dl; dl2$fit <- NULL; dl2}, "Data/McDavid.rds")
saveRDS(plots, "Data/McDavid-plots.rds")
# Not executed
# Load DeLorean object without fit component
devtools::load_all('../..')
library(DeLorean)
library(dplyr)
library(ggplot2)
dl <- readRDS("Data/McDavid.rds")
plots <- readRDS("Data/McDavid-plots.rds")

Session information

date()

R version and packages used:

devtools::session_info()


JohnReid/DeLorean documentation built on Sept. 27, 2021, 5:45 a.m.