DeLorean analysis of Shalek et al. primary mouse bone-marrow-derived dendritic cells data
In DeLorean: Estimates Pseudotimes for Single Cell Expression Data

#
# Execute this block to render the Rmarkdown.
#
devtools::load_all('../..')
devtools::load_all('../../../DeLoreanData/')
# fit.model <- FALSE
fit.model <- TRUE
rmarkdown::render('Shalek-DeLorean.Rmd')

library(knitr)
library(knitcitations)
library(rmarkdown)
library(functional)
#
# knitr options
#
opts_chunk$set(
    fig.path = 'figures/Shalek-',
    stop_on_error = TRUE,
    fig.width = 12.5,
    fig.height = 8)
#
# Citations
#
cleanbib()
cite_options(
    # hyperlink = 'to.doc',
    hyperlink = TRUE,
    # style = 'html',
    # citation_format = 'text',
    citation_format = "pandoc",
    cite.style = "numeric",
    check.entries = TRUE)
    # hyperlink = TRUE)
bib <- read.bibtex("DeLorean.bib")
if (file.exists("config.R")) {
    source("config.R")
}
source(system.file("scripts/shared.R", package="DeLorean"))

# suppressMessages(loadfonts())
library(DeLorean)
#
# Stylesheet
#
options(markdown.HTML.stylesheet = system.file(file.path('Rmd', 'foghorn.css'),
                                               package="DeLorean"))
font.family <- "Verdana"
font.theme <- theme_update(text=element_text(family=font.family))
theme_set(font.theme)

r date()

Data

Shalek et al.'s data r citet(bib[["shalek_single-cell_2014"]]) is available in the DeLorean R package.

library(DeLorean)
library(DeLoreanData)
library(dplyr)
library(ggplot2)
data(ShalekDeLorean)

Build the DeLorean object.

dl <- de.lorean(
    shalek.A.expr,
    shalek.A.gene.meta,
    shalek.A.cell.meta)
dl$cell.meta <- mutate(dl$cell.meta,
                       precocious=cell %in% c('LPS_1h_S51', 'LPS_1h_S52'))

Filter out the cells we want for the time course.

time.course.cells <- (
    dl$cell.meta
    %>% filter(! is.na(total),
               "" == assay,
               "LPS" == stimulant | "" == stimulant,
               "" == ko,
               FALSE == disrupted,
               total > 1e6,
               "" == replicate))
dl <- filter_cells(dl, cells=time.course.cells$cell)

Re-level the cells by their capture time. This improves the ordering in later plots.

dl$cell.meta$cell <- factor(
    dl$cell.meta$cell,
    levels=(shalek.A.cell.meta %>% arrange(capture))$cell)

Estimate hyperparameters

Examine data for empirical Bayes estimation of hyperparameters.

model.name <- getOption("Shalek.model", 'lowrank')
dl <- estimate.hyper(
    dl,
    sigma.tau=getOption("Shalek.sigma.tau", 1),
    length.scale=getOption("Shalek.length.scale", 5),
    model.name=model.name)
if (! dl$opts$model.estimates.cell.sizes) {
    dl <- adjust.by.cell.sizes(dl)
    ggplot(dl$cell.sizes %>% left_join(dl$cell.meta), aes(x=capture, y=S.hat)) + geom_boxplot()
}

Choose genes and cells

Only use induced genes that have been assigned to a cluster.

induced.genes <- dl$gene.meta %>% filter(! is.na(cluster))
dl <- filter_genes(dl, genes=induced.genes$gene)

Choose genes: take those with highest variance between time points relative to the noise level.

shalek.key.genes <- unique(toupper(c(
    #
    # Cluster I d (core antiviral module; enriched for annotated antiviral and
    #             interferon response genes; for example,-
    "Ifit1", "Irf7",
    #
    # Cluster III c (peaked inflammatory module; showing rapid,
    # yet transient, induction under LPS; for example,
    "Tnf", "Il1a", "Cxcl2",
    #
    # Cluster III d (sustained inflammatory module; exhibiting
    # continued rise in expression under LPS; for example,
    "Mmp14", "Marco", "Il6",
    #
    # Cluster III b (‘maturity’ module; containing markers of
    # dendritic cell maturation; for example,
    "Cd83", "Ccr7", "Ccl22",
    #
    # At 2 h following LPS,
    "Ifnb1",
    # was bimodally expressed
    #
    # Genes encoding key inflammatory cytokines (for example,
    "Tnf", "Cxcl1",
    #
    # Figure 4: core antiviral targets.
    "Rsad2", "Stat2"
)))
clusters <- c("Id", "IIIb", "IIIc", "IIId")
# clusters <- c("Id")
dl <- analyse.variance(dl, adjust.cell.sizes=TRUE)
gene.variances <- dl$gene.var  # Save copy for later
max.genes <- getOption("Shalek.max.genes", 74)
genes.for.stan <- (
    dl$gene.var
    %>% left_join(dl$gene.meta)
    %>% mutate(key=gene %in% shalek.key.genes)
    %>% filter(cluster %in% clusters)
    %>% arrange(- psi.hat / omega.hat)
    %>% head(max.genes))
dl <- filter_genes(dl, genes=genes.for.stan$gene)
# How many come from each cluster?
qplot(genes.for.stan$cluster)

Choose a few cells but make sure we have the precocious cells.

seed <- getOption("Shalek.seed", 1)
set.seed(seed)
max.cells <- getOption("Shalek.max.cells", 0)
if (max.cells != 0) {
    sampled.cells <- sample(colnames(dl$expr), max.cells)
    if (! "LPS_1h_S51" %in% sampled.cells) {
        sampled.cells[1] <- "LPS_1h_S51"
    }
    if (! "LPS_1h_S52" %in% sampled.cells) {
        sampled.cells[2] <- "LPS_1h_S52"
    }
    dl <- filter_cells(dl, cells=sampled.cells)
}

Save expression data and meta data.

saveRDS(list(expr=dl$expr,
             cell.meta=dl$cell.map,
             gene.meta=dl$gene.map,
             gene.variances=gene.variances),
        file='Data/Shalek-input.rds')

Fit model

num.inits <- getOption("Shalek.num.inits", default.num.cores())
inf.method <- getOption("Shalek.method", "vb")

Define and compile the model, find the best initialisation, and fit the model. The model name is "r model.name", the inference method is "r inf.method" with r num.inits initialisations, the seed is r seed and the data have r dim(dl)[1] genes and r dim(dl)[2] cells.

dl <- prepare.for.stan(dl)
dl <- compile.model(dl)
dl <- find.good.ordering(dl, seriation.find.orderings)
# dl <- find.good.ordering(dl, seriation.find.orderings, num.cores=2)
# dl <- find.good.ordering(dl, magda.find.orderings)
plot(dl, type='orderings')
dl <- pseudotimes.from.orderings(dl, num.to.keep=num.inits)
system.time(dl <- fit.model(dl, method=inf.method, num.inits=num.inits))
if ("vb" == inf.method) {
    pseudotimes.pair.plot(dl)
}

Analyse different ordering methods

mn.list <- sapply(dl$order.inits, function(i) i$method.name)
mn.mat <- stringr::str_split_fixed(mn.list, ':', 4)
colnames(mn.mat) <- c('method', 'scaled', 'dim.red', 'dims')
mn.df <- as.data.frame(mn.mat) %>%
    mutate(ll=sapply(dl$order.inits, function(i) i$ll))
ggplot2::ggplot(mn.df, aes(x=method, y=ll)) + geom_boxplot()
ggplot2::ggplot(mn.df, aes(x=dim.red, y=ll)) + geom_boxplot()
ggplot2::ggplot(mn.df, aes(x=dims, y=ll)) + geom_boxplot()
ggplot2::ggplot(mn.df, aes(x=scaled, y=ll)) + geom_boxplot()

dl <- examine.convergence(dl)

Analyse posterior

Examine posterior and optimise best sample.

dl <- process.posterior(dl)
dl <- analyse.noise.levels(dl)

Profiles

Calculate expression profiles.

dl <- make.predictions(dl)

Cluster analysis

clustered <- dl$gene.meta %>% filter(! is.na(cluster))
fitted.time.course.cells <- filter(time.course.cells, cell %in% dl$cell.map[['cell']])$cell
clustered.expr <- shalek.A.expr[as.character(clustered$gene),
                                as.character(fitted.time.course.cells)]
# rownames(clustered.expr)
# colnames(clustered.expr)
clustered.expr.l <- melt.expr(dl, clustered.expr) %>% left_join(dl$gene.meta)
names(clustered.expr.l)
# sample_n(clustered.expr.l, 14)
module.scores <- (
    clustered.expr.l
    %>% group_by(cluster, cell)
    %>% dplyr::summarise(module.score=mean(x))
    %>% left_join(dl$samples.l$tau
                  %>% filter(dl$best.sample == iter)
                  %>% dplyr::select(cell, tau)))
module.scores
stopifnot(all(! is.na(module.scores)))
# Find the precocious cells
core.antiviral <- (
    module.scores
    %>% left_join(dl$cell.meta %>% dplyr::select(cell, capture))
    %>% filter("Id" == cluster)
    %>% arrange(-module.score))
precocious <- core.antiviral %>% filter("1h" == capture) %>% head(2)
precocious
precocious$cell
module.scores <- (
    module.scores %>% mutate(type=ifelse(cell %in% precocious$cell,
                                         "precocious",
                                         "not precocious"))
)
plots <- list()
plots$core.antiviral <- (
    ggplot(core.antiviral,
           aes(x=module.score, color=capture))
    + geom_density()
)
print(plots$core.antiviral)
ggsave('Shalek-core-antiviral.pdf', plots$core.antiviral + plos.theme,
       width=2*fig.width, height=2*fig.height)
# Plot the core antiviral, the maturity, the peaked inflammation and
# the sustained inflammation module scores against pseudotime.
plots$module <- (
    ggplot(module.scores
           %>% filter(! is.na(tau),
                      cluster %in% c("Id", "IIIb", "IIIc", "IIId")),
           aes(x=tau, y=module.score, colour=cluster))
    + stat_smooth()
    + geom_point()
)
ggsave('Shalek-module.pdf', plots$module + plos.theme,
       width=2*fig.width, height=2*fig.height)
print(plots$module)
# Same with just core antiviral coloured by capture
plots$core <- (
    ggplot(module.scores
           %>% left_join(dl$cell.meta %>% dplyr::select(cell, capture))
           %>% filter(! is.na(tau),
                      cluster == "Id"),
           aes(x=tau, y=module.score, colour=capture, shape=type))
    + stat_smooth(aes(group="", color=NULL))
    + geom_point(alpha=.5)
)
print(plots$core)
do.call(ggsave,
        c(list('Shalek-core.pdf', plots$core + bioinf.config),
          bioinf.sizes))

#
# Profile plots
plots$profiles <- plot(dl, type='profiles', genes=dl$genes.high.psi[1:4])
# Resize points and set transparency
plots$profiles$layers[[3]]$aes_params$size <- 0.5
plots$profiles$layers[[3]]$aes_params$alpha <- 0.5
ggsave('Shalek-profiles.png', plots$profiles + plos.theme,
       width=6, height=3, units="in", dpi=600)
# Examine what pseudotimes the model estimated for the precocious genes
dplyr::filter(dl$samples.l$tau, dl$best.sample == iter, precocious)

Check precocious

S51.dists <- with(
    dl,
    reshape2::melt(expr - expr[,"LPS_1h_S51"], varnames=c("gene", "cell"))
    %>% mutate(gene=factor(gene, levels=levels(gene.map$gene)))
    %>% mutate(cell=factor(cell, levels=levels(cell.map$cell)))
    %>% left_join(gene.map)
    %>% left_join(cell.map))
names(S51.dists)
# bad.genes <- c(51, 53, 73)
bad.genes <- c()
dl$gene.map[bad.genes,]
gp <- (
    ggplot(S51.dists %>% filter(! g %in% bad.genes),
           aes(x=g, y=c, fill=value))
    + geom_tile()
    + scale_fill_gradient2()
)
png('S51-dists.png', width=960, height=960)
print(gp)
dev.off()
print(gp)

Evaluate held out genes

Evaluate the held out genes that weren't used to fit the model.

held.out.genes <- with(dl, gene.variances
                           %>% left_join(gene.meta)
                           %>% filter(! gene %in% gene.map$gene)
                           %>% filter(cluster %in% clusters)
                           # %>% filter(cluster != "Id")
                           %>% arrange(-psi.hat/omega.hat)
                           %>% head(getOption('Shalek.held.out', 100))
)
# Get an expression matrix of held out genes in the cells of interest
held.out.expr <- shalek.A.expr[as.character(held.out.genes$gene),
                               as.character(dl$cell.map$cell)]

Roughness permutation test

Permutation test for roughness.

dl <- roughness.test(dl, held.out.expr)
print(dl$roughness.test)
plots$roughnesses <- plot(dl, type="roughnesses")
print(plots$roughnesses)
do.call(ggsave,
        c(list('Shalek-roughnesses.pdf', plots$roughnesses + bioinf.config),
          suppl.sizes))

plots$tau.offset <- (
    ggplot(dl$samples.l$tau,
            aes(x=tau.offset, color=capture))
    + geom_density()
    + geom_rug(alpha=.01)
    + stat_function(fun=Curry(dnorm, sd=dl$stan.data$sigma_tau),
                    linetype=2,
                    color="black")
)
ggsave('Shalek-tau-offset.png', plots$tau.offset + plos.theme)

Test plotting held out genes.

held.out.genes <- held.out.select.genes(dl, shalek.A.expr, 25)
held.out <- held.out.melt(dl, shalek.A.expr, held.out.genes)
system.time(posterior <- held.out.posterior(dl, held.out))
gp <- plot.held.out.posterior(dl, posterior)
print(gp)
save.output <- getOption('Shalek.save.output', TRUE)

# Remove large fit component
saveRDS({dl2 <- dl; dl2$fit <- NULL; dl2}, "Data/Shalek.rds")
# dl <- readRDS("Data/Shalek.rds")
saveRDS(plots, "Data/Shalek-plots.rds")

Use dimensionality reduction to look for outliers

pca <- prcomp(t(dl$expr), center=TRUE, scale.=FALSE)
# print(pca)
# summary(pca)
png('Shalek-PCA-var.png')
plot(pca, type = "l")
dev.off()
pca.l <- reshape2::melt(pca$x, varnames=c("cell", "PC"), value.name="x")
# sample_n(pca.l, 10)
tau.best <- dl$samples.l$tau %>% filter(iter==dl$best.sample)
pca.df <-pca.l %>% reshape2::dcast(cell ~ PC) %>% left_join(tau.best)
pca.plot <- ggplot(pca.df,
       aes(x=PC1, y=PC2, label=cell, color=tau, shape=capture)) +
    #geom_text() +
    geom_point() +
    scale_size_continuous(breaks=c(1,2)) +
    scale_colour_gradient(low="red", high="blue")
print(pca.plot)
ggsave('Shalek-PCA.png', pca.plot)

Pseudotime mean vs. standard deviation

pseudo_mean_sd <-
  dl$samples.l$tau %>%
  group_by(c) %>%
  summarise(mean=mean(tau), sd=sd(tau)) %>%
  left_join(dl$cell.map)
mean_sd_plot <-
  ggplot(pseudo_mean_sd, aes(x=mean, y=sd, colour=capture)) +
  geom_point(alpha=.4) +
  bioinf.config
ggsave('Plots/Shalek-mean-sd.pdf', mean_sd_plot,
       width=6, height=4)

Session information

date()

R version and packages used:

sessionInfo()

Any scripts or data that you put into this service are public.

DeLorean documentation built on May 2, 2019, 9:24 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

DeLorean
Estimates Pseudotimes for Single Cell Expression Data

DeLorean analysis of Shalek et al. primary mouse bone-marrow-derived dendritic cells data
In DeLorean: Estimates Pseudotimes for Single Cell Expression Data

Data

Estimate hyperparameters

Choose genes and cells

Fit model

Analyse posterior

Profiles

Cluster analysis

Check precocious

Evaluate held out genes

Roughness permutation test

Use dimensionality reduction to look for outliers

Pseudotime mean vs. standard deviation

Session information

Try the DeLorean package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DeLorean Estimates Pseudotimes for Single Cell Expression Data

DeLorean analysis of Shalek et al. primary mouse bone-marrow-derived dendritic cells data In DeLorean: Estimates Pseudotimes for Single Cell Expression Data

Data

Estimate hyperparameters

Choose genes and cells

Fit model

Analyse posterior

Profiles

Cluster analysis

Check precocious

Evaluate held out genes

Roughness permutation test

Use dimensionality reduction to look for outliers

Pseudotime mean vs. standard deviation

Session information

Try the DeLorean package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DeLorean
Estimates Pseudotimes for Single Cell Expression Data

DeLorean analysis of Shalek et al. primary mouse bone-marrow-derived dendritic cells data
In DeLorean: Estimates Pseudotimes for Single Cell Expression Data