# Update a factor based on the levels of another factor.
#
# @param .factor Factor
# @param reference.factor Factor whose levels to use.
# @param .levels New levels.
# @param ordered Whether the result should be an ordered factor.
#
update.levels <- function(.factor,
reference.factor=NULL,
.levels=levels(reference.factor),
ordered=is.ordered(reference.factor))
{
factor(.factor, levels=.levels, ordered=ordered)
}
# Retrieve the estimated tau for the given sample.
#
# @param dl de.lorean object
# @param sample.iter Which sample to use, defaults to best sample
#
tau.for.sample <- function(dl, sample.iter=dl$best.sample) {
(
dl$samples.l$tau
%>% filter(sample.iter == iter) # Filter correct iteration
%>% arrange(c) # Sort by cell
)$tau
}
# The levels of the gene factor
#
# @param dl The de.lorean object.
#
gene.levels <- function(dl) levels(dl$gene.meta$gene)
# The levels of the cell factor
#
# @param dl The de.lorean object.
#
cell.levels <- function(dl) levels(dl$cell.meta$cell)
#' Melt an expression matrix.
#'
#' @param dl The de.lorean object.
#' @param expr Matrix of expression values.
#'
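#' @examples
#' # A small sketch: melt the expression matrix of a de.lorean object into
#' # long format (assumes the WindramDeLorean example data).
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' expr.l <- melt.expr(dl)
#' head(expr.l)
#'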
#' @export melt.expr
#'
melt.expr <- function(dl, expr=dl$expr) (
expr
%>% melt(varnames=c("gene", "cell"), value.name="x")
%>% mutate(gene=factor(gene, levels=gene.levels(dl)),
cell=factor(cell, levels=cell.levels(dl)))
)
# Cast an expression matrix.
#
# @param expr.l Expression values in long format
#
cast.expr <- function(expr.l) expr.l %>% acast(gene ~ cell, value.var="x")
#' Analyse variance of expression between and within capture times.
#'
#' @param dl de.lorean object
#' @param adjust.cell.sizes Choose whether to adjust the expression values by
#' the cell size estimates
#'
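#' @examples
#' # analyse.variance() is normally called for you by estimate.hyper();
#' # a sketch of the usual entry point, assuming the WindramDeLorean example data.
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- estimate.hyper(dl, adjust.cell.sizes = TRUE)
#' head(dl$gene.var)
#'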
#' @export
#'
analyse.variance <- function(dl, adjust.cell.sizes) {
#
# First melt expression data into long format
expr.adj <-
melt.expr(dl) %>%
left_join(dl$cell.sizes) %>%
# Adjust the expression by the cell size estimates if asked to
mutate(x.hat=x-adjust.cell.sizes*S.hat)
stopifnot(! is.na(expr.adj))
within(dl, {
#
# Resummarise the adjusted expression data by gene
gene.expr <-
expr.adj %>%
group_by(gene) %>%
dplyr::summarise(phi.hat=mean(x.hat))
stopifnot(! is.na(gene.expr))
#
# Examine the expression at each gene/capture time combination
gene.time.expr <-
expr.adj %>%
left_join(cell.meta) %>%
group_by(gene, capture) %>%
dplyr::summarise(
num.capture=n(),
Mgk=mean(x.hat),
Vgk=var(x.hat)) %>%
filter(! is.na(Vgk))
stopifnot(! is.na(gene.time.expr))
stopifnot(nrow(gene.time.expr) > 0) # Must have some rows left
#
# Expected variance of samples from zero-mean Gaussian with covariance K.obs
V.obs <- with(dl,
expected.sample.var(
cov.matern.32(
cov.calc.dists(unique(dl$cell.meta$obstime)),
dl$opts$length.scale)))
#
# Variance within and between times
gene.var <-
gene.time.expr %>%
group_by(gene) %>%
dplyr::summarise(omega.hat=mean(Vgk), psi.hat=var(Mgk)/V.obs) %>%
filter(! is.na(psi.hat),
! is.na(omega.hat),
omega.hat > 0,
psi.hat > 0)
})
}
#' Estimate hyperparameters for model using empirical Bayes.
#'
#' @param dl de.lorean object
#' @param sigma.tau Noise s.d. in the temporal dimension, that is the prior s.d. for tau
#' @param length.scale Length scale for the stationary GP covariance function.
#' Defaults to half the range of the observed capture times.
#' @param model.name The model's name:
#' \itemize{
#' \item 'exact': The model without a low rank approximation
#' that does not estimate the cell sizes.
#' \item 'exactsizes': The model without a low rank approximation
#' that does estimate the cell sizes.
#' \item 'lowrank': Low rank approximation to the 'exact' model.
#' \item 'lowranksizes': Low rank approximation to the 'exactsizes' model.
#' }
#' @param adjust.cell.sizes Adjust by the cell sizes for better estimates of the hyperparameters
#'
#' @examples
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- estimate.hyper(dl)
#'
#' @export
#'
estimate.hyper <- function(
dl,
sigma.tau = .5,
length.scale = NULL,
model.name = 'exact',
adjust.cell.sizes = TRUE
) {
#
# Estimate the cell sizes
# We will need these to estimate the hyper-parameters for the cell size prior if
# the model estimates the cell sizes, or to adjust the data by if it doesn't
dl <- estimate.cell.sizes(dl)
dl <- within(dl, {
#
# Remember options that depend on the model
# Remove hyphens to keep backwards compatibility with old names
opts$model.name <- sub('-', '', model.name)
opts$model.estimates.cell.sizes <- switch(
opts$model.name,
"exactsizes" = TRUE,
"exact" = FALSE,
"lowranksizes" = TRUE,
"lowrank" = FALSE,
stop('Unknown model name'))
#
# Set up temporal hyper-parameters
opts$sigma.tau <- sigma.tau
time.range <- range(cell.meta$obstime)
time.width <- time.range[2] - time.range[1]
if (is.null(length.scale)) {
opts$length.scale <- time.width / 2
} else {
opts$length.scale <- length.scale
}
})
dl <- analyse.variance(dl, adjust.cell.sizes=adjust.cell.sizes)
within(dl, {
hyper <- list(
mu_S=mean(cell.sizes$S.hat),
sigma_S=sd(cell.sizes$S.hat),
mu_phi=mean(gene.expr$phi.hat),
sigma_phi=sd(gene.expr$phi.hat),
mu_psi=mean(log(gene.var$psi.hat), na.rm=TRUE),
sigma_psi=sd(log(gene.var$psi.hat), na.rm=TRUE),
mu_omega=mean(log(gene.var$omega.hat), na.rm=TRUE),
sigma_omega=sd(log(gene.var$omega.hat), na.rm=TRUE),
sigma_tau=opts$sigma.tau,
l=opts$length.scale
)
# Check we don't have any NAs
stopifnot(all(! sapply(hyper, is.na)))
})
}
#' Filter genes
#'
#' @param dl de.lorean object
#' @param number Number to sample if filter function or genes not supplied.
#' @param genes The genes to keep.
#' @param .filter Function that takes a vector of gene names as input and
#' returns a logical vector indicating which to keep
#'
#' @examples
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- filter_genes(dl, number = 37)
#'
#' @export
#'
filter_genes <- function(dl,
.filter=function(x) x %in% genes,
number=NULL,
genes=sample(rownames(dl$expr), number))
{
within(dl, {
expr <- expr[.filter(rownames(expr)),]
message("Have ", nrow(expr), " genes after filtering")
})
}
#' Filter cells
#'
#' @param dl de.lorean object
#' @param number Number to sample if filter function or cells not supplied.
#' @param cells The cells to keep.
#' @param .filter Function that takes a vector of cell names as input and
#' returns a logical vector indicating which to keep
#'
#' @examples
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- filter_cells(dl, number = 7)
#'
#' @export
#'
filter_cells <- function(dl,
.filter=function(x) x %in% cells,
number=NULL,
cells=sample(colnames(dl$expr), number))
{
within(dl, {
expr <- expr[,.filter(colnames(expr))]
message("Have ", ncol(expr), " cells after filtering")
})
}
# Sample so many cells per capture time.
#
# @param dl de.lorean object
# @param cells.per.capture Number of cells to sample from each capture time
#
sample.per.capture <- function(dl, cells.per.capture) {
sample.at.most <- function(.df, number) {
sample_n(.df, min(number, nrow(.df)))
}
sampled.cells <- (
dl$cell.meta
%>% filter(cell %in% colnames(dl$expr))
%>% group_by(capture)
# %>% do(sample_n(., min(cells.per.capture, length(cell))))
%>% do(sample.at.most(., cells.per.capture))
)
filter_cells(dl, cells=sampled.cells$cell)
}
# Sample genes and cells
#
# @param dl de.lorean object
# @param max.cells Maximum number of cells to keep (0 means keep all)
# @param max.genes Maximum number of genes to keep (0 means keep all)
#
sample.genes.and.cells <- function(
dl,
max.cells = 0,
max.genes = 0)
{
within(dl, {
opts$max.cells <- max.cells
opts$max.genes <- max.genes
if (opts$max.cells && ncol(expr) > opts$max.cells) {
expr <- expr[,sample(ncol(expr), opts$max.cells)]
# Remove genes that are not expressed in at least two cells
num.cells.expr <- rowSums(! is.na(expr))
expr <- expr[num.cells.expr > 1,]
}
if (opts$max.genes && nrow(expr) > opts$max.genes) {
expr <- expr[sample(nrow(expr), opts$max.genes),]
}
})
}
#' Calculate inducing pseudotimes for sparse approximation
#'
#' @param dl de.lorean object
#' @param num.inducing Number of inducing points
#' @param period Period of expression patterns
#' @param num.sd.border The size of the border of the inducing inputs
#' around the capture times, in multiples of the prior standard
#' deviation of tau
#'
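#' @examples
#' \dontrun{
#' # Sketch: the inducing pseudotimes depend on the estimated hyperparameters,
#' # so run estimate.hyper() first (WindramDeLorean example data assumed).
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- estimate.hyper(dl, model.name = 'lowrank')
#' u <- calc.inducing.pseudotimes(dl, num.inducing = 30)
#' }
#'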
#' @export
#'
calc.inducing.pseudotimes <- function(dl, num.inducing, period = 0, num.sd.border = 7) with(dl, {
if (period > 0) {
seq(from = 0,
to = period * (1 - 1/num.inducing),
length.out = num.inducing)
} else {
seq(from = time.range[1] - num.sd.border * hyper$sigma_tau,
to = time.range[2] + num.sd.border * hyper$sigma_tau,
length.out = num.inducing)
}
})
#' Prepare for Stan
#'
#' @param dl de.lorean object
#' @param num.test Number of test points to consider
#' @param num.inducing Number of inducing points
#' @param period Period of expression patterns
#' @param hold.out Number of genes to hold out for generalisation tests
#' @param num.sd.border The size of the border of the inducing inputs
#' around the capture times, in multiples of the prior standard
#' deviation of tau
#'
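#' @examples
#' \dontrun{
#' # Sketch: build the Stan data after estimating hyperparameters and
#' # filtering genes; fit.dl() normally does this step for you.
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- estimate.hyper(dl)
#' dl <- filter_genes(dl, number = 37)
#' dl <- prepare.for.stan(dl, num.inducing = 30)
#' str(dl$stan.data, max.level = 1)
#' }
#'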
#' @export prepare.for.stan
#'
prepare.for.stan <- function(
dl,
num.test = 101,
num.inducing = 30, # M
period = 0,
hold.out = 0,
num.sd.border = 7)
within(dl, {
opts$num.test <- num.test
opts$period <- period
opts$periodic <- opts$period > 0
stopifnot(hold.out < nrow(expr))
.G <- nrow(expr) - hold.out
.C <- ncol(expr)
#
# Permute genes to make held out genes random
expr <- expr[sample(.G+hold.out),]
#
# Calculate the map from gene indices to genes and their meta data
gene.map <- (
data.frame(g=1:(.G+hold.out),
gene=factor(rownames(expr),
levels=levels(gene.meta$gene)))
%>% mutate(is.held.out=g>.G)
%>% left_join(gene.expr)
%>% left_join(gene.var)
%>% left_join(gene.meta))
stopifnot(! is.na(gene.map[
c("g", "gene", "phi.hat", "psi.hat", "omega.hat")
]))
#
# Calculate the map from cell indices to genes and their meta data
cell.map <-
data.frame(
c=1:.C,
cell=factor(colnames(expr), levels=levels(cell.meta$cell))) %>%
left_join(cell.meta) %>%
left_join(cell.sizes)
stopifnot(! is.na(cell.map %>% dplyr::select(cell, capture, obstime)))
#
# Calculate the time points at which to make predictions
if (opts$periodic) {
test.input <- seq(0, opts$period, length.out=num.test)
} else {
test.input <- seq(
time.range[1] - num.sd.border * opts$sigma.tau,
time.range[2] + num.sd.border * opts$sigma.tau,
length.out = num.test)
}
#
# Gather all the data into one list
stan.data <- c(
# Hyper-parameters
hyper,
list(
# Dimensions
C=.C,
G=.G,
H=hold.out,
M=num.inducing,
# Data
time=cell.map$obstime,
expr=expr,
phi=gene.map$phi.hat,
# Inducing pseudotimes
u = calc.inducing.pseudotimes(
dl,
num.inducing = num.inducing,
period = period,
num.sd.border = num.sd.border),
# Held out parameters
heldout_psi=filter(gene.map, g > .G)$psi.hat,
heldout_omega=filter(gene.map, g > .G)$omega.hat,
# Generated quantities
numtest=opts$num.test,
testinput=test.input,
# Periodic?
periodic=opts$periodic,
period=opts$period
)
)
})
#' Get the Stan model for a DeLorean object.
#'
#' @param dl de.lorean object
#'
#' @export
#'
get_model <- function(dl) {
model <- stanmodels[[dl$opts$model.name]]
if (is.null(model)) {
stop('Unknown model: ', dl$opts$model.name)
}
model
}
# Define a function to initialise the chains
#
# @param dl de.lorean object
#
make.chain.init.fn <- function(dl) {
function() {
with(dl$stan.data, {
# message("Creating initialisation")
init <- list(
S=dl$cell.map$S.hat,
tau=rnorm(C, mean=time, sd=sigma_tau),
psi=rlnorm(G, meanlog=mu_psi, sdlog=sigma_psi),
omega=rlnorm(G, meanlog=mu_omega, sdlog=sigma_omega)
)
init
})
}
}
# Spread tau values evenly across the range of the capture times.
#
# @param dl de.lorean object
#
even.tau.spread <- function(dl) {
with(dl$stan.data,
seq(min(time) - sigma_tau,
max(time) + sigma_tau,
length=C))
}
# Generate a geometric series base^0, base^1, ... up to max.
# From http://stackoverflow.com/questions/11094822/numbers-in-geometric-progression
geom.series <- function(base, max) {
base^(0:floor(log(max, base)))
}
#' Use seriation package to find good orderings
#'
#' @param dl de.lorean object
#' @param .methods The seriation methods to apply
#' @param scaled Whether to use the scaled and/or unscaled expression data
#' @param dim.red Dimension reduction methods to apply
#' @param dims Number of dimensions to reduce to
#' @param num.cores Number of cores to use in parallel
#' @param num.tau.to.keep How many initialisations to keep
#'
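#' @examples
#' \dontrun{
#' # Sketch: seriation orderings are usually requested via find.good.ordering(),
#' # which appends them to dl$order.inits; dl is assumed to have been through
#' # estimate.hyper() and prepare.for.stan().
#' dl <- find.good.ordering(dl, seriation.find.orderings,
#'                          .methods = c('TSP', 'HC'), dims = 2)
#' orderings.plot(dl)
#' }
#'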
#' @export
#'
seriation.find.orderings <- function(
dl,
# .methods = c("ARSA", "TSP", "R2E", "HC", "GW", "OLO"),
.methods = c("TSP", "R2E", "HC", "GW", "OLO"),
scaled = c('scaled', 'unscaled'),
dim.red = c('none', 'pca', 'kfa', 'ica', 'mds'),
# dim.red = c('mds'),
dims = geom.series(base=2, max=min(8, nrow(dl$expr)-1)),
num.cores = default.num.cores(),
num.tau.to.keep = default.num.cores())
{
#
# Calculate all combinations of parameters
combinations <- expand.grid(method=.methods,
scaled=scaled,
dim.red=dim.red,
dims=dims,
stringsAsFactors=FALSE) %>%
# Only keep one combination with no dimensionality reduction
dplyr::filter(dim.red != 'none' | 1 == dims) %>%
# Cannot do KFA with 1 dimension
dplyr::filter(dim.red != 'kfa' | 1 != dims)
get.expr <- function(scaled) switch(scaled,
scaled = scale(t(dl$expr)),
unscaled = t(dl$expr),
stop('scaled must be "scaled" or "unscaled"')
)
get.expr.mem <- memoise::memoise(get.expr)
get.red <- function(scaled, dim.red, dims=5) {
expr <- get.expr.mem(scaled)
switch(dim.red,
none = expr,
pca = prcomp(expr)$x[,1:dims],
ica = fastICA::fastICA(expr, n.comp=dims)$S,
kfa = t(kernlab::kfa(t(expr), features=dims)@xmatrix),
mds = cmdscale(dist(expr), k=dims),
stop('dim.red must be "none", "ica", "kfa", "mds" or "pca"'))
}
get.red.mem <- memoise::memoise(get.red)
get.dist <- function(scaled, dim.red, dims=5) {
dist(get.red.mem(scaled, dim.red, dims))
}
get.dist.mem <- memoise::memoise(get.dist)
result <- parallel::mclapply(
1:nrow(combinations),
function(i) with(combinations[i,], within(list(), {
method.name <- stringr::str_c(method,
':', scaled,
':', dim.red,
':', dims)
elapsed <- system.time(
per <- seriation::seriate(get.dist.mem(scaled, dim.red, dims),
method=method))
ser.order <- seriation::get_order(per)
ll <- dl$ordering.ll(ser.order)
message(method.name, '; time=', elapsed[3], 's; LL=', ll)
})),
mc.cores=num.cores)
result
}
#' Run a find good ordering method and append results to existing orderings
#'
#' @param dl de.lorean object
#' @param method Function that runs the method
#' @param ... Any other arguments for the method
#'
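#' @examples
#' \dontrun{
#' # Sketch of the ordering step that fit.dl() performs, assuming a dl that
#' # has been through estimate.hyper() and prepare.for.stan().
#' dl <- find.good.ordering(dl, seriation.find.orderings)
#' dl <- pseudotimes.from.orderings(dl)
#' orderings.plot(dl)
#' }
#'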
#' @export
#'
find.good.ordering <- function(dl, method, ...)
{
dl <- create.ordering.ll.fn(dl)
dl$order.inits <- c(
dl$order.inits,
lapply(
method(dl, ...),
function(i) {
i$ser.order <- rev.order.if.better(dl, i$ser.order)
i
}))
dl
}
#' Plot likelihoods of orderings against elapsed times taken
#' to generate them
#'
#' @param dl The DeLorean object
#'
#' @export
#'
orderings.plot <- function(dl) with(dl, {
results.df <- data.frame(
method=sapply(order.inits, function(r) r$method.name),
elapsed=sapply(order.inits, function(r) r$elapsed[3]),
ll=sapply(order.inits, function(r) r$ll))
ggplot2::ggplot(results.df, ggplot2::aes(x=elapsed, y=ll, label=method)) +
ggplot2::geom_text()
})
# Reverse ordering if it is better correlated with observed times
rev.order.if.better <- function(dl, ser.order) {
rev.order <- rev(ser.order)
if (cor(ser.order, dl$cell.map$obstime) < cor(rev.order, dl$cell.map$obstime)) {
ser.order <- rev.order
}
ser.order
}
# Deduplicate orderings
#
deduplicate.orderings <- function(dl) {
orderings <- sapply(dl$order.inits, function(i) i$ser.order)
dupd <- duplicated(t(orderings))
dl$order.inits <- dl$order.inits[!dupd]
dl
}
#' Convert best orderings into initialisations
#'
#' @param dl The DeLorean object
#' @param num.to.keep The number to keep (defaults to default.num.cores())
#'
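#' @examples
#' \dontrun{
#' # Sketch: convert the best orderings found so far into tau initialisations
#' # for the sampler, keeping one per chain.
#' dl <- find.good.ordering(dl, seriation.find.orderings)
#' dl <- pseudotimes.from.orderings(dl, num.to.keep = 4)
#' }
#'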
#' @export
#'
pseudotimes.from.orderings <- function(
dl,
num.to.keep=default.num.cores())
within(deduplicate.orderings(dl), {
order.orderings <- order(sapply(order.inits, function(i) i$ll),
decreasing=TRUE)
#
# Make sure we don't try to keep too many (due to duplicates, etc...)
actually.keep <- min(length(order.orderings), num.to.keep)
if (actually.keep < num.to.keep) {
warning("Don't have enough ", num.to.keep, " pseudotimes to keep,",
" only have ", actually.keep)
}
best.orderings <- order.inits[order.orderings[1:actually.keep]]
tau.inits <- lapply(
best.orderings,
function(O) {
message('Using ordering ', O$method.name, '; LL=', O$ll)
# Create an initialisation using the ordering
init <- init.chain.sample.tau(dl)
init$tau <- even.tau.spread(dl)[O$ser.order]
init
})
})
#' Find the best ordering of the cells assuming a smooth GP prior on the
#' expression profiles over this ordering.
#'
#' @param dl de.lorean object
#' @param psi Temporal variation
#' @param omega Noise
#' @param num.cores Number of cores to run on. Defaults to default.num.cores()
#' @param num.tau.to.try How many initialisations to try
#' @param num.tau.to.keep How many initialisations to keep
#' @param method Method to use "maximise" or "metropolis"
#' @param ... Extra arguments to method
#'
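#' @examples
#' \dontrun{
#' # Sketch: an alternative to the seriation-based orderings; requires a dl
#' # that has been through prepare.for.stan().
#' dl <- find.smooth.tau(dl, method = 'metropolis', num.tau.to.try = 4)
#' }
#'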
#' @export
#'
find.smooth.tau <- function(
dl,
psi = exp(dl$hyper$mu_psi),
omega = exp(dl$hyper$mu_omega),
num.cores = default.num.cores(),
num.tau.to.try = num.cores,
num.tau.to.keep = num.cores,
method = "metropolis",
...
) {
dl <- create.ordering.ll.fn(dl)
log.likelihood <- dl$ordering.ll
dl$tau.inits <- with(dl, {
# Maximise the sum of the log marginal likelihoods
ordering.search <- function(seed) {
set.seed(seed)
# Choose a starting point by random projection
expr.centre <- t(scale(t(expr), center=T, scale=F))
init.ordering <- order(rnorm(nrow(expr.centre)) %*% expr.centre)
# init.ordering <- sample(stan.data$C)
metropolis.fn <- function(ordering, log.likelihood, ...) {
mh.run <- ordering.metropolis.hastings(
ordering,
log.likelihood,
proposal.fn=ordering.random.block.move,
...)
best.sample <- which.max(mh.run$log.likelihoods)
#ordering.maximise(mh.run$chain[best.sample,], log.likelihood)
mh.run$chain[best.sample,]
}
method.fn <- switch(method,
"maximise"=ordering.maximise,
"metropolis"=metropolis.fn,
stop('Unknown method: ', method))
ordering <- method.fn(init.ordering, log.likelihood, ...)
stopifnot(! is.null(ordering))
# Reverse the ordering if it makes it correlate better with
# the capture times
capture.order <- order(stan.data$time)
if (cor(capture.order, ordering) <
cor(capture.order, rev(ordering)))
{
ordering <- rev(ordering)
}
ordering
}
# Choose seeds
seeds <- sample.int(.Machine$integer.max, num.tau.to.try)
# Run in parallel or not?
if (num.cores > 1) {
orderings <- parallel::mclapply(seeds,
mc.cores=num.cores,
ordering.search)
} else {
orderings <- lapply(seeds, ordering.search)
}
# Order the taus by the best orderings
lls <- sapply(orderings, function(o) -log.likelihood(o))
best.order <- order(lls)
# Make the complete chain initialisation with the tau.
lapply(orderings[best.order[1:num.tau.to.keep]],
function(ordering) {
init <- init.chain.sample.tau(dl)
init$tau <- even.tau.spread(dl)[ordering.invert(ordering)]
init
})
})
dl
}
#' Test ordering Metropolis-Hastings sampler.
#'
#' @param dl de.lorean object
#' @param psi Temporal variation
#' @param omega Noise
#' @param num.cores Number of cores to run on.
#' Defaults to getOption("DL.num.cores", max(parallel::detectCores()-1, 1))
#' @param iterations Number of iterations
#' @param thin Thin the samples
#'
test.mh <- function(
dl,
psi = mean(dl$gene.map$psi.hat),
omega = mean(dl$gene.map$omega.hat),
num.cores = getOption("DL.num.cores", max(parallel::detectCores() - 1, 1)),
iterations = 1000,
thin = 15
) {
dl <- create.ordering.ll.fn(dl)
log.likelihood <- dl$ordering.ll
dl$tau.inits <- with(dl, {
# Maximise the sum of the log marginal likelihoods
ordering.search <- function(seed) {
set.seed(seed)
# Choose a random starting point
init.ordering <- sample(stan.data$C)
mh.run <- ordering.metropolis.hastings(
init.ordering,
log.likelihood,
proposal.fn=ordering.random.block.move,
iterations=iterations,
thin=thin)
}
# Choose seeds
seeds <- sample.int(.Machine$integer.max, num.cores)
# Run in parallel or not?
if (num.cores > 1) {
orderings <- parallel::mclapply(seeds,
mc.cores=num.cores,
ordering.search)
} else {
orderings <- lapply(seeds, ordering.search)
}
orderings
})
}
# The covariance function for the DeLorean object.
cov.fn.for <- function(dl) {
cov.matern.32
}
# Choose an initialisation by sampling tau from the prior.
#
# @param dl de.lorean object
#
init.chain.sample.tau <- function(dl) {
with(dl$stan.data, {
init <- list(
alpha=dl$cell.map$alpha.hat,
beta=rep(0, G),
S=dl$cell.map$S.hat,
tau=rnorm(C, time, sd=sigma_tau),
psi=dl$gene.map$psi.hat[1:G],
omega=dl$gene.map$omega.hat[1:G]
)
init$tauoffsets <- init$tau - time
init
})
}
#' Find best tau to initialise chains with by sampling tau from the prior
#' and using empirical Bayes parameter estimates for the other parameters.
#'
#' @param dl de.lorean object
#' @param num.tau.candidates How many candidates to examine. Defaults to 6000.
#' @param num.tau.to.keep How many candidates to keep. Defaults to num.cores.
#' @param num.cores Number of cores to run on. Defaults to default.num.cores()
#'
#' @export
#'
find.best.tau <- function(
dl,
num.tau.candidates = 6000,
num.tau.to.keep = num.cores,
num.cores = default.num.cores())
within(dl, {
#
# Define a function that calculates log probability for
# random seeded tau
try.tau.init <- function(i) {
set.seed(i)
pars <- init.chain.sample.tau(dl)
lp <- rstan::log_prob(fit, rstan::unconstrain_pars(fit, pars),
adjust_transform=FALSE)
list(lp=lp, tau=pars$tau)
}
#
# Choose tau several times and calculate log probability
if (num.cores > 1) {
tau.inits <- parallel::mclapply(1:num.tau.candidates,
mc.cores=num.cores,
try.tau.init)
} else {
tau.inits <- lapply(1:num.tau.candidates, try.tau.init)
}
#
# Which tau gave highest log probability?
tau.inits.order <- order(sapply(tau.inits, function(init) -init$lp))
#
# Just keep so many best tau inits
tau.inits <- tau.inits[tau.inits.order[1:num.tau.to.keep]]
rm(tau.inits.order, try.tau.init)
})
#' Perform all the steps necessary to fit the model:
#' \enumerate{
#' \item prepare the data
#' \item find suitable initialisations
#' \item fit the model using the specified method (sampling or variational Bayes)
#' \item process the posterior
#' }
#'
#' @param dl de.lorean object
#' @param method Fitting method:
#' \itemize{
#' \item 'sample': Use a Stan sampler.
#' See \code{\link{fit.model.sample}}.
#' \item 'vb': Use Stan ADVI variational Bayes algorithm.
#' See \code{\link{fit.model.vb}}.
#' }
#' @param ... Extra arguments for fitting method
#'
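#' @examples
#' \dontrun{
#' # Sketch of the complete workflow, assuming the WindramDeLorean example data.
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- estimate.hyper(dl)
#' dl <- filter_genes(dl, number = 37)
#' dl <- filter_cells(dl, number = 7)
#' dl <- fit.dl(dl, method = 'vb')
#' }
#'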
#' @export
#'
fit.dl <- function(
dl,
method = 'sample',
...)
{
dl <- prepare.for.stan(dl)
dl <- find.good.ordering(dl, seriation.find.orderings)
dl <- pseudotimes.from.orderings(dl)
dl <- fit.model(dl, method=method, ...)
dl <- process.posterior(dl)
dl <- analyse.noise.levels(dl)
}
#' Fit the model using specified method (sampling or variational Bayes).
#'
#' @param dl de.lorean object
#' @param method Fitting method:
#' \itemize{
#' \item 'sample': Use a Stan sampler.
#' See \code{\link{fit.model.sample}}.
#' \item 'vb': Use Stan ADVI variational Bayes algorithm.
#' See \code{\link{fit.model.vb}}.
#' }
#' @param ... Extra arguments for method
#'
#' @export
#'
fit.model <- function(
dl,
method = 'sample',
...)
{
switch(
method,
"sample" = fit.model.sample(dl, ...),
"vb" = fit.model.vb(dl, ...),
stop('Unknown method'))
}
#' Fit the model using Stan sampler
#'
#' @param dl de.lorean object
#' @param num.cores Number of cores to run on.
#' Defaults to getOption("DL.num.cores", max(parallel::detectCores()-1, 1))
#' @param chains Number of chains to run on each core
#' @param thin How many samples to generate before retaining one
#' @param ... Extra arguments for rstan::stan() sampling call
#'
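#' @examples
#' \dontrun{
#' # Sketch: run the Stan sampler directly, one chain per core; extra
#' # arguments such as iter are passed through to rstan::stan().
#' dl <- fit.model.sample(dl, chains = 1, thin = 10, iter = 1000)
#' }
#'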
#' @export
#'
fit.model.sample <- function(
dl,
num.cores = getOption("DL.num.cores", max(parallel::detectCores() - 1, 1)),
chains = 1,
thin = 50,
...)
{
init.chain.good.tau <- make.init.fn(dl)
# Run the chains in parallel
sflist <- parallel::mclapply(
1:num.cores,
mc.cores=num.cores,
function(i)
rstan::stan(
fit=get_model(dl),
data=dl$stan.data,
thin=thin,
init=init.chain.good.tau,
seed=i,
chains=chains,
chain_id=i,
refresh=-1,
...))
dl$fit <- rstan::sflist2stanfit(sflist)
return(dl)
}
#' Returns a function that constructs parameter settings with good tau.
#'
#' @param dl de.lorean object
#'
make.init.fn <- function(dl) {
function(chain_id) {
stopifnot(chain_id <= length(dl$tau.inits))
#
# Create random parameters
pars <- make.chain.init.fn(dl)()
#
# Replace tau with good tau
pars$tau <- dl$tau.inits[[chain_id]]$tau
pars
}
}
#' Average across a parameter's samples.
#'
#' @param s An array of any dimension in which the first dimension
#' indexes the samples
#'
avg.par.samples <- function(s) {
n.dims <- length(dim(s))
if (n.dims > 1) {
apply(s, 2:n.dims, mean)
} else {
mean(s)
}
}
#' Get posterior mean of samples
#'
#' @param extract A named list of samples
#'
#' @export
#'
get.posterior.mean <- function(extract) lapply(extract, avg.par.samples)
#' Fit the model using Stan variational Bayes
#'
#' @param dl de.lorean object
#' @param num.cores Number of cores to run on. Defaults to default.num.cores()
#' @param num.inits Number of initialisations to try. Defaults to num.cores
#' @param init.idx Which initialisation to use if only using one
#' @param ... Extra arguments for rstan::vb()
#'
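#' @examples
#' \dontrun{
#' # Sketch: fit with ADVI, trying several initialisations in parallel and
#' # keeping the one with the best log probability.
#' dl <- fit.model.vb(dl, num.inits = 4)
#' }
#'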
#' @export
#'
fit.model.vb <- function(
dl,
num.cores = default.num.cores(),
num.inits = num.cores,
init.idx = 1,
...)
{
init.chain.good.tau <- make.init.fn(dl)
if (num.cores > 1) {
#
# Run variational Bayes in parallel
sflist <- parallel::mclapply(
1:num.inits,
mc.cores=num.cores,
# mc.cores=1,
function(i) within(list(), {
fit <- rstan::vb(
get_model(dl),
data=dl$stan.data,
seed=i,
init=init.chain.good.tau(i),
...)
pars <- get.posterior.mean(rstan::extract(fit))
upars <- rstan::unconstrain_pars(fit, pars)
lp <- rstan::log_prob(fit, upars, adjust_transform=TRUE)
lp.unadj <- rstan::log_prob(fit, upars, adjust_transform=FALSE)
}))
#
# Only keep results that worked
sflist <- Filter(function(x) is.recursive(x) && ! is.null(x$lp), sflist)
n.worked <- length(sflist)
if (0 == n.worked) {
stop('No VB fits worked.')
} else if (n.worked < num.inits) {
warning('Only ', n.worked, '/', num.inits, ' VB fits succeeded.')
}
#
# Get the log likelihoods as vectors
dl$vb.lls <- sapply(sflist, function(sf) sf$lp)
dl$vb.lls.unadj <- sapply(sflist, function(sf) sf$lp.unadj)
#
# Calculate which run had best lp for posterior mean parameters
best.idx <- which.max(dl$vb.lls.unadj)
#
# Save the estimated tau for analysis
dl$vb.tau <- sapply(sflist, function(sf) sf$pars$tau)
#
# Use those results
dl$fit <- sflist[[best.idx]]$fit
} else {
# Run single variational Bayes
dl$fit <- rstan::vb(
get_model(dl),
data=dl$stan.data,
seed=init.idx,
init=init.chain.good.tau(init.idx),
...)
}
return(dl)
}
#' Analyse the samples and gather the convergence statistics. Note this
#' only makes sense if a sampling method was used to fit the model as
#' opposed to variational Bayes.
#'
#' @param dl de.lorean object
#'
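#' @examples
#' \dontrun{
#' # Sketch: gather Rhat statistics after a sampling fit (not meaningful
#' # for variational Bayes fits).
#' dl <- examine.convergence(dl)
#' head(dl$rhat.df)
#' }
#'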
#' @export
#'
examine.convergence <- function(dl) {
within(dl, {
pars <- c("tau", "psi", "S", "omega")
summ <- rstan::monitor(fit,
print=FALSE,
pars=pars)
ignore.names <- stringr::str_detect(rownames(summ),
"^(predictedvar|predictedmean)")
rhat.sorted <- sort(summ[! ignore.names, "Rhat"])
rhat.df <- data.frame(
rhat=rhat.sorted,
param=names(rhat.sorted),
parameter=stringr::str_match(names(rhat.sorted), "^[[:alpha:]]+"))
rm(summ)
rm(pars)
})
}
# The dimensions of the model parameters
#
# @param dl de.lorean object
#
model.parameter.dimensions <- function(dl) {
sample.dims <- list(
lp__=c(),
S=c("c"),
tau=c("c"),
psi=c("g"),
omega=c("g"),
predictedmean=c("g", "t"),
predictedvar=c("g", "t"),
logmarglike=c("g")
)
if (! dl$opts$model.estimates.cell.sizes) {
sample.dims$S <- NULL
}
sample.dims
}
# Sample melter
#
# @param dl de.lorean object
# @param include.iter Whether to include the iteration index as a variable in the melted data
#
sample.melter <- function(dl, include.iter=TRUE) {
function(sample.list, sample.dims) {
melt.var <- function(param) {
# message(param)
if (include.iter) {
varnames <- c("iter", sample.dims[[param]])
} else {
varnames <- sample.dims[[param]]
}
melt(sample.list[[param]], varnames, value.name=param)
}
sapply(names(sample.dims), melt.var)
}
}
# Join extra data to tau samples.
#
# @param dl de.lorean object
#
join.tau.samples <- function(dl, tau.samples) {
with(dl,
tau.samples
%>% left_join(cell.map)
%>% mutate(tau.offset=tau-obstime))
}
#' Process the posterior, that is extract and reformat the samples from
#' Stan. We also determine which sample has the highest likelihood; this
#' is labelled as the 'best' sample.
#'
#' @param dl de.lorean object
#'
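#' @examples
#' \dontrun{
#' # Sketch: melt the Stan samples into long format; afterwards
#' # dl$best.sample indexes the highest likelihood iteration.
#' dl <- process.posterior(dl)
#' dl$best.sample
#' }
#'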
#' @export
#'
process.posterior <- function(dl) {
within(dl, {
# Melt the samples into long format
samples.l <- sample.melter(dl)(rstan::extract(dl$fit, permuted = TRUE),
model.parameter.dimensions(dl))
best.sample <- which.max(samples.l$lp__$lp__)
if (TRUE %in% samples.l$logmarglike$is.held.out) {
mean.held.out.marg.ll <-
mean((
samples.l$logmarglike %>%
left_join(gene.map) %>%
filter(is.held.out))$logmarglike)
message('Mean held out marginal log likelihood per cell: ',
mean.held.out.marg.ll / stan.data$C)
}
#
# Include meta data in tau samples
samples.l$tau <- join.tau.samples(dl, samples.l$tau)
#
# Get parameters from best iteration
best.m <- lapply(samples.l, function(s) filter(s, iter == best.sample))
})
}
# Optimise the log posterior starting at a particular sample or from some
# other set of parameters.
#
# @param dl de.lorean object
# @param parameters Parameter values to start the optimisation from.
# @param sample.iter Sample whose parameters to use (defaults to best sample).
# @param ... Extra arguments for rstan::optimizing()
#
optimise.sample <- function(
dl,
parameters=sample.parameters(dl, sample.iter=sample.iter),
sample.iter=dl$best.sample,
...)
{
with(dl, {
optimised <- rstan::optimizing(dl$stanmodel,
data=dl$stan.data,
init=parameters,
as_vector=FALSE,
...)
dims <- model.parameter.dimensions(dl)
# Don't melt lp__ value
dims$lp__ <- NULL
samples.l <- sample.melter(dl, include.iter=FALSE)(optimised$par, dims)
# Include meta data in tau samples
samples.l$tau <- join.tau.samples(dl, samples.l$tau)
# Include lp__ value
samples.l$lp__ <- data.frame(lp__=optimised$value)
samples.l
})
}
# Bind a sample.
#
# @param dl de.lorean object
# @param samples Samples to bind to existing samples.
# @param sample.iter Iteration (defaults to -1).
#
bind.sample <- function(dl, samples, sample.iter=-1)
{
within(
dl,
for (param in names(samples.l)) {
samples.l[[param]] <- rbind(samples[[param]]
%>% mutate(iter=sample.iter),
samples.l[[param]])
})
}
#' Optimise the best sample and update the best.sample index.
#'
#' @param dl de.lorean object
#' @param sample.to.opt Sample to optimise
#' @param new.best.sample New value for the best sample index
#'
#' @export
#'
optimise.best.sample <- function(
dl,
sample.to.opt=dl$best.sample,
new.best.sample=-1)
{
dl$sample.optimised <- sample.to.opt
dl <- bind.sample(dl, optimise.sample(dl, sample.iter=sample.to.opt))
dl$sample.old.best <- dl$best.sample
dl$best.sample <- new.best.sample
dl
}
#' Analyse noise levels and assess which genes have the greatest
#' ratio of temporal variance to noise. These are labelled as the
#' 'genes.high.psi' genes.
#'
#' @param dl de.lorean object
#' @param num.high.psi How many genes with high variance to examine
#'
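#' @examples
#' \dontrun{
#' # Sketch: rank genes by the ratio of temporal variance (psi) to noise
#' # (omega) after process.posterior().
#' dl <- analyse.noise.levels(dl, num.high.psi = 25)
#' dl$genes.high.psi
#' }
#'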
#' @export
#'
analyse.noise.levels <- function(dl, num.high.psi=25) {
within(dl, {
noise.levels <- (
with(samples.l, left_join(psi, omega))
%>% left_join(gene.map))
# Summarise by gene
gene.noise.levels <- (
noise.levels
%>% group_by(g)
%>% dplyr::summarise(omega=mean(omega), psi=mean(psi))
%>% left_join(gene.map)
%>% arrange(-psi/omega))
genes.high.psi <- head(gene.noise.levels$gene, num.high.psi)
})
}
# Get the sampled parameter for the gene
#
# @param dl de.lorean object
# @param gene.idx Gene index
# @param param Parameter
# @param sample.iter Iteration to use (defaults to best.sample)
#
sampled.gene.param <- function(dl,
gene.idx,
param,
sample.iter=dl$best.sample) {
with(dl, {
filter(samples.l[[param]], gene.idx == g, sample.iter == iter)[[param]]
})
}
#' Make predictions
#'
#' @param dl de.lorean object
#'
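#' @examples
#' \dontrun{
#' # Sketch: tabulate the posterior predicted means and variances over the
#' # test pseudotime grid of a processed fit.
#' dl <- make.predictions(dl)
#' head(dl$predictions)
#' }
#'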
#' @export
#'
make.predictions <- function(dl) {
within(dl, {
predictions <- with(samples.l,
predictedmean
%>% left_join(predictedvar)
# %>% left_join(S)
%>% mutate(tau=test.input[t]))
})
}
#' Fit held out genes
#'
#' @param dl de.lorean object
#' @param expr.held.out The expression matrix including the held out genes
#' @param sample.iter The sample to use for the fit
#'
fit.held.out <- function(
dl,
expr.held.out,
sample.iter=dl$best.sample)
{
with(dl, {
if (opts$model.estimates.cell.sizes) {
cell.posterior <- samples.l$S %>% filter(sample.iter == iter)
expr.held.out <- t(t(expr.held.out) - cell.posterior$S)
}
# Calculate covariance over pseudotimes and capture times
calc.K <- functional::Curry(cov.calc.gene,
dl,
include.test=F,
psi=exp(stan.data$mu_psi),
omega=exp(stan.data$mu_omega))
tau <- tau.for.sample(dl, sample.iter=sample.iter)
obstime <- cell.map$obstime
K.tau <- calc.K(tau=tau)
K.capture <- calc.K(tau=obstime)
# Evaluate each held out gene under the GP model using the pseudotimes
# and under a model using the capture times instead.
#
calc.gp.marginals <- function(expr) {
c(
gp.log.marg.like(expr, K.tau),
gp.log.marg.like(expr, K.capture))
}
fit.model <- function(expr, model=loess) {
list(tau=model(expr~s(tau)))
}
apply(expr.held.out, 1, calc.gp.marginals)
# list(
# gp.marginals=sapply(held.out.genes, calc.gp.marginals),
# loess=lapply(held.out.genes, functional::Curry(fit.model, model=gam)))
# gam=lapply(held.out.genes, functional::Curry(fit.model, model=gam)))
})
}
# Parameter values for sample
#
# @param dl de.lorean object
# @param sample.iter The sample we want the parameters for.
# @param param.names The names of the parameters to retrieve.
#
sample.parameters <- function(dl,
sample.iter=dl$best.sample,
param.names=names(dl$samples.l))
{
parameters <- lapply(param.names,
function(param)
filter(dl$samples.l[[param]],
iter == sample.iter)[[param]])
names(parameters) <- param.names
parameters
}
#' Test fit for log normal and gamma
#'
#' @param vars Data to fit
#'
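#' @examples
#' # Compare gamma and log-normal fits to some positive data
#' # (simulated here purely for illustration).
#' fits <- test.fit(rgamma(100, shape = 2, rate = 1))
#' print(fits$gp)
#'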
#' @export
#'
test.fit <- function(vars) {
fit.gamma <- MASS::fitdistr(vars, 'gamma')
fit.lognormal <- MASS::fitdistr(vars, 'lognormal')
gp <- (
ggplot(data.frame(V=vars), aes(x=V))
+ geom_density()
+ stat_function(fun=functional::Curry(dgamma,
shape=fit.gamma$estimate['shape'],
rate=fit.gamma$estimate['rate']),
linetype='dashed')
+ stat_function(fun=functional::Curry(dlnorm,
meanlog=fit.lognormal$estimate['meanlog'],
sdlog=fit.lognormal$estimate['sdlog']),
linetype='dotted')
)
list(gamma=fit.gamma, lognormal=fit.lognormal, gp=gp)
}
# The samples
#
# @param dl de.lorean object
#
sample.iters <- function(dl) dl$samples.l$lp__$iter
#' Make a fit valid by running one iteration of the sampler.
#'
#' @param dl de.lorean object
#'
#' @export
#'
make.fit.valid <- function(dl) rstan::stan(fit=dl$fit, data=dl$stan.data, iter=1, chains=1)