R/model.r

Defines functions make.fit.valid sample.iters test.fit sample.parameters fit.held.out make.predictions sampled.gene.param analyse.noise.levels optimise.best.sample bind.sample optimise.sample process.posterior join.tau.samples sample.melter model.parameter.dimensions examine.convergence fit.model.vb get.posterior.mean avg.par.samples make.init.fn fit.model.sample fit.model fit.dl find.best.tau init.chain.sample.tau cov.fn.for test.mh find.smooth.tau pseudotimes.from.orderings deduplicate.orderings rev.order.if.better orderings.plot find.good.ordering seriation.find.orderings geom.series even.tau.spread make.chain.init.fn get_model prepare.for.stan calc.inducing.pseudotimes sample.genes.and.cells sample.per.capture filter_cells filter_genes estimate.hyper analyse.variance cast.expr melt.expr cell.levels gene.levels tau.for.sample update.levels

Documented in analyse.noise.levels analyse.variance avg.par.samples calc.inducing.pseudotimes estimate.hyper examine.convergence filter_cells filter_genes find.best.tau find.good.ordering find.smooth.tau fit.dl fit.held.out fit.model fit.model.sample fit.model.vb get_model get.posterior.mean make.fit.valid make.init.fn make.predictions melt.expr optimise.best.sample orderings.plot prepare.for.stan process.posterior pseudotimes.from.orderings seriation.find.orderings test.fit test.mh

# Update a factor based on the levels of another factor.
#
# @param .factor Factor to update
# @param reference.factor Factor whose levels to use
# @param .levels New levels (defaults to the levels of reference.factor)
# @param ordered Whether the result is ordered (defaults to match reference.factor)
#
update.levels <- function(.factor,
                          reference.factor=NULL,
                          .levels=levels(reference.factor),
                          ordered=is.ordered(reference.factor))
{
    factor(.factor, levels=.levels, ordered=ordered)
}
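# For example, a minimal sketch with toy factors:
#   ref <- factor(c('a', 'b'), levels=c('b', 'a'), ordered=TRUE)
#   update.levels(factor(c('a', 'b')), ref)  # ordered factor with levels b < a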


# Retrieve the estimated tau for the given sample.
#
# @param dl de.lorean object
# @param sample.iter Which sample to use, defaults to best sample
#
tau.for.sample <- function(dl, sample.iter=dl$best.sample) {
    (
        dl$samples.l$tau
        %>% filter(sample.iter == iter)  # Filter correct iteration
        %>% arrange(c)  # Sort by cell
    )$tau
}


# The levels of the gene factor
#
# @param dl The de.lorean object.
#
gene.levels <- function(dl) levels(dl$gene.meta$gene)


# The levels of the cell factor
#
# @param dl The de.lorean object.
#
cell.levels <- function(dl) levels(dl$cell.meta$cell)


#' Melt an expression matrix.
#'
#' @param dl The de.lorean object.
#' @param expr Matrix of expression values.
#'
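#' @examples
#' # A short usage sketch mirroring the package's other examples:
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' head(melt.expr(dl))  # long format with columns: gene, cell, x
#'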
#' @export melt.expr
#'
melt.expr <- function(dl, expr=dl$expr) (
    expr
    %>% melt(varnames=c("gene", "cell"), value.name="x")
    %>% mutate(gene=factor(gene, levels=gene.levels(dl)),
               cell=factor(cell, levels=cell.levels(dl)))
)


# Cast an expression matrix from long format (the inverse of melt.expr).
#
# @param expr.l Expression values in long format
#
cast.expr <- function(expr.l) expr.l %>% acast(gene ~ cell, value.var="x")


#' Analyse variance of expression between and within capture times.
#'
#' @param dl de.lorean object
#' @param adjust.cell.sizes Choose whether to adjust the expression values by
#'        the cell size estimates
#'
#' @export
#'
analyse.variance <- function(dl, adjust.cell.sizes) {
  #
  # First melt expression data into long format
  expr.adj <-
    melt.expr(dl) %>%
    left_join(dl$cell.sizes) %>%
    # Adjust the expression by the cell size estimates if asked to
    mutate(x.hat=x-adjust.cell.sizes*S.hat)
  stopifnot(! is.na(expr.adj))
  within(dl, {
    #
    # Resummarise the adjusted expression data by gene
    gene.expr <-
      expr.adj %>%
      group_by(gene) %>%
      dplyr::summarise(phi.hat=mean(x.hat))
    stopifnot(! is.na(gene.expr))
    #
    # Examine the expression at each gene/capture time combination
    gene.time.expr <-
      expr.adj %>%
      left_join(cell.meta) %>%
      group_by(gene, capture) %>%
      dplyr::summarise(
        num.capture=n(),
        Mgk=mean(x.hat),
        Vgk=var(x.hat)) %>%
      filter(! is.na(Vgk))
    stopifnot(! is.na(gene.time.expr))
    stopifnot(nrow(gene.time.expr) > 0)  # Must have some rows left
    #
    # Expected variance of samples from zero-mean Gaussian with covariance K.obs
    V.obs <- with(dl,
      expected.sample.var(
        cov.matern.32(
          cov.calc.dists(unique(dl$cell.meta$obstime)),
          dl$opts$length.scale)))
    #
    # Variance within and between times
    gene.var <-
      gene.time.expr %>%
      group_by(gene) %>%
      dplyr::summarise(omega.hat=mean(Vgk), psi.hat=var(Mgk)/V.obs) %>%
      filter(! is.na(psi.hat),
             ! is.na(omega.hat),
             omega.hat > 0,
             psi.hat > 0)
  })
}


#' Estimate hyperparameters for model using empirical Bayes.
#'
#' @param dl de.lorean object
#' @param sigma.tau Noise s.d. in the temporal dimension, that is, the prior s.d. for tau
#' @param length.scale Length scale for stationary GP covariance function.
#'   Defaults to half the range of the observed capture times.
#' @param model.name The model's name:
#'   \itemize{
#'     \item 'exact': The model without a low rank approximation
#'       that does not estimate the cell sizes.
#'     \item 'exactsizes': The model without a low rank approximation
#'       that does estimate the cell sizes.
#'     \item 'lowrank': Low rank approximation to the 'exact' model.
#'     \item 'lowranksizes': Low rank approximation to the 'exactsizes' model.
#'   }
#' @param adjust.cell.sizes Adjust by the cell sizes for better estimates of the hyperparameters
#'
#' @examples
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- estimate.hyper(dl)
#'
#' @export
#'
estimate.hyper <- function(
    dl,
    sigma.tau = .5,
    length.scale = NULL,
    model.name = 'exact',
    adjust.cell.sizes = TRUE
) {
  #
  # Estimate the cell sizes.
  # We need these to estimate the hyperparameters of the cell size prior if
  # the model estimates the cell sizes, or to adjust the data by if it doesn't.
  dl <- estimate.cell.sizes(dl)
  dl <- within(dl, {
    #
    # Remember options that depend on the model
    # Remove hyphens to keep backwards compatibility with old names
    opts$model.name <- gsub('-', '', model.name)
    opts$model.estimates.cell.sizes <- switch(
        opts$model.name,
        "exactsizes" = TRUE,
        "exact" = FALSE,
        "lowranksizes" = TRUE,
        "lowrank" = FALSE,
        stop('Unknown model name'))
    #
    # Set up temporal hyper-parameters
    opts$sigma.tau <- sigma.tau
    time.range <- range(cell.meta$obstime)
    time.width <- time.range[2] - time.range[1]
    if (is.null(length.scale)) {
        opts$length.scale <- time.width / 2
    } else {
        opts$length.scale <- length.scale
    }
  })
  dl <- analyse.variance(dl, adjust.cell.sizes=adjust.cell.sizes)
  within(dl, {
    hyper <- list(
      mu_S=mean(cell.sizes$S.hat),
      sigma_S=sd(cell.sizes$S.hat),
      mu_phi=mean(gene.expr$phi.hat),
      sigma_phi=sd(gene.expr$phi.hat),
      mu_psi=mean(log(gene.var$psi.hat), na.rm=TRUE),
      sigma_psi=sd(log(gene.var$psi.hat), na.rm=TRUE),
      mu_omega=mean(log(gene.var$omega.hat), na.rm=TRUE),
      sigma_omega=sd(log(gene.var$omega.hat), na.rm=TRUE),
      sigma_tau=opts$sigma.tau,
      l=opts$length.scale
    )
    # Check we don't have any NAs
    stopifnot(all(! sapply(hyper, is.na)))
  })
}


#' Filter genes
#'
#' @param dl de.lorean object
#' @param number Number to sample if filter function or genes not supplied.
#' @param genes The genes to keep.
#' @param .filter Function that takes a list of genes as input and returns
#'     a vector of TRUE/FALSE
#'
#' @examples
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- filter_genes(dl, number = 37)
#'
#' @export
#'
filter_genes <- function(dl,
                         .filter=function(x) x %in% genes,
                         number=NULL,
                         genes=sample(rownames(dl$expr), number))
{
    within(dl, {
        expr <- expr[.filter(rownames(expr)),]
        message("Have ", nrow(expr), " genes after filtering")
    })
}


#' Filter cells
#'
#' @param dl de.lorean object
#' @param number Number to sample if filter function or cells not supplied.
#' @param cells The cells to keep.
#' @param .filter Function that takes a list of cells as input and returns
#'     a vector of TRUE/FALSE
#'
#' @examples
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- filter_cells(dl, number = 7)
#'
#' @export
#'
filter_cells <- function(dl,
                         .filter=function(x) x %in% cells,
                         number=NULL,
                         cells=sample(colnames(dl$expr), number))
{
    within(dl, {
        expr <- expr[,.filter(colnames(expr))]
        message("Have ", ncol(expr), " cells after filtering")
    })
}




# Sample so many cells per capture time.
#
# @param dl de.lorean object
# @param cells.per.capture Number of cells to sample from each capture time
#
sample.per.capture <- function(dl, cells.per.capture) {
    sample.at.most <- function(.df, number) {
        sample_n(.df, min(number, nrow(.df)))
    }
    sampled.cells <- (
        dl$cell.meta
        %>% filter(cell %in% colnames(dl$expr))
        %>% group_by(capture)
        # %>% do(sample_n(., min(cells.per.capture, length(cell))))
        %>% do(sample.at.most(., cells.per.capture))
    )
    filter_cells(dl, cells=sampled.cells$cell)
}
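# For example, sample.per.capture(dl, 10) keeps at most 10 randomly chosen
# cells from each capture time.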


# Sample genes and cells
#
# @param dl de.lorean object
# @param max.cells Maximum number of cells to keep (0 means keep all)
# @param max.genes Maximum number of genes to keep (0 means keep all)
#
sample.genes.and.cells <- function(
    dl,
    max.cells = 0,
    max.genes = 0)
{
    within(dl, {
        opts$max.cells <- max.cells
        opts$max.genes <- max.genes
        if (opts$max.cells && ncol(expr) > opts$max.cells) {
            expr <- expr[,sample(ncol(expr), opts$max.cells)]
            # Remove genes that are not expressed in at least two cells
            num.cells.expr <- rowSums(! is.na(expr))
            expr <- expr[num.cells.expr > 1,]
        }
        if (opts$max.genes && nrow(expr) > opts$max.genes) {
            expr <- expr[sample(nrow(expr), opts$max.genes),]
        }
    })
}


#' Calculate inducing pseudotimes for sparse approximation
#'
#' @param dl de.lorean object
#' @param num.inducing Number of inducing points
#' @param period Period of expression patterns
#' @param num.sd.border The size of the border of the inducing inputs
#'            around the capture times in units of number of standard
#'            deviations
#'
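#' @examples
#' # A sketch on the bundled Windram data; estimate.hyper must run first so
#' # that time.range and hyper$sigma_tau are available:
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- estimate.hyper(dl)
#' calc.inducing.pseudotimes(dl, num.inducing = 10)
#'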
#' @export
#'
calc.inducing.pseudotimes <- function(dl, num.inducing, period = 0, num.sd.border = 7) with(dl, {
  if (period > 0) {
    seq(from = 0,
        to = period * (1 - 1/num.inducing),
        length.out = num.inducing)
  } else {
    seq(from = time.range[1] - num.sd.border * hyper$sigma_tau,
        to = time.range[2] + num.sd.border * hyper$sigma_tau,
        length.out = num.inducing)
  }
})


#' Prepare for Stan
#'
#' @param dl de.lorean object
#' @param num.test Number of test points to consider
#' @param num.inducing Number of inducing points
#' @param period Period of expression patterns
#' @param hold.out Number of genes to hold out for generalisation tests
#' @param num.sd.border The size of the border of the inducing inputs
#'            around the capture times in units of number of standard
#'            deviations
#'
#' @export prepare.for.stan
#'
prepare.for.stan <- function(
  dl,
  num.test = 101,
  num.inducing = 30,  # M
  period = 0,
  hold.out = 0,
  num.sd.border = 7)
within(dl, {
  opts$num.test <- num.test
  opts$period <- period
  opts$periodic <- opts$period > 0
  stopifnot(hold.out < nrow(expr))
  .G <- nrow(expr) - hold.out
  .C <- ncol(expr)
  #
  # Permute genes to make held out genes random
  expr <- expr[sample(.G+hold.out),]
  #
  # Calculate the map from gene indices to genes and their meta data
  gene.map <- (
      data.frame(g=1:(.G+hold.out),
                  gene=factor(rownames(expr),
                              levels=levels(gene.meta$gene)))
      %>% mutate(is.held.out=g>.G)
      %>% left_join(gene.expr)
      %>% left_join(gene.var)
      %>% left_join(gene.meta))
  stopifnot(! is.na(gene.map[
    c("g", "gene", "phi.hat", "psi.hat", "omega.hat")
  ]))
  #
  # Calculate the map from cell indices to genes and their meta data
  cell.map <-
    data.frame(
      c=1:.C,
      cell=factor(colnames(expr), levels=levels(cell.meta$cell))) %>%
    left_join(cell.meta) %>%
    left_join(cell.sizes)
  stopifnot(! is.na(cell.map %>% dplyr::select(cell, capture, obstime)))
  #
  # Calculate the time points at which to make predictions
  if (opts$periodic) {
      test.input <- seq(0, opts$period, length.out=num.test)
  } else {
      test.input <- seq(
          time.range[1] - num.sd.border * opts$sigma.tau,
          time.range[2] + num.sd.border * opts$sigma.tau,
          length.out = num.test)
  }
  #
  # Gather all the data into one list
  stan.data <- c(
      # Hyper-parameters
      hyper,
      list(
          # Dimensions
          C=.C,
          G=.G,
          H=hold.out,
          M=num.inducing,
          # Data
          time=cell.map$obstime,
          expr=expr,
          phi=gene.map$phi.hat,
          # Inducing pseudotimes
          u = calc.inducing.pseudotimes(
            dl,
            num.inducing = num.inducing,
            period = period,
            num.sd.border = num.sd.border),
          # Held out parameters
          heldout_psi=filter(gene.map, g > .G)$psi.hat,
          heldout_omega=filter(gene.map, g > .G)$omega.hat,
          # Generated quantities
          numtest=opts$num.test,
          testinput=test.input,
          # Periodic?
          periodic=opts$periodic,
          period=opts$period
      )
  )
})


#' Get the Stan model for a DeLorean object.
#'
#' @param dl de.lorean object
#'
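#' @examples
#' # A sketch: the model name recorded by estimate.hyper selects one of the
#' # package's compiled Stan models:
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- estimate.hyper(dl, model.name = 'exact')
#' model <- get_model(dl)
#'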
#' @export
#'
get_model <- function(dl) {
  model <- stanmodels[[dl$opts$model.name]]
  if (is.null(model)) {
    stop('Unknown model: ', dl$opts$model.name)
  }
  model
}


# Define a function to initialise the chains
#
# @param dl de.lorean object
#
make.chain.init.fn <- function(dl) {
  function() {
    with(dl$stan.data, {
      # message("Creating initialisation")
      init <- list(
        S=dl$cell.map$S.hat,
        tau=rnorm(C, mean=time, sd=sigma_tau),
        psi=rlnorm(G, meanlog=mu_psi, sdlog=sigma_psi),
        omega=rlnorm(G, meanlog=mu_omega, sdlog=sigma_omega)
      )
      init
    })
  }
}


# Spread tau evenly over the range of the capture times, extended by one
# prior s.d. either side.
#
# @param dl de.lorean object
#
even.tau.spread <- function(dl) {
    with(dl$stan.data,
         seq(min(time) - sigma_tau,
             max(time) + sigma_tau,
             length=C))
}

# Numbers in geometric progression: the powers of base that do not exceed max.
# From http://stackoverflow.com/questions/11094822/numbers-in-geometric-progression
geom.series <- function(base, max) {
  base^(0:floor(log(max, base)))
}
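# For example, geom.series(base=2, max=10) gives 1 2 4 8.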

#' Use seriation package to find good orderings
#'
#' @param dl de.lorean object
#' @param .methods The seriation methods to apply
#' @param scaled Whether to use the scaled and/or unscaled expression data
#' @param dim.red Dimension reduction methods to apply
#' @param dims Number of dimensions to reduce to
#' @param num.cores Number of cores to use in parallel
#' @param num.tau.to.keep How many initialisations to keep
#'
#' @export
#'
seriation.find.orderings <- function(
  dl,
  # .methods = c("ARSA", "TSP", "R2E", "HC", "GW", "OLO"),
  .methods = c("TSP", "R2E", "HC", "GW", "OLO"),
  scaled = c('scaled', 'unscaled'),
  dim.red = c('none', 'pca', 'kfa', 'ica', 'mds'),
  # dim.red = c('mds'),
  dims = geom.series(base=2, max=min(8, nrow(dl$expr)-1)),
  num.cores = default.num.cores(),
  num.tau.to.keep = default.num.cores())
{
  #
  # Calculate all combinations of parameters
  combinations <- expand.grid(method=.methods,
                              scaled=scaled,
                              dim.red=dim.red,
                              dims=dims,
                              stringsAsFactors=FALSE) %>%
    # Only keep one combination with no dimensionality reduction
    dplyr::filter(dim.red != 'none' | 1 == dims) %>%
    # Cannot do KFA with 1 dimension
    dplyr::filter(dim.red != 'kfa' | 1 != dims)
  get.expr <- function(scaled) switch(scaled,
    scaled = scale(t(dl$expr)),
    unscaled = t(dl$expr),
    stop('scaled must be "scaled" or "unscaled"')
  )
  get.expr.mem <- memoise::memoise(get.expr)
  get.red <- function(scaled, dim.red, dims=5) {
    expr <- get.expr.mem(scaled)  # use the memoised version
    switch(dim.red,
      none = expr,
      pca = prcomp(expr)$x[,1:dims],
      ica = fastICA::fastICA(expr, n.comp=dims)$S,
      kfa = t(kernlab::kfa(t(expr), features=dims)@xmatrix),
      mds = cmdscale(dist(expr), k=dims),
      stop('dim.red must be "none", "ica", "kfa", "mds" or "pca"'))
  }
  get.red.mem <- memoise::memoise(get.red)
  get.dist <- function(scaled, dim.red, dims=5) {
    dist(get.red.mem(scaled, dim.red, dims))
  }
  get.dist.mem <- memoise::memoise(get.dist)
  result <- parallel::mclapply(
    1:nrow(combinations),
    function(i) with(combinations[i,], within(list(), {
      method.name <- stringr::str_c(method,
                                    ':', scaled,
                                    ':', dim.red,
                                    ':', dims)
      elapsed <- system.time(
        per <- seriation::seriate(get.dist.mem(scaled, dim.red, dims),
                                   method=method))
      ser.order <- seriation::get_order(per)
      ll <- dl$ordering.ll(ser.order)
      message(method.name, '; time=', elapsed[3], 's; LL=', ll)
    })),
    mc.cores=num.cores)
  result
}


#' Run an ordering-finding method and append its results to any existing orderings
#'
#' @param dl de.lorean object
#' @param method Function that runs the method
#' @param ... Any other arguments for the method
#'
#' @export
#'
find.good.ordering <- function(dl, method, ...)
{
  dl <- create.ordering.ll.fn(dl)
  dl$order.inits <- c(
    dl$order.inits,
    lapply(
      method(dl, ...),
      function(i) {
        i$ser.order <- rev.order.if.better(dl, i$ser.order)
        i
      }))
  dl
}


#' Plot the log likelihoods of the orderings against the elapsed time taken
#' to generate them
#'
#' @param dl The DeLorean object
#'
#' @export
#'
orderings.plot <- function(dl) with(dl, {
  results.df <- data.frame(
    method=sapply(order.inits, function(r) r$method.name),
    elapsed=sapply(order.inits, function(r) r$elapsed[3]),
    ll=sapply(order.inits, function(r) r$ll))
  ggplot2::ggplot(results.df, aes(x=elapsed, y=ll, label=method)) +
    ggplot2::geom_text()
})

# Reverse ordering if it is better correlated with observed times
rev.order.if.better <- function(dl, ser.order) {
  rev.order <- rev(ser.order)
  if (cor(ser.order, dl$cell.map$obstime) < cor(rev.order, dl$cell.map$obstime)) {
    ser.order <- rev.order
  }
  ser.order
}

# Deduplicate orderings
#
deduplicate.orderings <- function(dl) {
  orderings <- sapply(dl$order.inits, function(i) i$ser.order)
  dupd <- duplicated(t(orderings))
  dl$order.inits <- dl$order.inits[!dupd]
  dl
}

#' Convert best orderings into initialisations
#'
#' @param dl The DeLorean object
#' @param num.to.keep The number to keep (defaults to default.num.cores())
#'
#' @export
#'
pseudotimes.from.orderings <- function(
  dl,
  num.to.keep=default.num.cores())
within(deduplicate.orderings(dl), {
  order.orderings <- order(sapply(order.inits, function(i) i$ll),
                           decreasing=TRUE)
  #
  # Make sure we don't try to keep too many (due to duplicates, etc...)
  actually.keep <- min(length(order.orderings), num.to.keep)
  if (actually.keep < num.to.keep) {
    warning("Don't have enough ", num.to.keep, " pseudotimes to keep,",
            " only have ", actually.keep)
  }
  best.orderings <- order.inits[order.orderings[1:actually.keep]]
  tau.inits <- lapply(
    best.orderings,
    function(O) {
      message('Using ordering ', O$method.name, '; LL=', O$ll)
      # Create an initialisation using the ordering
      init <- init.chain.sample.tau(dl)
      init$tau <- even.tau.spread(dl)[O$ser.order]
      init
    })
})


#' Find the best ordering of the cells, assuming a smooth GP prior on the
#' expression profiles over the ordering.
#'
#' @param dl de.lorean object
#' @param psi Temporal variation
#' @param omega Noise
#' @param num.cores Number of cores to run on. Defaults to default.num.cores()
#' @param num.tau.to.try How many initialisations to try
#' @param num.tau.to.keep How many initialisations to keep
#' @param method Method to use "maximise" or "metropolis"
#' @param ... Extra arguments to method
#'
#' @export
#'
find.smooth.tau <- function(
    dl,
    psi = exp(dl$hyper$mu_psi),
    omega = exp(dl$hyper$mu_omega),
    num.cores = default.num.cores(),
    num.tau.to.try = num.cores,
    num.tau.to.keep = num.cores,
    method = "metropolis",
    ...
) {
  dl <- create.ordering.ll.fn(dl)
  log.likelihood <- dl$ordering.ll
  dl$tau.inits <- with(dl, {
    # Maximise the sum of the log marginal likelihoods
    ordering.search <- function(seed) {
      set.seed(seed)
      # Choose a starting point by random projection
      expr.centre <- t(scale(t(expr), center=T, scale=F))
      init.ordering <- order(rnorm(nrow(expr.centre)) %*% expr.centre)
      # init.ordering <- sample(stan.data$C)
      metropolis.fn <- function(ordering, log.likelihood, ...) {
        mh.run <- ordering.metropolis.hastings(
          ordering,
          log.likelihood,
          proposal.fn=ordering.random.block.move,
          ...)
        best.sample <- which.max(mh.run$log.likelihoods)
        #ordering.maximise(mh.run$chain[best.sample,], log.likelihood)
        mh.run$chain[best.sample,]
      }
      method.fn <- switch(method,
                          "maximise"=ordering.maximise,
                          "metropolis"=metropolis.fn,
                          stop('Unknown method: ', method))
      ordering <- method.fn(init.ordering, log.likelihood, ...)
      stopifnot(! is.null(ordering))
      # Reverse the ordering if it makes it correlate better with
      # the capture times
      capture.order <- order(stan.data$time)
      if (cor(capture.order, ordering) <
          cor(capture.order, rev(ordering)))
      {
        ordering <- rev(ordering)
      }
      ordering
    }
    # Choose seeds
    seeds <- sample.int(.Machine$integer.max, num.tau.to.try)
    # Run in parallel or not?
    if (num.cores > 1) {
      orderings <- parallel::mclapply(seeds,
                                      mc.cores=num.cores,
                                      ordering.search)
    } else {
      orderings <- lapply(seeds, ordering.search)
    }
    # Order the taus by the best orderings
    lls <- sapply(orderings, function(o) -log.likelihood(o))
    best.order <- order(lls)
    # Make the complete chain initialisation with the tau.
    lapply(orderings[best.order[1:num.tau.to.keep]],
           function(ordering) {
             init <- init.chain.sample.tau(dl)
             init$tau <- even.tau.spread(dl)[ordering.invert(ordering)]
             init
           })
  })
  dl
}


#' Test ordering Metropolis-Hastings sampler.
#'
#' @param dl de.lorean object
#' @param psi Temporal variation
#' @param omega Noise
#' @param num.cores Number of cores to run on.
#'          Defaults to getOption("DL.num.cores", max(parallel::detectCores()-1, 1))
#' @param iterations Number of iterations
#' @param thin Thin the samples
#'
test.mh <- function(
    dl,
    psi = mean(dl$gene.map$psi.hat),
    omega = mean(dl$gene.map$omega.hat),
    num.cores = getOption("DL.num.cores", max(parallel::detectCores() - 1, 1)),
    iterations = 1000,
    thin = 15
) {
    dl <- create.ordering.ll.fn(dl)
    log.likelihood <- dl$ordering.ll
    dl$tau.inits <- with(dl, {
        # Maximise the sum of the log marginal likelihoods
        ordering.search <- function(seed) {
            set.seed(seed)
            # Choose a random starting point
            init.ordering <- sample(stan.data$C)
            mh.run <- ordering.metropolis.hastings(
                init.ordering,
                log.likelihood,
                proposal.fn=ordering.random.block.move,
                iterations=iterations,
                thin=thin)
        }
        # Choose seeds
        seeds <- sample.int(.Machine$integer.max, num.cores)
        # Run in parallel or not?
        if (num.cores > 1) {
            orderings <- parallel::mclapply(seeds,
                                  mc.cores=num.cores,
                                  ordering.search)
        } else {
            orderings <- lapply(seeds, ordering.search)
        }
        orderings
    })
    dl
}

# The covariance function for the DeLorean object.
cov.fn.for <- function(dl) {
    cov.matern.32
}

# Choose an initialisation by sampling tau from the prior.
#
# @param dl de.lorean object
#
init.chain.sample.tau <- function(dl) {
    with(dl$stan.data, {
        init <- list(
            alpha=dl$cell.map$alpha.hat,
            beta=rep(0, G),
            S=dl$cell.map$S.hat,
            tau=rnorm(C, time, sd=sigma_tau),
            psi=dl$gene.map$psi.hat[1:G],
            omega=dl$gene.map$omega.hat[1:G]
        )
        init$tauoffsets <- init$tau - time
        init
    })
}


#' Find best tau to initialise chains with by sampling tau from the prior
#' and using empirical Bayes parameter estimates for the other parameters.
#'
#' @param dl de.lorean object
#' @param num.tau.candidates How many candidates to examine. Defaults to 6000.
#' @param num.tau.to.keep How many candidates to keep. Defaults to num.cores.
#' @param num.cores Number of cores to run on. Defaults to default.num.cores()
#'
#' @export
#'
find.best.tau <- function(
  dl,
  num.tau.candidates = 6000,
  num.tau.to.keep = num.cores,
  num.cores = default.num.cores())
within(dl, {
  #
  # Define a function that calculates log probability for
  # random seeded tau
  try.tau.init <- function(i) {
    set.seed(i)
    pars <- init.chain.sample.tau(dl)
    lp <- rstan::log_prob(fit, rstan::unconstrain_pars(fit, pars),
                          adjust_transform=FALSE)
    list(lp=lp, tau=pars$tau)
  }
  #
  # Choose tau several times and calculate log probability
  if (num.cores > 1) {
    tau.inits <- parallel::mclapply(1:num.tau.candidates,
                                    mc.cores=num.cores,
                                    try.tau.init)
  } else {
    tau.inits <- lapply(1:num.tau.candidates, try.tau.init)
  }
  #
  # Which tau gave highest log probability?
  tau.inits.order <- order(sapply(tau.inits, function(init) -init$lp))
  #
  # Just keep so many best tau inits
  tau.inits <- tau.inits[tau.inits.order[1:num.tau.to.keep]]
  rm(tau.inits.order, try.tau.init)
})


#' Perform all the steps necessary to fit the model:
#' \enumerate{
#'   \item prepare the data
#'   \item find suitable initialisations
#'   \item fit the model using the specified method (sampling or variational Bayes)
#'   \item process the posterior
#' }
#'
#' @param dl de.lorean object
#' @param method Fitting method:
#'   \itemize{
#'     \item 'sample': Use a Stan sampler.
#'       See \code{\link{fit.model.sample}}.
#'     \item 'vb': Use Stan ADVI variational Bayes algorithm.
#'       See \code{\link{fit.model.vb}}.
#'   }
#' @param ... Extra arguments for fitting method
#'
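#' @examples
#' # An end-to-end sketch on the bundled Windram data (not run here since
#' # fitting takes minutes; the subset sizes are purely illustrative):
#' \dontrun{
#' data(WindramDeLorean)
#' dl <- de.lorean(windram.expr, windram.gene.meta, windram.cell.meta)
#' dl <- estimate.hyper(dl)
#' dl <- filter_genes(dl, number = 37)
#' dl <- filter_cells(dl, number = 7)
#' dl <- fit.dl(dl, method = 'vb')
#' }
#'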
#' @export
#'
fit.dl <- function(
    dl,
    method = 'sample',
    ...)
{
  dl <- prepare.for.stan(dl)
  dl <- find.good.ordering(dl, seriation.find.orderings)
  dl <- pseudotimes.from.orderings(dl)
  dl <- fit.model(dl, method=method, ...)
  dl <- process.posterior(dl)
  dl <- analyse.noise.levels(dl)
  dl
}


#' Fit the model using specified method (sampling or variational Bayes).
#'
#' @param dl de.lorean object
#' @param method Fitting method:
#'   \itemize{
#'     \item 'sample': Use a Stan sampler.
#'       See \code{\link{fit.model.sample}}.
#'     \item 'vb': Use Stan ADVI variational Bayes algorithm.
#'       See \code{\link{fit.model.vb}}.
#'   }
#' @param ... Extra arguments for method
#'
#' @export
#'
fit.model <- function(
    dl,
    method = 'sample',
    ...)
{
    switch(
        method,
        "sample" = fit.model.sample(dl, ...),
        "vb" = fit.model.vb(dl, ...),
        stop('Unknown method'))
}


#' Fit the model using Stan sampler
#'
#' @param dl de.lorean object
#' @param num.cores Number of cores to run on.
#'   Defaults to getOption("DL.num.cores", max(parallel::detectCores()-1, 1))
#' @param chains Number of chains to run on each core
#' @param thin How many samples to generate before retaining one
#' @param ... Extra arguments for rstan::stan() sampling call
#'
#' @export
#'
fit.model.sample <- function(
    dl,
    num.cores = getOption("DL.num.cores", max(parallel::detectCores() - 1, 1)),
    chains = 1,
    thin = 50,
    ...)
{
  init.chain.good.tau <- make.init.fn(dl)
  # Run the chains in parallel
  sflist <- parallel::mclapply(
    1:num.cores,
    mc.cores=num.cores,
    function(i)
      rstan::stan(
        fit=get_model(dl),
        data=dl$stan.data,
        thin=thin,
        init=init.chain.good.tau,
        seed=i,
        chains=chains,
        chain_id=i,
        refresh=-1,
        ...))
  dl$fit <- rstan::sflist2stanfit(sflist)
  return(dl)
}


#' Returns a function that constructs parameter settings with good tau.
#'
#' @param dl de.lorean object
#'
make.init.fn <- function(dl) {
  function(chain_id) {
    stopifnot(chain_id <= length(dl$tau.inits))
    #
    # Create random parameters
    pars <- make.chain.init.fn(dl)()
    #
    # Replace tau with good tau
    pars$tau <- dl$tau.inits[[chain_id]]$tau
    pars
  }
}


#' Average across a parameter's samples.
#'
#' @param s An array of any dimension in which the first dimension
#'   indexes the samples
#'
avg.par.samples <- function(s) {
  n.dims <- length(dim(s))
  if (n.dims > 1) {
    apply(s, 2:n.dims, mean)
  } else {
    mean(s)
  }
}
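# For example, a sketch with simulated draws: if s <- array(rnorm(24), dim=c(4, 2, 3))
# holds 4 samples of a 2x3 parameter, avg.par.samples(s) returns the 2x3 matrix of
# element-wise means, while a plain vector of samples is reduced to its mean.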

#' Get posterior mean of samples
#'
#' @param extract A named list of samples
#'
#' @export
#'
get.posterior.mean <- function(extract) lapply(extract, avg.par.samples)
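# For example, with a fitted model: get.posterior.mean(rstan::extract(dl$fit))
# gives the posterior mean of each parameter, as used in fit.model.vb below.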


#' Fit the model using Stan variational Bayes
#'
#' @param dl de.lorean object
#' @param num.cores Number of cores to run on. Defaults to default.num.cores()
#' @param num.inits Number of initialisations to try. Defaults to num.cores
#' @param init.idx Which initialisation to use if only using one
#' @param ... Extra arguments for rstan::vb()
#'
#' @export
#'
fit.model.vb <- function(
    dl,
    num.cores = default.num.cores(),
    num.inits = num.cores,
    init.idx = 1,
    ...)
{
  init.chain.good.tau <- make.init.fn(dl)
  if (num.cores > 1) {
    #
    # Run variational Bayes in parallel
    sflist <- parallel::mclapply(
      1:num.inits,
      mc.cores=num.cores,
      # mc.cores=1,
      function(i) within(list(), {
        fit <- rstan::vb(
          get_model(dl),
          data=dl$stan.data,
          seed=i,
          init=init.chain.good.tau(i),
          ...)
        pars <- get.posterior.mean(rstan::extract(fit))
        upars <- rstan::unconstrain_pars(fit, pars)
        lp <- rstan::log_prob(fit, upars, adjust_transform=TRUE)
        lp.unadj <- rstan::log_prob(fit, upars, adjust_transform=FALSE)
      }))
    #
    # Only keep results that worked
    sflist <- Filter(function(x) is.recursive(x) && ! is.null(x$lp), sflist)
    n.worked <- length(sflist)
    if (0 == n.worked) {
      stop('No VB fits worked.')
    } else if (n.worked < num.inits) {
      warning('Only ', n.worked, '/', num.inits, ' VB fits succeeded.')
    }
    #
    # Get the log likelihoods as vectors
    dl$vb.lls <- sapply(sflist, function(sf) sf$lp)
    dl$vb.lls.unadj <- sapply(sflist, function(sf) sf$lp.unadj)
    #
    # Calculate which run had best lp for posterior mean parameters
    best.idx <- which.max(dl$vb.lls.unadj)
    #
    # Save the estimated tau for analysis
    dl$vb.tau <- sapply(sflist, function(sf) sf$pars$tau)
    #
    # Use those results
    dl$fit <- sflist[[best.idx]]$fit
  } else {
    # Run single variational Bayes
    dl$fit <- rstan::vb(
      get_model(dl),
      data=dl$stan.data,
      seed=init.idx,
      init=init.chain.good.tau(init.idx),
      ...)
  }
  return(dl)
}


#' Analyse the samples and gather the convergence statistics. Note that
#' this only makes sense if the model was fit by sampling rather than by
#' variational Bayes.
#'
#' @param dl de.lorean object
#'
#' @export
#'
examine.convergence <- function(dl) {
    within(dl, {
        pars <- c("tau", "psi", "S", "omega")
        summ <- rstan::monitor(fit,
                        print=FALSE,
                        pars=pars)
        ignore.names <- stringr::str_detect(rownames(summ),
                                   "^(predictedvar|predictedmean)")
        rhat.sorted <- sort(summ[! ignore.names, "Rhat"])
        rhat.df <- data.frame(
            rhat=rhat.sorted,
            param=names(rhat.sorted),
            parameter=stringr::str_match(names(rhat.sorted), "^[[:alpha:]]+"))
        rm(summ)
        rm(pars)
    })
}


# The dimensions of the model parameters
#
# @param dl de.lorean object
#
model.parameter.dimensions <- function(dl) {
    sample.dims <- list(
        lp__=c(),
        S=c("c"),
        tau=c("c"),
        psi=c("g"),
        omega=c("g"),
        predictedmean=c("g", "t"),
        predictedvar=c("g", "t"),
        logmarglike=c("g")
    )
    if (! dl$opts$model.estimates.cell.sizes) {
        sample.dims$S <- NULL
    }
    sample.dims
}


# Create a function that melts sampled parameters into long-format data frames.
#
# @param dl de.lorean object
# @param include.iter Whether to include the iteration as a melted variable
#
sample.melter <- function(dl, include.iter=TRUE) {
    function(sample.list, sample.dims) {
        melt.var <- function(param) {
            # message(param)
            if (include.iter) {
                varnames <- c("iter", sample.dims[[param]])
            } else {
                varnames <- sample.dims[[param]]
            }
            melt(sample.list[[param]], varnames, value.name=param)
        }
        sapply(names(sample.dims), melt.var)
    }
}


# Join extra data to tau samples.
#
# @param dl de.lorean object
#
join.tau.samples <- function(dl, tau.samples) {
    with(dl,
         tau.samples
             %>% left_join(cell.map)
             %>% mutate(tau.offset=tau-obstime))
}


#' Process the posterior, that is, extract and reformat the samples from
#' Stan. We also determine which sample has the highest likelihood; this
#' is labelled as the 'best' sample.
#'
#' @param dl de.lorean object
#'
#' @export
#'
process.posterior <- function(dl) {
  within(dl, {
    # Define a function to melt samples into a long format
    samples.l <- sample.melter(dl)(rstan::extract(dl$fit, permuted = TRUE),
                                   model.parameter.dimensions(dl))
    best.sample <- which.max(samples.l$lp__$lp__)
    if (TRUE %in% gene.map$is.held.out) {
      mean.held.out.marg.ll <-
        mean((
          samples.l$logmarglike %>%
          left_join(gene.map) %>%
          filter(is.held.out))$logmarglike)
      message('Mean held out marginal log likelihood per cell: ',
              mean.held.out.marg.ll / stan.data$C)
    }
    #
    # Include meta data in tau samples
    samples.l$tau <- join.tau.samples(dl, samples.l$tau)
    #
    # Get parameters from best iteration
    best.m <- lapply(samples.l, function(s) filter(s, iter == best.sample))
  })
}


# Optimise the log posterior starting at a particular sample or from some
# other set of parameters.
#
# @param dl de.lorean object
# @param sample.iter Sample to optimise (defaults to best sample).
#
optimise.sample <- function(
    dl,
    parameters=sample.parameters(dl, sample.iter=sample.iter),
    sample.iter=dl$best.sample,
    ...)
{
    with(dl, {
        optimised <- rstan::optimizing(get_model(dl),
                                data=dl$stan.data,
                                init=parameters,
                                as_vector=FALSE,
                                ...)
        dims <- model.parameter.dimensions(dl)
        # Don't melt lp__ value
        dims$lp__ <- NULL
        samples.l <- sample.melter(dl, include.iter=FALSE)(optimised$par, dims)
        # Include meta data in tau samples
        samples.l$tau <- join.tau.samples(dl, samples.l$tau)
        # Include lp__ value
        samples.l$lp__ <- data.frame(lp__=optimised$value)
        samples.l
    })
}


# Bind a sample.
#
# @param dl de.lorean object
# @param samples Samples to bind to existing samples.
# @param sample.iter Iteration (defaults to -1).
#
bind.sample <- function(dl, samples, sample.iter=-1)
{
    within(
        dl,
        for (param in names(samples.l)) {
            samples.l[[param]] <- rbind(samples[[param]]
                                            %>% mutate(iter=sample.iter),
                                        samples.l[[param]])
        })
}


#' Optimise the best sample and update the best.sample index.
#'
#' @param dl de.lorean object
#' @param sample.to.opt Sample to optimise
#' @param new.best.sample Update to best sample index
#'
#' @export
#'
optimise.best.sample <- function(
    dl,
    sample.to.opt=dl$best.sample,
    new.best.sample=-1)
{
    dl$sample.optimised <- sample.to.opt
    dl <- bind.sample(dl, optimise.sample(dl, sample.iter=sample.to.opt))
    dl$sample.old.best <- dl$best.sample
    dl$best.sample <- new.best.sample
    dl
}


#' Analyse noise levels and assess which genes have the greatest
#' ratio of temporal variance to noise. These are labelled as the
#' 'genes.high.psi' genes.
#'
#' @param dl de.lorean object
#' @param num.high.psi How many genes with high variance to examine
#'
#' @export
#'
analyse.noise.levels <- function(dl, num.high.psi=25) {
    within(dl, {
        noise.levels <- (
            with(samples.l, left_join(psi, omega))
            %>% left_join(gene.map))
        # Summarise by gene
        gene.noise.levels <- (
            noise.levels
            %>% group_by(g)
            %>% dplyr::summarise(omega=mean(omega), psi=mean(psi))
            %>% left_join(gene.map)
            %>% arrange(-psi/omega))
        genes.high.psi <- head(gene.noise.levels$gene, num.high.psi)
    })
}


# Get the sampled parameter for the gene
#
# @param dl de.lorean object
# @param gene.idx Gene index
# @param param Parameter
# @param sample.iter Iteration to use (defaults to best.sample)
#
sampled.gene.param <- function(dl,
                               gene.idx,
                               param,
                               sample.iter=dl$best.sample) {
    with(dl, {
        filter(samples.l[[param]], gene.idx == g, sample.iter == iter)[[param]]
    })
}


#' Make predictions
#'
#' @param dl de.lorean object
#'
#' @export
#'
make.predictions <- function(dl) {
    within(dl, {
        predictions <- with(samples.l,
                            predictedmean
                            %>% left_join(predictedvar)
                            # %>% left_join(S)
                            %>% mutate(tau=test.input[t]))
    })
}


#' Fit held out genes
#'
#' @param dl de.lorean object
#' @param expr.held.out The expression matrix including the held out genes
#' @param sample.iter The sample to use to fit with
#'
fit.held.out <- function(
    dl,
    expr.held.out,
    sample.iter=dl$best.sample)
{
    with(dl, {
        if (opts$model.estimates.cell.sizes) {
            cell.posterior <- samples.l$S %>% filter(sample.iter == iter)
            expr.held.out <- t(t(expr.held.out) - cell.posterior$S)
        }
        # Calculate covariance over pseudotimes and capture times
        calc.K <- functional::Curry(cov.calc.gene,
                        dl,
                        include.test=F,
                        psi=exp(stan.data$mu_psi),
                        omega=exp(stan.data$mu_omega))
        tau <- tau.for.sample(dl, sample.iter=sample.iter)
        obstime <- cell.map$obstime
        K.tau <- calc.K(tau=tau)
        K.capture <- calc.K(tau=obstime)
        # Evaluate each held out gene under the GP model using the pseudotimes
        # and under a comparison model that uses the capture times instead.
        #
        calc.gp.marginals <- function(expr) {
            c(
                gp.log.marg.like(expr, K.tau),
                gp.log.marg.like(expr, K.capture))
        }
        fit.model <- function(expr, model=loess) {
            list(tau=model(expr~s(tau)))
        }
        apply(expr.held.out, 1, calc.gp.marginals)
        # list(
            # gp.marginals=sapply(held.out.genes, calc.gp.marginals),
            # loess=lapply(held.out.genes, functional::Curry(fit.model, model=gam)))
            # gam=lapply(held.out.genes, functional::Curry(fit.model, model=gam)))
    })
}


# Parameter values for sample
#
# @param dl de.lorean object
# @param sample.iter The sample we want the parameters for.
#
sample.parameters <- function(dl,
                              sample.iter=dl$best.sample,
                              param.names=names(dl$samples.l))
{
    parameters <- lapply(param.names,
                         function(param)
                             filter(dl$samples.l[[param]],
                                    iter == sample.iter)[[param]])
    names(parameters) <- param.names
    parameters
}


#' Test the fit of gamma and log-normal distributions to the data.
#'
#' @param vars Data to fit
#'
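#' @examples
#' # A sketch on simulated data: the dashed curve is the fitted gamma
#' # density, the dotted curve the fitted log-normal:
#' fits <- test.fit(rlnorm(100))
#' print(fits$gp)
#'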
#' @export
#'
test.fit <- function(vars) {
    fit.gamma <- MASS::fitdistr(vars, 'gamma')
    fit.lognormal <- MASS::fitdistr(vars, 'lognormal')
    gp <- (
        ggplot(data.frame(V=vars), aes(x=V))
        + geom_density()
        + stat_function(fun=functional::Curry(dgamma,
                                shape=fit.gamma$estimate['shape'],
                                rate=fit.gamma$estimate['rate']),
                        linetype='dashed')
        + stat_function(fun=functional::Curry(dlnorm,
                                meanlog=fit.lognormal$estimate['meanlog'],
                                sdlog=fit.lognormal$estimate['sdlog']),
                        linetype='dotted')
    )
    list(gamma=fit.gamma, lognormal=fit.lognormal, gp=gp)
}


# The iteration indexes of the samples
#
# @param dl de.lorean object
#
sample.iters <- function(dl) dl$samples.l$lp__$iter


#' Make a fit valid by running one iteration of the sampler.
#'
#' @param dl de.lorean object
#'
#' @export
#'
make.fit.valid <- function(dl) rstan::stan(fit=dl$fit, data=dl$stan.data, iter=1, chains=1)