R/compare-samplers.R

Defines functions simulation.result eval.sampler compare.samplers

Documented in compare.samplers simulation.result

# From SamplerCompare, (c) 2010 Madeleine Thompson

# compare-samplers.R contains compare.samplers and its support
# function eval.sampler, which runs an individual simulation.
# See "Graphical Comparison of MCMC Samplers" (http://arxiv.org/abs/1011.4457)
# for discussion of the figures of merit used here.

# compare.samplers is the main entry point for the SamplerCompare
# package.  See ?compare.samplers for more information.

compare.samplers <- function(sample.size, dists, samplers,
                             tuning=1, trace=TRUE, seed=17,
                             burn.in=0.2, cores=1, completed.file=NULL) {
  # Ensure all distributions are of class dist and all samplers are
  # functions with a name attribute.

  stopifnot(all(sapply(dists, function(a) class(a)=='scdist')))

  stopifnot(all(sapply(samplers,
    function(a) class(a)=='function' && !is.null(attr(a,'name')))))

  # Define a temporary save file if necessary.

  if (is.null(completed.file)) {
    unlink.completed.file <- TRUE
    completed.file <- tempfile("compare.samplers")
  } else {
    unlink.completed.file <- FALSE
  }
  if (trace) {
    cat(sprintf('Simulation started at %s.\n', Sys.time()))
    cat(sprintf('Writing results to %s.\n', completed.file))
  }

  # Set up a data frame with a row for each simulation to run.

  jobs <- expand.grid(sampler.id=1:length(samplers), dist.id=1:length(dists),
                      tuning=tuning)

  # Set up locking for an incremental save file.

  if (cores>1) {
    completed.lock <- synchronicity::boost.mutex()
  } else {
    completed.lock <- NULL
  }
  
  # Save an empty result set with timing attributes to completed.file.

  sampler.comparison <- data.frame()
  attr(sampler.comparison, 'start.time') <- as.numeric(proc.time()['elapsed'])
  attr(sampler.comparison, 'start.stamp') <- Sys.time()
  save(sampler.comparison, file=completed.file)

  # Curry out all the parameters that do not vary simulation to simulation.

  eval.sampler.job.id <- function(job.id) {

    dist <- dists[[jobs$dist.id[job.id]]]
    sampler <- samplers[[jobs$sampler.id[job.id]]]
    tuning.param <- jobs$tuning[job.id]
    eval.sampler(dist, sampler, sample.size, tuning.param,
                 burn.in, trace, seed, completed.file, completed.lock)
  }

  # Call eval.sampler.job.id on each job id, possibly using multiple cores.

  stopifnot(cores>=1)
  if (cores==1) {
    results <- lapply(1:nrow(jobs), eval.sampler.job.id)
  } else {
    results <- parallel::mclapply(1:nrow(jobs), eval.sampler.job.id,
                                  mc.preschedule=FALSE, mc.cores=cores)
  }
#  RS <- do.call(rbind, lapply(results, data.frame))
  
  # Load the saved results and return them.  We could use the lapply
  # return value, but this ensures that callers get the same results
  # whether they get them from compare.samplers or completed.file.

  load(completed.file)
  if (unlink.completed.file)
    unlink(completed.file)
  if (trace)
    cat(sprintf('Simulation finished at %s, %.3gs elapsed.\n',
		 attr(sampler.comparison, 'last.stamp'),
		 attr(sampler.comparison, 'elapsed.time')))
  if (trace && !unlink.completed.file)
    cat(sprintf('Wrote results to %s.\n', completed.file))
	 
  return(sampler.comparison)
}

# Takes a distribution, a sampler, a sample size, and a tuning
# parameter and runs the associated simulation.  Returns a list with
# the distribution name, the sampler name, the tuning parameter, the
# autocorrelation time and information about its uncertainty, the
# number of log density evaluations, the processor time consumed, the
# two-norm of the error in the mean estimate, and a flag indicating
# whether the simulation was aborted.

eval.sampler <- function(dist, sampler, sample.size, tuning.param,
                         burn.in, trace=FALSE, seed=17,
                         completed.file, completed.lock) {

  # Set seed if requested.

  if (seed) {
    if (!exists('.Random.seed'))
      runif(1)
    saved.seed <- .Random.seed
    set.seed(seed)
  }

  # Run the sampler started at a random point on the unit hypercube
  # or an initial point generated by the distribution if available.

  if (is.null(dist$initial))
    x0 <- runif(dist$ndim)
  else
    x0 <- dist$initial()
  timings <- system.time(
    S <- sampler(target.dist=dist, x0=x0, sample.size=sample.size,
                 tuning=tuning.param) )
  stopifnot(!is.null(S$X) && !is.null(S$evals))

  # Compute value to return.

  rv <- simulation.result(dist, attr(sampler, 'name'), S$X,
    S$evals, S$grads, tuning.param, timings[['elapsed']], burn.in,
    sampler.expr=attr(sampler, 'name.expression'),
    aborted = nrow(S$X) < sample.size)

  # Print results of simulation on terminal if requested.

  if (trace)
    cat(sprintf('%s %s: %.3g (%.3g,%.3g) evals tuning=%.3g%s; act.y=%.3g\n',
                dist$name, attr(sampler,'name'), rv$act * rv$evals, 
                rv$act.025 * rv$evals, rv$act.975 * rv$evals,
                tuning.param, ifelse(rv$aborted, ' (aborted)', ''), rv$act.y))

  # Restore seed if previously modified.

  if (seed)
    .Random.seed <- saved.seed

  # Save results to incremental file if necessary.  This locking scheme will
  # deadlock if there is ever an error loading and saving the completed file.
  # This would usually mean the entire simulation needs to be re-run, but it
  # would be nice if this caused all workers to fail.  As it is, one worker
  # will fail and the others will hang.

  if (!is.null(completed.file)) {
    if (!is.null(completed.lock))
      synchronicity::lock(completed.lock)

    load(completed.file)
    sc.new <- rbind(sampler.comparison, rv)
    row.names(sc.new) <- 1:nrow(sc.new)
    attr(sc.new, 'start.time') <- attr(sampler.comparison, 'start.time')
    attr(sc.new, 'start.stamp') <- attr(sampler.comparison, 'start.stamp')
    sampler.comparison <- sc.new
    attr(sampler.comparison, 'elapsed.time') <-
      as.numeric(proc.time()['elapsed']) -
      attr(sampler.comparison, 'start.time')
    attr(sampler.comparison, 'last.stamp') <- Sys.time()
    save(sampler.comparison, file=completed.file)
    if (!is.null(completed.lock))
      synchronicity::unlock(completed.lock)
  }

  return(rv)
}
   
simulation.result <- function(target.dist, sampler.name, X,
    evals=NULL, grads=NULL, tuning=NULL, cpu=NULL, burn.in=0.2, y=NULL,
    sampler.expr=sprintf("plain('%s')", sampler.name),
    aborted=NA) {

  # Cast X to a matrix.  Necessary when it is an mcmc object from coda.

  X <- as.matrix(X)
  
  # Check and chop off burn-in segments of chains.

  stopifnot(target.dist$ndim==ncol(X))
  first.obs <- ceiling(burn.in*nrow(X))
  chain <- X[first.obs:nrow(X),,drop=FALSE]

  # Initialize return value.

  nNA <- as.numeric(NA)
  rv <- list(dist=target.dist$name, dist.expr=target.dist$name.expression,
             ndim=target.dist$ndim, sampler=sampler.name,
             sampler.expr=sampler.expr,
             tuning=tuning, act=nNA, act.025=nNA, act.975=nNA,
             act.y=nNA, act.y.025=nNA, act.y.975=nNA,
             evals=ifelse(is.null(evals), nNA, evals/nrow(X)),
             grads=ifelse(is.null(grads), nNA, grads/nrow(X)),
             cpu=ifelse(is.null(cpu), nNA, cpu/nrow(X)),
             err=nNA, aborted=aborted)

  if (is.null(rv$dist.expr))
    rv$dist.expr <- sprintf('plain("%s")', rv$dist)

  if (is.null(sampler.expr))
    rv$sampler.expr <- sprintf("plain('%s')", sampler.name)

  # Fill in maximal autocorrelation times for components of chain.

  acts <- ar.act(chain, true.mean=target.dist$mean)
  rv$act <- acts$act
  rv$act.025 <- acts$act.025
  rv$act.975 <- acts$act.975

  # Fill in autocorrelation times for log density if the caller
  # specifies the log density function or passes in log density states
  # explicitly.

  if (is.null(y) && !is.null(target.dist$log.density))
    y <- apply(chain, 1, target.dist$log.density)
  if (!is.null(y)) {
    acts.y <- ar.act(y, true.mean=target.dist$mean.log.dens)
    rv$act.y <- acts.y$act
    rv$act.y.025 <- acts.y$act.025
    rv$act.y.975 <- acts.y$act.975
  }

  # Compute error in sample mean if the true mean is known.

  if (!is.null(target.dist$mean))
    rv$err <- sqrt(sum((target.dist$mean - colMeans(chain))^2))

  # Return data as a data frame with strings for factors so rbind
  # works as expected.
  
  return(as.data.frame(rv, stringsAsFactors=FALSE))
}

Try the SamplerCompare package in your browser

Any scripts or data that you put into this service are public.

SamplerCompare documentation built on May 29, 2017, 10:14 p.m.