R/ESS.R

Defines functions ESS.numeric ESS.factor ESS.phybreak ESS

Documented in ESS ESS.factor ESS.numeric ESS.phybreak

#' Effective sample size
#' @export ESS
ESS <- function(x) UseMethod("ESS")

#' @describeIn ESS Effective sample size of phybreak posterior.
#' @param x An object of class \code{phybreak}.
#' @return Effective sample sizes.
#' @details When applied to an object of class \code{phybreak}, \code{ESS} calculates for all parameters and continuous
#'   variables (infection times) the effective sample size (ESS) with the \code{\link[coda]{effectiveSize}} 
#'   function in \pkg{coda}. For the infectors, 
#'   a method is used that is similar to the method for the approximate ESS for phylogenetic trees, described in 
#'   Lanfaer et al (2016):
#'   \enumerate{
#'     \item Define as distance measure between two sampled infectors \eqn{D(i,j) = 0} if \eqn{i = j}, 
#'         and \eqn{D(i,j) = 1} if \eqn{i \neq= j}{i <> j}
#'     \item Calculate the mean squared distance f(k) between sampled infectors at intervals k = 1,2,... in the mcmc chain.
#'         The distance will increase with increasing interval k.
#'     \item Use the rate at which f(k) approaches the asymptote to calculate the ESS (see Lanfaer et al, 2016)
#'   }
#'   The latter method can also be directly called for single vectors of class \code{factor} or \code{integer}.
#' @author Don Klinkenberg \email{don@@xs4all.nl}
#' @references \href{http://dx.doi.org/10.1093/gbe/evw171}{Lanfaer et al. (2016)} Estimating 
#'   the effective sample size of tree topologies from Bayesian phylogenetic analyses. 
#'   \emph{Genome Biol Evol}, \strong{8}(8): 2319-2332.
#' @examples 
#' #First create a phybreak object
#' simulation <- sim_phybreak(obsize = 5)
#' MCMCstate <- phybreak(dataset = simulation)
#' 
#' MCMCstate <- burnin_phybreak(MCMCstate, ncycles = 20)
#' MCMCstate <- sample_phybreak(MCMCstate, nsample = 50, thin = 2)
#' ESS(MCMCstate)
#' @export
ESS.phybreak <- function(x) {
  ### tests
  if (!inherits(x, "phybreak")) {
    stop("object must be of class \"phybreak\"")
  }
  # Only with at least 2 posterior samples
  if(length(x$s$mu) < 2) stop("not possible with less than 2 posterior samples")
  
  # Get mcmc object and identify the columns with infectors
  mcmcobj <- suppressWarnings(get_mcmc(x))
  whichinfectors <- grepl("infector", colnames(mcmcobj))
  
  # Calculate the ESS per chain with the correct method
  res <- sapply(1:ncol(mcmcobj), function(i) {
    if(whichinfectors[i]) {
      ESS(as.numeric(mcmcobj[, i]))
    } else {
      coda::effectiveSize(mcmcobj[, i])
    }
  })
  
  # Add names and return
  names(res) <- colnames(mcmcobj)
  return(res)
}

#' @describeIn ESS Effective sample size of a categorical variable.
#' @param x Vector of class \code{factor}.
#' @export
ESS.factor <- function(x) {
  # length must be 2 at least
  if(length(x) < 2) return(NA_real_)  
  
  # maximum interval k to calculate mean distance
  max_int <- min(floor(length(x)/2), 250L)
  
  # make numerical vector
  x <- as.numeric(x)
  
  # no ESS if all samples are identical
  if(length(unique(x)) == 1) return(NaN)
  
  ### get ballpark m by looking at complete chain (intervals up to half chain length)
  # step 1: squared distance as function of interval
  intervalstart <- floor(length(x)/(2 * max_int))
  squared_distances <- rep(0, max_int)
  for(i in 1:max_int) {
    samplinginterval <- 1 + (i - 1) * intervalstart
    
    # distance is 0 if identical, 1 if different
    squared_distances[i] <- mean(x != c(tail(x, -samplinginterval), head(x, samplinginterval)))
  }
  
  # step 2: fit to exponential semivariogram to find asymptote
  Dpar <- if(squared_distances[1] >= mean(squared_distances)) {
    mean(squared_distances)
  } else {
    min(max(squared_distances),
        optim(c(.1, .1),
              function(x)
                sum(
                  (x[1] * (1 - exp(-x[2] * (1 + intervalstart*(0:(max_int - 1))))) -
                     squared_distances) ^ 2
                ))$par[1])
  }
  
  # step 3: no ESS if no asymptote has been reached
  if(all(tail(squared_distances, -1) - head(squared_distances, -1) > 0)) return(0)
  
  # step 4: calculate ballpark m
  mpar <- intervalstart * (which(squared_distances >= Dpar)[1] - 1) + 1
  
  
  ### get final m by calculating f(k) for k <= 1.25*mpar
  # step 1: squared distance as function of interval
  intervalend <- min(intervalstart, ceiling(mpar/200))
  sqdists <- rep(0, max_int)
  for(i in 1:max_int) {
    samplinginterval <- i * intervalend
    sqdists[i] <- mean(x != c(tail(x, -samplinginterval), head(x, samplinginterval)))
  }
  
  # step 2: calculate m as lowest k where f is within 5% of asymptote
  mpar <- intervalend * (which(sqdists > 0.95 * Dpar)[1])
  
  ### calculate other parameters of equation
  Dpar <- max(Dpar, sqdists)
  Npar <- length(x)
  
  ### calculate right-hand side of equation
  if(mpar == 0) {
    return(Npar)
  } else {
    right_hand_side <- intervalend * sum(
      sapply(1:(mpar/intervalend),
             function(x) sum(sqdists[x] * (Npar - x))
      )
    )
    right_hand_side <- right_hand_side + Dpar*((Npar - mpar) * (Npar - mpar + 1)/2)
    right_hand_side <- min(right_hand_side/(2 * Npar ^ 2), Dpar/4)
    
    ### final result up to a maximum of the chain length
    return(min(Npar, 1/(1 - 4 * right_hand_side / Dpar)))
  }
}

#' @describeIn ESS Effective sample size of a categorical variable.
#' @param x Vector of class \code{numeric}, containing only integers. This will be treated as a categorical variable.
#' @export
ESS.numeric <- function(x) {
  ESS(as.factor(as.integer(x)))
}
donkeyshot/phybreak documentation built on Sept. 17, 2021, 9:32 p.m.