R/unif_stat_MC.R
In sphunif: Uniformity Tests on the Circle, Sphere, and Hypersphere

Documented in unif_stat_MC

#' @title Monte Carlo simulation of circular and (hyper)spherical uniformity
#' statistics
#'
#' @description Utility for performing Monte Carlo simulation of several
#' statistics for assessing uniformity on the (hyper)sphere
#' \eqn{S^{p-1}:=\{{\bf x}\in R^p:||{\bf x}||=1\}}{
#' S^{p-1}:=\{x\in R^p:||x||=1\}}, \eqn{p\ge 2}.
#'
#' \code{unif_stat_MC} provides a convenient wrapper for parallel
#' evaluation of \code{unif_stat}, the estimation of critical values under the
#' null distribution, and the computation of empirical powers under the
#' alternative.
#'
#' @inheritParams unif_test
#' @inheritParams r_unif
#' @param M number of Monte Carlo replications. Defaults to \code{1e4}.
#' @param r_H1 if provided, the computation of empirical powers is
#' carried out for the alternative hypothesis sampled with \code{r_H1}.
#' This must be a function with the same arguments and value as
#' \code{\link{r_unif_sph}} (see examples). Defaults to \code{NULL}, indicating
#' that the critical values are estimated from samples of \code{r_unif_sph}.
#' @param crit_val if provided, must be the critical values as returned by
#' \code{$stats_MC} in a call to \code{unif_stat_MC}. They are used for
#' computing the empirical powers of the tests present in \code{type}.
#' Defaults to \code{NULL}, which means that no power computation is done.
#' @param return_stats return the Monte Carlo statistics? If only the critical
#' values or powers are desired, \code{FALSE} saves memory in the returned
#' object. Defaults to \code{TRUE}.
#' @param stats_sorted sort the returned Monte Carlo statistics? If
#' \code{TRUE}, this is useful for evaluating faster the empirical cumulative
#' distribution function when approximating the distribution in
#' \code{\link{unif_stat_distr}}. Defaults to \code{FALSE}.
#' @param chunks number of chunks to split the \code{M} Monte Carlo
#' replications. Useful for parallelizing the simulation study in \code{chunks}
#' tasks containing \code{ceiling(M / chunks)} replications. Useful also for
#' avoiding memory bottlenecks when \code{M} is large. Defaults to
#' \cr\code{ceiling((n * M) / 1e5)}.
#' @param cores number of cores to perform the simulation. Defaults to \code{1}.
#' @param seeds if provided, a vector of size \code{chunks} for fixing the
#' seeds on each of the simulation chunks (useful for reproducing parallel
#' simulations). Specifically, for \code{k in 1:chunks}, seeds are
#' set as \code{set.seed(seeds[k], kind = "Mersenne-Twister")} in each chunk.
#' Defaults to \code{NULL} (no seed setting is done).
#' @inheritParams unif_stat
#' @param ... optional arguments to be passed to the \code{r_H1} sampler or to
#' \code{\link[foreach]{foreach}} (for example, \code{.export} to export global
#' variables or other functions to the \code{foreach} environment).
#' @return A list with the following entries:
#' \itemize{
#'   \item \code{crit_val_MC}: a data frame of size
#'   \code{c(length(alpha), length(type))}, with column names given by
#'   \code{type} and rows corresponding to the significance levels \code{alpha},
#'   that contains the estimated critical values of the tests.
#'   \item \code{power_MC}: a data frame of size
#'   \code{c(nrow(crit_val), length(type))}, with column names given by
#'   \code{type} and rows corresponding to the significance levels of
#'   \code{crit_val}, that contains the empirical powers of the tests. \code{NA}
#'   if \code{crit_val = NULL}.
#'   \item \code{stats_MC}: a data frame of size \code{c(M, length(type))}, with
#'   column names given by \code{type}, that contains the Monte Carlo
#'   statistics.
#' }
#' @details
#' It is possible to have a progress bar if \code{unif_stat_MC} is wrapped with
#' \code{\link[progressr:with_progress]{progressr::with_progress}} or if
#' \code{progressr::handlers(global = TRUE)} is invoked (once) by the user.
#' See the examples below. The progress bar is updated with the number of
#' finished chunks.
#'
#' All the tests reject for large values of the test statistic
#' (\code{max_gap = TRUE} is assumed for the Range test), so the critical
#' values for the significance levels \code{alpha} correspond to the
#' \code{alpha}-upper quantiles of the null distribution of the test statistic.
#'
#' The Monte Carlo simulation for the CCF09 test is made conditionally
#' on the choice of \code{CCF09_dirs}. That is, all the Monte
#' Carlo statistics share the same random directions.
#' @examples
#' ## Critical values
#'
#' # Single statistic, specific alpha
#' cir <- unif_stat_MC(n = 10, M = 1e2, type = "Ajne", p = 2, alpha = 0.15)
#' summary(cir$stats_MC)
#' cir$crit_val_MC
#'
#' # All circular statistics
#' cir <- unif_stat_MC(n = 10, M = 1e2, p = 2)
#' head(cir$stats_MC)
#' cir$crit_val_MC
#'
#' # All spherical statistics
#' sph <- unif_stat_MC(n = 10, M = 1e2, p = 3)
#' head(sph$stats_MC)
#' sph$crit_val_MC
#'
#' ## Using a progress bar
#'
#' # Define a progress bar
#' require(progress)
#' require(progressr)
#' handlers(handler_progress(
#'   format = ":spin [:bar] :percent Total: :elapsedfull End \u2248 :eta",
#'   clear = FALSE))
#'
#' # Call unif_stat_MC() within with_progress()
#' with_progress(unif_stat_MC(n = 10, M = 1e2, p = 3, chunks = 10))
#'
#' # With several cores
#' with_progress(unif_stat_MC(n = 10, M = 1e2, p = 3, chunks = 10, cores = 2))
#'
#' # Instead of using with_progress() each time, it is more practical to run
#' # handlers(global = TRUE)
#' # once to activate progress bars in your R session
#'
#' ## Power computation
#'
#' # Single statistic
#' cir_pow <- unif_stat_MC(n = 10, M = 1e2, type = "Ajne", p = 2,
#'                         crit_val = cir$crit_val_MC)
#' cir_pow$crit_val_MC
#' cir_pow$power_MC
#'
#' # All circular statistics
#' cir_pow <- unif_stat_MC(n = 10, M = 1e2, p = 2, crit_val = cir$crit_val_MC)
#' cir_pow$crit_val_MC
#' cir_pow$power_MC
#'
#' # All spherical statistics
#' sph_pow <- unif_stat_MC(n = 10, M = 1e2, p = 3, crit_val = sph$crit_val_MC)
#' sph_pow$crit_val_MC
#' sph_pow$power_MC
#' \donttest{
#' ## Custom r_H1
#'
#' # Circular
#' r_H1 <- function(n, p, M, l = 0.05) {
#'
#'   stopifnot(p == 2)
#'   Theta_to_X(matrix(runif(n * M, 0, (2 - l) * pi), n, M))
#'
#' }
#' dirs <- r_unif_sph(n = 5, p = 2, M = 1)[, , 1]
#' cir <- unif_stat_MC(n = 50, M = 1e2, p = 2, CCF09_dirs = dirs)
#' cir_pow <- unif_stat_MC(n = 50, M = 1e2, p = 2, r_H1 = r_H1, l = 0.10,
#'                         crit_val = cir$crit_val_MC, CCF09_dirs = dirs)
#' cir_pow$crit_val_MC
#' cir_pow$power_MC
#'
#' # Spherical
#' r_H1 <- function(n, p, M, l = 0.5) {
#'
#'   samp <- array(dim = c(n, p, M))
#'   for (j in 1:M) {
#'
#'     samp[, , j] <- mvtnorm::rmvnorm(n = n, mean = c(l, rep(0, p - 1)),
#'                                     sigma = diag(rep(1, p)))
#'     samp[, , j] <- samp[, , j] / sqrt(rowSums(samp[, , j]^2))
#'
#'   }
#'   return(samp)
#'
#' }
#' dirs <- r_unif_sph(n = 5, p = 3, M = 1)[, , 1]
#' sph <- unif_stat_MC(n = 50, M = 1e2, p = 3, CCF09_dirs = dirs)
#' sph_pow <- unif_stat_MC(n = 50, M = 1e2, p = 3, r_H1 = r_H1, l = 0.5,
#'                        crit_val = sph$crit_val_MC, CCF09_dirs = dirs)
#' sph_pow$power_MC
#'
#' ## Pre-built r_H1
#'
#' # Circular
#' dirs <- r_unif_sph(n = 5, p = 2, M = 1)[, , 1]
#' cir_pow <- unif_stat_MC(n = 50, M = 1e2, p = 2, r_H1 = r_alt, alt = "vMF",
#'                         kappa = 1, crit_val = cir$crit_val_MC,
#'                         CCF09_dirs = dirs)
#' cir_pow$power_MC
#'
#' # Spherical
#' dirs <- r_unif_sph(n = 5, p = 3, M = 1)[, , 1]
#' sph_pow <- unif_stat_MC(n = 50, M = 1e2, p = 3, r_H1 = r_alt, alt = "vMF",
#'                         kappa = 1, crit_val = sph$crit_val_MC,
#'                         CCF09_dirs = dirs)
#' sph_pow$power_MC
#' }
#' @export
unif_stat_MC <- function(n, type = "all", p, M = 1e4, r_H1 = NULL,
                         crit_val = NULL, alpha = c(0.10, 0.05, 0.01),
                         return_stats = TRUE, stats_sorted = FALSE,
                         chunks = ceiling((n * M) / 1e5), cores = 1,
                         seeds = NULL, Rayleigh_m = 1, cov_a = 2 * pi,
                         Rothman_t = 1 / 3, Cressie_t = 1 / 3, Pycke_q = 0.5,
                         Riesz_s = 1, CCF09_dirs = NULL, K_CCF09 = 25,
                         CJ12_reg = 3, ...) {

  # Check dimension
  if (p < 2) {

    stop("Dimension p must be p >= 2.")

  }

  # Chunk large n * M to avoid memory issues
  small_M <- M %/% chunks

  # Fix projections for the CCF09 test
  if ("CCF09" %in% type & is.null(CCF09_dirs)) {

    CCF09_dirs <- r_unif_sph(n = 50, p = p, M = 1)[, , 1]

  }

  # If the uniformity is simulated
  if (is.null(r_H1)) {

    r_H1 <- r_unif_sph

  }

  # Check if it is a data frame with colnames in avail_cir or avail_sph
  if (!is.null(crit_val)) {

    avail <- switch((p > 2) + 1, avail_cir_tests, avail_sph_tests)
    tests <- colnames(crit_val) %in% avail
    if (!all(tests)) {

      stop(paste0(
        "crit_val must be a data.frame with valid tests names as colnames, ",
        "check: ", paste(paste0("\"", colnames(crit_val)[!tests], "\""),
                         collapse = ", "), "."))

    }

  }

  # Extra arguments for foreach::foreach and r_H1
  dots <- list(...)
  foreach_args <- dots[names(dots) %in% names(formals(foreach::foreach))]
  r_H1_args <- dots[names(dots) %in% names(formals(r_H1))]

  # Check seeds
  if (!is.null(seeds) & length(seeds) != chunks) {

    warning(paste("seeds and chunks do not have the same length:",
                  "seeds are ignored."))
    seeds <- NULL

  }

  # Parallel backend
  old_dopar <- doFuture::registerDoFuture()
  old_plan <- future::plan(future::multisession(), workers = cores)
  options(future.rng.onMisuse = "ignore")
  on.exit({

    with(old_dopar, foreach::setDoPar(fun = fun, data = data, info = info))
    future::plan(old_plan)
    options(future.rng.onMisuse = NULL)

  })
  `%op%` <- foreach::`%dopar%`

  # Measure progress?
  if (requireNamespace("progressr", quietly = TRUE)) {

    prog <- progressr::progressor(along = 1:chunks)

  }

  # Monte Carlo
  k <- 0
  stats <- do.call(what = foreach::foreach,
                   args = c(list(k = 1:chunks, .combine = rbind,
                                 .inorder = TRUE, .multicombine = TRUE,
                                 .maxcombine = 100, .packages = "sphunif"),
                            foreach_args)) %op% {

    # Samples
    if (!is.null(seeds)) {

      set.seed(seeds[k], kind = "Mersenne-Twister")

    }
    X <- do.call(what = r_H1, args = c(list(n = n, p = p, M = small_M),
                                            r_H1_args))

    # Statistics
    stats <- unif_stat(data = X, type = type, Rayleigh_m = Rayleigh_m,
                       cov_a = cov_a, Rothman_t = Rothman_t,
                       Cressie_t = Cressie_t, Pycke_q = Pycke_q,
                       Riesz_s = Riesz_s, CCF09_dirs = CCF09_dirs,
                       K_CCF09 = K_CCF09, CJ12_reg = CJ12_reg)

    # Remove X
    rm(X)

    # Signal progress
    if (requireNamespace("progressr", quietly = TRUE)) {

      prog()

    }

    # Return stats
    stats

  }

  # Sort statistics
  if (stats_sorted & return_stats) {

    names <- names(stats)
    stats <- as.data.frame(sort_each_col(as.matrix(stats)))
    names(stats) <- names

  }

  # Build tables
  if (is.null(crit_val)) {

    # Critical values
    crit_val <- rbind(apply(stats, 2, quantile, probs = 1 - alpha,
                            na.rm = TRUE))
    crit_val <- as.data.frame(crit_val)
    rownames(crit_val) <- alpha

    # Power
    power <- NA

  } else {

    # In case crit_val contains more tests than the ones considered in type
    # and in a potentially different order, match them accordingly with stats
    names_stats <- colnames(stats)
    ind_tests <- match(x = names_stats, table = colnames(crit_val))
    if (any(is.na(ind_tests))) {

      stop(paste0(
        "No critical values in \"crit_val\" associated with the test(s) ",
        paste(paste0("\"", names_stats[is.na(ind_tests)], "\""),
              collapse = ", "), "."))

    }
    crit_val <- crit_val[, ind_tests, drop = FALSE]

    # Empirical rejection level
    power <- t(apply(crit_val, 1,
                     function(c) rowMeans(t(stats) > c, na.rm = TRUE)))
    if (ncol(crit_val) == 1) {

      power <- t(power)
      colnames(power) <- names_stats

    }
    power <- as.data.frame(power)

  }

  # Empty stats if not desired
  if (!return_stats) {

    stats <- NA

  }

  return(list("crit_val_MC" = crit_val, "power_MC" = power, "stats_MC" = stats))

}