R/sample_partitions_by_half.R

Defines functions sample_partitions_similar_weights_by_half sample_partitions_by_half

Documented in sample_partitions_by_half sample_partitions_similar_weights_by_half

#' Generate Random, Non-Overlapping Partitions of Indices with Reference vs Rest
#'
#' @param n Number of elements to partition.
#'
#' @param fraction A numeric of length two in (0,1/2] specifying the size of
#' each partition relative to `n`, where the first element specifies the
#' "reference" partition and the second the "test" partition.
#' For backward compatible reasons, if of length one, then it's equivalent to
#' specifying `fraction = c(reference = 1/2, test = fraction)`.
#'
#' @param warn If `TRUE`, a warning is produced if the partitions produced
#' are not of equal size.
#'
#' @return A list of two disjoint random non-overlapping partitions where each
#' element holds indices in \eqn{{1, 2, ..., n}}.  The first element holds the
#' 'reference' partition and the second the 'test' partition.
#' As an example, if `fraction[1] == 1/2`, then the reference partition
#' \eqn{P_0.5} holds \eqn{n/2} indices and the test partition holds indices in
#' \eqn{{1, 2, ..., n} \ P_0.5} where the size of the partitions corresponds
#' to the sizes specified by `fraction[2]`.
#'
#' @export
sample_partitions_by_half <- function(n, fraction, warn = TRUE) {
  stop_if_not(is.numeric(n), length(n) == 1L, !is.na(n), n > 0)
  stop_if_not(is.numeric(fraction), length(fraction) %in% 1:2,
              !anyNA(fraction), all(fraction > 0), all(fraction <= 1/2))
  if (length(fraction) == 1L) {
    .Deprecated(msg = "sample_partitions_by_half(): Argument 'fraction' should be of length two")
    fraction <- c(reference = 1/2, test = fraction)
  }
  stopifnot(length(fraction) == 2L)
  if (is.null(names(fraction))) names(fraction) <- c("reference", "test")
  stopifnot(all(names(fraction) == c("reference", "test")))
  
  parts <- sample_partitions(n = n, fraction = 1/2)

  ## Sanity check
  stop_if_not(length(parts) == 2L)

  ## Down-sample partitions?
  for (ff in seq_along(fraction)) {
    if (fraction[ff] < 1/2) {
      idxs <- parts[[ff]]
      nidxs <- length(idxs)
      size <- min(round(fraction[ff] * n), nidxs)
      parts[[ff]] <- idxs[sample.int(nidxs, size = size)]
    }
  }

  names(parts) <- c("reference", "test")
  attr(parts, "fraction") <- fraction
  attr(parts, "n") <- n

  ## Sanity check
  stop_if_not(length(parts) == 2L, all(names(parts) == names(fraction)))
  
  parts
}



#' Generate Random, Non-Overlapping, Similarly-Weighted, Two-Set Partition of Indices
#'
#' @param w Numeric vector of `n` non-negative, finite weights.
#' Weights are normalized such that `sum(w)` equals one.
#'
#' @param fraction A numeric of length two in (0,1/2] specifying the size of
#' each partition relative to `n`, where the first element specifies the
#' "reference" partition and the second the "test" partition.
#' For backward compatible reasons, if of length one, then it's equivalent to
#' specifying `fraction = c(reference = 1/2, test = fraction)`.
#'
#' @param w_tolerance Maximum allowed difference between target weight of
#' each of the two sets (e.g. `fraction * 1`) and the actual total weight
#' of the sets (i.e. `sum(w[set])`.  If _both_ sets are within the tolerance,
#' the partition is accepted, otherwise rejected.
#'
#' @param max_rejections The maximum number of rejections before giving up.
#'
#' @param warn If `TRUE`, a warning is produced if the partitions produced
#' are not of equal size.
#'
#' @return A list of two disjoint random non-overlapping partitions where each
#' element holds indices in \eqn{{1, 2, ..., n}}.  The first element holds the
#' 'reference' partition and the second the 'test' partition.
#' Attribute `weights` gives the total normalized weight of each set.
#' If no accepted sample was found, then `NA` is returned.
#' Attribute `count` gives the number of internal samples produced before
#' arriving at an accepted or rejected partition.
#'
#' @importFrom parallel splitIndices
#' @export
sample_partitions_similar_weights_by_half <- function(w, fraction, w_tolerance = 0.01, max_rejections = 100L, warn = TRUE) {
  stop_if_not(is.numeric(w), length(w) > 0, !anyNA(w), all(w > 0))
  stop_if_not(is.numeric(w_tolerance), length(w_tolerance) == 1L,
              !is.na(w_tolerance), w_tolerance >= 0, w_tolerance <= 1)
  stop_if_not(is.numeric(max_rejections), length(max_rejections) == 1L,
              !is.na(max_rejections), max_rejections >= 0)
  stop_if_not(is.numeric(fraction), length(fraction) %in% 1:2,
              !anyNA(fraction), all(fraction > 0), all(fraction <= 1/2))
  if (length(fraction) == 1L) {
    .Deprecated(msg = "sample_partitions_similar_weights_by_half(): Argument 'fraction' should be of length two")
    fraction <- c(reference = 1/2, test = fraction)
  }
  stopifnot(length(fraction) == 2L)
  if (is.null(names(fraction))) names(fraction) <- c("reference", "test")
  stopifnot(all(names(fraction) == c("reference", "test")))

  ## Normalize weights
  w <- w / sum(w)
  n <- length(w)

  parts <- sample_partitions_similar_weights(w = w, fraction = 1/2, w_tolerance = w_tolerance, max_rejections = max_rejections, warn = warn)

  ## Failed to find a solution?
  if (!is.list(parts) && is.na(parts)) return(parts)

  ## Sanity check
  stop_if_not(length(parts) == 2L)

  ## Down-sample partitions?
  for (ff in seq_along(fraction)) {
    if (fraction[ff] < 1/2) {
      idxs <- parts[[ff]]
      nidxs <- length(idxs)
      size <- min(round(fraction[ff] * n), nidxs)
      parts[[ff]] <- idxs[sample.int(nidxs, size = size)]
    }
  }
  
  names(parts) <- c("reference", "test")
  attr(parts, "fraction") <- fraction
  attr(parts, "n") <- n

  ## Sanity check
  stop_if_not(length(parts) == 2L, all(names(parts) == names(fraction)))

  parts
}
HenrikBengtsson/TopDomStudy documentation built on May 14, 2021, 1:49 p.m.