R/utilities.R

Defines functions mixture_draw

Documented in mixture_draw

#' Separate a chosen proportion of a reference dataset into a mixture with known population proportions
#'
#' Takes a reference dataset and a set of population proportions, either at the collection
#' or reporting unit level. Randomly samples individuals to satisfy these desired proportions,
#' and splits them into a new "mixture" dataframe.
#'
#' @keywords internal
#'
#' @param D a two-column genetic dataframe with "indiv", "repunit", and "collection" columns
#' @param rhos a vector of the desired reporting unit proportions in the mixture set;
#' if not named, will be assumed to be ordered by order of appearance in the dataset
#' @param omegas the desired collection proportions in the mixture set
#' @param N the total size of the mixture set
#' @param min_remaining the fraction of any collection in the reference dataset which must remain
#' at the end of the draw
#'
#' @return \code{mixture_draw} returns a list of two data frames,
#' "mixture" being the random sample taken, and "reference" being the remaining samples
#'
#' @examples
#' rhos <- as.vector(gtools::rdirichlet(1, table(alewife$repunit)))
#' cross_val <- mixture_draw(D = alewife, rhos = rhos, N = 100, min_remaining = .005)
#'
#' @export
mixture_draw <- function(D, rhos = NULL, omegas = NULL, N, min_remaining = 0) {
  if(!is.null(rhos) && !is.null(omegas)) stop("Cannot specify proportions of both rho and omega")
  repidxs <- D %>%
    dplyr::select(repunit, collection) %>%
    dplyr::group_by(repunit, collection) %>%
    dplyr::tally() %>%
    dplyr::filter(n > 0)

  coll_props <- repidxs$n /sum(repidxs$n)

  if(sum(repidxs$n) < N) stop("Cannot take mixture sample of size greater than reference dataset")
  if(any(coll_props < min_remaining)) stop("one or more collections starting with proportion below the specified minimum")


  # deterministic sampling (rho is exactly the desired value)
  if(!is.null(rhos)) {
    if(!identical(all.equal(sum(rhos),1), TRUE)) stop("Desired proportions must sum to 1")

    if(is.null(names(rhos))) names(rhos) <- levels(as.factor(D$repunit))
    ru_ns <- table(D$repunit)
    samp_sizes <- as.vector(round2(rhos * N, 0))
    names(samp_sizes) <- names(rhos)
    draw <- lapply(names(rhos), function(repunit) {
      samp <- dplyr::slice_sample(D[D$repunit == repunit, ], n = samp_sizes[repunit])
    }) %>%
      do.call("rbind", .)
    new_D <- dplyr::setdiff(D, draw)
    new_repidxs <- new_D %>%
      dplyr::select(repunit, collection) %>%
      dplyr::group_by(repunit, collection) %>%
      dplyr::tally() %>%
      dplyr::filter(n > 0)

    coll_props <- new_repidxs$n /sum(new_repidxs$n)
  }

  else if(!is.null(omegas)) {
    if(!identical(all.equal(sum(omegas),1), TRUE)) stop("Desired proportions must sum to 1")

    if(is.null(names(omegas))) names(omegas) <- levels(D$collection)
    samp_sizes <- round2(omegas * N, 0)
    draw <- lapply(levels(repidxs$collection), function(collection) {
      samp <- dplyr::slice_sample(D[D$collection == collection, ], n = samp_sizes[collection])
    }) %>%
      do.call("rbind", .)
    new_D <- dplyr::setdiff(D, draw)
    new_repidxs <- new_D %>%
      dplyr::select(repunit, collection) %>%
      dplyr::group_by(repunit, collection) %>%
      dplyr::tally() %>%
      dplyr::filter(n > 0)

    coll_props <- new_repidxs$n /sum(new_repidxs$n)
  }

  draw$sample_type <- "mixture"
  new_D$sample_type <- "reference"
  out <- list("mixture" = draw, "reference" = new_D)

}


#' Round a given number, with 5 always rounded up
#'
#' Given a number and a digit to round to, returns the rounded number,
#' with 5 always rounded upwards.
#'
#' This function differs from \code{round}, which rounds 5 "towards the
#' even number". Rounding 5s up leads to bias when positive and negative numbers
#' are expected, but can be desired in some cases.
#'
#'
#'
#' @keywords internal
#'
#' @param x the data to be rounded
#' @param n the number of digits to round to
#'
#' @export
round2 = function(x, n) {
  posneg = sign(x)
  z = abs(x)*10^n
  z = z + 0.5
  z = trunc(z)
  z = z/10^n
  z*posneg
}

Try the rubias package in your browser

Any scripts or data that you put into this service are public.

rubias documentation built on Feb. 10, 2022, 1:06 a.m.