R/simulate_random_samples.R

Defines functions simulate_random_samples

Documented in simulate_random_samples

#' Generate a samples for a mixture.
#'
#' Creates random reporting unit (rho) and collection (omega) proportions, and a
#' \code{sim_colls} vector for simulation of individual genotypes, based on the methods
#' used in Hasselman \emph{et al.} (2015)
#'
#' @param RU_starts a vector delineating the reporting units in \code{RU_vec};
#' generated by \code{tcf2param_list}
#' @param RU_vec a vector of collection indices, grouped by reporting unit;
#' generated by \code{tcf2param_list}
#' @param size The number of individuals desired in the mixture sample.  Default = 100.  This is
#' ignored without a warning if alpha_repunit is specified with counts (a cnt column) or if alpha_collection
#' is specified with counts (a cnt column).
#' @param alpha_repunit If a vector, this is the dirichlet parameter for simulating
#' the proportions of reporting units. Gets recycled to the number of reporting units. Default is 1.5.
#' Otherwise, this could be a two-column data frame.  The first column must be named "repunit" and the
#' second one must be one of "dirichlet", "ppn", or "cnt", according to whether you wish to
#' specify dirichlet parameters, or proportions, or exact counts, respectively, for each population.
#' @param alpha_collection The dirichlet parameter for simulating proportions of collections within reporting units. Default = 1.5
#' @param coll_sub_dirichlet_default If you are providing a data frame with requested sub_dirichlet pars
#' for collections, and you don't specifically list one, this is the value it gets.
#' @return a list with three elements.
#' The first two are a rho vector and an omega vector, respectively. The third is a vector of origins for
#' simulated individuals, sampled from the collections with probabilities = omega
#' @export
#' @keywords internal
simulate_random_samples <- function(RU_starts, RU_vec, size = 100, alpha_repunit = 1.5, alpha_collection = 1.5, coll_sub_dirichlet_default = 1.5) {

  # first thing off the bat, get a vector of the collections indices, and their names
  # in the order that the omega vectors are listed:
  collections_in_order <- sort(RU_vec)

  #### first take care of all the repunit input parameters ####
  if (any(class(alpha_repunit) == "data.frame")) {
    #message("alpha_repunit is a data.frame.  Processing as such")
    stopifnot(ncol(alpha_repunit) == 2)
    stopifnot(names(alpha_repunit)[1] == "repunit")
    stopifnot(names(alpha_repunit)[2] %in% c("dirichlet", "ppn", "cnt"))

    repus_mismatch <- base::setdiff(alpha_repunit$repunit, names(RU_starts))
    if (length(repus_mismatch > 0)) {
      stop("Requesting alpha_repunit for repunit(s) not in data set: ", paste(names(repus_mismatch), collapse = ", "))
    }

    # make a tibble of the requested values
    repunit_requested <- tibble::tibble(repunit = names(RU_starts)[1:(length(RU_starts) - 1)]) %>%
      dplyr::left_join(alpha_repunit, by = "repunit")

    # store this for easy access
    spec_repu <- names(alpha_repunit)[2]

    # and set non-requested values to 0
    repu_vals <- repunit_requested[[2]]
    repu_vals[is.na(repu_vals)] <- 0

  } else {# if alpha_repunit is not a data frame, then just simulate em
    repu_vals <- rep(alpha_repunit, length(RU_starts) - 1)
    spec_repu <- "single"
  }


  #### then take care of the collection input parameters ####
  if (any(class(alpha_collection) == "data.frame")) {
    #message("alpha_collection is a data.frame.  Processing as such")
    stopifnot(ncol(alpha_collection) == 2)
    stopifnot(names(alpha_collection)[1] == "collection")
    stopifnot(names(alpha_collection)[2] %in% c("dirichlet", "ppn", "cnt", "sub_dirichlet", "sub_ppn"))

    colls_mismatch <- base::setdiff(alpha_collection$collection, names(collections_in_order))
    if (length(colls_mismatch > 0)) {
      stop("Requesting alpha_collection for collections(s) not in data set: ", paste(colls_mismatch, collapse = ", "))
    }

    # make a tibble of the requested values
    collection_requested <- tibble::tibble(collection = names(collections_in_order)) %>%
      dplyr::left_join(alpha_collection, by = "collection")

    spec_coll <- names(alpha_collection)[2]
    coll_vals <- collection_requested[[2]]

    # now, we go through the tedious process of the different cases, and set
    # the NA values appropriately.
    if (spec_coll == "dirichlet") {
      coll_vals[is.na(coll_vals)] <- 0
    } else if (spec_coll == "ppn") {
      coll_vals[is.na(coll_vals)] <- 0
    } else if (spec_coll == "cnt") {
      coll_vals[is.na(coll_vals)] <- 0
    } else if (spec_coll == "sub_dirichlet") {
      coll_vals[is.na(coll_vals)] <- coll_sub_dirichlet_default
    } else if (spec_coll == "sub_ppn") {
      coll_vals[is.na(coll_vals)] <- 0.0000001
    }
  } else {
    spec_coll <- "single"
    coll_vals <- rep(alpha_collection, length(RU_vec))
  }

  #### Now the easy portion --- non-sub collection spec  ####
  if (spec_coll %in% c("dirichlet", "ppn", "cnt")) {
    if (spec_coll == "dirichlet") {
      if (spec_repu != "single") warning("specification for repunit ppn, dirichlet, or counts is ignored when collections are dirichlet.  To silence this warning, leave alpha_repunit unset")
      omega <- gtools::rdirichlet(1, coll_vals)[1, ]
      sim_coll = sample(collections_in_order, size = size, replace = TRUE, prob = omega)
    } else if (spec_coll == "ppn") {
      if (spec_repu != "single") warning("specification for repunit ppn, dirichlet, or counts is ignored when collections are ppn.  To silence this warning, leave alpha_repunit unset")
      omega <- coll_vals / sum(coll_vals)
      sim_coll = sample(collections_in_order, size = size, replace = TRUE, prob = omega)
    } else if (spec_coll == "cnt") {
      if (spec_repu != "single") warning("specification for repunit ppn, dirichlet, or counts is ignored when collections are cnt.  To silence this warning, leave alpha_repunit unset")
      if (sum(coll_vals) != size) warning("sum of collection cnt is not equal to size.  resetting size to the sum.")
      sim_coll = rep(collections_in_order, coll_vals)
      omega <- coll_vals / sum(coll_vals)
    }
    # and finally we build up the rho from the bottom up.
    rho <- numeric(length(RU_starts) - 1)
    for (x in 1:(length(RU_starts) - 1)) {
      rho[x] <- sum(omega[RU_vec[(RU_starts[x] + 1):RU_starts[x + 1]]])
    }
  }  else {#### otherwise we do it from the top down---collection is sub_ppn and sub_dirichlet and single ####

    # first thing we have to do is get a collections proportions vector (which will get normalized within repunits)
    if (spec_coll == "sub_dirichlet" | spec_coll == "single") {
      coll_ppn <- gtools::rdirichlet(1, coll_vals)[1, ]
    } else if (spec_coll == "sub_ppn") {
      coll_ppn <- coll_vals / sum(coll_vals)
    }

    # now the most non-standard case is when repu is counts, so we start with that
    # and the others cases are all similar and simpler
    if (spec_repu == "cnt") {  # if it is counts we get repu_cnts but we still want a rho vector to report
      rho <- repu_vals / sum(repu_vals)
      repu_cnts <- repu_vals
      #if (sum(repu_cnts) != size) warning("User specified repunit counts which supersede the size parameter")

      # then we get the total number of individuals from each collection by doing
      # multinomial sampling
      coll_counts <- integer(length(coll_vals))
      for (x in 1:(length(RU_starts) - 1)) {
        the_idxs <- RU_vec[(RU_starts[x] + 1):RU_starts[x + 1]]  # these are the indices in the coll_counts vector of the collections
        if (repu_cnts[x] > 0) {
          coll_counts[the_idxs] <- rmultinom(1, repu_cnts[x], coll_ppn[the_idxs] / sum(coll_ppn[the_idxs]))[,1]
        }
      }

      # and we use those numbers to make the sim_coll
      sim_coll = rep(collections_in_order, coll_counts)

      # and, though we don't really use it, we will report the omega as the counts
      # taken as proportions
      omega <- coll_counts / sum(coll_counts)

      # so, that has given us rho, omega, and sim_coll, so we are done
    } else {# for repunit dirichlet, ppn, or single, we compute rho first, then omega within that, then sim_coll
      if (spec_repu == "dirichlet" || spec_repu == "single") {
        # simulate reporting unit mixing proportion from the dirichlet parameters
        rho <- gtools::rdirichlet(1, repu_vals)[1, ]
      } else if (spec_repu == "ppn") {
        rho <- repu_vals / sum(repu_vals)  # make sure they are normalized so they sum to one
      }

      # now we compute omega from the rho values and the coll_ppn vector we computed above
      omega <- numeric(length(RU_vec))
      for (x in 1:(length(RU_starts) - 1)) {
        the_idxs <- RU_vec[(RU_starts[x] + 1):RU_starts[x + 1]]  # these are the indices in the RU_vec vector of the collections
        omega[the_idxs] <- (coll_ppn[the_idxs] / sum(coll_ppn[the_idxs])) * rho[x]
      }

      # now, sample the collection indexes from the collection mixing proportions
      sim_coll = sample(collections_in_order, size = size, replace = TRUE, prob = omega)
    }
  }


  # put names on rho and omega
  names(rho) <- names(RU_starts)[-length(RU_starts)]
  names(omega) <- names(collections_in_order)

  # finally return it all in a list
  list(rho = rho, omega = omega, sim_coll = sim_coll)
}
benmoran11/rubias documentation built on Feb. 1, 2024, 10:52 p.m.