Nothing
#' Generate a samples for a mixture.
#'
#' Creates random reporting unit (rho) and collection (omega) proportions, and a
#' \code{sim_colls} vector for simulation of individual genotypes, based on the methods
#' used in Hasselman \emph{et al.} (2015)
#'
#' @param RU_starts a vector delineating the reporting units in \code{RU_vec};
#' generated by \code{tcf2param_list}
#' @param RU_vec a vector of collection indices, grouped by reporting unit;
#' generated by \code{tcf2param_list}
#' @param size The number of individuals desired in the mixture sample. Default = 100. This is
#' ignored without a warning if alpha_repunit is specified with counts (a cnt column) or if alpha_collection
#' is specified with counts (a cnt column).
#' @param alpha_repunit If a vector, this is the dirichlet parameter for simulating
#' the proportions of reporting units. Gets recycled to the number of reporting units. Default is 1.5.
#' Otherwise, this could be a two-column data frame. The first column must be named "repunit" and the
#' second one must be one of "dirichlet", "ppn", or "cnt", according to whether you wish to
#' specify dirichlet parameters, or proportions, or exact counts, respectively, for each population.
#' @param alpha_collection The dirichlet parameter for simulating proportions of collections within reporting units. Default = 1.5
#' @param coll_sub_dirichlet_default If you are providing a data frame with requested sub_dirichlet pars
#' for collections, and you don't specifically list one, this is the value it gets.
#' @return a list with three elements.
#' The first two are a rho vector and an omega vector, respectively. The third is a vector of origins for
#' simulated individuals, sampled from the collections with probabilities = omega
#' @export
#' @keywords internal
simulate_random_samples <- function(RU_starts, RU_vec, size = 100, alpha_repunit = 1.5, alpha_collection = 1.5, coll_sub_dirichlet_default = 1.5) {
# first thing off the bat, get a vector of the collections indices, and their names
# in the order that the omega vectors are listed:
collections_in_order <- sort(RU_vec)
#### first take care of all the repunit input parameters ####
if (any(class(alpha_repunit) == "data.frame")) {
#message("alpha_repunit is a data.frame. Processing as such")
stopifnot(ncol(alpha_repunit) == 2)
stopifnot(names(alpha_repunit)[1] == "repunit")
stopifnot(names(alpha_repunit)[2] %in% c("dirichlet", "ppn", "cnt"))
repus_mismatch <- base::setdiff(alpha_repunit$repunit, names(RU_starts))
if (length(repus_mismatch > 0)) {
stop("Requesting alpha_repunit for repunit(s) not in data set: ", paste(names(repus_mismatch), collapse = ", "))
}
# make a tibble of the requested values
repunit_requested <- tibble::tibble(repunit = names(RU_starts)[1:(length(RU_starts) - 1)]) %>%
dplyr::left_join(alpha_repunit, by = "repunit")
# store this for easy access
spec_repu <- names(alpha_repunit)[2]
# and set non-requested values to 0
repu_vals <- repunit_requested[[2]]
repu_vals[is.na(repu_vals)] <- 0
} else {# if alpha_repunit is not a data frame, then just simulate em
repu_vals <- rep(alpha_repunit, length(RU_starts) - 1)
spec_repu <- "single"
}
#### then take care of the collection input parameters ####
if (any(class(alpha_collection) == "data.frame")) {
#message("alpha_collection is a data.frame. Processing as such")
stopifnot(ncol(alpha_collection) == 2)
stopifnot(names(alpha_collection)[1] == "collection")
stopifnot(names(alpha_collection)[2] %in% c("dirichlet", "ppn", "cnt", "sub_dirichlet", "sub_ppn"))
colls_mismatch <- base::setdiff(alpha_collection$collection, names(collections_in_order))
if (length(colls_mismatch > 0)) {
stop("Requesting alpha_collection for collections(s) not in data set: ", paste(colls_mismatch, collapse = ", "))
}
# make a tibble of the requested values
collection_requested <- tibble::tibble(collection = names(collections_in_order)) %>%
dplyr::left_join(alpha_collection, by = "collection")
spec_coll <- names(alpha_collection)[2]
coll_vals <- collection_requested[[2]]
# now, we go through the tedious process of the different cases, and set
# the NA values appropriately.
if (spec_coll == "dirichlet") {
coll_vals[is.na(coll_vals)] <- 0
} else if (spec_coll == "ppn") {
coll_vals[is.na(coll_vals)] <- 0
} else if (spec_coll == "cnt") {
coll_vals[is.na(coll_vals)] <- 0
} else if (spec_coll == "sub_dirichlet") {
coll_vals[is.na(coll_vals)] <- coll_sub_dirichlet_default
} else if (spec_coll == "sub_ppn") {
coll_vals[is.na(coll_vals)] <- 0.0000001
}
} else {
spec_coll <- "single"
coll_vals <- rep(alpha_collection, length(RU_vec))
}
#### Now the easy portion --- non-sub collection spec ####
if (spec_coll %in% c("dirichlet", "ppn", "cnt")) {
if (spec_coll == "dirichlet") {
if (spec_repu != "single") warning("specification for repunit ppn, dirichlet, or counts is ignored when collections are dirichlet. To silence this warning, leave alpha_repunit unset")
omega <- gtools::rdirichlet(1, coll_vals)[1, ]
sim_coll = sample(collections_in_order, size = size, replace = TRUE, prob = omega)
} else if (spec_coll == "ppn") {
if (spec_repu != "single") warning("specification for repunit ppn, dirichlet, or counts is ignored when collections are ppn. To silence this warning, leave alpha_repunit unset")
omega <- coll_vals / sum(coll_vals)
sim_coll = sample(collections_in_order, size = size, replace = TRUE, prob = omega)
} else if (spec_coll == "cnt") {
if (spec_repu != "single") warning("specification for repunit ppn, dirichlet, or counts is ignored when collections are cnt. To silence this warning, leave alpha_repunit unset")
if (sum(coll_vals) != size) warning("sum of collection cnt is not equal to size. resetting size to the sum.")
sim_coll = rep(collections_in_order, coll_vals)
omega <- coll_vals / sum(coll_vals)
}
# and finally we build up the rho from the bottom up.
rho <- numeric(length(RU_starts) - 1)
for (x in 1:(length(RU_starts) - 1)) {
rho[x] <- sum(omega[RU_vec[(RU_starts[x] + 1):RU_starts[x + 1]]])
}
} else {#### otherwise we do it from the top down---collection is sub_ppn and sub_dirichlet and single ####
# first thing we have to do is get a collections proportions vector (which will get normalized within repunits)
if (spec_coll == "sub_dirichlet" | spec_coll == "single") {
coll_ppn <- gtools::rdirichlet(1, coll_vals)[1, ]
} else if (spec_coll == "sub_ppn") {
coll_ppn <- coll_vals / sum(coll_vals)
}
# now the most non-standard case is when repu is counts, so we start with that
# and the others cases are all similar and simpler
if (spec_repu == "cnt") { # if it is counts we get repu_cnts but we still want a rho vector to report
rho <- repu_vals / sum(repu_vals)
repu_cnts <- repu_vals
#if (sum(repu_cnts) != size) warning("User specified repunit counts which supersede the size parameter")
# then we get the total number of individuals from each collection by doing
# multinomial sampling
coll_counts <- integer(length(coll_vals))
for (x in 1:(length(RU_starts) - 1)) {
the_idxs <- RU_vec[(RU_starts[x] + 1):RU_starts[x + 1]] # these are the indices in the coll_counts vector of the collections
if (repu_cnts[x] > 0) {
coll_counts[the_idxs] <- rmultinom(1, repu_cnts[x], coll_ppn[the_idxs] / sum(coll_ppn[the_idxs]))[,1]
}
}
# and we use those numbers to make the sim_coll
sim_coll = rep(collections_in_order, coll_counts)
# and, though we don't really use it, we will report the omega as the counts
# taken as proportions
omega <- coll_counts / sum(coll_counts)
# so, that has given us rho, omega, and sim_coll, so we are done
} else {# for repunit dirichlet, ppn, or single, we compute rho first, then omega within that, then sim_coll
if (spec_repu == "dirichlet" || spec_repu == "single") {
# simulate reporting unit mixing proportion from the dirichlet parameters
rho <- gtools::rdirichlet(1, repu_vals)[1, ]
} else if (spec_repu == "ppn") {
rho <- repu_vals / sum(repu_vals) # make sure they are normalized so they sum to one
}
# now we compute omega from the rho values and the coll_ppn vector we computed above
omega <- numeric(length(RU_vec))
for (x in 1:(length(RU_starts) - 1)) {
the_idxs <- RU_vec[(RU_starts[x] + 1):RU_starts[x + 1]] # these are the indices in the RU_vec vector of the collections
omega[the_idxs] <- (coll_ppn[the_idxs] / sum(coll_ppn[the_idxs])) * rho[x]
}
# now, sample the collection indexes from the collection mixing proportions
sim_coll = sample(collections_in_order, size = size, replace = TRUE, prob = omega)
}
}
# put names on rho and omega
names(rho) <- names(RU_starts)[-length(RU_starts)]
names(omega) <- names(collections_in_order)
# finally return it all in a list
list(rho = rho, omega = omega, sim_coll = sim_coll)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.