Nothing
#' Partition a reference dataset and estimate reporting group and collection proportions
#'
#' From a reference dataset, this draws (without replacement) a simulated mixture
#' dataset with randomly drawn population proportions,
#' then uses this in two different estimates of population mixture proportions:
#' maximum likelihood via EM-algorithm and posterior mean from MCMC.
#'
#' This method is referred to as "Monte Carlo cross-validation".
#' The input parameters for \code{assess_reference_mc} are more restrictive than those of
#' \code{assess_reference_loo}. Rather than allowing a \emph{data.frame} to specify Dirichlet
#' parameters, proportions, or counts for specific reporting units and collections,
#' \code{assess_reference_mc} only allows vector input (default = 1.5) for \code{alpha_repunit}
#' and \code{alpha_collection}. These inputs specify the uniform Dirichlet parameters for
#' all reporting units and collections, respectively.
#'
#' For mixture proportion generation, the rho values are first drawn using a stick-breaking
#' model of the Dirichlet distribution, but with proportions capped by \code{min_remaining}.
#' Stick-breaking is then used to subdivide each reporting unit into collections. In
#' addition to the constraint that mixture sampling without replacement cannot deplete
#' the number of individuals in each collection below \code{min_remaining}, a similar
#' constraint is placed upon the number of individuals left in reporting units,
#' determined as \code{min_remaining} * (# collections in reporting unit).
#'
#' Note that this implies that the data are only truly Dirichlet distributed when no
#' rejections based on \code{min_remaining} occur. This is a reasonable certainty with
#' sufficient reference collection sizes relative to the desired mixture size.
#'
#' @param reference a two-column format genetic dataset, with "repunit", "collection", and "indiv"
#' columns, as well as a "sample_type" column that has some "reference" entries.
#' @param gen_start_col the first column of genetic data in reference
#' @param reps number of reps to do
#' @param mixsize the number of individuals in each simulated mixture.
#' @param seed a random seed for simulations
#' @param alpha_repunit The dirichlet parameter for simulating the proportions of
#' reporting units. Default = 1.5
#' @param alpha_collection The dirichlet parameter for simulating proportions of collections
#' within reporting units. Default = 1.5
#' @param min_remaining the minimum number of individuals which should be conserved in
#' each reference collection during sampling without replacement to form the simulated mixture
#' @param alle_freq_prior a one-element named list specifying the prior to be used when
#' generating Dirichlet parameters for genotype likelihood calculations. Valid methods include
#' \code{"const"}, \code{"scaled_const"}, and \code{"empirical"}. See \code{?list_diploid_params}
#' for method details.
#' @examples
#' # only 5 reps, so it doesn't take too long. Typically you would
#' # do many more
#' ale_dev <- assess_reference_mc(alewife, 17, 5)
#' @export
assess_reference_mc <- function(reference, gen_start_col, reps = 50, mixsize = 100, seed = 5,
alpha_repunit = 1.5, alpha_collection = 1.5, min_remaining = 5,
alle_freq_prior = list("const_scaled" = 1)) {
# check that reference is formatted appropriately
ploidies <- check_refmix(reference, gen_start_col, "reference")
reference$repunit <- factor(reference$repunit, levels = unique(reference$repunit))
reference$collection <- factor(reference$collection, levels = unique(reference$collection))
params <- tcf2param_list(reference, gen_start_col, summ = F, alle_freq_prior = alle_freq_prior, ploidies = ploidies)
# get a data frame that has the repunits and collections
reps_and_colls <- reference %>%
dplyr::select(repunit, collection) %>%
dplyr::group_by(repunit, collection) %>%
dplyr::tally() %>%
dplyr::ungroup() %>%
dplyr::filter(n > 0) %>%
dplyr::mutate(coll_int = 1:length(unique(reference$collection)))
# set seed
set.seed(seed)
# get the constraints on the number of individuals to be drawn during the cross-validation
# min_remaining individuals must be left in the reference for each collection,
# and min_remaining * (#collections) for each reporting unit
coll_max_draw <- reps_and_colls$n - min_remaining
ru_max_draw <- lapply(levels(reference$repunit), function(ru){
out <- sum(coll_max_draw[reps_and_colls$coll_int[reps_and_colls$repunit == ru]])
}) %>% unlist()
reps_and_colls <- dplyr::select(reps_and_colls, -coll_int)
# Get random rhos and omegas, constrained by a minimum of min_remaining
# reference individuals per population after the draw
# using a stick breaking model of the Dirichlet distribution
draw_colls <- lapply(1:reps, function(x){
rho <- numeric(length(ru_max_draw))
omega <- numeric(length(coll_max_draw))
rho_sum <- 0
for (ru in 1:length(ru_max_draw)) {
rho[ru] <- min(ru_max_draw[ru]/mixsize,
(1 - rho_sum) * rbeta(1, alpha_repunit, 1.5 * (length(ru_max_draw) - ru)))
rho_sum <- rho_sum + rho[ru]
om_sum <- 0
c <- 1
for (coll in (params$RU_starts[ru] + 1):params$RU_starts[ru + 1]) {
omega[params$RU_vec[coll]] <- min(coll_max_draw[params$RU_vec[coll]]/mixsize,
(rho[ru] - om_sum) * rbeta(1, 1.5, 1.5 * (length((params$RU_starts[ru] + 1):params$RU_starts[ru + 1]) - c)))
om_sum <- om_sum + omega[params$RU_vec[coll]]
c <- c + 1
}
rho[ru] <- om_sum
}
# The omegas should always sum to one so long as there is a reasonable reference dataset size
# However, could sum to less than one if the proposal for the last rho/omega is rejected
# Therefore, include the following quick guarantee:
rho <- rho/sum(rho)
omega <- omega/sum(omega)
true_n <- round(omega * mixsize,0)
names(true_n) <- levels(reference$collection)
list(rho = rho, omega = omega, true_n = true_n)
})
# now extract the true values of rho and omega from that into some data frames
true_omega_df <- lapply(draw_colls, function(x) tibble::tibble(collection = levels(reference$collection), omega = x$omega)) %>%
dplyr::bind_rows(.id = "iter") %>%
dplyr::mutate(iter = as.integer(iter))
true_rho_df <- lapply(draw_colls, function(x) tibble::tibble(collection = levels(reference$repunit), rho = x$rho)) %>%
dplyr::bind_rows(.id = "iter") %>%
dplyr::mutate(iter = as.integer(iter))
# and finally, extract the true numbers of individuals from each collection into a data frame
true_sim_nums <- lapply(draw_colls, function(x) tibble::tibble(collection = levels(reference$collection), n = x$true_n)) %>%
dplyr::bind_rows(.id = "iter") %>%
dplyr::mutate(iter = as.integer(iter))
reps_and_colls <- reps_and_colls %>%
dplyr::select(-n)
#### cycle over the reps data sets, get parameters for the new reference, and get proportion estimates from each
estimates <- lapply(1:reps, function(x) {
# designate random indivuals as mixture samples, based on previosly chosen proportions
mc_data <- lapply(levels(reference$collection), function(coll){
coll_split <- reference %>%
dplyr::filter(collection == coll)
mix_idx <- sample(1:nrow(coll_split), draw_colls[[x]]$true_n[coll], replace = F)
coll_split$sample_type[mix_idx] <- "mixture"
coll_split
}) %>%
dplyr::bind_rows()
#get MCMC parameters (unique to each MC draw)
clean <- tcf2long(mc_data, gen_start_col)
rac <- reference_allele_counts(clean$long)
ac <- a_freq_list(rac)
coll_N <- rep(0, ncol(ac[[1]])) # the number of individuals in each population; not applicable for mixture samples
colls_by_RU <- dplyr::filter(clean$clean_short, sample_type == "reference") %>%
droplevels() %>%
dplyr::count(repunit, collection) %>%
dplyr::ungroup() %>%
dplyr::filter(n > 0) %>%
dplyr::select(-n)
PC <- rep(0, length(unique(colls_by_RU$repunit)))
for (i in 1:nrow(colls_by_RU)) {
PC[colls_by_RU$repunit[i]] <- PC[colls_by_RU$repunit[i]] + 1
}
RU_starts <- c(0, cumsum(PC))
RU_vec <- as.integer(colls_by_RU$collection)
names(RU_vec) <- as.character(colls_by_RU$collection)
mix_I <- allelic_list(clean$clean_short, ac, samp_type = "mixture")$int
coll <- rep(0,length(mix_I[[1]]$a)) # populations of each individual in mix_I; not applicable for mixture samples
mc_params <- list_diploid_params(ac, mix_I, coll, coll_N, RU_vec, RU_starts, alle_freq_prior = alle_freq_prior)
mc_params$locus_names <- names(ac)
mc_params$ploidies <- as.integer(unname(ploidies[mc_params$locus_names]))
logl <- geno_logL(mc_params)
logl_col_maxes <- apply(logl, 2, max)
logl_swept <- sweep(logl, 2, logl_col_maxes)
SL <- apply(exp(logl_swept), 2, function(x) x/sum(x))
# get the posterior mean estimates by MCMC
pi_out <- gsi_mcmc_1(SL = SL,
Pi_init = rep(1 / mc_params$C, mc_params$C),
lambda = rep(1 / mc_params$C, mc_params$C),
reps = 2000,
burn_in = 100,
sample_int_Pi = 0,
sample_int_PofZ = 0)
# get the MLEs by EM-algorithm
em_out <- gsi_em_1(SL, Pi_init = rep(1 / mc_params$C, mc_params$C), max_iterations = 10^6,
tolerance = 10^-7, return_progression = FALSE)
# put those in a data_frame
tibble::tibble(collection = levels(reference$collection),
post_mean = pi_out$mean$pi,
mle = em_out$pi
)
}) %>%
dplyr::bind_rows(.id = "iter") %>%
dplyr::mutate(iter = as.integer(iter))
#### Now, join the estimates to the truth and coerce factors back to characters ####
# first off, reps_and_colls must be converted to characters
reps_and_colls_char <- reps_and_colls %>%
dplyr::mutate(repunit = as.character(repunit),
collection = as.character(collection))
ret <- dplyr::left_join(true_omega_df, true_sim_nums, by = c("iter", "collection")) %>%
dplyr::left_join(., estimates, by = c("iter", "collection")) %>%
dplyr::mutate(n = ifelse(is.na(n), 0, n)) %>%
dplyr::left_join(., reps_and_colls_char, by = "collection") %>%
dplyr::select(iter, repunit, dplyr::everything())
# return that data frame
ret
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.