# set the working directory always to the project directory (one level up) knitr::opts_knit$set(root.dir = normalizePath(rprojroot::find_rstudio_root_file()))
libraries:
library(tidyverse) library(rubias) library(stringr)
My goal here is to test the many different scenarios by which the composition of simulated samples can be specified for doing simulations with rubias.
I just overhauled the simulate_random_samples()
function and I want to both document its
use and test it.
Most users will never call it directly, but I will here to make sure that things are working correctly.
We will do this with the chinook
data set. Here is what the repunits and collections look like
in that data set
repc <- chinook %>% group_by(repunit, collection) %>% tally() %>% select(-n) repc
We will use that to left_join results on later.
In order to call simulated_random_samples()
we need to get some variables, that we get
this way:
p <- tcf2param_list(chinook, 5, summ = F)
Given a list of multiple outputs of simulate_random_samples() (obtained by lapplying multiple reps for example), this function collects the output into three tidy data frames.
tidy_srs <- function(L, repc) { # get the rho values rho <- lapply(L, function(x) { enframe(x$rho, name = "repunit", value = "rho") }) %>% bind_rows(.id = "rep") %>% mutate(rep = as.integer(rep)) # and then get the omega and n's omegan <- lapply(L, function(x) { ns <- as.data.frame(table(names(x$sim_coll))) %>% rename(collection = Var1, n = Freq) %>% mutate(collection = as.character(collection)) %>% as_tibble() enframe(x$omega, name = "collection", value = "omega") %>% left_join(., ns, by = "collection") }) %>% bind_rows(.id = "rep") %>% mutate(rep = as.integer(rep)) %>% mutate(n = ifelse(is.na(n), 0, n)) %>% left_join(repc, ., by = "collection") list(rho = rho, omegan = omegan) } ### Here is a function for summarizing those and returning those summarized values in a list # along with the originals. summ_it <- function(S) { rho_summ <- S$rho %>% group_by(repunit) %>% summarise(rho_mean = mean(rho), rho_var = var(rho), rho_min = min(rho), rho_max = max(rho)) omegan_summ <- S$omegan %>% group_by(repunit, collection) %>% summarise(omega_mean = mean(omega), n_mean = mean(n), omega_var = var(omega), n_var = var(n), omega_min = min(omega), omega_max = max(omega), n_min = min(n), n_max = max(n)) %>% group_by(repunit) %>% mutate(summed_rho = sum(omega_mean), npops = n()) S$rho_summ <- rho_summ S$omegan_summ <- omegan_summ S }
In order to put things in a good order on plots we make this:
repcF <- repc %>% mutate(repu_coll = paste(repunit, collection, sep = "--")) %>% select(repu_coll, everything()) repcF$repu_coll <- factor(repcF$repu_coll, levels = unique(repcF$repu_coll))
First gonna just test the repunit "single" value setting.
This is the simplest, default case. Let's get all the output:
res <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100) }) %>% tidy_srs(., repc) %>% summ_it()
Then look at it:
ggplot(res$rho_summ, aes(x = repunit, y = rho_mean)) + geom_point() + geom_segment(aes(y = rho_min, yend = rho_max, xend = repunit)) + coord_flip()
ggplot(res$omegan_summ, aes(x = collection, y = omega_mean, colour = as.factor(npops))) + geom_point() + geom_segment(aes(y = omega_min, yend = omega_max, xend = collection)) + coord_flip()
Finally look at the number of simulated individuals:
res$omegan_summ %>% group_by(repunit) %>% summarise(repu_n = sum(n_mean))
That checks out summing over reporting units.
And how about the number of collection, scaled by the number of collections in the repunit?
res$omegan_summ %>% mutate(scaled_n = n_mean * npops) %>% select(repunit, collection, scaled_n, everything())
I had introduced a bug, but have fixed it now. It seems to me that the omega vector that
Ben carried around in simulate_random_samples()
must have been ordered differently than the
natural levels(collections)
order. When I ran the above code (with a little [1,]
thrown in for
the rho
values, with the older version of the code that Ben wrote, the # of fish from each collection
is correct, but the omegas are wrong, suggesting that they are not ordered they way I thought they were.
When I sample the sim_colls I now do so from a a vector that is ordered by the levels of the collections and it seems to work.
For this rather contrived case, the reporting unit proportions are drawn from a Dirichlet(1.5), but then I can specify different proportions within rep_units of the collections, following a dirichlet distribution. For this test, everyone will get the default 1.5 except for a few collections in the CV fall and CV spring and California Coast. Namely, in my examples, I have:
sim_spec_examples$coll_sub_dirichlet
Let's give this thing a whirl. Just like before:
csd <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_collection = sim_spec_examples$coll_sub_dirichlet)}) %>% tidy_srs(., repc) %>% summ_it()
ggplot(csd$rho_summ, aes(x = repunit, y = rho_mean)) + geom_point() + geom_segment(aes(y = rho_min, yend = rho_max, xend = repunit)) + coord_flip()
Those look correct!
Now, how about the omegas?
ggplot(left_join(repcF, csd$omegan_summ), aes(x = repu_coll, y = omega_mean, colour = as.factor(npops))) + geom_point() + geom_segment(aes(y = omega_min, yend = omega_max, xend = repu_coll)) + coord_flip()
OK, that checks out. Now check sim_colls
csd$omegan_summ %>% left_join(repcF, .) %>% select(repu_coll, omega_mean, n_mean)
That check outs. Good!
We are going to do something similar, but we will specify weights for the collections, and in this case, the default is practically 0. So, for reporting units in which we specify at least one collection with a weight, they will get pretty much all of it.
sim_spec_examples$coll_sub_ppn
csp <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_collection = sim_spec_examples$coll_sub_ppn)}) %>% tidy_srs(., repc) %>% summ_it()
ggplot(csp$rho_summ, aes(x = repunit, y = rho_mean)) + geom_point() + geom_segment(aes(y = rho_min, yend = rho_max, xend = repunit)) + coord_flip()
Those look correct!
Now, how about the omegas?
ggplot(left_join(repcF, csp$omegan_summ), aes(x = repu_coll, y = omega_mean, colour = as.factor(npops))) + geom_point() + geom_segment(aes(y = omega_min, yend = omega_max, xend = repu_coll)) + coord_flip()
OK, that checks out. Now check sim_colls
csp$omegan_summ %>% left_join(repcF, .) %>% select(repu_coll, omega_mean, n_mean)
Yep. That checks out.
This is best done whilst not setting anything for alpha_repunit (else you get lots of warnings). We are going to make a contrived example here in which each reporting unit gets a certain weight, and that gets split between the collections in it according to their position in the reporting unit. This is how I did it, and it is now part of the package.
set.seed(555) ppn <- repc %>% group_by(repunit) %>% mutate(repwt = rep(runif(1, min = 1, max = 100), n())) %>% mutate(ppn = repwt / 1:n()) %>% ungroup() %>% select(collection, ppn) sim_spec_examples$coll_ppn <- ppn sim_spec_examples$coll_dirichlet <- ppn %>% rename(dirichlet = ppn)
Here is what the input looks like:
sim_spec_examples$coll_dirichlet
So, let's run it:
cdir <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_collection = sim_spec_examples$coll_dirichlet)}) %>% tidy_srs(., repc) %>% summ_it()
Then join it to what we asked for:
tmp <- sim_spec_examples$coll_dirichlet %>% mutate(n_exp = 100 * dirichlet / sum(dirichlet), omega_exp = dirichlet / sum(dirichlet)) comp_cdir <- left_join(tmp, cdir$omegan_summ) comp_cdir
Booyah! That looks just right:
ggplot(comp_cdir, aes(x = n_exp, y = n_mean)) + geom_point() + geom_point(aes(x = omega_exp * 100, y = omega_mean * 100), colour = "red")
We are going to do the same, but have them interpreted as collection proportions, rather than dirichlet parameters:
sim_spec_examples$coll_ppn
And here it goes:
cppn <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_collection = sim_spec_examples$coll_ppn)}) %>% tidy_srs(., repc) %>% summ_it()
tmp <- sim_spec_examples$coll_ppn %>% mutate(n_exp = 100 * ppn / sum(ppn), omega_exp = ppn / sum(ppn)) comp_cppn <- left_join(tmp, cppn$omegan_summ) comp_cppn
Yep. That is all good. Note that the variance of omega is 0 there.
Now, let's see if we can simulate a lot of samples with exactly the same number of individuals each time.
ccnt <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_collection = sim_spec_examples$coll_cnt)}) %>% tidy_srs(., repc) %>% summ_it()
OK, we get that warning. That is cool. In the future I think I could
throttle that warning and just say that size
is ignored in the docs.
tmp <- sim_spec_examples$coll_cnt %>% mutate(omega_exp = cnt / sum(cnt)) comp_ccnt <- left_join(tmp, ccnt$omegan_summ) comp_ccnt
Yep, that all checks out.
So, we need to test this with the three different "sub_collection"s.
First we make up a set of Dirichlet parameters for the different repunits
set.seed(555) sim_spec_examples$repunit_dirichlet <- tibble( repunit = unique(repc$repunit)) %>% mutate(dirichlet = runif(n(), min = -15, max = 15)) %>% filter(dirichlet > 0)
First we test the situation where collection is just give the default dirichlet parameter:
rdir <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_repunit = sim_spec_examples$repunit_dirichlet)}) %>% tidy_srs(., repc) %>% summ_it()
Then we compare that to what we expected
sim_spec_examples$repunit_dirichlet %>% mutate(rho_exp = dirichlet / sum(dirichlet)) %>% left_join(rdir$rho_summ, .)
And at the collection level:
sim_spec_examples$repunit_dirichlet %>% mutate(rho_exp = dirichlet / sum(dirichlet)) %>% left_join(rdir$omegan_summ, .)
That all looks good.
rdir_csd <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_repunit = sim_spec_examples$repunit_dirichlet, alpha_collection = sim_spec_examples$coll_sub_dirichlet)}) %>% tidy_srs(., repc) %>% summ_it()
Let's compute the expected proportions:
sim_spec_examples$repunit_dirichlet %>% ungroup() %>% mutate(dirch_ppn = dirichlet / sum(dirichlet)) %>% left_join(repc, .) %>% left_join(., sim_spec_examples$coll_sub_dirichlet) %>% mutate(sub_dirichlet = ifelse(is.na(sub_dirichlet), 1.5, sub_dirichlet), dirichlet = ifelse(is.na(dirichlet), 1.5, dirichlet)) %>% group_by(repunit) %>% mutate(exp_omega = dirch_ppn * sub_dirichlet / sum(sub_dirichlet)) %>% left_join(., rdir_csd$omegan_summ)
That looks like it all checks out.
rdir_csp <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_repunit = sim_spec_examples$repunit_dirichlet, alpha_collection = sim_spec_examples$coll_sub_ppn)}) %>% tidy_srs(., repc) %>% summ_it()
Let's compute the expected proportions:
sim_spec_examples$repunit_dirichlet %>% ungroup() %>% mutate(dirch_ppn = dirichlet / sum(dirichlet)) %>% left_join(repc, .) %>% left_join(., sim_spec_examples$coll_sub_ppn) %>% mutate(sub_ppn = ifelse(is.na(sub_ppn), 0.0000001, sub_ppn), dirichlet = ifelse(is.na(dirichlet), 1.5, dirichlet)) %>% group_by(repunit) %>% mutate(exp_omega = dirch_ppn * sub_ppn / sum(sub_ppn)) %>% left_join(., rdir_csp$omegan_summ)
That looks like it checks out.
First we test the situation where collection is just give the default dirichlet parameter:
rppn_csing <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_repunit = sim_spec_examples$repunit_ppn)}) %>% tidy_srs(., repc) %>% summ_it()
Then we compare that to what we expected
sim_spec_examples$repunit_ppn %>% mutate(rho_exp = ppn / sum(ppn)) %>% left_join(rppn_csing$rho_summ, .)
And at the collection level:
sim_spec_examples$repunit_ppn %>% mutate(rho_exp = ppn / sum(ppn)) %>% left_join(rppn_csing$omegan_summ, .)
rppn_csd <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_repunit = sim_spec_examples$repunit_ppn, alpha_collection = sim_spec_examples$coll_sub_dirichlet)}) %>% tidy_srs(., repc) %>% summ_it()
Let's compute the expected proportions:
sim_spec_examples$repunit_ppn %>% ungroup() %>% mutate(scaled_ppn = ppn / sum(ppn)) %>% left_join(repc, .) %>% left_join(., sim_spec_examples$coll_sub_dirichlet) %>% mutate(sub_dirichlet = ifelse(is.na(sub_dirichlet), 1.5, sub_dirichlet), ppn = ifelse(is.na(ppn), 0, ppn)) %>% group_by(repunit) %>% mutate(exp_omega = scaled_ppn * sub_dirichlet / sum(sub_dirichlet)) %>% left_join(., rppn_csd$omegan_summ)
Looks like that works.
rppn_csp <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_repunit = sim_spec_examples$repunit_ppn, alpha_collection = sim_spec_examples$coll_sub_ppn)}) %>% tidy_srs(., repc) %>% summ_it()
Let's compute the expected proportions:
sim_spec_examples$repunit_ppn %>% ungroup() %>% mutate(scaled_ppn = ppn / sum(ppn)) %>% left_join(repc, .) %>% left_join(., sim_spec_examples$coll_sub_ppn) %>% mutate(sub_ppn = ifelse(is.na(sub_ppn), 0.0000001, sub_ppn), ppn = ifelse(is.na(ppn), 0, ppn)) %>% group_by(repunit) %>% mutate(exp_omega = scaled_ppn * sub_ppn / sum(sub_ppn)) %>% left_join(., rppn_csp$omegan_summ)
Yep, those are right on. And there is no variance in the omega's just like we expect.
All that remains is to do explicit counts in the repunits.
First we test the situation where collection is just give the default dirichlet parameter:
rcnt_csing <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_repunit = sim_spec_examples$repunit_cnt)}) %>% tidy_srs(., repc) %>% summ_it()
Then we compare that to what we expected
sim_spec_examples$repunit_cnt %>% mutate(rho_exp = cnt / sum(cnt)) %>% left_join(rppn_csing$rho_summ, .)
Looks right.
And at the collection level:
sim_spec_examples$repunit_cnt %>% mutate(rho_exp = cnt / sum(cnt)) %>% left_join(rcnt_csing$omegan_summ, .)
That looks right.
rcnt_csd <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_repunit = sim_spec_examples$repunit_cnt, alpha_collection = sim_spec_examples$coll_sub_dirichlet)}) %>% tidy_srs(., repc) %>% summ_it()
Let's compute the expected proportions:
sim_spec_examples$repunit_cnt %>% ungroup() %>% mutate(scaled_ppn = cnt / sum(cnt)) %>% left_join(repc, .) %>% left_join(., sim_spec_examples$coll_sub_dirichlet) %>% mutate(sub_dirichlet = ifelse(is.na(sub_dirichlet), 1.5, sub_dirichlet), cnt = ifelse(is.na(cnt), 0, cnt)) %>% group_by(repunit) %>% mutate(exp_omega = scaled_ppn * sub_dirichlet / sum(sub_dirichlet)) %>% left_join(., rppn_csd$omegan_summ)
Close, but not totally right on. But I don't think I will worry too much about it.
rcnt_csd$omegan_summ %>% group_by(repunit) %>% summarise(summed_n = sum(n_mean)) %>% left_join(., sim_spec_examples$repunit_cnt)
That part looks great.
rcnt_csp <- lapply(1:1000, function(x) { simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_repunit = sim_spec_examples$repunit_cnt, alpha_collection = sim_spec_examples$coll_sub_ppn)}) %>% tidy_srs(., repc) %>% summ_it()
Let's compute the expected proportions:
sim_spec_examples$repunit_cnt %>% ungroup() %>% mutate(scaled_ppn = cnt / sum(cnt)) %>% left_join(repc, .) %>% left_join(., sim_spec_examples$coll_sub_ppn) %>% mutate(sub_ppn = ifelse(is.na(sub_ppn), 0.0000001, sub_ppn), cnt = ifelse(is.na(cnt), 0, cnt)) %>% group_by(repunit) %>% mutate(exp_omega = scaled_ppn * sub_ppn / sum(sub_ppn)) %>% left_join(., rcnt_csp$omegan_summ) %>% ungroup() %>% mutate(n_mean_ppn = n_mean / sum(n_mean))
Good.
For a collection that is not in the data set:
simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_repunit = sim_spec_examples$repunit_cnt, alpha_collection = sim_spec_examples$unknown_collection_error)
Or a repunit that is not in the data set:
simulate_random_samples(RU_starts = p$RU_starts, RU_vec = p$RU_vec, size = 100, alpha_repunit = sim_spec_examples$unknown_repunit_error, alpha_collection = sim_spec_examples$coll_sub_ppn)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.