# set the working directory always to the project directory (one level up)
knitr::opts_knit$set(root.dir = normalizePath(rprojroot::find_rstudio_root_file())) 

libraries:

library(tidyverse)
library(rubias)

My idea here is to conduct a simple, greedy, "agglomerative" algorithm to gain insight into what good reporting units might look like. And then display that on a binary tree.

Here is my thought for the tree:

Do the self-assignments.

sa <- self_assign(reference = chinook, gen_start_col = 5)

And once we have that, we compute for each individual and each pair of collection-and-inferred-collection the normalized scaled likelihood. That is, the posterior for the individual belonging to the correct versus the other collection. Oh yeah, we also initialize everyone's repuclust to just be their true collection. We will increment these as we merge collections into different nodes.

pair_posts <- sa %>%
  mutate(repuclust = as.integer(factor(collection))) %>%
  mutate(inferred_repuclust = as.integer(factor(inferred_collection, levels = levels(factor(collection))))) %>%
  group_by(indiv) %>%
  mutate(self_post = scaled_likelihood[collection == inferred_collection]) %>%
  ungroup() %>%
  mutate(pair_post = scaled_likelihood / (scaled_likelihood + self_post)) %>%
  filter(collection != inferred_collection) %>%
  ungroup()

So, to make this easier, I need to have a data frame called leaves that gives me the names of the collections and the associated repuclusts.

leaves <- pair_posts %>%
  count(collection, repuclust) %>%
  ungroup() %>%
  select(-n) %>%
  arrange(repuclust)

And while we are at it, let's add those to a list of nodes:

nodes <- lapply(leaves$repuclust, function(x) list(d1 = NA, d2 = NA, members = x))
max_node <- length(nodes)  # also, get the number of the final leaf node (will get incremented)

And now we can take the mean of those values for each collection and repuclust and find the ones that are highest:

ppm <- pair_posts %>%
  group_by(collection, inferred_collection, repuclust, inferred_repuclust) %>% 
  summarise(mean_pairwise = mean(pair_post)) %>%
  ungroup() %>%
  arrange(desc(mean_pairwise))

ppm

So, clearly what we will do here is first combine PriestRapids_H and Hanford_Reach into a single reporting unit. That will take a number which is one greater than the largest repuclust. Then we recompute things on the basis of repuclusts and see which is the next to go. It will be easiest, ultimately, to do this all in terms of integer repuclusts and integer inferred_repuclusts, and then re-associate the actual collections at the end.

Note that for things that already have repunits, I will probably want to do a version that constrains the pairings to only those collections that are in the same repunit, to see how things look with pre-existing reporting units.

So, let's fiddle around with what that might look like. Here is a function to compute the mean_pair thing:

repclust_mean_pair_func <- function(pp) {
  pp %>%
    group_by(repuclust, inferred_repuclust) %>% 
    summarise(mean_pairwise = mean(pair_post)) %>%
    ungroup() %>%
    arrange(desc(mean_pairwise))
}

Then, we take the largest of those, and get the pairs both way:

rmp <- repclust_mean_pair_func(pair_posts) %>%
  filter(repuclust %in% c(repuclust[1], inferred_repuclust[1]),
         inferred_repuclust %in% c(repuclust[1], inferred_repuclust[1])
         )

Then we would create a new node with those

max_node <- max_node + 1
n1 <- rmp$repuclust[1]  # store the nodes of these
n2 <- rmp$repuclust[2]
nodes[[max_node]] <- list()
nodes[[max_node]]$d1 <- n1
nodes[[max_node]]$p1 <- rmp$mean_pairwise[1]
nodes[[max_node]]$d2 <- n2
nodes[[max_node]]$p2 <- rmp$mean_pairwise[2]
nodes[[max_node]]$members <- base::union(nodes[[n1]]$members, nodes[[n2]]$members)

And after we have done that, we convert the repuclusts to max_node in. Let's make a function to do that:

#' @param pp the pair_posts data frame
#' @param new_repu the index of the new reporting unit node (typically max_node)
#' @param d1 the repuclust index of the first daughter of new_repu
#' @param d2 the repuclust index of the second daughter of new_repu
merge_repuclusts <- function(pp, new_repu, d1, d2) {
  pp %>%
    mutate(repuclust = ifelse(repuclust %in% c(d1, d2), new_repu, repuclust),
           inferred_repuclust = ifelse(repuclust %in% c(d1, d2), new_repu, inferred_repuclust))
}

And so we can use that

pp_new <- merge_repuclusts(pair_posts, max_node, nodes[[max_node]]$d1, nodes[[max_node]]$d2)

And then we do it again:

rmp2 <- repclust_mean_pair_func(pp_new) %>%
  filter(repuclust %in% c(repuclust[1], inferred_repuclust[1]),
         inferred_repuclust %in% c(repuclust[1], inferred_repuclust[1])
         )

OK, I think I have a clear enough picture of this now to write a function to do the calculations and return a list which I can then turn into an ape phylo object.



benmoran11/rubias documentation built on Feb. 1, 2024, 10:52 p.m.