#' Calculate the posterior probability of ancestral host repertoires at time points in the past (ages)
#'
#' @param history Data frame with posterior samples of interaction histories. Output from `read_history()`.
#' @param ages Vector of ages (time points in the past) at which samples will be retrieved. The
#' present (`age = 0`) must be included.
#' @param tree Symbiont tree.
#' @param host_tree Host tree.
#' @param extant_prob Should posterior probabilities be calculated for the extant network? Defaults to
#' `FALSE`. `TRUE` only makes sense if interactions in the extant network were also inferred. When
#' `FALSE`, only the first MCMC sample is retrieved at `age = 0`.
#' @param state Which state(s) should be included? Defaults to 2. For analyses using the 3-state model,
#' give `c(1, 2)` to include both states (where 1 is a potential host and 2 an actual host).
#' @param drop_empty Logical. Remove taxa without any interactions?
#'
#' @return A list of three lists, containing:
#' \describe{
#' \item{`samples`}{Arrays of samples x nodes x hosts, containing the state of each sample.}
#' \item{`post_states`}{Arrays of nodes x hosts x state containing the posterior probability
#' for each state.}
#' \item{`post_repertoires`}{Arrays of nodes x hosts x repertoire containing the posterior
#' probability for 1) the `"realized"` repertoire, which is defined as state 2, and 2) the
#' `"fundamental"` repertoire, which is defined as having any state (usually 1 or 2).}
#' }
#' Each array in these lists is for one of the `ages`. The number of samples is the number of
#' iterations in `history`.
#' @export
#'
#' @examples
#' # read symbiont and host tree
#' data_path <- system.file("extdata", package = "evolnets")
#' tree <- read_tree_from_revbayes(paste0(data_path,"/tree_pieridae.tre"))
#' host_tree <- ape::read.tree(paste0(data_path,"/host_tree_pieridae.phy"))
#'
#' # read histories sampled during MCMC
#' history <- read_history(paste0(data_path,"/history_thin_pieridae.txt"), burnin = 0)
#'
#' # get samples at ages
#' ages <- c(80,70,0)
#' at_ages <- posterior_at_ages(history, ages, tree, host_tree)
#' samples_at_ages <- at_ages$samples
#'
#' # get posterior probabilities of states at ages
#' pp_at_ages <- at_ages$post_states
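#'
#' # get posterior probabilities of realized and fundamental repertoires at ages
#' pp_repertoires_at_ages <- at_ages$post_repertoires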
posterior_at_ages <- function(history, ages, tree, host_tree, extant_prob = FALSE, state = 2, drop_empty = TRUE) {
# input checking
if (!is.data.frame(history)) {
stop('`history` should be a data.frame, usually generated by `read_history()`')
}
if (!all(c('node_index', 'iteration', 'transition_type') %in% names(history))) {
stop('`history` needs to have columns `node_index`, `iteration` and `transition_type`.')
}
if (!is.numeric(ages)) stop('`ages` should be a numeric vector.')
if (!(0 %in% ages)) stop('`ages` has to include 0 (the present).')
if (!inherits(tree, 'phylo')) stop('`tree` should be a phylogeny of class `phylo`.')
if (!inherits(host_tree, 'phylo')) stop('`host_tree` should be a phylogeny of class `phylo`.')
origin <- max(dispRity::tree.age(tree)$ages)
if (max(ages) > origin) stop('`ages` cannot be older than the origin of the symbiont clade.')
if (!is.numeric(state)) stop('`state` should be a numeric vector.')
if (!is.logical(drop_empty) || length(drop_empty) != 1) {
stop('`drop_empty` should be a logical vector of length 1.')
}
# force ages to be in decreasing order (important in downstream functions)
ages <- sort(ages, decreasing = TRUE)
# get the posteriors
l <- list()
for (i in seq_along(ages)) {
age <- ages[i]
if (!extant_prob && age == 0) {
l[[i]] <- make_samples_post_at_age(
history[history$iteration == history$iteration[1], ],
age, tree, host_tree, state, drop_empty
)
} else {
l[[i]] <- make_samples_post_at_age(
history, age, tree, host_tree, state, drop_empty
)
}
}
names(l) <- ages
purrr::transpose(l)
}
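# build the array of MCMC samples (iterations x nodes x hosts) and the posterior probability
# arrays for states and for realized/fundamental repertoires, for a single age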
make_samples_post_at_age <- function(dat, age, tree, host_tree, state, drop_empty) {
iterations <- sort(unique(dat$iteration))
n_iter <- length(iterations)
nodes <- sort(unique(dat$node_index))
n_nodes <- length(nodes)
node_names <- c(rev(tree$tip.label), paste0("Index_", (((n_nodes + 1) / 2) + 1):length(nodes)))
dat2 <- make_dat_age(dat, age = age)
dat2 <- dat2 %>%
# Select only the columns we need
dplyr::select(.data$iteration, .data$node_index, .data$end_state) %>%
dplyr::mutate(
# Use factors so we do not accidentally drop iterations or nodes
iteration = factor(.data$iteration, iterations),
node_index = factor(.data$node_index, nodes),
# Split the state string up to create one observation per node
end_state = stringr::str_split(.data$end_state, ""),
# add indices for the hosts
s_index = list(seq_along(.data$end_state[[1]]))
) %>%
# Put each host of each node on its own row
tidyr::unnest(c(.data$end_state, .data$s_index)) %>%
# Keep making factors so we do not accidentally drop states or hosts
dplyr::mutate(
end_state = factor(.data$end_state, state),
s_index = factor(.data$s_index)
) %>%
# Drop everything where the state was '0'
dplyr::filter(.data$end_state != '0')
# Now count all combinations to make our samples array
all_combs <- unclass(table(dat2$iteration, dat2$node_index, dat2$s_index, dat2$end_state))
if (!all(all_combs %in% c(0, 1))) {
# Just in case something has gone horribly wrong.
stop('Duplicated combinations detected, please contact the maintainer of evolnets.')
}
# We have to assign the correct state, so loop through the possible states and assign the state
# to wherever an interaction was found (i.e. count == 1)
l <- list()
for (s in seq_along(state)) {
# we need to be careful here, we want to drop dimension 4 (state), but not any other. For
# example, for extant networks, we may only have 1 iteration. You can't easily drop a specific
# array dimension in R, while keeping dimnames, so that's why this is a bit convoluted.
a <- all_combs[, , , s, drop = FALSE]
dm <- dimnames(a)
dim(a) <- dim(a)[1:3]
dimnames(a) <- dm[1:3]
a[a == 1] <- state[s]
l[[s]] <- a
}
# Then we merge the arrays for each state. We take the first array, and overwrite the elements
# where the second array is non-zero with elements of the second. Repeat for a possible third
# state etc. (I don't think there is a third state, but this generalizes to n states.)
samples_array <- Reduce(function(x, y) { x[y != 0] <- y[y != 0]; return(x) }, l)
# The line above should have the same result as the sum below, but the sum is much slower
#samples_array <- apply(samples_array, 1:3, sum)
# Finally we assign names to the array
dimnames(samples_array) <- list(seq_len(n_iter), node_names, host_tree$tip.label)
# For the probabilities arrays, we just need to count and divide by total iterations
# First, make an array of the posterior probabilities for each state
p_st <- unclass(table(dat2$node_index, dat2$s_index, dat2$end_state))
p_st <- p_st / n_iter
dimnames(p_st) <- list(node_names, host_tree$tip.label, state)
# Then, make an array that has the realized repertoire (state 2), and the fundamental repertoire
# (sum of the probabilities of all states)
p_rep <- array(0, c(dim(p_st)[1:2], 2), c(dimnames(p_st)[1:2], list(c('realized', 'fundamental'))))
if (2 %in% state) p_rep[, , 1] <- p_st[, , '2']
p_rep[, , 2] <- apply(p_st, 1:2, sum)
if (drop_empty) {
symbionts_present <- rowSums(p_st) != 0
hosts_present <- apply(p_st, 2, sum) != 0
samples_array <- samples_array[, symbionts_present, hosts_present, drop = FALSE]
p_st <- p_st[symbionts_present, hosts_present, , drop = FALSE]
p_rep <- p_rep[symbionts_present, hosts_present, , drop = FALSE]
}
list(samples = samples_array, post_states = p_st, post_repertoires = p_rep)
}
# find lineages (and their repertoires) that exist during the specified age
make_dat_age <- function(dat, age) {
# reduce the history data frame to only include relevant branches
dat2 <- dplyr::filter(
dat,
.data$branch_start_time >= age,
.data$branch_end_time <= age
)
# list unique nodes and iterations before further filtering
nodes <- unique(dat2$node_index)
iterations <- sort(unique(dat2$iteration))
# collect the branches without change or the events with an estimated transition time before the
# cut-off
dat2 <- rbind(
dat2[dat2$transition_type == "no_change", ],
dat2[dat2$transition_type == "anagenetic" & dat2$transition_time >= age, ]
)
# find nodes and iterations that we have lost, since we need to deal with them separately below
if (nrow(dat2) > 0) {
missing_nodes <- dat2 %>%
dplyr::group_by(.data$iteration) %>%
dplyr::reframe(node_index = setdiff(nodes, .data$node_index))
missing_iters <- dat2 %>%
dplyr::group_by(.data$node_index) %>%
dplyr::reframe(iteration = setdiff(iterations, .data$iteration))
missing <- dplyr::bind_rows(missing_nodes, missing_iters)
} else {
missing <- expand.grid(
iteration = iterations,
node_index = nodes
)
}
if (nrow(missing) > 0) {
# for missing iterations and nodes, find their parent node instead
parents <- dplyr::left_join(
missing,
dat %>% dplyr::select(.data$iteration, .data$node_index, .data$parent_index) %>% dplyr::distinct(),
c('iteration', 'node_index')
)
# now take those parents, get their data and rename their node_index back to the child node
missing_values <- dplyr::inner_join(
dat,
parents,
by = c('iteration', 'node_index' = 'parent_index'),
suffix = c('', '_tmp')
) %>%
dplyr::mutate(node_index = .data$node_index_tmp) %>%
dplyr::select(-.data$node_index_tmp)
# then join this with our previously found data
dat2 <- dplyr::bind_rows(dat2, missing_values)
}
# for each node at each iteration, find the youngest event. Use data.table since it needs to be
# fast for many groups.
dat2 <- data.table::as.data.table(dat2)
# first, sort the table for transition time, with NA values sorted at the end
data.table::setorderv(dat2, 'transition_time', na.last = TRUE)
# then for each group, take the first value
dat2[, .SD[1], by = c('iteration', 'node_index')]
}