#' Calculate the posterior probability of ancestral host repertoires at time points in the past (ages)
#'
#' @param history Data frame with posterior samples of interaction histories. Output from `read_history()`.
#' @param ages Vector of ages (time points in the past) at which samples will be retrieved. The
#' present (`age = 0`) must be included.
#' @param tree Symbiont tree.
#' @param host_tree Host tree.
#' @param extant_prob Should posterior probabilities be calculated for the extant network? Defaults to
#' `FALSE`. `TRUE` only makes sense if interactions in the extant network were also inferred. When
#' `FALSE`, only the first MCMC sample is retrieved at `age = 0`.
#' @param state Which state(s) should be included? Defaults to 2. For analyses using the 3-state model,
#' give `c(1, 2)` to include both states (where 1 is a potential host and 2 an actual host).
#' @param drop_empty Logical. Remove taxa without any interactions?
#'
#' @return A list of three lists, containing:
#' \describe{
#' \item{`samples`}{Arrays of samples x nodes x hosts, containing the state of each sample.}
#' \item{`post_states`}{Arrays of nodes x hosts x state containing the posterior probability
#' for each state.}
#' \item{`post_repertoires`}{Arrays of nodes x hosts x repertoire containing the posterior
#' probability for 1) the `"realized"` repertoire, which is defined as state 2, and 2) the
#' `"fundamental"` repertoire, which is defined as having any state (usually 1 or 2).}
#' }
#' Each array in these lists is for one of the `ages`. The number of samples is the number of
#' iterations in `history`.
#' @export
#'
#' @examples
#' # read symbiont and host tree
#' data_path <- system.file("extdata", package = "evolnets")
#' tree <- read_tree_from_revbayes(paste0(data_path,"/tree_pieridae.tre"))
#' host_tree <- ape::read.tree(paste0(data_path,"/host_tree_pieridae.phy"))
#'
#' # read histories sampled during MCMC
#' history <- read_history(paste0(data_path,"/history_thin_pieridae.txt"), burnin = 0)
#'
#' # get samples at ages
#' ages <- c(80,70,0)
#' at_ages <- posterior_at_ages(history, ages, tree, host_tree)
#' samples_at_ages <- at_ages$samples
#'
#' # get posterior probabilities of states at ages
#' pp_at_ages <- at_ages$post_states
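#'
#' # get posterior probabilities of realized and fundamental repertoires at ages
#' pp_repertoires_at_ages <- at_ages$post_repertoires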
posterior_at_ages <- function(history, ages, tree, host_tree, extant_prob = FALSE, state = 2, drop_empty = TRUE) {
# input checking
if (!is.data.frame(history)) {
stop('`history` should be a data.frame, usually generated by `read_history()`')
}
if (!all(c('node_index', 'iteration', 'transition_type') %in% names(history))) {
stop('`history` needs to have columns `node_index`, `iteration` and `transition_type`.')
}
if (!is.numeric(ages)) stop('`ages` should be a numeric vector.')
if (!(0 %in% ages)) stop('`ages` has to include 0 (the present).')
if (!inherits(tree, 'phylo')) stop('`tree` should be a phylogeny of class `phylo`.')
if (!inherits(host_tree, 'phylo')) stop('`host_tree` should be a phylogeny of class `phylo`.')
origin <- max(dispRity::tree.age(tree)$ages)
if (max(ages) > origin) stop('`ages` cannot be older than the origin of the symbiont clade.')
if (!is.numeric(state)) stop('`state` should be a numeric vector.')
if (!is.logical(drop_empty) || length(drop_empty) != 1) {
stop('`drop_empty` should be a logical vector of length 1.')
}
# force ages to be in decreasing order (important in downstream functions)
ages <- sort(ages, decreasing = TRUE)
# get the posteriors
l <- list()
for (i in seq_along(ages)) {
age <- ages[i]
if (!extant_prob && age == 0) {
l[[i]] <- make_samples_post_at_age(
history[history$iteration == history$iteration[1], ],
age, tree, host_tree, state, drop_empty
)
} else {
l[[i]] <- make_samples_post_at_age(
history, age, tree, host_tree, state, drop_empty
)
}
}
names(l) <- ages
purrr::transpose(l)
}
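# build the array of MCMC samples (iterations x nodes x hosts) and the posterior probability
# arrays for states and for realized/fundamental repertoires, for a single age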
make_samples_post_at_age <- function(dat, age, tree, host_tree, state, drop_empty) {
iterations <- sort(unique(dat$iteration))
n_iter <- length(iterations)
nodes <- sort(unique(dat$node_index))
n_nodes <- length(nodes)
node_names <- c(rev(tree$tip.label), paste0("Index_", (((n_nodes + 1) / 2) + 1):length(nodes)))
dat2 <- make_dat_age(dat, age = age)
dat2 <- dat2 %>%
# Select only the columns we need
dplyr::select(.data$iteration, .data$node_index, .data$end_state) %>%
dplyr::mutate(
# Use factors so we do not accidentally drop iterations or nodes
iteration = factor(.data$iteration, iterations),
node_index = factor(.data$node_index, nodes),
# Split the state string up to create one observation per node
end_state = stringr::str_split(.data$end_state, ""),
# add indices for the hosts
s_index = list(seq_along(.data$end_state[[1]]))
) %>%
# Put each host of each node on its own row
tidyr::unnest(c(.data$end_state, .data$s_index)) %>%
# Keep making factors so we do not accidentally drop states or hosts
dplyr::mutate(
end_state = factor(.data$end_state, state),
s_index = factor(.data$s_index)
) %>%
# Drop everything where the state was '0'
dplyr::filter(.data$end_state != '0')
# Now count all combinations to make our samples array
all_combs <- unclass(table(dat2$iteration, dat2$node_index, dat2$s_index, dat2$end_state))
if (!all(all_combs %in% c(0, 1))) {
# Just in case something has gone horribly wrong.
stop('Duplicated combinations detected, please contact the maintainer of evolnets.')
}
# We have to assign the correct state, so loop through the possible states and assign the state
# to wherever an interaction was found (i.e. count == 1)
l <- list()
for (s in seq_along(state)) {
# we need to be careful here, we want to drop dimension 4 (state), but not any other. For
# example, for extant networks, we may only have 1 iteration. You can't easily drop a specific
# array dimension in R, while keeping dimnames, so that's why this is a bit convoluted.
a <- all_combs[, , , s, drop = FALSE]
dm <- dimnames(a)
dim(a) <- dim(a)[1:3]
dimnames(a) <- dm[1:3]
a[a == 1] <- state[s]
l[[s]] <- a
}
# Then we merge the arrays for each state. We take the first array, and overwrite the elements
# where the second array is non-zero with elements of the second. Repeat for a possible third
# state etc. (I don't think there is a third state, but this generalizes to n states.)
samples_array <- Reduce(function(x, y) { x[y != 0] <- y[y != 0]; return(x) }, l)
# The line above should have the same result as the sum below, but the sum is much slower
#samples_array <- apply(samples_array, 1:3, sum)
# Finally we assign names to the array
dimnames(samples_array) <- list(seq_len(n_iter), node_names, host_tree$tip.label)
# For the probabilities arrays, we just need to count and divide by total iterations
# First, make an array of the posterior probabilities for each state
p_st <- unclass(table(dat2$node_index, dat2$s_index, dat2$end_state))
p_st <- p_st / n_iter
dimnames(p_st) <- list(node_names, host_tree$tip.label, state)
# Then, make an array that has the realized repertoire (state 2), and the fundamental repertoire
# (sum of the probabilities of all states)
p_rep <- array(0, c(dim(p_st)[1:2], 2), c(dimnames(p_st)[1:2], list(c('realized', 'fundamental'))))
if (2 %in% state) p_rep[, , 1] <- p_st[, , '2']
p_rep[, , 2] <- apply(p_st, 1:2, sum)
if (drop_empty) {
symbionts_present <- rowSums(p_st) != 0
hosts_present <- apply(p_st, 2, sum) != 0
samples_array <- samples_array[, symbionts_present, hosts_present, drop = FALSE]
p_st <- p_st[symbionts_present, hosts_present, , drop = FALSE]
p_rep <- p_rep[symbionts_present, hosts_present, , drop = FALSE]
}
list(samples = samples_array, post_states = p_st, post_repertoires = p_rep)
}
# find lineages (and their repertoires) that exist during the specified age
make_dat_age <- function(dat, age) {
# reduce the history data frame to only include relevant branches
dat2 <- dplyr::filter(
dat,
.data$branch_start_time >= age,
.data$branch_end_time <= age
)
# list unique nodes and iterations before further filtering
nodes <- unique(dat2$node_index)
iterations <- sort(unique(dat2$iteration))
# collect the branches without change or the events with an estimated transition time before the
# cut-off
dat2 <- rbind(
dat2[dat2$transition_type == "no_change", ],
dat2[dat2$transition_type == "anagenetic" & dat2$transition_time >= age, ]
)
# find nodes and iterations that we have lost, since we need to deal with them separately below
if (nrow(dat2) > 0) {
missing_nodes <- dat2 %>%
dplyr::group_by(.data$iteration) %>%
dplyr::reframe(node_index = setdiff(nodes, .data$node_index))
missing_iters <- dat2 %>%
dplyr::group_by(.data$node_index) %>%
dplyr::reframe(iteration = setdiff(iterations, .data$iteration))
missing <- dplyr::bind_rows(missing_nodes, missing_iters)
} else {
missing <- expand.grid(
iteration = iterations,
node_index = nodes
)
}
if (nrow(missing) > 0) {
# for missing iterations and nodes, find their parent node instead
parents <- dplyr::left_join(
missing,
dat %>% dplyr::select(.data$iteration, .data$node_index, .data$parent_index) %>% dplyr::distinct(),
c('iteration', 'node_index')
)
# now take those parents, get their data and rename their node_index back to the child node
missing_values <- dplyr::inner_join(
dat,
parents,
by = c('iteration', 'node_index' = 'parent_index'),
suffix = c('', '_tmp')
) %>%
dplyr::mutate(node_index = .data$node_index_tmp) %>%
dplyr::select(-.data$node_index_tmp)
# then join this with our previously found data
dat2 <- dplyr::bind_rows(dat2, missing_values)
}
# for each node at each iteration, find the youngest event. Use data.table since it needs to be
# fast for many groups.
dat2 <- data.table::as.data.table(dat2)
# first, sort the table for transition time, with NA values sorted at the end
data.table::setorderv(dat2, 'transition_time', na.last = TRUE)
# then for each group, take the first value
dat2[, .SD[1], by = c('iteration', 'node_index')]
}