R/reconstruct.R

Defines functions build_tree

Documented in build_tree

#' Reconstruct transmission trees from a case line list.
#'
#' @param id_case id of case
#' @param id_biter id of biter (i.e. known biter from contact tracing data, if
#'  unknown should be 0, if no contact tracing data, can pass NULL)
#' @param y_coord y coordinate of case (should be in UTM: to do = use haversine distance for long/lat)
#' @param x_coord x coordinate of case (should be in UTM: to do = use haversine distance for long/lat)
#' @param owned whether animal is owned or not (per Katie, to account for uncertainty in case locations)
#' @param date_symptoms case date (i.e. date symptoms started)
#' @param days_uncertain uncertainty in days around case date
#' @param lineages a data table with two columns, id_case and lineage, designating a lineage
#'  assignment for each case, defaults to NULL which means trees wont be resolved to a phylogeny
#' @param exclude_progen boolean of length id_case or 1, if TRUE then case should be excluded as a potential
#'  progenitor (i.e. if including livestock cases or other species that are dead-end transmissions)
#' @param use_known_source whether to assign known progenitors from contact tracing data
#' @param prune whether to prune links (i.e. based on Cori et al./Mancy et al.)
#'  at a certain cutoff probability. This also results in assignment of incursions
#' @param si_fun the function to get the probability of a given temporal
#'  difference between cases in days (i.e. the serial interval)
#' @param dist_fun the function to get the probability of a given spatial
#'  difference between cases in meters (i.e. the dispersal kernel)
#' @param cutoff the probability level at which to prune links (i.e. cases can not be
#'  linked if the distance or time difference is greater than the this %ile of the distribution);
#'  can either be a single number or a named vector with a cutoff called "dist" and "time"
#' @param params list of parameters to pass to si_fun and dist_fun
#' @param min_time if uncertainty results in negative time difference between
#'  known case pairs, set it to this value (better way to deal with propagating
#'  uncertainty given known case pairs?)
#'
#' @return a data.table with the reconstructed tree
#' @export
#'
build_tree <- function(id_case,
                       id_biter,
                       y_coord,
                       x_coord,
                       owned,
                       date_symptoms,
                       days_uncertain,
                       lineages = NULL,
                       exclude_progen = FALSE,
                       use_known_source = FALSE,
                       known_tree = NULL,
                       prune = TRUE,
                       si_fun,
                       dist_fun,
                       cutoff = 0.95,
                       params,
                       min_time = 1e-6) {

  if(length(cutoff) > 1) {
    if(length(cutoff) > 2 | !(c("time", "dist") %in% names(cutoff))) {
      stop("Should pass two cutoff thresholds in a vector named 'dist' and 'time'")
    }
    time_cut <- cutoff["time"]
    dist_cut <- cutoff["dist"]
  } else {
    time_cut <- cutoff
    dist_cut <- cutoff
  }

  if(any(cutoff > 1) | any(cutoff <= 0)) {
    stop("Cutoff value should be a single probability between (0, 1]")
  }

  # build line list with uncertainty
  t <- add_uncertainty(days_uncertain, date_symptoms, id_biter,
                       id_case, use_known_source)
  case_dt <- data.table(id_case, id_biter, x_coord, y_coord, owned, t, join = t)
  case_dt$type <- "reconstructed" # for tracking whether using known sources or not

  setkey(case_dt, join) # this speeds up join

  # Get the si cutoff if pruning (either length 1 or length nrow(case_dt))
  progen_dt <- case_dt[, .(t_progen = t,
                           x_coord_progen = x_coord,
                           y_coord_progen = y_coord,
                           id_progen = id_case,
                           join_on = join)]

  # exclude progenitors that are invalid (i.e. if any are livestock species, etc)
  progen_dt <- progen_dt[!exclude_progen, ]

  setkey(progen_dt, join_on)

  if(use_known_source) {

    if(is.null(known_tree)) {
      stop("Need to pass a reconstructed tree of known progenitors to `known_tree`,
            see `build_known_tree` function.")
    }

    k_tree <- copy(known_tree) # so working on copy specific to sim

    # get the uncertainty for the simulation (this will be the first match)
    k_tree$t <- case_dt$t[match(k_tree$id_case, case_dt$id_case)]
    k_tree$t_progen <- case_dt$t[match(k_tree$id_progen, case_dt$id_case)]

    # Get their diff for time
    k_tree[, t_diff := as.numeric(t - t_progen)]

    # fix any that have negative values due to uncertainty (better way?)
    k_tree$t_diff[k_tree$t_diff <= 0] <- min_time

    # Deal with multiple id's here (selecting ones that have multiple potential progenitors)
    k_tree <- select_progenitor(tree = k_tree, k_tree = NULL, lineages = NULL,
                                incursions = NULL,
                                si_fun = si_fun, dist_fun = dist_fun,
                                params = params, known = TRUE)

    # Filter out of progenitor assigment (but not out of the candidate progens!)
    case_dt <- case_dt[!(id_case %in% k_tree$id_case)]

  } else {
    k_tree <- NULL
  }

  # do a inner join to get possible progenitors
  if(prune) {
    # get the si cutoff
    si_cutoff <- si_fun(ttree = case_dt, cutoff = time_cut, params = params)
    progen_dt[, max := join_on + si_cutoff]

    # pruning at first stage based on time cutoffs to speed up
    ttree <- case_dt[progen_dt,
                     on = .(join > join_on, join <= max),
                     allow.cartesian = TRUE, nomatch = NULL][, -c("join", "join.1")]
  } else {
    ttree <- case_dt[progen_dt,
                     on = .(join > join_on),
                     allow.cartesian = TRUE, nomatch = NULL][, -"join"]
  }

  if(use_known_source) {
    # a candidate progenitor cannot be a known secondary case from contact tracing
    ttree <- ttree[!(k_tree[, .(id_progen = id_case, id_case = id_progen)]),
                   on = c("id_case", "id_progen")]
  }

  # For each case + progenitor combination, get the time and distance lag
  ttree[, c("dist_diff", "t_diff") := .(sqrt((x_coord - x_coord_progen)^2 + (y_coord - y_coord_progen)^2),
                                        as.numeric(t - t_progen))]

  # prune again for distance if true
  if(prune) {
    # Get the dist cutoff if pruning (either length 1 or length nrow(case_dt))
    dist_cutoff <- dist_fun(ttree = ttree, cutoff = dist_cut, params = params)
    ttree <- ttree[dist_diff <= dist_cutoff]
  }

  # and assign incursions as cases where no progenitor was identified
  # based on the distance OR time cutoffs
  # and also even if not pruning, the first case in the dataset will
  # not be assigned a link so need to include it in the tree here
  incursions <- case_dt[!(case_dt$id_case %in% ttree$id_case)][, -"join"]

  # This is actually the slow part so limiting # of possibilities speeds things up a lot
  # Also joins up with known tree and incursions
  ttree <- select_progenitor(tree = ttree, k_tree = k_tree, lineages = lineages,
                             incursions = incursions,
                             si_fun = si_fun, dist_fun = dist_fun,
                             params = params, known = FALSE)

  return(ttree)

}

#' Reconstruct known tree (i.e. from tracing data)
#'
#' @inheritParams build_tree
#'
#' @return a data.table with the known contact tracing tree
#' @keywords internal
#'
build_known_tree <- function(id_case,
                             id_biter,
                             y_coord,
                             x_coord,
                             owned,
                             date_symptoms) {


  # build line list with uncertainty
  case_dt <- data.table(id_case, id_biter, x_coord, y_coord, owned, t = date_symptoms)

  # Get the si cutoff if pruning (either length 1 or length nrow(case_dt))
  progen_dt <- case_dt[, .(t_progen = t,
                           x_coord_progen = x_coord,
                           y_coord_progen = y_coord,
                           id_progen = id_case,
                           join_ct = id_case)]
  setkey(progen_dt, join_ct)

  # Filter ones with non-zero biter ids
  # making sure the ids exist in the progenitor list
  known_biters <- case_dt[id_biter != 0 & id_biter %in% progen_dt$id_progen]
  known_biters[, join_ct := id_biter]
  setkey(known_biters, join_ct)

  # Inner join with the biters
  known_tree <- progen_dt[known_biters, on = "join_ct"][, -"join_ct"]

  # Get their distance difference and type
  known_tree[, c("dist_diff",
                 "type") := .(sqrt((x_coord - x_coord_progen)^2 + (y_coord - y_coord_progen)^2),
                              "traced")]

  return(known_tree)
}

# Helper functions ----------

# Select progenitor
#' Wrapper to select single progenitor in each case
#'
#' @param tree the data.table with possible case pairs to select from
#' @param k_tree the data.table with the known tree (when use_known_source = TRUE,
#'  NULL otherwise)
#' @param incursion the data.table with incursions (when prune =TRUE, NULL otherwise)
#' @inheritParams build_tree
#'
#' @return a data.table filtered to the selected case-progenitor pair
#' @keywords internal
#'
select_progenitor <- function(tree, lineages, k_tree, incursions,
                              si_fun, dist_fun, params, known = FALSE) {

  # probabilities
  si_fun(ttree = tree, params = params)
  dist_fun(ttree = tree, params = params)
  tree[, source_prob := dist_prob * t_prob][, prob_scale := source_prob/sum(source_prob), by = id_case]
  ttree <- tree[, selected := assign_progen(prob_scale), by = id_case][selected == 1]

  if(!known) {
    # Bind to incursions & known tree (can be NULL)
    ttree <- rbindlist(list(incursions, k_tree, ttree), fill = TRUE)

    if(is.null(lineages)) {
      lineages <- data.table(id_case = ttree$id_case, lineage = 0)
    }

    # Check which ones have mismatched lineages
    ttree <- ttree[lineages, on = "id_case"]
    tree <- tree[lineages, on = "id_case"]

    known_progens <- k_tree$id_case

    out <- find_lins_to_fix(ttree, known_progens, selector = "prob_scale")

    lins_to_fix <- out$lins_to_fix
    membership_dt <- out$membership_dt

    # set links to fix to NA
    ttree[id_case %in% lins_to_fix]$id_progen <- NA
    nfixes <- length(lins_to_fix)

    if(nfixes > 0) {

      rfixes <- sample(nfixes, nfixes) # fix in random order

      for(i in rfixes) {

        # Join up links with updated membership_dt
        tree <- membership_dt[tree, on = "id_case"]
        setnames(membership_dt, c("membership", "id_case", "lineage_chain"),
                 c("membership_progen", "id_progen", "lineage_progen_chain"))
        tree <- membership_dt[tree, on = "id_progen"]
        tree[, membership_progen := ifelse(is.na(membership_progen), 0,
                                           membership_progen)]
        tree[, lineage_progen_chain := ifelse(is.na(lineage_progen_chain), 0,
                                        lineage_progen_chain)]

        # For those that have no incursions and are also the minimum case replace
        # with the next most likely progen, that is not already in the current chains
        candidate_links <- tree[id_case %in% lins_to_fix[i] & membership != membership_progen]

        # Also filter to those that in chain with same (or totally unsampled lineages)
        candidate_links <- candidate_links[lineage_chain * lineage_progen_chain == 0 | lineage_chain * lineage_progen_chain == lineage_chain^2]

        # Rescale probabilities & select
        fixed_links <- candidate_links[, prob_scale := source_prob/sum(source_prob),
                                       by = id_case][, selected := assign_progen(prob_scale),
                                                     by = id_case][selected == 1]

        # if none then set to NA (prob & links)
        incs <- ifelse(nrow(fixed_links) == 0, lins_to_fix[i], 0)
        if("id_biter" %in% names(ttree)){
          set_incs <- ttree[id_case %in% incs][, c("id_biter", "id_case", "x_coord",
                                                   "y_coord", "owned", "t", "type",
                                                   "lineage")]
        } else {
          set_incs <- ttree[id_case %in% incs][, c("id_case", "x_coord",
                                                   "y_coord", "owned", "t", "type",
                                                   "lineage")]
        }

        set_incs[, id_progen := NA]

        # bind them together
        ttree <-
          rbindlist(list(ttree[!(id_case %in% lins_to_fix[i])],
                         fixed_links, set_incs),
                    fill = TRUE)

        # clean ttree & links_all
        ttree[, c("membership", "membership_progen", "lineage_chain", "lineage_progen_chain") := NULL]
        tree[, c("membership", "membership_progen", "lineage_chain", "lineage_progen_chain") := NULL]

        # update membership_dt
        membership_dt <- get_membership(ttree)

      }
    }

    ttree <- membership_dt[ttree, on = "id_case"]

  }

  ttree[, prob_ll := log(source_prob)]
  ttree[, incursion := is.na(id_progen)]

  return(ttree)
}

#' Assign progenitor
#'
#' @keywords internal
assign_progen <- function(prob_scaled) {

  cumul_scaled <- cumsum(prob_scaled)
  rand_var <- runif(1, 0, 1)
  ind <- which(cumul_scaled > rand_var)[1]
  selected <- rep(0L, length(cumul_scaled))
  selected[ind] <- 1L

  return(selected)
}

#' Add uncertainty
#'
#' @keywords internal
add_uncertainty <- function(uncertainty, date_symptoms, id_biter,
                            id_case, use_known_source, buffer = 7,
                            max_tries = 100){

  if(any(uncertainty > 0)) {

    days_offset <- unlist(lapply(uncertainty,
                                 function(x){sample(seq(-x,x), size = 1)}))
    date_uncertain <- date_symptoms +  days_offset
    niter <- 0

    if(use_known_source) {
      date_min <- date_uncertain[match(id_biter, id_case)] + buffer # this will be relative to the first one
      date_min[is.na(date_min)] <- date_uncertain[is.na(date_min)]

      while(any(date_uncertain < date_min) & niter <= max_tries) {
        niter <- niter + 1
        # constrain known biters
        date_uncertain[date_uncertain < date_min] <- date_min[date_uncertain < date_min]

        # check again
        date_min <- date_uncertain[match(id_biter, id_case)] + buffer
        date_min[is.na(date_min)] <- date_uncertain[is.na(date_min)]
      }

      if(any(date_uncertain < date_min)) {
        warning("Significant date uncertainty means that some case dates may
               not line up with known sequence of events from contact tracing!")
      }
    }

    return(date_uncertain)

  } else {
    return(date_symptoms)
  }

}

#' Helper for functions to export to foreach loop in boot_trees
#'
#' @param filename path to the R script with functions. You still need to source
#'  this function before calling boot_trees.
#'
#' @return a character vector of names functions in R script
#' @export
#'
list_funs <- function(filename) {
  temp.env <- new.env()
  sys.source(filename, envir = temp.env)
  functions <- ls(envir = temp.env)
  return(functions)
}

#' Wrapper function for bootstrapped trees: reproducibly & in parallel
#'
#' @inheritParams build_tree
#' @param N number of trees to build
#' @param seed seed to pass to doRNG to make trees reproducible
#' @param exp_funs functions to export to foreach loop (i.e. for customized
#'  si_fun/dist_fun which relies on an external scripts), may be needed
#'  for certain types of cluster configs
#' @param exp_pkgs packages to export to foreach loop, defaults to
#'  data.table and treerabid, if other dependencies for si_fun or
#'  dist_fun, then pass here. May be needed for
#'  certain types of cluster configs
#'
#' @return a data.table with bootstrapped trees
#' @importFrom foreach foreach
#' @importFrom doRNG %dorng%
#' @export
#'
boot_trees <- function(id_case,
                       id_biter,
                       y_coord,
                       x_coord,
                       owned,
                       date_symptoms, # needs to be in a date class
                       days_uncertain,
                       lineages = NULL,
                       exclude_progen = FALSE,
                       use_known_source = FALSE,
                       prune = TRUE,
                       si_fun,
                       dist_fun,
                       cutoff = 0.95,
                       params,
                       min_time = 1e-6,
                       N = 1,
                       seed = 1245,
                       exp_funs = NULL,
                       exp_pkgs = c("data.table", "treerabid", "igraph")) {

  if(any(is.na(x_coord) | is.na(y_coord) | is.na(date_symptoms))) {
    stop("Missing data in times or locations!")
  }

  # check that date_symptoms is a date class
  if(class(date_symptoms) != "Date") {
    stop("date_symptoms is not of class Date!")
  }

  if(use_known_source){
    known_tree <- build_known_tree(id_case,
                                   id_biter,
                                   y_coord,
                                   x_coord,
                                   owned,
                                   date_symptoms)
  } else {
    known_tree <- NULL
  }

  # Check for duplicated id's here
  # (i.e. depending on whether you are using a known source or not)
  msg <- "id_case has duplicated values,
          but you are not using a known set of possible sources for these cases,
          there should only be one record per case id!"
  if(!use_known_source) {
    dups <- tabulate(id_case)
    if(any(dups > 1)) {
      stop(msg)
    }
  } else {
    # any biter ids that are < 1 (i.e. 0, neg numbers, and NAs)
    dups <- tabulate(id_case[id_biter < 1])
    if(any(dups > 1)) {
      stop(msg)
    }

  }

  foreach(i = seq_len(N),
          .combine = 'rbind', .options.RNG = seed,
          .export = exp_funs,
          .packages = exp_pkgs) %dorng% {

          ttree <-
            build_tree(id_case = id_case, id_biter = id_biter, y_coord = y_coord,
                       x_coord = x_coord,
                       owned = owned, date_symptoms = date_symptoms,
                       days_uncertain = days_uncertain,
                       exclude_progen = exclude_progen,
                       use_known_source = use_known_source,
                       known_tree = known_tree,
                       lineages = lineages,
                       prune = prune,
                       si_fun,
                       dist_fun,
                       cutoff = cutoff,
                       params = params,
                       min_time = min_time)
          ttree$sim <- i
          ttree
      }

}
mrajeev08/treerabid documentation built on Oct. 15, 2024, 12:14 p.m.