R/ts_recursive.R

Defines functions ts_recursive

# Receives a group of Taxonomy IDs and the desired sample size m.
# Returns a sampled vector with maximized taxonomy diversity.
# Assumes that every input id is unique.
# May return repeated IDs if replacement == TRUE, to increase
# diversity when method == "diversity".
#
# Args:
# list containing fields:
#   taxlist$ts.params$taxon:  (char/integer) Taxon from which to start sampling
#                             children taxa.
#   taxlist$ts.params$m:      (integer) desired sample size.
#   taxlist$nodes:            (data.frame) pre-processed information from NCBI
#                             taxonomy.
#   taxlist$ts.params$countIDs: (numeric) count of taxnomoy IDs belonging to
#                             each taxon, created by get_taxonomy_counts()
#   replacement:              (logical) whether the algorithm allows to repeat
#                             IDs in order to maximize taxonomy diversity and to
#                             reach m IDs in the output.
#   randomize:                (char) whether the algorithm will choose IDs
#                             randomly or maintaining a balanced allocation (m_i
#                             differing by at most 1 if the maximum possible
#                             value wasn't reached).
#   method:                   (char) whether it favors balanced taxa
#                             representation ("balance") or maximized taxa
#                             representation ("diversity").
#   requireIDs:               (char) IDs that must appear in the output.
#
# Returns:
#   updated taxlist containing field:
#   outputIDs:                (vector) vector of IDs with maximized taxonomy
#                             balance or diversity.

ts_recursive <- function(taxlist, verbose = TRUE) {


  if(verbose) {
    cat("\r", rep(" ", 50),
        "\r--> Recursive sampling:",
        paste(rep(".", sample.int(8, 1)), collapse = ""))
  }

  # extract relevant variables for recursive call
  taxon <- taxlist$ts.process$taxon
  m     <- taxlist$ts.process$m

  # Sanity check
  if (m <= 0) stop("m less or equal than zero during recursion.")

  # First step: Find the sub-taxa (children nodes) of the current taxon
  # that has members in user-provided data
  children <- taxlist$nodes$id[taxlist$nodes$parent == taxon & taxlist$nodes$id != taxon]
  children <- intersect(children, names(taxlist$countIDs))

  # Condition to end recursion
  if (length(children) == 0) {
    if(taxlist$ts.params$replacement){
      return(rep(taxon, m))
    } else {
      return(taxon)
    }
  }

  childrenCount    <- taxlist$countIDs[as.character(children)]
  childrenCountSpp <- taxlist$countSpp[as.character(children)]

  # Sanity check
  # In a few cases, the number of sequences may be greater than the number of
  # species. We saw this happening only in terminal nodes where there's both a
  # species and a subspecies, as our mapping of taxID2species ends at the
  # species level.
  # If that's the case, replace the number of known species by the number of
  # child sequences.
  # TODO: check this with Francisco
  childrenCountSpp[childrenCount > childrenCountSpp] <- childrenCount[childrenCount > childrenCountSpp]


  # For cases when one taxon isn't a leaf node, but is an input ID that should
  # be available to be sampled along with its children.
  # Example of this case is an input with Homo sapiens (9606),
  # H. sapiens neanderthalensis (63221) and H. sapiens ssp. denisova (741158).
  if (sum(childrenCount) < taxlist$countIDs[as.character(taxon)]) {
    childrenCount    <- c(childrenCount, taxlist$countIDs[as.character(taxon)])
    childrenCountSpp <- c(childrenCountSpp, taxlist$countIDs[as.character(taxon)])
    childrenCountSpp[as.character(taxon)] <- 1
    childrenCount[as.character(taxon)]    <- 1
  }

  # Allocation function to use:
  fname <- paste("sample",
                 substr(taxlist$ts.params$sampling, 1, 1),
                 substr(taxlist$ts.params$method, 1, 1),
                 substr(taxlist$ts.params$randomize, 1, 1),
                 sep = "_")

  # Perform sampling at current recursion level
  m_i <- do.call(fname,
                 args = list(m                = m,
                             m_i              = 0 * childrenCount,
                             childrenCount    = childrenCount,
                             childrenCountSpp = childrenCountSpp))

  # TODO: check the whole requireIDs thing.
  # Require specific IDs (and their parents/ancestors) to be present.
  # If any required ID has a lower m_i allocated than needed (informed by
  # requireIDs), reallocate from another ID in m_i.
  #  if (!is.null(requireIDs)) {
  #    req <- is.element(names(m_i), names(requireIDs))
  #    toAdd <- m_i[req] < requireIDs[names(m_i[req])]
  #    if (randomize == "no") {
  #      while (any(toAdd)) {
  #        subtract <- m_i[setdiff(names(m_i), names(toAdd))]
  #        subtract <- sample(names(subtract[subtract == max(subtract)]), 1)
  #        m_i[subtract] <- m_i[subtract] - 1
  #        add <- sample(names(toAdd[toAdd == TRUE]), 1)
  #        m_i[add] <- m_i[add] + 1
  #        toAdd <- m_i[req] < requireIDs[names(m_i[req])]
  #      }
  #    } else if (randomize == "yes") {
  #      while (any(toAdd)) {
  #        subtract <- m_i[setdiff(names(m_i), names(toAdd))]
  #        subtract <- sample(names(subtract[subtract > 0]), 1)
  #        m_i[subtract] <- m_i[subtract] - 1
  #        add <- sample(names(toAdd[toAdd == TRUE]), 1)
  #        m_i[add] <- m_i[add] + 1
  #        toAdd <- m_i[req] < requireIDs[names(m_i[req])]
  #      }
  #    }
  #  }

  outputIDs <- character()
  for (id in names(m_i)) {
    if (m_i[id] > 0){
      if (id == as.character(taxon)){
        outputIDs <- c(outputIDs, id)
      } else {
        taxlist$ts.process$m     <- m_i[id]
        taxlist$ts.process$taxon <- id
        outputIDs <- c(outputIDs, ts_recursive(taxlist, verbose))
      }
    }
  }

  return(outputIDs)
}
franciscolobo/TaxonSampling documentation built on Dec. 31, 2021, 12:04 a.m.