R/ReadTntTree.R

Defines functions .TntCommands TntText2Tree ReadTntTree

Documented in ReadTntTree TntText2Tree

#' Parse TNT Tree
#'
#' Read a tree from [TNT](https://www.lillo.org.ar/phylogeny/tnt/)'s
#' parenthetical output.
#'
#' `ReadTntTree()` imports trees generated by the parsimony analysis program 
#' [TNT](https://www.lillo.org.ar/phylogeny/tnt/) into R, including node labels
#' written with the `ttags` command.
#' Tree files must have been saved by TNT in parenthetical notation, using the
#' TNT command `tsave *`.
#' Trees are easiest to load into R if taxa have been saved using their names
#' (TNT command `taxname =`).  In this case, the TNT `.tre` file
#' contains tip labels and can be parsed directly.  The downside is that the
#' uncompressed `.tre` files will have a larger file size.
#'
#' `ReadTntTree()` can also read `.tre` files in which taxa have been saved
#' using their numbers (`taxname -`).  Such files contain a hard-coded link to
#' the matrix file that was used to generate the trees, in the first line of the
#' `.tre` file.  This poses problems for portability: if the matrix file is
#' moved, or the `.tre` file is accessed on another computer, the taxon names
#' may be lost.  As such, it is important to check that the matrix file
#' exists in the expected location -- if it does not,
#' either use the `relativePath` argument to point to its new location, or
#' specify `tipLabels` to manually specify the tip labels.
#' 
#' `TntText2Tree()` converts text representation of a tree in TNT to an
#'  object of class `phylo`.
#'
#' @param filepath character string specifying path to TNT `.tre` file,
#' relative to the R working directory (visible with `getwd()`).
#' @param relativePath (discouraged) character string specifying location of the
#' matrix file used to generate the TNT results, relative to the current working
#' directory.  Taxon names will be read from this file if they are not specified
#'  by `tipLabels`.
#' @param keepEnd (optional, default 1) integer specifying how many elements of
#'  the file path to conserve when creating relative path (see examples).
#' @param tipLabels (optional) character vector specifying the names of the
#' taxa, in the sequence that they appear in the TNT file.  If not specified,
#' taxon names will be loaded from the data file linked in the first line of the
#'  `.tre` file specified in `filepath`.
#'
#' @return `ReadTntTree()` returns a tree of class `phylo` in 
#' \link[=TNTOrder]{TNT order},
#' corresponding to the tree in `filepath`, or NULL if no trees are found.
#'
#' @examples
#' # In the examples below, TNT has read a matrix from
#' # "c:/TreeTools/input/dataset.nex"
#' # The results of an analysis were written to
#' # "c:/TreeTools/output/results1.tnt"
#' #
#' # results1.tnt will contain a hard-coded reference to
#' # "c:/TreeTools/input/dataset.nex".
#'
#' # On the original machine (but not elsewhere), it would be possible to read
#' # this hard-coded reference from results.tnt:
#' # ReadTntTree("output/results1.tnt")
#'
#' # These datasets are provided with the "TreeTools" package, which will
#' # probably not be located at c:/TreeTools on your machine:
#'
#' oldWD <- getwd() # Remember the current working directory
#' setwd(system.file(package = "TreeTools"))
#'
#' # If taxon names were saved within the file (using `taxname=` in TNT),
#' # then our job is easy:
#' ReadTntTree("extdata/output/named.tre")
#'
#' # But if taxa were compressed to numbers (using `taxname-`), we need to
#' # look up the original matrix in order to dereference the tip names.
#' #
#' # We need to extract the relevant file path from the end of the
#' # hard-coded path in the original file.
#' #
#' # We are interested in the last two elements of
#' # c:/TreeTools/input/dataset.nex
#' #                2      1
#' #
#' # "." means "relative to the current directory"
#' ReadTntTree("extdata/output/numbered.tre", "./extdata", 2)
#'
#' # If working in a lower subdirectory
#' setwd("./extdata/otherfolder")
#'
#' # then it will be necessary to navigate up the directory path with "..":
#' ReadTntTree("../output/numbered.tre", "..", 2)
#'
#'
#' setwd(oldWD) # Restore original working directory
#'
#' TNTText2Tree("(A (B (C (D E ))));")
#'
#' @template MRS
#' @family tree import functions
#' @importFrom ape read.tree
#' @export
ReadTntTree <- function(filepath, relativePath = NULL, keepEnd = 1L,
                        tipLabels = NULL) {
  commands <- .TntCommands(filepath)
  tread <- grep("^trea?d?\\b", commands, perl = TRUE)
  if (length(tread) < 1) return(NULL)
  trees <- c(TntText2Tree(
    paste(
      gsub("tread\\s+('[^']*')*\\s*", "", commands[tread]),
      collapse = "*")))
  nRead <- unname(vapply(
    commands[tread],
    function(cmd) length(
      c(TntText2Tree(gsub("tread\\s+('[^']*')*\\s*", "", cmd)))
    ), double(1)
  ))
  
  if (!any(grepl("[A-z]", trees[[1]][["tip.label"]]))) {
    if (is.null(tipLabels)) {
      tipLabels <- rownames(ReadTntCharacters(filepath))
      if (is.null(tipLabels)) {
        if (length(tread) > 1) {
          warning("Multiple tree blocks not fully supported; ",
                  "please check tip labels and contact \"TreeTools\" ",
                  "maintainer if necessary")
        }
        treadComment <- strsplit(commands[tread[1]], "'", fixed = TRUE)[[1]]
        filePrefix <- "tree(s) from TNT, for data in "
        taxonFile <- grep(filePrefix, treadComment, fixed = TRUE)
        taxonFile <- gsub(filePrefix, "", 
                          gsub("\\", "/", treadComment[taxonFile], fixed = TRUE),
                          fixed = TRUE)
        if (!is.null(relativePath)) {
          taxonFileParts <- strsplit(taxonFile, "/")[[1]]
          nParts <- length(taxonFileParts)
          if (nParts < keepEnd) {
            stop("Taxon file path (", taxonFile,                                # nocov
                 ") contains fewer than keepEnd (", keepEnd, ") components.")   # nocov
          }
          taxonFile <- paste0(c(relativePath,
                                taxonFileParts[(nParts + 1L - keepEnd):nParts]),
                              collapse = "/")
        }
        
        if (length(taxonFile)) {
          if (!file.exists(taxonFile)) {
            warning("Cannot find linked data file:\n  ", taxonFile)             # nocov
          } else {
            tipLabels <- rownames(ReadTntCharacters(taxonFile, character_num = 1L))
            if (is.null(tipLabels)) {
              # TNT character read failed.  Perhaps taxonFile is in NEXUS format?
              tipLabels <- rownames(ReadCharacters(taxonFile, character_num = 1L))
            }
            if (is.null(tipLabels)) {
              warning("Could not read taxon names from linked TNT file:\n  ",   # nocov
                      taxonFile, "\nIs the file in TNT or Nexus format?",       # nocov
                      " If failing inexplicably, please report:",               # nocov
                      "\n  https://github.com/ms609/TreeTools/issues/new")      # nocov
            }
          }
        } else {
          warning("`filepath` does not link to taxon names; try specifying `tipLabels`")
          tipLabels <- as.character(seq_len(trees[[1]][["edge"]][1] - 1) - 1)
        }
      }
    }
    
    trees <- lapply(trees, function(tree) {
      tree$tip.label <- tipLabels[as.integer(tree$tip.label) + 1L]
      tree
    })
  }
  
  if (length(tipLabels)) {
    trees <- RenumberTips(trees, tipLabels)
  }
  
  trees <- TntOrder(trees)
  
  # Assign Tree Tags
  ttags <- grep("^tta?g?s?\\b", commands, perl = TRUE)
  tagLines <- commands[ttags]
  nTags <- length(ttags)
  
  tagTarget <- "^tta?g?s?\\s*\\*\\s*([\\d!]+)$"
  targetTags <- grep(tagTarget, tagLines, perl = TRUE)
  tagIsTarget <- as.logical(tabulate(targetTags, nTags))
  targetTree <- NA
  
  tagWrite <- "^tta?g?s?\\s*\\+\\s*(\\d+)\\s+(.+)$"
  writeTags <- grep(tagWrite, tagLines, perl = TRUE)
  tagIsWrite <- as.logical(tabulate(writeTags, nTags))
  
  tagClear <- "^tta?g?s?\\s*\\-$"
  clearTags <- grep(tagClear, tagLines, perl = TRUE)
  tagIsClear <- as.logical(tabulate(clearTags, nTags))
  
  for (i in seq_along(ttags)) {
    # A loop is inefficient, but matches TNT's command-driven storage structure
    # and thus may be more robust to input format -- and 
    # will make code maintenance and improvement easier in future.
    tagLine <- tagLines[i]
    if (tagIsTarget[i]) {
      target <- gsub(tagTarget, "\\1", tagLine, perl = TRUE)
      targetTree <- if (target == "!") {
        cumsum(nRead)[which.max(c(tread, Inf) > ttags[i]) - 1L]
      } else {
        as.numeric(target)
      }
    } else if (tagIsWrite[i]) {
      if (is.na(targetTree)) {
        warning("Expected `ttags *N`; applying tags to first tree");
        targetTree <- 1L
      }
      if (!length(trees[[targetTree]][["node.label"]])) {
        trees[[targetTree]][["node.label"]] <-
          character(trees[[targetTree]][["Nnode"]])
      }
      targetTips <- NTip(trees[[targetTree]])
      node <- sum(
        as.numeric(gsub(tagWrite, "\\1", tagLine, perl = TRUE)),
        - targetTips, # Tip labels stored separately
        + 1 # TNT counts from zero
      )
      tagText <- gsub(tagWrite, "\\2", tagLine, perl = TRUE)
      if (node < 1) {
        message("Ignoring tag on leaf ", node + targetTips,
                ": \"", tagText, "\"")
      } else {
        trees[[targetTree]][["node.label"]][node] <- 
          if (trees[[targetTree]][["node.label"]][node] == "") {
            tagText
          } else {
            paste(trees[[targetTree]][["node.label"]][node], tagText)
          }
      }
    } else if (tagIsClear[i]) {
      trees[[targetTree]][["node.label"]] <- NULL
    } else {
      warning("ttags command not yet supported: ", tagLine,
              ". Contact TreeTools maintainer.")
    }
  }
  
  # Return:
  if (length(trees) == 1) {
    trees[[1]]
  } else if (length(trees) == 0) {
    NULL                                                                        # nocov
  } else {
    class(trees) <- "multiPhylo"
    trees
  }
  
}

#' @rdname ReadTntTree
#' @param treeText Character string describing one or more trees,
#' in the parenthetical format output by TNT.
#' @export
TntText2Tree <- function(treeText) {
  treeText <- paste(strsplit(treeText, "\\s*\\*\\s*", perl = TRUE)[[1]], ";")
  treeText <- gsub(";(\\s*;)+", ";", treeText)
  treeText <- gsub("\\s*([\\w'\\.\\-]+)\\s*", "\\1,", treeText, perl = TRUE)
  treeText <- gsub("\\)\\s*\\(", "),(", treeText, perl = TRUE)
  treeText <- gsub(",)", ")", treeText, fixed = TRUE)
  
  tr <- read.tree(text = treeText)
  if (inherits(tr, "multiPhylo")) {
    tr[] <- lapply(tr, function (tree) {
      tree$tip.label[] <- Unquote(tree[["tip.label"]])
      tree
    })
  } else if (inherits(tr, "phylo")) {
    tr[["tip.label"]][] <- Unquote(tr[["tip.label"]])
  }
  
  # Return:
  tr
}

#' @rdname ReadTntTree
#' @export
TNTText2Tree <- TntText2Tree

.TntCommands <- function(filepath) {
  fileText <- readLines(filepath)
  commands <- strsplit(
    paste(fileText, collapse = "\UA"), #\UA = \n
    "\\s*;\\s*")[[1]]
}

Try the TreeTools package in your browser

Any scripts or data that you put into this service are public.

TreeTools documentation built on June 22, 2024, 9:27 a.m.