R/write_tnt_matrix.R
In Claddis: Measuring Morphological Diversity and Evolutionary Tempo

Documented in write_tnt_matrix

#' Writes out a morphological TNT data file
#'
#' @description
#'
#' Writes out a morphological data file in Hennig86/TNT format.
#'
#' @param cladistic_matrix A cladistic matrix in the format imported by \link{read_nexus_matrix}.
#' @param file_name The file name to write to. Should end in \code{.tnt}.
#' @param add_analysis_block Whether or not to add analysis block (i.e., tree search commands).
#'
#' @details
#'
#' Writes out a TNT (Goloboff et al. 2008; Goloboff and Catalano 2016) data file representing the distribution of discrete morphological characters in a set of taxa. Data must be in the format created by importing data with \link{read_nexus_matrix}.
#'
#' Note that the format can currently deal with continuous characters, sequence (DNA) data, and combinations of these and discrete morphology, but not yet the morphometric format introduced in Goloboff and Catalano (2016).
#'
#' @author Graeme T. Lloyd \email{graemetlloyd@@gmail.com}
#'
#' @seealso
#'
#' \link{write_nexus_matrix}
#'
#' @references
#'
#' Goloboff, P. A. and Catalano, S. A., 2016. TNT version 1.5, including a full implementation of phylogenetic morphometrics. \emph{Cladistics}, \bold{32}, 221-238.
#'
#' Goloboff, P., Farris, J. and Nixon, K., 2008. TNT, a free program for phylogenetic analysis. \emph{Cladistics}, \bold{24}, 774-786.
#'
#' @seealso
#'
#' \link{build_cladistic_matrix}, \link{compactify_cladistic_matrix}, \link{prune_cladistic_matrix}, \link{read_nexus_matrix}, \link{safe_taxonomic_reduction}, \link{write_nexus_matrix}
#'
#' @examples
#'
#' # Write out Michaux 1989 to current working directory:
#' write_tnt_matrix(cladistic_matrix = michaux_1989, file_name = "michaux_1989.tnt")
#'
#' # Remove file when finished:
#' file.remove(file1 = "michaux_1989.tnt")
#' @export write_tnt_matrix
write_tnt_matrix <- function(cladistic_matrix, file_name, add_analysis_block = FALSE) {

  # Check cladistic_matrix has class cladisticMatrix and stop and warn user if not:
  if (!inherits(x = cladistic_matrix, what = "cladisticMatrix")) stop("cladistic_matrix must be an object of class \"cladisticMatrix\".")

  # Subfunction to convert matrices back to symbols, missing and gap characters:
  convert_matrix <- function(cladistic_matrix) {

    # If there are missing characters replace with missing symbol:
    if (any(is.na(cladistic_matrix$matrix))) cladistic_matrix$matrix[is.na(cladistic_matrix$matrix)] <- "?"

    # If there are gap characters replace with gap symbol:
    if (sum(as.vector(cladistic_matrix$matrix) == "") > 0) cladistic_matrix$matrix[cladistic_matrix$matrix == ""] <- "-"

    # If there are symbols (i.e., non-continuous data):
    if (length(x = cladistic_matrix$characters$symbols) > 0) {

      # If datatype is STANDARD set TNT symbols:
      if (cladistic_matrix$datatype == "STANDARD") tnt_symbols <- c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V")

      # If datatype is non-STANDARD (but still discrete):
      if (cladistic_matrix$datatype != "STANDARD") tnt_symbols <- cladistic_matrix$characters$symbols

      # In reverse order go through numbers:
      for (i in rev(tnt_symbols)) {

        # Replace current number with appropriate symbol:
        if (length(x = grep(as.character(which(x = tnt_symbols == i) - 1), cladistic_matrix$matrix)) > 0) cladistic_matrix$matrix <- gsub(pattern = as.character(which(x = tnt_symbols == i) - 1), replacement = i, x = cladistic_matrix$matrix)
      }
    }

    # If there are uncertainties:
    if (length(x = grep("/", cladistic_matrix$matrix)) > 0) {

      # Find cells that have uncertainties:
      uncertainties <- grep("/", cladistic_matrix$matrix)

      # Replace with all possible values in curly braces:
      cladistic_matrix$matrix[uncertainties] <- paste("[", unlist(x = lapply(X = strsplit(cladistic_matrix$matrix[uncertainties], split = "/"), FUN = paste, collapse = "")), "]", sep = "")
    }

    # If there are polymorphisms:
    if (length(x = grep("&", cladistic_matrix$matrix)) > 0) {

      # Find cells with polymorphsims:
      polymorphisms <- grep("&", cladistic_matrix$matrix)

      # Resplae with values inside parentheses:
      cladistic_matrix$matrix[polymorphisms] <- paste("[", unlist(x = lapply(X = strsplit(cladistic_matrix$matrix[polymorphisms], split = "&"), paste, collapse = "")), "]", sep = "")
    }

    # Get equal length taxon names (with added spaces):
    taxon_names_with_spaces <- paste(rownames(x = cladistic_matrix$matrix), unlist(x = lapply(X = lapply(X = as.list(x = (max(nchar(x = rownames(x = cladistic_matrix$matrix))) + 2) - nchar(x = rownames(x = cladistic_matrix$matrix))), FUN = rep, x = " "), FUN = paste, collapse = "")), sep = "")

    # If block is continuous:
    if (cladistic_matrix$datatype == "CONTINUOUS") {

      # Format rows with spaces between values:
      cladistic_matrix$matrix <- paste(taxon_names_with_spaces, (apply(cladistic_matrix$matrix, 1, paste, collapse = " ")), sep = "")

      # If block is non-continuous:
    } else {

      # Format rows without spaces between values:
      cladistic_matrix$matrix <- paste(taxon_names_with_spaces, (apply(cladistic_matrix$matrix, 1, paste, collapse = "")), sep = "")
    }

    # Return just the newly formatted matrix (now a vector):
    cladistic_matrix$matrix
  }

  # Isolate just data blocks (i.e., cladistic_matrix without topper):
  data_blocks <- cladistic_matrix[2:length(x = cladistic_matrix)]

  # Get block names:
  block_names <- unlist(x = lapply(X = data_blocks, FUN = "[[", "block_name"))

  # Get data_types:
  data_types <- unlist(x = lapply(X = data_blocks, FUN = "[[", "datatype"))

  # NEXUS versions of data_types:
  nexus_datatypes <- c("CONTINUOUS", "DNA", "NUCLEOTIDE", "PROTEIN", "RESTRICTION", "RNA", "STANDARD")

  # TNT versions of data_types:
  tnt_datatypes <- c("continuous", "dna", "dna", "proteins", "numeric", "dna", "numeric")

  # Replace NEXUS versions of data_types with TNT version (as best as I can guess!):
  for (i in 1:7) data_types <- gsub(pattern = nexus_datatypes[i], replacement = tnt_datatypes[i], x = data_types)

  # Get number of taxa:
  n_taxa <- unname(unlist(x = lapply(X = lapply(X = data_blocks, FUN = "[[", "matrix"), FUN = nrow))[1])

  # Get number of characters:
  n_characters <- unlist(x = lapply(X = lapply(X = data_blocks, FUN = "[[", "matrix"), FUN = ncol))

  # Get symbols strings:
  symbols <- unlist(x = lapply(X = lapply(X = lapply(X = data_blocks, FUN = "[[", "characters"), FUN = "[[", "symbols"), FUN = paste, collapse = " "))

  # Get missing value:
  missing <- unlist(x = lapply(X = lapply(X = data_blocks, FUN = "[[", "characters"), "[[", "missing"))

  # Get gap symbol:
  gap <- unlist(x = lapply(X = lapply(X = data_blocks, FUN = "[[", "characters"), FUN = "[[", "gap"))

  # Conver matrices to vectors of text strings:
  data_block_strings <- lapply(X = data_blocks, FUN = convert_matrix)

  # Set up header block (returns empty string if nothing there):
  header_block <- ifelse(length(x = cladistic_matrix$topper$header) > 0, paste("'", cladistic_matrix$topper$header, "'\n", sep = ""), "")

  # Set up character block (including MATRIX that will begin data):
  character_block <- paste("& [", data_types, "]\n", sep = "")

  # Take character block and meld with matri(ces) into matrix block(s):
  matrix_block <- paste(paste(character_block, unlist(x = lapply(X = data_block_strings, paste, collapse = "\n")), "\n", sep = ""), collapse = "")

  # Get ordering of all characters in sequence:
  ordering <- unname(unlist(x = lapply(X = data_blocks, "[[", "ordering")))

  # Get weights of all characters in sequence:
  character_weights <- unname(unlist(x = lapply(X = data_blocks, "[[", "character_weights")))

  # Make sure costmatrices are a list if null:
  if (!is.list(cladistic_matrix$topper$costmatrices)) cladistic_matrix$topper$costmatrices <- list(NULL)

  # If there are costmatrices:
  if (any(!unlist(x = lapply(X = cladistic_matrix$topper$costmatrices, is.null)))) {

    # Empty vector to store hits (characters assigned to a costmatrix):
    global_hits <- vector(mode = "numeric")

    # Empty vector to store all costmatrix lines:
    all_costmatrix_lines <- vector(mode = "character")

    # Check there are not too many costmatrices:
    if (length(x = cladistic_matrix$topper$costmatrices) > 32) stop("Too many (>32) costmatrices for TNT!")

    # For each costmatrix:
    for (i in 1:length(x = cladistic_matrix$topper$costmatrices)) {

      # Set up costs vector:
      costs <- vector(mode = "character")

      # For each row state (from):
      for (j in rownames(x = cladistic_matrix$topper$costmatrices[[i]]$costmatrix)) {

        # For each column state (to):
        for (k in colnames(x = cladistic_matrix$topper$costmatrices[[i]]$costmatrix)) {

          # Add cost of j to k transition to costs vector:
          costs <- c(
            costs,
            paste(
              j,
              ">",
              k,
              " ",
              gsub(
                pattern = Inf,
                replacement = "i",
                x = cladistic_matrix$topper$costmatrices[[i]]$costmatrix[j, k]
              ),
              sep = ""
            )
          )
        }
      }

      # Format top of costmatrix code:
      costmatrix_top <- paste("smatrix = ", i - 1, " (", names(cladistic_matrix$topper$costmatrices)[i], ")", sep = "")

      # Get hits (characters assigned to ith costmatrix):
      hits <- which(x = ordering == names(cladistic_matrix$topper$costmatrices)[i])

      # Add hits to global hits for all costmatrices:
      global_hits <- c(global_hits, hits)

      # Stop if no hits!:
      if (length(x = hits) == 0) stop(paste("No characters assigned to costmatrix: ", names(cladistic_matrix$topper$costmatrices)[i], ".", sep = ""))

      # Build costmatrix lines:
      costmatrix_lines <- paste(c(costmatrix_top, costs, ";", paste("smatrix + ", names(cladistic_matrix$topper$costmatrices)[i], " ", paste(hits - 1, collapse = " "), ";", sep = "")), collapse = "\n")

      # Add costmatrix lines to all costmatrix lines:
      all_costmatrix_lines <- c(all_costmatrix_lines, costmatrix_lines)
    }

    # Make costmatrix block:
    costmatrix_block <- paste(paste(c(all_costmatrix_lines, paste("ccode ( ", paste(sort(x = global_hits - 1), collapse = " "), ";", sep = "")), collapse = "\n"), "\n", sep = "")

  # If no costmatri(ces):
  } else {

    # Create empty costmatrix block:
    costmatrix_block <- ""
  }

  # Build ccode block:
  ccode_block <- paste("ccode ", paste(paste(ifelse(ifelse(ordering == "continuous", "ord", ordering) == "ord", "+", "-"), ifelse(character_weights == 0, "]", "["), "/", ifelse(ordering == "continuous", "1", ifelse(character_weights == 0, 1, character_weights)), " ", paste(1:sum(n_characters) - 1), sep = ""), collapse = " "), " ;\n", sep = "")

  # Build full string with all blocks together:
  full_string <- paste("taxname=;\nmxram 4096;\ntaxname +", max(nchar(x = rownames(x = cladistic_matrix$matrix_1$matrix))), ";\nnstates num 32;\nxread\n", header_block, sum(n_characters), " ", n_taxa, "\n", matrix_block, ";\n", ccode_block, costmatrix_block, "proc/;\n", sep = "")

  # If adding analysis block:
  if (add_analysis_block) {

    # If there are few enough taxa for an exact solution:
    if (n_taxa <= 24) {

      # Use implicit enumeration:
      analysis_block <- c("collapse [;", "ienum;")

      # If there are too many taxa (requiring heuristics):
    } else {

      # First iteration with new tech (where scratch.tre is created):
      analysis_block <- c("rseed*;", "hold 999;", "xmult=rss fuse 50 drift 50 ratchet 50;", "tsave scratch.tre;", "save;", "tsave /;")

      # Iterations 2-20 (where trees are appended to scratch.tre):
      analysis_block <- c(analysis_block, rep(c("rseed*;", "hold 999;", "xmult=rss fuse 50 drift 50 ratchet 50;", "tsave scratch.tre +;", "save;", "tsave /;"), 19))

      # Read in new technology trees (scratch.tre) and finish with a heuristic (tbr) search for all mpts:
      analysis_block <- c(analysis_block, c("hold 100000;", "shortread scratch.tre;", "bbreak=tbr;"))
    }

    # Get stripped file name for use in export lines:
    output_file_name <- strsplit(strsplit(file_name, "/")[[1]][length(x = strsplit(file_name, "/")[[1]])], "\\.")[[1]][1]

    # Make name for strict consensus and MPTs tree:
    scc_file_name <- paste("export -", output_file_name, "tntmpts_plus_strict.nex;", sep = "")

    # Make MRP file name:
    mrp_file_name <- c("mrp;", paste("export ", output_file_name, "mrp.nex;", sep = ""))

    # Add analysis block to output file:
    full_string <- gsub(pattern = "proc/;\n", replacement = paste(paste(analysis_block, collapse = "\n"), "nelsen*;", scc_file_name, paste(mrp_file_name, collapse = "\n"), "\nproc/;\n"), x = full_string)
  }

  # Write to file:
  write(full_string, file_name)
}