Generate MrBayes trees using bayesgen.pl

Before running this file, convert MrBayes output into R-readable output in nexTrees folder using t2nex.pl

Getting things ready

Before loading trees, R needs to know where abouts on the computer files are stored.

If you are using RStudio, then R will by default begin in the directory in which the package is installed -- great, no further work is required.

Otherwise you might need to run setwd("C:/path/to/OReillyEtAl2016/data-raw"), substituting in the necessary path on your machine.

Data are saved to the package using the usedata package, which you'll need to install using install.packages('usethis') if you haven't already.

```{R Initialize}

Load the tree used to generate the simulated data matrices

referenceTree <- OReillyEtAl2016::orReferenceTree

Tree files are located in the data-raw subdirectory

DIR_ROOT = '../data-raw/trees/'

The file names have a number of components, whose format is defined here:

N_BAYES_RUNS <- 2 DATASET_NUMS <- 1:100 REPL_NUMS <- 1:10 NCHAR_VALUES <- c('100', '350', '1000') FILE_NUMS <- as.character(vapply(DATASET_NUMS, paste0, character(length(REPL_NUMS)), '_', REPL_NUMS))

TIP_ORDER <- c("Osteolepiformes", "Ichthyostega", "Archeria", "Proterogyrinus", "Ariekanerpeton", "Seymouria", "Kotlassia", "Nectridea", "Utaherpeton", "Asaphestera", "Lysorophia", "Cardiocephalus", "Brachystelechidae", "Rhynchonkos", "Pantylus", "Microbrachis", "Adelogyrinidae", "Caeciliidae", "Typhlonectidae", "Ichthyophiidae", "Rhinatrematidae", "Eocaecilia", "Rhyacotritonidae", "Plethodontinae", "Bolitoglossinae", "Spelerpinae", "Hemidactylinae", "Amphiumidae", "Salamandridae", "Dicamptodontidae", "Ambystomatidae", "Proteidae", "Sirenidae", "Cryptobranchidae", "Hynobiidae", "Karaurus", "Leiopelmatidae", "Ascaphidae", "Pelodytidae", "Megophryidae", "Pelobatidae", "Scaphiopodidae", "Ranoidea", "Sooglossidae", "Heleophrynidae", "Myobatrachidae", "Calyptocephallidae", "Hyloidea", "Rhinophrynidae", "Pipidae", "Bombinatoridae", "Discoglossidae", "Triadobatrachus", "Aistopoda", "Westlothiana", "Procolophonidae", "Captorhinidae", "Synapsida", "Diadectes", "Limnoscelis", "Solenodonsaurus", "Gephyrostegidae", "Doleserpeton", "Amphibamus", "Apateon", "Tersomius", "Ecolsonia", "Eryops", "Dendrerpeton", "Colosteidae", "Baphetidae", "Crassigyrinus", "Tulerpeton", "Acanthostega", "Panderichthyidae");

Trees themselves are saved in the data-raw/Trees subdirectory

TREE_FILE <- paste0(DIR_ROOT, '%s.%s/%s.sym') # Defines the pattern of the file name BAYES_FILE <- paste0(DIR_ROOT, '%s.%s/R%s.con.nex') # Defines the pattern of the file name BAYES_TREE <- paste0(DIR_ROOT, 'MrBayes/%s_%s.mb.nex.run%s.nex')

SUBOPTIMAL <- list( mk = seq(1, 0.5, length.out = 21L), gc = c(-100, -75, -50, -25, seq(0, 100, length.out = 21L-4L)) )

ReadNexus <- ape::read.nexus WriteNexus <- ape::write.nexus

GenerateScores <- function(analysis, trees) { cat("Generating results for", analysis, "... ") cat("Quartets...") orQuartets[[as.character(NCHAR)]][[analysis]] <<- vapply(trees, Quartet::QuartetStatus, 0L * BLANK_QUARTS[, , 1], cf = referenceTree) cat(" Done. Partitions...") orPartitions[[as.character(NCHAR)]][[analysis]] <<- vapply(trees, Quartet::SplitStatus, 0 * BLANK_SPLITS[, , 1], cf = referenceTree) cat(" Done.\n") }

ANALYSES <- c(mk = 'markov', eq = 'equal', k2 = 'implied2', k3 = 'implied3', k5 = 'implied5', k10 = 'implied10', k20 = 'implied20', k200 ='implied200', kC = 'impliedC')

BLANK_SPLITS <- array(NA, c(21, 8, length(FILE_NUMS)), dimnames=list(NULL, c("N", "P1", "P2", "s", "d1", "d2", "r1", "r2"), FILE_NUMS)) ALL_BLANK_P <- lapply(ANALYSES, function(x) BLANK_SPLITS) names(ALL_BLANK_P) <- ANALYSES

BLANK_QUARTS <- array(NA, c(21, 7, length(FILE_NUMS)), dimnames=list(NULL, c("N", "Q", "s", "d", "r1", "r2", "u"), FILE_NUMS)) ALL_BLANK_Q <- lapply(ANALYSES, function(x) BLANK_QUARTS) names(ALL_BLANK_Q) <- ANALYSES THESE_FILE_NUMS <- as.character(vapply(DATASET_NUMS, paste0, character(length(REPL_NUMS)), '_', REPL_NUMS))

Create empty data objects as placeholders in first instance

if (!any(file.exists(paste0(c('..', '.'), '/data/orQuartets.rda')))) { # Don't overwrite existing data! orPartitions <- list('100'=ALL_BLANK_S, '350'=ALL_BLANK_S, '1000'=ALL_BLANK_S) orQuartets <- list('100'=ALL_BLANK_Q, '350'=ALL_BLANK_Q, '1000'=ALL_BLANK_Q) usethis::use_data(orQuartets, orPartitions, overwrite=FALSE) remove("orQuartets", "orPartitions") }

' @param tree Output of TreeSearch::TNTText2Tree(text)

TranslateTntTips <- function(tree) { tree$tip.label <- TIP_ORDER[as.integer(tree$tip.label) + 1L] tree }

Resampling <- function (jackFile) { jackLines <- readLines(jackFile) jackTree <- TreeSearch::TNTText2Tree(jackLines[3]) jackTipOrder <- order(as.integer(jackTree$tip.label) + 1L) jackNodeOrder <- unique(unlist(phangorn::Ancestors(jackTree, jackTipOrder)))[-1] nTntNode <- jackTree$Nnode jackTree <- TranslateTntTips(jackTree)

jackScores <- trimws(gsub("ttag \+\d+ (.); ", "\1", jackLines[length(jackLines) - (nTntNode - 2L):0]))[order(jackNodeOrder)] jackScores <- gsub("\?", 0, jackScores) jackScores <- gsub("\[(\d+)\]", "-\1", jackScores) jackScores <- matrix(as.double(unlist(strsplit(jackScores, '/'))), 2, dimnames=list(c('freq', 'gc'), NULL))

attr(jackTree, 'scores') <- jackScores # Return: jackTree }

ReduceTreesBySupport <- function(tree) { nTip <- length(tree$tip.label) rootNode <- nTip + 1L nNode <- tree$Nnode nodeSupport <- attr(tree, 'scores')['gc', ]

nodeNumbers <- rootNode + seq_along(nodeSupport) collapse <- lapply(SUBOPTIMAL[['gc']], function (threshold) nodeNumbers[nodeSupport < threshold])

# Return: lapply(collapse, function (toCollapse) TreeSearch::CollapseNode(tree, toCollapse)) }

SupportSuboptimal <- function (filePath) { resampled <- Resampling(filePath) reduced <- ReduceTreesBySupport(resampled) structure(reduced, class='multiPhylo') }

Helper function to load suboptimal trees

LoadSuboptimal <- function (fileNums, pref) { ret <- lapply(sprintf(TREE_FILE, pref, NCHAR, fileNums), SupportSuboptimal) names(ret) <- fileNums

# Return: ret }

### Load trees, calcuate and save statistics

```{R calculate-statistics}
orQuartets <- OReillyEtAl2016::orQuartets
orPartitions <- OReillyEtAl2016::orPartitions

for (NCHAR in NCHAR_VALUES) {

  for (NUM in THESE_FILE_NUMS) {
    # Generate consensus trees from MrBayes output.
    if (!file.exists(sprintf(TREE_FILE, 'mk', NCHAR, NUM, ''))
        && all(file.exists(sprintf(BAYES_TREE, NCHAR, NUM, seq_len(N_BAYES_RUNS)))))
    {
      trees <- structure(unlist(lapply(seq_len(N_BAYES_RUNS), function (run) {
        ReadNexus(file=sprintf(BAYES_TREE, NCHAR, NUM, run))
      }), recursive=FALSE), class='multiPhylo')
      consi <- lapply(BAYES_SUBOPTIMAL, function (p) ape::consensus(trees, p=p))
      names(consi) <- paste0('consensus_', BAYES_SUBOPTIMAL)
      WriteNexus(rev(consi), file=sprintf(TREE_FILE, 'mk', NCHAR, NUM, ''))
    }
  }


  # Load consensus trees from Equal Weights and Markov model analyses
  markov <- lapply(sprintf(BAYES_FILE, 'mk', NCHAR, THESE_FILE_NUMS), ReadNexusSafely)
  names(markov) <- THESE_FILE_NUMS

  SAFE_FILE_NUMS <- THESE_FILE_NUMS[
    !vapply(markov, is.null, logical(1)) &
    file.exists(paste0(DIR_ROOT, 'k2.'  , NCHAR, '/', THESE_FILE_NUMS, '.sym')) & 
    file.exists(paste0(DIR_ROOT, 'k5.'  , NCHAR, '/', THESE_FILE_NUMS, '.sym')) & 
    file.exists(paste0(DIR_ROOT, 'k10.' , NCHAR, '/', THESE_FILE_NUMS, '.sym')) & 
    file.exists(paste0(DIR_ROOT, 'k20.' , NCHAR, '/', THESE_FILE_NUMS, '.sym')) & 
    file.exists(paste0(DIR_ROOT, 'k200.', NCHAR, '/', THESE_FILE_NUMS, '.sym'))]

  # Some TNT / MrBayes analyses failed to complete after > 48 hours
  cat(NCHAR, "characters: Results not available for: ",
      THESE_FILE_NUMS[!THESE_FILE_NUMS %in% SAFE_FILE_NUMS], "\n")

  GenerateScores('markov', markov[SAFE_FILE_NUMS])
  remove('markov') # Free memory

  GenerateScores('equal', LoadSuboptimal(SAFE_FILE_NUMS, 'eq'))

    implied2Trees <- LoadSuboptimal(SAFE_FILE_NUMS, 'k2')
    implied3Trees <- LoadSuboptimal(SAFE_FILE_NUMS, 'k3')
    implied5Trees <- LoadSuboptimal(SAFE_FILE_NUMS, 'k5')
   implied10Trees <- LoadSuboptimal(SAFE_FILE_NUMS, 'k10')
   implied20Trees <- LoadSuboptimal(SAFE_FILE_NUMS, 'k20')
  implied200Trees <- LoadSuboptimal(SAFE_FILE_NUMS, 'k200')
    impliedCTrees <- lapply(SAFE_FILE_NUMS,
    function(NUM) {
      i2   <-   implied2Trees[[NUM]]
      i3   <-   implied3Trees[[NUM]]
      i5   <-   implied5Trees[[NUM]]
      i10  <-  implied10Trees[[NUM]]
      i20  <-  implied20Trees[[NUM]]
      i200 <- implied200Trees[[NUM]]

      # Return:
      structure(lapply(1:21, function (j) 
        ape::consensus(i2[[j]], i3[[j]], i5[[j]], i10[[j]], i20[[j]], i200[[j]])),
        class = 'multiPhylo')
      }
    )
  names(impliedCTrees) <- SAFE_FILE_NUMS

  ### Calculate tree statistics

  # Define the expected format of tree statistics (needed for vapply)
  # (Using lapply or sapply instead of vapply can be simpler, and is only slightly slower)

  GenerateScores(  'impliedC', impliedCTrees); remove(impliedCTrees)
  GenerateScores(  'implied2', implied2Trees); remove(implied2Trees)
  GenerateScores(  'implied3', implied3Trees); remove(implied3Trees)
  GenerateScores(  'implied5', implied5Trees); remove(implied5Trees)
  GenerateScores( 'implied10', implied10Trees); remove(implied10Trees)
  GenerateScores( 'implied20', implied20Trees); remove(implied20Trees)
  GenerateScores('implied200', implied200Trees); remove(implied200Trees)

  usethis::use_data(orPartitions, overwrite=TRUE, compress='bzip2')
  usethis::use_data(orQuartets,   overwrite=TRUE, compress='bzip2') # xz better, but errors
}

tools::resaveRdaFiles('data') # Slow; once optimal compression identified,
                              # specify it in use_data


ms609/OReillyEtAl2016 documentation built on March 3, 2024, 1:18 p.m.