Generate MrBayes trees using `bayesgen.pl`.

Before running this file, convert the MrBayes output into R-readable files in the `nexTrees` folder using `t2nex.pl`.

## Getting things ready

Before loading trees, R needs to know whereabouts on the computer the files are stored.

If you are using RStudio, then R will by default begin in the directory in which the package is installed -- great, no further work is required.

Otherwise you might need to run `setwd("C:/path/to/CongreveLamsdell2016/data-raw")`, substituting in the necessary path on your machine.
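As a quick sanity check (illustrative only; the path matches the `DIR_ROOT` constant defined in the first chunk below):

```r
# Confirm that the raw tree files are visible from the current working directory
getwd()
file.exists('../data-raw/trees')  # should be TRUE before running the chunks below
```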

Data are saved to the package using `usethis::use_data()`; you'll need to install the `usethis` package with `install.packages('usethis')` if you haven't already.
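For example, a one-off setup along these lines will do; `usethis::use_data()` writes each object to the package's `data/` directory (`someObject` is just a placeholder name):

```r
# Install usethis if it is not already available
if (!requireNamespace('usethis', quietly = TRUE)) {
  install.packages('usethis')
}
# usethis::use_data(someObject, overwrite = TRUE)  # placeholder object name
```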

```{R Initialize}
# Load the tree used to generate the simulated data matrices
referenceTree <- CongreveLamsdell2016::clReferenceTree

# Tree files are located in the data-raw subdirectory
DIR_ROOT <- '../data-raw/trees/'
MATRIX_DIR <- '../data-raw/matrices'

# The file names have a number of components, whose format is defined here:
FILE_NUMS <- formatC(1:100, width = 3, format = 'd', flag = '0') # Add leading zeroes to numbers
SO_NUMS <- formatC(1:20, width = 2, format = 'd', flag = '0')    # Enumeration of suboptimal trees
WEIGHTINGS <- c('eq', 'k1', 'k2', 'k3', 'k5', 'kX')

# Trees themselves are saved in the data-raw/clTrees subdirectory
TREE_FILE <- paste0(DIR_ROOT, '%s/%s.', FILE_NUMS, '%s.con.tre') # Defines the pattern of the file name
MATRIX_FILES <- paste0(MATRIX_DIR, '/', FILE_NUMS, '.txt.nex')
BAYES_TREE <- paste0(DIR_ROOT, 'MrBayes/%s.nex.run%s.nex')
CI_PATH <- paste0(DIR_ROOT, 'consistency_indices.txt')
SUBOPTIMAL <- list(
  mk   = seq(1, 0.5, length.out = 21L),
  mkv  = seq(1, 0.5, length.out = 21L),
  brem = seq(1, 0.5, length.out = 21L),
  freq = seq(0, 100, length.out = 51L),
  gc   = seq(-100, 100, length.out = 41L)
)
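# Illustration (not part of the analysis): the '%s' placeholders in TREE_FILE
# are filled by sprintf(), so sprintf(TREE_FILE[1], 'eq', 'eq', '') gives
# '../data-raw/trees/eq/eq.001.con.tre', and passing '.so01' as the final
# argument points instead to the first suboptimal tree for that matrix.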

# Input matrices are numbered non-sequentially; order taken from data file.
TIP_ORDER <- as.character(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                            19, 18, 17, 15, 16, 13, 14, 20, 21, 22))

# Define the expected format of tree statistics (needed for vapply)
# (Using lapply or sapply instead of vapply can be simpler, and is only slightly slower)
BlankReturn <- function (analysis, nCol) matrix(0, ncol = nCol, nrow = length(SUBOPTIMAL[[analysis]]))
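# For example, BlankReturn('freq', 7L) is a 51 x 7 zero matrix, one row per
# 'freq' threshold: the template shape that vapply() expects QuartetScores()
# to return in the resampling chunk below.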

#' @param tree Output of TreeSearch::TNTText2Tree(text)
TranslateTntTips <- function (tree) {
  tree$tip.label <- TIP_ORDER[as.integer(tree$tip.label) + 1L]
  tree
}
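# (TNT numbers tips from zero, hence the '+ 1L' when indexing into TIP_ORDER.)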

# Helper function to load suboptimal trees
LoadSuboptimal <- function (pref) {
  lapply(TREE_FILE, function (treeFile) {
    lapply(c(sprintf(treeFile, pref, pref, ''),
             sprintf(treeFile, pref, pref, paste0('.so', SO_NUMS))),
           TreeSearch::ReadTntTree, tipLabels = TIP_ORDER)
  })
}

# Helper function to load bootstrap and jackknife scores
Resampling <- function (jackFile) {
  jackLines <- readLines(jackFile)
  jackTree <- TreeSearch::TNTText2Tree(jackLines[3])
  jackTipOrder <- order(as.integer(jackTree$tip.label) + 1L)
  jackNodeOrder <- unique(unlist(phangorn::Ancestors(jackTree, jackTipOrder)))[-1]
  nTntNode <- jackTree$Nnode
  jackTree <- TranslateTntTips(jackTree)

  jackScores <- trimws(gsub("ttag \\+\\d+ (.*); ", "\\1",
                            jackLines[length(jackLines) - (nTntNode - 2L):0]))[order(jackNodeOrder)]
  jackScores <- gsub("\\?", 0, jackScores)
  jackScores <- gsub("\\[(\\d+)\\]", "-\\1", jackScores)
  jackScores <- matrix(as.double(unlist(strsplit(jackScores, '/'))), 2,
                       dimnames = list(c('freq', 'gc'), NULL))

  attr(jackTree, 'scores') <- jackScores
  # Return:
  jackTree
}
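# Note on the parsing above: node supports of '?' are read as 0, bracketed
# values such as '[5]' as negative GC scores, and each entry then splits into
# a frequency and a GC value at the '/' separator.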

QuartetScores <- function (X) Quartet::QuartetStatus(X, cf = referenceTree)
PartitionScores <- function (X) Quartet::SplitStatus(X, cf = referenceTree)

# Not used: remove the least supported node in turn until none remain
ReduceTreesNodeByNode <- function (tree, nodeSupport) {
  nTip <- length(tree$tip.label)
  rootNode <- nTip + 1L
  nNode <- tree$Nnode
  collapseOrder <- order(as.double(nodeSupport))
  nodeOrder <- rootNode + collapseOrder
  nodeChoices <- !upper.tri(matrix(NA, length(nodeOrder), length(nodeOrder)))

  # Return:
  apply(nodeChoices, 1, function (i) TreeSearch::CollapseNode(tree, nodeOrder[i]))
}

ReduceTreesBySupport <- function (tree, tag) {
  nTip <- length(tree$tip.label)
  rootNode <- nTip + 1L
  nNode <- tree$Nnode
  nodeSupport <- attr(tree, 'scores')[tag, ]

  nodeNumbers <- rootNode + seq_along(nodeSupport)
  collapse <- lapply(SUBOPTIMAL[[tag]],
                     function (threshold) nodeNumbers[nodeSupport < threshold])

  # Return:
  lapply(collapse, function (toCollapse) TreeSearch::CollapseNode(tree, toCollapse))
}
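# For instance, with tag = 'freq' the 51 thresholds run from 0 to 100 in steps
# of 2, so the first tree retains every node and later trees collapse any node
# whose frequency falls below the corresponding threshold.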

SupportSuboptimal <- function (filePath, tag) {
  resampled <- Resampling(filePath)
  reduced <- ReduceTreesBySupport(resampled, tag)
}

#' @param pref prefix denoting weighting conditions (eq, k1, k2...)
#' @param measure jak, jackknife (p = 36); sym, symmetric resampling (p = 0.33).
#' @param tag TNT states that the first tag is the frequency, the second GC.
NodeSupportSuboptimal <- function (pref, measure, tag) {
  lapply(paste0(DIR_ROOT, pref, '/', pref, '.', FILE_NUMS, '.', measure),
         SupportSuboptimal, tag)
}

GenerateTrees <- function (method, tag) {
  result <- lapply(WEIGHTINGS, NodeSupportSuboptimal, method, tag)
  names(result) <- WEIGHTINGS
  result[['kC']] <- lapply(seq_along(FILE_NUMS), function (i)
    lapply(seq_along(result[[1]][[1]]), function (j)
      ape::consensus(result[['k2']][[i]][[j]], result[['k3']][[i]][[j]],
                     result[['k5']][[i]][[j]], result[['kX']][[i]][[j]])
    ))
  result
}
```

## Load matrices and calculate consistency

```{R load-matrices}
rawMatrices <- lapply(MATRIX_FILES, ape::read.nexus.data)
clMatrices <- lapply(rawMatrices, function (mat) lapply(mat, function (row) row[-(56:59)]))
clPhyDat <- lapply(clMatrices, phangorn::phyDat, type="USER", levels=1:2)
clCI <- vapply(clPhyDat, function (mat) phangorn::CI(referenceTree, mat), double(1))
names(clCI) <- FILE_NUMS

# Save data:
usethis::use_data(clMatrices, clPhyDat, clCI, overwrite = TRUE)
```

## Bayesian results

```{R markov-analyses}

# For each file, load the MrBayes tree:

for (NUM in FILE_NUMS) {
  if (!file.exists(sprintf(TREE_FILE[as.integer(NUM)], 'mk', 'mk', '')) &&
      all(file.exists(sprintf(BAYES_TREE, NUM, 1:4)))) {
    trees <- unlist(lapply(1:4, function (run) {
      ape::read.nexus(file = sprintf(BAYES_TREE, NUM, run))
    }), recursive = FALSE)

    class(trees) <- 'multiPhylo'
    consi <- lapply(SUBOPTIMAL$mk, function (p) ape::consensus(trees, p = p))
    names(consi) <- paste0('consensus_', SUBOPTIMAL$mk)
    ape::write.nexus(rev(consi), file = sprintf(TREE_FILE[as.integer(NUM)], 'mk', 'mk', ''))
  }
}

# Load consensus trees from Equal Weights and Markov model analyses

markovTrees <- lapply(sprintf(TREE_FILE, 'mk', 'mk', ''), ape::read.nexus)
clMkvQuartets <- vapply(markovTrees, QuartetScores, BlankReturn('mkv', 7L))
clMkvPartitions <- vapply(markovTrees, PartitionScores, BlankReturn('mkv', 8L))
usethis::use_data(clMkvQuartets, clMkvPartitions, overwrite = TRUE)
```

## Bremer results

```{R Bremer-analyses}
bremTrees <- lapply(WEIGHTINGS, LoadSuboptimal)
names(bremTrees) <- WEIGHTINGS
bremTrees[['kC']] <- lapply(seq_along(FILE_NUMS), function(i) lapply(1:21, function (j)
  ape::consensus(bremTrees[['k2']][[i]][[j]], bremTrees[['k3']][[i]][[j]],
                 bremTrees[['k5']][[i]][[j]], bremTrees[['kX']][[i]][[j]])
  ))

clBremQuartets <- lapply(bremTrees, vapply, QuartetScores, BlankReturn('brem', 7L))
clBremPartitions <-  lapply(bremTrees, vapply, PartitionScores, BlankReturn('brem', 8L))

usethis::use_data(clBremQuartets, clBremPartitions, overwrite=TRUE)
```

## Calculate tree statistics (resampling)

```{R Load resampling metrics, message=FALSE, warning=FALSE}
bootFreqTrees <- GenerateTrees('sym', 'freq')
bootGcTrees <- GenerateTrees('sym', 'gc')
jackFreqTrees <- GenerateTrees('jak', 'freq')
jackGcTrees <- GenerateTrees('jak', 'gc')

clBootFreqQuartets <- lapply(bootFreqTrees, vapply, QuartetScores, FUN.VALUE=BlankReturn('freq', 7L))
clJackFreqQuartets <- lapply(jackFreqTrees, vapply, QuartetScores, FUN.VALUE=BlankReturn('freq', 7L))
clBootFreqPartitions <- lapply(bootFreqTrees, vapply, PartitionScores, FUN.VALUE=BlankReturn('freq', 8L))
clJackFreqPartitions <- lapply(jackFreqTrees, vapply, PartitionScores, FUN.VALUE=BlankReturn('freq', 8L))

clBootGcQuartets <- lapply(bootGcTrees, vapply, QuartetScores, FUN.VALUE=BlankReturn('gc', 7L))
clJackGcQuartets <- lapply(jackGcTrees, vapply, QuartetScores, FUN.VALUE=BlankReturn('gc', 7L))
clBootGcPartitions <- lapply(bootGcTrees, vapply, PartitionScores, FUN.VALUE=BlankReturn('gc', 8L))
clJackGcPartitions <- lapply(jackGcTrees, vapply, PartitionScores, FUN.VALUE=BlankReturn('gc', 8L))

usethis::use_data(clBootFreqQuartets, clBootFreqPartitions,
                  clJackFreqQuartets, clJackFreqPartitions,
                  clBootGcQuartets, clBootGcPartitions,
                  clJackGcQuartets, clJackGcPartitions, overwrite=TRUE)

tools::resaveRdaFiles('data') # Use optimal compression
```


