inst/doc/tree-space.R

## ----load-trees---------------------------------------------------------------
# Load required libraries 
library("TreeTools", quietly = TRUE) 
library("TreeDist") 
library("TreeSearch") 
  

# Load Sun et al. 2018 trees from TreeSearch package
treeFile <- system.file("datasets/Sun2018.nex", package = "TreeSearch")

# Load trees from your own file with
# treeFile <- "Path/to/my.file"
# 
# To check and set working directory, use:
# getwd() # Should match location of data / tree files 
# setwd(".") # Replace . with desired/directory to change 

# Read all trees from file
trees <- read.nexus(treeFile) 

# Down-sample to a maximum of 48 trees
trees <- trees[unique(as.integer(seq.int(1, length(trees), length.out = 48)))] 

## ----exploring-consensus, fig.keep = "none"-----------------------------------
# Select an appropriate majority value
# When analysing parsimony results, this value should be 1.
majority <- 0.5

# Identify rogue taxa
exclude <- Rogue::QuickRogue(trees, p = majority)$taxon[-1]
exclude

# Select a rogue whose positions should be depicted
plottedRogue <- exclude[1]

# Remove other excluded taxa from tree
consTrees <- lapply(trees, DropTip, setdiff(exclude, plottedRogue))

# Colour tip labels according to their original 'instability' (Smith 2022a)
tipCols <- Rogue::ColByStability(trees)

# Our plotted rogue will not appear on the tree
tipCols <- tipCols[setdiff(consTrees[[1]]$tip.label, plottedRogue)] 


# Set up plotting area 
par(
 mar = c(0, 0, 0, 0), # Zero margins 
 cex = 0.8            # Smaller font size 
)

# Plot the reduced consensus tree, showing position of our plotted rogue 
plotted <- RoguePlot( 
 trees = consTrees, 
 tip = plottedRogue,
 p = majority, 
 edgeLength = 1, 
 tip.color = tipCols 
) 

# Calculate split concordance
concordance <- SplitFrequency(plotted$cons, trees) / length(trees)

# Annotate splits by concordance 
LabelSplits(
 tree = plotted$cons,
 labels = signif(concordance, 3),
 col = SupportColor(concordance),
 frame = "none",
 pos = 3
) 

## ----tree-distances, fig.keep = "none"----------------------------------------
# Compute distances between pairs of trees
dists <- TreeDist::ClusteringInfoDistance(trees)

## ----quartet-distance, eval = FALSE-------------------------------------------
#  # The quartet distance is a slower alternative:
#  library("Quartet")
#  dists <- as.dist(Quartet::QuartetDivergence(
#     Quartet::ManyToManyQuartetAgreement(trees), similarity = FALSE)
#  )

## ----compute-clusters, fig.keep = "none"--------------------------------------
# Try K-means clustering (Hartigan & Wong 1979):
kClusters <- lapply(
  2:15, # Minimum 2 clusters; maximum 15
  function (k) kmeans(dists, k)
)
kSils <- vapply(kClusters, function (kCluster) { 
 mean(cluster::silhouette(kCluster$cluster, dists)[, 3]) 
}, double(1)) 
bestK <- which.max(kSils) 
kSil <- kSils[bestK] # Best silhouette coefficient 
kCluster <- kClusters[[bestK]]$cluster # Best solution 

# Try partitioning around medoids (Maechler et al. 2019):
pamClusters <- lapply(2:15, function (k) cluster::pam(dists, k = k)) 
pamSils <- vapply(pamClusters, function (pamCluster) { 
 mean(cluster::silhouette(pamCluster)[, 3]) 
}, double(1)) 
bestPam <- which.max(pamSils) 
pamSil <- pamSils[bestPam] # Best silhouette coefficient 
pamCluster <- pamClusters[[bestPam]]$cluster # Best solution 

# Try hierarchical clustering with minimax linkage (Bien & Tibshirani 2011):
hTree <- protoclust::protoclust(dists) 
hClusters <- lapply(2:15, function (k) cutree(hTree, k = k)) 
hSils <- vapply(hClusters, function (hCluster) { 
 mean(cluster::silhouette(hCluster, dists)[, 3]) 
}, double(1)) 
bestH <- which.max(hSils) 
hSil <- hSils[bestH] # Best silhouette coefficient 
hCluster <- hClusters[[bestH]] # Best solution

# Set threshold for recognizing meaningful clustering 
# no support < 0.25 < weak < 0.5 < good < 0.7 < strong 
threshold <- 0.5

# Compare silhouette coefficients of each method 
bestMethodId <- which.max(c(threshold, pamSil, hSil, kSil)) 
bestCluster <- c("none", "pam", "hmm", "kmn")[bestMethodId] 

# Best clustering:
c("Structure below threshold",
  "Partitioning around medoids",
  "Hierarchical-minimax",
  "K-means")[bestMethodId]

# Best silhouette coefficient:
max(c(threshold, pamSil, hSil, kSil))

# Store the cluster to which each tree is optimally assigned: 
clustering <- switch(bestCluster, pam = pamCluster, hmm = hCluster, kmn = kCluster, 1) 
nClusters <- length(unique(clustering))

## ----map-space, fig.keep = "none"---------------------------------------------
nDim <- 3 # Number of dimensions to plot

# Generate first dimensions of tree space using PCoA 
map <- cmdscale(dists, k = nDim) 

# Prepare plot layout 
nPanels <- nDim * (nDim - 1L) / 2L # Lower-left triangle 
plotSeq <- matrix(0, nDim, nDim)
plotSeq[upper.tri(plotSeq)] <- seq_len(nPanels)
plotSeq[nDim - 1, 2] <- max(plotSeq) + 1L
layout(t(plotSeq[-nDim, -1]))

# Set plot margins 
par(mar = rep(0.2, 4))

# Set up tree plotting symbols
treePch <- 0 # Square
clusterCol <- palette.colors(nClusters, "Alphabet", alpha = 0.8)
treeCols <- clusterCol[clustering]
for (i in 2:nDim) for (j in seq_len(i - 1)) { 
  
  # Set up blank plot 
  plot( 
    x = map[, j], 
    y = map[, i], 
    ann = FALSE,        # No annotations 
    axes = FALSE,       # No axes 
    frame.plot = TRUE,  # Border around plot 
    type = "n",         # Don't plot any points yet 
    asp = 1,            # Fix aspect ratio to avoid distortion 
    xlim = range(map),  # Constant X range for all dimensions 
    ylim = range(map)   # Constant Y range for all dimensions 
  ) 
   
  # Plot minimum spanning tree (Gower 1969) to depict distortion (Smith 2022)
  mst <- MSTEdges(as.matrix(dists)) 
  segments( 
    x0 = map[mst[, 1], j], 
    y0 = map[mst[, 1], i], 
    x1 = map[mst[, 2], j], 
    y1 = map[mst[, 2], i], 
    col = "#bbbbbb", # Light grey 
    lty = 1          # Solid lines 
  ) 
   
  # Add points
  points( 
    x = map[, j], 
    y = map[, i], 
    pch = treePch, 
    col = treeCols, 
    cex = 1.7, # Point size 
    lwd = 2 # Line width 
  ) 
   
  # Mark clusters 
  for (clI in seq_len(nClusters)) { 
    inCluster <- clustering == clI 
    clusterX <- map[inCluster, j] 
    clusterY <- map[inCluster, i] 
    hull <- chull(clusterX, clusterY) 
    polygon( 
      x = clusterX[hull], 
      y = clusterY[hull], 
      lty = 1, # Solid line style 
      lwd = 2, # Wider line width 
      border = clusterCol[clI] 
    ) 
  } 
}

## ----cluster-consensus, fig.keep = "none"-------------------------------------
# Select proportion for majority rule consensus trees
# 1 is the appropriate value when parsimony trees are included
majority <- 1

# Plot each consensus tree in turn: 
for (i in seq_len(nClusters)) {
  clusterTrees <- trees[clustering == i]
  # Identify rogue taxa for this cluster
  clusterRogues <- Rogue::QuickRogue(clusterTrees, p = majority)$taxon[-1]
  
  # Colour tree labels based on stability across cluster
  tipCols <- Rogue::ColByStability(clusterTrees)
  
  cons <- ConsensusWithout( 
    trees = clusterTrees, 
    tip = clusterRogues,
    p = majority
  ) 
  
  # Root tree 
  cons <- RootTree(cons, trees[[1]]$tip.label[1])
  
  # Set unit edge lengths for viewing
  cons$edge.length <- rep.int(1, nrow(cons$edge)) 
  
  # Rotate nodes, to display clades in order of size 
  cons <- SortTree(cons, order = names(trees[[1]]$tip.label))
  
  # Plot tree
  par(mar = c(0, 0, 0, 0), cex = 0.8) # Set up plotting area
  plot( 
    cons, 
    edge.width = 2,             # Widen lines 
    font = 3,                   # Italicize labels 
    cex = 0.83,                 # Shrink tip font size 
    edge.color = clusterCol[i], # Colour tree according to previous plot
    tip.color = tipCols[cons$tip.label] 
  ) 
  legend(
    "bottomright", 
    paste("Cluster", i), 
    pch = 15,            # Filled circle icon 
    pt.cex = 1.5,        # Increase icon size 
    col = clusterCol[i], 
    bty = "n"            # Don't plot legend in box 
  ) 
  if (length(clusterRogues)) {
    # Mark omitted taxa, if any rogues present
    legend(
      "topright",
      clusterRogues,
      col = tipCols[clusterRogues],
      text.col = tipCols[clusterRogues],
      pch = "-",      # Mark as excluded
      cex = 0.8,      # Smaller font
      text.font = 3,  # Italicize
      bty = "n"       # Don't plot legend in box 
    )
  }
}

Try the TreeSearch package in your browser

Any scripts or data that you put into this service are public.

TreeSearch documentation built on Aug. 26, 2023, 9:06 a.m.