Nothing
#All functions documented with examples
#Rate Table - BEAST2 version
#' Extract evolutionary rates from Bayesian clock trees produced by BEAST2
#'
#' BEAST2 stores the rates for each clock in a separate file. All trees need to be loaded using \code{treeio::read.beast}.
#'
#' @param ... \code{treedata} objects containing the summary trees with associated data on the rates for each separate clock.
#' @param summary summary metric used for the rates. Currently supported: \code{"mean"} or \code{"median"}, default \code{"median"}.
#' @param drop_dummy if not \code{NULL}, will drop the dummy extant tip with the given label from the BEAST2 summary trees prior to extracting the clock rates (when present). Default is \code{NULL}.
#'
#' @return A data frame with a column containing the node identifier (\code{node}) and one column containing the clock rates for each tree provided, in the same order as the trees.
#'
#' @seealso [get_clockrate_table_MrBayes()] for the equivalent function for MrBayes output files.
#' @seealso [clockrate_summary()] for summarizing and examining properties of the resulting rate table. Note that clade membership for each node must be customized (manually added) before these functions can be used, since this is tree and dataset dependent.
#'
#' @export
#'
#' @examples
#' #Import all clock summary trees produced by BEAST2 from your local directory
#' \dontrun{
#' tree_clock1 <- treeio::read.beast("tree_file_clock1.tre")
#' tree_clock2 <- treeio::read.beast("tree_file_clock2.tre")
#' }
#'
#' #Or use the example BEAST2 multiple clock trees that accompany EvoPhylo.
#' data(tree_clock1)
#' data(tree_clock2)
#'
#' # obtain the rate table from BEAST2 trees
#' rate_table <- get_clockrate_table_BEAST2(tree_clock1, tree_clock2, summary = "mean")
#'
#' @md
get_clockrate_table_BEAST2 <- function(..., summary = "median", drop_dummy = NULL) {
trees <- list(...)
if(length(trees) == 0) stop("No trees provided")
summary <- match.arg(summary, c("mean", "median"))
name <- if(summary == "mean") "rate" else "rate_median"
if (!is.null(drop_dummy)) {
trees <- lapply(trees, function(tr) treeio::drop.tip(tr, drop_dummy))
}
rate_table <- data.frame(nodes = as.integer(trees[[1]]@data$node))
for(tr in trees) {
data <- tr@data[match(rate_table$nodes, as.integer(tr@data$node)), ] #get rates in same order as 1st column
rate_table <- cbind(rate_table, data[[name]])
}
colnames(rate_table)[2:ncol(rate_table)] <- paste0("rates", 1:(ncol(rate_table) - 1))
row.names(rate_table) <- NULL
return(rate_table)
}
#Rate Table - MrBayes version
get_clockrate_table_MrBayes <- function(tree, summary = "median", drop_dummy = NULL) {
if (!is.null(drop_dummy)) {
tree <- treeio::drop.tip(tree, drop_dummy)
}
nodes <- as.integer(tree@data$node)
p <- unglue::unglue_data(names(tree@data), "rate<model>rlens<clock>_<summary>",
open = "<", close = ">")
rownames(p) <- names(tree@data)
p <- p[rowSums(is.na(p)) < ncol(p),,drop=FALSE]
p$clock <- gsub("\\{|\\}", "", p$clock)
summary <- match.arg(summary, c("mean", "median"))
rates <- rownames(p)[p$summary == summary]
rate_table <- setNames(data.frame(nodes, tree@data[rates]),
c("nodes", paste0("rates", p[rates, "clock"])))
for (i in seq_len(ncol(rate_table))[-1]) rate_table[[i]] <- as.numeric(rate_table[[i]])
return(rate_table)
}
#----------------------------------------------------------------------------------
#' Designate clade membership for each tip for downstream analyses summarizing rates for each clade
#'
#' @param tree Tree object (file path to a Nexus file) used to extract node numbers and tip labels.
#' @param ancestral_nodes A named list specifying the MRCA node number for each clade.
#' The names are the clade labels. To specify non-monophyletic groups, use a character string
#' in the format \code{"inclusive_node - exclusive_node"}.
#' @param other_nodes_label A label to assign to tips not included in any clade defined by \code{ancestral_nodes}.
#'
#' @return A data frame with columns: \code{node} (node number), \code{ancestral_node} (source MRCA node),
#' and \code{clade} (clade label). Each row represents the clade assignment of a node in the tree.
#'
#' @examples
#' \dontrun{
#' ancestral_nodes <- list(
#' Non_lepidosauria = 242,
#' Other_Lepidosauria = 237,
#' Gekkota = 222,
#' Scincoidea = 210,
#' Teiioidea = 192,
#' Lacertidae = 209,
#' Amphisbaenia = 205,
#' Anguiformes = 150,
#' Acrodonta = 145,
#' Pleurodonta = 133,
#' Caenophidia = 170,
#' Early_Serpentes = "164 - 170" # non-monophyletic group
#' )
#'
#' Nodes_Clade_Table <- clade_membership(
#' tree = "tree.nex",
#' ancestral_nodes = ancestral_nodes,
#' other_nodes_label = "deep_Squamata_nodes"
#' )
#' }
#'
#' @md
### Function
clade_membership <- function(tree, ancestral_nodes, other_nodes_label = "other_nodes") {
# Read the phylogenetic tree from the file
tree <- ape::read.nexus(tree)
# Initialize a list to store results
descendant_list <- list()
# A helper list to store computed descendants for easy reference
computed_descendants <- list()
# Track all descendants to identify other nodes later
all_descendants <- c()
# Loop through each ancestral node and calculate descendants
for (name in names(ancestral_nodes)) {
definition <- ancestral_nodes[[name]]
# Check if the definition is a node number or a formula
if (is.numeric(definition)) {
# If it's numeric, get the descendants as a monophyletic group
descendants <- get_descendants(tree, definition)
} else if (grepl("-", definition)) {
# If it's a formula, parse it and perform the set difference
# Split the formula by "-" to get the inclusive and exclusive nodes as strings
terms <- strsplit(definition, " - ")[[1]]
inclusive_node <- as.numeric(terms[1])
exclusive_node <- as.numeric(terms[2])
# Get descendants for both nodes
inclusive_descendants <- get_descendants(tree, inclusive_node)
exclusive_descendants <- get_descendants(tree, exclusive_node)
# Compute non-monophyletic descendants
descendants <- setdiff(inclusive_descendants, exclusive_descendants)
} else {
stop("Unrecognized format in ancestral_nodes")
}
# Store the descendants in the computed list
computed_descendants[[name]] <- descendants
# Store the results in the descendant_list
descendant_list[[name]] <- data.frame(
node = descendants,
ancestral_node = if (is.numeric(definition)) definition else inclusive_node, # Use inclusive node as reference
clade = name
)
# Track all descendants for this group
all_descendants <- c(all_descendants, descendants)
}
# Get all nodes in the tree (tips and internal nodes)
all_tree_nodes <- c(1:Ntip(tree), (Ntip(tree) + 1):(Ntip(tree) + Nnode(tree)))
# Identify nodes that are not in any descendant list (i.e., "other_nodes")
other_nodes <- setdiff(all_tree_nodes, unique(all_descendants))
# Create a data frame for 'other_nodes'
other_nodes_df <- data.frame(
node = other_nodes,
ancestral_node = NA, # No ancestral node since these are not part of specified groups
clade = other_nodes_label # Use custom label for other nodes
)
# Add the 'other_nodes' data frame to the list
descendant_list[[other_nodes_label]] <- other_nodes_df
# Combine all data frames into one
combined_df <- do.call(rbind, descendant_list)
# Return the combined data frame
return(combined_df)
}
#----------------------------------------------------------------------------------
#Summary stats for clades
clockrate_summary <- function(rate_table, file = NULL, digits = 3) {
if (!is.data.frame(rate_table)) {
stop("'rate_table' must be a data frame.", call. = FALSE)
}
if (!any(startsWith(names(rate_table), "rates"))) {
stop("'rate_table' must contain \"rates\" columns containing clockrate summaries.", call. = FALSE)
}
if (!hasName(rate_table, "clade")) {
stop("A 'clade' column must be present in the data.", call. = FALSE)
}
clocks <- sort(gsub("rates", "", names(rate_table)[startsWith(names(rate_table), "rates")], fixed = TRUE))
clades <- sort(unique(rate_table$clade))
if ("Other" %in% clades) clades <- c(setdiff(clades, "Other"), "Other")
out <- do.call("rbind", lapply(clades, function(cl) {
in_cl <- which(rate_table$clade == cl)
cbind(data.frame(clade = cl,
clock = clocks),
do.call("rbind", lapply(clocks, function(r) {
oneSummary(rate_table[[paste0("rates", r)]][in_cl], digits = digits)
}))
)
}))
out$clade <- factor(out$clade, levels = clades)
out <- out[with(out, order(clock, clade)),]
rownames(out) <- NULL
if (all(clocks == "")) out$clock <- NULL
if (length(file) > 0) {
write.csv(out, file = file)
invisible(out)
}
else {
return(out)
}
}
#Density plot of rates by clade
clockrate_dens_plot <- function(rate_table, clock = NULL, stack = FALSE, nrow = 1, scales = "fixed") {
if (!is.data.frame(rate_table)) {
stop("'rate_table' must be a data frame.", call. = FALSE)
}
if (!any(startsWith(names(rate_table), "rates"))) {
stop("'rate_table' must contain \"rates\" columns containing clockrate summaries.", call. = FALSE)
}
if (!hasName(rate_table, "clade")) {
stop("A 'clade' column must be present in the data.", call. = FALSE)
}
clock_cols <- which(startsWith(names(rate_table), "rates"))
if (is.null(clock)) {
rate_table <- rate_table[c("clade", "nodes", names(rate_table)[clock_cols])]
}
else {
if (!is.numeric(clock) || anyNA(clock) ||
!all(clock %in% as.numeric(gsub("rates", "", names(rate_table)[clock_cols])))) {
stop(paste0("'clock' must be a vector of clock indices. In this data, ",
ngettext(length(clock_cols), "1 clock is ",
paste(length(clock_cols), "clocks are")),
" available."),
call. = FALSE)
}
rate_table <- rate_table[c("clade", "nodes", paste0("rates", clock))]
}
clades <- sort(unique(rate_table$clade))
if ("Other" %in% clades) clades <- c(setdiff(clades, "Other"), "Other")
rate_table$clade <- factor(rate_table$clade, levels = clades)
rt <- clock_reshape(rate_table)
levels(rt$clock) <- paste("Clock", levels(rt$clock))
rateplot <- ggplot(data = rt, mapping = aes(x = .data$rates, fill = .data$clade,
color = .data$clade)) +
geom_hline(yintercept = 0) +
geom_density(position = if (stack) "stack" else "identity",
alpha = if (stack) 1 else .3) +
scale_x_continuous() +
labs(x = "Rate", y = "Density", fill = "Clade", color = "Clade") +
theme_bw() +
theme(legend.position = "top")
# if (nlevels(rt$clock) > 1) {
rateplot <- rateplot + facet_wrap(~.data$clock, nrow = nrow, scales = scales)
# }
rateplot
}
#Plot linear model and Pearson correlation of one rate against another
clockrate_reg_plot <- function(rate_table, clock_x, clock_y, method = "lm", show_lm = TRUE, ...) {
if (!is.data.frame(rate_table)) {
stop("'rate_table' must be a data frame.", call. = FALSE)
}
if (!any(startsWith(names(rate_table), "rates"))) {
stop("'rate_table' must contain \"rates\" columns containing clockrate summaries.", call. = FALSE)
}
clock_cols <- which(startsWith(names(rate_table), "rates"))
if (length(clock_cols) <= 1) {
stop("At least two clock rates must be present in the input.", call. = FALSE)
}
clocks <- sort(gsub("rates", "", names(rate_table)[clock_cols], fixed = TRUE))
if (missing(clock_x) && missing(clock_y)) {
clock_x <- clocks[1]
clock_y <- clocks[2]
}
else if (missing(clock_x)) {
if (length(clock_y) != 1 || !paste0("rates", clock_y) %in% names(rate_table)[clock_cols]) {
stop("'clock_y' must be the index of a clock rate in the input.", call. = FALSE)
}
clock_x <- setdiff(clocks, as.character(clock_y))[1]
}
else if (missing(clock_y)) {
if (length(clock_x) != 1 || !paste0("rates", clock_x) %in% names(rate_table)[clock_cols]) {
stop("'clock_x' must be the index of a clock rate in the input.", call. = FALSE)
}
clock_y <- setdiff(clocks, as.character(clock_x))[1]
}
else {
if (length(clock_x) != 1 || length(clock_y) != 1 ||
!all(paste0("rates", c(clock_x, clock_y)) %in% names(rate_table)[clock_cols])) {
stop("'clock_x' and 'clock_y' must be the indices of clock rates in the input.", call. = FALSE)
}
}
names(rate_table)[names(rate_table) == paste0("rates", clock_x)] <- "clock_x"
names(rate_table)[names(rate_table) == paste0("rates", clock_y)] <- "clock_y"
regplot <- ggplot(rate_table, aes(x = clock_x, y = clock_y)) +
geom_point() +
geom_smooth(method = method, formula = y ~ x, ...) +
scale_x_continuous() +
scale_y_continuous() +
labs(x = paste("Clock", clock_x), y = paste("Clock", clock_y)) +
theme_bw()
if (show_lm) {
r <- cor(rate_table$clock_x, rate_table$clock_y)
#Extract underlying ggplot data to place correlation in correct place in plot
ggbd <- ggplot_build(regplot)$data
ggbd1 <- ggbd[[1]] #geom_point data
ggbd2 <- ggbd[[2]] #geom_smooth data
min_x <- min(min(ggbd1$x), min(ggbd2$x))
max_x <- max(max(ggbd1$x), max(ggbd2$x))
min_y <- min(min(ggbd1$y), min(ggbd2$y),
if (hasName(ggbd2, "ymin")) min(ggbd2$ymin)) #FALSE when se = FALSE
max_y <- max(max(ggbd1$y), max(ggbd2$y),
if (hasName(ggbd2, "ymax")) max(ggbd2$ymax)) #FALSE when se = FALSE
regplot <- regplot +
annotate("text", label = c(paste0("italic(R)^2 == ", round(r^2, 2)),
paste0("italic(r) == ", round(r, 2))), parse = TRUE,
x = .3*min_x + .7*max_x,
y = c(.85*min_y + .15*max_y,
.8*min_y + .2*max_y))
}
regplot
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.