#' Get similarity
#' Uses ape's cophenetic.phylo to get patristic distances between pairs of taxa, then makes the diagonals Inf. If the tree has no branch lengths, makes every edge length 1.
#' @param phy_full A phylo object with all feasible taxa to sammple from
#' @param taxa_feasible A vector of taxon names that are feasible to study (often on another tree)
#' @param truncate_full_to_mrca If TRUE, prune the full tree to the node that is the MRCA of the
#' @return A matrix with tips on the rows and columns and patristic distances in the cells. Rows are tips on the phy_full tree, columns are taxa in the taxa_feasible vector
GetSimilarity <- function(phy_full, taxa_feasible, truncate_full_to_mrca=FALSE) {
if(truncate_full_to_mrca) {
phy_full <- ape::extract.clade(phy_full, node=ape::getMRCA(phy_full, tip=phy_full$tip.label[phy_full$tip.label %in% taxa_feasible]))
}
if(is.null(phy_full$edge.length)) {
phy_full <- ape::compute.brlen(phy_full, 1)
}
cophy <- ape::cophenetic.phylo(phy_full)
cophy <- cophy[,colnames(cophy) %in% taxa_feasible]
if(ncol(cophy)<2) {
warning(paste0("Only ", ncol(cophy), " taxa in phy_full matched the taxa_feasible vector. Do the names overlap? No spaces in one, underscores in another, etc.?"))
}
return(cophy)
}
#' Get the closest taxon to the focal taxon
#' @param focal_taxon String with taxon name to find closest match to
#' @param similarity matrix Matrix from GetSimilarity function
#' @return The taxon closest to it
GetClosest <- function(focal_taxon, similarity_matrix) {
smallest <- which(similarity_matrix[focal_taxon,]==min(similarity_matrix[focal_taxon,]))
return(sample(names(smallest), 1))
}
#' Get n samples
#'
#' @param n How many taxa to sample
#' @param phy_full A phylo object with all feasible taxa to sample from
#' @param taxa_feasible A vector of taxon names that are feasible to study (often on another tree)
#' @param replace_full If TRUE, will allow selecting the same taxon from the full tree more than once. If FALSE, forbids this. Both cases return n taxa
#' @param replace_feasible If TRUE, will allow selecting the same taxon from the set of feasible taxa more than once (returning a smaller than desired tree). If FALSE, forbids this.
#' @param truncate_full_to_mrca If TRUE, prune the full tree to the node that is the MRCA of the
#' @param less_memory If TRUE, uses a much slower approach that will not create giant matrices
#' @param descendant_labeled If TRUE, assumes the phy_full has been labeled with LabelNodesWithFeasibleDescendants
#' @param fast_ultrametric If TRUE, uses a fast algorithm for ultrametric trees
#' @param verbose If TRUE, all the output will print to the screen
#' @export
#' @return A data.frame of chosen taxa, closest feasible match, and distance between them
#' @import data.table
GetClosestSamples <- function(n, phy_full, taxa_feasible, replace_full=TRUE, replace_feasible=FALSE, truncate_full_to_mrca=FALSE, less_memory=FALSE, descendant_labeled=FALSE, fast_ultrametric=FALSE, verbose=TRUE) {
# data.table is lovable but quirky
DesNode = NULL
FocalNode = NULL
. = NULL
taxa_feasible_pruned <- taxa_feasible[which(taxa_feasible %in% phy_full$tip.label)] #if not on the full tree, we can't find it
if(length(taxa_feasible_pruned)<length(taxa_feasible)) {
warning(paste0("Only ", length(taxa_feasible_pruned), " of ", length(taxa_feasible), " taxa passed matched taxa on the phy_full tree. The others have been excluded. Examples of taxa that failed are ", paste0(head(taxa_feasible[-which(taxa_feasible %in% phy_full$tip.label)]), collapse=", ")))
}
if(!replace_feasible & n>length(taxa_feasible_pruned)) {
stop("You have asked for more unique samples (n) than you have feasible taxa")
}
chosen.df <- data.frame(chosen=rep(NA,n), closest=rep(NA,n), distance=rep(NA,n))
if(fast_ultrametric) {
if(!descendant_labeled) {
phy_full <- LabelNodesWithFeasibleDescendants(taxa_feasible_pruned, phy_full)
}
possible_choices <- phy_full$tip.label
taxa_feasible_pruned_id <- which(phy_full$tip.label %in% taxa_feasible_pruned)
for (sample_iteration in sequence(n)) {
chosen_taxon <- sample(possible_choices,1)
chosen_taxon_id <- which(phy_full$tip.label %in% chosen_taxon)
if(chosen_taxon %in% taxa_feasible_pruned) {
closest_taxon <- chosen_taxon
} else {
ancestor_nodes <- phangorn::Ancestors(phy_full, chosen_taxon_id, type="all")
ancestor_node_labels <- phy_full$node.label[ancestor_nodes - ape::Ntip(phy_full)]
ancestor_node_labels <- ancestor_node_labels[!is.na(ancestor_node_labels)]
ancestor_node_taxa <- strsplit(ancestor_node_labels, ", ")
for (i in seq_along(ancestor_node_taxa)) {
valid <- ancestor_node_taxa[[i]][ancestor_node_taxa[[i]] %in% taxa_feasible_pruned_id]
if(length(valid)>0) {
closest_taxon_id <- sample(valid,1)
break() #keep moving rootward until we find a match
}
}
closest_taxon <- phy_full$tip.label[as.numeric(closest_taxon_id)]
}
if(!replace_full) {
possible_choices <- possible_choices[!possible_choices %in% chosen_taxon]
}
if(!replace_feasible) {
taxa_feasible_pruned <- taxa_feasible_pruned[!taxa_feasible_pruned %in% closest_taxon]
}
chosen.df[sample_iteration,] <- data.frame(chosen_taxon, closest_taxon, NA, stringsAsFactors=FALSE)
}
} else {
if(!less_memory) {
similarity_matrix_original <- GetSimilarity(phy_full, taxa_feasible_pruned, truncate_full_to_mrca=truncate_full_to_mrca)
similarity_matrix <- similarity_matrix_original
for (sample_iteration in sequence(n)) {
chosen_taxon <- sample(rownames(similarity_matrix),1)
closest_taxon <- GetClosest(chosen_taxon, similarity_matrix)
chosen.df[sample_iteration,] <- data.frame(chosen_taxon, closest_taxon, similarity_matrix[chosen_taxon, closest_taxon], stringsAsFactors=FALSE)
# remember full is rows, feasible is columns
if(!replace_full) {
similarity_matrix <- similarity_matrix[!rownames(similarity_matrix) %in% chosen_taxon,]
}
if(!replace_feasible) {
similarity_matrix[,!colnames(similarity_matrix) %in% closest_taxon]
}
}
} else {
if(truncate_full_to_mrca) {
phy_full <- ape::extract.clade(phy_full, node=ape::getMRCA(phy_full, tip=phy_full$tip.label[phy_full$tip.label %in% taxa_feasible_pruned]))
}
print("Progress in sampling species")
#pb <- utils::txtProgressBar(min=0, max=n*ape::Ntip(phy_full))
# for (sample_iteration in sequence(n)) {
# chosen_taxon <- sample(taxa_feasible,1)
# names_distances <- rep(NA, ape::Ntip(phy_full))
# names(names_distances) <- phy_full$tip.label
# for (potential_taxon in sequence(ape::Ntip(phy_full))) {
# run_count <- run_count + 1
# #print(potential_taxon)
# names_distances[potential_taxon] <- phytools::fastDist(phy_full, chosen_taxon, phy_full$tip.label[potential_taxon])
# #utils::setTxtProgressBar(pb, value=potential_taxon+(sample_iteration-1)*ape::Ntip(phy_full))
# print(run_count/(n*ape::Ntip(phy_full)))
# }
# closest_taxon <- sample(names(names_distances)[which.min(names_distances)], 1)
# chosen.df[sample_iteration,] <- data.frame(chosen_taxon, closest_taxon, min(names_distances), stringsAsFactors=FALSE)
#
# }
# The logic here is based on phytools::fastHeight, but with caching to prevent all the lookups
potential_taxon_list <- list()
chosen_taxa <- sample(phy_full$tip.label, n, replace=replace_full)
print("Chose taxa from the full tree")
for (potential_closest_i in sequence(length(taxa_feasible_pruned))) {
actual_id <- which(phy_full$tip.label == taxa_feasible_pruned[potential_closest_i])
potential_taxon_list[[potential_closest_i]] <- c(actual_id, phangorn::Ancestors(phy_full, actual_id, type="all"))
}
print("Got ancestral paths for the potential taxa")
#full_heights <- phytools::nodeHeights(phy_full)
node.ages <- c(rep(0, ape::Ntip(phy_full)), ape::branching.times(phy_full))
full_heights <- matrix(0, dim(phy_full$edge)[1], 2)
for(row.index in 1:dim(phy_full$edge)[1]){
full_heights[row.index,1] <- node.ages[phy_full$edge[row.index,1]]
full_heights[row.index,2] <- node.ages[phy_full$edge[row.index,2]]
}
### Added ###
tmp.df <- cbind(full_heights, phy_full$edge[,1], phy_full$edge[,2])
colnames(tmp.df) <- c("RootwardAge", "TipwardAge", "FocalNode", "DesNode")
tmp.df <- rbind(tmp.df, c(max(ape::branching.times(phy_full)), max(ape::branching.times(phy_full)), NA, Ntip(phy_full)+1))
full_heights.dt <- data.table::as.data.table(tmp.df)
###########
print("Got heights of all nodes on the full tree")
run_count <- 0
start_time <- Sys.time()
names_used <- c()
for (sample_iteration in sequence(n)) {
chosen_taxon <- chosen_taxa[sample_iteration]
names_distances <- rep(NA, length(taxa_feasible_pruned))
names(names_distances) <- taxa_feasible_pruned
chosen_taxon_id <- which(phy_full$tip.label == chosen_taxon)
chosen_taxon_ancestors <- phangorn::Ancestors(phy_full, chosen_taxon_id, type="all")
data.table::setkey(full_heights.dt, DesNode)
#for (potential_closest_taxon in sequence(length(taxa_feasible_pruned))) {
#mrca_node <- intersect(potential_taxon_list[[potential_closest_taxon]]$ancestors, chosen_taxon_ancestors)[1]
mrca_node <- unlist(lapply(potential_taxon_list, GetIntersection, chosen_taxon_ancestors), recursive=FALSE, use.name=FALSE)
rows <- full_heights.dt[.(mrca_node), which=TRUE]
names_distances[1:length(rows)] <- full_heights.dt[rows, TipwardAge]
#utils::setTxtProgressBar(pb, value=potential_taxon+(sample_iteration-1)*ape::Ntip(phy_full))
#run_count <- run_count+1
#}
current_time <- Sys.time()
if(verbose){
print(paste0(100*run_count/(n*length(taxa_feasible_pruned)), "% done; ", round(difftime(current_time, start_time, units="min"),2), " min elapsed so far; approx. ", round(((n*length(taxa_feasible_pruned)-run_count)*as.numeric(difftime(current_time, start_time, units="min")) / run_count)), " min remain"))
}
if(!replace_feasible) {
names_distances <- names_distances[!names(names_distances) %in% names_used]
closest_taxon <- sample(names(names_distances)[which.min(names_distances)], 1)
names_used <- c(names_used, closest_taxon)
}else{
closest_taxon <- sample(names(names_distances)[which.min(names_distances)], 1)
}
chosen.df[sample_iteration,] <- data.frame(chosen_taxon, closest_taxon, min(names_distances), stringsAsFactors=FALSE)
# }
#
# full_heights <- phytools::nodeHeights(phy_full)
# print("Got heights of all nodes on the full tree")
# run_count <- 0
# start_time <- Sys.time()
# for (sample_iteration in sequence(n)) {
# chosen_taxon <- chosen_taxa[sample_iteration]
# names_distances <- rep(NA, length(taxa_feasible_pruned))
# names(names_distances) <- taxa_feasible_pruned
#
# chosen_taxon_id <- which(phy_full$tip.label == chosen_taxon)
# chosen_taxon_ancestors <- c(chosen_taxon_id, phangorn::Ancestors(phy_full, chosen_taxon_id, type="all"))
# for (potential_closest_taxon in sequence(length(taxa_feasible_pruned))) {
# mrca_node <- intersect(potential_taxon_list[[potential_closest_taxon]]$ancestors, chosen_taxon_ancestors)[1]
# #print(mrca_node)
# #print(potential_taxon_list[[potential_closest_taxon]]$ancestors)
# names_distances[potential_closest_taxon] <- full_heights[which(phy_full$edge==chosen_taxon_id)[1]]+full_heights[which(phy_full$edge==potential_taxon_list[[potential_closest_taxon]]$id)[1]] - 2*full_heights[which(phy_full$edge==mrca_node)[1]]
# #utils::setTxtProgressBar(pb, value=potential_taxon+(sample_iteration-1)*ape::Ntip(phy_full))
# run_count <- run_count+1
# }
# current_time <- Sys.time()
# print(paste0(100*run_count/(n*length(taxa_feasible_pruned)), "% done; ", round(difftime(current_time, start_time, units="min"),2), " min elapsed so far; approx. ", round(((n*length(taxa_feasible_pruned)-run_count)*as.numeric(difftime(current_time, start_time, units="min")) / run_count)), " min remain"))
# closest_taxon <- sample(names(names_distances)[which.min(names_distances)], 1)
# chosen.df[sample_iteration,] <- data.frame(chosen_taxon, closest_taxon, min(names_distances), stringsAsFactors=FALSE)
}
}
}
return(chosen.df)
}
#' Get enclosing taxonomy for a feasible tree
#'
#' If you don't have a taxonomy tree for the feasible tree, this will find one
#' @param tips A vetor of taxon names
#' @return A phylogeny of the enclosing taxonomy
#' @export
GetEnclosingTaxonomy <- function(phy_feasible) {
# OTOL attempt, but after all this can only return trees of 25K taxa
# ott_ids <- c()
# focal_tips <- phy_feasible$tip.label
# tips_splits <- split(focal_tips, ceiling(seq_along(focal_tips)/2000))
# for (i in sequence(length(tips_splits))) {
# ott_ids <- c(ott_ids,rotl::tnrs_match_names(tips_splits[[i]])$ott_id)
# }
# ott_ids <- ott_ids[!is.na(ott_ids)]
# ott_ids_good <- rep(NA, length(ott_ids))
# for (i in seq_along(ott_ids)) {
# attempt <- NULL
# try(attempt <- rotl::tol_node_info(ott_ids[i]), silent=TRUE)
# if(!is.null(attempt)) {
# ott_ids_good[i] <- ott_ids[i]
# }
# }
# crown_node <- rotl::tol_mrca(ott_ids=ott_ids_good[!is.na(ott_ids_good)])
# phy_full <- rotl::tol_subtree(node_id=crown_node$mrca$node_id, label_format="name")
classification_info <- vector(mode = "list", length = length(tips))
for (i in seq_along(tips)) {
classification_info[i] <- taxize::classification(tips[i], db = "itis")
Sys.sleep(1)
}
attr(classification_info, "db") <- "itis"
attr(classification_info, "class") <- "classification"
names(classification_info) <- phy_feasible$tip.label
classification_info <- classification_info[!is.na(classification_info)]
ids <- lapply(classification_info, "[[", "id")
ids_count <- table(unlist(ids))
tipmost_mrca <- NA
for (i in seq_along(ids[[1]])) {
if(ids_count[ids[[1]][i]]==length(classification_info)) {
tipmost_mrca <- ids[[1]][i]
}
}
all_tips <- taxize::downstream(as.numeric(tipmost_mrca), db="itis", downto="species")
all_classifications <- taxize::classification(as.numeric(all_tips[[1]][1]$tsn), db="itis")
phy_full <- ape::stree(n=nrow(all_tips[[1]]), type="star", tip.label=all_tips[[1]]$taxonname)
try(phy_full <- taxize::class2tree(all_classifications))
return(phy_full)
}
#' Return a sampled tree
#'
#' @param phy_feasible The tree to subsample
#' @param n The number of taxa to include
#' @param phy_full The larger tree giving relationships
#' @param replace_full If TRUE, will allow selecting the same taxon from the full tree more than once. If FALSE, forbids this. Both cases return n taxa
#' @param replace_feasible If TRUE, will allow selecting the same taxon from the set of feasible taxa more than once (returning a smaller than desired tree). If FALSE, forbids this.
#' @param truncate_full_to_mrca If TRUE, prune the full tree to the node that is the MRCA of the feasible tree
#' @param less_memory If TRUE, uses a much slower approach that will not create giant matrices
#' @param fast_ultrametric If TRUE, uses a fast algorithm for ultrametric trees
#' @param verbose If TRUE, all the output will print to the screen
#' @return A phylo object where taxa are sampled based on representing flat sampling from taxonomy
#' @export
SubsampleTree <- function(phy_feasible, n, phy_full=NULL, replace_full=TRUE, replace_feasible=FALSE, truncate_full_to_mrca=FALSE, less_memory=FALSE, fast_ultrametric=FALSE, verbose=TRUE) {
if(is.null(phy_full)) {
phy_full <- GetEnclosingTaxonomy(phy_feasible$tip.label)
if(ape::Nnode(phy_full)==1) {
warning("Taxonomy tree is completely unresolved")
}
}
samples <- GetClosestSamples(n=n, phy_full=phy_full, taxa_feasible=phy_feasible$tip.label, replace_full=replace_full,
replace_feasible=replace_feasible, truncate_full_to_mrca=truncate_full_to_mrca, less_memory=less_memory, fast_ultrametric=fast_ultrametric, verbose=verbose)
phy_sample <- ape::keep.tip(phy_feasible, samples$closest)
return(phy_sample)
}
#' Include descendant taxon ids in node labels
#' @param node node number, typically a tip
#' @param phy phylo object
#' @param clean wipe existing node labels
#' @param sep Separator
#' @return A phylogeny with terminal node numbers as node.labels. Unfortunately, each will start with NA.
LabelNodesWithChosenDescendants <- function(node, phy, clean=FALSE, sep=", ") {
if(clean | is.null(phy$node.label)) {
phy$node.label <- rep(NA, ape::Nnode(phy))
}
ancestor.nodes <- phangorn::Ancestors(phy, node, type="all")
phy$node.label[ancestor.nodes - ape::Ntip(phy)] <- paste(phy$node.label[ancestor.nodes - ape::Ntip(phy)], node, sep=sep)
return(phy)
}
#' Include descendant taxon ids in node labels
#' @param taxa_feasible vector of taxon names that are feasible for later suse
#' @param phy phylo object
#' @return A phylogeny with terminal node numbers as node.labels.
#' @export
LabelNodesWithFeasibleDescendants <- function(taxa_feasible, phy) {
tip_numbers <- which(phy$tip.label %in% taxa_feasible)
phy$node.label <- rep(NA, ape::Nnode(phy))
for(tip_index in seq_along(tip_numbers)) {
phy <- LabelNodesWithChosenDescendants(tip_numbers[tip_index], phy)
}
phy$node.label <- gsub("NA, ", "", phy$node.label)
return(phy)
}
GetIntersection <- function(x, y){
#Note that this intersection function reverses the order of the output compared to R base intersect().
#So, now rather than grabbing the first item, we grab the last
#return(intersect(x, y)[1])
tmp <- Intersect(x, y)
return(tmp[length(tmp)])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.