#' Creates Hierarchical Trees
#'
#' These functions will create a taxonomic tree (dendrogram) based on ranking from EOLs provider
#' (hierarchy) pages.
#'
#' @export
#'
#' @param MyHiers A vector of filenames or a list of XMLs for downloaded EOL pages
#' @param TreeData A dataframe of taxonomic hierarchy information out of MakeTreeData function
#' @param missingData If tip taxa are not all the same taxonomic rank, should Reol cleave out taxa
#' or hierarchical rank first
#' @param includeNodeLabels Option to write node labels to phylogenetic tree (Note, this can also
#' be done separately using \code{NodeLabelList}
#' @param userRanks Option for the user to define their own hierarchical pattern to make a tree.
#' This will define which ranked classifications to include in the final tree. If left NULL, it
#' will try to keep as much information as possible.
#' @param Taxon Taxonomic group that contains subunits
#' @param label Which hierarchical units should be included in the node labels
#' @param i A Hierarchical Taxonomy (ex: Kingdom, Class, Species)
#' @param j A Hierarchical Taxonomy (ex: Class, Genus, Species)
#'
#' @return \code{MakeTreeData} returns a data frame with data for use in \code{MakeHierarchyTree}
#' function, but can also be used independently to examine the hierarchical structure.
#' \code{AutofillTaxonNames} is an internal function that deals with missing internal data in
#' TreeData. If taxa are of varying hierarchical ranks (for example a mix of genera and species)
#' then tips can not be aligned in the tree. You will need to select what data you would like
#' to drop from the analysis; for example, either taxa with missing species information OR all
#' species names that will then make a tree of genera. \code{MakeHierarchyTree} returns taxonomic
#' tree in the class phylo. \code{ReturnTaxSet} will return the tree tips for the taxonomic group
#' requested, this is mostly an internal function for creating node labels. \code{MakeNodeLabels}
#' will return a list of tip labels per internal node. This can be used to create node labels to
#' plot on the tree. \code{MergeTaxonomies} will merge two taxonomies into a single list that
#' preserves hierarchical structure. It is mostly for internal use.
#'
#' @details This tree displays taxonomic structuring only and is not the result of a phylogenetic
#' analysis. Alos note that not all providers return hierarchy information, if errors or no tree
#' is returned it is likely that information is missing and you may have to use another provider.
#'
#' @seealso \code{\link{ProviderCount}} \code{\link{DownloadHierarchy}} \code{\link{MakeEdgeLabels}}
#' @examples \dontrun{
#' data(MyHiers)
#' TreeData <- MakeTreeData(MyHiers)
#' Tree <- MakeHierarchyTree(MyHiers, includeNodeLabels=TRUE)
#' labels <- NodeLabelList(MyHiers, "all")
#' plot(Tree, show.node.label=TRUE)
#' plot(Tree, "c", show.node.label=TRUE, adj=0.5, font=3, edge.color="gray",
#' tip.color=rainbow(10))
#' }
MakeHierarchyTree <- function(MyHiers, missingData=NULL, includeNodeLabels=TRUE, userRanks=NULL) {
TreeData <- MakeTreeData(MyHiers)
pattern <- paste("~", paste(colnames(TreeData), sep="", collapse="/"), sep="")
if(!is.null(userRanks)){
TreeData <- TreeData[,which(colnames(TreeData) %in% userRanks)]
TreeData <- AutofillTaxonNames(TreeData)
#TreeData <- DropADim(TreeData)
pattern <- paste("~", paste(colnames(TreeData), sep="", collapse="/"), sep="")
}
if(any(apply(TreeData, 2, RepeatDataToDrop))) {
if(any(is.na(TreeData))){
TreeData <- AutofillTaxonNames(TreeData)
if(colnames(TreeData[dim(TreeData)[2]]) != strsplit(pattern, "/")[[1]][length(strsplit(pattern, "/")[[1]])]){
if(is.null(missingData))
stop("Tip taxa are differing hierarchical ranks, you need to choose an option in missingData whether to drop taxa or ranks.")
if(missingData == "pruneTaxa")
TreeData <- TreeData[-which(is.na(TreeData[,dim(TreeData)[2]])),]
}
}
DataToDrop <- which(apply(TreeData, 2, RepeatDataToDrop))
if(any(DataToDrop))
pattern <- paste("~", paste(colnames(TreeData)[-DataToDrop], sep="", collapse="/"), sep="")
}
if(colnames(TreeData[dim(TreeData)[2]]) != strsplit(pattern, "/")[[1]][length(strsplit(pattern, "/")[[1]])]){
paste("Your hierarchy files contain information to the", colnames(TreeData[dim(TreeData)[2]]), "level, however not all taxa have this information. In order to make a tree the tips must align, so information was pruned to", strsplit(pattern, "/")[[1]][length(strsplit(pattern, "/")[[1]])])
paste("If you want to make a", colnames(TreeData[dim(TreeData)[2]]), "tree, change the missingData argument to pruneTaxa, which will remove taxa without", colnames(TreeData[dim(TreeData)[2]]))
}
if(pattern == "~")
stop("Error in Tree Building: try MakeTreeData(MyHiers) to see if there is hierarchical data associated with your files")
fo <- stats::as.formula(pattern)
TreeData <- as.data.frame(apply(TreeData, 2, factor))
tree <- ladderize(as.phylo.formula(fo, data=TreeData))
if(includeNodeLabels)
tree <- makeNodeLabel(tree, method="u", nodeList=NodeLabelList(MyHiers, "all", missingData=missingData), fixed=TRUE) #maybe change this later when other options
return(tree)
}
subsetDataForHierTrees <- function(oneFileHier, HierID) {
oneFileHier <- matrix(oneFileHier[1:which(oneFileHier[,6] == HierID),], ncol=7) #stop at HierID to avoid taking children
oneFileHier <- matrix(oneFileHier[which(!duplicated(oneFileHier[,2])),], ncol=7) #delete repeats
oneFileHier <- matrix(oneFileHier[!is.na(oneFileHier[,2]),], ncol=7) #delete NAs; ncol is hard coded to be 7 columns so that R doesn't convert to a vector when there is a single row.
if(any(oneFileHier[,2] == "unranked clade"))
oneFileHier <- oneFileHier[-which(oneFileHier[,2] == "unranked clade"),] #delete unranked
return(oneFileHier)
}
#' @rdname MakeHierarchyTree
#' @export
MergeTaxonomies <- function(i, j) {
combined <- i
`%ni%` = Negate(`%in%`)
outlier.pos <- which(j %ni% i)
for (outlier.index in sequence(length(outlier.pos))) {
if (outlier.pos[outlier.index] == 1) {
paste("Haven't dealt with this yet")
return(combined)
}
previous.pos <- which(grepl(j[outlier.pos[outlier.index] - 1], combined))[1]
if ((previous.pos+1) <= length(combined))
combined <- c(combined[1:previous.pos], j[outlier.pos[outlier.index]], combined[(previous.pos+1): length(combined)])
else
combined <- c(combined[1:previous.pos], j[outlier.pos[outlier.index]])
}
return(combined)
}
CombineHierarchyInfo <- function(MyHiers) {
CombFiles <- matrix(nrow=0, ncol=7)
longestHierTaxon <- 0 #start at 0, so it accepts the first file as the longest
MergedTax <- NULL
for(i in sequence(length(MyHiers))) {
OFH <- OneFileHierarchy(MyHiers[i])
if(dim(OFH)[1] <= 1)
cat(paste(OFH[1,1], "has no higher taxonomic rankings, so it must be dropped\n"))
if(dim(OFH)[1] > 1){
oneFile <- subsetDataForHierTrees(OFH, GetHierID(MyHiers[i]))
if(dim(oneFile)[1] <= 1)
cat(paste(OFH[1,1], "has no higher taxonomic rankings, so it must be dropped\n"))
if(dim(oneFile)[1] > 1) {
Tax <- oneFile[,2]
MergedTax <- MergeTaxonomies(Tax, MergedTax)
if(length(oneFile[,2]) > longestHierTaxon) {
longestHierTaxon <- max(longestHierTaxon, length(oneFile[,2]))
CombFiles <- rbind(oneFile, CombFiles) #puts longest hierarchies first in combined files
CombFiles <- as.data.frame(CombFiles, stringsAsFactors=FALSE)
}
}
else
CombFiles <- rbind(CombFiles, oneFile) #puts shorter hierarchies after
}
}
return(list(CombFiles, MergedTax))
}
#' @rdname MakeHierarchyTree
#' @export
MakeTreeData <- function(MyHiers) {
MyHiers <- RemoveNAFiles(MyHiers)
CombFiles <- CombineHierarchyInfo(MyHiers)
whichColumns <- CombFiles[[2]]
TreeData <- data.frame(matrix(nrow=length(MyHiers), ncol=length(whichColumns)))
colnames(TreeData) <- whichColumns
for(i in sequence(length(MyHiers))) {
#here go one at a time and add each row
oneFile <- subsetDataForHierTrees(OneFileHierarchy(MyHiers[i]), GetHierID(MyHiers[i]))
for(j in sequence(dim(oneFile)[1])){
colPosition <- which(colnames(TreeData) == oneFile[j,2])
TreeData[i,colPosition] <- oneFile[j,1]
TreeData <- as.data.frame(TreeData, stringsAsFactors=T)
}
}
return(TreeData)
}
RepeatDataToDrop <- function(TreeData) {
#Drop any columns that are the same or NA
if(length(unique(TreeData)) == 1)
return(TRUE)
if(any(is.na(TreeData)))
return(TRUE)
else
return(FALSE)
}
DropADim <- function(TreeData) {
while(any(is.na(TreeData))){
rowPercent <- apply(is.na(TreeData), 1, sum)/dim(TreeData)[2]
colPercent <- apply(is.na(TreeData), 2, sum)/dim(TreeData)[1]
maxPercent <- max(c(rowPercent, colPercent))
if(maxPercent %in% rowPercent)
TreeData <- TreeData[-which(rowPercent == maxPercent), ]
if(maxPercent %in% colPercent)
TreeData <- TreeData[,-which(colPercent == maxPercent)]
}
return(TreeData)
}
#' @rdname MakeHierarchyTree
#' @export
AutofillTaxonNames <- function(TreeData){
#autofill in missing data with child taxon names
for(i in sequence(dim(TreeData)[1])){
columnNAs <- which(is.na(TreeData[i,]))
columnInfo <- which(!is.na(TreeData[i,]))
if(any(columnNAs)){
if(max(columnNAs) > max(columnInfo)){
columnNAs <- columnNAs[-which(columnNAs > max(columnInfo))]
}
for (j in rev(sequence(length(columnNAs)))){
ChildTaxonPlace <- which(columnInfo > columnNAs[j])[1]
#TreeData[i, columnNAs[j]] <- paste(colnames(TreeData)[columnNAs[j]], TreeData[i, columnInfo[ChildTaxonPlace]], sep="")
TreeData[i, columnNAs[j]] <- paste(TreeData[i, columnInfo[ChildTaxonPlace]], sep="")
}
}
}
return(TreeData)
}
#' @rdname MakeHierarchyTree
#' @export
ReturnTaxSet <- function(Taxon, TreeData) {
whichRows <- which(TreeData == Taxon, TRUE)[,1]
return(TreeData[whichRows, dim(TreeData)[2]])
}
#' @rdname MakeHierarchyTree
#' @export
NodeLabelList <- function(MyHiers, label="all", missingData) { #also make an option to just label genus, etc.
TreeData <- MakeTreeData(MyHiers)
if(any(is.na(TreeData))){
TreeData <- AutofillTaxonNames(TreeData)
if(any(is.na(TreeData)) && !is.null(missingData)){
if(missingData == "pruneTaxa")
TreeData <- TreeData[-which(is.na(TreeData[,dim(TreeData)[2]])),]
}
}
DataToDrop <- which(apply(TreeData, 2, RepeatDataToDrop))
if(any(DataToDrop))
TreeData <- TreeData[,-DataToDrop]
if(label == "all")
uniqueTaxSets <- c(apply(TreeData[,-dim(TreeData)[2]], 2, unique), recursive=T)
ListOfSpeciesPerNode <- lapply(uniqueTaxSets, ReturnTaxSet, TreeData= TreeData)
names(ListOfSpeciesPerNode) <- uniqueTaxSets
ListOfSpeciesPerNode <- ListOfSpeciesPerNode[which(lapply(ListOfSpeciesPerNode, length) > 1)]
if(any(c(lapply(ListOfSpeciesPerNode, duplicated), recursive=T))){
for(i in rev(sequence(length(ListOfSpeciesPerNode)))){
if(any(duplicated(ListOfSpeciesPerNode[[i]])))
ListOfSpeciesPerNode[[i]] <- ListOfSpeciesPerNode[[i]][-which(duplicated(ListOfSpeciesPerNode[[i]]))]
}
}
if(any(c(lapply(ListOfSpeciesPerNode, length), recursive=T) == 1))
ListOfSpeciesPerNode <- ListOfSpeciesPerNode[-which(c(lapply(ListOfSpeciesPerNode, length), recursive=T) == 1)]
if(any(is.na(names(ListOfSpeciesPerNode))) || any(names(ListOfSpeciesPerNode) == "Not assigned")){
whichMissing <- c(which(is.na(names(ListOfSpeciesPerNode))), which(names(ListOfSpeciesPerNode) == "Not assigned"))
ListOfSpeciesPerNode <- ListOfSpeciesPerNode[-whichMissing]
}
return(ListOfSpeciesPerNode)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.