# hidden functions for used with makePBDBtaxonTree
translatePBDBtaxa <- function(taxaDataPBDB, convertAccepted = TRUE){
# if convertAccepted is TRUE, then the taxon name and taxon no will be replaced
# with the accepted name and no for that taxon
# this is NOT disirable for trying to trace parents in a set of taxa
#################
# Do some translation
#need to replace any empty string values with NAs (due perhaps to use of read.csv with the API)
taxaDataPBDB[taxaDataPBDB == ""] <- NA
#if com vocab
if(any("rnk" == colnames(taxaDataPBDB))){
#apparently it doesn't matter if these columns *are* present or not
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "acn"] <- "accepted_name"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "snp"] <- "senpar_no"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "rnk"] <- "taxon_rank"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "nam"] <- "taxon_name"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "fml"] <- "family"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "odl"] <- "order"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "cll"] <- "class"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "phl"] <- "phylum"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "kgl"] <- "kingdom"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "par"] <- "parent_no"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "oid"] <- "taxon_no"
# taxon rank translation vectors for compact vocab
taxRankPBDB <- getTaxRankPBDB()
taxRankCOM <- 2:26
#change contents of "identified_rank" and "accepted_rank"
taxaDataPBDB$taxon_rank <- sapply(taxaDataPBDB$taxon_rank,function(x)
taxRankPBDB[x == taxRankCOM])
message("compact vocab detected, relevant fields will be translated")
}
###########
# following are closet cases that mostly only apply to OLD API calls
#
if(any(colnames(taxaDataPBDB) == "rank")){
#if 1.1 and vocab is pbdb
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "rank"] <- "taxon_rank"
}
#
if(convertAccepted){
# if convertAccepted is TRUE, then the taxon name and taxon no will be replaced
# with the accepted name and no for that taxon
# this is NOT disirable for trying to trace parents in a set of taxa
if(any(colnames(taxaDataPBDB) == "accepted_name")){
#if 1.2 and there is an accepted_name column..
#fill empty accepted_name values with taxon_name
nameFormal <- taxaDataPBDB[,"accepted_name"]
nameFormal[is.na(nameFormal)] <- as.character(taxaDataPBDB[is.na(nameFormal),"taxon_name"])
#replace taxon_name
taxaDataPBDB[,"taxon_name"] <- nameFormal
#
#replace taxon_no with accepted_no
taxNum <- taxaDataPBDB[,"accepted_no"]
taxNum[is.na(taxNum)] <- as.character(taxaDataPBDB[is.na(taxNum),"taxon_no"])
taxaDataPBDB[,"taxon_no"] <- taxNum
#
}
}
if(any(colnames(taxaDataPBDB)=="senpar_no")){
#if this is OLD v1.2 taxaDataPBDB, and there is a senpar_no column
# replace parent_no in the same way with senpar_no
parNum <- taxaDataPBDB[,"senpar_no"]
parNum[is.na(parNum)] <- as.character(taxaDataPBDB[is.na(parNum),"parent_no"])
taxaDataPBDB[,"parent_no"] <- parNum
}
#
return(taxaDataPBDB)
}
getTaxRankPBDB<-function(){
return(c("subspecies","species","subgenus","genus",
"subtribe","tribe","subfamily",
"family","superfamily","infraorder",
"suborder","order","superorder","infraclass",
"subclass","class","superclass","subphylum",
"phylum","superphylum","subkingdom",
"kingdom","unranked clade","informal"
#keep informal as high, never drop!
))
}
convertParentChildMatNames <- function(taxID, parData){
taxMatch <- match(taxID, parData$taxon_no)
if(is.na(taxMatch)){
newName <- "NODE"
}else{
newName <- parData$taxon_name[taxMatch]
newName <- as.character(newName)
}
return(newName)
}
queryMissingParents <- function(taxaID,
APIversion = "1.2",
status = "all",
convertAccepted = FALSE,
stopIfSelfParent = FALSE,
failIfNoInternet = TRUE
){
# find missing parents by access API
#
# drop Eukarya, as it won't return if status = senior under 1.1
# taxaID <- as.numeric(taxaID[taxaID != "1"])
#
# let's get some taxonomic data
# first test internet
testConnect <- canConnectPBDB(fail = failIfNoInternet)
if(is.null(testConnect)){
return(NULL)
}
#
floatData <- read.csv(
paste0("https://paleobiodb.org/data",APIversion,
"/taxa/list.txt?taxon_id=",paste0(taxaID,collapse=","),
### should we take all or only ACCEPTED parents?
"&rel=exact&status=",status,
"&vocab=pbdb"
),
stringsAsFactors = FALSE)
if(nrow(floatData) == 0){
stop(
paste("Current PBDB API would not return info for the following taxon IDs: \n",
paste0(taxaID,collapse = ", ")
)
)
}
# if convertAccepted is TRUE, then the taxon name and taxon no will be replaced
# with the accepted name and no for that taxon
# this is NOT disirable for trying to trace parents in a set of taxa
floatData <- translatePBDBtaxa(floatData,
convertAccepted = convertAccepted)
#parse down to just taxon_name, taxon_no, parent_no
floatData <- parseParentPBDBData(floatData,
stopIfSelfParent = stopIfSelfParent)
#
return(floatData)
}
parseParentPBDBData <- function(parentData, stopIfSelfParent = FALSE){
#parse down to just taxon_name, taxon_no, parent_no
if(any(colnames(parentData)=="accepted_name")){
taxon_name = parentData$accepted_name
}else{
taxon_name = parentData$taxon_name
}
#
result <- data.frame(
taxon_name = as.character(taxon_name),
parent_no = as.numeric(parentData$parent_no),
taxon_no = as.numeric(parentData$taxon_no)
)
rownames(result) <- NULL
#
if(stopIfSelfParent){
result <- fixSelfParents(dataParents = result,
approach = "stop")
}
#
return(result)
}
findNoParentMatch<-function(parData){
res <- is.na(match(parData$parent_no, parData$taxon_no))
return(res)
}
fixSelfParents <- function(dataParents, approach = "clean"){
# check that no taxa are their own parent
selfParents <- (dataParents$parent_no == dataParents$taxon_no)
if(any(selfParents)){
if(approach == "clean"){
dataParents <- dataParents[!selfParents,]
}
if(approach == "stop"){
parMatch <- paste0(dataParents$taxon_name,
" (", dataParents$taxon_no,")")
parMatch <- paste0(parMatch[selfParents],
collapse = ", ")
stop(paste0("The following taxa (with taxon_no) are their own parents:\n",
parMatch))
}
}
return(dataParents)
}
getAllParents <- function(
inputData,
status,
annotatedDuplicateNames = TRUE,
convertAccepted = TRUE,
stopIfSelfParent = FALSE,
failIfNoInternet = TRUE
){
###############################################
parData<-parseParentPBDBData(inputData)
noParentMatch<-findNoParentMatch(parData)
floatingParentNumOld <- NA
while(sum(noParentMatch)>1){
# get floating parent taxon numbers
floatingParentNum <- unique(parData$parent_no[noParentMatch])
#
# checks
if(identical(floatingParentNumOld, floatingParentNum)){
if(convertAccepted){
# well try turning convertAccepted to FALSE
convertAccepted <- FALSE
}else{
#oh, its already false? eh... then give up
dataUrlRef <- paste0("http://paleobiodb.org/data",1.2,
"/taxa/list.txt?taxon_id=",paste0(floatingParentNum,collapse=","),
"&rel=exact&status=",status,"&vocab=pbdb"
)
print(dataNew)
stop(paste0(
"PBDB not returning single common ancestor by tracing parents.\n",
"Following floating parent numbers are repeated:\n",
paste0(floatingParentNum,collapse=", "),
"\nThese were not resolved with the new data (printed above).",
"\nCheck the following URL:\n",dataUrlRef
))
}
}else{
floatingParentNumOld <- floatingParentNum
}
#
# get the missing parents
# if convertAccepted is TRUE, then the taxon name and taxon no will be replaced
# with the accepted name and no for that taxon
# this is (probably?) NOT disirable for trying to trace parents in a set of taxa
dataNew <- queryMissingParents(
floatingParentNum,
status = status,
convertAccepted = convertAccepted,
stopIfSelfParent = stopIfSelfParent,
failIfNoInternet = failIfNoInternet
)
if(is.null(dataNew)){return(NULL)}
# add new parents to the top of the matrix
# so if duplicated names are annotated, its the originals that get annotated
parData <- rbind(dataNew,parData)
noParentMatch <- findNoParentMatch(parData)
}
# remove duplicate taxa
parData <- unique(parData)
# remove self parents
parData <- fixSelfParents(dataParents = parData,
approach = "clean")
#
# TESTS
if(sum(noParentMatch)!=1){
stop("Cannot find a single common ancestor by tracing parents")
}
if(nrow(parData) != nrow(unique(parData))){
print(parData[duplicated(parData),])
stop("getAllParents added duplicate parent-child relationships, see print-out above")
}
if(length(parData$taxon_no) != length(unique(parData$taxon_no))){
print(parData[duplicated(parData$taxon_no),])
stop("getAllParents added duplicate child taxa, see print-out above")
}
if(length(parData$taxon_name) != length(unique(parData$taxon_name))){
message("Additional accepted parent taxa have same names as input taxa - names of originals annotated")
#print(parData[duplicated(parData$taxon_name),])
#
if(annotatedDuplicateNames){
parData$taxon_name <- make.unique(as.character(parData$taxon_name))
}else{
print(parData[duplicated(parData$taxon_name),])
stop("getAllParents added duplicate accepted taxon names, see print-out above")
}
}
return(parData)
}
getFloatAncPBDB <- function(pcDat){
#identify IDs of parents floating without ancestors of their own
# which parents have no children?
noChildren <- sapply(pcDat$parent_no, function(x)
all(x != pcDat$child_no)
)
# list all unique no-child parents
res <- unique(pcDat$parent_no[noChildren])
return(res)
}
constructParentChildMatrixPBDB <- function(initPCmat, parData){
# starting from desired tip OTUs, work backwards to a common ancestor from the full parData
if(nrow(initPCmat) != nrow(unique(initPCmat))){
print(initPCmat[duplicated(initPCmat),])
stop("initial parent-child matrix had duplicate parent-child relationships, see print-out above")
}
newPCmat <- initPCmat
# find floating parents in current newPCmat
#identify IDs of parents floating without ancestors of their own
floaters <- getFloatAncPBDB(pcDat = newPCmat)
# make sure floaters are only unique values
floaters <- unique(floaters)
# use a while loop to complete the parent-child matrix
while(length(floaters)>1){ #so only ONE root can float
# get new relations: will 'anchor' the floaters
anchorMat <- subsetParDataPBDB(subsetNum = floaters,
parData = parData)
# bind to newPCmat
newPCmat <- rbind(newPCmat,anchorMat)
if(nrow(newPCmat) != nrow(unique(newPCmat))){
print(newPCmat[duplicated(newPCmat),])
stop("annotated pcMat had duplicate parent-child relationships, see print-out above")
}
# recalculate floater taxa
floatersNew <- getFloatAncPBDB(pcDat = newPCmat) #recalculate float
#
# put in a stopping condition for the love of god
if(length(floatersNew)>1 & identical(floaters,floatersNew) ){
stop(paste0(
"Provided PBDB Dataset does not appear to have a \n",
" monophyletic set of parent-child relationship pairs. \n",
"Multiple taxa appear to be listed as parents, but are not \n",
" listed themselves so have no parents listed: \n",
paste0(floaters,
collapse = ", "
),
paste0(
parData$taxon_name[
match(floaters,
parData$taxon_no)
],
collapse = ", "
)
))
}else{
floaters <- floatersNew
}
}
return(newPCmat)
}
subsetParDataPBDB <- function(subsetNum,parData){
# check that subsetNum is all unique values
if(length(subsetNum) != length(unique(subsetNum))){
stop("Some values of subsetNum are non-unique")
}
# pull parent-child relationships for a set of taxon numbers from parData
subsetRows <- match(subsetNum, parData$taxon_no)
# remove the non matching subset (usually the root)
subsetRows <- subsetRows[!is.na(subsetRows)]
# are there any duplicated subsetRows?
if(length(subsetRows) != length(unique(subsetRows))){
print("Total subsetRows:")
print(subsetRows)
print("Duplicates:")
dupSSR <- subsetRows[subsetRows == subsetRows[duplicated(subsetRows)]]
print(dupSSR)
print("Rows in Questions:")
print(parData[unique(dupSSR),])
stop("Stop - the subset selected by subsetParDataPBDB using match() has repeated rows??")
}
# pull the subset of parent-child relationships from parData
#subsetMat <- parData[subsetRows, c("parent_no", "taxon_no")]
subsetMat <- data.frame(
parent_no = parData$parent_no[subsetRows],
child_no = parData$taxon_no[subsetRows]
)
numSubset <- parData$taxon_no[subsetRows]
# test if any duplicated taxon numbers
if(length(numSubset) != length(unique(numSubset))){
duplicatedValues <- numSubset[duplicated(numSubset)]
duplicatedRows <- sapply(parData$taxon_no, function(x)
any(x == duplicatedValues))
print("All Taxon Numbers:")
print(parData$taxon_no)
print("The Subset Taxon Numbers:")
print(numSubset)
print("The Rows of parData Containing Duplicates:")
print(parData[duplicatedRows ,])
stop("getAllParents added duplicate taxa numbers, see print-out above")
}
rownames(subsetMat)<-as.character(numSubset)
return(subsetMat)
}
getLinneanTaxonTreePBDB <- function(dataTransform, tipSet, cleanTree, rankTaxon){
#########
dataTransform <- apply(dataTransform, 2, as.character)
#Check if show = class was used
if(!any(colnames(dataTransform) == "family")){
stop("taxaDataPBDB must be a taxonomic download with show = class for method = 'Linnean'")
}
#message that tipSet (and solveMissing) is ignored
if(!is.null(tipSet)){
message("Linnean taxon-tree option selected, argument 'tipSet' is ignored")
}
#now check and return an error if duplicate taxa of selected rank (rankTaxon)
nDup <- sapply(nrow(dataTransform),function(x)
sum(dataTransform[,"taxon_name"] == dataTransform[x,"taxon_name"])>1
& dataTransform[x,"taxon_rank"] == rankTaxon
)
if(any(nDup)){
stop(
paste0(
"Duplicate taxa of selected rank: ",
paste0("(", which(nDup), ") ",
dataTransform[nDup,"taxon_name"],
collapse = ", "
)
)
)
}
#filter on rankTaxon
dataTransform <- dataTransform[dataTransform[,"taxon_rank"] == rankTaxon,]
#
#get the taxonomic fields you want
#from API documentation: "phylum","class","order","family","genus"
# apparently 'kingdom' dropped from v1.2 API
taxonFields <- c(
#"kingdom",
"phylum", "class",
"order", "family",
"taxon_name")
#print(colnames(dataTransform))
taxonData <- dataTransform[,taxonFields]
taxonData <- apply(taxonData,2,as.character)
tree <- taxonTable2taxonTree(
taxonTable = taxonData,
cleanTree = cleanTree
)
tree$taxonTable <- taxonData
return(tree)
}
parentChildPBDBOld <- function(
dataTransform,
tipSet,
cleanTree,
method,
APIversion,
failIfNoInternet = TRUE
){
dataTransform <- apply(dataTransform, 2, as.character)
# need two things: a table of parent-child relationships as IDs
#and a look-up table of IDs and taxon names
#
# filer out lower than selected rank
#
# translate rank and taxon_rank to a number
# taxon rank translation vectors for compact vocab
taxRankPBDB <- getTaxRankPBDB()
rankID <- which(rank == taxRankPBDB)
numTaxonRank <- sapply(dataTransform[,"taxon_rank"],
function(x) which(x == taxRankPBDB))
#drop taxa below specified rank
dataTransform <- dataTransform[rankID <= numTaxonRank,]
#also recreate numTaxonRank
numTaxonRank <- sapply(dataTransform[,"taxon_rank"],
function(x) which(x == taxRankPBDB))
#
#create lookup table for taxon names
taxonNameTable <- cbind(
as.numeric(dataTransform[,"taxon_no"]),
as.character(dataTransform[,"accepted_name"])
)
#add parents not listed
parentFloat <- unique(dataTransform[,"parent_no"])
parentFloat <- parentFloat[is.na(match(parentFloat, taxonNameTable[,1]))]
taxonNameTable <- rbind(taxonNameTable,
cbind(parentFloat,
paste("ID:", as.character(parentFloat))
)
)
#print(taxonNameTable)
#DONE
#
#now need to put together parentChild table
#first, get table of all parentChild relationships in taxaDataPBDB
pcAll <- cbind(as.numeric(dataTransform[,"parent_no"]),
as.numeric(dataTransform[,"taxon_no"]))
#then start creating final matrix: first, those of desired rank
pcMat <- pcAll[rankID == numTaxonRank,]
#identify IDs of parents floating without ancestors of their own
floaters <- getFloatAncPBDB(pcDat = pcMat)
#
nCount <- 0 #start tracing back
while(length(floaters)>1){ #so only ONE root can float
nCount <- nCount+1
#get new relations: will 'anchor' the floaters
anchors <- match(floaters,pcAll[,2])
pcMat <- rbind(pcMat, pcAll[anchors[!is.na(anchors)],]) #bind to pcMat
floatersNew <- getFloatAncPBDB(pcDat = pcMat) #recalculate float
#stopping condition, as this is a silly while() loop...
if(length(floatersNew)>1 & identical(sort(floaters),sort(floatersNew))){
#if(!is.null(solveMissing)){
if(method == "parentChildOldQueryPBDB"){
floatData <- queryMissingParents(
taxaID = floatersNew,
APIversion = APIversion,
failIfNoInternet = failIfNoInternet
)
if(is.null(floatData)){ return(NULL) }
#update taxon names in taxonNameTable
whichUpdate <- match(floatData[,"taxon_no"],taxonNameTable[,1])
replaceFloatName <- floatData[!is.na(whichUpdate), "taxon_name"]
whichUpdate <- whichUpdate[!is.na(whichUpdate)]
taxonNameTable[whichUpdate,2] <- replaceFloatName
#add any new parent taxa to taxonNameTable
parentFloat <- unique(floatData[,"parent_no"])
matchingFloat <- is.na(match(parentFloat, taxonNameTable[,1]))
parentFloat <- parentFloat[matchingFloat]
#
if(length(parentFloat)>0){
taxonNameTable <- rbind(taxonNameTable,
cbind(parentFloat, paste("ID:", as.character(parentFloat))))
}
#update parentChildMat, parentChildAll
newEntries <- floatData[,c("parent_no","taxon_no")]
pcMat <- rbind(pcMat,newEntries)
pcAll <- rbind(pcAll,newEntries)
}
if(method == "parentChildOldMergeRoot"){
pcMat <- rbind(pcMat,cbind("ArtificialRoot",floaters))
taxonNameTable <- rbind(taxonNameTable,
c("ArtificialRoot","ArtificialRoot"))
message(paste0(
"Multiple potential root-taxa artificially merged at a common root: \n",
paste0(taxonNameTable[match(floaters,taxonNameTable[,1]),2]
,collapse = ", ")))
}
#regardless of method used, recalculate floaters
floaters <- getFloatAncPBDB(pcDat = pcMat)
if(length(floatersNew)>1 & identical(sort(floaters),sort(floatersNew))){
stop(
paste0("Provided PBDB Dataset does not appear to have a \n",
" monophyletic set of parent-child relationship pairs. \n",
"Multiple taxa appear to be listed as parents, but are not \n",
" listed themselves so have no parents listed: \n",
paste0(
taxonNameTable[match(floaters,taxonNameTable[,1]),2],
collapse = ", "
)
)
)
}
}else{
floaters <- floatersNew
}
}
pcMat <- apply(pcMat,2,as.character)
#
#
tree <- parentChild2taxonTree(
parentChild = pcMat,
tipSet = tipSet,
cleanTree = cleanTree)
#convert tip.label and node.label to taxon names from taxonNameTable
tree$tip.label <- taxonNameTable[
match(tree$tip.label, taxonNameTable[,1])
,2]
tree$node.label <- taxonNameTable[
match(tree$node.label, taxonNameTable[,1])
,2]
tree$parentChild <- pcMat
return(tree)
}
#getTaxaIDsDesiredRank<-function(data, rank){
# # filter out lower than selected rank (for tip taxa)
# # so need to know which ranks are lower/higher
# # get taxon rank translation vectors for compact vocab
# taxRankPBDB <- getTaxRankPBDB()
# # translate rank to a number
# # translate taxon_rank to a number
# numTaxonRank <- sapply(data[,"taxon_rank"],
# function(x) which(x == taxRankPBDB))
# #now need to put together parentChild table
# # get taxon ID numbers of just those of desired rank
# desiredIDs <- data[rankID == numTaxonRank, "parent_no"]
# desiredIDs <- as.numeric(desiredIDs)
# return(desiredIDs)
# }
# OLD DOC
### OLD EXAMPLE CODE
# #get time data from occurrences
# graptOccGenus <- taxonSortPBDBocc(graptOccPBDB,
# rank = "genus", onlyFormal = FALSE)
# graptTimeGenus <- occData2timeList(occList = graptOccGenus)
#
# #let's time-scale the parentChild tree with paleotree
# # use minimum branch length for visualization
# # and nonstoch.bin so we plot maximal ranges
# timeTree <- bin_timePaleoPhy(graptTree,
# timeList = graptTimeGenus,
# nonstoch.bin = TRUE,
# type = "mbl", vartime = 3)
#
# #drops a lot of taxa; some of this is due to mispellings, etc
# @param solveMissing Under \code{method = "parentChild"}, what should \code{makePBDBtaxonTree} do about
# multiple 'floating' parent taxa, listed without their own parent taxon information in the input
# dataset under \code{taxaDataPBDB}? Each of these is essentially a separate root taxon, for a different set
# of parent-child relationships, and thus poses a problem as far as returning a single phylogeny is
# concerned. If \code{solveMissing = NULL} (the default), nothing is done and the operation halts with
# an error, reporting the identity of these taxa. Two alternative solutions are offered: first,
# \code{solveMissing = "mergeRoots"} will combine these disparate potential roots and link them to an
# artificially-constructed pseudo-root, which at least allows for visualization of the taxonomic
# structure in a limited dataset. Secondly, \code{solveMissing = "queryPBDB"} queries the Paleobiology
# Database repeatedly via the API for information on parent taxa of the 'floating' parents, and continues
# within a \code{while()} loop until only one such unassigned parent taxon remains. This latter option may
# talk a long time or never finish, depending on the linearity and taxonomic structures encountered in the
# PBDB taxonomic data; i.e. if someone a taxon was ultimately its own indirect child in some grand loop by
# mistake, then under this option \code{makePBDBtaxonTree} might never finish. In cases where taxonomy is
# bad due to weird and erroneous taxonomic assignments reported by the PBDB, this routine may search all
# the way back to a very ancient and deep taxon, such as the Eukaryota taxon.
# Users should thus use \code{solveMissing = "queryPBDB"} only with caution.
# @param cleanDuplicate If \code{TRUE} (\emph{not} the default), duplicated taxa of a
# taxonomic rank \emph{not} selected by argument \code{rank}
# will be removed silently. Only duplicates of the taxonomic rank of interest
# will actually result in an error message.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.