Nothing
# hidden functions for used with makePBDBtaxonTree
translatePBDBtaxa <- function(taxaDataPBDB, convertAccepted = TRUE){
# if convertAccepted is TRUE, then the taxon name and taxon no will be replaced
# with the accepted name and no for that taxon
# this is NOT disirable for trying to trace parents in a set of taxa
#################
# Do some translation
#need to replace any empty string values with NAs (due perhaps to use of read.csv with the API)
taxaDataPBDB[taxaDataPBDB == ""] <- NA
#if com vocab
if(any("rnk" == colnames(taxaDataPBDB))){
#apparently it doesn't matter if these columns *are* present or not
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "acn"] <- "accepted_name"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "snp"] <- "senpar_no"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "rnk"] <- "taxon_rank"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "nam"] <- "taxon_name"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "fml"] <- "family"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "odl"] <- "order"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "cll"] <- "class"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "phl"] <- "phylum"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "kgl"] <- "kingdom"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "par"] <- "parent_no"
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "oid"] <- "taxon_no"
# taxon rank translation vectors for compact vocab
taxRankPBDB <- getTaxRankPBDB()
taxRankCOM <- 2:26
#change contents of "identified_rank" and "accepted_rank"
taxaDataPBDB$taxon_rank <- sapply(taxaDataPBDB$taxon_rank,function(x)
taxRankPBDB[x == taxRankCOM])
message("compact vocab detected, relevant fields will be translated")
}
###########
# following are closet cases that mostly only apply to OLD API calls
#
if(any(colnames(taxaDataPBDB) == "rank")){
#if 1.1 and vocab is pbdb
colnames(taxaDataPBDB)[colnames(taxaDataPBDB) == "rank"] <- "taxon_rank"
}
#
if(convertAccepted){
# if convertAccepted is TRUE, then the taxon name and taxon no will be replaced
# with the accepted name and no for that taxon
# this is NOT disirable for trying to trace parents in a set of taxa
if(any(colnames(taxaDataPBDB) == "accepted_name")){
#if 1.2 and there is an accepted_name column..
#fill empty accepted_name values with taxon_name
nameFormal <- taxaDataPBDB[,"accepted_name"]
nameFormal[is.na(nameFormal)] <- as.character(taxaDataPBDB[is.na(nameFormal),"taxon_name"])
#replace taxon_name
taxaDataPBDB[,"taxon_name"] <- nameFormal
#
#replace taxon_no with accepted_no
taxNum <- taxaDataPBDB[,"accepted_no"]
taxNum[is.na(taxNum)] <- as.character(taxaDataPBDB[is.na(taxNum),"taxon_no"])
taxaDataPBDB[,"taxon_no"] <- taxNum
#
}
}
if(any(colnames(taxaDataPBDB)=="senpar_no")){
#if this is OLD v1.2 taxaDataPBDB, and there is a senpar_no column
# replace parent_no in the same way with senpar_no
parNum <- taxaDataPBDB[,"senpar_no"]
parNum[is.na(parNum)] <- as.character(taxaDataPBDB[is.na(parNum),"parent_no"])
taxaDataPBDB[,"parent_no"] <- parNum
}
#
return(taxaDataPBDB)
}
getTaxRankPBDB<-function(){
return(c("subspecies","species","subgenus","genus",
"subtribe","tribe","subfamily",
"family","superfamily","infraorder",
"suborder","order","superorder","infraclass",
"subclass","class","superclass","subphylum",
"phylum","superphylum","subkingdom",
"kingdom","unranked clade","informal"
#keep informal as high, never drop!
))
}
convertParentChildMatNames <- function(taxID, parData){
taxMatch <- match(taxID, parData$taxon_no)
if(is.na(taxMatch)){
newName <- "NODE"
}else{
newName <- parData$taxon_name[taxMatch]
newName <- as.character(newName)
}
return(newName)
}
queryMissingParents <- function(taxaID,
APIversion = "1.2",
status = "all",
convertAccepted = FALSE,
stopIfSelfParent = FALSE,
failIfNoInternet = TRUE
){
# find missing parents by access API
#
# drop Eukarya, as it won't return if status = senior under 1.1
# taxaID <- as.numeric(taxaID[taxaID != "1"])
#
# let's get some taxonomic data
# first test internet
testConnect <- canConnectPBDB(fail = failIfNoInternet)
if(is.null(testConnect)){
return(NULL)
}
#
floatData <- read.csv(
paste0("https://paleobiodb.org/data",APIversion,
"/taxa/list.txt?taxon_id=",paste0(taxaID,collapse=","),
### should we take all or only ACCEPTED parents?
"&rel=exact&status=",status,
"&vocab=pbdb"
),
stringsAsFactors = FALSE)
if(nrow(floatData) == 0){
stop(
paste("Current PBDB API would not return info for the following taxon IDs: \n",
paste0(taxaID,collapse = ", ")
)
)
}
# if convertAccepted is TRUE, then the taxon name and taxon no will be replaced
# with the accepted name and no for that taxon
# this is NOT disirable for trying to trace parents in a set of taxa
floatData <- translatePBDBtaxa(floatData,
convertAccepted = convertAccepted)
#parse down to just taxon_name, taxon_no, parent_no
floatData <- parseParentPBDBData(floatData,
stopIfSelfParent = stopIfSelfParent)
#
return(floatData)
}
parseParentPBDBData <- function(parentData, stopIfSelfParent = FALSE){
#parse down to just taxon_name, taxon_no, parent_no
if(any(colnames(parentData)=="accepted_name")){
taxon_name = parentData$accepted_name
}else{
taxon_name = parentData$taxon_name
}
#
result <- data.frame(
taxon_name = as.character(taxon_name),
parent_no = as.numeric(parentData$parent_no),
taxon_no = as.numeric(parentData$taxon_no)
)
rownames(result) <- NULL
#
if(stopIfSelfParent){
result <- fixSelfParents(dataParents = result,
approach = "stop")
}
#
return(result)
}
findNoParentMatch<-function(parData){
res <- is.na(match(parData$parent_no, parData$taxon_no))
return(res)
}
fixSelfParents <- function(dataParents, approach = "clean"){
# check that no taxa are their own parent
selfParents <- (dataParents$parent_no == dataParents$taxon_no)
if(any(selfParents)){
if(approach == "clean"){
dataParents <- dataParents[!selfParents,]
}
if(approach == "stop"){
parMatch <- paste0(dataParents$taxon_name,
" (", dataParents$taxon_no,")")
parMatch <- paste0(parMatch[selfParents],
collapse = ", ")
stop(paste0("The following taxa (with taxon_no) are their own parents:\n",
parMatch))
}
}
return(dataParents)
}
getAllParents <- function(
inputData,
status,
annotatedDuplicateNames = TRUE,
convertAccepted = TRUE,
stopIfSelfParent = FALSE,
failIfNoInternet = TRUE
){
###############################################
parData<-parseParentPBDBData(inputData)
noParentMatch<-findNoParentMatch(parData)
floatingParentNumOld <- NA
while(sum(noParentMatch)>1){
# get floating parent taxon numbers
floatingParentNum <- unique(parData$parent_no[noParentMatch])
#
# checks
if(identical(floatingParentNumOld, floatingParentNum)){
if(convertAccepted){
# well try turning convertAccepted to FALSE
convertAccepted <- FALSE
}else{
#oh, its already false? eh... then give up
dataUrlRef <- paste0("http://paleobiodb.org/data",1.2,
"/taxa/list.txt?taxon_id=",paste0(floatingParentNum,collapse=","),
"&rel=exact&status=",status,"&vocab=pbdb"
)
print(dataNew)
stop(paste0(
"PBDB not returning single common ancestor by tracing parents.\n",
"Following floating parent numbers are repeated:\n",
paste0(floatingParentNum,collapse=", "),
"\nThese were not resolved with the new data (printed above).",
"\nCheck the following URL:\n",dataUrlRef
))
}
}else{
floatingParentNumOld <- floatingParentNum
}
#
# get the missing parents
# if convertAccepted is TRUE, then the taxon name and taxon no will be replaced
# with the accepted name and no for that taxon
# this is (probably?) NOT disirable for trying to trace parents in a set of taxa
dataNew <- queryMissingParents(
floatingParentNum,
status = status,
convertAccepted = convertAccepted,
stopIfSelfParent = stopIfSelfParent,
failIfNoInternet = failIfNoInternet
)
if(is.null(dataNew)){return(NULL)}
# add new parents to the top of the matrix
# so if duplicated names are annotated, its the originals that get annotated
parData <- rbind(dataNew,parData)
noParentMatch <- findNoParentMatch(parData)
}
# remove duplicate taxa
parData <- unique(parData)
# remove self parents
parData <- fixSelfParents(dataParents = parData,
approach = "clean")
#
# TESTS
if(sum(noParentMatch)!=1){
stop("Cannot find a single common ancestor by tracing parents")
}
if(nrow(parData) != nrow(unique(parData))){
print(parData[duplicated(parData),])
stop("getAllParents added duplicate parent-child relationships, see print-out above")
}
if(length(parData$taxon_no) != length(unique(parData$taxon_no))){
print(parData[duplicated(parData$taxon_no),])
stop("getAllParents added duplicate child taxa, see print-out above")
}
if(length(parData$taxon_name) != length(unique(parData$taxon_name))){
message("Additional accepted parent taxa have same names as input taxa - names of originals annotated")
#print(parData[duplicated(parData$taxon_name),])
#
if(annotatedDuplicateNames){
parData$taxon_name <- make.unique(as.character(parData$taxon_name))
}else{
print(parData[duplicated(parData$taxon_name),])
stop("getAllParents added duplicate accepted taxon names, see print-out above")
}
}
return(parData)
}
getFloatAncPBDB <- function(pcDat){
#identify IDs of parents floating without ancestors of their own
# which parents have no children?
noChildren <- sapply(pcDat$parent_no, function(x)
all(x != pcDat$child_no)
)
# list all unique no-child parents
res <- unique(pcDat$parent_no[noChildren])
return(res)
}
constructParentChildMatrixPBDB <- function(initPCmat, parData){
# starting from desired tip OTUs, work backwards to a common ancestor from the full parData
if(nrow(initPCmat) != nrow(unique(initPCmat))){
print(initPCmat[duplicated(initPCmat),])
stop("initial parent-child matrix had duplicate parent-child relationships, see print-out above")
}
newPCmat <- initPCmat
# find floating parents in current newPCmat
#identify IDs of parents floating without ancestors of their own
floaters <- getFloatAncPBDB(pcDat = newPCmat)
# make sure floaters are only unique values
floaters <- unique(floaters)
# use a while loop to complete the parent-child matrix
while(length(floaters)>1){ #so only ONE root can float
# get new relations: will 'anchor' the floaters
anchorMat <- subsetParDataPBDB(subsetNum = floaters,
parData = parData)
# bind to newPCmat
newPCmat <- rbind(newPCmat,anchorMat)
if(nrow(newPCmat) != nrow(unique(newPCmat))){
print(newPCmat[duplicated(newPCmat),])
stop("annotated pcMat had duplicate parent-child relationships, see print-out above")
}
# recalculate floater taxa
floatersNew <- getFloatAncPBDB(pcDat = newPCmat) #recalculate float
#
# put in a stopping condition for the love of god
if(length(floatersNew)>1 & identical(floaters,floatersNew) ){
stop(paste0(
"Provided PBDB Dataset does not appear to have a \n",
" monophyletic set of parent-child relationship pairs. \n",
"Multiple taxa appear to be listed as parents, but are not \n",
" listed themselves so have no parents listed: \n",
paste0(floaters,
collapse = ", "
),
paste0(
parData$taxon_name[
match(floaters,
parData$taxon_no)
],
collapse = ", "
)
))
}else{
floaters <- floatersNew
}
}
return(newPCmat)
}
subsetParDataPBDB <- function(subsetNum,parData){
# check that subsetNum is all unique values
if(length(subsetNum) != length(unique(subsetNum))){
stop("Some values of subsetNum are non-unique")
}
# pull parent-child relationships for a set of taxon numbers from parData
subsetRows <- match(subsetNum, parData$taxon_no)
# remove the non matching subset (usually the root)
subsetRows <- subsetRows[!is.na(subsetRows)]
# are there any duplicated subsetRows?
if(length(subsetRows) != length(unique(subsetRows))){
print("Total subsetRows:")
print(subsetRows)
print("Duplicates:")
dupSSR <- subsetRows[subsetRows == subsetRows[duplicated(subsetRows)]]
print(dupSSR)
print("Rows in Questions:")
print(parData[unique(dupSSR),])
stop("Stop - the subset selected by subsetParDataPBDB using match() has repeated rows??")
}
# pull the subset of parent-child relationships from parData
#subsetMat <- parData[subsetRows, c("parent_no", "taxon_no")]
subsetMat <- data.frame(
parent_no = parData$parent_no[subsetRows],
child_no = parData$taxon_no[subsetRows]
)
numSubset <- parData$taxon_no[subsetRows]
# test if any duplicated taxon numbers
if(length(numSubset) != length(unique(numSubset))){
duplicatedValues <- numSubset[duplicated(numSubset)]
duplicatedRows <- sapply(parData$taxon_no, function(x)
any(x == duplicatedValues))
print("All Taxon Numbers:")
print(parData$taxon_no)
print("The Subset Taxon Numbers:")
print(numSubset)
print("The Rows of parData Containing Duplicates:")
print(parData[duplicatedRows ,])
stop("getAllParents added duplicate taxa numbers, see print-out above")
}
rownames(subsetMat)<-as.character(numSubset)
return(subsetMat)
}
getLinneanTaxonTreePBDB <- function(dataTransform, tipSet, cleanTree, rankTaxon){
#########
dataTransform <- apply(dataTransform, 2, as.character)
#Check if show = class was used
if(!any(colnames(dataTransform) == "family")){
stop("taxaDataPBDB must be a taxonomic download with show = class for method = 'Linnean'")
}
#message that tipSet (and solveMissing) is ignored
if(!is.null(tipSet)){
message("Linnean taxon-tree option selected, argument 'tipSet' is ignored")
}
#now check and return an error if duplicate taxa of selected rank (rankTaxon)
nDup <- sapply(nrow(dataTransform),function(x)
sum(dataTransform[,"taxon_name"] == dataTransform[x,"taxon_name"])>1
& dataTransform[x,"taxon_rank"] == rankTaxon
)
if(any(nDup)){
stop(
paste0(
"Duplicate taxa of selected rank: ",
paste0("(", which(nDup), ") ",
dataTransform[nDup,"taxon_name"],
collapse = ", "
)
)
)
}
#filter on rankTaxon
dataTransform <- dataTransform[dataTransform[,"taxon_rank"] == rankTaxon,]
#
#get the taxonomic fields you want
#from API documentation: "phylum","class","order","family","genus"
# apparently 'kingdom' dropped from v1.2 API
taxonFields <- c(
#"kingdom",
"phylum", "class",
"order", "family",
"taxon_name")
#print(colnames(dataTransform))
taxonData <- dataTransform[,taxonFields]
taxonData <- apply(taxonData,2,as.character)
tree <- taxonTable2taxonTree(
taxonTable = taxonData,
cleanTree = cleanTree
)
tree$taxonTable <- taxonData
return(tree)
}
parentChildPBDBOld <- function(
dataTransform,
tipSet,
cleanTree,
method,
APIversion,
failIfNoInternet = TRUE
){
dataTransform <- apply(dataTransform, 2, as.character)
# need two things: a table of parent-child relationships as IDs
#and a look-up table of IDs and taxon names
#
# filer out lower than selected rank
#
# translate rank and taxon_rank to a number
# taxon rank translation vectors for compact vocab
taxRankPBDB <- getTaxRankPBDB()
rankID <- which(rank == taxRankPBDB)
numTaxonRank <- sapply(dataTransform[,"taxon_rank"],
function(x) which(x == taxRankPBDB))
#drop taxa below specified rank
dataTransform <- dataTransform[rankID <= numTaxonRank,]
#also recreate numTaxonRank
numTaxonRank <- sapply(dataTransform[,"taxon_rank"],
function(x) which(x == taxRankPBDB))
#
#create lookup table for taxon names
taxonNameTable <- cbind(
as.numeric(dataTransform[,"taxon_no"]),
as.character(dataTransform[,"accepted_name"])
)
#add parents not listed
parentFloat <- unique(dataTransform[,"parent_no"])
parentFloat <- parentFloat[is.na(match(parentFloat, taxonNameTable[,1]))]
taxonNameTable <- rbind(taxonNameTable,
cbind(parentFloat,
paste("ID:", as.character(parentFloat))
)
)
#print(taxonNameTable)
#DONE
#
#now need to put together parentChild table
#first, get table of all parentChild relationships in taxaDataPBDB
pcAll <- cbind(as.numeric(dataTransform[,"parent_no"]),
as.numeric(dataTransform[,"taxon_no"]))
#then start creating final matrix: first, those of desired rank
pcMat <- pcAll[rankID == numTaxonRank,]
#identify IDs of parents floating without ancestors of their own
floaters <- getFloatAncPBDB(pcDat = pcMat)
#
nCount <- 0 #start tracing back
while(length(floaters)>1){ #so only ONE root can float
nCount <- nCount+1
#get new relations: will 'anchor' the floaters
anchors <- match(floaters,pcAll[,2])
pcMat <- rbind(pcMat, pcAll[anchors[!is.na(anchors)],]) #bind to pcMat
floatersNew <- getFloatAncPBDB(pcDat = pcMat) #recalculate float
#stopping condition, as this is a silly while() loop...
if(length(floatersNew)>1 & identical(sort(floaters),sort(floatersNew))){
#if(!is.null(solveMissing)){
if(method == "parentChildOldQueryPBDB"){
floatData <- queryMissingParents(
taxaID = floatersNew,
APIversion = APIversion,
failIfNoInternet = failIfNoInternet
)
if(is.null(floatData)){ return(NULL) }
#update taxon names in taxonNameTable
whichUpdate <- match(floatData[,"taxon_no"],taxonNameTable[,1])
replaceFloatName <- floatData[!is.na(whichUpdate), "taxon_name"]
whichUpdate <- whichUpdate[!is.na(whichUpdate)]
taxonNameTable[whichUpdate,2] <- replaceFloatName
#add any new parent taxa to taxonNameTable
parentFloat <- unique(floatData[,"parent_no"])
matchingFloat <- is.na(match(parentFloat, taxonNameTable[,1]))
parentFloat <- parentFloat[matchingFloat]
#
if(length(parentFloat)>0){
taxonNameTable <- rbind(taxonNameTable,
cbind(parentFloat, paste("ID:", as.character(parentFloat))))
}
#update parentChildMat, parentChildAll
newEntries <- floatData[,c("parent_no","taxon_no")]
pcMat <- rbind(pcMat,newEntries)
pcAll <- rbind(pcAll,newEntries)
}
if(method == "parentChildOldMergeRoot"){
pcMat <- rbind(pcMat,cbind("ArtificialRoot",floaters))
taxonNameTable <- rbind(taxonNameTable,
c("ArtificialRoot","ArtificialRoot"))
message(paste0(
"Multiple potential root-taxa artificially merged at a common root: \n",
paste0(taxonNameTable[match(floaters,taxonNameTable[,1]),2]
,collapse = ", ")))
}
#regardless of method used, recalculate floaters
floaters <- getFloatAncPBDB(pcDat = pcMat)
if(length(floatersNew)>1 & identical(sort(floaters),sort(floatersNew))){
stop(
paste0("Provided PBDB Dataset does not appear to have a \n",
" monophyletic set of parent-child relationship pairs. \n",
"Multiple taxa appear to be listed as parents, but are not \n",
" listed themselves so have no parents listed: \n",
paste0(
taxonNameTable[match(floaters,taxonNameTable[,1]),2],
collapse = ", "
)
)
)
}
}else{
floaters <- floatersNew
}
}
pcMat <- apply(pcMat,2,as.character)
#
#
tree <- parentChild2taxonTree(
parentChild = pcMat,
tipSet = tipSet,
cleanTree = cleanTree)
#convert tip.label and node.label to taxon names from taxonNameTable
tree$tip.label <- taxonNameTable[
match(tree$tip.label, taxonNameTable[,1])
,2]
tree$node.label <- taxonNameTable[
match(tree$node.label, taxonNameTable[,1])
,2]
tree$parentChild <- pcMat
return(tree)
}
#getTaxaIDsDesiredRank<-function(data, rank){
# # filter out lower than selected rank (for tip taxa)
# # so need to know which ranks are lower/higher
# # get taxon rank translation vectors for compact vocab
# taxRankPBDB <- getTaxRankPBDB()
# # translate rank to a number
# # translate taxon_rank to a number
# numTaxonRank <- sapply(data[,"taxon_rank"],
# function(x) which(x == taxRankPBDB))
# #now need to put together parentChild table
# # get taxon ID numbers of just those of desired rank
# desiredIDs <- data[rankID == numTaxonRank, "parent_no"]
# desiredIDs <- as.numeric(desiredIDs)
# return(desiredIDs)
# }
# OLD DOC
### OLD EXAMPLE CODE
# #get time data from occurrences
# graptOccGenus <- taxonSortPBDBocc(graptOccPBDB,
# rank = "genus", onlyFormal = FALSE)
# graptTimeGenus <- occData2timeList(occList = graptOccGenus)
#
# #let's time-scale the parentChild tree with paleotree
# # use minimum branch length for visualization
# # and nonstoch.bin so we plot maximal ranges
# timeTree <- bin_timePaleoPhy(graptTree,
# timeList = graptTimeGenus,
# nonstoch.bin = TRUE,
# type = "mbl", vartime = 3)
#
# #drops a lot of taxa; some of this is due to mispellings, etc
# @param solveMissing Under \code{method = "parentChild"}, what should \code{makePBDBtaxonTree} do about
# multiple 'floating' parent taxa, listed without their own parent taxon information in the input
# dataset under \code{taxaDataPBDB}? Each of these is essentially a separate root taxon, for a different set
# of parent-child relationships, and thus poses a problem as far as returning a single phylogeny is
# concerned. If \code{solveMissing = NULL} (the default), nothing is done and the operation halts with
# an error, reporting the identity of these taxa. Two alternative solutions are offered: first,
# \code{solveMissing = "mergeRoots"} will combine these disparate potential roots and link them to an
# artificially-constructed pseudo-root, which at least allows for visualization of the taxonomic
# structure in a limited dataset. Secondly, \code{solveMissing = "queryPBDB"} queries the Paleobiology
# Database repeatedly via the API for information on parent taxa of the 'floating' parents, and continues
# within a \code{while()} loop until only one such unassigned parent taxon remains. This latter option may
# talk a long time or never finish, depending on the linearity and taxonomic structures encountered in the
# PBDB taxonomic data; i.e. if someone a taxon was ultimately its own indirect child in some grand loop by
# mistake, then under this option \code{makePBDBtaxonTree} might never finish. In cases where taxonomy is
# bad due to weird and erroneous taxonomic assignments reported by the PBDB, this routine may search all
# the way back to a very ancient and deep taxon, such as the Eukaryota taxon.
# Users should thus use \code{solveMissing = "queryPBDB"} only with caution.
# @param cleanDuplicate If \code{TRUE} (\emph{not} the default), duplicated taxa of a
# taxonomic rank \emph{not} selected by argument \code{rank}
# will be removed silently. Only duplicates of the taxonomic rank of interest
# will actually result in an error message.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.