#' @name searchTxnyms
#' @title Get node labels based on online taxonomic database
#' @description Return names of each node in tree based on searching tip labels
#' through Global Names Resolver \url{http://resolver.globalnames.org/} in NCBI.
#' @details For each node, all the descendants are searched, the taxonomic lineages returned and
#' then searched to find the lowest shared name.
#' All the tip labels are searched against a specified taxonomic database through the GNR and NCBI.
#' (So far only tested with NCBI database.)
#' Use the infer argument to ensure a taxonym is returned for all nodes. If infer is true,
#' all nodes without an identifed taxonym adopt the taxonym of their parent.
#' Will raise a warning if connection fails and will return NULL.
#' @param tree TreeMan object
#' @param cache T/F, create a local cache of downloaded names?
#' @param parent specify parent of all names to prevent false names
#' @param clean T/F, ensure returned names contain no special characters?
#' @param infer T/F, infer taxonyms for unfound nodes?
#' @seealso
#' \code{\link{taxaResolve}}, \code{\link{setTxnyms}}, \code{\link{getNdsFrmTxnyms}}
#' @export
#' @examples
#' tree <- randTree(8)
#' new_tids <- c("Gallus_gallus", "Aileuropoda_melanoleucha", "Ailurus_fulgens",
#' "Rattus_rattus", "Mus_musculus", "Gorilla_gorilla", "Pan_trogoldytes", "Homo_sapiens")
#' tree <- setNdsID(tree, tree['tips'], new_tids)
#' nd_labels <- searchTxnyms(tree)
#' print(nd_labels)
# TODO: add compatibility with other GNR datasources
# TODO: catalogue of life, unlike NCBI, does not keep lineages and rank lengths constant between names
searchTxnyms <- function (tree, cache=FALSE, parent=NULL, clean=TRUE,
infer=TRUE) {
# Use GNR to label all nodes in a phylogeny
# first replace all _ with spaces
tip_labels <- gsub ('_', ' ', tree@tips)
nids <- tree@nds
nd_labels <- rep(NA, tree@nall)
names(nd_labels) <- tree@all
taxa_res <- taxaResolve(tip_labels, datasource=4, cache=cache, parent=parent)
if(is.null(taxa_res)) {
return(NULL)
}
# for tips use the first word of the name
nd_labels[tree@tips] <- vapply(strsplit(tip_labels, "\\s+"), function(x) x[1],
character(1))
nds_kids <- getNdsKids(tree, ids=nids)
for(nid in nids) {
kids <- gsub("_" , " ", nds_kids[[nid]])
genus_names <- vapply(strsplit(kids, "\\s+"), function(x) x[1],
character(1))
if(all(genus_names == genus_names[1])) {
nd_labels[nid] <- genus_names[1]
} else {
lineages <- as.character(taxa_res[taxa_res$search.name %in% kids, "lineage"])
lineages <- strsplit (lineages, "\\|")
lineages <- lineages[!is.na (lineages)]
if(length (lineages) > 1) {
nd_labels[nid] <- .findClade(lineages)
}
}
}
if(clean) {
nd_labels <- gsub('\\s', '_', nd_labels)
nd_labels <- gsub('[^a-zA-Z_0-9]', '', nd_labels)
}
if(infer & any(is.na(nd_labels))) {
if(sum(is.na(nd_labels))/length(nd_labels) > 0.5) {
message('Fewer than 50% of nodes identified,',
' not attempting inference of remaning nodes.')
} else {
for(i in which(is.na(nd_labels))) {
prids <- getNdPrids(tree, names(nd_labels)[i])
pssbls <- nd_labels[prids]
pssbls <- pssbls[!is.na(pssbls)]
if(length(pssbls) > 0) {
nd_labels[[i]] <- pssbls[[1]]
}
}
}
}
nd_labels
}
#' @name taxaResolve
#' @title Resolve taxonmic names online
#' @description Resolve taxonomic names via the Global Names Resolver.
#' @details Returns dataframe containing GNR metadata for each name wames
#' that cannot be resolved are returned as NA. Various datasources are
#' available, see \url{http://resolver.globalnames.org/data_sources} for a
#' list and IDs. Default is 4 for NCBI.
#' Will raise a warning if connection fails and will return NULL.
#' @param nms vector of names
#' @param batch size of the batches to be queried
#' @param datasource ID number of the datasource
#' @param genus boolean, if true will search against GNR with just the genus
#' name for names that failed to resolve using the full species name
#' @param cache T/F, create a local cache of downloaded names?
#' @param parent specify parent of all names to prevent false names
#' @seealso
#' \code{\link{searchTxnyms}}, \code{\link{setTxnyms}}, \code{\link{getNdsFrmTxnyms}}
#' @export
#' @examples
#' my_lovely_names <- c ('Gallus gallus', 'Pongo pingu', 'Homo sapiens',
#' 'Arabidopsis thaliana', 'Macaca thibetana', 'Bacillus subtilis')
#' res <- taxaResolve (nms=my_lovely_names)
#' length(colnames(res)) # 10 different metadata for returned names including original search name
#' # let's look at the lineages
#' lineages <- strsplit(as.vector(res$lineage), '\\|')
#' print(lineages[[6]]) # the bacteria has far fewer taxonomic levels
# NOTE. Originally built for MTT
taxaResolve <- function (nms, batch=100, datasource=4, genus=TRUE,
cache=FALSE, parent=NULL) {
.replace <- function (i, slot.name) {
# controlled extraction
element <- data[[i]]$result[[1]][[slot.name]]
if (!is.null (element)) {
res <- element
} else {
res <- NA
}
res
}
batchResolve <- function (batch.nms) {
#create query from nms
url <- "http://resolver.globalnames.org/name_resolvers.json?"
data_source_ids <- paste0 ("&data_source_ids=", datasource)
nms2 <- paste0 ("names=", paste0 (stringr::str_replace_all (
batch.nms, " ", "+"), collapse = "|"))
query <- paste (plyr::compact (list (url, nms2,
data_source_ids)), collapse = "")
#search via API
data <- .safeFromJSON(query)$data
return(data)
}
# avoid names -- names exist on database but mean nothing
avoid <- c ('unidentified')
# make sure names don't have '_'
trms <- gsub ('_', ' ', nms)
# remove all non alphanumerics
trms <- gsub ('\\W+', ' ', trms)
# remove trailing whitespace
trms <- gsub ('^\\s+|\\s+$', '', trms)
# any missing trms, replace with stubs
trms[trms==''] <- 'invalid'
deja_vues <- rep(FALSE, length(trms))
data <- vector("list", length=length(trms))
names(data) <- nms
if(cache) {
if(!file.exists("gnr_cache")) {
dir.create("gnr_cache")
}
for(i in 1:length(nms)) {
fp <- file.path("gnr_cache", paste0(nms[i], ".RData"))
if(file.exists(fp)) {
load(fp)
data[[nms[i]]] <- nd
deja_vues[i] <- TRUE
}
}
}
if(sum(!deja_vues) > 0) {
# Split nms into batch sized chunks
# http://stackoverflow.com/questions/3318333/split-a-vector-into-chunks-in-r
x <- seq_along (trms[!deja_vues])
btrms <- split (trms[!deja_vues], ceiling (x/batch))
bnms <- split (nms[!deja_vues], ceiling (x/batch))
for (i in 1:length(btrms)) {
temp.data <- batchResolve(btrms[[i]])
if (is.null(temp.data)) {
return(NULL)
}
data[bnms[[i]]] <- temp.data
}
}
#transform results into output
search.name <- name.string <- canonical.form <-
lineage <- lineage.ids <- rank <- taxid <-
match.type <- prescore <- score <- rep (NA, length (nms))
for (i in 1:length (data)){
parent_test <- TRUE
nd <- data[[i]]
if(cache & !deja_vues[i]) {
fp <- file.path("gnr_cache", paste0(names(data)[i], ".RData"))
save(nd, file=fp)
}
if (!'results' %in% names (nd)){
search.name[i] <- nms[i]
} else if (nd[[1]] %in% avoid) {
search.name[i] <- nms[i]
} else {
search.name[i] <- nms[i]
lng <- .replace(i, 'classification_path')
if(!is.null(parent)) {
parent_test <- grepl(parent, lng)
}
if(parent_test) {
name.string[i] <- .replace(i, 'name_string')
canonical.form[i] <- .replace(i, 'canonical_form')
lineage[i] <- lng
lineage.ids[i] <- .replace(i, 'classification_path_ids')
rank[i] <- .replace(i, 'classification_path_ranks')
taxid[i] <- .replace(i, 'taxon_id')
match.type[i] <- .replace(i, 'match_type')
prescore[i] <- .replace(i, 'prescore')
score[i] <- nd$results[[1]]$score
}
}
}
res <- data.frame (search.name=search.name,
name.string=name.string,
canonical.form=canonical.form,
lineage=lineage, lineage.ids=lineage.ids,
rank=rank, taxid=taxid,
match.type=match.type, prescore=prescore,
score=score, stringsAsFactors=FALSE)
failed <- which (is.na (res$name.string))
if (genus & length (failed) > 0) {
# if genus, search just genus names
genus.nms <- sub ('\\s+.*', '', res$search.name[failed])
genus.res <- taxaResolve(genus.nms, batch, datasource, genus=FALSE,
parent=parent, cache=cache)
# replace in original results, all slots except search.name
res[failed,-1] <- genus.res[ ,-1]
}
return (res)
}
.safeFromJSON <- function (url, max_trys=5, power=2) {
# Safe wrapper for fromJSON
trys <- 0
waittime <- 2
while (trys < max_trys) {
json_obj <- try (RJSONIO::fromJSON(url), silent = TRUE)
if(class(json_obj) == 'try-error') {
cat('---- Connection failed: trying again in [', waittime,
's]----\n', sep='')
trys <- trys + 1
Sys.sleep(waittime)
waittime <- waittime*power
} else {
return (json_obj)
}
}
warning("Failed to connect, server may be down.")
list('data' = NULL)
}
.findClade <- function(lineages) {
# for a list of lineages, find the clade shared by all
subj <- lineages[[1]]
for(i in 2:length(lineages)) {
query <- lineages[[i]]
subj <- subj[subj %in% query]
}
subj[length(subj)]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.