Nothing
#' Get full lineage details from a taxonomic ID number.
#'
#' This function derives the full lineage of a taxon ID number
#' from a given taxonomy database.
#'
#' @param taxIDs integer or vector of integers giving the taxonomic ID number(s).
#' @param db a taxonomy database (a data.frame object).
#' See \code{\link{taxonomy}} for details.
#' @param simplify logical indicating whether a single lineage
#' derived from a length-one input
#' should be simplified from a list to a named character vector.
#' Defaults to TRUE.
#' @param numbers logical indicating whether the output string(s) should
#' be comprised of the taxonomic ID numbers rather than taxon names.
#' Defaults to FALSE.
#' @param cores integer giving the number of processors for multithreading (Defaults to 1).
#' This argument may alternatively be a 'cluster' object,
#' in which case it is the user's responsibility to close the socket
#' connection at the conclusion of the operation,
#' for example by running \code{parallel::stopCluster(cores)}.
#' The string 'autodetect' is also accepted, in which case the maximum
#' number of cores to use is one less than the total number of cores available.
#' Note that in this case there
#' may be a tradeoff in terms of speed depending on the number and size
#' of sequences to be processed, due to the extra time required to initialize
#' the cluster.
#' @return the full lineage as a named character vector, or list of named character
#' vectors if the length of the input object is > 1 or simplify = FALSE.
#' "names" attributes are taxonomic ranks.
#' @author Shaun Wilkinson
#' @references
#' Federhen S (2012) The NCBI Taxonomy database.
#' \emph{Nucleic Acids Research}
#' \strong{40}, D136-D143. doi:10.1093/nar/gkr1178.
#'
#' \url{https://www.ncbi.nlm.nih.gov/taxonomy/}
#' @examples
#' data(whales)
#' data(whale_taxonomy)
#' taxIDs <- as.integer(gsub(".+\\|", "", names(whales)[1:2]))
#' get_lineage(taxIDs, db = whale_taxonomy)
################################################################################
get_lineage <- function(taxIDs, db, simplify = TRUE, numbers = FALSE, cores = 1){
db$rank <- as.character(db$rank)
db$name <- as.character(db$name) # avoid stringsasfactor issues
pointers <- .point(paste(taxIDs))
taxIDs <- taxIDs[!duplicated(pointers)]
if("tax_id" %in% colnames(db)){
colnames(db)[colnames(db) == "tax_id"] <- "taxID"
colnames(db)[colnames(db) == "parent_tax_id"] <- "parent_taxID"
## for backwards compatibility
}
gl1 <- function(taxID, db){
if(is.na(taxID)) return(NA)
stopifnot(length(taxID) == 1 & mode(taxID) == "numeric")
res <- if(numbers) integer(100) else character(100)
resnames <- character(100)
counter <- 1
index <- match(taxID, db$taxID)
if(is.na(index)){
# warning(paste("Taxon ID", taxID, "not found in database\n"))
return(NA)
}
repeat{
if(is.na(index)) break
# if(length(index) > 1) cat(index, "\n")
res[counter] <- if(numbers) db$taxID[index] else db$name[index]
resnames[counter] <- db$rank[index]
index <- db$parent_tax_index[index]
counter <- counter + 1
}
res <- res[1:(counter - 1)]
names(res) <- resnames[1:(counter - 1)]
return(rev(res))
}
db$parent_tax_index <- match(db$parent_taxID, db$taxID)
## multithreading
if(inherits(cores, "cluster")){
res <- parallel::parLapply(cores, taxIDs, gl1, db)
}else if(cores == 1){
res <- lapply(taxIDs, gl1, db)
}else{
navailcores <- parallel::detectCores()
if(identical(cores, "autodetect")) cores <- navailcores - 1
if(!(mode(cores) %in% c("numeric", "integer"))) stop("Invalid 'cores' argument")
if(cores > 1){
cl <- parallel::makeCluster(cores)
res <- parallel::parLapply(cl, taxIDs, gl1, db)
parallel::stopCluster(cl)
}else{
res <- lapply(taxIDs, gl1, db)
}
}
res <- res[pointers]
if(length(res) == 1 & simplify) res <- res[[1]]
return(res)
}
################################################################################
#' Get taxon ID from taxonomy database.
#'
#' This function returns the unique ID for a specified taxon name by looking
#' up a taxonomy database.
#'
#' @param lineage character vector of taxon names or semicolon-delimited
#' lineage strings.
#' @param db a valid taxonomy database (as a data.frame object).
#' See \code{\link{taxonomy}} for details.
#' @param multimatch integer giving the value to return if the query
#' matches multiple entries. Defaults to NA_integer_.
#' @return An integer giving the unique taxon ID,
#' or NA if the taxon is not found in the database.
#' @author Shaun Wilkinson
#' @references
#' Federhen S (2012) The NCBI Taxonomy database.
#' \emph{Nucleic Acids Research}
#' \strong{40}, D136-D143. doi:10.1093/nar/gkr1178.
#'
#' \url{https://www.ncbi.nlm.nih.gov/taxonomy/}
#' @examples
#' data(whale_taxonomy)
#' get_taxID("Odontoceti", db = whale_taxonomy)
################################################################################
get_taxID <- function(lineage, db, multimatch = NA){
pointers <- .point(lineage)
lineage <- lineage[!duplicated(pointers)]
db <- db[!duplicated(db), ]
multimatch <- as.integer(multimatch)
res <- integer(length(lineage))
linlist <- strsplit(lineage, split = "; ")
spps <- vapply(linlist, tail, "", 1)
duplicates <- duplicated(db$name)
dupnames <- db$name[duplicates]
multimatches <- spps %in% dupnames
if(any(multimatches)){
db2 <- prune_taxonomy(db, taxIDs = unique(db$taxID[db$name %in% spps[multimatches]]))
gtid_slow <- function(x, db2, multimatch){ # x is a character vector
indices <- which(db2$name == tail(x, 1)) # should be length >= 2 vector
candidates <- get_lineage(db2$taxID[indices], db2, simplify = FALSE) # length 2 list
candidates <- vapply(candidates, function(cdd, x) all(x %in% cdd), logical(1), x)
return(if(sum(candidates) == 1) db2$taxID[indices[candidates]] else multimatch)
}
res[multimatches] <- vapply(linlist[multimatches], gtid_slow, 0L, db2, multimatch)
res[!multimatches] <- db$taxID[match(spps[!multimatches], db$name)]
}else{
res <- db$taxID[match(spps, db$name)]
}
res <- res[pointers]
return(res)
}
################################################################################
#' Download taxonomy database.
#'
#' This function downloads an up-to-date copy of the taxonomy database.
#'
#' @param db character string specifying which taxonomy database to download.
#' Currently only "NCBI" is supported.
#' @param synonyms logical indicating whether synonyms should be included.
#' Note that this increases the size of the returned object by around 10\%.
#' @return a dataframe with the following elements:
#' "taxID", "parent_taxID", "rank", "name".
#' @details
#' This function downloads the specified taxonomy database as a data.frame
#' object with the following columns:
#' "taxID", "parent_taxID", "rank", "name".
#' As of early 2018 the zip archive to download the NCBI taxonomy database
#' is approximately 40Mb in size, and the output dataframe object is around
#' 200Mb in memory. Once downloaded, the dataframe can be pruned
#' for increased speed and memory efficiency using the function
#' \code{\link{prune_taxonomy}}.
#' @author Shaun Wilkinson
#' @references
#' Federhen S (2012) The NCBI Taxonomy database.
#' \emph{Nucleic Acids Research}
#' \strong{40}, D136-D143. doi:10.1093/nar/gkr1178.
#'
#' \url{https://www.ncbi.nlm.nih.gov/taxonomy/}
#' @examples
#' \donttest{
#' # db <- taxonomy()
#' }
################################################################################
taxonomy <- function(db = "NCBI", synonyms = FALSE){
if(!identical(db, "NCBI")){
stop("Only the NCBI taxonomy database is available in this version\n")
}
tmp <- tempdir()
fn <- "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
download.file(fn, destfile = paste0(tmp, "/tmp.tar.gz"))
message("Extracting data\n")
test <- untar(tarfile = paste0(tmp, "/tmp.tar.gz"), exdir = tmp)
if(!identical(test, 0L)) stop(cat(test))
message("Building data frame\n")
x <- scan(file = paste0(tmp, "/nodes.dmp"), what = "", sep = "\n", quiet = TRUE)
x <- strsplit(x, split = "\t")
x <- sapply(x, function(s) s[c(1, 3, 5)])
nodes <- as.data.frame(t(x), stringsAsFactors = FALSE)
nodes[[1]] <- as.integer(nodes[[1]])
nodes[[2]] <- as.integer(nodes[[2]])
colnames(nodes) <- c("taxID", "parent_taxID", "rank")
x <- scan(file = paste0(tmp, "/names.dmp"), what = "", sep = "\n", quiet = TRUE)
if(synonyms){
syn <- x[grepl("synonym", x)]
syn <- strsplit(syn, split = "\t")
syn <- sapply(syn, function(s) s[c(1, 3)])
syn <- as.data.frame(t(syn), stringsAsFactors = FALSE)
syn[[1]] <- as.integer(syn[[1]])
colnames(syn) <- c("taxID", "name")
}
x <- x[grepl("scientific name", x)] # 1637100 elements ~200 Mb, ~5 sec
x <- strsplit(x, split = "\t")
x <- sapply(x, function(s) s[c(1, 3)])
namez <- as.data.frame(t(x), stringsAsFactors = FALSE)
namez[[1]] <- as.integer(namez[[1]])
colnames(namez) <- c("taxID", "name")
# merge node and names data frames together
taxa <- merge(nodes, namez, by = "taxID")
taxa$parent_taxID[taxa$taxID == 1] <- 0
if(synonyms){
taxapp <- taxa[match(syn$taxID, taxa$taxID), ]
taxapp$name <- syn$name
rownames(taxapp) <- NULL
taxa <- rbind(taxa, taxapp)
} # attr(taxa, "synonyms") <- syn
message("Done\n")
return(taxa)
}
################################################################################
#' Prune taxonomy database.
#'
#' This function prunes the taxon database, removing specified taxa as
#' desired to improve speed and memory efficiency.
#'
#' @param db a valid taxonomy database, e.g. obtained by running the
#' \code{\link{taxonomy}} function.
#' @param taxIDs the names or taxon ID numbers of the taxa to be retained
#' (or discarded if \code{keep = FALSE}).
#' @param keep logical, indicates whether the specified taxa should be
#' kept and the rest of the database removed or vice versa. Defaults to TRUE.
#' @return a data.frame with the same column names as
#' the input object
#' ("taxID", "parent_taxID", "rank", "name").
#' @details TBA
#' @author Shaun Wilkinson
#' @references
#' Federhen S (2012) The NCBI Taxonomy database.
#' \emph{Nucleic Acids Research}
#' \strong{40}, D136-D143. doi:10.1093/nar/gkr1178.
#'
#' \url{https://www.ncbi.nlm.nih.gov/taxonomy/}
#' @examples
#' ## remove Odontoceti suborder from cetacean taxonomy database
#' data(whale_taxonomy)
#' prune_taxonomy(whale_taxonomy, taxIDs = 9722, keep = FALSE)
################################################################################
prune_taxonomy <- function(db, taxIDs, keep = TRUE){
### taxIDs can be character or taxIDs
if("tax_id" %in% colnames(db)){
colnames(db)[colnames(db) == "tax_id"] <- "taxID"
colnames(db)[colnames(db) == "parent_tax_id"] <- "parent_taxID"
## for backwards compatibility
}
if(mode(taxIDs) == "character"){
tmp <- db$taxID[match(taxIDs, db$name)]
if(any(is.na(tmp))) stop("Not found in db:", taxIDs[is.na(tmp)], "\n")
taxIDs <- tmp
}else{
if(any(is.na(match(taxIDs, db$taxID)))) {
stop("Taxon IDs not found in db:",
paste0(taxIDs[is.na(taxIDs)], collapse = " "),
"\n")
}
}
taxIDs <- unique(taxIDs)
if(keep){
rowstokeep <- integer(0)
repeat{
indices <- match(taxIDs, db$taxID)
if(any(is.na(indices))) stop("Invalid database format\n")
rowstokeep <- c(rowstokeep, indices)
taxIDs <- unique(db$parent_taxID[indices])
if(all(taxIDs == 0L)) break
taxIDs <- taxIDs[taxIDs != 0]
}
rowstokeep <- sort(unique(rowstokeep))
db <- db[rowstokeep, ]
}else{
rowstodiscard <- match(taxIDs, db$taxID)
repeat{
tmp <- which(db$parent_taxID %in% taxIDs)
if(length(tmp) == 0) break
rowstodiscard <- c(rowstodiscard, tmp)
taxIDs <- db$taxID[tmp]
}
db <- db[-rowstodiscard, ]
}
rownames(db) <- NULL
return(db)
}
################################################################################
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.