R/tnrs.R
In rotl: Interface to the 'Open Tree of Life' API

Documented in tnrs_contexts tnrs_infer_context tnrs_match_names

##' Match taxonomic names to the Open Tree Taxonomy.
##'
##' Accepts one or more taxonomic names and returns information about
##' potential matches for these names to known taxa in the Open Tree
##' Taxonomy.
##'
##' This service uses taxonomic contexts to disambiguate homonyms and misspelled
##' names; a context may be specified using the \code{context_name} argument.
##' The default value for \code{context_name} is "All life". If no context is
##' specified (i.e., \code{context_name} is set to \code{NULL}), then the
##' context will be inferred (i.e., the shallowest taxonomic context that
##' contains all unambiguous names in the input). Taxonomic contexts are
##' uncontested higher taxa that have been selected to allow limits to be
##' applied to the scope of TNRS searches (e.g. 'match names only within
##' flowering plants'). Once a context has been identified (either
##' user-specified or inferred), all taxon name matches will performed only
##' against taxa within that context. For a list of available taxonomic
##' contexts, see \code{\link{tnrs_contexts}}.
##'
##' A name is considered unambiguous if it is not a synonym and has
##' only one exact match to any taxon name in the entire taxonomy.
##'
##' When the name search returns multiple matches, the taxon with the highest
##' match score is returned. If the name returned is not the one you intended,
##' you can use the \code{inspect} function to check the other taxa returned by
##' your search. The
##' \href{https://docs.ropensci.org/rotl/articles/rotl.html#how-to-change-the-ott-ids-assigned-to-my-taxa}{Getting
##' Started vignette} has more information on how to do this.
##'
##' Several functions listed in the \sQuote{See also} section can be
##' used to inspect and manipulate the object generated by this
##' function.
##'
##'
##' @title Match names to the Open Tree Taxonomy
##' @param names taxon names to be queried. Currently limited to 10,000 names
##'   for exact matches and 2,500 names for approximate matches (character
##'   vector)
##' @param context_name name of the taxonomic context to be searched (length-one
##'   character vector or \code{NULL}). Must match (case sensitive) one of the
##'   values returned by \code{\link{tnrs_contexts}}. Default to "All life".
##' @param do_approximate_matching A logical indicating whether or not to
##'   perform approximate string (a.k.a. \dQuote{fuzzy}) matching. Using
##'   \code{FALSE} will greatly improve speed. Default, however, is \code{TRUE}.
##' @param ids A vector of ids to use for identifying names. These will be
##'   assigned to each name in the names array. If ids is provided, then ids and
##'   names must be identical in length.
##' @param include_suppressed Ordinarily, some quasi-taxa, such as incertae
##'   sedis buckets and other non-OTUs, are suppressed from TNRS results. If
##'   this parameter is true, these quasi-taxa are allowed as possible TNRS
##'   results.
##' @param ... additional arguments to customize the API request (see
##'   \code{\link{rotl}} package documentation).
##' @return A data frame summarizing the results of the query. The original
##'   query output is appended as an attribute to the returned object (and can
##'   be obtained using \code{attr(object, "original_response")}).
##' @seealso \code{\link{inspect.match_names}},
##'   \code{\link{update.match_names}}, \code{\link{synonyms.match_names}}.
##' @examples \dontrun{
##'  deuterostomes <- tnrs_match_names(names=c("echinodermata", "xenacoelomorpha",
##'                                             "chordata", "hemichordata"))
##' }
##' @importFrom stats setNames
##' @export
tnrs_match_names <- function(names = NULL, context_name = "All life",
                             do_approximate_matching = TRUE, ids = NULL,
                             include_suppressed = FALSE, ...) {
  if (!is.null(context_name) &&
    !context_name %in% unlist(tnrs_contexts(...))) {
    stop(
      "The ", sQuote("context_name"),
      " is not valid. Check possible values using tnrs_contexts()"
    )
  }

  ## take care of duplicated names
  if (any(duplicated(tolower(names)))) {
    names <- tolower(names)
    warning("Some names were duplicated: ",
      paste(sQuote(names[duplicated(names)]), collapse = ", "), ".",
      call. = FALSE
    )
    names <- unique(names)
  }

  res <- .tnrs_match_names(
    names = names, context_name = context_name,
    do_approximate_matching = do_approximate_matching,
    ids = ids, include_suppressed = include_suppressed,
    ...
  )

  check_tnrs(res)

  match_ids <- highest_match_score(res)

  if (!identical(length(res[["results"]]), length(match_ids))) {
    stop(
      "The number of matches and the number of 'results' elements should",
      " be the same."
    )
  }

  summary_match <- mapply(
    function(rid, mid) {
      build_summary_match(
        res,
        res_id = rid,
        match_id = mid,
        initial_creation = TRUE
      )
    },
    seq_along(res[["results"]]),
    match_ids,
    SIMPLIFY = FALSE
  )

  ## add taxon names with no maches
  summary_match <- do.call("rbind", summary_match)
  summary_match <- as.data.frame(summary_match, stringsAsFactors = FALSE)

  summary_match$search_string <- gsub("\\\\", "", summary_match$search_string)

  ## reorder to match original query
  ordr <- match(tolower(names), tolower(summary_match$search_string))
  stopifnot(identical(length(match_ids), length(ordr)))

  summary_match <- summary_match[ordr, ]
  match_ids <- match_ids[ordr]

  summary_match[["approximate_match"]] <-
    convert_to_logical(summary_match[["approximate_match"]])
  summary_match[["is_synonym"]] <-
    convert_to_logical(summary_match[["is_synonym"]])
  summary_match[["flags"]] <- convert_to_logical(summary_match[["flags"]])

  has_original_match <- !is.na(summary_match[["number_matches"]])

  json_coords <- data.frame(
    search_string = names,
    original_order = as.numeric(rownames(summary_match)),
    match_id = match_ids,
    has_original_match = has_original_match,
    row.names = seq_along(names),
    stringsAsFactors = FALSE
  )

  attr(summary_match, "original_order") <- as.numeric(rownames(summary_match))
  attr(summary_match, "original_response") <- res
  attr(summary_match, "match_id") <- match_ids
  attr(summary_match, "has_original_match") <- has_original_match
  attr(summary_match, "json_coords") <- json_coords

  class(summary_match) <- c("match_names", "data.frame")
  rownames(summary_match) <- NULL
  summary_match
}

##' @importFrom stats na.omit
convert_to_logical <- function(x) {
  if (all(stats::na.omit(x) %in% c("TRUE", "FALSE"))) {
    x <- as.logical(x)
  } else {
    x
  }
}

check_tnrs <- function(req) {
  no_match <- req[["unmatched_names"]]

  if (any(vapply(no_match, length, integer(1)) > 0)) {
    warning(
      paste(unlist(no_match), collapse = ", "), " are not matched",
      call. = FALSE
    )
  }
}


tnrs_columns <- list(
  "search_string" = function(x) x[["search_string"]],
  "unique_name" = function(x) .tax_unique_name(x[["taxon"]]),
  "approximate_match" = function(x) x[["is_approximate_match"]],
  "score" = function(x) x[["score"]],
  "ott_id" = function(x) .tax_ott_id(x[["taxon"]]),
  "is_synonym" = function(x) x[["is_synonym"]],
  "flags" = function(x) paste(.tax_flags(x[["taxon"]]), collapse = ", ")
)

summary_row_factory <- function(res, res_id, match_id, columns = tnrs_columns) {
  res_address <- res[["results"]][[res_id]][["matches"]][[match_id]]
  ret <- sapply(columns, function(f) f(res_address))
  n_match <- length(res[["results"]][[res_id]][["matches"]])
  c(ret, number_matches = n_match)
}

build_summary_match <- function(res, res_id, match_id = NULL, initial_creation) {
  if (length(res_id) > 1 &&
    (!is.null(match_id) && length(match_id) > 1)) {
    stop("Something is wrong. Please contact us.")
  }

  build_summary_row <- function(rid) {
    if (is.null(match_id)) {
      match_id <- seq_len(length(res[["results"]][[rid]][["matches"]]))
    }
    if (identical(length(match_id), 0L) ||
      is.null(res[["results"]][[rid]][["matches"]][match_id][[1]])) {
      return(build_empty_row(tolower(res[["results"]][[rid]][["name"]])))
    }
    res <- lapply(match_id, function(mid) {
      summary_row_factory(res, rid, mid)
    })
    if (identical(length(match_id), 1L)) {
      unlist(res)
    } else {
      res
    }
  }

  summary_row <- lapply(res_id, build_summary_row)

  if (identical(length(res_id), 1L)) {
    summary_row <- unlist(summary_row, recursive = FALSE)
  }

  ## Needed if only 1 row returned
  if (!inherits(summary_row, "list")) {
    summary_row <- list(summary_row)
  }

  summary_match <- do.call("rbind", summary_row)
  summary_match <- data.frame(summary_match, stringsAsFactors = FALSE)
  names(summary_match) <- c(names(tnrs_columns), "number_matches")
  clean_tnrs_summary(summary_match)
}

##' @importFrom stats setNames
build_empty_row <- function(x) {
  no_match_row <- stats::setNames(
    rep(NA, length(tnrs_columns) + 1),
    c(names(tnrs_columns), "number_matches")
  )
  no_match_row[1] <- x
  no_match_row
}

lowest_ott_id <- function(rsp) {
  vapply(seq_along(rsp[["results"]]), function(x) {
    .r <- build_summary_match(
      res = rsp, res_id = x, match_id = NULL,
      initial_creation = TRUE
    )

    .r <- .r[(!as.logical(.r[["is_synonym"]])) &
      !is.na(.r[["flags"]]) &
      .r[["flags"]] == "", ]

    if (nrow(.r) > 0) {
      which.min(.r[["ott_id"]])
    } else {
      1L
    }
  }, integer(1))
}

highest_match_score <- function(rsp) {
  vapply(seq_along(rsp[["results"]]), function(x) {
    .r <- build_summary_match(
      res = rsp, res_id = x, match_id = NULL,
      initial_creation = TRUE
    )

    .r <- .r[(!as.logical(.r[["is_synonym"]])) &
               !is.na(.r[["flags"]]) &
               .r[["flags"]] == "", ]

    if (nrow(.r) > 0) {
      which.max(.r[["score"]])
    } else {
      1L
    }
  }, integer(1))
}



clean_tnrs_summary <- function(summary_match) {
  summary_match[["approximate_match"]] <-
    convert_to_logical(summary_match[["approximate_match"]])
  summary_match[["score"]] <- as.numeric(summary_match[["score"]])
  summary_match[["is_synonym"]] <-
    convert_to_logical(summary_match[["is_synonym"]])
  summary_match[["flags"]] <- convert_to_logical(summary_match[["flags"]])
  summary_match[["ott_id"]] <- as.integer(summary_match[["ott_id"]])
  summary_match[["number_matches"]] <-
    as.integer(summary_match[["number_matches"]])
  summary_match
}

##' This function returns a list of pre-defined taxonomic contexts
##' (i.e. clades) which can be used to limit the scope of tnrs
##' queries.
##'
##' Taxonomic contexts are available to limit the scope of TNRS
##' searches. These contexts correspond to uncontested higher taxa
##' such as 'Animals' or 'Land plants'. This service returns a list
##' containing all available taxonomic context names, which may be
##' used as input (via the \code{context_name} argument in other
##' functions) to limit the search scope of other services including
##' \code{\link{tnrs_match_names}}.
##' @title TNRS contexts
##' @param ...  additional arguments to customize the API request (see
##'     \code{\link{rotl}} package documentation).
##' @return Returns invisibly a list for each major clades (e.g.,
##'     animals, microbes, plants, fungi, life) whose elements
##'     contains the possible contexts.
##' @export

tnrs_contexts <- function(...) {
  res <- .tnrs_contexts(...)
  class(res) <- "tnrs_contexts"
  res
}

##' @export
print.tnrs_contexts <- function(x, ...) {
  cat("Possible contexts:\n")
  lapply(x, function(t) {
    res <- unlist(t)
    cat("  ", res[1], "\n")
    if (length(res) > 1) {
      lapply(seq(2, length(res), by = 5), function(l) {
        m <- ifelse(l + 5 <= length(res), l + 4, length(res))
        cat("     ", paste(res[l:m], collapse = ", "), "\n")
      })
    }
  })
}

##' Return a taxonomic context given a list of taxonomic names
##'
##' Find the least inclusive taxonomic context that includes all the
##' unambiguous names in the input set. Unambiguous names are names
##' with exact matches to non-homonym taxa. Ambiguous names (those
##' without exact matches to non-homonym taxa) are indicated in
##' results.
##'
##' @title Infer the taxonomic context from a list of names
##' @param names Vector of taxon names.
##' @param ...  additional arguments to customize the API request (see
##'     \code{\link{rotl}} package documentation).
##' @return A list including the context name, the context ott id and
##'     possibly the names in the query that have an ambiguous
##'     taxonomic meaning in the query.
##' @examples
##' \dontrun{
                                ##' res <- tnrs_infer_context(names=c("Stellula calliope", "Struthio camelus"))
                                ##' }
                                ##' @export
                                tnrs_infer_context <- function(names = NULL, ...) {
                                  res <- .tnrs_infer_context(names = names, ...)
                                  return(res)
                                }