#' @title Find taxon names using Global Names Recognition and Discovery
#'
#' @description Uses the Global Names Recognition and Discovery service, see
#' http://gnrd.globalnames.org/
#'
#' NOTE: This function sometimes gives data back and sometimes not. The API
#' that this function is using is extremely buggy.
#'
#' @export
#'
#' @param url (character) If text parameter is empty, and `url` is given,
#' GNfinder will process the URL and will find names in the content of its
#' body.
#' @param text (character) Contains the text which will be checked for
#' scientific names. If this parameter is not empty, the `url` parameter is
#' ignored.
#' @param format (character) Sets the output format. It can be set to: `"csv"`
#' (the default), `"tsv"`, or `"json"`.
#' @param bytes_offset (logical) This changes how the position of a detected
#' name in text is calculated. Normally a name's start and end positions are
#' given as the number of UTF-8 characters from the beginning of the text. If
#' this is `TRUE`, the start and end offsets are recalculated in the number of
#' bytes.
#' @param return_content (logical) If this is `TRUE`, the text used for the name
#' detection is returned back. This is especially useful if the input was not
#' a plain UTF-8 text and had to be prepared for name-finding. Then the
#' returned content can be used together with start and end fields of detected
#' name-strings to locate the strings in the text.
#' @param unique_names (logical) If this is `TRUE`, the output returns a list of
#' unique names, instead of a list of all name occurrences. Unique list of
#' names does not provide position information of a name in the text.
#' @param ambiguous_names (logical) If this is `TRUE`, strings which are
#' simultaneously scientific names and "normal" words are not filtered out
#' from the results. For example, generic names like America, Cancer,
#' Cafeteria will be returned in the results.
#' @param no_bayes (logical) If this is `TRUE`, only heuristic algorithms are
#' used for name detection.
#' @param odds_details (logical) If `TRUE`, the result will contain odds of all
#' features used for calculation of NaiveBayes odds. Odds describe probability
#' of a name to be 'real'. The higher the odds, the higher the probability
#' that a detected name is not a false positive. Odds are calculated by
#' multiplication of the odds of separate features. Odds details explain how
#' the final odds value is calculated.
#' @param language (character) The language of the text. Language value is used
#' for calculation of Bayesian odds. Currently only English and German
#' languages are supported. Valid values are: `"eng"`, `"deu"`, and
#' `"detect"`. This function sends `"detect"` by default; if no language is
#' given to the service at all, the service uses `"eng"`.
#' @param words_around (integer) Allows to see the context surrounding a
#' name-string. This sets the number of words located immediately before or
#' after a detected name. These words are then returned in the output. Default
#' is 0, maximum value is 5.
#' @param verification (logical) When this is `TRUE` (the default), there is an
#' additional verification step for detected names. This step requires
#' internet connection and uses https://verifier.globalnames.org/api/v1 for
#' verification queries.
#' @param sources (numeric or character vector) Ids of the data sources to
#' resolve found names against. Multiple ids are joined with a pipe (`|`)
#' before being sent to the service. See list of Data Sources
#' http://resolver.globalnames.org/data_sources
#' @param all_matches When this option is true all found results are returned,
#' not only the bestResults. The bestResult field in this case is null, and
#' results field should contain found results of the matches.
#' @param ... Curl options passed on to [crul::HttpClient]; the request itself
#' is sent with POST (see [crul::verb-POST]).
#' @param detect_language Defunct. See the `language` option.
#' @param data_source_ids Defunct. See the `sources` option.
#' @param file Defunct. If you feel this is important functionality submit an
#' issue at "https://github.com/ropensci/taxize"
#' @param unique Defunct. See the `unique_names` option.
#' @param engine Defunct. The API used no longer supports this option.
#'
#' @author Scott Chamberlain, Zachary Foster
#'
#' @return A [tibble::tibble()] or list representing parsed JSON output
#' depending on the value of the `format` option. If the server responds with
#' HTTP status 500, a warning is issued and `NULL` is returned.
#' @examples \dontrun{
#' # Get data from a website using its URL
#' scrapenames('https://en.wikipedia.org/wiki/Spider')
#' scrapenames('https://en.wikipedia.org/wiki/Animal')
#' scrapenames('https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0095068')
#' scrapenames('https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0080498')
#'
#' scrapenames(url = 'https://en.wikipedia.org/wiki/Spider', sources = c(1, 169))
#'
#' # Get data from text string
#' scrapenames(text='A spider named Pardosa moesta Banks, 1892')
#'
#' # return OCR content
#' scrapenames(text='A spider named Pardosa moesta Banks, 1892',
#' return_content = TRUE, format = 'json')
#' }
scrapenames <- function(
  url = NULL,
  text = NULL,
  format = 'csv',
  bytes_offset = FALSE,
  return_content = FALSE,
  unique_names = TRUE,
  ambiguous_names = FALSE,
  no_bayes = FALSE,
  odds_details = FALSE,
  language = 'detect',
  words_around = 0,
  verification = TRUE,
  sources = NULL,
  all_matches = FALSE,
  ...,
  file = NULL,
  unique = NULL,
  engine = NULL,
  detect_language = NULL,
  data_source_ids = NULL
) {
  # Error if defunct parameters are used. Every argument that is documented as
  # defunct is rejected explicitly, so a caller relying on one of them gets a
  # clear error instead of having the value silently ignored.
  if (!is.null(file)) {
    stop(
      call. = FALSE,
      'The `file` option is defunct. If you feel this is important ',
      'functionality submit an issue at "https://github.com/ropensci/taxize"'
    )
  }
  if (!is.null(unique)) {
    stop(
      call. = FALSE,
      "The `unique` option is defunct. See the `unique_names` option. "
    )
  }
  if (!is.null(engine)) {
    stop(
      call. = FALSE,
      "The `engine` option is defunct. ",
      "The API no longer supports this option. "
    )
  }
  if (!is.null(detect_language)) {
    stop(
      call. = FALSE,
      "The `detect_language` option is defunct. See the `language` option. "
    )
  }
  if (!is.null(data_source_ids)) {
    stop(
      call. = FALSE,
      "The `data_source_ids` option is defunct. See the `sources` option. "
    )
  }

  # Validate parameters
  # The service needs something to search in: either raw text or a URL.
  if (is.null(url) && is.null(text)) {
    stop(call. = FALSE, "One of the `url` or `text` options must be given.")
  }
  # `format` must be a single string; checking type and length first keeps
  # `if` from failing with an unrelated error on NULL or vector input.
  valid_formats <- c("csv", "tsv", "json")
  if (!is.character(format) || length(format) != 1L ||
      !format %in% valid_formats) {
    stop(
      call. = FALSE,
      'The `format` option must be "csv", "tsv", or "json". "',
      toString(format), '" was the value given'
    )
  }

  # Make query
  # paste0(NULL, collapse = "|") yields "", which would send an empty
  # `sources` field; keep NULL so the entry can be dropped from the query.
  source_ids <- if (is.null(sources)) {
    NULL
  } else {
    paste0(sources, collapse = "|")
  }
  base <- "http://gnrd.globalnames.org/api/v1/find"
  # NOTE(review): `tc()` is defined elsewhere in the package; it is presumed
  # to drop NULL entries (text/url/sources may be NULL here) -- confirm.
  args <- tc(list(
    text = text,
    url = url,
    format = format,
    bytesOffset = bytes_offset,
    returnContent = return_content,
    uniqueNames = unique_names,
    ambiguousNames = ambiguous_names,
    noBayes = no_bayes,
    oddsDetails = odds_details,
    language = language,
    wordsAround = words_around,
    verification = verification,
    sources = source_ids,
    allMatches = all_matches
  ))
  # `tx_ual` (request headers) is defined elsewhere in the package. Arguments
  # given through `...` are handed to the HTTP client as curl options.
  cli <- crul::HttpClient$new(base, headers = tx_ual, opts = list(...))
  response <- cli$post(body = args, encode = "form")

  # Check for errors
  # An internal server error is reported as a warning and a NULL result.
  if (response$status_code == 500L) {
    warning(
      call. = FALSE,
      "The GNR server has encountered an internal error trying to process ",
      "this request."
    )
    return(NULL)
  }
  # Any other HTTP error is raised here rather than letting the error body be
  # parsed as if it were CSV/TSV/JSON results.
  response$raise_for_status()

  # Parse and return results
  raw_output <- response$parse("UTF-8")
  switch(
    format,
    csv = tibble::as_tibble(utils::read.csv(text = raw_output)),
    tsv = tibble::as_tibble(utils::read.csv(text = raw_output, sep = "\t")),
    json = jsonlite::fromJSON(raw_output),
    # Unnamed last argument is the switch() default; unreachable after the
    # validation above, kept as a defensive guard.
    stop("Invalid 'format' option.")
  )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.