R/wcvp.R

Defines functions wcvp_download_url_ wcvp_search_url_ wcvp_taxon_url_ download_wcvp lookup_wcvp search_wcvp

Documented in download_wcvp lookup_wcvp search_wcvp

#' Search WCVP for a taxon.
#'
#' Query the World Checklist of Vascular Plants search API
#' for a taxon string.
#'
#' The [World Checklist of Vascular Plants (WCVP)](https://wcvp.science.kew.org/)
#' is a global consensus view of all known vascular plant species.
#' It has been compiled by staff at RBG Kew in consultation with plant
#' group experts.
#'
#' The search API allows users to query the checklist for plant names.
#' Currently, it does not support partial or fuzzy matching.
#' In order to get a result, the user must supply a valid name string.
#' For example, 'Myrcia' and 'Myrcia guianensis' will return results,
#' but 'M' or 'Myr' will not.
#'
#' There is some support for querying using keyword arguments. The API is
#' not currently documented, so only keywords that are definitely there have
#' been implemented. Use the `get_keywords` function to view a list of all implemented keywords.
#'
#' The API will return taxonomic information (the family, authority, status, and rank)
#' of all names matching the query. These results can be limited, for example to accepted species,
#' using filters. Use the `get_filters` function to view a list of all implemented filters.
#'
#' @param query The taxon string to search WCVP for. If using keywords,
#'  the query must be formatted as a list.
#' @param filters Filter to apply to search results.
#'  Multiple filters must be supplied as a character vector.
#' @param cursor A cursor returned by a previous search.
#'  If used, the query and filter must be exactly the same.
#' @param limit An integer specifying the maximum number of results
#'  to return.
#' @param .wait Time to wait before making a request, to help
#'  rate limiting.
#'
#' @return Returns an object of class `wcvp_search` that is a simple
#' structure with slots for:
#'
#'  * `total`: the total number of results held in WCVP for the query
#'  * `pages`: the total number of pages of results, calculated from
#'    `total` and `limit`.
#'  * `cursor`: a cursor to retrieve the next page of results from the API.
#'  * `limit`: the maximum number of results requested from the API.
#'  * `results`: the query results parsed into a list.
#'  * `query`: the query string submitted to the API.
#'  * `filters`: the filter strings submitted to the API.
#'  * `response`: the [httr response object][httr::response].
#'
#' @examples
#' # search for all entries containing a genus name
#' search_wcvp("Myrcia")
#'
#' # search for all accepted species within a genus
#' search_wcvp("Myrcia", filters=c("species", "accepted"))
#'
#' # search for up to 10,000 species in a genus
#' search_wcvp("Poa", filters=c("species"), limit=10000)
#'
#' # search for all names in a family
#' search_wcvp(list(family="Myrtaceae"))
#'
#' # search for genera within a family
#' search_wcvp(list(family="Myrtaceae"), filters=c("genera"))
#'
#' # search for all names with a specific epithet
#' search_wcvp(list(species="guianensis"))
#'
#' # search for a species name and print the results
#' r <- search_wcvp("Myrcia guianensis", filters=c("species"))
#' print(r)
#'
#' # simplify search results to a `tibble`
#' r <- search_wcvp("Poa", filters=c("species"))
#' tidy(r)
#'
#' # accepted name info is nested inside the records for synonyms
#' # simplify accepted name info to the name ID
#' r <- search_wcvp("Poa", filters=c("species"))
#' tidied <- tidy(r)
#' tidyr::unnest(tidied, cols=synonymOf, names_sep="_")
#'
#' @references
#' WCVP (2020). World Checklist of Vascular Plants, version 2.0. Facilitated by the Royal Botanic Gardens, Kew. Published on the Internet; http://wcvp.science.kew.org/
#'
#' @family WCVP functions
#' @seealso
#'  * [lookup_wcvp()] to look up information about a taxon name
#'   using a valid IPNI ID.
#'  * [download_wcvp()] to download the entire WCVP.
#'
#' @export
search_wcvp <- function(query, filters=NULL, cursor="*", limit=50, .wait=0.1) {
  endpoint <- wcvp_search_url_()

  # Build the request parameters in a separate object so the caller's
  # `query` can be echoed back unchanged in the result.
  params <- format_query_(query, "wcvp")
  params$limit <- limit
  params$cursor <- cursor
  params$f <- format_filters_(filters, "wcvp")

  reply <- make_request_(endpoint, params, .wait=.wait)
  body <- reply$content

  # The number of pages isn't returned, so derive it from the total number
  # of results and the page size reported in the response content.
  n_pages <- ceiling(body$total / body$limit)

  output <- list(
    total=body$total,
    pages=n_pages,
    cursor=body$cursor,
    limit=body$limit,
    results=body$results,
    query=query,
    filters=filters,
    response=reply$response
  )
  class(output) <- c("wcvp_search", "wcvp")

  output
}

#' Look up a taxon in WCVP.
#'
#' Request the record for a taxon in the World Checklist of
#' Vascular Plants (WCVP) using the IPNI ID.
#'
#' The [World Checklist of Vascular Plants (WCVP)](https://wcvp.science.kew.org/)
#' is a global consensus view of all known vascular plant species.
#' It has been compiled by staff at RBG Kew in consultation with plant
#' group experts.
#'
#' The taxon lookup API allows users to retrieve taxonomic information for
#' a specific taxon name using the unique IPNI ID. If this is not known,
#' it can be found out using the [WCVP search API][kewr::search_wcvp].
#'
#' @param taxonid A string containing a valid IPNI ID.
#' @param .wait Time to wait before making a request, to help
#'  rate limiting.
#'
#' @return A `wcvp_taxon` object, which is a simple structure with fields
#'   for each of the fields returned by the lookup API, as well as the [httr response object][httr::response].
#'
#' @examples
#'
#' # retrieve taxonomic information for a taxon name
#' lookup_wcvp("271445-2")
#'
#' # print a summary of the returned information
#' r <- lookup_wcvp("271445-2")
#' print(r)
#'
#' # tidy into a tibble
#' r <- lookup_wcvp("271445-2")
#' tidy(r)
#'
#' # tidy the returned list of synonyms into a tibble
#' r <- lookup_wcvp("60447743-2")
#' tidied <- tidy(r)
#' tidyr::unnest(tidied, cols=synonyms, names_sep="_")
#'
#' # expand the child entries returned for each entry
#' r <- lookup_wcvp("30000055-2")
#' tidied <- tidy(r)
#' tidyr::unnest(tidied, cols=children, names_sep="_")
#'
#' @family WCVP functions
#' @seealso
#'  * [search_wcvp()] to search WCVP using a taxon name.
#'  * [download_wcvp()] to download the entire WCVP.
#'
#' @references
#' WCVP (2020). World Checklist of Vascular Plants, version 2.0. Facilitated by the Royal Botanic Gardens, Kew. Published on the Internet; http://wcvp.science.kew.org/
#'
#' @export
lookup_wcvp <- function(taxonid, .wait=0.1) {
  url <- wcvp_taxon_url_(taxonid)

  result <- make_request_(url, query=NULL, .wait=.wait)

  # The record is the response content plus the raw response and the ID
  # that was requested.
  # this might be better if things were explicitly listed
  record <- result$content
  record$response <- result$response
  record$queryId <- taxonid

  # Fill in status if unplaced. A plain `if` is used rather than `ifelse()`:
  # the test is a scalar, and `ifelse()` would truncate the value to its
  # first element and drop its attributes.
  if (is.null(record$status)) {
    record$status <- "unplaced"
  }

  # Make sure the author string is never NULL, so the field always exists.
  if (is.null(record$authors)) {
    record$authors <- NA_character_
  }

  structure(
    record,
    class=c("wcvp_taxon", "wcvp")
  )
}

#' Download the whole of the WCVP.
#'
#' Download the latest or a specific version of the World
#' Checklist of Vascular Plants (WCVP).
#'
#' The [World Checklist of Vascular Plants (WCVP)](https://wcvp.science.kew.org/)
#' is a global consensus view of all known vascular plant species.
#' It has been compiled by staff at RBG Kew in consultation with plant
#' group experts.
#'
#' Versioned downloads of the whole WCVP are provided on the website.
#' This function allows the user to download the latest or a specific
#' version of the WCVP.
#'
#' @param save_dir A string specifying the folder to save the download in. If
#'   no value is provided, \link[here]{here} will be used.
#' @param version An integer version number to download. The latest
#'   version will be downloaded by default.
#'
#' @examples
#' \dontrun{
#'  # download the latest version
#'  download_wcvp()
#'
#'  # download version 1
#'  download_wcvp(version=1)
#' }
#'
#' @family WCVP functions
#' @seealso
#'  * [lookup_wcvp()] to look up information about a taxon name
#'   using a valid IPNI ID.
#'  * [search_wcvp()] to search WCVP using a taxon name.
#'
#' @references
#' WCVP (2020). World Checklist of Vascular Plants, version 2.0. Facilitated by the Royal Botanic Gardens, Kew. Published on the Internet; http://wcvp.science.kew.org/
#'
#' @importFrom here here
#' @importFrom glue glue
#' @importFrom stringr str_extract
#' @importFrom utils download.file
#'
#' @export
download_wcvp <- function(save_dir=NULL, version=NULL) {
  # Default to the project root reported by `here()`.
  if (is.null(save_dir)) {
    save_dir <- here()
  }

  download_link <- wcvp_download_url_(version)
  # The file name is the trailing `wcvp*.zip` component of the link.
  filename <- str_extract(download_link, "(?<=/)wcvp.+\\.zip$")
  save_path <- file.path(save_dir, filename)

  # Only used for the status message from here on.
  if (is.null(version)) {
    version <- "latest"
  }

  # Create the destination folder if needed, so the download does not fail
  # just because the folder is missing.
  if (!dir.exists(save_dir)) {
    dir.create(save_dir, recursive=TRUE)
  }

  # Report progress as a message (not `cat()`) so callers can silence it
  # with `suppressMessages()`; `message()` appends the newline itself.
  status <- glue("Downloading WCVP version {version}",
                 "to: {save_path}",
                 .sep=" ", .trim=FALSE)
  message(status)

  # The download is a zip archive, so request binary mode explicitly.
  download.file(download_link, save_path, mode="wb")

  invisible()
}

#' Make the WCVP taxon lookup URL.
#'
#' @param taxonid A valid IPNI ID.
#'
#' @noRd
#'
#' @importFrom glue glue
wcvp_taxon_url_ <- function(taxonid) {
  # `base` is handed to glue as a named value for the template; `taxonid`
  # is picked up from this function's environment.
  glue("{base}/taxon/{taxonid}", base=get_url_("wcvp"))
}

#' Make the WCVP search URL.
#'
#' @noRd
wcvp_search_url_ <- function() {
  # Append the search path to the WCVP base URL.
  paste0(get_url_("wcvp"), "/search")
}

#' Get a WCVP download URL.
#'
#' @importFrom httr GET
#' @importFrom rvest html_nodes html_attr
#' @importFrom stringr str_detect str_extract
#' @importFrom glue glue
#'
#' @noRd
wcvp_download_url_ <- function(version=NULL) {
  base <- "http://sftp.kew.org/pub/data-repositories/WCVP/"
  response <- GET(base)
  # Fail with the HTTP status instead of trying to parse an error page.
  httr::stop_for_status(response)

  page <- httr::content(response)
  link_nodes <- html_nodes(page, "a")
  links <- html_attr(link_nodes, "href")
  # Anchors without an href come back as NA; drop them before matching.
  links <- links[!is.na(links)]

  download_links <- links[str_detect(links, "\\.zip$")]

  # Parse the version tag as an integer so versions compare numerically
  # (as strings, "9" would sort above "10"). Archives without a `_v<number>`
  # tag are ignored, otherwise their NA version would poison `max()` and
  # `%in%` below.
  versions <- as.integer(str_extract(download_links, "(?<=_v)\\d+"))
  is_versioned <- !is.na(versions)
  download_links <- download_links[is_versioned]
  versions <- versions[is_versioned]

  if (length(versions) == 0) {
    stop("No versioned WCVP downloads found at: ", base, call.=FALSE)
  }

  if (is.null(version)) {
    version <- max(versions)
  }

  if (! version %in% versions) {
    msg <- glue("Not a recognised version of WCVP: {version}",
                "Available versions: {paste0(versions, collapse=',')}",
                "",
                .sep="\n", .trim=FALSE)

    stop(msg, call.=FALSE)
  }

  # Select on the parsed version number rather than a substring match, so
  # that e.g. version 1 does not also pick up `_v10`.
  download_link <- download_links[versions == version]
  paste0(base, download_link)
}
barnabywalker/kewr documentation built on July 5, 2022, 5:37 p.m.