
Defines functions PaleobiologyDBOccurrenceQuerier

Documented in PaleobiologyDBOccurrenceQuerier

#' Palaeobiology Database Occurrence Querier
#' @description
#' Given a set of Paleobiology Database taxon number(s) returns occurrence information for those taxa.
#' @param taxon_nos A vector of Paleobiology database taxon number(s) to retrieve from the database. Must contain at least one value.
#' @param original Whether or not to return the original (TRUE) or resolved version (FALSE) of names. Internally TRUE queries identifiers with the \code{var:} prefix and FALSE with the \code{txn:} prefix.
#' @param breaker Size of breaker to use if querying a large number of taxa (reduces load on database of individual queries; default is 100).
#' @param RetainUncertainOccurrences Logical indicating whether or not to retain uncertain (i.e., aff, cf., ?, "") occurrences (defaults to FALSE).
#' @details
#' Uses the Paleobiology Database (\code{paleobiodb.org}) API (Peters and McClennen 2016) to query known taxon numbers and returns information on their occurrence as fossils (where this data is available in the database).
#'
#' Taxon numbers are queried in blocks of \code{breaker}. Each query is retried (with a two second pause between attempts) until the server answers, and the function stops after 100 consecutive failed attempts. The function also stops if the server reports any taxon number as unknown.
#'
#' After the occurrence query a taxon query (\code{PaleobiologyDBDescendantFinder}) is made for the same taxon numbers. Taxa flagged as extant are appended as rows with an age of \code{"Extant"}, and any remaining taxa with no (retained) occurrences are appended as rows with missing (\code{NA}) age and coordinate values.
#' @return
#' A character matrix with rows as occurrences and the columns \code{CollectionNo}, \code{IdentifiedName}, \code{TaxonName}, \code{Age}, \code{MaxMa}, \code{MinMa}, \code{Longitude}, \code{Latitude}, \code{PalaeoLongitude} and \code{PalaeoLatitude}.
#' @author
#' Graeme T. Lloyd \email{graemetlloyd@@gmail.com}
#' @references
#' Peters, S. E. and McClennen, M., 2016. The Paleobiology Database application programming interface. \emph{Paleobiology}, \bold{42}, 1-7.
#' @examples
#' # Occurrence query for Allosaurus fragilis:
#' PaleobiologyDBOccurrenceQuerier(taxon_nos = "52962")
#' @export PaleobiologyDBOccurrenceQuerier
PaleobiologyDBOccurrenceQuerier <- function(taxon_nos, original = TRUE, breaker = 100, RetainUncertainOccurrences = FALSE) {

  # Fail early with a clear message instead of an obscure subsetting error further down:
  if (length(taxon_nos) == 0) stop("taxon_nos must contain at least one taxon number.")
  if (length(breaker) != 1 || is.na(breaker) || breaker < 1) stop("breaker must be a single positive number.")

  # API field names to extract (check API documentation for meaning) and the matching output column names:
  Parameters <- c("cid", "idn", "tna", "oei", "eag", "lag", "lng", "lat", "pln", "pla")
  ColumnNames <- c("CollectionNo", "IdentifiedName", "TaxonName", "Age", "MaxMa", "MinMa", "Longitude", "Latitude", "PalaeoLongitude", "PalaeoLatitude")

  # Zero-row template returned for any chunk that comes back with no records:
  EmptyOutput <- matrix(character(0), nrow = 0, ncol = length(ColumnNames), dimnames = list(NULL, ColumnNames))

  # Subfunction to break N numbers into breaker-sized blocks (last block may be smaller):
  NumberChunker <- function(N, breaker) {

    # Positions 1 to N:
    Positions <- seq_len(N)

    # Group positions by block (unname so block labels never leak into later do.call() argument names):
    unname(split(Positions, (Positions - 1) %/% breaker))

  }

  # Subfunction to test whether a server response is still missing (never fetched, or empty):
  ServerNotReached <- function(response) length(response) == 0 || is.na(response[[1]][1])

  # Subfunction to extract specific parameter from each line of json (NA where the parameter is absent):
  ParameterExtraction <- function(jsonstring, parameterstring) {

    vapply(jsonstring, function(x) {

      # Line does not contain the parameter at all:
      if (length(grep(parameterstring, x)) == 0) return(NA_character_)

      # Take the text after the parameter name up to the next comma and strip the quotes:
      gsub("\"", "", strsplit(strsplit(x, parameterstring)[[1]][2], ",")[[1]][1])

    }, character(1), USE.NAMES = FALSE)

  }

  # Build list of numbers to query:
  NumbersToQuery <- lapply(NumberChunker(N = length(taxon_nos), breaker = breaker), function(x) taxon_nos[x])

  # Identifier prefix (original names use "var:", resolved names use "txn:"):
  IDPrefix <- if (original) "var:" else "txn:"

  # Build HTTP string(s):
  ResolvedHTTPStrings <- lapply(NumbersToQuery, function(x) {
    paste0("https://paleobiodb.org/data1.2/occs/list.json?taxon_id=", paste(paste0(IDPrefix, x), collapse = ","), "&show=coords,paleoloc")
  })

  # Get resolved json strings for each chunk:
  ResolvedJSON <- lapply(ResolvedHTTPStrings, function(x) {

    # Set resolved json to NA (used later to check results are coming back from server):
    resolvedjson <- NA

    # Set start value for counter (used later to avoid infinite loop):
    counter <- 0

    # While server has not been reached:
    while (ServerNotReached(resolvedjson)) {

      # Attempt to acquire resolved json (on failure resolvedjson simply keeps its previous value):
      try(resolvedjson <- readLines(x), silent = TRUE)

      # If server was not reached:
      if (ServerNotReached(resolvedjson)) {

        # Update counter to record how many attempts to reach server have been made:
        counter <- counter + 1

        # If repeatedly failing to get results stop trying:
        if (counter == 100) stop("Server not responding after 100 straight attempts")

        # Wait two seconds before next attempt (also avoids overloading server):
        Sys.sleep(2)

      }

    }

    # Return resolvedjson string:
    resolvedjson

  })

  # Extract data from json data:
  Output <- do.call(rbind, lapply(ResolvedJSON, function(x) {

    # Find any taxon numbers not in database:
    UnknownTaxonHits <- grep("Unknown taxon", x)

    # If found stop and warn user:
    if (length(UnknownTaxonHits) > 0) stop(paste0("The following taxon numbers were not found in the database: ", paste(unlist(lapply(strsplit(x[UnknownTaxonHits], split = "Unknown taxon '|'\""), "[", 2)), collapse = ", ")))

    # Record lines sit between the first line with an opening bracket and the first line with a closing bracket:
    FirstRecord <- grep("\\[", x)[1] + 1
    LastRecord <- grep("\\]", x)[1] - 1

    # No brackets at all means the response is not in the expected format:
    if (is.na(FirstRecord) || is.na(LastRecord)) stop("Could not locate occurrence records in the server response.")

    # Nothing between the brackets means no occurrences (a reversed sequence here would wrongly parse the bracket lines as records):
    if (FirstRecord > LastRecord) return(EmptyOutput)

    # Isolate record lines:
    jsonstring <- x[FirstRecord:LastRecord]

    # Compile output (one column per parameter, dropping the "col:" prefix of collection numbers):
    output <- do.call(cbind, lapply(as.list(Parameters), function(y) gsub("col:", "", ParameterExtraction(jsonstring, parameterstring = paste0("\"", y, "\":")))))

    # Add names:
    colnames(output) <- ColumnNames

    # Return output:
    output

  }))

  # If there are any occurrences without an identified name overwrite the NAs with the taxon name:
  UnidentifiedRows <- which(is.na(Output[, "IdentifiedName"]))
  if (length(UnidentifiedRows) > 0) Output[UnidentifiedRows, "IdentifiedName"] <- Output[UnidentifiedRows, "TaxonName"]

  # If do not want to retain uncertain occurrences:
  if (!RetainUncertainOccurrences) {

    # Identify any uncertain occurrences (literal backslash, "cf.", "aff." or "?" in the identified name):
    UncertainOccurrences <- grep("\\\\|cf\\.|aff\\.|\\?", Output[, "IdentifiedName"])

    # If found, remove these from the output:
    if (length(UncertainOccurrences) > 0) Output <- Output[-UncertainOccurrences, , drop = FALSE]

  }

  # Perform a taxon query (to check for extant and no occurrence taxa):
  TaxonQuery <- PaleobiologyDBDescendantFinder(taxon_nos = taxon_nos)

  # Names of extant taxa (which() drops NA so a missing Extant value cannot add an NA row):
  ExtantTaxa <- TaxonQuery[which(TaxonQuery[, "Extant"] == "1"), "TaxonName"]

  # If extant taxa are found add them to the matrix:
  if (length(ExtantTaxa) > 0) Output <- rbind(Output, do.call(rbind, lapply(as.list(ExtantTaxa), function(x) c("0", x, x, "Extant", "0", "0", NA, NA, NA, NA))))

  # Find any taxa without (definite) occurrences (excludes extant taxa as these were added above):
  NoOccurrenceTaxa <- setdiff(TaxonQuery[, "TaxonName"], unique(Output[, "TaxonName"]))

  # If no occurrence are found add them to the matrix with NAs:
  if (length(NoOccurrenceTaxa) > 0) Output <- rbind(Output, do.call(rbind, lapply(as.list(NoOccurrenceTaxa), function(x) c("0", x, x, NA, NA, NA, NA, NA, NA, NA))))

  # Return output:
  Output

}

