R/lit_search.R
In rgbif: Interface to the Global Biodiversity Information Facility API

Documented in lit_count lit_search

#' Search for literature that cites GBIF mediated data
#' 
#' 
#' @export
#' 
#' @param q (character) Simple full text search parameter. The value for this 
#' parameter can be a simple word or a phrase. Wildcards are not supported.
#' @param countriesOfResearcher (character) Country of institution with which 
#' author is affiliated, e.g. DK (for Denmark). Country codes are 
#' listed in enumeration_country().  
#' @param countriesOfCoverage (character) Country of focus of study, 
#' e.g. BR (for Brazil). Country codes are listed in enumeration_country().  
#' @param literatureType (character) Type of literature ("JOURNAL", 
#' "BOOK_SECTION", "WORKING_PAPER", "REPORT", "GENERIC", "THESIS", "CONFERENCE_PROCEEDINGS", 
#' "WEB_PAGE").
#' @param relevance (character) How the publication relates to GBIF. See details 
#' ("GBIF_USED", "GBIF_MENTIONED", "GBIF_PUBLISHED", "GBIF_CITED", 
#' "GBIF_ACKNOWLEDGED", "GBIF_AUTHOR").
#' @param year (integer) Year of publication.
#' @param topics (character) Topic of publication.
#' @param datasetKey (character) GBIF dataset uuid referenced in publication.
#' @param publishingOrg (character) Publisher uuid whose dataset is 
#' referenced in publication.
#' @param peerReview (logical) Has publication undergone peer-review? 
#' @param openAccess (logical) Is publication Open Access?
#' @param downloadKey (character) Download referenced in publication.
#' @param doi (character) Digital Object Identifier (DOI).
#' @param journalSource (character) Journal of publication.
#' @param journalPublisher (character) Publisher of journal.
#' @param flatten (logical) should any lists in the resulting data be flattened
#' into comma-separated strings?
#' @param limit how many records to return. limit=NULL will fetch up to 10,000. 
#' @param curlopts list of named curl options passed on to HttpClient. 
#' see curl::curl_options for curl options.
#' @param ... additional parameters passed to lit_search
#'
#' @details
#' This function enables you to search for literature indexed by GBIF, 
#' including peer-reviewed papers, citing GBIF datasets and downloads. 
#' The literature API powers the 
#' \href{https://www.gbif.org/resource/search?contentType=literature}{literature search}
#' on GBIF.  
#' 
#' The GBIF Secretariat maintains an ongoing 
#' \href{https://www.gbif.org/literature-tracking}{literature tracking programme}, 
#' which identifies research uses and citations of biodiversity information 
#' accessed through GBIF’s global infrastructure. 
#' 
#' In the literature database, \strong{relevance} refers to how publications relate 
#' to GBIF following these definitions:
#' \itemize{
#' \item GBIF_USED : makes substantive use of data in a quantitative analysis (e.g. ecological niche modelling)
#' \item GBIF_CITED : cites a qualitative fact derived in data (e.g. a given species is found in a given country)
#' \item GBIF_DISCUSSED : discusses GBIF as an infrastructure or the use of data
#' \item GBIF_PRIMARY : GBIF is the primary source of data (no longer applied)
#' \item GBIF_ACKNOWLEDGED : acknowledges GBIF (but doesn't use or cite data)
#' \item GBIF_PUBLISHED : describes or talks about data published to GBIF
#' \item GBIF_AUTHOR : authored by GBIF staff
#' \item GBIF_MENTIONED : unspecifically mentions GBIF or the GBIF portal
#' \item GBIF_FUNDED : funded by GBIF or a GBIF-managed funding programme
#' }
#' 
#' The following arguments can take multiple values:
#' \itemize{
#' \item relevance
#' \item countriesOfResearcher
#' \item countriesOfCoverage
#' \item literatureType
#' \item topics
#' \item datasetKey
#' \item publishingOrg
#' \item downloadKey
#' \item doi
#' \item journalSource
#' \item journalPublisher
#' }
#' 
#' If \code{flatten=TRUE}, then \strong{data} will be returned as flat 
#' data.frame with no complex column types (i.e. no lists or data.frames).
#'  
#' \code{limit=NULL} will return up to 10,000 records. The maximum value for
#' \code{limit} is 10,000. If no filters are used, only the first 1,000 records 
#' will be returned, limit must be explicitly set to \code{limit=10000}, to get 
#' the first 10,000 records in this case.  
#' 
#' \code{lit_count()} is a convenience wrapper, which will return the number of 
#' literature references for a certain \code{lit_search()} query. This is the 
#' same as running \code{lit_search()$meta$count}.
#' 
#' @return
#' A named list with two values: \code{$data} and \code{$meta}. \code{$data} is
#' a \code{data.frame} of literature references.
#'  
#' @examples \dontrun{
#' lit_search(q="bats")$data 
#' lit_search(datasetKey="50c9509d-22c7-4a22-a47d-8c48425ef4a7")
#' lit_search(year=2020)
#' lit_search(year="2011,2020") # year ranges
#' lit_search(relevance=c("GBIF_CITED","GBIF_USED")) # multiple values
#' lit_search(relevance=c("GBIF_USED","GBIF_CITED"), 
#' topics=c("EVOLUTION","PHYLOGENETICS"))
#' lit_count() # total number of literature referencing GBIF
#' lit_count(peerReview=TRUE)
#' # number of citations of iNaturalist 
#' lit_count(datasetKey="50c9509d-22c7-4a22-a47d-8c48425ef4a7")
#' # number of peer-reviewed articles that used GBIF mediated data
#' lit_count(peerReview=TRUE,literatureType="JOURNAL",relevance="GBIF_USED")
#'  
#' # Typically what is meant by "literature that uses GBIF" 
#' lit_search(peerReview=TRUE,literatureType="JOURNAL",relevance="GBIF_USED")
#' lit_count(peerReview=TRUE,literatureType="JOURNAL",relevance="GBIF_USED")
#' }
lit_search <- function(
    q=NULL,
    countriesOfResearcher=NULL,
    countriesOfCoverage=NULL,
    literatureType=NULL,
    relevance=NULL,
    year=NULL,
    topics=NULL,
    datasetKey=NULL,
    publishingOrg=NULL,
    peerReview=NULL,
    openAccess=NULL,
    downloadKey=NULL,
    doi=NULL,
    journalSource=NULL,
    journalPublisher=NULL,
    flatten=TRUE,
    limit=NULL,
    curlopts = list()
) {
  # Overall flow: validate inputs, build the query, ask the API for the total
  # count, work out the page requests needed to honour `limit`, fetch the
  # pages and combine them into list(data=, meta=).
  step <- 1000 # largest page size used for a single request
  max_limit <- 10000 # largest number of records fetched in total

  # check inputs
  # The NULL test comes first and short-circuits with `&&`; all() collapses the
  # validator result to one value so that vectors of keys (these arguments may
  # take multiple values) give `if` a length-one condition.
  if (!is.null(datasetKey) && !isTRUE(all(is_uuid(datasetKey)))) {
    stop("'datasetKey' should be a GBIF dataset uuid.")
  }
  if (!is.null(publishingOrg) && !isTRUE(all(is_uuid(publishingOrg)))) {
    stop("'publishingOrg' should be a GBIF publisher uuid.")
  }
  if (!is.null(downloadKey) && !isTRUE(all(is_download_key(downloadKey)))) {
    stop("'downloadKey' should be a GBIF downloadkey.")
  }

  assert(q,"character")
  assert(countriesOfResearcher,"character")
  assert(countriesOfCoverage,"character")
  assert(literatureType,"character")
  assert(relevance,"character")
  assert(topics,"character")
  assert(peerReview,"logical")
  assert(openAccess,"logical")
  assert(downloadKey,"character")
  assert(doi,"character")
  assert(journalSource,"character")
  assert(journalPublisher,"character")

  # args that take only a single value
  args <- rgbif_compact(list(
    q=q,
    year=year,
    peerReview=peerReview,
    openAccess=openAccess
  ))
  # args that take many values
  # NOTE(review): the convmany() calls are kept exactly as written because the
  # helper may derive the query name from the argument expression - confirm
  # before renaming anything here.
  args <- c(args,
            convmany(relevance),
            convmany(countriesOfResearcher),
            convmany(countriesOfCoverage),
            convmany(literatureType),
            convmany(topics),
            convmany_rename(datasetKey,"gbifDatasetKey"),
            convmany_rename(publishingOrg,"publishingOrganizationKey"),
            convmany_rename(downloadKey,"gbifDownloadKey"),
            convmany(doi),
            convmany_rename(journalSource,"source"),
            convmany_rename(journalPublisher,"publisher")
  )

  # make count to see if any further processing needed
  count_args <- args
  count_args$limit <- 0
  count_limit <- gbif_GET(paste0(gbif_base(), "/literature/search"), count_args, parse=TRUE, curlopts = curlopts, mssg = NULL)$count

  # exit early if there will be no results
  if (count_limit == 0) {
    meta <- list(endOfRecords=TRUE,count=0)
    data <- tibble::tibble()
    return(list(data=data,meta=meta))
  }

  if (is.null(limit)) {
    # if limit not filled in use count_limit
    limit <- count_limit
    if (length(args) == 0) {
      message("No filters used, but 'limit=NULL' returning just the first 1000 results. If you actually just want the first 10,000 records, use 'limit=10000'.")
      limit <- 1000
    }
  } else if (!is.numeric(limit) || length(limit) != 1 || is.na(limit) || limit < 1) {
    # Checked after the zero-result exit so that queries without matches keep
    # returning an empty result whatever `limit` is. Without this check
    # limit=0 fails in the paging arithmetic below (0 %% 0 is NaN) and
    # character limits are compared lexically.
    stop("'limit' should be a single number greater than or equal to 1.")
  }

  # if larger than max value of 10,000
  if (limit > max_limit) {
    message("Not returning all records. Max records is 10,000.")
    limit <- max_limit
  }

  # never request pages beyond the number of matching records
  limit <- min(limit, count_limit)

  # one request per page: pages are `step` records long, except that the last
  # page only asks for what is left; a limit below `step` needs one request
  step <- min(step, limit)
  offset_seq <- seq(from = 0, to = limit - 1, by = step)
  limit_seq <- pmin(step, limit - offset_seq)

  if (length(args) == 0) {
    urls <- paste0(gbif_base(),"/literature/search?",
                   "offset=",offset_seq,
                   "&limit=",limit_seq)
  } else {
    req <- paste0(names(args),"=",args,collapse="&")
    urls <- paste0(gbif_base(),
                   "/literature/search?",req,
                   "&offset=",offset_seq,
                   "&limit=",limit_seq)
  }
  # clean urls: percent-encode, then remove any square brackets
  urls <- vapply(
    urls,
    function(x) gsub("\\[|\\]", "", utils::URLencode(x)),
    character(1)
  )
  # make request
  ll <- gbif_async_get(urls,parse=TRUE)
  data <- process_lit_async_results(ll,flatten=flatten)
  # meta comes from the response to the last page
  meta <- rgbif_compact(ll[[length(urls)]])
  # clean results
  meta$results <- NULL
  meta$offset <- NULL
  meta$limit <- NULL
  list(data=data,meta=meta)
}

#' @export
#' @rdname lit_search
lit_count <- function(...) {
  # Convenience wrapper: forwards the supplied filters to lit_search() and
  # returns only the total number of matching records ($meta$count).
  x <- rgbif_compact(list(...))
  accepted_args <- c("q",
                     "countriesOfResearcher",
                     "countriesOfCoverage",
                     "literatureType",
                     "relevance",
                     "year",
                     "topics",
                     "datasetKey",
                     "publishingOrg",
                     "peerReview",
                     "openAccess",
                     "downloadKey",
                     "doi",
                     "journalSource",
                     "journalPublisher")
  # When every argument is unnamed, names(x) is NULL and the %in% test below
  # would pass vacuously, so the values would be dropped without notice.
  # Treat that case as an invalid call too.
  all_unnamed <- length(x) > 0 && is.null(names(x))
  if (all_unnamed || !all(names(x) %in% accepted_args)) {
    stop(
      paste0(
        "Please use accepted argument from lit_search() :",toString(accepted_args)
      ))
  }

  # Filters are forwarded one by one; anything not supplied is NULL.
  # limit=1 keeps the amount of fetched data minimal.
  count <- lit_search(
    q=x[["q"]],
    countriesOfResearcher=x[["countriesOfResearcher"]],
    countriesOfCoverage=x[["countriesOfCoverage"]],
    literatureType=x[["literatureType"]],
    relevance=x[["relevance"]],
    year=x[["year"]],
    topics=x[["topics"]],
    datasetKey=x[["datasetKey"]],
    publishingOrg=x[["publishingOrg"]],
    peerReview=x[["peerReview"]],
    openAccess=x[["openAccess"]],
    downloadKey=x[["downloadKey"]],
    doi=x[["doi"]],
    journalSource=x[["journalSource"]],
    journalPublisher=x[["journalPublisher"]],
    limit=1)$meta$count
  count
}

# Combine the parsed page responses in `ll` into one table.
#
# ll      : list of parsed responses; each element's `$results` is used.
# flatten : if TRUE, the authors column is collapsed to one string per row and
#           every remaining list column is collapsed with toString().
#
# Returns the row-bound results without the `identifiers`, `abstract` and `x`
# columns.
process_lit_async_results <- function(ll,flatten=TRUE) {
  data_list <- lapply(ll,function(x) x$results)
  # handle complex identifiers: put the fields of `identifiers` next to the
  # other columns, then drop the nested column and the abstract.
  # NOTE(review): the argument must stay named `x`; presumably tibble() names
  # a non-data-frame input after its expression, which is the stray `x`
  # column removed at the end - confirm before renaming.
  data_list <- lapply(data_list,function(x) {
    page <- tibble::tibble(x,x$identifiers)
    page$identifiers <- NULL
    page$abstract <- NULL
    page
  })
  data <- bind_rows(data_list)
  if (flatten) {
    # authors: one "first last" entry per author, joined by commas
    if ("authors" %in% names(data)) {
      data$authors <- vapply(
        data$authors,
        function(a) paste0(a$firstName," ",a$lastName,collapse=","),
        character(1)
      )
    }
    # flatten other columns; vapply keeps the result a character vector even
    # when a column is empty
    data <- tibble::as_tibble(lapply(data,function(col) {
      if (is.list(col)) vapply(col,toString,character(1)) else col
    }))
  }
  data$x <- NULL
  data
}