# RecordAPIDemo.R
# In ridigbio: Interface to the iDigBio Data API

## ----message=FALSE------------------------------------------------------------
# Load core libraries; install these packages if you have not already
library(ridigbio)
library(tidyverse)

# Load library for making nice HTML output
library(kableExtra)

## ----echo = FALSE-------------------------------------------------------------
# Pre-flight check: run one small query against the iDigBio API and record
# whether it succeeded. `verify_records_1A` is the `eval` flag of every later
# chunk, so it must end up as a single logical value (TRUE = run the examples).
verify_records_1A <- tryCatch({
  idig_search_records(
    # `rq` is where you adjust your record query
    rq = list(genus = "shortia"),
    # `fields` is where you adjust what fields you want returned by the API
    fields = c("uuid",
               "family",
               "genus",
               "specificepithet",
               "scientificname",
               "stateprovince"),
    # `limit` is where you can set a limit on the number of records to return
    # in order to speed up your query; max is 100000
    limit = 10)
  # Only reached when the call above did not signal an error
  TRUE
}, error = function(e) {
  # Report the failure instead of aborting; the later chunks are then skipped
  cat("An error occurred during the idig_search_records call: ",
      conditionMessage(e), "\n")
  cat("Vignettes will not be fully generated. Please try again after resolving the issue.")
  # The handler's return value becomes the value of tryCatch(). Assigning to
  # `verify_records_1A` in here would only create a handler-local variable.
  FALSE
})

## ----eval=verify_records_1A---------------------------------------------------
# A first, minimal query that walks through the main arguments of
# `idig_search_records`
records_1A <- idig_search_records(
  # `rq` holds the record query itself
  rq = list(genus = "shortia"),
  # `fields` lists the fields the API should return
  fields = c(
    "uuid", "family", "genus",
    "specificepithet", "scientificname", "stateprovince"
  ),
  # `limit` caps the number of records returned, which speeds up the query;
  # max is 100000
  limit = 10,
  # `sort` names the fields used for sorting
  sort = c("stateprovince", "scientificname")
)

# Render the data frame created above as a styled, scrollable HTML table
records_1A %>%
  knitr::kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(height = "300px")

## ----eval=verify_records_1A---------------------------------------------------
# Now let's repeat the same search but remove all arguments other than `rq` to
# see what the defaults for the other arguments look like
records_1B <- idig_search_records(
  rq = list(genus = "shortia"))

# Rewrite some URL-style `occurrenceid` values before display. `sub()`/`gsub()`
# leave non-matching (and NA) values untouched, so no `if_else()`/`grepl()`
# guard is needed. The literal URL fragments are matched with `fixed = TRUE`
# so that "." is not treated as a regex wildcard.

# Drop a leading "http://" (anchored, hence a regular expression)
records_1B$occurrenceid <- sub("^http://", "", records_1B$occurrenceid)

# Point Naturalis identifiers at the bioportal host
records_1B$occurrenceid <- gsub("data.biodiversitydata.nl/naturalis",
                                "bioportal.naturalis.nl/nl",
                                records_1B$occurrenceid,
                                fixed = TRUE)

# Strip the "https://" scheme from grbio.org and biocol.org identifiers
records_1B$occurrenceid <- gsub("https://grbio.org/cool",
                                "grbio.org/cool",
                                records_1B$occurrenceid,
                                fixed = TRUE)
records_1B$occurrenceid <- gsub("https://biocol.org",
                                "biocol.org",
                                records_1B$occurrenceid,
                                fixed = TRUE)

# Display the data frame we just created above in a nice pretty table for HTML
knitr::kable(records_1B) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(height = "300px")

## ----eval=verify_records_1A---------------------------------------------------
# The previous example used a single parameter inside `rq`; this one combines
# several parameters in the same query
records_2A <- idig_search_records(
  rq = list(
    basisofrecord = "fossilspecimen",
    # `type = "exists"` matches rows that have a value in this field; use
    # `type = "missing"` for the inverse
    geopoint = list(type = "exists")
  ),
  limit = 10
)

# Render the data frame created above as a styled, scrollable HTML table
records_2A %>%
  knitr::kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(height = "300px")

## ----eval=verify_records_1A---------------------------------------------------
# What if we wanted to see more fields than the default provides? Using the same
# search as above, we can retrieve all indexed fields with `fields = "all"`
records_2B <- idig_search_records(
  rq = list(basisofrecord = "fossilspecimen",
            geopoint = list(type = "exists")),
  fields = "all",
  limit = 10)

# Upgrade a leading "http://" in `institutionid` to "https://". `sub()` leaves
# non-matching and NA values unchanged, so no `if_else()`/`grepl()` guard is
# needed around it.
records_2B$institutionid <- sub("^http://", "https://",
                                records_2B$institutionid)

# Display the data frame we just created above in a nice pretty table for HTML
knitr::kable(records_2B) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(height = "300px")

## ----eval=verify_records_1A---------------------------------------------------
# There are more fields available than the ones retrieved by the query above.
# With the same search we can pick exactly which indexed and raw fields to
# retrieve by naming them in the `fields` argument; raw data fields are
# prefixed with "data.dwc:" and use camelCase names (vs. lowercase for iDigBio
# fields)
records_2C <- idig_search_records(
  rq = list(
    basisofrecord = "fossilspecimen",
    geopoint = list(type = "exists")
  ),
  # Explicit list of requested fields: each indexed field is followed by its
  # raw counterpart(s)
  fields = c(
    "uuid",
    "recordset",
    "institutioncode", "data.dwc:institutionCode",
    "country", "data.dwc:country",
    "countrycode", "data.dwc:countryCode",
    "stateprovince", "data.dwc:stateProvince",
    "locality", "data.dwc:locality",
    "geopoint", "data.dwc:decimalLongitude", "data.dwc:decimalLatitude"
  ),
  limit = 10
)

# Render the data frame created above as a styled, scrollable HTML table
records_2C %>%
  knitr::kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(height = "300px")

## ----eval=verify_records_1A---------------------------------------------------
# You may be curious what the difference is between indexed and raw data such as
# that we saw in the search above. Indexed data has been altered by iDigBio
# (often in an attempt to standardize and/or correct values), and raw data is
# what was provided to iDigBio by the data provider, i.e. the natural history
# collection. Here we will do a new search on a data quality flag to view
# differences between indexed and raw data
records_3A <- idig_search_records(
  # Data quality flags are a way for iDigBio to communicate how data was altered
  # during its quality control process, i.e. how the indexed and raw data differ
  rq = list(flags = "rev_geocode_lat_sign"),
  fields = c("uuid",
             "institutioncode", "data.dwc:institutionCode",
             "country", "data.dwc:country",
             "countrycode", "data.dwc:countryCode",
             "stateprovince", "data.dwc:stateProvince",
             "locality", "data.dwc:locality",
             "geopoint", "data.dwc:decimalLongitude", "data.dwc:decimalLatitude"),
  limit = 10)

# Let's format our results to be more readable by renaming and reordering columns
records_3A <- records_3A %>%
  # Swap the "data.dwc:" prefix of the raw columns for "raw_". `rename_with()`
  # is the current replacement for the superseded `rename_at()`, and `fixed()`
  # matches the prefix literally instead of as a regular expression.
  rename_with(~ str_replace(.x, fixed("data.dwc:"), "raw_"),
              .cols = starts_with("data.dwc:")) %>%
  # Put each indexed coordinate next to its raw counterpart, then keep the
  # remaining columns in their existing order
  select(uuid,
         indexed_decimalLatitude = geopoint.lat,
         raw_decimalLatitude,
         indexed_decimalLongitude = geopoint.lon,
         raw_decimalLongitude,
         everything())

# Display the data frame we just created above in a nice pretty table for HTML
knitr::kable(records_3A) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(height = "300px")

## ----eval=verify_records_1A---------------------------------------------------
# Let's test out a search using parameters we know would retrieve many records
count_1A <- idig_count_records(
  rq = list(basisofrecord = "fossilspecimen",
            geopoint = list(type = "exists")))

# We can reformat our result to be more readable. `scientific = FALSE` keeps
# the thousands-separated form even if the count arrives as a round double
# (e.g. `format(1e5, big.mark = ",")` would otherwise give "1e+05").
count_1A <- format(count_1A, big.mark = ",", scientific = FALSE)

# This number shows how many records in iDigBio have a value of "fossilspecimen"
# as well as geographic coordinate data
count_1A

## ----eval=verify_records_1A---------------------------------------------------
# Returning to the first simple search: find the most frequent values of
# `scientificname` among records whose genus is "shortia"
top_1A <- idig_top_records(
  # `rq` holds the record query itself
  rq = list(genus = "shortia"),
  # `top_fields` names the fields to summarize
  top_fields = "scientificname",
  # `count` caps the number of top values returned, which speeds up the
  # query; max is 1000
  count = 10
)

# The result is a nested list; reshape it into a long two-column table
top_1A <- top_1A$scientificname %>%
  as_tibble() %>%
  pivot_longer(
    everything(),
    names_to = "scientificname",
    values_to = "count"
  )

# Render the data frame created above as a styled, scrollable HTML table
top_1A %>%
  knitr::kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(height = "300px")