Sampling event based datasets at GBIF are available as Darwin Core Archive (DwcA) files - zipped CSV files with metadata. These files use the simple Darwin Core terms. The darwinator package bundles a dataset with those recommended terms, whose labels can be used, for example, to validate column names in sampling event datasets. The simple Darwin Core terms data can be accessed like so:
```r
library(darwinator)

# read the documentation for the dataset
?dwc_terms

# display the first row of the recommended simple Darwin Core terms dataset
library(dplyr)
simple_terms <- dwc_terms %>% filter(is_simple)
knitr::kable(simple_terms %>% head(1) %>% t())
```
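As a minimal sketch of the validation use case mentioned above, the term labels can be compared against the column names of an incoming table. This assumes the labels live in a `label` column of `dwc_terms` (check `?dwc_terms` for the actual column name) and uses a small hypothetical event table:

```r
library(darwinator)
library(dplyr)

# the recommended simple Darwin Core term labels
# (assuming the column holding them is called "label")
valid_labels <- dwc_terms %>% filter(is_simple) %>% pull(label)

# a hypothetical sampling event table with one non-standard column
my_event_data <- tibble::tibble(
  eventID = "E1",
  eventDate = "2018-02-22",
  locality_name = "Oslo"  # not a simple Darwin Core term label
)

# columns that do not match any recommended term label
setdiff(names(my_event_data), valid_labels)
```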
To get a DwcA file from GBIF, a dataset identifier key is needed. To get all such keys for sampling event datasets published at gbif.org from Norway, first search for them using rgbif:
```r
library(rgbif)
library(tidyverse)

search <- dataset_search(
  type = "SAMPLING_EVENT",
  publishingCountry = "NO"
)

message("Search hit count: ", search$count)
```
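If the hit count exceeds the number of rows returned by default, more rows can be requested explicitly - a sketch assuming the usual rgbif paging arguments (`limit` and `start`):

```r
# request up to 500 rows in one page (rgbif search functions
# page their results; limit/start control the page, assuming
# dataset_search follows the common rgbif convention)
search <- dataset_search(
  type = "SAMPLING_EVENT",
  publishingCountry = "NO",
  limit = 500
)
```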
Let's quickly list the first 5 search hits out of the total of `r search$count` hits:
```r
knitr::kable(
  search$data %>%
    head(5) %>%
    select(datasetTitle, publishingOrganization, datasetKey)
)
```
There are several datasets available with sampling event data from Norway.
Now we show how to retrieve the key/identifier for a specific dataset of interest - let's say we want datasets whose title contains the string "Lepidurus":
```r
key <- search$data %>%
  filter(grepl("Lepidurus", datasetTitle)) %>%
  .$datasetKey

key
```
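As a side note, the `.$datasetKey` idiom can equivalently be written with `dplyr::pull()`, which some may find more readable:

```r
# equivalent extraction of the dataset key using pull()
key <- search$data %>%
  filter(grepl("Lepidurus", datasetTitle)) %>%
  pull(datasetKey)
```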
This is the dataset we're interested in. We now retrieve the sampling event data for that specific dataset identifier:
```r
library(darwinator)

sed <- sampling_event_data(key)

# we now have a suggested citation to use
citation <- sed$meta$citation
citation
```
We can display a row from each of the individual sampling event tables:
```r
# the sampling event data tables
event_core <- sed$dwca$event_core
occ_ext <- sed$dwca$occurrence_ext
mof <- sed$dwca$measurementorfact

knitr::kable(event_core %>% head(1) %>% t())
knitr::kable(occ_ext %>% head(1) %>% t())
knitr::kable(mof %>% head(1) %>% t())
```
This is what a row from the combined data from those tables can look like:
```r
# we have the data available for use, along with the metadata
metadata <- sed$meta
data <- sed$data

# display the first row of the data
knitr::kable(data %>% head(1) %>% t())
```
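The measurementorfact table typically stores one row per measurement. If the standard Darwin Core columns (id, measurementType, measurementValue) are present - an assumption worth verifying against the actual table - the measurements can be reshaped to one column per measurement type, for example with tidyr:

```r
library(tidyr)

# reshape measurements to one column per measurement type, assuming
# standard id/measurementType/measurementValue columns exist in mof
mof_wide <- mof %>%
  select(id, measurementType, measurementValue) %>%
  pivot_wider(
    names_from = measurementType,
    values_from = measurementValue,
    values_fn = dplyr::first  # keep the first value if duplicated
  )
```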
This snippet shows how to download all sampling event datasets from Norway and persist the DwcA files locally on disk, to enable working with them off-line:
```r
library(rgbif)
library(purrr)
library(dplyr)
library(darwinator)

# make a search using rgbif for all
# Norwegian sampling event dataset identifiers
search <- dataset_search(
  type = "SAMPLING_EVENT",
  publishingCountry = "NO"
)

norway_keys <- search$data$datasetKey

# function to extract the IPT endpoint url for a specific dataset key
ipt_endpoints <- function(key) {
  tibble(
    key = key,
    url = darwinator:::dataset_details(key)$endpoints$url
  )
}

# iterate over all the Norwegian IPT endpoint URLs,
# restricting them to DWC_ARCHIVE files only
norway_ipt_urls <- map_df(norway_keys, ipt_endpoints)

ipt_urls_dwca <- norway_ipt_urls %>%
  mutate(is_dwca = grepl("archive.do", url)) %>%
  filter(is_dwca) %>%
  .$url

# finally the download - iterate over all those URLs, download,
# then return the names of the tempfiles (DwcA zip files)
filez <- map_chr(ipt_urls_dwca, darwinator:::dwca_download)
tibble(filez, ipt_urls_dwca)
```
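Before parsing, the downloaded archives can be inspected with base R alone - a DwcA zip typically contains the data tables as text files plus a meta.xml descriptor:

```r
# list the contents of the first downloaded DwcA zip file
contents <- unzip(filez[1], list = TRUE)
contents$Name
```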
We may now be interested in reporting a few findings - for example, are there any IPT endpoint URLs that risk being blocked by extra strict firewalls, or any that are served without HTTPS?
```r
# find all URLs with a port specified (such as :8080)
# by using a regexp for a ":" followed by one or more digits
has_port <- function(x) grepl(":\\d+", x)
has_https <- function(x) grepl("https://", x)
is_dwca <- function(x) grepl("archive.do", x)

# networks that block everything but ports 80 and 443 (http and https)
# may restrict access to these IPT endpoint URLs
potentially_firewalled_datasets <- norway_ipt_urls %>%
  mutate(firewalled = has_port(url)) %>%
  filter(firewalled) %>%
  select(url)

message("Extra strict firewalls may block these URLs:")
potentially_firewalled_datasets

# non-https IPT endpoint URLs
non_https_datasets <- norway_ipt_urls %>%
  mutate(is_not_https = !has_https(url)) %>%
  mutate(is_dwca = is_dwca(url)) %>%
  filter(is_not_https, is_dwca) %>%
  select(url)

message("These IPT urls are not served using https:")
non_https_datasets
```
For reporting parsing issues, this snippet can be used:
```r
# test parsing of each and every file in order
# to see if there are any parsing issues
res <- map(filez, darwinator:::dwca_parse)

# for those with parsing issues, show them in a report
issues_index <- unlist(map(res, "has_parsing_issues"))
issues <- map_df(res[issues_index], "parsing_issues")

report <-
  tibble(url = ipt_urls_dwca[issues_index], file = filez[issues_index]) %>%
  left_join(issues, by = "file") %>%
  select(-file, -url, everything(), url)

knitr::kable(report)

#if (!dir.exists("data-raw")) dir.create("data-raw")
#readr::write_excel_csv(report, "data-raw/issues-norway.csv")
```
Here is how to download and parse all Norwegian sampling event datasets in one go - as of 2018-02-22 this resulted in a large list of approximately 30 elements (718 Mb):
```r
library(purrr)

# use purrr::possibly() to wrap the sampling_event_data function
# so that it returns NULL if it fails on a particular dataset
psed <- possibly(function(key) sampling_event_data(key), NULL)
norway <- map(norway_keys, psed)

# exclude failed requests
fails <- unlist(map(norway, is.null))
failed_keys <- norway_keys[fails]
successful_downloads <- norway[!fails]

# persist this on disk as an R object so it can be
# loaded again off-line using readRDS()
#saveRDS(successful_downloads, "data-raw/norway-sed.Rds")
```
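In a later off-line session the saved object can then be restored - assuming the commented-out saveRDS() call above was actually run with that path:

```r
# restore the previously persisted list of sampling event datasets
successful_downloads <- readRDS("data-raw/norway-sed.Rds")
length(successful_downloads)
```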
A report of parsing issues can be generated like this:
```r
parsing_issues <- map_df(successful_downloads, c("dwca", "parsing_issues"))
readr::write_excel_csv(parsing_issues, "data-raw/issues-norway.csv")
```