#' get_pbdb
#'
#' Function for acquiring Paleobiology Database (PBDB) data
#' (saved to disk and/or imported into R) or to generate PBDB
#' API-compatible URLs. If downloading data over timespans
#' greater than 100 Ma, the download is performed in 100 Ma
#' chunks to reduce the chance of overloading the API, and to
#' better track the download progress.
#'
#' @param taxon A character vector of taxon names. Prepending
#' a taxon name with ^ will exclude it from the PBDB search.
#' Alternatively, the ex_taxon argument can be used to do this
#' @param interval A numeric vector of length two with positive
#' ages in Ma, or a character vector containing one or two ICS
#' chronostratigraphic interval names
#' @param mode A character vector of length one specifying the
#' type of data to return: one of occurrence, collection, taxa,
#' specimen, measurement, strata, diversity, opinion or reference
#' @param res A character vector of length one specifying the
#' taxonomic resolution of the dataset: one of all, family, genus
#' species, lump_genus or lump_gensub. The latter two lump
#' multiple occurrences of genera or subgenera within collections
#' into a single representative occurrence
#' @param fields A character vector of PBDB vocabulary for
#' additional data fields to download:
#' see https://paleobiodb.org/data1.2/occs/list_doc.html
#' @param ex_taxon A character vector of taxon names to exclude
#' from the PBDB search
#' @param area If not NULL, then a numeric vector of length four
#' specifying, in order, the min lng, max lng, min lat and max
#' lat of the area from which occurrences will be returned, in
#' decimal degrees (equator = 0 lat, prime meridian = 0 lng).
#' Alternatively, a character vector of regions from which
#' occurrences will be returned: any valid country name or ISO2
#' code. Continent names and codes are also supported as follows:
#' ATA Antarctica, AFR Africa, ASI Asia, AUS Australia, EUR
#' Europe, IOC Indian Ocean, NOA North America, OCE Oceania, SOA
#' South America
#' @param ex_area If not NULL, then a character vector of
#' valid country names or ISO2 codes, as in the area argument, from
#' which occurrences will be excluded from a PBDB search
#' @param invert_area If TRUE, then regions specified in area
#' will be excluded from a PBDB search, except for the regions
#' specified in ex_area
#' @param litho If not NULL, a character vector of PBDB vocabulary
#' corresponding to which lithologies a PBDB search should return
#' @param invert_litho If TRUE, a character vector of PBDB
#' vocabulary corresponding to which lithologies a PBDB search
#' should exclude
#' @param env If not NULL, a character vector of PBDB vocabulary
#' corresponding to which environments a PBDB search should return
#' @param ex_env If not NULL, a character vector of PBDB vocabulary
#' corresponding to which environments a PBDB search should exclude
#' @param invert_env If TRUE, then environments specified in env
#' will be excluded from a PBDB search, except for the environments
#' specified in ex_env
#' @param pres A character vector of length one specifying the
#' preservation mode of the occurrences to return: one of regular,
#' form, ichno, or 'form,ichno'
#' @param idqual A character vector of length one specifying the
#' taxonomic certainty of the occurrences to return: one of certain,
#' genus_certain, uncertain, new
#' @param return_url If TRUE, the function will return a correctly
#' formatted url suitable for use with curl or similar API functions,
#' comprising the search parameters set by the user
#' @param fname If not NULL, the file name to which the downloaded
#' data will be saved on the disk as a .csv
#' @param auto_read If TRUE (default), the downloaded csv will
#' automatically be read into R (this must be assigned to an object)
#' @param override If not NULL, a character vector of length one
#' containing a url compatible with the PBDB API. This argument
#' exists to allow the user to supply a URL directly, but make use
#' of the redating part of the function
#' @param tscale A character vector of length one determining, if
#' at all, how a dataset will be updated to a particular
#' chronostratigraphic timescale. "ICS2013" will produce no redating,
#' retaining the PBDB ICS 2013 standard. "GTS2020" will update all
#' early and late interval ages to the GTS2020 standard, using a
#' lookup table supplied with the function. Alternatively, the
#' pathway to a custom .csv file with columns Interval, FAD and
#' LAD where Interval are the names of the early and late intervals
#' in the PBDB, and FAD and LAD are the numeric lower and upper
#' boundaries of those intervals
#' @return either a PBDB API compatible URL or a PBDB dataset
#' saved to the disk and/or read into R
#' @importFrom stats na.omit
#' @import curl
#' @importFrom utils data
#' @importFrom data.table fread fwrite
#' @export
get_pbdb <- function(taxon = NULL, interval = NULL, mode = "occurrence", res = "all", fields = c("ident", "coords", "class"),
                     ex_taxon = NULL, area = NULL, ex_area = NULL, invert_area = FALSE, litho = NULL, invert_litho = FALSE,
                     env = NULL, ex_env = NULL, invert_env = NULL, pres = NULL, idqual = NULL,
                     return_url = FALSE, fname = NULL, auto_read = TRUE, override = NULL, tscale = "ICS2013") {
  # check fundamental arguments (url vs download)
  if(is.null(taxon) && is.null(interval) && is.null(override)) {
    stop("Taxon or interval must be specified at the very least")
  }
  if(isTRUE(return_url) && isTRUE(auto_read)) {
    stop("return_url and auto_read cannot both be TRUE")
  }
  # FIX: the original tested isFALSE(return_url), contradicting its own message
  if(isTRUE(return_url) && !is.null(override)) {
    stop("If override is supplied with a URL, then return_url cannot be TRUE")
  }
  # record whether a genuine taxon search was requested, because pres/idqual
  # below may be appended to 'taxon' purely as extra query-string text
  had_taxon <- !is.null(taxon)
  #######################
  # set up taxon argument
  if(!is.null(taxon)) {
    taxon <- unique(na.omit(taxon))
    if(!is.character(taxon) || length(taxon) == 0) {
      stop("Taxon must be a character vector of one or more names")
    }
    if(!is.null(ex_taxon)) {
      ex_taxon <- unique(na.omit(ex_taxon))
      if(!is.character(ex_taxon) || length(ex_taxon) == 0) {
        stop("Taxa to exclude must be provided as a character vector of one or more names")
      }
      if(any(ex_taxon %in% taxon)) {
        stop("One or more taxa are present in both the search and the exclusion")
      }
      # ^name is the PBDB syntax for excluding a taxon from base_name
      ex_taxon <- paste0("^", ex_taxon, collapse = " ")
      taxon <- paste0(taxon, collapse = ",")
      taxon <- paste0(taxon, ex_taxon, collapse = " ")
    } else {
      taxon <- paste0(taxon, collapse = ",")
    }
  }
  # preservation mode (appended to the taxon query string)
  if(!is.null(pres)) {
    pres <- na.omit(unique(pres))
    if(!is.character(pres)) {
      stop("Pres should be one of the following: regular, form, ichno, 'form,ichno'")
    }
    # FIX: was length(pres) != 0, which warned (and truncated) for any valid input
    if(length(pres) > 1) {
      warning("More than one element was provided in pres - only the first will be used")
      pres <- pres[1]
    }
    pres <- if(is.null(taxon)) paste0("pres=", pres) else paste0("&pres=", pres)
    taxon <- paste0(taxon, pres)
  }
  # identification certainty (appended to the taxon query string)
  if(!is.null(idqual)) {
    idqual <- na.omit(unique(idqual))
    if(!is.character(idqual)) {
      stop("Idqual should be one of the following: certain, genus_certain, uncertain, new")
    }
    # FIX: was length(idqual) != 0, and the message wrongly referred to pres
    if(length(idqual) > 1) {
      warning("More than one element was provided in idqual - only the first will be used")
      idqual <- idqual[1]
    }
    idqual <- if(is.null(taxon)) paste0("idqual=", idqual) else paste0("&idqual=", idqual)
    taxon <- paste0(taxon, idqual)
  }
  #######################
  # set up time argument if provided
  # FIX: exists("interval") is always TRUE for a formal argument, so a NULL
  # interval previously fell through into the class check and errored
  if(!is.null(interval)) {
    if((!is.character(interval) && !is.numeric(interval)) ||
       (is.character(interval) && length(grep("[0-9]", interval)) != 0)) {
      stop("Interval must be either a single interval name, a vector of two interval names, or numeric vector of length 2")
    }
    interval <- unique(na.omit(interval))
    if(is.character(interval)) {
      if(length(interval) > 2) {
        warning("Three or more interval names were provided - only the first and last will be used")
      }
      interval <- interval[c(1, length(interval))]
      # convert interval names to numeric ages via the bundled lookup table
      GTS2020 <- get("GTS2020")
      interval <- c(GTS2020$FAD[match(interval[1], GTS2020$Interval)], GTS2020$LAD[match(interval[2], GTS2020$Interval)])
    }
    if(is.numeric(interval)) {
      if(length(interval) > 2) {
        warning("Three or more numerics were provided - only the largest and smallest values will be used")
      }
      interval <- c(max(na.omit(interval)), min(na.omit(interval)))
      # spans greater than 100 Ma are split into <= 100 Ma download chunks
      if(abs(diff(interval)) > 100) {
        brks <- seq(interval[1], interval[2], by = -100)
        # FIX: seq() stops short of the younger bound when the span is not a
        # multiple of 100, which silently truncated the download - append it
        if(brks[length(brks)] != interval[2]) {
          brks <- c(brks, interval[2])
        }
        interval <- brks
        # FIX: was max(interval > 601) - misplaced parenthesis. Collapse any
        # breakpoints older than 601 Ma into a single oldest chunk
        if(max(interval) > 601) {
          interval <- c(interval[1], interval[which(interval < 601)])
        }
      }
    }
  }
  #######################
  # set up mode (resolves to the PBDB API endpoint)
  mode <- na.omit(unique(mode))
  if(length(mode) != 1) {
    warning("Mode contains multiple elements. Only the first will be used")
    mode <- mode[1]
  }
  mode <- switch(mode,
                 "occurrence" = "https://paleobiodb.org/data1.2/occs/list.csv?",
                 "collection" = "https://paleobiodb.org/data1.2/colls/list.csv?",
                 "taxa" = "https://paleobiodb.org/data1.2/occs/taxa.csv?",
                 "specimen" = "https://paleobiodb.org/data1.2/specs/list.csv?",
                 "measurement" = "https://paleobiodb.org/data1.2/specs/measurements.csv?",
                 "strata" = "https://paleobiodb.org/data1.2/occs/strata.csv?",
                 "diversity" = "https://paleobiodb.org/data1.2/diversity.csv?",
                 "opinion" = "https://paleobiodb.org/data1.2/occs/opinions.csv?",
                 "reference" = "https://paleobiodb.org/data1.2/occs/refs.csv?")
  if(is.null(mode)) {
    stop("Mode must be one of the following: occurrence, collection, taxa, specimen,
         measurement, strata, diversity, opinion or reference")
  }
  #######################
  # check res argument
  res <- na.omit(unique(res))
  if(length(res) != 1) {
    warning("Only the first element of res will be used")
    res <- res[1]
  }
  # FIX: the message previously said lump_subgen, which is not in the vocabulary
  if(!res %in% c("all", "family", "genus", "species", "lump_genus", "lump_gensub")) {
    stop("res must be one of all, family, genus, species, lump_genus or lump_gensub")
  }
  if(res == "all") {
    res <- NULL
  } else {
    res <- paste0("taxon_reso=", res, "&")
  }
  #######################
  # set up fields argument if provided
  if(!is.null(fields)) {
    if(!is.character(fields)) {
      stop("Fields must be a character vector with one or more elements corresponding to PBDB vocabulary")
    }
    fields <- na.omit(unique(fields))
    pbdb_fields <- get("pbdb_fields")
    inv <- fields[!fields %in% as.vector(unlist(pbdb_fields))]
    if(length(inv) != 0) {
      fields <- fields[!fields %in% inv]
      warning(paste0("The following elements in fields (", paste0(inv, collapse = ", "), ")
                     are not part of the PBDB vocabulary and have been dropped"))
    }
    # bulk field names imply their constituent fields, so drop the latter
    bulk2 <- pbdb_fields$bulk[pbdb_fields$bulk %in% fields]
    if(length(bulk2) != 0) {
      for(i in seq_along(bulk2)) {
        rem <- as.vector(unlist(pbdb_fields[which(names(pbdb_fields) == bulk2[i])]))
        fields <- fields[!fields %in% rem]
      }
    }
    fields <- paste0(fields, collapse = ",")
    fields <- paste0("&show=", fields)
  }
  #######################
  # set up area argument if provided
  area_is_coords <- FALSE
  if(!is.null(area)) {
    area <- na.omit(area)
    # FIX: this check previously tested 'interval' rather than 'area'
    if((!is.character(area) && !is.numeric(area)) ||
       (is.character(area) && length(grep("[0-9]", area)) != 0)) {
      stop("Area must be a numeric of length four with decimal degree values in order of min longitude, max longitude,
           min latitude, max latitude (prime meridian = 0 longitude, equator = 0 latitude), or a character vector of one
           or more valid country names, their ISO2 abbreviations, and/or one or more of the following continental regions:
           ATA Antarctica, AFR Africa, ASI Asia, AUS Australia, EUR Europe, IOC Indian Ocean, NOA North America, OCE Oceania,
           SOA South America")
    }
    if(is.numeric(area)) {
      if(length(area) != 4) {
        stop("If supplied as coordinates, Area must be a numeric of length four with decimal
             degree values in order of min longitude, max longitude, min latitude, max latitude (prime meridian = 0 longitude)")
      }
      # FIX: any() was previously applied before the comparison, e.g. any(area[1:2]) < -180
      if(any(area[1:2] < -180) || any(area[1:2] > 180) || any(area[3:4] < -90) || any(area[3:4] > 90)) {
        stop("If supplied as coordinates, Area must be a numeric of length four with decimal degree values in order of
             min longitude, max longitude, min latitude, max latitude (prime meridian = 0 longitude, equator = 0 latitude)")
      }
      if(area[1] > area[2]) {
        stop("The minimum longitude cannot be greater than the maximum longitude")
      }
      # FIX: previously compared area[3] with itself, and the message said longitude
      if(area[3] > area[4]) {
        stop("The minimum latitude cannot be greater than the maximum latitude")
      }
      if(!is.null(ex_area)) {
        warning("As area has been supplied as a vector of decimal degrees, ex_area will be ignored")
        # FIX: ex_area was previously still processed against the coordinate string
        ex_area <- NULL
      }
      area <- paste0("&lngmin=", area[1], "&lngmax=", area[2], "&latmin=", area[3], "&latmax=", area[4])
      area_is_coords <- TRUE
    } else {
      area <- unique(area)
      geog_lookup <- get("geog_lookup")
      test <- match(area, geog_lookup$Name)
      if(any(is.na(test))) {
        # FIX: length(test) == length(area) is always TRUE for match() output,
        # so a single invalid name previously aborted the whole call
        if(all(is.na(test))) {
          stop("None of the area names are valid")
        }
        inv <- area[which(is.na(test))]
        warning(paste0("The following elements in area (", paste0(inv, collapse = ", "), ")
                       are not valid area names and have been dropped"))
        area <- area[-which(is.na(test))]
      }
      if(is.null(ex_area)) {
        area <- paste0(area, collapse = ",")
      }
    }
  }
  if(!is.null(ex_area)) {
    ex_area <- na.omit(ex_area)
    if(!is.null(area)) {
      if(any(ex_area %in% area)) {
        stop("One or more areas are present in both the search and the exclusion")
      }
      geog_lookup <- get("geog_lookup")
      # drop the leading lookup rows before matching exclusions
      # (assumes rows 1-9 of geog_lookup are the continental entries - TODO confirm)
      geog_lookup <- geog_lookup[10:nrow(geog_lookup), ]
      test2 <- match(ex_area, geog_lookup$Name)
      if(any(is.na(test2))) {
        # FIX: length(test2) == length(ex_area) was always TRUE (see area above)
        if(all(is.na(test2))) {
          stop("None of the area names are valid")
        }
        inv <- ex_area[which(is.na(test2))]
        warning(paste0("The following elements in area (", paste0(inv, collapse = ", "), ")
                       are not valid area names and have been dropped"))
        ex_area <- ex_area[-which(is.na(test2))]
      }
      # ^name is the PBDB exclusion syntax; ! inverts the included set
      ex_area <- paste0("^", ex_area, collapse = ",")
      area <- paste0(area, collapse = ",")
      if(isTRUE(invert_area)) {
        area <- paste0("!", area)
      }
      # FIX: elements were previously fused without a separating comma
      area <- paste0(area, ",", ex_area)
    } else {
      area <- paste0("!", paste0(ex_area, collapse = ","))
    }
  }
  if(!is.null(area) && !area_is_coords) {
    # FIX: coordinate bounds were previously also wrapped in &cc=, which is
    # only valid for country/continent codes
    area <- paste0("&cc=", area)
  }
  #######################
  # set up lithology argument if provided
  if(!is.null(litho)) {
    litho <- na.omit(unique(litho))
    # FIX: previously tested class(area) rather than litho
    if(!is.character(litho)) {
      stop("Lithology must be a character vector of one or more of the following elements:
           siliclastic, mixed, carbonate, evaporite, organic, chemical, volcanic, metasedimentary,
           metamorphic, other or unknown")
    }
    inv <- litho[!litho %in% c("siliclastic", "mixed", "carbonate", "evaporite", "organic", "chemical",
                               "volcanic", "metasedimentary", "metamorphic", "other", "unknown")]
    if(length(inv) != 0) {
      # FIX: previously subset an undefined object 'lith'
      litho <- litho[!litho %in% inv]
      warning(paste0("The following elements in fields (", paste0(inv, collapse = ", "), ")
                     are not part of the PBDB lithology vocabulary and have been dropped"))
    }
    # FIX: multiple lithologies were previously never collapsed, producing a
    # vector of URLs downstream
    if(isTRUE(invert_litho)) {
      litho <- paste0("!", paste0(litho, collapse = ","))
    } else {
      litho <- paste0(litho, collapse = ",")
    }
    litho <- paste0("&lithology=", litho)
  }
  #######################
  # set up environment argument if provided
  if(!is.null(env)) {
    env <- na.omit(unique(env))
    if(!is.character(env)) {
      stop("Environment must be a character vector of one or more of the following elements (ignoring brackets):
           terrestrial, any marine, carbonate, siliciclastic, unknown, lacustrine, fluvial, karst,
           terrother (other terrestrial), marginal (marginal marine), reef, stshallow (shallow subtidal),
           stdeep (deep subtidal), offshore, slope (slope or basin), or marindet (indeterminate marine)")
    }
    # FIX: the valid list previously contained lithology vocabulary; use the
    # environment vocabulary given in the error message above
    inv <- env[!env %in% c("terrestrial", "marine", "carbonate", "siliciclastic", "unknown",
                           "lacustrine", "fluvial", "karst", "terrother", "marginal", "reef",
                           "stshallow", "stdeep", "offshore", "slope", "marindet")]
    if(length(inv) != 0) {
      env <- env[!env %in% inv]
      warning(paste0("The following elements in environment (", paste0(inv, collapse = ", "), ")
                     are not part of the PBDB environment vocabulary and have been dropped"))
    }
    if(is.null(ex_env)) {
      env <- paste0(env, collapse = ",")
    }
  }
  if(!is.null(ex_env)) {
    ex_env <- na.omit(ex_env)
    if(!is.null(env)) {
      inv <- ex_env[!ex_env %in% c("lacustrine", "fluvial", "karst", "terrother", "marginal", "reef", "stshallow",
                                   "stdeep", "offshore", "slope", "marindet")]
      if(length(inv) != 0) {
        ex_env <- ex_env[!ex_env %in% inv]
        warning(paste0("The following elements in exclude environment (", paste0(inv, collapse = ", "), ")
                       are not part of the PBDB environment vocabulary and have been dropped"))
      }
      if(any(ex_env %in% env)) {
        stop("One or more environments are present in both the search and the exclusion")
      }
      ex_env <- paste0("^", ex_env, collapse = ",")
      env <- paste0(env, collapse = ",")
      # FIX: invert_env defaults to NULL, so if(invert_env) previously errored
      if(isTRUE(invert_env)) {
        env <- paste0("!", env)
      }
      # FIX: elements were previously fused without a separating comma
      env <- paste0(env, ",", ex_env)
    } else {
      env <- paste0("!", paste0(ex_env, collapse = ","))
    }
  }
  if(!is.null(env)) {
    env <- paste0("&envtype=", env)
  }
  #######################
  # build url(s)
  if(!is.null(override)) {
    # FIX: override was documented but never actually used; take the supplied
    # URL verbatim (return_url is guaranteed FALSE by the check above)
    purl <- override
    testurl <- override
  } else {
    tax_q <- NULL
    if(!is.null(taxon)) {
      # FIX: the no-interval branch previously omitted 'base_name=' entirely,
      # and pres/idqual-only searches were wrongly given a base_name prefix
      tax_q <- if(had_taxon) paste0("base_name=", taxon, "&") else paste0(taxon, "&")
    }
    if(!is.null(interval)) {
      if(is.numeric(interval)) {
        # full span for the reportable URL; per-chunk pairs for the downloads
        span <- paste0("max_ma=", interval[1], "&min_ma=", interval[length(interval)])
        if(length(interval) > 2) {
          chunks <- vapply(seq_len(length(interval) - 1), function(i) {
            paste0("max_ma=", interval[i], "&min_ma=", interval[i + 1])
          }, character(1))
        } else {
          chunks <- span
        }
      } else {
        span <- paste0("interval=", interval)
        chunks <- span
      }
      # FIX: env was previously built but never appended to any URL
      purl <- paste0(mode, tax_q, res, span, fields, area, litho, env)
      testurl <- paste0(mode, tax_q, res, chunks, fields, area, litho, env)
    } else {
      purl <- paste0(mode, tax_q, res, fields, area, litho, env)
      testurl <- purl
    }
    # tidy separators left behind by absent query elements
    purl <- sub("&$", "", gsub("?&", "?", gsub("&&", "&", purl, fixed = TRUE), fixed = TRUE))
    testurl <- sub("&$", "", gsub("?&", "?", gsub("&&", "&", testurl, fixed = TRUE), fixed = TRUE))
  }
  #######################
  # do output
  if(isTRUE(return_url)) {
    return(purl)
  }
  # clean tempdir of any previous chunk downloads
  unlink(list.files(tempdir(), pattern = "_pbdb", full.names = TRUE))
  # download data (short loop used to allow messaging)
  for(i in seq_along(testurl)) {
    cat(paste0("Chunk ", i, "/", length(testurl), " downloading"), "\n")
    tfile <- tempfile(pattern = paste0(i, "_pbdb"), fileext = ".csv")
    curl_download(url = testurl[i], destfile = tfile, quiet = TRUE)
  }
  cat("Checking data", "\n")
  dload <- as.list(list.files(tempdir(), pattern = "_pbdb", full.names = TRUE))
  dload <- lapply(dload, data.table::fread, encoding = "UTF-8")
  dload <- do.call(rbind, dload)
  dload <- as.data.frame(dload)
  # drop duplicate records spanning chunk boundaries
  # FIX: previously run unconditionally, which emptied datasets from modes
  # that have no occurrence_no column (duplicated(NULL) selects zero rows)
  if("occurrence_no" %in% colnames(dload)) {
    dload <- dload[!duplicated(dload$occurrence_no), ]
  }
  dload[dload == ""] <- NA
  # clean tempdir
  unlink(list.files(tempdir(), pattern = "_pbdb", full.names = TRUE))
  # perform redating if specified
  do_redate <- FALSE
  if(identical(tscale, "GTS2020")) {
    tscale <- get("GTS2020")
    tscale <- tscale[, c("Interval", "FAD", "LAD")]
    do_redate <- TRUE
  } else if(is.character(tscale) && length(tscale) == 1 && !identical(tscale, "ICS2013")) {
    # FIX: the documented custom-timescale pathway was commented out; treat
    # any other string as the path to a csv with columns Interval, FAD, LAD
    tscale <- as.data.frame(data.table::fread(tscale))
    if(!all(c("Interval", "FAD", "LAD") %in% colnames(tscale))) {
      colnames(tscale)[1:3] <- c("Interval", "FAD", "LAD")
    }
    tscale <- tscale[, c("Interval", "FAD", "LAD")]
    do_redate <- TRUE
  }
  if(do_redate) {
    # this is a deconstruction of the redate_intervals function. The latter function exists
    # separately so it can be used with any dataset. Its functionality is deconstructed here so that
    # get_pbdb can stand alone as a single function
    xfad <- "early_interval"
    xlad <- "late_interval"
    xerl <- "max_ma"
    xlte <- "min_ma"
    cinterval <- "Interval"
    cfad <- "FAD"
    clad <- "LAD"
    # if the lad is missing, overwrite with the FAD
    dload[is.na(dload[, xlad]), xlad] <- dload[is.na(dload[, xlad]), xfad]
    # get new FADs and LADs
    new_fad <- tscale[match(dload[, xfad], tscale[, cinterval]), cfad]
    new_lad <- tscale[match(dload[, xlad], tscale[, cinterval]), clad]
    # in case of new intervals (which give NA), retain original date
    new_fad[is.na(new_fad)] <- dload[is.na(new_fad), xerl]
    new_lad[is.na(new_lad)] <- dload[is.na(new_lad), xlte]
    # overwrite
    dload[, xerl] <- new_fad
    dload[, xlte] <- new_lad
  }
  # build kingdom field if classification was requested
  # FIX: grepl on NULL fields previously returned logical(0) and errored in if()
  if(!is.null(fields) && grepl("class", fields)) {
    pbdb_kingdoms <- get("pbdb_kingdoms")
    animals <- pbdb_kingdoms$animals
    plants <- pbdb_kingdoms$plants
    protists <- pbdb_kingdoms$protists
    dload$kingdom <- NA
    # %in% rather than == so NA phyla cannot produce NA subscripts
    dload$kingdom[dload$phylum %in% animals] <- "Animalia"
    dload$kingdom[dload$phylum %in% plants] <- "Plantae"
    dload$kingdom[dload$phylum %in% protists] <- "Protista"
    dload$kingdom[dload$phylum %in% "Pezizomycotina"] <- "Fungi"
    dload$kingdom[dload$phylum %in% "Cyanobacteria"] <- "Bacteria"
    # reposition the kingdom column immediately before phylum
    if(which(colnames(dload) == "phylum") == 1) {
      dload <- cbind.data.frame(kingdom = dload$kingdom, dload[, -which(colnames(dload) == "kingdom"), drop = FALSE])
    } else {
      dload <- cbind.data.frame(dload[, 1:(which(colnames(dload) == "phylum") - 1)], dload$kingdom, dload[, (which(colnames(dload) == "phylum")):(ncol(dload) - 1)])
    }
    colnames(dload)[which(colnames(dload) == "dload$kingdom")] <- "kingdom"
  }
  # return the dataset and/or write it to disk
  if(is.null(fname)) {
    pbdb_data <- dload
    return(pbdb_data)
  } else {
    fname <- paste0(fname, ".csv")
    cat("Writing data", "\n")
    data.table::fwrite(dload, fname, bom = TRUE)
    if(auto_read) {
      cat("Reading data", "\n")
      pbdb_data <- data.table::fread(fname, data.table = FALSE, encoding = "UTF-8")
      pbdb_data[pbdb_data == ""] <- NA
      return(pbdb_data)
    }
  }
}
# NOTE(review): removed two lines of non-code website-embed text that had been
# accidentally appended to this file (they were a syntax error in R)