Nothing
#' Parse Darwin Core Archive
#'
#' @export
#'
#' @param input (character) Path to local zip file, directory, or a url.
#' If a URL it must be for a zip file.
#' @param read (logical) Whether or not to read in data files. If `FALSE`,
#' we give back paths to files only. Default: `FALSE`
#' @param ... Further args passed on to [data.table::fread()]
#'
#' @details
#' Note that sometimes file reads fail. We use [data.table::fread()]
#' internally, which is very fast, but can fail sometimes. If so, try reading
#' in the data manually.
#'
#' When you pass in a URL, we use \pkg{rappdirs} to determine cache path, and
#' if you pass the same URL again, and your cache is not cleared, we'll
#' pull from the cache. Passing a file or directory on your local system
#' won't invoke the caching route, but will go directly to the file/directory.
#'
#' @examples
#' \dontrun{
#' # set up a temporary directory for the example
#' dwca_cache$cache_path_set(path = "finch", type = "tempdir")
#'
#' dir <- system.file("examples", "0000154-150116162929234", package = "finch")
#'
#' # Don't read data in
#' (x <- dwca_read(dir, read=FALSE))
#' x$files
#' x$highmeta
#' x$dataset_meta[[1]]
#' x$data
#'
#' # Read data
#' (x <- dwca_read(dir, read=TRUE))
#' head(x$data[[1]])
#'
#' # Can pass in a zip file
#' zip <- system.file("examples", "0000154-150116162929234.zip",
#' package = "finch")
#' (out <- dwca_read(zip))
#' out$files
#' out$highmeta
#' out$emlmeta
#' out$dataset_meta
#'
#' # Can pass in zip file as a url
#' url <-
#'"https://github.com/ropensci/finch/blob/master/inst/examples/0000154-150116162929234.zip?raw=true"
#' (out <- dwca_read(url))
#'
#' # another url
#' url <- "http://ipt.jbrj.gov.br/jbrj/archive.do?r=redlist_2013_taxons&v=3.12"
#' (out <- dwca_read(url))
#' }
dwca_read <- function(input, read = FALSE, ...){
# check file type
x <- as.location(input)
x <- process(x)
# get file names and paths
files <- parse_dwca(x)
# high level metadata - meta.xml file
highmeta <- try_meta(files$xml_files)
# high level metadata - eml.xml or other named eml file
emlmeta <- try_eml(files$xml_files)
# get datasets metadata
dataset_meta <- lapply(files$datasets_meta, EML::read_eml)
# get data
datout <- read_data(files$data_paths, read, ...)
structure(list(files = files,
highmeta = highmeta,
emlmeta = emlmeta,
dataset_meta = dataset_meta,
data = datout), class = "dwca_gbif", read = read)
}
try_meta <- function(x) {
tmp <- read_xml(grep("meta.xml", x, value = TRUE))
xml_ns_strip(tmp)
core <- xml_find_first(tmp, "//archive/core")
if (inherits(core, "xml_missing")) {
occ_file <- "core"
occ_df <- list()
} else {
occ_file <- xml_text(xml_find_first(core, "files/location"))
occ_df <- dt(lapply(xml_find_all(core, "field"), function(z) {
as.list(xml_attrs(z))
}))
}
multi <- xml_find_first(tmp, "//extension[contains(@rowType, \"Multimedia\")]")
if (inherits(multi, "xml_missing")) {
multi_file <- "multimedia"
multi_df <- list()
} else {
multi_file <- xml_text(xml_find_first(multi, "files/location"))
multi_df <- dt(lapply(xml_find_all(multi, "field"), function(z) {
as.list(xml_attrs(z))
}))
}
verb <- xml_find_first(tmp, "//extension[contains(@rowType, \"Occurrence\")]")
if (inherits(verb, "xml_missing")) {
verb_file <- "occurrence"
verb_df <- list()
} else {
verb_file <- xml_text(xml_find_first(verb, "files/location"))
verb_df <- dt(lapply(xml_find_all(verb, "field"), function(z) {
as.list(xml_attrs(z))
}))
}
Filter(function(z) length(z) != 0, stats::setNames(
list(occ_df, multi_df, verb_df),
c(occ_file, multi_file, verb_file)
))
}
# test if file contents is EML
is_eml <- function(x) any( grepl("eml:eml", readLines(x, n = 5)) )
# gives back file path
get_eml <- function(x) {
tmp <- read_xml(grep("meta.xml", x, value = TRUE))
ff <- xml_attr(tmp, "metadata")
grep(ff, x, value = TRUE)
}
# try to read in EML metadata
try_eml <- function(w) EML::read_eml(get_eml(w))
#' @export
print.dwca_gbif <- function(x, ...){
cat("<gbif dwca>", sep = "\n")
cat(paste0(" Package ID: ", cmeta(x)), sep = "\n")
cat(paste0(" No. data sources: ", length(x$dataset_meta)), sep = "\n")
cat(paste0(" No. datasets: ", length(x$data)), sep = "\n")
for (i in seq_along(x$data)) {
if (attr(x, "read")) {
cat(sprintf(" Dataset %s: [%s X %s]", names(x$data[i]),
NCOL(x$data[[i]]), NROW(x$data[[i]])), sep = "\n")
} else {
z <- sprintf(" Dataset %s: %s", names(x$data[i]), x$data[[i]])
if (length(z) > 0) cat(z, sep = "\n")
}
}
}
cmeta <- function(y) {
if ( !is.null(y$emlmeta) ) {
y$emlmeta$packageId
} else {
NULL
}
}
parse_dwca <- function(x){
ff <- list.files(x, full.names = TRUE)
list(xml_files = grep("\\.xml", ff, value = TRUE),
txt_files = grep("\\.txt", ff, value = TRUE),
datasets_meta = list.files(grep("dataset", ff, value = TRUE),
full.names = TRUE),
data_paths = data_paths(ff))
}
data_paths <- function(x){
basedir <- dirname(grep("meta.xml", x, value = TRUE))
meta <- read_xml(grep("meta.xml", x, value = TRUE))
xml_ns_strip(meta)
file.path(basedir, xml_text(xml_find_all(meta, "//files/location")))
}
read_data <- function(x, read, ...){
if ( read ) {
datout <- list()
for (i in seq_along(x)) {
datout[[basename(x[[i]])]] <- try_read(x[[i]], ...)
}
datout
} else {
x
}
}
try_read <- function(z, ...){
res <- tryCatch(
suppressWarnings(
data.table::fread(z, stringsAsFactors = FALSE, data.table = FALSE,
sep = "\t", quote = "", ...)
), error = function(e) e
)
if ( inherits(res, "simpleError") ) {
data.frame(NULL, stringsAsFactors = FALSE)
} else {
res
}
}
process <- function(x){
switch(
attr(x, "type"),
dir = x,
file = {
dwca_cache$mkdir()
dirpath <- file.path(dwca_cache$cache_path_get(),
basename(sub("\\.zip", "", x[[1]])))
if (file.exists(dirpath)) {
message("File in cache")
return(dirpath)
} else {
utils::unzip(x, exdir = dirpath)
return(dirpath)
}
},
url = dwca_cache_get(x)
)
}
dwca_cache_get <- function(url) {
sha <- digest::digest(url, algo = "sha1")
key <- paste0(sha, ".zip")
fpath <- file.path(dwca_cache$cache_path_get(), key)
dirpath <- sub("\\.zip", "", fpath)
if (file.exists(dirpath)) {
message("File in cache")
return(dirpath)
} else {
on.exit(unlink(fpath))
dwca_cache$mkdir()
utils::download.file(url = url, destfile = fpath, quiet = TRUE, mode = "wb")
utils::unzip(fpath, exdir = dirpath)
return(dirpath)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.