#' @title Read Data From DwC-A Files
#'
#' @description This function reads species records from Darwin Core Archive
#' (DwC-A) files, typically obtained from GBIF as a '.zip' file. It returns
#' different information from inside the DwC-A files so it can enter the
#' __plantR__ workflow. Optionally, it can be used to save the required
#' information into a local directory.
#'
#' @param file character. Name of the DwC-A file (often a '.zip') containing the
#' species records.
#' @param path character. The path to the directory where the file was saved or
#' of the web Uniform Resource Locator (URL) from which the file can be
#' downloaded from. Defaults to the user working directory or to gbif download
#' path.
#' @param dir.name character. Name of the folder where the processed data should
#' be saved. Defaults to the directory defined by `path`.
#' @param dir.tmp character. Name of the sub-folder where the temporary files
#' should be saved within `dir.name`. Defaults to "plantR_input".
#' @param method the method to be passed to function `download.file()` for
#' downloading files. Defaults to 'auto'.
#' @param bind.data logical. Should the occurrence and verbatim information be
#' combined into a single table? Defaults to TRUE.
#' @param output character. Which information from the Darwin-Core file should
#' be returned/saved? Default to 'occurrence', 'verbatim' and 'citations'.
#' @param save logical. Should the information be saved to file? Defaults to
#' FALSE.
#' @param file.format character. The file extension to be used for saving.
#' Defaults to 'csv'.
#' @param compress logical. Should the files be compressed? Default to TRUE.
#' @param sep character. The separator between columns to be passed to
#' data.table::fread() (see the help of this function for details).
#' @param quote character. The symbol for text quotes to be passed to
#' data.table::fread(). Defaults to "\"".
#' @param na.strings character. Vector of strings which are to be interpreted as
#' NA values to be passed to data.table::fread(). Defaults to "NA".
#'
#' @details This function provides different options to read DwC-A files,
#' typically the ones obtained from GBIF. Currently, this zip file can be read
#' from a local directory or directly from GBIF API address. If the path is an
#' URL address (e.g. https://api.gbif.org/v1/occurrence/download/request),
#' then the function will download the zip file directly from the GBIF API.
#'
#' The argument `output` defines which of the information within GBIF DwC-A
#' files should be returned. Currently, the outputs available are:
#' 'occurrence', 'verbatim', 'multimedia', 'citations' and 'rights'. If more
#' than one output is selected, the function returns a list in which each
#' element represent the selected outputs. Currently, no data and database
#' metadata are returned (i.e. '.xml' files). See package __finch__ for the
#' complete parsing of DwC-A files and metadata.
#'
#' All temporary files and folders are deleted after the extraction of the
#' information, except if `save` is TRUE. In this case, only the unzipped
#' files within the DwC-A file are removed.
#'
#' Downloading large files (more than 2GB) may be an issue for some R
#' versions. The `method` 'wget' may be more appropriate for users with proxy
#' firewalls (see help of function `download.file()`).
#'
#' @import data.table
#' @importFrom utils download.file unzip
#'
#'
#' @examples
#' \dontrun{
#' occs <- readData(file = "0227351-200613084148143.zip",
#' path <- "https://api.gbif.org/v1/occurrence/download/request/")
#' }
#'
#' @export readData
#'
readData <- function(file = NULL,
path = "",
dir.name = "",
dir.tmp = "plantR_input",
method = "auto",
bind.data = TRUE,
output = c("occurrence", "verbatim", "citations"),
save = FALSE,
file.format = "csv",
compress = TRUE,
sep = "auto",
quote = "\"",
na.strings = c("NA")) {
# Check input
if (is.null(file))
stop("Please provide the file name")
file_sep <- .Platform$file.sep
if (is.null(dir.name) | dir.name == "") {
if (is.null(path) | path == "") {
dir.name <- paste0(file.path(getwd(), dir.tmp), file_sep)
if (!file.exists(dir.name))
dir.create(dir.name)
} else {
if (grepl("http", path)) {
dir.name <- paste0(file.path(getwd(), dir.tmp), file_sep)
if (!file.exists(dir.name))
dir.create(dir.name)
} else {
# dir.name <- path
dir.name <- paste0(file.path(path, dir.tmp), file_sep)
if (!file.exists(dir.name))
dir.create(dir.name)
}
}
}
# Defining the general path for reading
if (is.null(path) | path == "") {
path <- file.path(getwd(), file)
} else {
if (grepl("http", path)) {
if (grepl("/$", path)) {
path <- paste0(path, file)
} else {
path <- paste(path, file, sep = "/")
}
} else {
path <- file.path(path, file)
if (!file.exists(path))
stop("The file name provided could not be found in the directory provided")
}
}
## DOWNLOADING THE DATA DIRECTLY FROM GBIF
if (grepl("http", path)) {
destfile.dwca <- file.path(dir.name, file)
if (file.exists(destfile.dwca)) {
warning("A zip file with the same name already exists in 'dir.name' and was not downloaded")
} else {
dwca <- utils::download.file(path, destfile = destfile.dwca,
method = method, cacheOK = TRUE)
}
} else {
destfile.dwca <- path
}
## UNZIPPING THE DATA IN THE LOCAL DIRECTORY ##
dir.name.unzip <- file.path(dir.name, gsub(".zip", "", file, fixed = TRUE))
if (file.exists(dir.name.unzip)) {
if (length(list.files(dir.name.unzip)) == 0) {
dir.create(dir.name.unzip)
zip.files <- utils::unzip(zipfile = destfile.dwca, list = TRUE)$Name
zip.files <- zip.files[grepl(paste0(output,".txt", collapse = "|"), zip.files)]
cat("Unzipping the DwC-A file... ", sep = "")
dwca.unzip <- utils::unzip(zipfile = destfile.dwca,
exdir = dir.name.unzip,
files = zip.files)
cat("done!\n", sep = "")
} else {
warning("An unzipped, non-empty folder for this file already exists and was not overwritten")
}
} else {
dir.create(dir.name.unzip)
zip.files <- utils::unzip(zipfile = destfile.dwca, list = TRUE)$Name
zip.files <- zip.files[grepl(paste0(output,".txt", collapse = "|"), zip.files)]
cat("Unzipping the DwC-A file... ", sep = "")
dwca.unzip <- utils::unzip(zipfile = destfile.dwca,
exdir = dir.name.unzip,
files = zip.files)
cat("done!\n", sep = "")
}
## READING THE UNZIPPED FILES FROM LOCAL DIRECTORY ##
all.files <- list.files(dir.name.unzip, full.names = TRUE)
occurrence <- verbatim <- multimedia <- citations <- rights <- metadata <- NULL
## Reading the txt files
if ("occurrence" %in% output) {
cat("Reading the unzipped occurrences... \n", sep = "")
occ.path <- all.files[grepl("occurrence.txt", all.files)]
occ.data <- data.table::fread(occ.path,
na.strings = na.strings,
quote = quote,
sep = sep)
if (any(grepl("verbatim.txt", all.files))) {
verb.path <- all.files[grepl("verbatim.txt", all.files)]
verb.data <- data.table::fread(verb.path,
na.strings = na.strings,
quote = quote,
sep = sep)
} else {
verb.data <- NULL
}
cat("done!\n", sep = "")
}
if ("multimedia" %in% output) {
cat("Reading the unzipped multimedia... ", sep = "")
mult.path <- all.files[grepl("multimedia.txt", all.files)]
mult.data <- data.table::fread(mult.path, na.strings=c("NA"))
cat("done!\n", sep = "")
}
if ("citations" %in% output) {
cat("Reading the unzipped citations... ", sep = "")
cite.path <- all.files[grepl("citations.txt", all.files)]
citations <- scan(cite.path, what = "character", sep = "\n",
fileEncoding = "UTF-8", quiet = TRUE)[-1]
citations <- data.frame(citations = citations, stringsAsFactors = FALSE)
cat(scan(cite.path, what = "character", sep = "\n",
fileEncoding = "UTF-8", quiet = TRUE)[1], "\n")
cat("done!\n", sep = "")
}
if ("rights" %in% output) {
cat("Reading the unzipped data rights... ", sep = "")
right.path <- all.files[grepl("rights.txt", all.files)]
rights0 <- scan(right.path, what = "character", sep = "\n",
fileEncoding = "UTF-8", quiet = TRUE)
rights <- cbind.data.frame(Dataset = rights0[grepl("Dataset", rights0, fixed = TRUE)],
Rights = rights0[grepl("Rights as supplied", rights0, fixed = TRUE)])
rights$Dataset <- gsub("^Dataset\\: ", "", rights$Dataset, perl = TRUE)
rights$Rights <- gsub("^Rights as supplied\\: ", "", rights$Rights, perl = TRUE)
cat("done!\n", sep = "")
}
## Reading the xml files (currently not implemented)
# if ("metadata" %in% output) {
# meta.path <- all.files[grepl("meta.xml", all.files)]
# meta <- EML:::read_eml(meta.path)
#
# metadata.path <- all.files[grepl("metadata.xml", all.files)]
# metadata <- EML:::read_eml(metadata.path)
#
# dataset.paths <- list.files(
# all.files[grepl("dataset$", all.files, perl = TRUE)], full.names = TRUE)
# datasets <- lapply(dataset.paths, EML::read_eml)
# }
## MERGING OCCURRENCE AND VERBATIM DATA ##
if ("occurrence" %in% output) {
if (bind.data) {
if (!is.null(verb.data)) {
cols <- !names(verb.data) %in% names(occ.data)
verb.data <- verb.data[, cols, with = FALSE]
occurrence <- cbind(occ.data, verb.data)
} else {
occurrence <- occ.data
}
output <- output[!grepl("verbatim", output)]
} else {
occurrence <- occ.data
verbatim <- verb.data
}
}
## PUTTING THE INFORMATION TOGETHER
all.data <- list(occurrence = occurrence, verbatim = verbatim,
multimedia = multimedia, citations = citations, rights = rights)
all.data <- all.data[!sapply(all.data, is.null)]
## SAVING THE SELECTED INFORMATION IN THE LOCAL FILE
if (save) {
if (file.format == "csv") {
if (compress) {
cat("Saving compressed occurrence data... ", sep = "")
for (i in 1:length(all.data)) {
df <- as.data.frame(all.data[[i]])
nome <- names(all.data)[i]
save.path <- file.path(dir.name.unzip, paste0(nome, ".csv.zip"))
data.table::fwrite(df, file = save.path, compress = "gzip")
}
cat("saved!", sep = "")
} else {
cat("Saving occurrence data... ", sep = "")
for (i in 1:length(all.data)) {
df <- as.data.frame(all.data[[i]])
nome <- names(all.data)[i]
save.path <- file.path(dir.name.unzip, paste0(nome, ".csv"))
data.table::fwrite(df, file = save.path)
}
cat("saved!", sep = "")
}
}
if (file.format == "rds") {
if (compress) {
cat("Saving compressed occurrence data... ", sep = "")
for (i in 1:length(all.data)) {
df <- as.data.frame(all.data[[i]])
nome <- names(all.data)[i]
save.path <- file.path(dir.name.unzip, paste0(nome, ".rds"))
saveRDS(df, file = save.path, compress = "gzip")
}
cat("saved!\n", sep = "")
} else {
cat("Saving occurrence data... ", sep = "")
for (i in 1:length(all.data)) {
df <- as.data.frame(all.data[[i]])
nome <- names(all.data)[i]
save.path <- file.path(dir.name.unzip, paste0(nome, ".rds"))
saveRDS(df, file = save.path, compress = FALSE)
}
cat("saved!\n", sep = "")
}
}
}
## REMOVING THE DOWNLOADED/UNZIPPED FILES
if (grepl("http", path))
file.remove(file.path(dir.name, file))
file.remove(all.files[grepl("txt$|xml$", all.files)])
if (length(list.files(dir.name.unzip)) == 0)
unlink(dir.name.unzip, recursive = TRUE)
if (length(list.files(dir.name)) == 0) {
dir.name0 <- gsub(paste0(file_sep,"$"),"", dir.name)
unlink(dir.name0, recursive = TRUE)
}
## RETURNING THE SELECTED OUTPUT
selected.data <- all.data[output]
ids.dt <- sapply(selected.data,
function (x) identical(class(x), c("data.table","data.frame")))
if (any(ids.dt))
for(i in which(ids.dt == TRUE))
selected.data[[i]] <- as.data.frame(selected.data[[i]])
if (length(output) > 1)
return(selected.data)
if (length(output) == 1)
return(selected.data[[output]])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.