#' Reference to eBird data file
#'
#' Create a reference to an eBird Basic Dataset (EBD) file in preparation for
#' filtering using AWK.
#'
#' @param file character; input file. If file is not found as specified, it will
#' be looked for in the directory specified by the `EBD_PATH` environment
#' variable.
#' @param file_sampling character; optional input sampling event data (i.e.
#' checklists) file, required if you intend to zero-fill the data to produce a
#' presence-absence data set. This file consists of just effort information
#' for every eBird checklist. Any species not appearing in the EBD for a given
#' checklist is implicitly considered to have a count of 0. This file should
#' be downloaded at the same time as the basic dataset to ensure they are in
#' sync. If file is not found as specified, it will be looked for in the
#' directory specified by the `EBD_PATH` environment variable.
#' @param sep character; the input field separator, the eBird data are tab
#' separated so this should generally not be modified. Must only be a single
#' character and space delimited is not allowed since spaces appear in many of
#' the fields.
#'
#' @details
#' eBird data can be downloaded as a tab-separated text file from the
#' [eBird website](http://ebird.org/ebird/data/download) after submitting a
#' request for access. As of February 2017, this file is nearly 150 GB making it
#' challenging to work with. If you're only interested in a single species or a
#' small region it is possible to submit a custom download request. This
#' approach is suggested to speed up processing time.
#'
#' There are two potential pathways for preparing eBird data. Users wishing to
#' produce presence only data, should download the
#' [eBird Basic Dataset](http://ebird.org/ebird/data/download/) and reference
#' this file when calling `auk_ebd()`. Users wishing to produce zero-filled,
#' presence absence data should additionally download the sampling event data
#' file associated with the basic dataset This file contains only checklist
#' information and can be used to infer absences. The sampling event data file
#' should be provided to `auk_ebd()` via the `file_sampling` argument. For
#' further details consult the vignettes.
#'
#' @return An `auk_ebd` object storing the file reference and the desired
#' filters once created with other package functions.
#' @export
#' @family objects
#' @examples
#' # get the path to the example data included in the package
#' # in practice, provide path to ebd, e.g. f <- "data/ebd_relFeb-2018.txt
#' f <- system.file("extdata/ebd-sample.txt", package = "auk")
#' auk_ebd(f)
#' # to produce zero-filled data, provide a checklist file
#' f_ebd <- system.file("extdata/zerofill-ex_ebd.txt", package = "auk")
#' f_cl <- system.file("extdata/zerofill-ex_sampling.txt", package = "auk")
#' auk_ebd(f_ebd, file_sampling = f_cl)
auk_ebd <- function(file, file_sampling, sep = "\t") {
# checks
assertthat::assert_that(
assertthat::is.string(sep), nchar(sep) == 1, sep != " "
)
file <- ebd_file(file)
# read header rows
header <- tolower(get_header(file, sep))
header <- stringr::str_replace_all(header, "[^a-z0-9]+", " ")
# fix for custom download
header[header == "state province"] <- "state"
header[header == "subnational1 code"] <- "state code"
col_idx <- data.frame(id = NA_character_,
name = header,
index = seq_along(header),
stringsAsFactors = FALSE)
# ensure key columns are present
mandatory <- c("scientific name",
"country code", "state code",
"latitude", "longitude",
"observation date", "time observations started",
"protocol type",
"exotic code",
"duration minutes", "effort distance km",
"all species reported",
"observer id",
"sampling event identifier", "group identifier")
col_miss <- mandatory[!(mandatory %in% header)]
if (length(col_miss) > 0) {
m <- sprintf("Required columns missing from the EBD file:\n\t%s",
paste(col_miss, collapse = "\n\t"))
stop(m)
}
# identify columns required for filtering
filter_cols <- data.frame(
id = c("species",
"country", "state", "county", "bcr",
"lat", "lng",
"date", "time", "last_edited",
"protocol", "project",
"duration", "distance",
"breeding", "exotic",
"complete",
"observer"),
name = c("scientific name",
"country code", "state code", "county code", "bcr code",
"latitude", "longitude",
"observation date", "time observations started",
"last edited date",
"protocol type", "project code",
"duration minutes", "effort distance km",
"breeding code",
"exotic code",
"all species reported",
"observer id"),
stringsAsFactors = FALSE)
filter_cols <- filter_cols[filter_cols$name %in% col_idx$name, ]
col_idx$id[match(filter_cols$name, col_idx$name)] <- filter_cols$id
# process sampling data header
if (!missing(file_sampling)) {
file_sampling <- ebd_file(file_sampling)
# variables not in sampling data
not_in_sampling <- c("species", "breeding", "exotic")
filter_cols_sampling <- filter_cols[!filter_cols$id %in% not_in_sampling, ]
# read header rows
header_sampling <- tolower(get_header(file_sampling, sep))
# ensure key columns are present
mandatory_sampl <- setdiff(mandatory, "scientific name")
col_miss <- mandatory_sampl[!(mandatory_sampl %in% header)]
if (length(col_miss) > 0) {
m <- sprintf("Required columns missing from the sampling file:\n\t%s",
paste(mandatory, collapse = "\n\t"))
stop(m)
}
# identify column locations
col_idx_sampling <- data.frame(id = NA_character_,
name = header_sampling,
index = seq_along(header_sampling),
stringsAsFactors = FALSE)
col_found <- filter_cols_sampling$name %in% col_idx$name
filter_cols_sampling <- filter_cols_sampling[col_found, ]
mtch <- match(filter_cols_sampling$name, col_idx_sampling$name)
col_idx_sampling$id[mtch] <- filter_cols_sampling$id
} else {
file_sampling <- NULL
col_idx_sampling <- NULL
}
# output
structure(
list(
file = file,
file_sampling = file_sampling,
output = NULL,
output_sampling = NULL,
col_idx = col_idx,
col_idx_sampling = col_idx_sampling,
filters = list(
species = character(),
country = character(),
state = character(),
county = character(),
bcr = integer(),
bbox = numeric(),
year = integer(),
date = character(),
time = character(),
last_edited = character(),
protocol = character(),
project = character(),
duration = numeric(),
distance = numeric(),
breeding = FALSE,
exotic = character(),
complete = FALSE,
observer = character()
)
),
class = "auk_ebd"
)
}
#' @export
print.auk_ebd <- function(x, ...) {
cat("Input \n")
cat(paste(" EBD:", x$file, "\n"))
if (!is.null(x$file_sampling)) {
cat(paste(" Sampling events:", x$file_sampling, "\n"))
}
cat("\n")
cat("Output \n")
if (is.null(x$output)) {
cat(" Filters not executed\n")
} else {
cat(paste(" EBD:", x$output, "\n"))
if (!is.null(x$output_sampling)) {
cat(paste(" Sampling events:", x$output_sampling, "\n"))
}
}
cat("\n")
cat("Filters \n")
# species filter
cat(" Species: ")
if (length(x$filters$species) == 0) {
cat("all")
} else if (length(x$filters$species) <= 10) {
cat(paste(x$filters$species, collapse = ", "))
} else {
cat(paste0(length(x$filters$species), " species"))
}
cat("\n")
# country filter
cat(" Countries: ")
if (length(x$filters$country) == 0) {
cat("all")
} else if (length(x$filters$country) <= 10) {
cat(paste(x$filters$country, collapse = ", "))
} else {
cat(paste0(length(x$filters$country), " countries"))
}
cat("\n")
# state filter
cat(" States: ")
if (length(x$filters$state) == 0) {
cat("all")
} else if (length(x$filters$state) <= 10) {
cat(paste(x$filters$state, collapse = ", "))
} else {
cat(paste0(length(x$filters$state), " states"))
}
cat("\n")
# county filter
cat(" Counties: ")
if (length(x$filters$county) == 0) {
cat("all")
} else if (length(x$filters$county) <= 10) {
cat(paste(x$filters$county, collapse = ", "))
} else {
cat(paste0(length(x$filters$county), " counties"))
}
cat("\n")
# bcr filter
cat(" BCRs: ")
if (length(x$filters$bcr) == 0) {
cat("all")
} else if (length(x$filters$bcr) <= 10) {
cat(paste(x$filters$bcr, collapse = ", "))
} else {
cat(paste0(length(x$filters$bcr), " BCRs"))
}
cat("\n")
# bbox filter
cat(" Bounding box: ")
e <- round(x$filters$bbox, 1)
if (length(e) == 0) {
cat("full extent")
} else {
cat(paste0("Lon ", e[1], " - ", e[3], "; "))
cat(paste0("Lat ", e[2], " - ", e[4]))
}
cat("\n")
# year filter
cat(" Years: ")
if (length(x$filters$year) == 0) {
cat("all")
} else if (length(x$filters$year) <= 10) {
cat(paste(x$filters$year, collapse = ", "))
} else {
cat(paste0(length(x$filters$year), " years"))
}
cat("\n")
# date filter
cat(" Date: ")
if (length(x$filters$date) == 0) {
cat("all")
} else {
cat(paste0(x$filters$date[1], " - ", x$filters$date[2]))
}
cat("\n")
# time filter
cat(" Start time: ")
if (length(x$filters$time) == 0) {
cat("all")
} else {
cat(paste0(x$filters$time[1], "-", x$filters$time[2]))
}
cat("\n")
# last edited date filter
cat(" Last edited date: ")
if (length(x$filters$last_edited) == 0) {
cat("all")
} else {
cat(paste0(x$filters$last_edited[1], " - ", x$filters$last_edited[2]))
}
cat("\n")
# protocol filter
cat(" Protocol: ")
if (length(x$filters$protocol) == 0) {
cat("all")
} else {
cat(paste(x$filters$protocol, collapse = ", "))
}
cat("\n")
# project filter
cat(" Project code: ")
if (length(x$filters$project) == 0) {
cat("all")
} else {
cat(paste(x$filters$project, collapse = ", "))
}
cat("\n")
# duration filter
cat(" Duration: ")
if (length(x$filters$duration) == 0) {
cat("all")
} else {
cat(paste0(x$filters$duration[1], "-", x$filters$duration[2], " minutes"))
}
cat("\n")
# distance filter
cat(" Distance travelled: ")
if (length(x$filters$distance) == 0) {
cat("all")
} else {
cat(paste0(x$filters$distance[1], "-", x$filters$distance[2], " km"))
}
cat("\n")
# breeding codes
cat(" Records with breeding codes only: ")
if (x$filters$breeding) {
cat("yes")
} else {
cat("no")
}
cat("\n")
# exotic code
cat(" Exotic Codes: ")
if (length(x$filters$exotic) %in% c(0, 4)) {
cat("all")
} else {
ex_codes <- dplyr::recode(x$filters$exotic,
"N" = "Naturalized",
"P" = "Provisional",
"X" = "Escapee")
ex_codes <- ifelse(ex_codes == "", "Native", ex_codes)
cat(paste(ex_codes, collapse = ", "))
}
cat("\n")
# complete checklists only
cat(" Complete checklists only: ")
if (x$filters$complete) {
cat("yes")
} else {
cat("no")
}
cat("\n")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.