#' Remove duplicate group checklists
#'
#' eBird checklists can be shared among a group of multiple observers, in which
#' case the same observations appear several times in the database. This
#' function removes these duplicates from the eBird Basic Dataset (EBD) or from
#' the EBD sampling event data (with `checklists_only = TRUE`), leaving a set of
#' unique bird observations. This function is called automatically by
#' [read_ebd()] and [read_sampling()].
#'
#' @param x data.frame; the EBD data frame, typically as imported by
#'   [read_ebd()].
#' @param group_id character; the name of the group ID column. Records with a
#'   missing (`NA`) value in this column are treated as non-group records and
#'   are always retained.
#' @param checklist_id character; the name of the checklist ID column. Each
#'   checklist within a group has its own value for this field, and the record
#'   with the lowest `checklist_id` (by character sort order) is picked as the
#'   unique record within each group.
#' @param species_id character; the name of the column identifying species
#'   uniquely. This is required to ensure that duplicates are removed
#'   independently for each species. Note that this will not treat subspecies
#'   independently; if that behavior is desired, generate a column uniquely
#'   identifying species and subspecies and pass that column's name to this
#'   argument.
#' @param checklists_only logical; whether the dataset provided only contains
#'   checklist information, as with the sampling event data file. If this
#'   argument is `TRUE`, the `species_id` argument is ignored and duplicates are
#'   removed at the checklist level rather than the species level.
#'
#' @details Within each group, this function keeps the record from the
#'   checklist that has the lowest value in the field specified by
#'   `checklist_id`. A new column, `checklist_id`, is also created; its value is
#'   taken from the field specified by the `checklist_id` parameter for
#'   non-group checklists and from the field specified by the `group_id`
#'   parameter for grouped checklists. In the result, non-group records come
#'   first, followed by the retained group records sorted by checklist ID.
#'
#' @return A data.frame of unique observations with the same class as `x` and
#'   an additional leading field, `checklist_id`, which is a combination of the
#'   sampling event and group IDs.
#' @export
#' @examples
#' # read in an ebd file and don't automatically remove duplicates
#' ebd <- system.file("extdata/ebd-sample.txt", package = "auk") %>%
#'   read_ebd(unique = FALSE)
#' # remove duplicates
#' ebd_unique <- auk_unique(ebd)
#' nrow(ebd)
#' nrow(ebd_unique)
auk_unique <- function(x,
                       group_id = "group_identifier",
                       checklist_id = "sampling_event_identifier",
                       species_id = "scientific_name",
                       checklists_only = FALSE) {
  # argument checks, performed one after another so the first violated
  # condition is the one reported
  assertthat::assert_that(
    is.data.frame(x),
    assertthat::is.flag(checklists_only))
  assertthat::assert_that(
    assertthat::is.string(group_id),
    group_id %in% names(x))
  assertthat::assert_that(
    assertthat::is.string(checklist_id),
    checklist_id %in% names(x))
  # the species column is only required when working at the species level
  assertthat::assert_that(
    assertthat::is.string(species_id),
    checklists_only || species_id %in% names(x))
  # every id column must hold character data
  assertthat::assert_that(
    is.character(x[[group_id]]),
    is.character(x[[checklist_id]]),
    checklists_only || is.character(x[[species_id]]))

  # remember the incoming class so it can be restored on the result
  input_class <- class(x)

  # a record belongs to a shared checklist when it carries a group id
  is_grouped <- !is.na(x[[group_id]])

  # shared records, ordered so the lowest checklist id comes first in a group
  shared <- x[is_grouped, ]
  shared <- shared[order(shared[[checklist_id]]), ]

  # one record per group, or per species within a group for observation data
  key_cols <- if (checklists_only) group_id else c(species_id, group_id)
  is_repeat <- duplicated(shared[, key_cols])
  shared <- shared[!is_repeat, ]
  # retained group records are identified by their group id
  shared$checklist_id <- shared[[group_id]]

  # stand-alone records are all kept and identified by their own checklist id
  solo <- x[!is_grouped, ]
  solo$checklist_id <- solo[[checklist_id]]

  # stack stand-alone records on top of the de-duplicated group records
  result <- rbind(solo, shared)

  # put the new id column first, leaving the other columns in place
  other_cols <- setdiff(names(result), "checklist_id")
  result <- result[, c("checklist_id", other_cols)]

  # restore the original class
  class(result) <- input_class
  result
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.