#' Add date and day-of-year field/columns to data
#'
#' Creates and adds columns `date` and `doy` (day-of-year) to the data source
#' (either data frame or database table `naturecounts`).
#'
#' @param df_db Either data frame or a connection to database with
#' `naturecounts` table. Must have fields/columns of `survey_year`,
#' `survey_month`, `survey_day`
#' @param overwrite Logical. Overwrite existing columns `date` and/or `doy`?
#'
#' @return If `df_db`was a data frame, return a data frame with new columns
#' `date` and `doy`. Otherwise return database connection.
#'
#' @examples
#' bcch_with_dates <- format_dates(bcch)
#'
#' @export
format_dates <- function(df_db, overwrite = FALSE) {
r <- df_db_check(df_db, collect = FALSE)
if(inherits(df_db, "data.frame")) r <- format_dates_df(df_db, overwrite)
if(inherits(df_db, "SQLiteConnection")) format_dates_db(df_db, overwrite)
r
}
format_dates_df <- function(df, overwrite) {
if(!overwrite) {
if("date" %in% names(df)) stop("'date' column already exists, and ",
"'overwrite' is FALSE", call. = FALSE)
if("doy" %in% names(df)) stop("'doy' column already exists, and ",
"'overwrite' is FALSE", call. = FALSE)
}
if(!all(c("survey_year", "survey_month", "survey_day") %in% names(df))) {
stop("Missing column 'survey_year', 'survey_month', and/or 'survey_day'",
call. = FALSE)
}
dplyr::mutate(df,
date = lubridate::ymd(paste(.data$survey_year,
.data$survey_month,
.data$survey_day), quiet = TRUE),
doy = lubridate::yday(.data$date))
}
format_dates_db <- function(db, overwrite) {
col_db <- DBI::dbListFields(db, "naturecounts")
# Add columns if don't already exist
if(!"date" %in% col_db) {
DBI::dbExecute(db, "ALTER TABLE naturecounts ADD COLUMN date TEXT;")
} else if("date" %in% col_db & !overwrite) {
stop("'date' field already exists, and ",
"'overwrite' is FALSE", call. = FALSE)
} else {
DBI::dbExecute(db, "UPDATE naturecounts SET date = NULL;")
}
if(!"doy" %in% col_db) {
DBI::dbExecute(db, "ALTER TABLE naturecounts ADD COLUMN doy NUMERIC;")
} else if(!"doy" %in% col_db & !overwrite) {
stop("'doy' field already exists, and ",
"'overwrite' is FALSE", call. = FALSE)
} else {
DBI::dbExecute(db, "UPDATE naturecounts SET doy = NULL;")
}
# Format to date
DBI::dbExecute(db, paste0("UPDATE naturecounts SET ",
"date = survey_year || '-' || ",
"PRINTF('%02d', survey_month) || '-' ",
"|| PRINTF('%02d', survey_day) ",
"WHERE ",
"survey_year IS NOT NULL AND ",
"survey_month IS NOT NULL AND ",
"survey_day IS NOT NULL"))
# Add Day of Year
DBI::dbExecute(db, paste0("UPDATE naturecounts SET ",
"doy = strftime('%j', date);"))
db
}
#' Zero-fill data
#'
#' Zero-fill the species presence data by adding zero observation counts
#' (absences) data to an existing naturecounts dataset.
#'
#' @param df_db Either data frame or a connection to database with
#' `naturecounts` table (a data frame is returned).
#' @param by Character vector. By default, "SamplingEventIdentifier" or a vector
#' of specific column names to fill by (see details)
#' @param species Character vector. Either "all", for species in the data, or a
#' vector of species ID codes to fill in.
#' @param fill Character. The column name to fill in. Defaults to
#' "ObservationCount".
#' @param extra_species Character vector. Extra columns/fields uniquely
#' associated with `species_id` to keep in the data (all columns not in `by`,
#' `species`, `fill`, `extra_species`, or `extra_event` will be omitted from
#' the result).
#' @param extra_event Character vector. Extra columns/fields uniquely associated
#' with the Sampling Event (the field defined by `by`) to keep in the data
#' (all columns not in `by`, `species`, `fill`, `extra_species`, or
#' `extra_event`) will be omitted from the result).
#' @param warn Logical. If TRUE, stop zero-filling if >100 species and >1000
#' unique sampling events. If FALSE, ignore and proceed.
#' @inheritParams args
#'
#' @details `by` refers to the combination of columns which are used to detect
#' missing values. By default `SamplingEventIdentifier` is used. Otherwise
#' users can specify their own combination of columns.
#'
#' If `species` is supplied, all records will be used to determine observation
#' events, but only records (zero-filled or otherwise) which correspond to a
#' species in `species` will be returned (all others will be omitted). Note
#' that records where `species_id` is NA (generally for 0 counts for
#' presence/absence), will be converted to a list of 0's for the individual
#' species.
#'
#' @return Data frame
#'
#' @examples
#' # Download data (with "core" fields to include 'CommonName')
#' sample <- nc_data_dl(collection = c("SAMPLE1", "SAMPLE2"), fields_set = "core",
#' username = "sample", info = "nc_example")
#'
#' # Remove casual observations (i.e. 'AllSpeciesReported' = "No")
#' library(dplyr) # For filter function
#' sample <- filter(sample, AllSpeciesReported == "Yes")
#'
#' # Remove data with "X" ObservationCount (only keep numeric obs)
#' sample <- filter(sample, ObservationCount != "X")
#'
#' # Zero fill by all species present
#' sample_all_zeros <- format_zero_fill(sample)
#'
#' # Zero fill only for Canada Goose
#' goose <- format_zero_fill(sample, species = "230")
#'
#' # Keep species-specific variables
#' goose <- format_zero_fill(sample, species = "230", extra_species = "CommonName")
#'
#' # Keep sampling-event-specific variables
#' coords <- format_zero_fill(sample, extra_event = c("latitude", "longitude"))
#'
#' # By species, keeping extra species variables and event variables
#' goose_coords <- format_zero_fill(sample, species = "230",
#' extra_species = "CommonName",
#' extra_event = c("latitude", "longitude"))
#'
#' # Only return event information
#' events <- format_zero_fill(sample, fill = NA,
#' extra_event = c("latitude", "longitude"))
#'
#'
#' @export
format_zero_fill <- function(df_db, by = "SamplingEventIdentifier",
species = "all", fill = "ObservationCount",
extra_species = NULL,
extra_event = NULL,
warn = TRUE, verbose = TRUE) {
df <- df_db_check(df_db)
# Species ids present?
if(!"species_id" %in% names(df)) {
stop("Column 'species_id' must be present", call. = FALSE)
}
# fill columns present?
if(length(fill) > 1) stop("'fill' can only be one column", .call = FALSE)
if(!fill %in% names(df) && !is.na(fill)) {
stop("'fill' column ('", fill, "') is missing from the data", call. = FALSE)
}
# All species reported?
if(!"AllSpeciesReported" %in% names(df) ||
any(is.na(df$AllSpeciesReported)) ||
any(df$AllSpeciesReported != "Yes")) {
stop("Column 'AllSpeciesReported' must be present and 'Yes'", call. = FALSE)
}
# Select grouping columns
if(any(!by %in% names(df))) {
stop("'by' columns must be present in the ",
"data (missing: ",
paste0(by[!by %in% names(df)], collapse = ", "), ")", call. = FALSE)
}
if("species_id" %in% by) {
stop("The column 'species_id' cannot be in 'by'", call. = FALSE)
}
# Keep extra columns completely associated with 'by'
if(!is.null(extra_species)) {
if(any(!extra_species %in% names(df))) {
stop("Some 'extra_species' are not in the data (",
paste0(extra_species[!extra_species %in% names(df)], collapse = ", "),
")", call. = FALSE)
}
extra_keep <- find_unique(df, "species_id", extra_species)
if(!all(extra_species %in% extra_keep)) {
if(verbose) {
message(" - Ignoring 'extra_species' columns (",
paste0(extra_species[!extra_species %in% extra_keep], collapse = ", "),
") not uniquely ",
"associated with the 'species_id' column")
}
}
extra_species <- dplyr::select(df, "species_id",
tidyselect::all_of(extra_keep)) %>%
dplyr::distinct()
}
# Select species ids
if(species == "all") {
species <- unique(df$species_id)
} else {
species <- codes_check(species)
}
# Check how many species/events there are
if(warn && length(species) > 1000 && nrow(unique(df[by])) > 5000) {
stop("You are trying to zero-fill over 1000 species with over 5000 ",
"sampling events. This could take a while! ",
"To ignore this warning and proceed, set 'warn = FALSE'",
call. = FALSE)
}
# Check if more than one observation per unique column set
if(verbose &&
any(dplyr::count(df, !!!rlang::syms(c(by, "species_id")))$n > 1)) {
message(" - Consider summarizing multiple observations per set of 'by' ",
"before zero-filling to increase speed")
}
# Check for missing by values
if(verbose && any(is.na(df[by]))) {
message(" - There are missing values in 'by'. ",
"These are classified as a single event")
}
# Convert fill column to numeric
if(!is.numeric(df[[fill]]) && !is.na(fill)) {
orig <- class(df[[fill]])
df[[fill]] <- as_numeric(df[[fill]])
if(!is.numeric(df[[fill]])) {
stop("'fill' column cannot be converted to numeric (non-numeric entries)",
call. = FALSE)
}
if(verbose) message(" - Converted 'fill' column (", fill, ") from ",
orig, " to numeric")
}
# Get extra events columns
if(!is.null(extra_event)) {
if(any(!extra_event %in% names(df))) {
stop("Some 'extra_event' are not in the data (",
paste0(extra_event[!extra_event %in% names(df)], collapse = ", "),
")", call. = FALSE)
}
extra_keep <- find_unique(df, by, extra_event)
if(!all(extra_event %in% extra_keep)) {
if(verbose) {
message(" - Ignoring 'extra_event' columns (",
paste0(extra_event[!extra_event %in% extra_keep], collapse = ", "),
") not uniquely ",
"associated with the '", by, "' column")
}
}
extra_event <- dplyr::select(df,
tidyselect::all_of(c(by, extra_keep))) %>%
dplyr::distinct()
}
df_by <- df %>%
dplyr::select(tidyselect::all_of(by)) %>%
dplyr::distinct() %>%
tidyr::expand(!!!rlang::syms(by), species_id = species)
if(!is.na(fill)) {
df_filled <- df %>%
dplyr::select(tidyselect::all_of(by),
"species_id",
tidyselect::all_of(fill)) %>%
dplyr::filter(.data$species_id %in% species) %>%
dplyr::left_join(df_by, ., by = c(by, "species_id"),
multiple = "all") %>%
dplyr::mutate(!!fill := tidyr::replace_na(!!rlang::sym(fill), 0))
} else {
df_filled <- dplyr::select(df, tidyselect::all_of(by)) %>%
dplyr::distinct()
}
if(!is.null(extra_species)) {
df_filled <- dplyr::left_join(df_filled, extra_species, by = "species_id")
}
if(!is.null(extra_event)) {
df_filled <- dplyr::left_join(df_filled, extra_event, by = by)
}
as.data.frame(df_filled)
}
# Grab extra columns also unique to 'by'
find_unique <- function(df, by, extra){
extra[sapply(extra, FUN = function(x) {
nrow(unique(cbind(df[by], df[x]))) == nrow(unique(df[by]))
})]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.