#' Prepare and clean site index file
#'
#' A site index file contains information on when specific ARUs were deployed
#' where. This function cleans a file (csv, xlsx) or data frame in preparation
#' for adding these details to the output of `clean_metadata()`. It can be used
#' to specify missing information according to date, such as GPS lon/lats and
#' site ids.
#'
#' Note that times are assumed to be in 'local' time and a timezone isn't used
#' (and is removed if present, replaced with UTC). This allows sites
#' from different timezones to be processed at the same time.
#'
#' @param site_index Data frame (can be spatial) or file path. Site index data
#' to clean. If file path, must be to a local csv or xlsx file.
#' @param col_aru_id Character. Column name that contains ARU ids. Default `aru_id`.
#' @param col_site_id Character. Column name that contains site ids.
#' @param col_date_time Character. Column name that contains dates or
#' date/times. Can be vector of two names if there are both 'start' and 'end'
#' columns. Can be `NULL` to ignore dates.
#' @param col_coords Character. Column names that contain longitude and
#' latitude (in that order). Ignored if `site_index` is spatial.
#' @param col_extra Character. Column names for extra data to include. If a named
#' vector, will rename the columns (see examples).
#' @param resolve_overlaps Logical. Whether or not to resolve date overlaps by
#' shifting the start/end dates to noon (default `TRUE`). This assumes that
#' ARUs are generally *not* deployed/removed at midnight (the official
#' start/end of a day) and so noon is used as an approximation for when an ARU
#' was deployed or removed. If possible, use specific deployment times to
#' avoid this issue.
#'
#' @inheritParams common_docs
#'
#' @return Standardized site index data frame
#' @export
#'
#' @examples
#'
#' s <- clean_site_index(example_sites,
#' col_aru_id = "ARU",
#' col_site_id = "Sites",
#' col_date_time = c("Date_set_out", "Date_removed"),
#' col_coords = c("lon", "lat"))
#'
#' s <- clean_site_index(example_sites,
#' col_aru_id = "ARU",
#' col_site_id = "Sites",
#' col_date_time = c("Date_set_out", "Date_removed"),
#' col_coords = c("lon", "lat"),
#' col_extra = c("plot" = "Plots"))
#'
#' # Without dates
#' eg <- dplyr::select(example_sites, -Date_set_out, -Date_removed)
#' s <- clean_site_index(eg,
#' col_aru_id = "ARU",
#' col_site_id = "Sites",
#' col_date_time = NULL,
#' col_coords = c("lon", "lat"),
#' col_extra = c("plot" = "Plots"))
#'
clean_site_index <- function(site_index,
col_aru_id = "aru_id",
col_site_id = "site_id",
col_date_time = "date",
col_coords = c("longitude", "latitude"),
col_extra = NULL,
resolve_overlaps = TRUE,
quiet = FALSE) {
# Checks
check_df_file(site_index)
check_text(col_aru_id, n = 1)
check_text(col_site_id, n = 1)
check_text(col_date_time, n = c(1, 2), not_null = FALSE)
check_text(col_extra, not_null = FALSE)
check_logical(resolve_overlaps)
is_sf <- inherits(site_index, "sf")
if(!is_sf) {
check_text(col_coords, n = 2)
} else {
check_points(site_index)
col_coords <- NULL
}
# Format different inputs
if(is_sf) {
# SF - (create tibble sf https://github.com/r-spatial/sf/issues/951#issuecomment-455735564)
site_index <- dplyr::as_tibble(site_index) |> sf::st_as_sf()
} else if(is.data.frame(site_index)) {
# Data frames
site_index <- suppressMessages(readr::type_convert(site_index)) |>
dplyr::as_tibble()
} else {
# Files
type <- fs::path_ext(site_index)
check_ext(type, c("csv", "xlsx"))
# Let readr do the index checking
if(type == "csv") {
site_index <- readr::read_csv(site_index,
progress = FALSE,
show_col_types = FALSE)
} else if(type == "xlsx") {
site_index <- readxl::read_excel(site_index, progress = FALSE)
}
}
site_index <- dplyr::rename_with(site_index, tolower)
# Check cols
check_cols(site_index, c(col_site_id, col_date_time, col_aru_id, col_coords,
col_extra),
name = "site_index",
extra = "See ?clean_site_index")
# Check dates
check_dates(site_index, col_date_time)
if(is.null(col_date_time)) {
d <- dt <- NULL
} else if(length(col_date_time) == 1) {
dt <- "date_time"
d <- "date"
} else {
dt <- c("date_time_start", "date_time_end")
d <- c("date_start", "date_end")
}
# Prepare for renaming
if(length(names(col_extra)) == 0) {
col_extra <- stats::setNames(nm = col_extra)
}
cols <- c(
"site_id" = col_site_id,
"aru_id" = col_aru_id,
stats::setNames(col_date_time, dt),
if(!is.null(col_coords)) stats::setNames(col_coords, c("longitude", "latitude")),
col_extra) |>
tolower()
# Check and force time zones to UTC if required
site_index <- check_UTC(site_index, cols[dt])
# Rename and fix dates
site_index <- site_index |>
# Grab and rename columns
dplyr::select(dplyr::all_of(cols)) |>
# Get dates
dplyr::mutate(
dplyr::across(dplyr::all_of(dt), ~lubridate::as_datetime(.x) |> lubridate::force_tz("UTC")),
dplyr::across(dplyr::all_of(dt), lubridate::as_date, .names = "{d}")) |>
dplyr::relocate(dplyr::any_of(c("longitude", "latitude", "geometry")),
.after = dplyr::last_col()) |>
dplyr::relocate(dplyr::any_of(names(col_extra)), .after = dplyr::last_col())
# For date ranges, check if only using dates
if(resolve_overlaps &&
length(dt) == 2 &&
all(site_index$date_time_start == site_index$date_start) &&
all(site_index$date_time_end == site_index$date_end)) {
site_index$date_time_end %in% site_index$date_time_start
by_site <- dplyr::group_by(site_index, .data$site_id) |>
dplyr::filter(.data$date_time_end %in% .data$date_time_start) |>
nrow()
by_aru <- dplyr::group_by(site_index, .data$aru_id) |>
dplyr::filter(.data$date_time_end %in% .data$date_time_start) |>
nrow()
if(by_site > 0 | by_aru > 0) {
lubridate::hour(site_index$date_time_start) <- 12
lubridate::hour(site_index$date_time_end) <- 12
if(!quiet) {
rlang::inform(
c("There are overlapping date ranges",
"Shifting start/end times to 'noon'",
#"Use `by_date = \"date_time\"` in `add_sites()`",
"Skip this with `resolve_overlaps = FALSE`"))
}
}
}
site_index
}
#' Check and clean GPS data
#'
#' Check and clean GPS data from ARU logs. GPS points are checked for obvious
#' problems (expected range, distance cutoffs and timing) then attached to the
#' meta data frame. Note that it is often safer and more reliable to create
#' your own Site Index file including site ids, and GPS coordinates. This file
#' can be cleaned and prepared with `clean_site_index()` instead.
#'
#' If checking for a maximum distance (`dist_cutoff`) among GPS points within a
#' group (`dist_by`), the returned data frame will include a column `max_dist`,
#' which represents the largest distance among points within that group.
#'
#' @param meta Data frame. Output of `clean_metadata()`.
#' @param dist_cutoff Numeric. Maximum distance (m) between GPS points within a
#' site. Default is 100m but can be set to `Inf` to skip.
#' @param dist_crs Numeric. Coordinate Reference System to use when calculating
#' distance (should be one with m).
#' @param dist_by Character. Column which identifies sites within which to
#' compare distance among GPS points. Only valid if `dist_cutoff` is not
#' `Inf`.
#' @param verbose Logical. Show extra loading information. Default `FALSE`.
#'
#' @inheritParams common_docs
#'
#' @return Data frame of site-level metadata.
#' @export
#'
#' @examples
#'
#' \dontrun{
#' m <- clean_metadata(project_dir = "my_project")
#' g <- clean_gps(meta = m)
#' }
clean_gps <- function(meta = NULL,
dist_cutoff = 100, dist_crs = 3161,
dist_by = c("site_id", "aru_id"),
quiet = FALSE, verbose = FALSE) {
# Checks
check_data(meta, type = "meta", ref = "clean_metadata()")
check_num(dist_cutoff, n = 1)
check_num(dist_crs, n = 1)
check_text(dist_by)
check_logical(verbose)
#meta <- check_UTC(meta) # Technically not needed at this step... worth doing anyway?
# Load, combine and clean gps files
gps <- clean_gps_files(meta, quiet, verbose)
# Check distances (skips if dist_cutoff = Inf)
gps <- check_gps_dist(gps, crs = dist_crs, dist_cutoff = dist_cutoff,
dist_by = dist_by, quiet = quiet)
# Flag problems
n <- nrow(gps)
f_dt <- sum(is.na(gps$date_time))
f_coord <- sum(is.na(gps$longitude) | is.na(gps$latitude))
f_zero <- sum(gps$longitude == 0 | gps$latitude == 0, na.rm = TRUE)
f_gpx <- sum(gps$gps_ext == "gpx" &
is.na(gps$date_time) & is.na(gps$date) &
is.na(gps$longitude) & is.na(gps$latitude))
f_header <- sum(gps$problems_dt | gps$problems_tm | gps$problems_ll, na.rm = TRUE)
if(any(c(f_dt, f_coord, f_zero, f_gpx, f_header) > 0)) {
msg <- c("Identified possible problems with GPS extraction:")
msg <- c(msg, report_missing(f_dt, n, "date/times"))
msg <- c(msg, report_missing(f_coord, n, "coordinates"))
if(f_zero > 0) {
msg <- c(
msg,
"Some coordinates detected as zero (can occur in Song Meters if not set)",
paste0("Replacing zero coordinates with NA (", f_zero, "/", n, ")"))
gps <- dplyr::mutate(
gps, dplyr::across(c("longitude", "latitude"), ~dplyr::na_if(.x, 0)))
}
msg <- c(msg, report_missing(f_header, n, "headers (in text GPS files)"))
msg <- c(msg, report_missing(f_gpx, n, "GPX files", "extracted"))
rlang::inform(msg)
}
dplyr::select(gps, -dplyr::starts_with("problems_"))
}
clean_gps_files <- function(meta, quiet, verbose) {
gps <- dplyr::filter(meta, .data$type == "gps")
if(nrow(gps) == 0) {
rlang::abort(
"No GPS data provided and no GPS log files recorded in `meta`",
call = NULL)
}
if(!quiet) {
rlang::inform(
c("Note: GPS log files can be unreliable... ",
"Consider supplying your own GPS records and using `clean_site_index()`"))
}
if(!quiet) {
p1 <- list(
show_after = 0,
format = "Loading GPS files {cli::pb_percent} [{cli::pb_elapsed}]")
p2 <- list(
show_after = 0,
format = "Formating GPS files {cli::pb_percent} [{cli::pb_elapsed}]")
} else p1 <- p2 <- FALSE
gps |>
# Check columns and get skips for non-GPX files
check_gps_files() |>
dplyr::mutate(
# Read files
gps_data = purrr::pmap(
list(.data$path, .data$skip, .data$gps_ext),
~load_gps(..1, ..2, ..3, verbose = verbose),
.progress = p1),
# Format data
gps_data = purrr::map2(
.data$gps_data, .data$gps_ext, fmt_gps,
.progress = p2)) |>
# Clean up
dplyr::select(-"date_time", -"date", -"skip") |>
tidyr::unnest("gps_data", keep_empty = TRUE)
}
#' Load GPS from text or gpx
#'
#' @param path Character. File to load
#' @param skip Numeric. Number of lines to skip in text GPS files
#' @param gps_ext Character. Extension of the GPS file (to identify GPX files)
#' @param verbose Logical. Whether to be extra chatty when loading files
#'
#' @noRd
load_gps <- function(path, skip = NA, gps_ext, verbose = FALSE) {
if(gps_ext == "gpx") {
g <- try(sf::st_read(path, layer = "waypoints", quiet = TRUE), silent = TRUE)
} else {
if(is.na(skip)) {
g <- try(stop("No skip", call. = FALSE), silent = TRUE)
} else {
g <- try(readr::read_csv(path, skip = skip - 1, show_col_types = verbose,
guess_max = Inf,
name_repair = "unique_quiet",
progress = FALSE),
silent = TRUE)
}
}
g
}
#' Check text GPS files
#'
#' - Read first 5 lines
#' - Check that can identify column headers
#' - Get the number of lines to skip
#' - Return skips and any problems
#'
#' @param gps_files Character vector. All GPS files (gpx and text)
#'
#' @noRd
check_gps_files <- function(gps_files) {
# Get text-based GPS logs (i.e. anything but GPX files)
gps_files <- dplyr::mutate(gps_files,
gps_ext = tolower(fs::path_ext(.data$path)))
gps_txt <- gps_files |>
dplyr::filter(.data$gps_ext != "gpx") |>
dplyr::pull(.data$path)
if(length(gps_txt) > 0) {
lines <- stats::setNames(nm = gps_txt) |>
purrr::imap(~readr::read_lines(.x, n_max = 5, progress = FALSE))
# Set patterns
opts <- getOption("ARUtools")
pattern_date <- stringr::regex(opts$pat_gps_date, ignore_case = TRUE)
pattern_time <- stringr::regex(opts$pat_gps_time, ignore_case = TRUE)
pattern_coords <- stringr::regex(paste0(opts$pat_gps_coords, collapse = "|"),
ignore_case = TRUE)
# Get skips
skips <- purrr::map(lines,
~stringr::str_which(.x, pattern_coords) |> dplyr::first())
# Check for columns
dt <- purrr::map_lgl(lines, ~!any(stringr::str_detect(.x, pattern_date)))
tm <- purrr::map_lgl(lines, ~!any(stringr::str_detect(.x, pattern_time)))
ll <- is.na(skips) | length(skips) == 0
# Skip if not detected
skips[dt | tm | ll] <- NA_integer_
# Get skips and problems for future reporting
gps_files <- dplyr::tibble(path = names(skips),
skip = unlist(skips),
problems_dt = dt,
problems_tm = tm,
problems_ll = ll) |>
dplyr::full_join(gps_files, by = "path")
} else {
gps_files <- dplyr::mutate(gps_files, skip = NA_integer_,
problems_dt = FALSE,
problems_tm = FALSE,
problems_ll = FALSE)
}
gps_files
}
#' Check column for evidence of coordinate directions
#'
#' Looks for a single character pattern in the column (usually "N" or "E")
#' If found, returns TRUE to identify column as a coordinate direction column
#'
#' @param col Vector to check
#' @param pattern Character pattern to look for
#'
#' @noRd
coord_dir <- function(col, pattern) {
# Not all missing
!all(is.na(col)) &
# all a direction pattern
all(stringr::str_detect(col, paste0("^[", pattern, "]{1}$")), na.rm = TRUE)
}
#' Format loaded GPS coordinates
#'
#' - If there was a loading error, return empty data frame
#' - GPX uses `fmt_gps_gpx()`
#' - Otherwise uses `fmt_gps_txt()`
#'
#' @param df GPS data frame to format
#' @param gps_ext GPS file extension used to identify GPX files
#'
#' @noRd
fmt_gps <- function(df, gps_ext) {
if(inherits(df, "try-error")) {
g <- fmt_gps_empty()
} else if(gps_ext == "gpx") {
g <- fmt_gps_gpx(df)
} else {
g <- fmt_gps_txt(df)
}
dplyr::select(g, "longitude", "latitude", "date", "date_time")
}
fmt_gps_empty <- function() {
dplyr::tibble(date = lubridate::NA_Date_,
date_time = lubridate::NA_POSIXct_,
longitude = NA_real_,
latitude = NA_real_)
}
fmt_gps_gpx <- function(df) {
df |>
sf::st_drop_geometry() |>
dplyr::bind_cols(sf::st_coordinates(df)) |>
dplyr::select("date_time" = "time", "longitude" = "X", "latitude" = "Y") |>
dplyr::mutate(
date_time = lubridate::as_datetime(.data$date_time),
date = lubridate::as_date(.data$date_time),
date_time = dplyr::if_else(lubridate::year(.data$date_time) == -1,
lubridate::NA_POSIXct_,
.data$date_time),
date = dplyr::if_else(lubridate::year(.data$date) == -1,
lubridate::NA_Date_,
.data$date))
}
fmt_gps_txt <- function(df) {
opts <- getOption("ARUtools")
df_fmt <- df |>
# Omit Headings appearing at odd places
dplyr::filter(.data[[names(df)[1]]] != names(df)[[1]]) |>
dplyr::rename(
"longitude" = dplyr::matches(opts$pat_gps_coords[1]),
"latitude" = dplyr::matches(opts$pat_gps_coords[2]),
"date" = dplyr::matches(opts$pat_gps_date),
"time" = dplyr::matches(opts$pat_gps_time)) |>
# Format times
dplyr::mutate(
date_time_chr = paste(.data$date, .data$time),
date_time = lubridate::parse_date_time(
.data$date_time_chr, orders = c("Ymd HMS", "dmY HMS")),
date = lubridate::as_date(.data$date_time),
)
# Fix coords - Check and apply -/+ if N/S/E/W columns present
dir <- dplyr::select(df_fmt, dplyr::where(~coord_dir(.x, "NnSsEeWw")))
if(ncol(dir) > 0) {
df_fmt <- df_fmt |>
dplyr::rename_with(~ "ns", .cols = dplyr::where(~coord_dir(.x, "NnSs"))) |>
dplyr::rename_with(~ "ew", .cols = dplyr::where(~coord_dir(.x, "EeWw"))) |>
# Define direction shift
dplyr::mutate(dplyr::across(
dplyr::any_of(c("ns", "ew")),
~ stringr::str_replace_all(tolower(.x),
c("w" = "-", "e" = "",
"s" = "-", "n" = "")))) |>
# Apply direction shift (i.e. merge)
tidyr::unite("longitude", dplyr::any_of(c("ew", "longitude")), sep = "") |>
tidyr::unite("latitude", dplyr::any_of(c("ns", "latitude")), sep = "")
}
# Clean up
df_fmt |>
dplyr::mutate(longitude = as.numeric(.data$longitude),
latitude = as.numeric(.data$latitude)) |>
dplyr::select("longitude", "latitude", "date", "date_time")
}
#' Check distances between points from GPS log
#'
#' @param gps Data frame of gps sites and coordinates. Requires longitude,
#' latitude, and any columns in `dist_by`.
#' @param crs Numeric. CRS to use for measuring distances. Should be in meters
#' @param dist_cutoff Distance cutoff in meters. Can be set to Inf to avoid this
#' check.
#' @param dist_by Character. Column names to use in grouping GPS points before
#' calculating within group distances.
#' @param quiet Logical. Suppress non-essential messages
#'
#' @return Returns data frame with maximum distances between gps points within a
#' group.
#'
#' @noRd
check_gps_dist <- function(gps, crs, dist_cutoff, dist_by, quiet = FALSE){
if(dist_cutoff < Inf) {
max_dist <- gps |>
dplyr::filter(dplyr::if_all(
dplyr::any_of(c("longitude", "latitude", dist_by)), ~!is.na(.)))
if(nrow(max_dist) == 0) {
if(!is.null(dist_by)) {
dist_by <- paste0(", `", paste0(dist_by, collapse = "`, `"), "`")
} else dist_by <- ""
if(!quiet) {
rlang::inform(
c("Skipping distance check:",
paste0("All records missing at least one of ",
"`longitude`, `latitude`", dist_by)))
}
} else {
n <- max_dist |>
dplyr::select(dplyr::all_of(c("longitude", "latitude", dist_by))) |>
dplyr::distinct() |>
dplyr::count(dplyr::across(dplyr::all_of(dist_by)))
if(all(n$n == 1)) {
if(!quiet) {
rlang::inform(
c("Skipping distance check:",
paste0("No records with more than one set of coordinates per unique `",
paste0(dist_by, collapse = "`/`"), "`")))
}
} else {
max_dist <- max_dist |>
dplyr::select(dplyr::all_of(c(dist_by, "longitude", "latitude"))) |>
dplyr::distinct() |>
sf::st_as_sf(coords= c("longitude", "latitude"), crs = 4326) |>
sf::st_transform(crs) |>
dplyr::group_by(dplyr::across(dplyr::all_of(dist_by))) |>
dplyr::summarize(
max_dist = max(sf::st_distance(.data$geometry, .data$geometry)),
.groups = 'drop') |>
sf::st_drop_geometry()
if(any(max_dist$max_dist > units::set_units(dist_cutoff, "m"))) {
rlang::warn(
c("Within site distances are greater than cutoff",
"x" = paste0("Distances among ARUs within a site must be less than ",
"`dist_cutoff` (currently ", dist_cutoff, "m)"),
"i" = "Set `dist_cutoff` to `Inf` to skip this check (e.g. moving ARUs)"))
}
gps <- dplyr::left_join(gps, max_dist, by = dist_by)
}
}
}
gps
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.