ARUtools: Tools for management of Acoustic Recording Unit (ARU) data

Documented in add_sites

#' Add site-level data to the metadata
#'
#' Uses dates to join site-level data (coordinates and site ids) to the meta
#' data. If the site data have only single dates, then a buffer before and after
#' is used to determine which recordings belong to that site observation. Can
#' join by site ids alone if set `by_date = NULL`.
#'
#' @param sites Data frame. Site-level data from `clean_site_index()`.
#' @param buffer_before Numeric. Number of hours before a deployment in which to
#'   include recordings. `NULL` means include the time up to the last
#'   deployment. Coupled with `buffer_after`, this creates a window around a
#'   date/time in which to join recordings to the site-level data. Ignored if
#'   `sites` has both a start and end column for date/times. Default 0.
#' @param buffer_after Numeric. Number of hours after the deployment in which to
#'   include recordings. `NULL` means include the time up to the next
#'   deployment. Coupled with `buffer_before`, creates a window around a
#'   date/time in which to join recordings to the site-level data. Ignored if
#'   `sites` has both a start and end column for date/times. Default `NULL`.
#' @param by Character. Columns which identify a deployment in `sites` as well
#'   as `meta`, besides date/time, which are used to join the data. Default is
#'   `site_id` and `aru_id`.
#' @param by_date Character. Date/time type to join data by. `date` is faster
#'   but `date_time` is more precise. Default `date_time`. `NULL` means ignore
#'   dates and join only with `by` columns (`dplyr::left_join()`).
#'
#' @inheritParams common_docs
#'
#' @return A data frame of metadata with site-level data joined in.
#' @export
#'
#' @examples
#' m <- clean_metadata(project_files = example_files)
#' s <- clean_site_index(example_sites_clean,
#'                       col_date = c("date_time_start", "date_time_end"))
#' m <- add_sites(m, s)
#'
#' # Without dates (by site only)
#' m <- clean_metadata(project_files = example_files)
#' eg <- dplyr::select(example_sites_clean, -date_time_start, -date_time_end)
#' s <- clean_site_index(eg, col_date_time = NULL)
#' m <- add_sites(m, s, by_date = NULL)
#'
add_sites <- function(meta, sites, buffer_before = 0, buffer_after = NULL,
                      by = c("site_id", "aru_id"),
                      by_date = "date_time",
                      quiet = FALSE) {

  # Checks
  check_data(meta, type = "meta", ref = "clean_metadata()")
  check_data(sites, type = "sites", ref = "clean_sites_index()` or `clean_gps()")
  check_text(by)
  check_text(by_date, opts = c("date", "date_time"), n = 1, not_null = FALSE)
  check_num(buffer_before, not_null = FALSE, n = 1)
  check_num(buffer_after, not_null = FALSE, n = 1)
  meta <- check_UTC(meta)
  sites <- check_UTC(sites, stringr::str_subset(names(sites), "^date_time"))

  # Check 'by's
  by_date_cols <- check_date_joins(sites, by_date, quiet)
  by <- check_by(by, meta, cols_omit = c("date", "date_time"), quiet)

  # If sf, convert to df
  crs <- sf::st_crs(sites) # Hold on for converting back
  sites <- sf_to_df(sites)

  # Clean and prep
  sites <- prep_sites(sites, by, by_date_cols)
  meta <- prep_meta(meta, names(sites), by, by_date_cols, quiet)

  # Check that there are columns to add (check after prep_xxx())
  add <- names(sites)[!names(sites) %in% c(by, by_date_cols)]
  if(length(add) < 1) {
    rlang::abort(paste0(
      "No new columns in `sites` to add to `meta` ",
      "(all columns in `by` or used in date matching)"),
      call = NULL)
  }

  # Do the joins
  if(!is.null(by_date)) {
    meta_sites <- add_sites_date(sites, meta,
                                 by, by_date, by_date_cols,
                                 buffer_before, buffer_after,
                                 quiet)
  } else {
    meta_sites <- dplyr::left_join(meta, sites, by = by)
  }

  # Finalize
  meta_sites |>
    df_to_sf(crs = crs) |>  # If was sf, convert back
    dplyr::arrange(dplyr::across(dplyr::any_of(c(by, by_date_cols, "path"))))
}

#' Calculate buffer date/times around a single date/time
#'
#' Creates a date/time range (t1-t2) within which to match sites. If no buffer
#' provided, uses the time of the previous/next observation.
#'
#' @param df Data frame in which to store buffer times
#' @param buffer_before Numeric. Hours before a date/time to be included
#' @param buffer_after Numeric. Hours after a date/time to be included
#' @param by_date_cols Character. Name of the date/time column (`date` or
#'   `date_time`) to use as the starting point.
#'
#' @return df with added columns t1 and t2 which have the start and end times
#'  for the buffer.
#'
#' @noRd
calc_buffers <- function(df, buffer_before, buffer_after, by_date_cols) {
  # If no buffers, use previous/next observation
  # for first/last, use the equivalent of an -Inf / +Inf date
  d1 <- lubridate::as_datetime("1900-01-01")
  d2 <- lubridate::as_datetime("2999-01-01")

  # Start time
  if(is.null(buffer_before)) {
    df <- dplyr::mutate(
      df, t1 = dplyr::lag(.data[[by_date_cols]], default = .env$d1) + lubridate::seconds(1))
  } else {
    df <- dplyr::mutate(
      df, t1 = .data[[by_date_cols]] - lubridate::hours(.env$buffer_before))
  }

  # End time
  if(is.null(buffer_after)) {
    df <- dplyr::mutate(
      df, t2 = dplyr::lead(.data[[by_date_cols]], default = .env$d2) - lubridate::seconds(1))
  } else {
    df <- dplyr::mutate(
      df, t2 = .data[[by_date_cols]] + lubridate::hours(.env$buffer_after))
  }

  df
}

#' Prepare site data for merging
#'
#' Simplify and reduce the dataset for faster merging
#'
#' - Omit date/time columns which are not used
#'   (i.e. if merging by `date_time`, omit `date`)
#' - Drop NAs
#' - If the input data is from `clean_gps()`, remove unnecessary columns
#' - Keep only unique observations once unnecessary columns have been omitted
#'
#' @param sites Data frame. `sites` data processed in `add_sites()`
#' @param by Character vector. Columns to join by, passed through from
#'   `add_sites()`
#' @param by_date_cols Character vector. Names of date/time columns to join by,
#'   processed in `add_sites()`
#'
#' @return sites data simplified
#'
#' @noRd
prep_sites <- function(sites, by, by_date_cols) {

  # Omit unused dates
  omit_dts <- stringr::str_subset(names(sites), "date")
  omit_dts <- omit_dts[!omit_dts %in% by_date_cols]
  sites <- dplyr::select(sites, -dplyr::all_of(omit_dts))

  # Omit NAs
  sites <- tidyr::drop_na(
    sites,
    dplyr::any_of(c(by_date_cols, "longitude", "latitude")))

  # Fix output of clean_gps()
  if("type" %in% names(sites) && all(sites$type == "gps")) {
    omit <- c("file_name", "type", "path", "aru_type")
    omit <- omit[!omit %in% by]
    sites <- dplyr::select(sites, -dplyr::any_of(omit))
  }

  dplyr::distinct(sites)
}

#' Prepare meta data for merging
#'
#' Subfunction to `add_sites`. Simplify and reduce the dataset for faster
#' merging
#'
#' - Omit gps file records
#' - Omit columns not used in the merging by sets which are already present in
#'   sites (faster merging)
#' - Let users know which columns have been omitted (but only if non-NA)
#'
#' @param meta Data frame. `meta` data processed in `add_sites()`
#' @param cols_sites Character vector. Names of columns in `sites` data
#'   processed in `add_sites()`
#' @param by Character vector. Columns to join by, passed through from
#'   `add_sites()`
#' @param by_date_cols Character vector. Names of date/time columns to join by,
#'   processed in `add_sites()`
#' @param quiet Logical. Whether to show non-critical messages or not.
#'
#' @return meta data simplified
#'
#' @noRd
prep_meta <- function(meta, cols_sites, by , by_date_cols, quiet) {

  meta <- dplyr::filter(meta, .data$type != "gps")

  # Omit columns in both meta and site not in 'by'/'by_date_cols' (allows faster filling)
  omit_cols <- cols_sites[!cols_sites %in% c(by, by_date_cols)]
  omit_cols <- omit_cols[omit_cols %in% names(meta)]

  if(length(omit_cols) > 0) {
    meta <- dplyr::select(meta, -dplyr::all_of(omit_cols))

    # Don't report if all NA in meta
    report <- omit_cols
    for(i in report) if(all(is.na(meta[[i]]))) report <- report[report != i]

    if(length(report) > 0) {
      rlang::inform(c(
        "Some columns in both `meta` and `sites` are not used to join (`by`)",
        "*" = paste0(
          "These columns (`", paste0(omit_cols, collapse = "`, `"), "`) ",
          "will be overwritten in `meta`")
      ))
    }
  }
  meta
}

#' Merge by date/time range
#'
#' Clean up/simplify coordinates (if multiple per record), calculate buffers if
#' required, then merge by date range. Flags recordings with multiple or no
#' matches.
#'
#' @return sites merged into meta data by date range
#'
#' @noRd
add_sites_date <- function(sites, meta, by, by_date, by_date_cols,
                           buffer_before, buffer_after, quiet) {

  # Clean up Dates
  if(by_date == "date") {
    dt_fun <- lubridate::as_date
  } else dt_fun <- lubridate::as_datetime

  sites <- dplyr::mutate(sites, dplyr::across(dplyr::all_of(by_date_cols), dt_fun))

  # Summarize multiple, coordinates within a date by grouping variables
  # Deals with multiple reads of GPS locations (e.g., hourly)
  if(by_date == "date") {
    n <- dplyr::count(
      sites,
      dplyr::across(.cols = c(dplyr::all_of(by), dplyr::matches("date(_(start|end))?"))))
    if(any(n$n > 1)) {

      sites <- sites |>
        dplyr::group_by(dplyr::across(
          .cols = c(dplyr::all_of(by), dplyr::matches("date(_(start|end))?")))) |>
        dplyr::summarize(sd_lon = stats::sd(.data$longitude),
                         sd_lat = stats::sd(.data$latitude),
                         longitude = mean(.data$longitude),
                         latitude = mean(.data$latitude)) |>
        dplyr::ungroup()

      rlang::inform(c(
        paste0("Multiple coordinates per date at each combination of `",
               paste0(by, collapse = "`/`"), "`"),
        paste0("Taking mean coordinates ",
               "(max sd lon ", round(max(sites$sd_lon, na.rm = TRUE), 4),
               "; max sd lat ", round(max(sites$sd_lat, na.rm = TRUE), 4), ")"),
        "Consider using `by = \"date_time\"` for more fine-tuned control"))

      sites <- dplyr::select(sites, -"sd_lon", -"sd_lat")
    }
  }

  # Create date/time range (from buffers as required)
  if(length(by_date_cols) == 1) {
    sites <- sites |>
      dplyr::group_by(dplyr::across(dplyr::all_of(by))) |>
      calc_buffers(buffer_before, buffer_after, by_date_cols) |>
      dplyr::mutate(dt_range = lubridate::interval(.data$t1, .data$t2)) |>
      dplyr::select(-dplyr::all_of(by_date_cols), -"t1", -"t2") |>
      dplyr::ungroup()
  } else {
    sites <- sites |>
      dplyr::mutate(dt_range = lubridate::interval(.data[[by_date_cols[1]]],
                                                   .data[[by_date_cols[[2]]]])) |>
      dplyr::select(-dplyr::all_of(by_date_cols))
  }

  # Join by date (add `...n` to track successfully merged records)
  sites <- dplyr::mutate(sites, `...n` = 1:dplyr::n())

  meta_sites <- date_join(meta, sites, by = by, id = "path",
                          col = by_date, int = "dt_range")

  # Flags
  msg <- c("Identified possible problems with metadata extraction:")

  if(nrow(meta_sites) > nrow(meta)) {
    msg <- c(msg,
             "x" = "Some sound files matched multiple site references (see `n_matches` column)")
  }

  if(any(is.na(meta_sites$`...n`))) {
    n <- nrow(meta)
    f <- sum(is.na(meta_sites$`...n`))
    msg <- c(msg,
             "x" = paste0("Not all files were matched to a site reference (", f, "/", n, ")"))
  }
  if(length(msg) > 1) {
    msg <- c(msg, "*" = "Consider adjusting the `by` argument")
    if(by_date == "date") {
      msg <- c(msg, "*" = "Consider matching by time, `by_date = \"date_time\"`")
    }
    if(length(by_date_cols) == 1)  {
      msg <- c(msg,
               "*" = c("Consider adjusting `buffer_before` or `buffer_after`"),
               "*" = paste0("Consider using date/time ranges by including `date_start`",
                            "/`date_time_start` and `date_end`/`date_time_end` in `sites`"))
    }

    rlang::inform(msg)
  }

  dplyr::select(meta_sites, -"...n")
}