R/clean_metadata.R

Defines functions clean_metadata

Documented in clean_metadata

#' Extract and clean ARU metadata from file names
#'
#' Using regular expressions, metadata is extracted from file names and
#' directory structure, checked and cleaned.
#'
#' Note that times are extracted by first combining the date, date/time
#' separator and the time patterns. This means that if there is a problem with
#' this combination, dates might be extracted but date/times will not. This
#' mismatch can be used to determine which part of a pattern needs to be
#' tweaked.
#'
#' See `vignette("customizing", package = "ARUtools")` for details on
#' customizing `clean_metadata()` for your project.
#'
#' @param file_type Character. Type of file (extension) to summarize. Default
#'   wav.
#' @param pattern_site_id Character. Regular expression to extract site ids. See
#'   `create_pattern_site_id()`. Can be a vector of multiple patterns to match.
#' @param pattern_aru_id Character. Regular expression to extract ARU ids. See
#'   `create_pattern_aru_id()`. Can be a vector of multiple patterns to match.
#' @param pattern_date Character. Regular expression to extract dates. See
#'   `create_pattern_date()`. Can be a vector of multiple patterns to match.
#' @param pattern_time Character. Regular expression to extract times. See
#'   `create_pattern_time()`. Can be a vector of multiple patterns to match.
#' @param pattern_dt_sep Character. Regular expression to mark separators
#'   between dates and times. See `create_pattern_dt_sep()`.
#' @param order_date Character. Order that the date appears in. "ymd"
#'   (default), "mdy", or "dmy". Can be a vector of multiple patterns to match.
#'
#' @inheritParams common_docs
#'
#' @return Data frame with extracted metadata
#'
#' @examples
#' clean_metadata(project_files = example_files)
#' clean_metadata(project_files = example_files, subset = "P02")
#'
#' @export
clean_metadata <- function(
    project_dir = NULL,
    project_files = NULL,
    file_type = "wav",
    subset = NULL,
    subset_type = "keep",
    pattern_site_id = create_pattern_site_id(),
    pattern_aru_id = create_pattern_aru_id(),
    pattern_date = create_pattern_date(),
    pattern_time = create_pattern_time(),
    pattern_dt_sep = create_pattern_dt_sep(),
    order_date = "ymd",
    quiet = FALSE) {

  # Checks
  check_text(project_dir, not_null = FALSE, n = 1)
  check_text(project_files, not_null = FALSE)
  check_text(file_type, n = 1)
  check_text(subset, not_null = FALSE, n = 1)
  check_text(subset_type, n = 1)
  check_text(pattern_site_id)
  check_text(pattern_aru_id)
  check_text(pattern_date)
  check_text(pattern_time)
  check_text(pattern_dt_sep)
  check_text(order_date)
  check_logical(quiet)


  # Prepare patterns
  file_type_pattern <- stringr::regex(paste0(file_type, "$"), ignore_case = TRUE)

  pattern_site_id <-  pat_collapse(pattern_site_id)
  pattern_aru_id <- pat_collapse(pattern_aru_id)
  pattern_date <- pat_collapse(pattern_date)
  pattern_time <- pat_collapse(pattern_time)
  pattern_dt_sep <- pat_collapse(pattern_dt_sep)

  pattern_date_time <- paste0(pattern_date, pattern_dt_sep, pattern_time)

  # Get file lists
  if(!is.null(project_dir)) {
    if(!is.null(project_files)) {
      rlang::warn("`project_dir` overrides `project_files`", call = NULL)
    }
    if(!quiet) rlang::inform("Fetching file list...")
    project_files <- list_files(project_dir, subset, subset_type,
                                type = "file")
  } else if(!is.null(subset)){
    project_files <- stringr::str_subset(project_files, subset,
                                         negate = subset_type == "omit")
  } else if(is.null(project_files)) {
    rlang::abort("Must provide one of `project_dir` or `project_files`",
                 call = NULL)
  }

  # Check for files (either zero or all directories)
  if(length(project_files) == 0 || all(fs::is_dir(project_files))) {
    if(is.null(subset)) {
      msg <- "`project_dir`"
    } else {
      msg <- "`project_dir`/`subset`/`subset_type` combination"
    }

    rlang::abort(c(
      paste0("There are no files in the ", msg, " you have specified. Note:"),
      "i" = "Paths are case-sensitive",
      "i" = "Check folders using `list.dirs(path = PROJECT_DIR)`",
      "i" = "Check for files using `count_files(project_dir = PROJECT_DIR)`")
    )
  }

  # Check for file types
  n_ext <- sum(stringr::str_detect(project_files, file_type_pattern))
  if(n_ext == 0){
    rlang::abort(c(glue::glue("Did not find any '{file_type}' files."),
                   "i" = "Use `file_type` to change file extension for sound files",
                   "i" = "Check `project_dir`/`project_files` are correct"))
  }


  # Collect non-file-type files
  extra <- stringr::str_subset(project_files, file_type_pattern, negate = TRUE)
  gps <- stringr::str_subset(extra, stringr::regex("gps|summary", ignore_case = TRUE))
  focal <- stringr::str_subset(project_files, file_type_pattern)

  # Set up file path metadata
  meta <- dplyr::tibble(
    dir = fs::path_dir(focal),
    file_name = fs::path_file(focal),
    type = tolower(fs::path_ext(focal)))

  if(length(gps) > 1) {
    meta <- meta |>
      dplyr::add_row(dir = fs::path_dir(gps),
                     file_name = fs::path_file(gps),
                     type = "gps")
  }

  pattern_aru_type <- c("barlt" = "BarLT",
                        "SMM" = "SongMeter",
                        "SM\\d" = "SongMeter",
                        "S\\dA" = "SongMeter")

  if(!quiet) rlang::inform("Extracting ARU info...")

  # Extract ARU metadata -----------------------
  meta <- meta |>
    dplyr::mutate(
      path = file.path(.data$dir, .data$file_name),
      aru_type = extract_replace(.data$file_name, pattern_aru_type),
      aru_type = dplyr::if_else(is.na(.data$aru_type),
                                extract_replace(.data$dir, pattern_aru_type),
                                .data$aru_type),
      aru_id = stringr::str_extract(.data$file_name, pattern_aru_id),
      aru_id = dplyr::if_else(is.na(.data$aru_id),
                              stringr::str_extract(.data$dir, pattern_aru_id),
                              .data$aru_id))

  meta <- dplyr::mutate(meta, site_id = stringr::str_extract(.data$dir, .env$pattern_site_id))

  pattern_non_date <- paste0("(", pattern_site_id, ")|(",
                             pattern_aru_id, ")|(",
                             paste0("(", pattern_aru_type, ")", collapse = "|"),
                             ")")


  # Extract Date/time --------------------------
  if(!quiet) rlang::inform("Extracting Dates and Times...")

  meta <- meta |>
    dplyr::mutate(
      file_left = stringr::str_remove_all(.data$file_name, pattern_non_date),
      dir_left = stringr::str_remove_all(.data$dir, pattern_non_date),

      # Try file name
      date_time_chr = stringr::str_extract(.data$file_left, .env$pattern_date_time),
      # Try dir name
      date_time_chr = dplyr::if_else(
        is.na(.data$date_time_chr),
        stringr::str_extract(.data$dir_left, .env$pattern_date_time),
        .data$date_time_chr),
      # Get date_times
      date_time = lubridate::parse_date_time(
        .data$date_time_chr,
        orders = paste(order_date, "HMS"),
        truncated = 1),
      date = lubridate::as_date(.data$date_time))

  if(any(is.na(meta$date))) {

    missing <- meta |>
      dplyr::filter(is.na(.data$date)) |>
      dplyr::mutate(
        # Try file name
        date_chr = stringr::str_extract(.data$file_left, .env$pattern_date),
        # Try dir name
        date_chr = dplyr::if_else(
          is.na(.data$date_chr),
          stringr::str_extract(.data$dir_left, .env$pattern_date),
          .data$date_chr),
        date = lubridate::parse_date_time(.data$date_chr, orders = order_date,
                                          quiet = TRUE),
        date = lubridate::as_date(.data$date)) |>
      dplyr::select("path", "date")

    if(any(!is.na(missing$date))) {
      # Add dates where missing
      meta <- dplyr::rows_patch(meta, missing, by = "path")
    }
  }

  # Report on details -------------------------
  # Extra files
  if(length(extra) > 1) {
    rlang::inform(
      c("!" = paste0("Omitted ", length(extra), " extra, non-",
                     file_type, "/GPS files")))
  }

  if(length(gps) > 1) {
    rlang::inform(c("!" = paste0("Detected ", length(gps), " GPS logs")))
  }

  # Flag problems
  f <- dplyr::filter(meta, .data$type == "wav")
  n <- nrow(f)
  f_d <- sum(is.na(f$date))
  f_dt <- sum(is.na(f$date_time))
  f_type <- sum(is.na(f$aru_type))
  f_id <- sum(is.na(f$aru_id))
  f_site <- sum(is.na(f$site_id))

  if(any(c(f_d, f_dt, f_type, f_id, f_site) > 0)) {
   msg <- c("Identified possible problems with metadata extraction:")
   msg <- c(msg, report_missing(f_d, n, "dates"))
   msg <- c(msg, report_missing(f_dt, n, "times"))
   msg <- c(msg, report_missing(f_type, n, "ARU types"))
   msg <- c(msg, report_missing(f_id, n, "ARU ids"))
   msg <- c(msg, report_missing(f_site, n, "sites"))
   rlang::inform(msg)
  }

  meta |>
    dplyr::arrange(.data$type != "gps", !is.na(.data$date_time), .data$path,
                   .data$site_id, .data$date_time) |>
    dplyr::select(-"file_left", -"dir_left", -"date_time_chr", -"dir")
}
dhope/ARUtools documentation built on Jan. 18, 2024, 5:47 a.m.