#' Extract and clean ARU metadata from file names
#'
#' Using regular expressions, metadata is extracted from file names and
#' directory structure, checked and cleaned.
#'
#' Note that times are extracted by first combining the date, date/time
#' separator and the time patterns. This means that if there is a problem with
#' this combination, dates might be extracted but date/times will not. This
#' mismatch can be used to determine which part of a pattern needs to be
#' tweaked.
#'
#' See `vignette("customizing", package = "ARUtools")` for details on
#' customizing `clean_metadata()` for your project.
#'
#' @param file_type Character. Type of file (extension) to summarize. Default
#' wav.
#' @param pattern_site_id Character. Regular expression to extract site ids. See
#' `create_pattern_site_id()`. Can be a vector of multiple patterns to match.
#' @param pattern_aru_id Character. Regular expression to extract ARU ids. See
#' `create_pattern_aru_id()`. Can be a vector of multiple patterns to match.
#' @param pattern_date Character. Regular expression to extract dates. See
#' `create_pattern_date()`. Can be a vector of multiple patterns to match.
#' @param pattern_time Character. Regular expression to extract times. See
#' `create_pattern_time()`. Can be a vector of multiple patterns to match.
#' @param pattern_dt_sep Character. Regular expression to mark separators
#' between dates and times. See `create_pattern_dt_sep()`.
#' @param order_date Character. Order that the date appears in. "ymd"
#' (default), "mdy", or "dmy". Can be a vector of multiple patterns to match.
#'
#' @inheritParams common_docs
#'
#' @return Data frame with extracted metadata
#'
#' @examples
#' clean_metadata(project_files = example_files)
#' clean_metadata(project_files = example_files, subset = "P02")
#'
#' @export
clean_metadata <- function(
project_dir = NULL,
project_files = NULL,
file_type = "wav",
subset = NULL,
subset_type = "keep",
pattern_site_id = create_pattern_site_id(),
pattern_aru_id = create_pattern_aru_id(),
pattern_date = create_pattern_date(),
pattern_time = create_pattern_time(),
pattern_dt_sep = create_pattern_dt_sep(),
order_date = "ymd",
quiet = FALSE) {
# Checks
check_text(project_dir, not_null = FALSE, n = 1)
check_text(project_files, not_null = FALSE)
check_text(file_type, n = 1)
check_text(subset, not_null = FALSE, n = 1)
check_text(subset_type, n = 1)
check_text(pattern_site_id)
check_text(pattern_aru_id)
check_text(pattern_date)
check_text(pattern_time)
check_text(pattern_dt_sep)
check_text(order_date)
check_logical(quiet)
# Prepare patterns
file_type_pattern <- stringr::regex(paste0(file_type, "$"), ignore_case = TRUE)
pattern_site_id <- pat_collapse(pattern_site_id)
pattern_aru_id <- pat_collapse(pattern_aru_id)
pattern_date <- pat_collapse(pattern_date)
pattern_time <- pat_collapse(pattern_time)
pattern_dt_sep <- pat_collapse(pattern_dt_sep)
pattern_date_time <- paste0(pattern_date, pattern_dt_sep, pattern_time)
# Get file lists
if(!is.null(project_dir)) {
if(!is.null(project_files)) {
rlang::warn("`project_dir` overrides `project_files`", call = NULL)
}
if(!quiet) rlang::inform("Fetching file list...")
project_files <- list_files(project_dir, subset, subset_type,
type = "file")
} else if(!is.null(subset)){
project_files <- stringr::str_subset(project_files, subset,
negate = subset_type == "omit")
} else if(is.null(project_files)) {
rlang::abort("Must provide one of `project_dir` or `project_files`",
call = NULL)
}
# Check for files (either zero or all directories)
if(length(project_files) == 0 || all(fs::is_dir(project_files))) {
if(is.null(subset)) {
msg <- "`project_dir`"
} else {
msg <- "`project_dir`/`subset`/`subset_type` combination"
}
rlang::abort(c(
paste0("There are no files in the ", msg, " you have specified. Note:"),
"i" = "Paths are case-sensitive",
"i" = "Check folders using `list.dirs(path = PROJECT_DIR)`",
"i" = "Check for files using `count_files(project_dir = PROJECT_DIR)`")
)
}
# Check for file types
n_ext <- sum(stringr::str_detect(project_files, file_type_pattern))
if(n_ext == 0){
rlang::abort(c(glue::glue("Did not find any '{file_type}' files."),
"i" = "Use `file_type` to change file extension for sound files",
"i" = "Check `project_dir`/`project_files` are correct"))
}
# Collect non-file-type files
extra <- stringr::str_subset(project_files, file_type_pattern, negate = TRUE)
gps <- stringr::str_subset(extra, stringr::regex("gps|summary", ignore_case = TRUE))
focal <- stringr::str_subset(project_files, file_type_pattern)
# Set up file path metadata
meta <- dplyr::tibble(
dir = fs::path_dir(focal),
file_name = fs::path_file(focal),
type = tolower(fs::path_ext(focal)))
if(length(gps) > 1) {
meta <- meta |>
dplyr::add_row(dir = fs::path_dir(gps),
file_name = fs::path_file(gps),
type = "gps")
}
pattern_aru_type <- c("barlt" = "BarLT",
"SMM" = "SongMeter",
"SM\\d" = "SongMeter",
"S\\dA" = "SongMeter")
if(!quiet) rlang::inform("Extracting ARU info...")
# Extract ARU metadata -----------------------
meta <- meta |>
dplyr::mutate(
path = file.path(.data$dir, .data$file_name),
aru_type = extract_replace(.data$file_name, pattern_aru_type),
aru_type = dplyr::if_else(is.na(.data$aru_type),
extract_replace(.data$dir, pattern_aru_type),
.data$aru_type),
aru_id = stringr::str_extract(.data$file_name, pattern_aru_id),
aru_id = dplyr::if_else(is.na(.data$aru_id),
stringr::str_extract(.data$dir, pattern_aru_id),
.data$aru_id))
meta <- dplyr::mutate(meta, site_id = stringr::str_extract(.data$dir, .env$pattern_site_id))
pattern_non_date <- paste0("(", pattern_site_id, ")|(",
pattern_aru_id, ")|(",
paste0("(", pattern_aru_type, ")", collapse = "|"),
")")
# Extract Date/time --------------------------
if(!quiet) rlang::inform("Extracting Dates and Times...")
meta <- meta |>
dplyr::mutate(
file_left = stringr::str_remove_all(.data$file_name, pattern_non_date),
dir_left = stringr::str_remove_all(.data$dir, pattern_non_date),
# Try file name
date_time_chr = stringr::str_extract(.data$file_left, .env$pattern_date_time),
# Try dir name
date_time_chr = dplyr::if_else(
is.na(.data$date_time_chr),
stringr::str_extract(.data$dir_left, .env$pattern_date_time),
.data$date_time_chr),
# Get date_times
date_time = lubridate::parse_date_time(
.data$date_time_chr,
orders = paste(order_date, "HMS"),
truncated = 1),
date = lubridate::as_date(.data$date_time))
if(any(is.na(meta$date))) {
missing <- meta |>
dplyr::filter(is.na(.data$date)) |>
dplyr::mutate(
# Try file name
date_chr = stringr::str_extract(.data$file_left, .env$pattern_date),
# Try dir name
date_chr = dplyr::if_else(
is.na(.data$date_chr),
stringr::str_extract(.data$dir_left, .env$pattern_date),
.data$date_chr),
date = lubridate::parse_date_time(.data$date_chr, orders = order_date,
quiet = TRUE),
date = lubridate::as_date(.data$date)) |>
dplyr::select("path", "date")
if(any(!is.na(missing$date))) {
# Add dates where missing
meta <- dplyr::rows_patch(meta, missing, by = "path")
}
}
# Report on details -------------------------
# Extra files
if(length(extra) > 1) {
rlang::inform(
c("!" = paste0("Omitted ", length(extra), " extra, non-",
file_type, "/GPS files")))
}
if(length(gps) > 1) {
rlang::inform(c("!" = paste0("Detected ", length(gps), " GPS logs")))
}
# Flag problems
f <- dplyr::filter(meta, .data$type == "wav")
n <- nrow(f)
f_d <- sum(is.na(f$date))
f_dt <- sum(is.na(f$date_time))
f_type <- sum(is.na(f$aru_type))
f_id <- sum(is.na(f$aru_id))
f_site <- sum(is.na(f$site_id))
if(any(c(f_d, f_dt, f_type, f_id, f_site) > 0)) {
msg <- c("Identified possible problems with metadata extraction:")
msg <- c(msg, report_missing(f_d, n, "dates"))
msg <- c(msg, report_missing(f_dt, n, "times"))
msg <- c(msg, report_missing(f_type, n, "ARU types"))
msg <- c(msg, report_missing(f_id, n, "ARU ids"))
msg <- c(msg, report_missing(f_site, n, "sites"))
rlang::inform(msg)
}
meta |>
dplyr::arrange(.data$type != "gps", !is.na(.data$date_time), .data$path,
.data$site_id, .data$date_time) |>
dplyr::select(-"file_left", -"dir_left", -"date_time_chr", -"dir")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.