#' @import purrr dplyr tibble stringr tidyr readODS
#' @importFrom rlang set_names
#' @title Check data and metadata files generated by Bruker OPUS lab spectral
#' measurement workflow
#' @description Perform folder, file naming convention, and data and metadata
#' integrity tests.
#' @export
test_data_metadata <- function(data_root, pattern = "\\.ods$") {
data_folders <- list.dirs(data_root, full.names = FALSE, recursive = FALSE)
# List of binary OPUS files per date
date_files_paths <- map(set_names(data_folders),
~ paste(data_root, .x, sep = "/"))
# Only capture filenames, no full paths
date_files_opus_names <- map(date_files_paths, ~ dir(path = .x,
full.names = FALSE))
# Read and check the metadata spreadsheets
metadata_tbl <- meas_ids_from_metadata(data_root = data_root,
pattern = pattern)
# Create zero-filled dates from measurment folder names
data_folders_yyyy_mm_dd <- zerofill_date(data_root = data_root)
# Create a tibble that lists files in each measurement directory
opus_files_tbl <- date_files_opus_names %>%
# Set leading zero dates as new names
rlang::set_names(nm = data_folders_yyyy_mm_dd) %>%
purrr::imap(., ~ tibble::tibble(file_id = .x) %>%
dplyr::mutate(date_file = .y)) %>%
dplyr::bind_rows()
# Split metadata and data by measurement date folder because there are
# identical `file_id`s for different dates (internal quality control samples)
metadata_tbl_lst <- metadata_tbl %>% split(.$date)
opus_files_tbl_lst <- opus_files_tbl %>% split(.$date_file)
# Check if metadata and measurement file list information is identical;
# which measurement files do not have the correct metadata information
# In case from metadata reconstructed `file_id`s (`sample_id`s and
# well `pos` number) are missing in the list of measurement `file_id`s
metadata_notin_data <- map2(.x = metadata_tbl_lst, .y = opus_files_tbl_lst,
~ .x[which(!pull(.x, file_id) %in% dplyr::pull(.y, file_id)), ])
# Join opus data file summary and metadata summaries and return
data_notin_metadata <- map2(.x = metadata_tbl_lst, .y = opus_files_tbl_lst,
~ .y[which(!dplyr::pull(.y, file_id) %in% dplyr::pull(.x, file_id)), ])
metadata_missing <- Filter(function(x) nrow(x) > 1, metadata_notin_data)
data_missing <- Filter(function(x) nrow(x) > 1, data_notin_metadata)
if (length(metadata_missing) == 0 && length(data_missing) == 0) {
message("Measurement data files and metadata records are complete.")
} else if (length(metadata_missing) > 0 && length(data_missing) > 0) {
stop(paste("Measurment data files and metadata are not complete.",
"Metadata entries for the following measurement data <files are missing:",
metadata_missing, ".",
"Measurement data files for the following metadata entries are missing:",
data_missing, "."))
} else if (length(metadata_missing) > 0) {
stop(paste("Measurement data files and metadata are not complete.",
"Metadata entries for the following measurement data <files are missing:",
metadata_missing, "."))
} else {
stop(paste("Measurement data files for the following metadata entries
are missing:", data_missing, "."))
}
# Return joined metadata and data `id` tibble
map2(.x = metadata_tbl_lst, .y = opus_files_tbl_lst,
~ dplyr::inner_join(x = .x, y = .y, by = "file_id")) %>%
dplyr::bind_rows()
}
# Helpers ----------------------------------------------------------------------
# Check if the measurement metadata of files contain the complete list
# of binary OPUS files with with the spectral data
meas_ids_from_metadata <- function(data_root, pattern = "\\.ods$") {
metadata_lst <- read_metadata_check_data(
data_root = data_root, pattern = pattern)
metadata_tbl_nested <- imap(metadata_lst,
~ dplyr::mutate(.x, meas_folder_exp = .y)) %>%
dplyr::bind_rows() %>%
tibble::as.tibble() %>%
dplyr::mutate(date = as.character(date)) %>%
# Reconstruct a `file_id` combining the `sample_id` and
# the `pos` (well plate positions)
dplyr::mutate(sample_id_pos = paste(sample_id, pos, sep = "_")) %>%
dplyr::group_by(date, sample_id) %>%
tidyr::nest()
# Append a repetition number to `sample_id_pos` for all rows nested by
# `date` and `sample_id`; first add `meas_rep_meta` for the metadata
metadata_tbl <- dplyr::mutate(.data = metadata_tbl_nested, data = map(data,
~ .x %>% dplyr::group_by(sample_id_pos) %>% tidyr::nest() %>%
dplyr::mutate(data = map(data,
~ dplyr::mutate(.x, meas_rep_meta = 0:(nrow(.x) - 1)))
) %>%
tidyr::unnest(data)
)
) %>%
dplyr::mutate(data = map(data,
~ dplyr::mutate(.x,
file_id = paste(sample_id_pos, meas_rep_meta, sep = ".")))
) %>%
tidyr::unnest(data) %>%
# Add how many replicate measurements per `sample_id` are present
dplyr::add_count(sample_id) %>%
dplyr::mutate(sample_id_reps = n) %>%
dplyr::select(-n)
# Check if the `date` specified in the metadata spreadsheets corresponds
# to the expected measurement folder name (`meas_folder_exp`) in the metadata
# filename
date_meas_identical <- dplyr::mutate(metadata_tbl,
date_meas_identical = map2_lgl(.x = date, .y = meas_folder_exp,
~ .x == .y)) %>%
dplyr::pull(date_meas_identical)
if (all(date_meas_identical)) {
message("`date` column entries in the metadata spreadsheet(s)
correspond to the expected measurement folder name (date) in the metadata
file name(s)")
}
metadata_tbl
}
read_metadata_check_data <- function(data_root, pattern = "\\.ods$") {
# Zero-fill and paste year, month, and day to obtain %yyyy-mm-dd% date format
data_folders_yyyy_mm_dd <- zerofill_date(data_root = data_root)
# Check if there is a correct metadata file data information for each
# corresponding measurement date; first read metadata spreadsheets
metadata_lst <- read_metadata_files(data_root = data_root, pattern = pattern)
metadata_file_date <- stringr::str_extract(names(metadata_lst),
"[0-9]{4}[.-][0-9]{2}[.-][0-9]{2}")
# Name list of metadata tibbles with measurement date
metadata_lst <- set_names(metadata_lst, metadata_file_date)
if (all(metadata_file_date %in% data_folders_yyyy_mm_dd) &&
all(data_folders_yyyy_mm_dd %in% metadata_file_date)) {
message("Each measurement folder has a corresponding metadata file.")
metadata_lst
} else if (!all(metadata_file_date %in% data_folders_yyyy_mm_dd)) {
stop(paste(
"For the following measurement dates there is metadata, but no
corresponding folder(s) with measurement files exist(s):",
dQuote(
metadata_file_date[!metadata_file_date %in% data_folders_yyyy_mm_dd])
))
} else {
# In case there is measurement folders, but no corresponding metadata
stop(paste(
"For the following measurement file folder(s) there is no metadata file(s)
available:",
dQuote(
data_folders_yyyy_mm_dd[!data_folders_yyyy_mm_dd %in%
metadata_file_date]
)
))
}
}
# Read metadata sheets and check file naming conventions
read_metadata_files <- function(data_root, pattern = "\\.ods$") {
# Use regular expression to match `.odt` at end of string
files <- dir(path = data_root, pattern = pattern, full.names = TRUE)
metadata_filenames <- basename(files)
# Read metadata from spreadsheets, each file is an element represented
# as data frame
metadata_lst <- map(files, ~ suppressMessages(
readODS::read_ods(.x, sheet = 1)) %>% tibble::as.tibble())
metadata_lst <- rlang::set_names(x = metadata_lst, nm = metadata_filenames)
# Check if all metadata file names contain data in %yyyy-mm-dd% format
metadata_filenm_extr <- stringr::str_extract(names(metadata_lst),
"[0-9]{4}[.-][0-9]{2}[.-][0-9]{2}")
if (any(is.na(metadata_filenm_extr))) {
stop(paste(
"The metadata spreadsheet(s) with file name(s):\n",
dQuote(names(metadata_lst)[map_lgl(metadata_filenm_extr, is.na)]),
"\ndoes/do not follow the expected date %YYYY-MM-DD% prefix.\n",
"Please correctly rename the file(s)."
)
)
}
metadata_lst
}
# Zero-fill Bruker OPUS Lab software folder names. These are generated for each
# date when measurements are done, and come in the format `YYYY_M_D`,
# which does not fulfill ISO standards. These functions returns a character
# vector of folder names that are converted to `YYYY-MM-DD`
zerofill_date <- function(data_root) {
# List all folders names
data_folders <- list.dirs(data_root, full.names = FALSE, recursive = FALSE)
## Split and zero fill measurement date folders to obtain %YYYY-MM-DD%
# Expand day and month to 2 digit numbers
data_folders_split <- stringr::str_split(data_folders, "_")
# Convert elements in recursive list from character to integer
data_folders_split_int <- map(data_folders_split, as.integer)
# Add leading zeros for month and day
data_folders_zerofilled <- map(data_folders_split_int,
# `.at` modifies only at second (month) and third (day) vector element
~ unlist(purrr::modify_at(as.list(.x), .at = 2:3, ~ sprintf("%02d", .x)))
)
# Paste year, month, and day to obtain %yyyy-mm-dd% date format
map_chr(data_folders_zerofilled, ~ paste(.x, collapse = "-"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.