R/test-data-metadata.R

Defines functions test_data_metadata meas_ids_from_metadata read_metadata_check_data read_metadata_files zerofill_date

Documented in test_data_metadata

#' @import purrr dplyr tibble stringr tidyr readODS
#' @importFrom rlang set_names
#' @title Check data and metadata files generated by Bruker OPUS lab spectral 
#' measurement workflow
#' @description Perform folder, file naming convention, and data and metadata
#' integrity tests.
#' @export
test_data_metadata <- function(data_root, pattern = "\\.ods$") {
  data_folders <- list.dirs(data_root, full.names = FALSE, recursive = FALSE)
  # List of binary OPUS files per date
  date_files_paths <- map(set_names(data_folders),
    ~ paste(data_root, .x, sep = "/"))
  # Only capture filenames, no full paths
  date_files_opus_names <- map(date_files_paths, ~ dir(path = .x,
    full.names = FALSE))
  
  # Read and check the metadata spreadsheets
  metadata_tbl <- meas_ids_from_metadata(data_root = data_root,
    pattern = pattern)
  
  # Create zero-filled dates from measurment folder names
  data_folders_yyyy_mm_dd <- zerofill_date(data_root = data_root)
  
  # Create a tibble that lists files in each measurement directory
  opus_files_tbl <- date_files_opus_names %>%
    # Set leading zero dates as new names
    rlang::set_names(nm = data_folders_yyyy_mm_dd) %>%
    purrr::imap(., ~ tibble::tibble(file_id = .x) %>%
    dplyr::mutate(date_file = .y)) %>%
    dplyr::bind_rows()
  
  # Split metadata and data by measurement date folder because there are
  # identical `file_id`s for different dates (internal quality control samples)
  metadata_tbl_lst <- metadata_tbl %>% split(.$date)
  opus_files_tbl_lst <- opus_files_tbl %>% split(.$date_file)
  
  # Check if metadata and measurement file list information is identical;
  # which measurement files do not have the correct metadata information
  
  # In case from metadata reconstructed `file_id`s (`sample_id`s and
  # well `pos` number) are missing in the list of measurement `file_id`s
  metadata_notin_data <- map2(.x = metadata_tbl_lst, .y = opus_files_tbl_lst,
  ~ .x[which(!pull(.x, file_id) %in% dplyr::pull(.y, file_id)), ])
  # Join opus data file summary and metadata summaries and return
  data_notin_metadata <- map2(.x = metadata_tbl_lst, .y = opus_files_tbl_lst,
  ~ .y[which(!dplyr::pull(.y, file_id) %in% dplyr::pull(.x, file_id)), ])
  
  metadata_missing <- Filter(function(x) nrow(x) > 1, metadata_notin_data)
  data_missing <- Filter(function(x) nrow(x) > 1, data_notin_metadata)
  
  if (length(metadata_missing) == 0 && length(data_missing) == 0) {
    message("Measurement data files and metadata records are complete.")
  } else if (length(metadata_missing) > 0 && length(data_missing) > 0) {
    stop(paste("Measurment data files and metadata are not complete.",
      "Metadata entries for the following measurement data <files are missing:",
      metadata_missing, ".",
      "Measurement data files for the following metadata entries are missing:",
      data_missing, "."))
  } else if (length(metadata_missing) > 0) {
    stop(paste("Measurement data files and metadata are not complete.",
      "Metadata entries for the following measurement data <files are missing:",
      metadata_missing, "."))
  } else {
    stop(paste("Measurement data files for the following metadata entries
      are missing:", data_missing, "."))
  }
  
  # Return joined metadata and data `id` tibble
  map2(.x = metadata_tbl_lst, .y = opus_files_tbl_lst,
    ~ dplyr::inner_join(x = .x, y = .y, by = "file_id")) %>%
    dplyr::bind_rows()
}


# Helpers ----------------------------------------------------------------------


# Check if the measurement metadata of files contain the complete list
# of binary OPUS files with with the spectral data

meas_ids_from_metadata <- function(data_root, pattern = "\\.ods$") {
  metadata_lst <- read_metadata_check_data(
    data_root = data_root, pattern = pattern)
  metadata_tbl_nested <- imap(metadata_lst,
      ~ dplyr::mutate(.x, meas_folder_exp = .y)) %>%
    dplyr::bind_rows() %>%
    tibble::as.tibble() %>%
    dplyr::mutate(date = as.character(date)) %>%
    # Reconstruct a `file_id` combining the `sample_id` and
    # the `pos` (well plate positions)
    dplyr::mutate(sample_id_pos = paste(sample_id, pos, sep = "_")) %>%
    dplyr::group_by(date, sample_id) %>%
    tidyr::nest()
  
  # Append a repetition number to `sample_id_pos` for all rows nested by
  # `date` and `sample_id`; first add `meas_rep_meta` for the metadata
  metadata_tbl <- dplyr::mutate(.data = metadata_tbl_nested, data = map(data,
    ~ .x %>% dplyr::group_by(sample_id_pos) %>% tidyr::nest() %>%
        dplyr::mutate(data = map(data,
          ~ dplyr::mutate(.x, meas_rep_meta = 0:(nrow(.x) - 1)))
          ) %>%
        tidyr::unnest(data)
        )
    ) %>%
    dplyr::mutate(data = map(data,
      ~ dplyr::mutate(.x,
          file_id = paste(sample_id_pos, meas_rep_meta, sep = ".")))
    ) %>%
    tidyr::unnest(data) %>%
    # Add how many replicate measurements per `sample_id` are present
    dplyr::add_count(sample_id) %>% 
    dplyr::mutate(sample_id_reps = n) %>%
    dplyr::select(-n)
  
  # Check if the `date` specified in the metadata spreadsheets corresponds
  # to the expected measurement folder name (`meas_folder_exp`) in the metadata 
  # filename
  date_meas_identical <-  dplyr::mutate(metadata_tbl,
    date_meas_identical = map2_lgl(.x = date, .y = meas_folder_exp,
      ~ .x == .y)) %>%
    dplyr::pull(date_meas_identical)
  
  if (all(date_meas_identical)) {
    message("`date` column entries in the metadata spreadsheet(s)
      correspond to the expected measurement folder name (date) in the metadata
      file name(s)")
  }

  metadata_tbl
}


read_metadata_check_data <- function(data_root, pattern = "\\.ods$") {
  # Zero-fill and paste year, month, and day to obtain %yyyy-mm-dd% date format
  data_folders_yyyy_mm_dd <- zerofill_date(data_root = data_root)
  
  # Check if there is a correct metadata file data information for each
  # corresponding measurement date; first read metadata spreadsheets
  metadata_lst <- read_metadata_files(data_root = data_root, pattern = pattern)
  metadata_file_date <- stringr::str_extract(names(metadata_lst),
    "[0-9]{4}[.-][0-9]{2}[.-][0-9]{2}")
  # Name list of metadata tibbles with measurement date
  metadata_lst <- set_names(metadata_lst, metadata_file_date)
  
  if (all(metadata_file_date %in% data_folders_yyyy_mm_dd) &&
      all(data_folders_yyyy_mm_dd %in% metadata_file_date)) {
    message("Each measurement folder has a corresponding metadata file.")
    metadata_lst
  } else if (!all(metadata_file_date %in% data_folders_yyyy_mm_dd)) {
    stop(paste(
      "For the following measurement dates there is metadata, but no 
      corresponding folder(s) with measurement files exist(s):",
      dQuote(
        metadata_file_date[!metadata_file_date %in% data_folders_yyyy_mm_dd])
    ))
  } else {
    # In case there is measurement folders, but no corresponding metadata
    stop(paste(
      "For the following measurement file folder(s) there is no metadata file(s)
       available:",
      dQuote(
        data_folders_yyyy_mm_dd[!data_folders_yyyy_mm_dd %in%
          metadata_file_date]
      )
    ))
  }
}


# Read metadata sheets and check file naming conventions

read_metadata_files <- function(data_root, pattern = "\\.ods$") {
  # Use regular expression to match `.odt` at end of string
  files <- dir(path = data_root, pattern = pattern, full.names = TRUE)
  metadata_filenames <- basename(files)
  # Read metadata from spreadsheets, each file is an element represented
  # as data frame
  metadata_lst <- map(files, ~ suppressMessages(
    readODS::read_ods(.x, sheet = 1)) %>% tibble::as.tibble())
  metadata_lst <- rlang::set_names(x = metadata_lst, nm = metadata_filenames)
  # Check if all metadata file names contain data in %yyyy-mm-dd% format
  metadata_filenm_extr <- stringr::str_extract(names(metadata_lst),
    "[0-9]{4}[.-][0-9]{2}[.-][0-9]{2}")
  if (any(is.na(metadata_filenm_extr))) {
    stop(paste(
      "The metadata spreadsheet(s) with file name(s):\n",
      dQuote(names(metadata_lst)[map_lgl(metadata_filenm_extr, is.na)]),
      "\ndoes/do not follow the expected date %YYYY-MM-DD% prefix.\n",
      "Please correctly rename the file(s)."
      )
    )
  }
  metadata_lst
}


# Zero-fill Bruker OPUS Lab software folder names. These are generated for each
# date when measurements are done, and come in the format `YYYY_M_D`,
# which does not fulfill ISO standards. These functions returns a character 
# vector of folder names that are converted to `YYYY-MM-DD`

zerofill_date <- function(data_root) {
  # List all folders names
  data_folders <- list.dirs(data_root, full.names = FALSE, recursive = FALSE)
  
  ## Split and zero fill measurement date folders to obtain %YYYY-MM-DD%
  # Expand day and month to 2 digit numbers
  data_folders_split <- stringr::str_split(data_folders, "_")
  # Convert elements in recursive list from character to integer
  data_folders_split_int <- map(data_folders_split, as.integer)
  # Add leading zeros for month and day
  data_folders_zerofilled <- map(data_folders_split_int,
    # `.at` modifies only at second (month) and third (day) vector element
    ~ unlist(purrr::modify_at(as.list(.x), .at = 2:3, ~ sprintf("%02d", .x)))
  )
  # Paste year, month, and day to obtain %yyyy-mm-dd% date format
  map_chr(data_folders_zerofilled, ~ paste(.x, collapse = "-"))
}
philipp-baumann/simplerspec.opus documentation built on Aug. 15, 2019, 8:25 a.m.