R/extract_metadata_from_UFZ_files.R

Defines functions extract_metadata_from_ufz_files

Documented in extract_metadata_from_ufz_files

#' @title Extract Metadata from UFZ FTMS Filenames
#'
#' @description Extracts metadata from file names that follow the UFZ FTMS
#' naming convention. Each file name is split at underscores and the sample ID,
#' UFZ sample number, position, date, segment and retention time are returned
#' as one row of a data.table.
#'
#' @name extract_metadata_from_ufz_files
#' @family internal functions
#' @keywords metadata internal
#' @import data.table
# @importFrom tcltk tk_choose.dir
#'
#' @param folder_path (Optional) Path to the directory containing the files;
#'   it is searched recursively. If the path points to an existing file, the
#'   directory containing that file is used. If \code{NULL}, the user is
#'   prompted to choose a file interactively and its directory is used.
#' @param file_type (Optional) File extension including the leading dot, taken
#'   literally (Default: \code{".xml"}). With \code{".d"}, directories whose
#'   name ends in \code{.d} are listed instead of files.
#'
#' @return A data.table with one row per file name. The columns are:
#'   \itemize{
#'     \item \code{sample_id}: Identifier for the sample. This column is named
#'       \code{sample_tag} instead when any identifier contains a letter.
#'     \item \code{sample_id_ufz}: Identifier specific to UFZ's format, if available.
#'     \item \code{position}: Position or condition identifier in the experiment.
#'     \item \code{date}: Experiment date, formatted as \code{Date}
#'       (\code{NA} if the date token is not recognised).
#'     \item \code{segment}: Segment information related to time or experiment phase.
#'     \item \code{ret_time}: Retention time range within the segment.
#'     \item \code{file_long}: File name including the extension.
#'     \item \code{file}: File name without the extension.
#'     \item \code{link_rawdata}: Original file name as a link to raw data.
#'     \item \code{ID}: Unique row identifier for each entry.
#'   }
#'
# @examples
# # Assuming the folder_path variable is defined as the directory containing XML files
# extract_metadata_from_ufz_files(folder_path)
#
#' @details
#' File names are read from \code{folder_path}, the extension is removed and
#' the remainder is split at underscores. The last five tokens are interpreted,
#' from right to left, as retention time, segment, date, position and UFZ
#' sample number. All leading tokens form the sample identifier; if there is
#' more than one, they are joined with a hyphen. The file names themselves are
#' returned unchanged. File names with fewer than six tokens yield \code{NA}
#' metadata fields and trigger a warning.
#'
#' The expected filename format is as follows:
#' - Standard: \code{104B12_9557_RB3_10-12-2023_Segment1_1-2min.xml}
#' - Exception with additional underscore in the first part: \code{srfa_mcs_9554_GA2_10-12-2023_Segment1_1-2min.xml}

extract_metadata_from_ufz_files <- function(folder_path = NULL, file_type = NULL) {

  # --- File type ------------------------------------------------------------
  # The default must depend on `file_type` itself, not on `folder_path`.
  if (is.null(file_type)) {
    file_type <- ".xml"
  }
  if (!is.character(file_type) || length(file_type) != 1L ||
      is.na(file_type) || !nzchar(file_type)) {
    stop("`file_type` must be a single non-empty string such as \".xml\".",
         call. = FALSE)
  }

  # Escape every regex metacharacter so the extension is matched literally,
  # then anchor it at the end of the name.
  ext_pattern <- paste0(
    gsub("([.\\\\|()\\[\\]{}^$*+?])", "\\\\\\1", file_type, perl = TRUE),
    "$"
  )

  # --- Folder ---------------------------------------------------------------
  if (is.null(folder_path)) {
    # file.choose() returns a file, so the directory containing it is scanned.
    folder_path <- dirname(file.choose())
  } else {
    if (!is.character(folder_path) || length(folder_path) != 1L ||
        is.na(folder_path)) {
      stop("`folder_path` must be a single path string.", call. = FALSE)
    }
    if (!dir.exists(folder_path)) {
      if (!file.exists(folder_path)) {
        stop("`folder_path` does not exist: ", folder_path, call. = FALSE)
      }
      # A file path was supplied: fall back to its directory.
      folder_path <- dirname(folder_path)
    }
  }

  # --- List matching files (or *.d directories) -----------------------------
  if (identical(file_type, ".d")) {
    all_dirs <- list.dirs(folder_path, full.names = TRUE, recursive = TRUE)
    full_paths <- all_dirs[grepl(ext_pattern, basename(all_dirs))]
  } else {
    full_paths <- list.files(
      path = folder_path, pattern = ext_pattern,
      full.names = TRUE, recursive = TRUE
    )
  }

  link_rawdata <- basename(full_paths)
  file_stem <- sub(ext_pattern, "", link_rawdata)

  # --- Tokenise and parse from the right ------------------------------------
  # Layout: <sample id, may contain "_">_<ufz no>_<position>_<date>_<segment>_<ret time>
  # Counting from the end keeps extra underscores in the sample id harmless.
  tokens <- strsplit(file_stem, "_", fixed = TRUE)
  n_fixed <- 5L
  conforms <- lengths(tokens) > n_fixed

  if (any(!conforms)) {
    warning(
      sum(!conforms), " file name(s) have fewer than ", n_fixed + 1L,
      " underscore-separated parts; their metadata fields are set to NA.",
      call. = FALSE
    )
  }

  # Token located `offset` places before the last one (0 = last token).
  pick_from_end <- function(offset) {
    vapply(
      tokens,
      function(tk) {
        if (length(tk) > n_fixed) tk[[length(tk) - offset]] else NA_character_
      },
      character(1)
    )
  }

  sample_id <- vapply(
    tokens,
    function(tk) {
      if (length(tk) > n_fixed) {
        paste(tk[seq_len(length(tk) - n_fixed)], collapse = "-")
      } else {
        NA_character_
      }
    },
    character(1)
  )

  # --- Date -----------------------------------------------------------------
  # Both day-first (documented example "10-12-2023") and ISO year-first tokens
  # are accepted. NOTE(review): day-first is read as DD-MM-YYYY -- confirm
  # against the UFZ convention.
  date_token <- pick_from_end(2L)
  date <- as.Date(rep(NA_character_, length(date_token)))
  is_dmy <- grepl("^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$", date_token)
  is_ymd <- grepl("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$", date_token)
  date[is_dmy] <- as.Date(date_token[is_dmy], format = "%d-%m-%Y")
  date[is_ymd] <- as.Date(date_token[is_ymd], format = "%Y-%m-%d")

  # --- Assemble result ------------------------------------------------------
  spectra <- data.table::data.table(
    sample_id = sample_id,
    sample_id_ufz = pick_from_end(4L),
    position = pick_from_end(3L),
    date = date,
    segment = pick_from_end(1L),
    ret_time = pick_from_end(0L),
    file_long = link_rawdata,
    file = file_stem,
    link_rawdata = link_rawdata,
    ID = seq_along(link_rawdata)
  )

  # Identifiers containing letters are tags rather than numeric sample ids,
  # so the column is exposed as `sample_tag` in that case.
  if (any(grepl("[a-zA-Z]", spectra[["sample_id"]]))) {
    data.table::setnames(spectra, old = "sample_id", new = "sample_tag")
  }

  spectra
}

Try the ume package in your browser

Any scripts or data that you put into this service are public.

ume documentation built on Dec. 13, 2025, 1:06 a.m.