# R/ifcb_summarize_biovolumes.R
# In iRfcb: Tools for Managing Imaging FlowCytobot (IFCB) Data
#
# Documented in ifcb_summarize_biovolumes

# Names used through non-standard evaluation in this file (data columns and
# the magrittr placeholder); registering them keeps R CMD check from flagging
# them as undefined global variables.
utils::globalVariables(
  c(
    "biovolume_um3",
    "carbon_pg",
    "counts",
    "classifier",
    "."
  )
)
#' Summarize Biovolumes and Carbon Content from IFCB Data
#'
#' This function calculates aggregated biovolumes and carbon content from Imaging FlowCytobot (IFCB)
#' samples based on biovolume information from feature files. Images are grouped into classes either
#' based on MATLAB classification, manually annotated files, or a user-supplied list of images and
#' their corresponding class labels (e.g. from a CNN model).
#'
#' @param feature_folder Path to the folder containing feature files (e.g., CSV format).
#' @param mat_folder (Optional) Path to the folder containing MATLAB classification or manual annotation files.
#' @param class2use_file (Optional) A character string specifying the path to the file containing the class2use variable (default NULL). Only needed when summarizing manual MATLAB results.
#' @param hdr_folder (Optional) Path to the folder containing HDR files. Needed for calculating cell, biovolume and carbon concentration per liter.
#' @param custom_images (Optional) A character vector of image filenames in the format DYYYYMMDDTHHMMSS_IFCBXXX_ZZZZZ,
#'        where "XXX" represents the IFCB number and "ZZZZZ" represents the ROI number.
#'        These filenames should match the `roi_number` assignment in the `feature_files` and can be
#'        used as a substitute for MATLAB files.
#' @param custom_classes (Optional) A character vector of corresponding class labels for `custom_images`.
#' @param micron_factor Conversion factor in microns per pixel (default: 1/3.4).
#' @param diatom_class A string vector of diatom class names in the World Register of Marine Species (WoRMS). Default is "Bacillariophyceae".
#' @param marine_only Logical. If TRUE, restricts the WoRMS search to marine taxa only. Default is FALSE.
#' @param threshold Threshold for classification (default: "opt").
#' @param feature_recursive Logical. If TRUE, the function will search for feature files recursively within the `feature_folder`. Default is TRUE.
#' @param mat_recursive Logical. If TRUE, the function will search for MATLAB files recursively within the `mat_folder`. Default is TRUE.
#' @param hdr_recursive Logical. If TRUE, the function will search for HDR files recursively within the `hdr_folder` (if provided). Default is TRUE.
#' @param use_python Logical. If `TRUE`, attempts to read the `.mat` file using a Python-based method. Default is `FALSE`.
#' @param verbose A logical indicating whether to print progress messages. Default is TRUE.
#'
#' @return A data frame summarizing aggregated biovolume and carbon content per class per sample.
#'   Columns include 'sample', 'classifier', 'class', 'counts', 'biovolume_mm3' and 'carbon_ug'.
#'   When `hdr_folder` is supplied, the columns 'ml_analyzed', 'counts_per_liter',
#'   'biovolume_mm3_per_liter', and 'carbon_ug_per_liter' are added.
#'
#' @details This function performs the following steps:
#' \enumerate{
#'   \item Extracts biovolumes and carbon content from feature and classification results using `ifcb_extract_biovolumes`.
#'   \item Optionally incorporates volume data from HDR files to calculate volume analyzed per sample.
#'   \item Computes biovolume and carbon content per liter of sample analyzed.
#' }
#'
#' The MATLAB classification or manual annotation files are generated by the `ifcb-analysis` repository
#' (Sosik and Olson 2007). Users can optionally provide a **custom classification** by supplying a vector of image filenames
#' (`custom_images`) along with corresponding class labels (`custom_classes`). This allows summarization
#' of biovolume and carbon content without requiring MATLAB classification or manual annotation files
#' (e.g. results from a CNN model).
#'
#' Biovolumes are converted to carbon according to Menden-Deuer and Lessard 2000
#' for individual regions of interest (ROI), applying different conversion factors to diatoms and
#' non-diatom protists. If provided, the function also incorporates sample volume data from HDR files
#' to compute biovolume and carbon content per liter of sample.
#'
#' If `use_python = TRUE`, the function tries to read the `.mat` file using `ifcb_read_mat()`, which relies on `SciPy`.
#' This approach may be faster than the default approach using `R.matlab::readMat()`, especially for large `.mat` files.
#' To enable this functionality, ensure Python is properly configured with the required dependencies.
#' You can initialize the Python environment and install necessary packages using `ifcb_py_install()`.
#'
#' @examples
#' \dontrun{
#' # Example usage:
#' ifcb_summarize_biovolumes("path/to/features", "path/to/mat", hdr_folder = "path/to/hdr")
#'
#' # Using custom classification result:
#' images <- c("D20220522T003051_IFCB134_00002",
#'             "D20220522T003051_IFCB134_00003")
#' classes <- c("Mesodinium_rubrum",
#'              "Mesodinium_rubrum")
#'
#' ifcb_summarize_biovolumes(feature_folder = "path/to/features",
#'                           hdr_folder = "path/to/hdr",
#'                           custom_images = images,
#'                           custom_classes = classes)
#' }
#'
#' @references Menden-Deuer Susanne, Lessard Evelyn J., (2000), Carbon to volume relationships for dinoflagellates, diatoms, and other protist plankton, Limnology and Oceanography, 3, doi: 10.4319/lo.2000.45.3.0569.
#' @references Sosik, H. M. and Olson, R. J. (2007), Automated taxonomic classification of phytoplankton sampled with imaging-in-flow cytometry. Limnol. Oceanogr: Methods 5, 204–216.
#'
#' @export
ifcb_summarize_biovolumes <- function(feature_folder, mat_folder = NULL, class2use_file = NULL,
                                      hdr_folder = NULL, custom_images = NULL, custom_classes = NULL,
                                      micron_factor = 1 / 3.4, diatom_class = "Bacillariophyceae",
                                      marine_only = FALSE, threshold = "opt", feature_recursive = TRUE,
                                      mat_recursive = TRUE, hdr_recursive = TRUE, use_python = FALSE,
                                      verbose = TRUE) {

  # Extract biovolumes and carbon content from feature and class files.
  # All classification-related arguments are forwarded unchanged.
  biovolumes <- ifcb_extract_biovolumes(feature_files = feature_folder,
                                        mat_folder = mat_folder,
                                        custom_images = custom_images,
                                        custom_classes = custom_classes,
                                        class2use_file = class2use_file,
                                        micron_factor = micron_factor,
                                        diatom_class = diatom_class,
                                        marine_only = marine_only,
                                        threshold = threshold,
                                        feature_recursive = feature_recursive,
                                        mat_recursive = mat_recursive,
                                        use_python = use_python,
                                        verbose = verbose)

  # Aggregate to one row per sample / classifier / class
  biovolume_aggregated <- biovolumes %>%
    group_by(sample, classifier, class) %>%
    summarise(counts = n(),
              biovolume_mm3 = sum(biovolume_um3 * 10^-9, na.rm = TRUE),  # Convert from um3 to mm3
              carbon_ug = sum(carbon_pg * 10^-6, na.rm = TRUE),  # Convert from pg to ug
              .groups = "drop")

  # Without HDR files there is no analyzed volume, so no per-liter columns are added
  if (is.null(hdr_folder)) {
    return(biovolume_aggregated)
  }

  hdr_files <- list.files(hdr_folder, pattern = "D.*\\.hdr", full.names = TRUE, recursive = hdr_recursive)

  # Sample name = HDR file name without directory and extension
  hdr_sample_names <- sub("^(D\\d+T\\d+_IFCB\\d+)\\.hdr$", "\\1", basename(hdr_files))

  # Only HDR files whose sample occurs in the summary can contribute to the join below,
  # so only those are read. A recursive search may find the same sample in more than one
  # sub-folder; only the first hit is kept because duplicated join keys would duplicate
  # the summary rows of that sample.
  needed <- hdr_sample_names %in% biovolume_aggregated$sample & !duplicated(hdr_sample_names)
  hdr_files_filtered <- hdr_files[needed]

  if (length(hdr_files_filtered) == 0) {
    warning("No HDR files matching the summarized samples were found in `hdr_folder`; ",
            "volume-normalized columns will be NA.", call. = FALSE)
  }

  # Build the volume table in one step. With zero matching files this is a zero-row data
  # frame that still has both columns, so the join works and yields NA volumes.
  volumes <- data.frame(
    sample = hdr_sample_names[needed],
    ml_analyzed = vapply(hdr_files_filtered, ifcb_volume_analyzed, numeric(1), USE.NAMES = FALSE),
    stringsAsFactors = FALSE
  )

  # Join volume data with aggregated biovolumes based on 'sample' column
  biovolume_aggregated <- left_join(biovolume_aggregated, volumes, by = "sample")

  # Normalize counts, biovolume and carbon to one liter of analyzed sample
  liters_analyzed <- biovolume_aggregated$ml_analyzed / 1000
  biovolume_aggregated$counts_per_liter <- biovolume_aggregated$counts / liters_analyzed
  biovolume_aggregated$biovolume_mm3_per_liter <- biovolume_aggregated$biovolume_mm3 / liters_analyzed
  biovolume_aggregated$carbon_ug_per_liter <- biovolume_aggregated$carbon_ug / liters_analyzed

  biovolume_aggregated
}