R/ifcb_extract_biovolumes.R
In iRfcb: Tools for Managing Imaging FlowCytobot (IFCB) Data

Documented in ifcb_extract_biovolumes

utils::globalVariables(c("biovolume", "roi"))
#' Extract Biovolumes from IFCB Data and Compute Carbon Content
#'
#' This function reads biovolume data from feature files generated by the `ifcb-analysis` repository (Sosik and Olson 2007)
#' and matches them with corresponding classification results or manual annotations. It calculates biovolume in cubic micrometers and
#' determines if each class is a diatom based on the World Register of Marine Species (WoRMS). Carbon content
#' is computed for each region of interest (ROI) using conversion functions from Menden-Deuer and Lessard (2000),
#' depending on whether the class is identified as a diatom.
#'
#' @param feature_files A path to a folder containing feature files or a character vector of file paths.
#' @param mat_folder (Optional) Path to the folder containing class or manual annotation files.
#' @param custom_images (Optional) A character vector of image filenames in the format DYYYYMMDDTHHMMSS_IFCBXXX_ZZZZZ.png,
#'        where "XXX" represents the IFCB number and "ZZZZZ" represents the ROI number.
#'        These filenames should match the `roi_number` assignment in the `feature_files` and can be
#'        used as a substitute for MATLAB files.
#' @param custom_classes (Optional) A character vector of corresponding class labels for `custom_images`.
#' @param class2use_file A character string specifying the path to the file containing the `class2use` variable (default: NULL).
#' @param micron_factor Conversion factor for biovolume to cubic micrometers. Default is `1 / 3.4`.
#' @param diatom_class A character vector specifying diatom class names in WoRMS. Default: `"Bacillariophyceae"`.
#' @param marine_only Logical. If `TRUE`, restricts the WoRMS search to marine taxa only. Default: `FALSE`.
#' @param threshold Threshold for selecting classification information (`"opt"` for above-threshold classification, otherwise `"all"`). Default: `"opt"`.
#' @param multiblob Logical. If `TRUE`, includes multiblob features. Default: `FALSE`.
#' @param feature_recursive Logical. If `TRUE`, searches recursively for feature files when `feature_files` is a folder. Default: `TRUE`.
#' @param mat_recursive Logical. If `TRUE`, searches recursively for MATLAB files in `mat_folder`. Default: `TRUE`.
#' @param use_python Logical. If `TRUE`, attempts to read `.mat` files using a Python-based method (`SciPy`). Default: `FALSE`.
#' @param verbose Logical. If `TRUE`, prints progress messages. Default: `TRUE`.
#'
#' @return A data frame containing:
#' - `sample`: The sample name.
#' - `classifier`: The classifier used (if applicable).
#' - `roi_number`: The region of interest (ROI) number.
#' - `class`: The identified taxonomic class.
#' - `biovolume_um3`: Computed biovolume in cubic micrometers.
#' - `carbon_pg`: Estimated carbon content in picograms.
#'
#' @details
#' - **Classification Data Handling:**
#'   - If `mat_folder` is provided, the function reads class annotations from MATLAB `.mat` files.
#'   - If `custom_images` and `custom_classes` are supplied, they override MATLAB classification data (e.g. data from a CNN model).
#'   - If both `mat_folder` and `custom_images/custom_classes` are given, `mat_folder` takes precedence.
#'
#' - **MAT File Processing:**
#'   - If `use_python = TRUE`, the function reads `.mat` files using `ifcb_read_mat()` (requires Python + `SciPy`).
#'   - Otherwise, it falls back to `R.matlab::readMat()`.
#'
#' @examples
#' \dontrun{
#' # Using MATLAB results:
#' feature_files <- "data/features"
#' mat_folder <- "data/classified"
#'
#' biovolume_df <- ifcb_extract_biovolumes(feature_files,
#'                                         mat_folder)
#'
#' print(biovolume_df)
#'
#' # Using custom classification result:
#' class = c("Mesodinium_rubrum",
#'           "Mesodinium_rubrum")
#' image <- c("D20220522T003051_IFCB134_00002",
#'            "D20220522T003051_IFCB134_00003")
#'
#' biovolume_df_custom <- ifcb_extract_biovolumes(feature_files,
#'                                                custom_images = image,
#'                                                custom_classes = class)
#'
#' print(biovolume_df_custom)
#' }
#'
#' @references Menden-Deuer Susanne, Lessard Evelyn J., (2000), Carbon to volume relationships for dinoflagellates, diatoms, and other protist plankton, Limnology and Oceanography, 3, doi: 10.4319/lo.2000.45.3.0569.
#' @references Sosik, H. M. and Olson, R. J. (2007), Automated taxonomic classification of phytoplankton sampled with imaging-in-flow cytometry. Limnol. Oceanogr: Methods 5, 204–216.
#'
#' @export
#'
#' @seealso \code{\link{ifcb_read_features}} \code{\link{ifcb_is_diatom}} \url{https://www.marinespecies.org/}
ifcb_extract_biovolumes <- function(feature_files, mat_folder = NULL, custom_images = NULL, custom_classes = NULL,
                                    class2use_file = NULL, micron_factor = 1 / 3.4,
                                    diatom_class = "Bacillariophyceae", marine_only = FALSE,
                                    threshold = "opt", multiblob = FALSE, feature_recursive = TRUE,
                                    mat_recursive = TRUE, use_python = FALSE, verbose = TRUE) {

  if (is.null(mat_folder) && (is.null(custom_images) || is.null(custom_classes))) {
    stop("Error: No classification information supplied. Provide either `mat_folder` for MATLAB data or both `custom_images` and `custom_classes` for a custom list.")
  }

  if (!is.null(mat_folder) && (!is.null(custom_images) || !is.null(custom_classes))) {
    warning("Warning: Both `mat_folder` and `custom_images/custom_classes` were provided. The function will use the data from `mat_folder` and ignore `custom_images` and `custom_classes`.")
  }

  if (is.character(feature_files)) {
    if (length(feature_files) == 1) {
      # feature_files is a single path, check if it's a directory
      if (dir.exists(feature_files)) {
        # It's a directory, proceed
      } else {
        stop("The specified directory does not exist: ", feature_files)
      }
    } else {
      # feature_files is a vector of filenames, check if all files exist
      if (!all(file.exists(feature_files))) {
        stop("One or more specified feature files do not exist.")
      }
    }
  } else {
    stop("feature_files must be a character vector of filenames or a single directory path.")
  }

  # Check if feature_files is a single folder path or a vector of file paths
  if (length(feature_files) == 1 && file.info(feature_files)$isdir) {
    feature_files <- list.files(feature_files, pattern = "D.*\\.csv", full.names = TRUE, recursive = feature_recursive)
  }

  if (!is.null(mat_folder)) {
    # List mat files
    mat_files <- list.files(mat_folder, pattern = "D.*\\.mat", full.names = TRUE, recursive = mat_recursive)

    if (length(mat_files) == 0) {
      stop("No MAT files found")
    }

    # Check if files are manually classified
    is_manual <- "class2use_manual" %in% ifcb_get_mat_names(mat_files[1])

    if (is_manual && is.null(class2use_file)) {
      stop("class2use must be specified when extracting manual biovolume data")
    }
  }

  if (!is.null(custom_images)) {
    # Extract date-time from class file paths
    class_date_times <- gsub(".*D(\\d{8}T\\d{6})_.*", "\\1", custom_images)
  } else {
    # Extract date-time from class file paths
    class_date_times <- gsub(".*D(\\d{8}T\\d{6})_.*", "\\1", mat_files)
  }

  # Extract date-time from feature file paths
  extracted_dates <- sub(".*D(\\d{8}T\\d{6}).*", "\\1", feature_files)

  # List matching feature files
  feature_files <- feature_files[extracted_dates %in% class_date_times]

  if (verbose) {
    cat("Reading feature files...\n")
  }

  # Read feature files
  features <- ifcb_read_features(feature_files, multiblob = multiblob, verbose = verbose)

  if (length(features) == 0) {
    stop("No feature data files found")
  }

  # Initialize an empty list to store data frames
  data_list <- list()

  # Loop through each feature file
  for (file_name in names(features)) {

    file_data <- features[[file_name]]

    # Create a data frame with sample, roi_number, and biovolume
    temp_df <- data.frame(
      sample = ifelse(multiblob, str_replace(file_name, "_multiblob_v\\d+.csv", ""), str_replace(file_name, "_fea_v\\d+.csv", "")),
      roi_number = file_data$roi_number,
      biovolume = file_data$Biovolume
    )

    # Append to the list
    data_list <- append(data_list, list(temp_df))
  }

  # Combine all data frames into one
  biovolume_df <- do.call(rbind, data_list)

  # If custom class list is supplied
  if (!is.null(custom_images) && !is.null(custom_classes)) {
    if (length(custom_images) != length(custom_classes)) {
      stop("Error: The number of images does not match the number of class labels. Ensure both vectors have the same length.")
    }

    image_df <- ifcb_convert_filenames(custom_images)
    class_df <- image_df %>%
      mutate(classifier = NA,
             class = custom_classes) %>%
      select(sample, classifier, roi, class) %>%
      rename(roi_number = roi)

  } else {
    # Find unique samples in biovolume_df
    unique_samples <- unique(biovolume_df$sample)

    # Function to check if any sample name is in the class path
    contains_sample <- function(class_path, samples) {
      # Check if any of the samples are present in the class_path
      any(sapply(samples, function(sample) grepl(sample, class_path)))
    }

    # Filter classes where the sample name exists in biovolume_df$sample
    matching_mat <- mat_files[sapply(mat_files, contains_sample, samples = unique_samples)]

    # Initialize an empty list to store data frames
    tb_list <- list()

    # Initialize a list to store all warnings
    warning_list <- list()

    if (is_manual) {

      class_df <- ifcb_count_mat_annotations(matching_mat,
                                             class2use_file,
                                             sum_level = "roi",
                                             use_python = use_python)

      names(class_df)[2] <- "roi_number"

      class_df$classifier <- NA

    } else {
      # Loop through matching classes
      for (class in seq_along(matching_mat)) {
        # Capture warnings for each readMat call
        temp <- suppressWarnings({

          if (use_python && scipy_available()) {
            temp_result <- ifcb_read_mat(matching_mat[class])
          } else {
            # Read the contents of the MAT file
            temp_result <- read_mat(matching_mat[class], fixNames = FALSE)
          }
          warning_list <- c(warning_list, warnings())
          temp_result
        })

        # Create a data frame with sample, roi_number, and class information
        temp_df <- data.frame(
          sample = str_replace(basename(matching_mat[class]), "_class_v\\d+.mat", ""),
          classifier = temp$classifierName,
          roi_number = temp$roinum,
          class = if (threshold == "opt") {
            unlist(temp$TBclass_above_threshold)
          } else {
            unlist(temp$TBclass)
          }
        )

        # Append to the list
        tb_list <- append(tb_list, list(temp_df))
      }
    }

    # Display the number of warnings
    num_warnings <- length(warning_list)

    if (num_warnings > 0) {
      message(sprintf("There were %d warnings (use warnings() to see them)", num_warnings))
    }

    if (!is_manual) {
      # Combine all data frames into one
      class_df <- do.call(rbind, tb_list)
    }
  }

  # Join biovolume_df with class_df
  biovolume_df <- left_join(biovolume_df, class_df, by = c("sample", "roi_number"))

  # Calculate biovolume in cubic microns
  biovolume_df$biovolume_um3 <- biovolume_df$biovolume * (micron_factor ^ 3)

  # Determine if each class is a diatom
  unique_classes <- unique(biovolume_df$class)

  if (verbose) {
    cat("Retrieving WoRMS records...\n")
  }

  is_diatom <- data.frame(class = unique_classes, is_diatom = ifcb_is_diatom(unique_classes,
                                                                             diatom_class = diatom_class,
                                                                             marine_only = marine_only,
                                                                             verbose = verbose))
  biovolume_df <- left_join(biovolume_df, is_diatom, by = "class")

  # Filter rows where is_diatom$is_diatom is NA
  na_classes <- is_diatom[is.na(is_diatom$is_diatom), "class"]

  # Filter rows where is_diatom$is_diatom is NA
  non_diatoms <- is_diatom[!is_diatom$is_diatom, "class"]

  # Print the classes with NA values
  if (length(na_classes) > 0 & verbose) {
    cat("INFO: Some classes could not be found in WoRMS. They will be assumed as NOT diatoms for carbon calculations:\n")
    cat(sort(na_classes), sep = "\n")
  }

  # Print the classes that are non-Diatoms
  if (length(non_diatoms) > 0 & verbose) {
    cat("INFO: The following classes are considered NOT diatoms for carbon calculations:\n")
    cat(sort(non_diatoms), sep = "\n")
  }

  # Calculate carbon content based on diatom classification
  biovolume_df <- biovolume_df %>%
    mutate(carbon_pg = case_when(
      !is.na(is_diatom) & is_diatom ~ vol2C_lgdiatom(biovolume_um3),
      !is.na(is_diatom) & !is_diatom ~ vol2C_nondiatom(biovolume_um3),
      is.na(is_diatom) ~ vol2C_nondiatom(biovolume_um3),
      TRUE ~ NA_real_
    )) %>%
    select(-biovolume, -is_diatom)

  type_convert(biovolume_df, col_types = cols())
}