R/ifcb_count_mat_annotations.R
In iRfcb: Tools for Managing Imaging FlowCytobot (IFCB) Data

Documented in ifcb_count_mat_annotations

# Register the column names that ifcb_count_mat_annotations() refers to as bare
# symbols inside dplyr verbs, so that static code checks do not report them as
# undefined global variables.
utils::globalVariables(
  c(
    "name",
    "manual",
    "roi number"
  )
)
#' Count IFCB Annotations from .mat Files
#'
#' This function processes `.mat` files, generated by the code in the `ifcb-analysis` repository (Sosik and Olson 2007),
#' to count and summarize the annotations for each class based on the class2use information provided in a file.
#'
#' @param manual_files A character vector of paths to manual `.mat` files, or a single path to a folder
#'   containing them. When a folder is given, only files whose names match `D*.mat` are used.
#' @param class2use_file A character string specifying the path to the file containing the class2use variable.
#' @param skip_class A numeric vector of class IDs or a character vector of class names to be excluded from the count.
#'   Class names are translated to IDs through class2use; an error is raised if none of them is found. Default is NULL.
#' @param sum_level A single character string specifying the level of summarization.
#'   Options: "sample", "roi" or "class" (default).
#' @param mat_recursive Logical. If TRUE, the function will search for MATLAB files recursively when `manual_files` is a folder. Default is FALSE.
#' @param use_python Logical. If `TRUE`, attempts to read the `.mat` file using a Python-based method. Default is `FALSE`.
#'
#' @details
#' If `use_python = TRUE`, the function tries to read the `.mat` file using `ifcb_read_mat()`, which relies on `SciPy`.
#' This approach may be faster than the default approach using `R.matlab::readMat()`, especially for large `.mat` files.
#' To enable this functionality, ensure Python is properly configured with the required dependencies.
#' You can initialize the Python environment and install necessary packages using `ifcb_py_install()`.
#'
#' If `use_python = FALSE` or if `SciPy` is not available, the function falls back to using `R.matlab::readMat()`.
#'
#' Files of size zero are skipped with a warning. A file that cannot be accessed raises an error.
#' Annotations whose class ID has no entry in class2use are reported under the numeric ID itself.
#'
#' @return A data frame whose columns depend on `sum_level`:
#'   `class` and `n` for "class"; `sample`, `class` and `n` for "sample";
#'   `sample`, `roi number` and `class` for "roi".
#'   If no file could be read, an empty data frame is returned.
#' @export
#' @references Sosik, H. M. and Olson, R. J. (2007), Automated taxonomic classification of phytoplankton sampled with imaging-in-flow cytometry. Limnol. Oceanogr: Methods 5, 204–216.
#'
#' @examples
#' \dontrun{
#' # Count annotations excluding specific class IDs
#' result <- ifcb_count_mat_annotations("path/to/manual_folder",
#'                                      "path/to/class2use_file",
#'                                      skip_class = c(99, 100))
#' print(result)
#'
#' # Count annotations excluding a specific class name
#' result <- ifcb_count_mat_annotations("path/to/manual_folder",
#'                                      "path/to/class2use_file",
#'                                      skip_class = "unclassified")
#' print(result)
#' }
ifcb_count_mat_annotations <- function(manual_files,
                                       class2use_file,
                                       skip_class = NULL,
                                       sum_level = "class",
                                       mat_recursive = FALSE,
                                       use_python = FALSE) {
  # Require exactly one recognised level. Without the length check a NULL or
  # multi-element value would make `if` itself fail with an unrelated message.
  if (length(sum_level) != 1 || !sum_level %in% c("class", "sample", "roi")) {
    stop("sum_level should either be `class`, `roi` or `sample`")
  }

  # Check if manual_files is a single folder path or a vector of file paths.
  # dir.exists() returns FALSE (not NA) for a path that does not exist, so a
  # wrong path is reported by the per-file check below instead of breaking `if`.
  if (length(manual_files) == 1 && dir.exists(manual_files)) {
    manual_files <- list.files(
      manual_files,
      pattern = "D.*\\.mat$", # anchored so that e.g. "D1.mat.bak" is not picked up
      full.names = TRUE,
      recursive = mat_recursive
    )
  }

  # Get the class2use variable from the specified file
  class2use <- ifcb_get_mat_variable(class2use_file)

  # Lookup table: position in class2use (the numeric class ID) -> class name
  lookup_table <- data.frame(
    manual = seq_along(class2use),
    name = class2use,
    stringsAsFactors = FALSE
  )

  # Convert skip_class names to manual IDs if they are character strings
  if (is.character(skip_class)) {
    filtered_skip_class <- lookup_table %>% filter(name %in% skip_class)
    if (nrow(filtered_skip_class) == 0) {
      stop("None of the class names provided in skip_class were found in class2use.")
    }
    skip_class <- filtered_skip_class %>% pull(manual)
  }

  # One slot per input file; slots of skipped files stay NULL and are dropped
  # when the results are bound together after the loop.
  per_file <- vector("list", length(manual_files))

  for (i in seq_along(manual_files)) {
    file <- manual_files[[i]]

    # file.size() is NA when the file cannot be accessed; fail with a clear
    # message rather than letting `if (NA)` abort.
    size <- file.size(file)
    if (is.na(size)) {
      stop("Cannot access .mat file: ", file)
    }

    # Skip empty/corrupt files
    if (size == 0) {
      warning(paste("Empty .mat file:", file, "Skipping."))
      next
    }

    # Read the contents of the MAT file, via Python only if requested and available
    if (use_python && scipy_available()) {
      mat_data <- ifcb_read_mat(file)
    } else {
      mat_data <- read_mat(file)
    }

    taxa_list <- as.data.frame(mat_data$classlist)

    # Assign names to the columns in taxa_list
    names(taxa_list) <- unlist(mat_data$list_titles)

    # Drop skipped classes and missing annotations, then replace the numeric
    # class ID with its name (falling back to the ID when it has no name).
    file_counts <- taxa_list %>%
      filter(!manual %in% skip_class & !is.na(manual)) %>%
      mutate(sample = tools::file_path_sans_ext(basename(file))) %>%
      left_join(lookup_table, by = "manual") %>%
      mutate(class = as.character(ifelse(is.na(name), as.character(manual), name))) %>%
      select(sample, `roi number`, class)

    if (sum_level %in% c("class", "sample")) {
      # Summarize the number of images by sample and class
      file_counts <- file_counts %>%
        group_by(sample, class) %>%
        summarise(n = n(), .groups = "drop")
    }

    per_file[[i]] <- file_counts
  }

  # Bind everything once. The leading data.frame() keeps the result a plain
  # data frame, as bind_rows() returns the type of its first input.
  total_sum <- bind_rows(data.frame(), per_file)

  # Collapse the per-sample counts into totals per class. Skipped when nothing
  # was read, because the empty result has no `class` column to group by.
  if (sum_level == "class" && ncol(total_sum) > 0) {
    total_sum <- total_sum %>%
      group_by(class) %>%
      summarise(n = sum(n, na.rm = TRUE), .groups = "drop")
  }

  total_sum
}