# R/ifcb_read_summary.R
# In iRfcb: Tools for Managing Imaging FlowCytobot (IFCB) Data
#
# Documented in ifcb_read_summary

# Register the column names that are referenced through non-standard
# evaluation in the dplyr/tidyr calls below, so they are known as globals.
utils::globalVariables(
  c(
    "gpsLatitude", "gpsLongitude", "type", "value", "counts",
    "ml_analyzed_calc", "biovolume_per_liter", "timestamp", "time",
    "ifcb_number"
  )
)
#' Read and Summarize Classified IFCB Data
#'
#' This function reads a MATLAB `.mat` file containing aggregated and classified IFCB (Imaging FlowCytobot)
#' data generated by the `countcells_allTBnew_user_training` function from the `ifcb-analysis` repository (Sosik and Olson 2007),
#' or a list of classified data generated by `ifcb_summarize_class_counts`.
#' It returns a data frame with species counts and optionally biovolume information based on specified thresholds.
#'
#' @param summary A character string specifying the path to the `.mat` summary file or a list generated by `ifcb_summarize_class_counts`.
#' @param hdr_directory A character string specifying the path to the directory containing header (.hdr) files. Default is NULL.
#'   When supplied, GPS positions are added to the output and missing `ml_analyzed` values are filled in
#'   with the volume calculated from the header files.
#' @param biovolume A logical indicating whether the file contains biovolume data. Default is FALSE.
#' @param threshold A character string specifying the threshold type for counts and biovolume. Options are "opt" (default), "adhoc", and "none".
#' @param use_python Logical. If `TRUE`, attempts to read the `.mat` file using a Python-based method. Default is `FALSE`.
#'   Ignored when `summary` is already a list.
#' @return A data frame in long format with one row per sample and class that has a non-zero count.
#'   It contains the sample name, the date/time columns derived from the sample name, the volume analyzed,
#'   the class name, `counts` and `counts_per_liter`, optionally `biovolume`, `biovolume_per_liter` and
#'   `biovolume_mm3` (when `biovolume = TRUE`), and `gpsLatitude`/`gpsLongitude` (when `hdr_directory` is supplied).
#'
#' @details
#' If `use_python = TRUE`, the function tries to read the `.mat` file using `ifcb_read_mat()`, which relies on `SciPy`.
#' This approach may be faster than the default approach using `R.matlab::readMat()`, especially for large `.mat` files.
#' To enable this functionality, ensure Python is properly configured with the required dependencies.
#' You can initialize the Python environment and install necessary packages using `ifcb_py_install()`.
#'
#' If `use_python = FALSE` or if `SciPy` is not available, the function falls back to using `R.matlab::readMat()`.
#'
#' An error is raised if `threshold` is not one of the supported options, or if the count (or biovolume)
#' data for the requested threshold is not present in the input.
#'
#' @seealso \url{https://github.com/hsosik/ifcb-analysis}
#' @export
#' @references Sosik, H. M. and Olson, R. J. (2007), Automated taxonomic classification of phytoplankton sampled with imaging-in-flow cytometry. Limnol. Oceanogr: Methods 5, 204–216.
#' @examples
#' mat_file <- system.file("exdata/example_summary.mat", package = "iRfcb")
#'
#' summary_data <- ifcb_read_summary(mat_file, biovolume = FALSE, threshold = "opt")
#' print(summary_data)
ifcb_read_summary <- function(summary, hdr_directory = NULL, biovolume = FALSE, threshold = "opt", use_python = FALSE) {

  # Translate the threshold option into the suffix used by the field names,
  # failing fast on an invalid option before any file is read
  threshold_suffix <- switch(threshold,
                             "opt" = "_above_optthresh",
                             "adhoc" = "_above_adhocthresh",
                             "none" = "",
                             stop("Invalid threshold option. Choose from 'opt', 'adhoc', or 'none'."))

  # Records whether `mat` was actually produced by the Python reader; the
  # re-orientation further down must only be applied in that case, not
  # whenever the caller merely asked for Python
  read_with_python <- FALSE

  if (is.list(summary)) {
    # A list is used as-is; element names are expected to match the field
    # names used below
    mat <- summary
  } else if (use_python && scipy_available()) {
    mat <- ifcb_read_mat(summary)
    read_with_python <- TRUE
  } else {
    # Read the contents of the MAT file
    mat <- read_mat(summary)
  }

  # If a header directory is provided, collect GPS and volume information
  if (!is.null(hdr_directory)) {
    # Extract GPS information from header files
    hdr_info <- ifcb_read_hdr_data(file.path(hdr_directory), gps_only = TRUE, verbose = FALSE)
    gps_info <- hdr_info %>%
      dplyr::select(sample, gpsLatitude, gpsLongitude)

    # List all .hdr files in the specified directory
    files <- list.files(hdr_directory, pattern = "\\.hdr$", recursive = TRUE, full.names = TRUE)

    # Extract volume analyzed information from .hdr files
    volume_info <- data.frame(
      sample = gsub(".*/(D\\d+T\\d+_IFCB\\d+)\\.hdr", "\\1", files),
      ml_analyzed_calc = ifcb_volume_analyzed(files)
    )
  }

  # Extract fields with `[[` (exact matching): `$` partially matches list
  # names, so e.g. `mat$classcountTB` could silently resolve to
  # `classcountTB_above_optthresh` when the unthresholded field is absent
  ml_analyzed <- as.vector(mat[["ml_analyzedTB"]])
  filelistTB <- unlist(mat[["filelistTB"]])

  # Select class count based on threshold
  classcountTB <- mat[[paste0("classcountTB", threshold_suffix)]]

  # Check if classcountTB is NULL
  if (is.null(classcountTB)) {
    stop(paste("Class count data for threshold", threshold, "does not exist in the file."))
  }

  # Extract species names from class2useTB
  class2useTB <- unlist(mat[["class2useTB"]])

  # Data read through Python is re-oriented so that classes are in columns
  if (read_with_python) {
    classcountTB <- as.matrix(t(classcountTB))
  }

  # Assign column names for class counts
  colnames(classcountTB) <- paste("counts", class2useTB, sep = "_")

  # Initialize the wide summary data frame with sample and ml_analyzed
  # (kept in its own variable instead of overwriting the `summary` argument)
  summary_wide <- data.frame(
    sample = filelistTB,
    ml_analyzed = ml_analyzed,
    classcountTB,
    check.names = FALSE
  )

  # If biovolume is requested, include biovolume data
  if (biovolume) {
    # Select biovolume based on threshold
    classbiovolTB <- mat[[paste0("classbiovolTB", threshold_suffix)]]

    # Check if classbiovolTB is NULL
    if (is.null(classbiovolTB)) {
      stop(paste("Biovolume data for threshold", threshold, "does not exist in the file."))
    }

    # Apply the same re-orientation as for the counts.
    # NOTE(review): assumes the Python reader returns the biovolume matrix in
    # the same orientation as the count matrix - confirm against ifcb_read_mat()
    if (read_with_python) {
      classbiovolTB <- as.matrix(t(classbiovolTB))
    }

    # Assign column names for biovolume
    colnames(classbiovolTB) <- paste("biovolume", class2useTB, sep = "_")

    # Combine biovolume data with summary
    summary_wide <- dplyr::bind_cols(summary_wide, classbiovolTB)
  }

  # Transform summary data into long format: column names are split at the
  # first underscore into a value type ("counts"/"biovolume") and the class
  # name, then spread so each type becomes its own column; zero counts are dropped
  summary_long <- summary_wide %>%
    tidyr::pivot_longer(
      cols = !c("sample", "ml_analyzed"),
      names_pattern = "([^_]+)_(.*)",
      names_to = c("type", "species"),
      values_to = "value"
    ) %>%
    tidyr::pivot_wider(names_from = type, values_from = value) %>%
    dplyr::filter(counts != 0)

  # If hdr_directory is provided, fill missing ml_analyzed values with the
  # volume calculated from the header files (existing values take precedence)
  if (!is.null(hdr_directory)) {
    summary_long <- summary_long %>%
      dplyr::left_join(volume_info, by = "sample") %>%
      dplyr::mutate(ml_analyzed = dplyr::coalesce(ml_analyzed, ml_analyzed_calc)) %>%
      dplyr::select(-ml_analyzed_calc)
  }

  # Calculate counts per liter
  summary_long <- summary_long %>%
    dplyr::mutate(counts_per_liter = counts / ml_analyzed * 1000)

  # If biovolume is requested, calculate biovolume per liter and in mm3.
  # Inside mutate() `biovolume` refers to the data column, which masks the
  # logical function argument of the same name
  if (biovolume) {
    summary_long <- summary_long %>%
      dplyr::mutate(biovolume_per_liter = biovolume / ml_analyzed * 1000,
                    biovolume_mm3 = biovolume_per_liter / 1000)
  }

  # Extract date information from sample names
  date_info <- ifcb_convert_filenames(unique(summary_long$sample))

  # Merge date information with summary_long
  summary_long <- summary_long %>%
    dplyr::left_join(date_info, by = "sample") %>%
    dplyr::relocate(timestamp, date, year, month, day, time, ifcb_number, .after = sample)

  # If hdr_directory is provided, merge GPS information with summary_long
  if (!is.null(hdr_directory)) {
    summary_long <- summary_long %>%
      dplyr::left_join(gps_info, by = "sample") %>%
      dplyr::relocate(gpsLatitude, gpsLongitude, .after = ifcb_number)
  }

  # Return the finalized summary_long data frame
  summary_long
}