R/ifcb_psd.R
In iRfcb: Tools for Managing Imaging FlowCytobot (IFCB) Data

Documented in ifcb_psd

# Register names that are referenced without a visible binding so that
# R CMD check does not report them as undefined global variables. `Bin` is
# called in ifcb_psd() below; `variable` and `number` are not used in this
# function -- presumably they are referenced elsewhere in the package.
utils::globalVariables(c("variable", "number", "Bin"))
#' Plot and Save IFCB PSD Data
#'
#' This function generates and saves data about a dataset's Particle Size Distribution (PSD) from Imaging FlowCytobot (IFCB)
#' feature and hdr files, which can be used for data quality assurance and quality control.
#'
#' @details
#' The PSD function originates from the `PSD` python repository (Hayashi et al. in prep), which can be found at \url{https://github.com/kudelalab/PSD}.
#'
#' Python must be installed to use this function. The required python packages can be installed in a virtual environment using `ifcb_py_install()`.
#'
#' The function requires v2 features generated by the `ifcb-analysis` MATLAB package (Sosik and Olson 2007) found at \url{https://github.com/hsosik/ifcb-analysis}.
#'
#' @param feature_folder The absolute path to a directory containing all of the v2 feature files for the dataset.
#' @param hdr_folder The absolute path to a directory containing all of the hdr files for the dataset.
#' @param save_data A boolean indicating whether to save data to CSV files. Default is FALSE.
#' @param output_file A string with the base file name for the .csv to use (including path). Set to NULL to not save data (default).
#' @param plot_folder The folder where graph images for each file will be saved. Set to NULL to not save graphs (default).
#' @param use_marker A boolean indicating whether to show markers on the plot. Default is FALSE.
#' @param start_fit An integer indicating the start fit value for the plot. Default is 10.
#' @param r_sqr The lower limit of acceptable R^2 values (any curves below it will be flagged). Default is 0.5.
#' @param beads The maximum multiplier for the curve fit. Any files with higher curve fit multipliers will be flagged as bead runs. If this argument is included, files with "runBeads" marked as TRUE in the header file will also be flagged as a bead run. Optional.
#' @param bubbles The minimum difference between the starting ESD and the ESD with the most targets.  Any files with a difference higher than this threshold will be flagged as mostly bubbles. Optional.
#' @param incomplete A numeric vector of length two (e.g. `c(1500, 3)`) with the minimum volume of cells (in c/L) and the minimum mL analyzed for a complete run. Any files with values below these thresholds will be flagged as incomplete. Optional.
#' @param missing_cells The minimum image count to trigger count ratio. Any files with lower ratios will be flagged as missing cells. Optional.
#' @param biomass The minimum number of targets in the most populated ESD bin for any given run. Any files with fewer targets will be flagged as having low biomass. Optional.
#' @param bloom The minimum difference between the starting ESD and the ESD with the most targets. Any files with a difference less than this threshold will be flagged as a bloom. Will likely be lower than the bubbles threshold. Optional.
#' @param humidity The maximum percent humidity. Any files with higher values will be flagged as high humidity. Optional.
#' @param micron_factor The conversion factor to microns. Default is 1/3.4.
#'
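#' @return A list with three elements: `data` and `fits` (tibbles with one row per sample) and `flags` (a tibble of flagged samples, or NULL if no samples were flagged). The list is returned regardless of the value of `save_data`.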
#'
#' @seealso \code{\link{ifcb_py_install}} \url{https://github.com/kudelalab/PSD} \url{https://github.com/hsosik/ifcb-analysis}
#'
#' @references
#' Hayashi, K., Walton, J., Lie, A., Smith, J. and Kudela, M. Using particle size distribution (PSD) to automate imaging flow cytobot (IFCB) data quality in coastal California, USA. In prep.
#' Sosik, H. M. and Olson, R. J. (2007), Automated taxonomic classification of phytoplankton sampled with imaging-in-flow cytometry. Limnol. Oceanogr: Methods 5, 204–216.
#'
#' @examples
#' \dontrun{
#' # Initialize the python session if not already set up
#' ifcb_py_install()
#'
#' ifcb_psd(
#'   feature_folder = 'path/to/features',
#'   hdr_folder = 'path/to/hdr_data',
#'   save_data = TRUE,
#'   output_file = 'psd/svea_2021',
#'   plot_folder = 'psd/plots',
#'   use_marker = FALSE,
#'   start_fit = 13,
#'   r_sqr = 0.5,
#'   beads = 10 ** 9,
#'   bubbles = 150,
#'   incomplete = c(1500, 3),
#'   missing_cells = 0.7,
#'   biomass = 1000,
#'   bloom = 5,
#'   humidity = NULL,
#'   micron_factor = 1/3.0
#' )
#' }
#' @export
ifcb_psd <- function(feature_folder, hdr_folder, save_data = FALSE, output_file = NULL, plot_folder = NULL,
                     use_marker = FALSE, start_fit = 10, r_sqr = 0.5, beads = NULL, bubbles = NULL, incomplete = NULL,
                     missing_cells = NULL, biomass = NULL, bloom = NULL, humidity = NULL, micron_factor = 1/3.4) {

  if (!reticulate::py_available(initialize = TRUE)) {
    stop("Python is not installed on this machine. Please install Python to use this function.")
  }

  # Scalar, short-circuiting `&&` is the correct operator for an `if` condition
  if (save_data && is.null(output_file)) {
    stop("No output file specified. Please provide a valid output file path to save the data.")
  }

  # Make sure the Python modules needed by psd.py are available
  check_python_and_module(module = "pandas")
  check_python_and_module(module = "matplotlib")

  # Source the bundled Python script; this defines `Bin` in the current frame
  reticulate::source_python(system.file("python", "psd.py", package = "iRfcb"))

  # Create a Bin object for the dataset
  b <- Bin(feature_dir = as.character(feature_folder),
           hdr_dir = as.character(hdr_folder),
           micron_factor = as.numeric(micron_factor))

  # Run the PSD calculation. `plot_folder` is deliberately NULL here: the
  # per-sample graphs are produced further down with ifcb_psd_plot().
  b$plot_PSD(use_marker = use_marker, plot_folder = NULL, start_fit = as.integer(start_fit))

  if (save_data) {
    # `output_file` is a base name that may include a path; create its parent
    # directory up front so the Python writer does not fail on a missing folder.
    output_dir <- dirname(as.character(output_file))
    if (!dir.exists(output_dir)) {
      dir.create(output_dir, recursive = TRUE)
    }

    # Only forward the optional thresholds that were actually supplied
    args <- list(name = as.character(output_file), r_sqr = as.numeric(r_sqr))
    if (!is.null(beads)) args$beads <- as.numeric(beads)
    if (!is.null(bubbles)) args$bubbles <- as.integer(bubbles)
    if (!is.null(incomplete)) args$incomplete <- as.integer(incomplete)
    if (!is.null(missing_cells)) args$missing_cells <- missing_cells
    if (!is.null(biomass)) args$biomass <- as.integer(biomass)
    if (!is.null(bloom)) args$bloom <- as.integer(bloom)
    if (!is.null(humidity)) args$humidity <- humidity

    # Save the data
    do.call(b$save_data, args)
  }

  # Retrieve data from Python
  data <- b$get_data()
  fits <- b$get_fits()
  # NOTE(review): thresholds are passed uncoerced here, whereas save_data()
  # above receives integer-coerced values -- confirm both paths flag
  # identically when non-integer thresholds are supplied.
  flags <- b$get_flags(r_sqr = r_sqr, beads = beads, bubbles = bubbles, incomplete = incomplete,
                       missing_cells = missing_cells, biomass = biomass, bloom = bloom, humidity = humidity)

  # Flatten the nested lists into a data frame; the row names are used as
  # the sample identifiers below.
  data_df <- as.data.frame(lapply(data, unlist))

  # Promote the row names to a leading `sample` column and sort by it
  data_df <- data_df %>%
    dplyr::mutate(sample = rownames(data_df)) %>%
    dplyr::relocate(sample) %>%
    dplyr::arrange(sample) %>%
    dplyr::as_tibble()

  # Same treatment for the curve fits
  fits_df <- as.data.frame(lapply(fits, unlist))

  fits_df <- fits_df %>%
    dplyr::mutate(sample = rownames(fits_df)) %>%
    dplyr::as_tibble() %>%
    dplyr::relocate(sample) %>%
    dplyr::arrange(sample)

  if (nrow(as.data.frame(flags)) > 0) {
    # One row per flagged file
    flags_df <- dplyr::tibble(
      sample = unlist(flags$file),
      flag = unlist(flags$flag)
    ) %>%
      dplyr::arrange(sample)
  } else {
    # No sample was flagged
    flags_df <- NULL
  }

  if (!is.null(plot_folder)) {

    if (!dir.exists(plot_folder)) {
      dir.create(plot_folder, recursive = TRUE)
    }

    for (sample_name in data_df$sample) {

      # Flags recorded for this sample. `flags_df` is NULL when nothing was
      # flagged, so guard before indexing. The sample name is matched as a
      # literal substring rather than as a regular expression.
      sample_flags <- if (is.null(flags_df)) {
        character(0)
      } else {
        flags_df$flag[grepl(sample_name, flags_df$sample, fixed = TRUE)]
      }

      # Plots are sorted into one subfolder per flag; unflagged samples go to
      # "PSD.OK". Several matching flags are collapsed into a single folder
      # name so that `flag_folder` is always one path.
      if (length(sample_flags) == 0) {
        flag_folder <- file.path(plot_folder, "PSD.OK")
      } else {
        flag_label <- paste(unique(sample_flags), collapse = "_")
        flag_folder <- file.path(plot_folder, make.names(flag_label))
      }

      # Create plot subfolder
      if (!dir.exists(flag_folder)) {
        dir.create(flag_folder, recursive = TRUE)
      }

      # Plot the sample PSD
      p <- ifcb_psd_plot(sample_name, data_df, fits_df, start_fit)

      # Save the plot
      ggplot2::ggsave(filename = file.path(flag_folder, paste0(sample_name, ".png")),
                      plot = p,
                      bg = "white",
                      width = 6,
                      height = 4)
    }
  }

  list(data = data_df, fits = fits_df, flags = flags_df)
}