Nothing
utils::globalVariables(c("biovolume", "roi"))
#' Extract Biovolumes from IFCB Data and Compute Carbon Content
#'
#' This function reads biovolume data from feature files generated by the `ifcb-analysis` repository (Sosik and Olson 2007)
#' and matches them with corresponding classification results or manual annotations. It calculates biovolume in cubic micrometers and
#' determines if each class is a diatom based on the World Register of Marine Species (WoRMS). Carbon content
#' is computed for each region of interest (ROI) using conversion functions from Menden-Deuer and Lessard (2000),
#' depending on whether the class is identified as a diatom.
#'
#' @param feature_files A path to a folder containing feature files or a character vector of file paths.
#' @param mat_folder (Optional) Path to the folder containing class or manual annotation files.
#' @param custom_images (Optional) A character vector of image filenames in the format DYYYYMMDDTHHMMSS_IFCBXXX_ZZZZZ.png,
#' where "XXX" represents the IFCB number and "ZZZZZ" represents the ROI number.
#' These filenames should match the `roi_number` assignment in the `feature_files` and can be
#' used as a substitute for MATLAB files.
#' @param custom_classes (Optional) A character vector of corresponding class labels for `custom_images`.
#' @param class2use_file A character string specifying the path to the file containing the `class2use` variable (default: NULL).
#' @param micron_factor Conversion factor for biovolume to cubic micrometers. Default is `1 / 3.4`.
#' @param diatom_class A character vector specifying diatom class names in WoRMS. Default: `"Bacillariophyceae"`.
#' @param marine_only Logical. If `TRUE`, restricts the WoRMS search to marine taxa only. Default: `FALSE`.
#' @param threshold Threshold for selecting classification information (`"opt"` for above-threshold classification, otherwise `"all"`). Default: `"opt"`.
#' @param multiblob Logical. If `TRUE`, includes multiblob features. Default: `FALSE`.
#' @param feature_recursive Logical. If `TRUE`, searches recursively for feature files when `feature_files` is a folder. Default: `TRUE`.
#' @param mat_recursive Logical. If `TRUE`, searches recursively for MATLAB files in `mat_folder`. Default: `TRUE`.
#' @param use_python Logical. If `TRUE`, attempts to read `.mat` files using a Python-based method (`SciPy`). Default: `FALSE`.
#' @param verbose Logical. If `TRUE`, prints progress messages. Default: `TRUE`.
#'
#' @return A data frame containing:
#' - `sample`: The sample name.
#' - `classifier`: The classifier used (if applicable).
#' - `roi_number`: The region of interest (ROI) number.
#' - `class`: The identified taxonomic class.
#' - `biovolume_um3`: Computed biovolume in cubic micrometers.
#' - `carbon_pg`: Estimated carbon content in picograms.
#'
#' @details
#' - **Classification Data Handling:**
#' - If `mat_folder` is provided, the function reads class annotations from MATLAB `.mat` files.
#' - If `custom_images` and `custom_classes` are supplied, they override MATLAB classification data (e.g. data from a CNN model).
#' - If both `mat_folder` and `custom_images/custom_classes` are given, `mat_folder` takes precedence.
#'
#' - **MAT File Processing:**
#' - If `use_python = TRUE`, the function reads `.mat` files using `ifcb_read_mat()` (requires Python + `SciPy`).
#' - Otherwise, it falls back to `R.matlab::readMat()`.
#'
#' @examples
#' \dontrun{
#' # Using MATLAB results:
#' feature_files <- "data/features"
#' mat_folder <- "data/classified"
#'
#' biovolume_df <- ifcb_extract_biovolumes(feature_files,
#' mat_folder)
#'
#' print(biovolume_df)
#'
#' # Using custom classification result:
#' class = c("Mesodinium_rubrum",
#' "Mesodinium_rubrum")
#' image <- c("D20220522T003051_IFCB134_00002",
#' "D20220522T003051_IFCB134_00003")
#'
#' biovolume_df_custom <- ifcb_extract_biovolumes(feature_files,
#' custom_images = image,
#' custom_classes = class)
#'
#' print(biovolume_df_custom)
#' }
#'
#' @references Menden-Deuer Susanne, Lessard Evelyn J., (2000), Carbon to volume relationships for dinoflagellates, diatoms, and other protist plankton, Limnology and Oceanography, 3, doi: 10.4319/lo.2000.45.3.0569.
#' @references Sosik, H. M. and Olson, R. J. (2007), Automated taxonomic classification of phytoplankton sampled with imaging-in-flow cytometry. Limnol. Oceanogr: Methods 5, 204–216.
#'
#' @export
#'
#' @seealso \code{\link{ifcb_read_features}} \code{\link{ifcb_is_diatom}} \url{https://www.marinespecies.org/}
ifcb_extract_biovolumes <- function(feature_files, mat_folder = NULL, custom_images = NULL, custom_classes = NULL,
class2use_file = NULL, micron_factor = 1 / 3.4,
diatom_class = "Bacillariophyceae", marine_only = FALSE,
threshold = "opt", multiblob = FALSE, feature_recursive = TRUE,
mat_recursive = TRUE, use_python = FALSE, verbose = TRUE) {
if (is.null(mat_folder) && (is.null(custom_images) || is.null(custom_classes))) {
stop("Error: No classification information supplied. Provide either `mat_folder` for MATLAB data or both `custom_images` and `custom_classes` for a custom list.")
}
if (!is.null(mat_folder) && (!is.null(custom_images) || !is.null(custom_classes))) {
warning("Warning: Both `mat_folder` and `custom_images/custom_classes` were provided. The function will use the data from `mat_folder` and ignore `custom_images` and `custom_classes`.")
}
if (is.character(feature_files)) {
if (length(feature_files) == 1) {
# feature_files is a single path, check if it's a directory
if (dir.exists(feature_files)) {
# It's a directory, proceed
} else {
stop("The specified directory does not exist: ", feature_files)
}
} else {
# feature_files is a vector of filenames, check if all files exist
if (!all(file.exists(feature_files))) {
stop("One or more specified feature files do not exist.")
}
}
} else {
stop("feature_files must be a character vector of filenames or a single directory path.")
}
# Check if feature_files is a single folder path or a vector of file paths
if (length(feature_files) == 1 && file.info(feature_files)$isdir) {
feature_files <- list.files(feature_files, pattern = "D.*\\.csv", full.names = TRUE, recursive = feature_recursive)
}
if (!is.null(mat_folder)) {
# List mat files
mat_files <- list.files(mat_folder, pattern = "D.*\\.mat", full.names = TRUE, recursive = mat_recursive)
if (length(mat_files) == 0) {
stop("No MAT files found")
}
# Check if files are manually classified
is_manual <- "class2use_manual" %in% ifcb_get_mat_names(mat_files[1])
if (is_manual && is.null(class2use_file)) {
stop("class2use must be specified when extracting manual biovolume data")
}
}
if (!is.null(custom_images)) {
# Extract date-time from class file paths
class_date_times <- gsub(".*D(\\d{8}T\\d{6})_.*", "\\1", custom_images)
} else {
# Extract date-time from class file paths
class_date_times <- gsub(".*D(\\d{8}T\\d{6})_.*", "\\1", mat_files)
}
# Extract date-time from feature file paths
extracted_dates <- sub(".*D(\\d{8}T\\d{6}).*", "\\1", feature_files)
# List matching feature files
feature_files <- feature_files[extracted_dates %in% class_date_times]
if (verbose) {
cat("Reading feature files...\n")
}
# Read feature files
features <- ifcb_read_features(feature_files, multiblob = multiblob, verbose = verbose)
if (length(features) == 0) {
stop("No feature data files found")
}
# Initialize an empty list to store data frames
data_list <- list()
# Loop through each feature file
for (file_name in names(features)) {
file_data <- features[[file_name]]
# Create a data frame with sample, roi_number, and biovolume
temp_df <- data.frame(
sample = ifelse(multiblob, str_replace(file_name, "_multiblob_v\\d+.csv", ""), str_replace(file_name, "_fea_v\\d+.csv", "")),
roi_number = file_data$roi_number,
biovolume = file_data$Biovolume
)
# Append to the list
data_list <- append(data_list, list(temp_df))
}
# Combine all data frames into one
biovolume_df <- do.call(rbind, data_list)
# If custom class list is supplied
if (!is.null(custom_images) && !is.null(custom_classes)) {
if (length(custom_images) != length(custom_classes)) {
stop("Error: The number of images does not match the number of class labels. Ensure both vectors have the same length.")
}
image_df <- ifcb_convert_filenames(custom_images)
class_df <- image_df %>%
mutate(classifier = NA,
class = custom_classes) %>%
select(sample, classifier, roi, class) %>%
rename(roi_number = roi)
} else {
# Find unique samples in biovolume_df
unique_samples <- unique(biovolume_df$sample)
# Function to check if any sample name is in the class path
contains_sample <- function(class_path, samples) {
# Check if any of the samples are present in the class_path
any(sapply(samples, function(sample) grepl(sample, class_path)))
}
# Filter classes where the sample name exists in biovolume_df$sample
matching_mat <- mat_files[sapply(mat_files, contains_sample, samples = unique_samples)]
# Initialize an empty list to store data frames
tb_list <- list()
# Initialize a list to store all warnings
warning_list <- list()
if (is_manual) {
class_df <- ifcb_count_mat_annotations(matching_mat,
class2use_file,
sum_level = "roi",
use_python = use_python)
names(class_df)[2] <- "roi_number"
class_df$classifier <- NA
} else {
# Loop through matching classes
for (class in seq_along(matching_mat)) {
# Capture warnings for each readMat call
temp <- suppressWarnings({
if (use_python && scipy_available()) {
temp_result <- ifcb_read_mat(matching_mat[class])
} else {
# Read the contents of the MAT file
temp_result <- read_mat(matching_mat[class], fixNames = FALSE)
}
warning_list <- c(warning_list, warnings())
temp_result
})
# Create a data frame with sample, roi_number, and class information
temp_df <- data.frame(
sample = str_replace(basename(matching_mat[class]), "_class_v\\d+.mat", ""),
classifier = temp$classifierName,
roi_number = temp$roinum,
class = if (threshold == "opt") {
unlist(temp$TBclass_above_threshold)
} else {
unlist(temp$TBclass)
}
)
# Append to the list
tb_list <- append(tb_list, list(temp_df))
}
}
# Display the number of warnings
num_warnings <- length(warning_list)
if (num_warnings > 0) {
message(sprintf("There were %d warnings (use warnings() to see them)", num_warnings))
}
if (!is_manual) {
# Combine all data frames into one
class_df <- do.call(rbind, tb_list)
}
}
# Join biovolume_df with class_df
biovolume_df <- left_join(biovolume_df, class_df, by = c("sample", "roi_number"))
# Calculate biovolume in cubic microns
biovolume_df$biovolume_um3 <- biovolume_df$biovolume * (micron_factor ^ 3)
# Determine if each class is a diatom
unique_classes <- unique(biovolume_df$class)
if (verbose) {
cat("Retrieving WoRMS records...\n")
}
is_diatom <- data.frame(class = unique_classes, is_diatom = ifcb_is_diatom(unique_classes,
diatom_class = diatom_class,
marine_only = marine_only,
verbose = verbose))
biovolume_df <- left_join(biovolume_df, is_diatom, by = "class")
# Filter rows where is_diatom$is_diatom is NA
na_classes <- is_diatom[is.na(is_diatom$is_diatom), "class"]
# Filter rows where is_diatom$is_diatom is NA
non_diatoms <- is_diatom[!is_diatom$is_diatom, "class"]
# Print the classes with NA values
if (length(na_classes) > 0 & verbose) {
cat("INFO: Some classes could not be found in WoRMS. They will be assumed as NOT diatoms for carbon calculations:\n")
cat(sort(na_classes), sep = "\n")
}
# Print the classes that are non-Diatoms
if (length(non_diatoms) > 0 & verbose) {
cat("INFO: The following classes are considered NOT diatoms for carbon calculations:\n")
cat(sort(non_diatoms), sep = "\n")
}
# Calculate carbon content based on diatom classification
biovolume_df <- biovolume_df %>%
mutate(carbon_pg = case_when(
!is.na(is_diatom) & is_diatom ~ vol2C_lgdiatom(biovolume_um3),
!is.na(is_diatom) & !is_diatom ~ vol2C_nondiatom(biovolume_um3),
is.na(is_diatom) ~ vol2C_nondiatom(biovolume_um3),
TRUE ~ NA_real_
)) %>%
select(-biovolume, -is_diatom)
type_convert(biovolume_df, col_types = cols())
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.