Nothing
utils::globalVariables(c("biovolume", "roi", "roi_number", "Biovolume"))
#' Extract Biovolumes from IFCB Data and Compute Carbon Content
#'
#' This function reads biovolume data from feature files generated by the `ifcb-analysis` repository (Sosik and Olson 2007)
#' and matches them with corresponding classification results or manual annotations. It calculates biovolume in cubic micrometers and
#' determines if each class is a diatom based on the World Register of Marine Species (WoRMS). Carbon content
#' is computed for each region of interest (ROI) using conversion functions from Menden-Deuer and Lessard (2000),
#' depending on whether the class is identified as a diatom.
#'
#' @param feature_files A path to a folder containing feature files or a character vector of file paths.
#' @param class_files (Optional) A character vector of full paths to classification or manual
#' annotation files (`.mat`, `.h5`, or `.csv`), or a single path to a folder
#' containing such files.
#' @param custom_images (Optional) A character vector of image filenames in the format DYYYYMMDDTHHMMSS_IFCBXXX_ZZZZZ(.png),
#' where "XXX" represents the IFCB number and "ZZZZZ" represents the ROI number.
#' These filenames should match the `roi_number` assignment in the `feature_files` and can be
#' used as a substitute for classification files.
#' @param custom_classes (Optional) A character vector of corresponding class labels for `custom_images`.
#' @param class2use_file (Optional) A character string specifying the path to the file containing the `class2use` variable. Only required for manual results (default: NULL).
#' @param micron_factor Conversion factor from microns per pixel (default: 1/3.4).
#' @param diatom_class A character vector specifying diatom class names in WoRMS. Default: `"Bacillariophyceae"`.
#' @param diatom_include Optional character vector of class names that should always be treated as diatoms,
#' overriding the boolean result of \code{ifcb_is_diatom}. Default: NULL.
#' @param marine_only Logical. If `TRUE`, restricts the WoRMS search to marine taxa only. Default: `FALSE`.
#' @param threshold A character string controlling which classification to use.
#' `"opt"` (default) uses the threshold-applied classification, where
#' predictions below the per-class optimal threshold are labeled
#' `"unclassified"`. Any other value (e.g. `"all"`) uses the raw winning
#' class without any threshold applied.
#' @param multiblob Logical. If `TRUE`, includes multiblob features. Default: `FALSE`.
#' @param feature_recursive Logical. If `TRUE`, searches recursively for feature files when `feature_files` is a folder. Default: `TRUE`.
#' @param class_recursive Logical. If `TRUE` and `class_files` is a folder, searches recursively for classification files. Default: `TRUE`.
#' @param drop_zero_volume Logical. If `TRUE`, rows where `Biovolume` equals zero (e.g., artifacts such as smudges on the flow cell) are removed. Default: `FALSE`.
#' @param feature_version Optional numeric or character version to filter feature files by (e.g. 2 for "_v2"). Default is NULL (no filtering).
#' @param use_python Logical. If `TRUE`, attempts to read `.mat` files using a Python-based method (`SciPy`). Default: `FALSE`.
#' @param verbose Logical. If `TRUE`, prints progress messages. Default: `TRUE`.
#' @param mat_folder `r lifecycle::badge("deprecated")`
#' Use \code{class_files} instead.
#' @param mat_files `r lifecycle::badge("deprecated")`
#' Use \code{class_files} instead.
#' @param mat_recursive `r lifecycle::badge("deprecated")`
#' Use \code{class_recursive} instead.
#'
#' @return A data frame containing:
#' - `sample`: The sample name.
#' - `classifier`: The classifier used (if applicable).
#' - `roi_number`: The region of interest (ROI) number.
#' - `class`: The identified taxonomic class.
#' - `biovolume_um3`: Computed biovolume in cubic micrometers.
#' - `carbon_pg`: Estimated carbon content in picograms.
#'
#' @details
#' - **Classification Data Handling:**
#' - If `class_files` is provided, the function reads class annotations from `.mat`, `.h5`, or `.csv` files.
#' - If `custom_images` and `custom_classes` are supplied, they override classification file data (e.g. data from a CNN model).
#' - If both `class_files` and `custom_images/custom_classes` are given, `class_files` takes precedence.
#'
#' - **MAT File Processing:**
#' - If `use_python = TRUE`, the function reads `.mat` files using `ifcb_read_mat()` (requires Python + `SciPy`).
#' - Otherwise, it falls back to `R.matlab::readMat()`.
#'
#' @examples
#' \dontrun{
#' # Using classification results:
#' feature_files <- "data/features"
#' class_files <- "data/classified"
#'
#' biovolume_df <- ifcb_extract_biovolumes(feature_files,
#' class_files)
#'
#' print(biovolume_df)
#'
#' # Using custom classification result:
#' classes <- c("Mesodinium_rubrum",
#' "Mesodinium_rubrum")
#' images <- c("D20220522T003051_IFCB134_00002",
#' "D20220522T003051_IFCB134_00003")
#'
#' biovolume_df_custom <- ifcb_extract_biovolumes(feature_files,
#' custom_images = images,
#' custom_classes = classes)
#'
#' print(biovolume_df_custom)
#' }
#'
#' @references Menden-Deuer Susanne, Lessard Evelyn J., (2000), Carbon to volume relationships for dinoflagellates, diatoms, and other protist plankton, Limnology and Oceanography, 45(3), 569-579, doi: 10.4319/lo.2000.45.3.0569.
#' @references Sosik, H. M. and Olson, R. J. (2007), Automated taxonomic classification of phytoplankton sampled with imaging-in-flow cytometry. Limnol. Oceanogr: Methods 5, 204–216.
#'
#' @export
#'
#' @seealso \code{\link{ifcb_read_features}} \code{\link{ifcb_is_diatom}} \url{https://www.marinespecies.org/}
ifcb_extract_biovolumes <- function(feature_files, class_files = NULL, custom_images = NULL, custom_classes = NULL,
class2use_file = NULL, micron_factor = 1 / 3.4,
diatom_class = "Bacillariophyceae", diatom_include = NULL, marine_only = FALSE,
threshold = "opt", multiblob = FALSE, feature_recursive = TRUE,
class_recursive = TRUE, drop_zero_volume = FALSE,
feature_version = NULL, use_python = FALSE, verbose = TRUE,
mat_folder = deprecated(), mat_files = deprecated(), mat_recursive = deprecated()) {
# Handle deprecated mat_folder argument
if (lifecycle::is_present(mat_folder)) {
deprecate_warn("0.7.0", "iRfcb::ifcb_extract_biovolumes(mat_folder = )", "iRfcb::ifcb_extract_biovolumes(class_files = )")
class_files <- mat_folder
}
# Handle deprecated mat_files argument
if (lifecycle::is_present(mat_files)) {
deprecate_warn("0.8.0", "iRfcb::ifcb_extract_biovolumes(mat_files = )", "iRfcb::ifcb_extract_biovolumes(class_files = )")
class_files <- mat_files
}
# Handle deprecated mat_recursive argument
if (lifecycle::is_present(mat_recursive)) {
deprecate_warn("0.8.0", "iRfcb::ifcb_extract_biovolumes(mat_recursive = )", "iRfcb::ifcb_extract_biovolumes(class_recursive = )")
class_recursive <- mat_recursive
}
if (is.null(class_files) && (is.null(custom_images) || is.null(custom_classes))) {
stop("No classification information supplied. Provide either `class_files` or both `custom_images` and `custom_classes`.")
}
if (!is.null(class_files) && (!is.null(custom_images) || !is.null(custom_classes))) {
warning("Both `class_files` and `custom_images/custom_classes` were provided. The function will use `class_files` and ignore `custom_images` and `custom_classes`.")
}
if (is.character(feature_files)) {
if (length(feature_files) == 1) {
if (file.exists(feature_files)) {
# It's a single file
} else if (dir.exists(feature_files)) {
# It's a directory
} else {
stop("The specified file or directory does not exist: ", feature_files)
}
} else {
# feature_files is a vector of files
if (!all(file.exists(feature_files))) {
stop("One or more specified feature files do not exist.")
}
}
} else {
stop("feature_files must be a character vector of filenames or a single directory path.")
}
# Check if feature_files is a single folder path or a vector of file paths
if (length(feature_files) == 1 && dir.exists(feature_files)) {
feature_files <- list.files(feature_files, pattern = "D.*\\.csv", full.names = TRUE, recursive = feature_recursive)
}
if (!is.null(class_files)) {
# Check if class_files is a single folder path or a vector of file paths
if (length(class_files) == 1 && file.info(class_files)$isdir) {
class_files <- list.files(class_files, pattern = "\\.(mat|h5|csv)$", recursive = class_recursive, full.names = TRUE)
}
if (length(class_files) == 0) {
stop("No classification files found")
}
# Check if files are manually classified (.h5 and .csv files are never manual)
is_manual <- tolower(tools::file_ext(class_files[1])) == "mat" &&
"class2use_manual" %in% ifcb_get_mat_names(class_files[1])
if (is_manual && is.null(class2use_file)) {
stop("class2use must be specified when extracting manual biovolume data")
}
}
if (!is.null(custom_images)) {
# Extract date-time from class file paths
class_date_times <- gsub(".*D(\\d{8}T\\d{6})_.*", "\\1", custom_images)
} else {
# Extract date-time from class file paths
class_date_times <- gsub(".*D(\\d{8}T\\d{6})_.*", "\\1", class_files)
}
# Extract date-time from feature file paths
extracted_dates <- sub(".*D(\\d{8}T\\d{6}).*", "\\1", feature_files)
# List matching feature files
feature_files <- feature_files[extracted_dates %in% class_date_times]
if (verbose) {
message("Reading feature files...")
}
# Read feature files
features <- ifcb_read_features(feature_files = feature_files,
multiblob = multiblob,
feature_version = feature_version,
biovolume_only = TRUE,
verbose = verbose)
if (length(features) == 0) {
stop("No feature data files found")
}
data_list <- vector("list", length(features))
idx <- 1
for (file_name in names(features)) {
file_data <- features[[file_name]]
if (drop_zero_volume) {
file_data <- file_data[file_data$Biovolume != 0, , drop = FALSE]
}
if (nrow(file_data) > 0) {
sample_name <- if (multiblob) {
str_replace(file_name, "_multiblob_v\\d+.csv", "")
} else {
str_replace(file_name, "_fe[a-z]*_v\\d+\\.csv", "")
}
data_list[[idx]] <- tibble(
sample = sample_name,
roi_number = file_data$roi_number,
biovolume = file_data$Biovolume
)
idx <- idx + 1
} else if (drop_zero_volume) {
warning(sprintf(
"All rows were dropped for file '%s' because Biovolume == 0.",
file_name
))
}
}
biovolume_df <- do.call(rbind, data_list[seq_len(idx - 1)])
# Stop if the combined data frame is empty
if (is.null(biovolume_df)) {
stop("No biovolume data available in feature files.")
}
# If custom class list is supplied
if (!is.null(custom_images) && !is.null(custom_classes)) {
if (length(custom_images) != length(custom_classes)) {
stop("Error: The number of images does not match the number of class labels. Ensure both vectors have the same length.")
}
image_df <- ifcb_convert_filenames(custom_images)
class_df <- image_df %>%
mutate(classifier = NA,
class = custom_classes) %>%
select(sample, classifier, roi, class) %>%
rename(roi_number = roi)
} else {
# Find unique samples in biovolume_df
unique_samples <- unique(biovolume_df$sample)
# Identify matching class files
class_samples <- sub(".*(D\\d{8}T\\d{6}_IFCB\\d+).*", "\\1", class_files)
matching_class_files <- class_files[class_samples %in% unique_samples]
# Initialize an empty list to store data frames
tb_list <- list()
if (is_manual) {
class_df <- ifcb_count_mat_annotations(matching_class_files,
class2use_file,
sum_level = "roi",
use_python = use_python)
names(class_df)[2] <- "roi_number"
class_df$classifier <- NA
} else {
n_files <- length(matching_class_files)
tb_list <- vector("list", n_files)
# Set up the progress bar
if (verbose && n_files > 0) {
message("Reading classification files...")
pb <- txtProgressBar(min = 0, max = n_files, style = 3)
}
for (i in seq_along(matching_class_files)) {
if (verbose && n_files > 0) {
setTxtProgressBar(pb, i)
}
temp <- suppressWarnings({
read_class_file(matching_class_files[i], use_python = use_python)
})
sample_name <- sub("_class(_v\\d+)?\\.(mat|h5)$", "", basename(matching_class_files[i]))
# Also handle CSV files where the filename is just the sample name
sample_name <- sub("\\.csv$", "", sample_name)
tb_list[[i]] <- tibble(
sample = sample_name,
classifier = temp$classifierName,
roi_number = temp$roinum,
class = if (threshold == "opt")
unlist(temp$TBclass_above_threshold)
else
unlist(temp$TBclass)
)
}
# Close the progress bar
if (verbose && n_files > 0) close(pb)
}
if (!is_manual) {
# Combine all data frames into one
class_df <- do.call(rbind, tb_list)
}
}
# Join biovolume_df with class_df
biovolume_df <- left_join(biovolume_df, class_df, by = c("sample", "roi_number"))
# Calculate biovolume in cubic microns
biovolume_df$biovolume_um3 <- biovolume_df$biovolume * (micron_factor ^ 3)
# Determine if each class is a diatom
unique_classes <- unique(biovolume_df$class)
if (verbose) {
message("Retrieving WoRMS records...")
}
is_diatom <- tibble(class = unique_classes, is_diatom = ifcb_is_diatom(unique_classes,
diatom_class = diatom_class,
marine_only = marine_only,
verbose = verbose))
# Override diatom classification if diatom_include is provided
if (!is.null(diatom_include)) {
matched <- is_diatom$class %in% diatom_include
if (verbose && any(matched)) {
message("INFO: The following classes were manually included as diatoms via diatom_include:")
message(paste(sort(is_diatom$class[matched]), collapse = "\n"))
}
is_diatom$is_diatom[matched] <- TRUE
}
biovolume_df <- left_join(biovolume_df, is_diatom, by = "class")
# Filter rows where is_diatom$is_diatom is NA
na_classes <- is_diatom[is.na(is_diatom$is_diatom), "class"]
# Filter rows where is_diatom$is_diatom is NA
non_diatoms <- is_diatom[!is_diatom$is_diatom, "class"]
# Print the classes with NA values
if (length(na_classes$class) > 0 & verbose) {
message("INFO: Some classes could not be found in WoRMS. They will be assumed as NOT diatoms for carbon calculations:")
message(paste(sort(na_classes$class), collapse = "\n"))
}
# Print the classes that are non-Diatoms
if (length(non_diatoms$class) > 0 & verbose) {
message("INFO: The following classes are considered NOT diatoms for carbon calculations:")
message(paste(sort(non_diatoms$class), collapse = "\n"))
}
# Calculate carbon content based on diatom classification
biovolume_df <- biovolume_df %>%
mutate(carbon_pg = case_when(
!is.na(is_diatom) & is_diatom ~ vol2C_lgdiatom(biovolume_um3),
!is.na(is_diatom) & !is_diatom ~ vol2C_nondiatom(biovolume_um3),
is.na(is_diatom) ~ vol2C_nondiatom(biovolume_um3),
TRUE ~ NA_real_
)) %>%
select(-biovolume, -is_diatom)
type_convert(biovolume_df, col_types = cols())
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.