isoorbi: Process Orbitrap Isotopocule Data

Documented in orbi_define_basepeak orbi_filter_flagged_data orbi_filter_satellite_peaks orbi_filter_scan_intensity orbi_filter_weak_isotopocules orbi_flag_outliers orbi_flag_satellite_peaks orbi_flag_weak_isotopocules

# Internal utility functions =============

# internal function to ensure dataset columns are factors (if they exist)
# will warn the user about the transformation
factorize_dataset <- function(dataset, cols = c()) {
  for (col in cols) {
    if (col %in% names(dataset) && !is.factor(dataset[[col]])) {
      sprintf("column `%s` was turned into a factor", col) |> message()
      dataset[[col]] <- factor_in_order(dataset[[col]])
    }
  }
  return(dataset)
}

# group if exists
group_if_exists <- function(dataset, cols, add = TRUE) {
  for (col in cols) {
    if (col %in% names(dataset))
      dataset <- dataset |> group_by(!!sym(col), .add = add)
  }
  return(dataset)
}

# group one dataset the same as another (use to restore original groupings)
group_by_same_groups <- function(target_dataset, source_dataset) {
  target_dataset |> dplyr::group_by(!!!dplyr::groups(source_dataset))
}

# count distinct column respecting existing grouping
count_grouped_distinct <- function(dataset, column) {
  dataset |> dplyr::select(!!!c(dplyr::groups(dataset), column)) |>
    dplyr::distinct() |> dplyr::count(!!sym(column)) |>
    dplyr::pull(.data$n) |> sum()
}

# internal function to wrap around expressions that should throw errors whenever anything unexpected happens
# error/warn can be text or function
try_catch_all <- function(expr, error, warn = error, newline = TRUE) {
  tryCatch(
    {{ expr }},
    error = function(p) {
      if(newline) cat("\n")
      if (is_function(error)) error(p)
      else abort(error, parent = p)
    },
    warning = function(p) {
      if(newline) cat("\n")
      if (is_function(warn)) warn(p)
      else abort(warn, parent = p)
    }
  )
}

# print out info start message
message_start <- function(...) {
  message_wrap(..., exdent = 3, appendLF = !interactive())
  return(Sys.time())
}

# print out info end message
message_finish <- function(..., start_time = NULL, pre = if (!interactive()) "..." else "",  indent = if (interactive()) 0 else 3) {
  end_time <- Sys.time()
  time_info <-
    if (!is.null(start_time)) sprintf(" in %.2f seconds.", end_time - start_time)
    else ""
  message_wrap(pre, ..., time_info, indent = indent, exdent = 3)
}

# print out standalone info message
message_standalone <- function(..., start_time = NULL) {
  message_finish(..., start_time = start_time, pre = "", indent = 0)
}

# print out a message that wraps in non-interactive mode (e.g. in notebooks)
message_wrap <- function (..., appendLF = TRUE, width = if (!interactive()) options()$width - 2 else NA, indent = 0, exdent = 0) {
  if (is.na(width)) {
    message(..., appendLF = appendLF)
  } else {
    original <- paste0(..., collapse = "")
    original |>
      strwrap(width = width, indent = indent, exdent = exdent) |>
      paste(collapse = "\n") |>
      message(appendLF = appendLF)
  }
  invisible()
}


# Common utility functions to clean and annotate data ------------------------------------

#' @title Function replaced by `orbi_flag_satellite_peaks()`
#' @param ... parameters passed on to the new function [orbi_flag_satellite_peaks()].
#' @export
orbi_filter_satellite_peaks <- function(...) {
  lifecycle::deprecate_warn("1.2.0", "orbi_filter_satellite_peaks()", "orbi_flag_satellite_peaks()", always = TRUE)
  orbi_flag_satellite_peaks(...) |>
    orbi_filter_flagged_data()
}

#' @title Flag minor satellite peaks
#' @description Flag minor signals (e.g., satellite peaks) that were reported by IsoX (filter them out with `orbi_filter_flagged_data()`).
#' @param dataset A data frame or tibble produced from IsoX data by `orbi_simplify_isox()`
#' @details The `orbi_filter_satellite_peaks()` function removes minor signals for an isotopocule that have been reported by IsoX.
#' These are often small `satellite peaks` generated by the Fourier transform.
#'
#' If there are signal of high intensity or very many signals, this can indicate that the m/z and tolerance setting used for processing .raw files with IsoX were incorrect.
#'
#' @examples
#' fpath <- system.file("extdata", "testfile_flow.isox", package = "isoorbi")
#' df <-
#'   orbi_read_isox(file = fpath) |>
#'   orbi_simplify_isox() |>
#'   orbi_flag_satellite_peaks()
#'
#' @return A data frame with new column `is_satellite_peak` that flags satellite peaks.
#' @export
orbi_flag_satellite_peaks <- function(dataset) {

  # safety checks
  cols <- c("filepath", "filename", "compound", "scan.no", "time.min", "isotopocule", "ions.incremental", "tic", "it.ms")
  stopifnot(
    "need a `dataset` data frame" = !missing(dataset) && is.data.frame(dataset),
    "`dataset` requires columns `filepath`, `filename`, `compound`, `scan.no`, `time.min`, `isotopocule`, `ions.incremental`, `tic` and `it.ms`" =
      all(cols %in% names(dataset))
  )

  # info
  start_time <- message_start("orbi_flag_satellite_peaks() is flagging minor signals (satellite peaks)... ")

  # calculation
  dataset <-
    try_catch_all(
      dataset |>
        dplyr::group_by(
          .data$filename,
          .data$compound,
          .data$scan.no,
          .data$isotopocule
        ) |>
        dplyr::mutate(is_satellite_peak = .data$ions.incremental < max(.data$ions.incremental)) |>
        dplyr::ungroup(),
      "something went wrong tying to flag satellite peaks: "
    )

  # info
  sat_peaks <- sum(dataset$is_satellite_peak)
  isotopocules <- dataset |> 
    dplyr::summarise(has_sat_peaks = any(.data$is_satellite_peak), .by = "isotopocule") |>
    dplyr::filter(.data$has_sat_peaks) |>
    dplyr::pull(.data$isotopocule)
  
  if(sat_peaks > 0){
    sprintf(
      "flagged %d/%d peaks in %d isotopocules (\"%s\") as satellite peaks (%.1f%%)",
      sat_peaks, nrow(dataset), length(isotopocules), 
      paste(isotopocules, collapse = "\", \""),
      100 * sat_peaks/nrow(dataset)) |>
      message_finish(start_time = start_time)
  } else {
    message_finish("confirmed there are no satellite peaks", start_time = start_time)
  }

  return(dataset)
}

#' @title Function replaced by `orbi_flag_weak_isotopocules()`
#' @param ... parameters passed on to the new function orbi_flag_weak_isotopocules().
#' @export
orbi_filter_weak_isotopocules <- function(...) {
  lifecycle::deprecate_warn("1.2.0", "orbi_filter_weak_isotopocules()", "orbi_flag_weak_isotopocules()", always = TRUE)
  orbi_flag_weak_isotopocules(...) |>
    orbi_filter_flagged_data()
}

#' @title Flag weak isotopocules
#' @description The function `orbi_filter_weak_isotopocules()` flags isotopocules that are not detected in a minimum of `min_percent` of scans. This function evaluates weak isotopocules within each "filename", "block", "segment" and "injection" (if these columns exist), in addition to any groupings already defined before calling this function using dplyr's `group_by()`. It restores the original groupings in the returned data frame.
#'
#' @param dataset A simplified IsoX data frame to be processed
#' @param min_percent A number between 0 and 90. Isotopocule must be observed in at least this percentage of scans (please note: the percentage is defined relative to the most commonly observed isotopocule of each compound)
#'
#' @examples
#' fpath <- system.file("extdata", "testfile_flow.isox", package = "isoorbi")
#' df <- orbi_read_isox(file = fpath) |>
#'       orbi_simplify_isox() |>
#'       orbi_flag_weak_isotopocules(min_percent = 2)
#'
#' @details The input `dataset` is expected to have at least these 8 columns: `filename`, `scan.no`, `time.min`, `compound`, `isotopocule`, `ions.incremental`, `tic`, `it.ms`.
#'
#' @return A data frame with new column `is_weak_isotopocule` that flags weak isotopocules.
#' @export
orbi_flag_weak_isotopocules <-
  function(dataset, min_percent) {

    # safety checks
    cols <- c("filepath", "filename", "compound", "scan.no", "time.min", "isotopocule", "ions.incremental", "tic", "it.ms")
    stopifnot(
      "need a `dataset` data frame" = !missing(dataset) && is.data.frame(dataset),
      "`dataset` requires columns `filepath`, `filename`, `compound`, `scan.no`, `time.min`, `isotopocule`, `ions.incremental`, `tic` and `it.ms`" =
        all(cols %in% names(dataset)),
      "`min_percent` needs to be a single number" = !missing(min_percent) && is.numeric(min_percent) && length(min_percent) == 1L,
      "`min_percent` needs to be between 0 and 90" = min_percent >= 0 && min_percent <= 90
    )

    # ensure factors
    dataset <- dataset |> factorize_dataset("isotopocule")

    # groupings if they exist
    dataset_out <- dataset |> group_if_exists(c("filename", "compound", "block", "segment", "injection"), add = TRUE)

    # info
    start_time <-
      sprintf(
        "orbi_flag_weak_isotopocules() is flagging isotopocules from data that are detected in less than %s%% of scans in %d data group(s) (based on '%s')... ",
        min_percent,  dplyr::n_groups(dataset_out), paste(dplyr::group_vars(dataset_out), collapse = "', '")) |>
      message_start()
    starting_isotopocules <- dataset_out |> count_grouped_distinct("isotopocule")

    # minor peak removal
    dataset_out <-
      try_catch_all(
        dataset_out |>
          dplyr::mutate(max.scans = length(unique(.data$scan.no))) |>
          dplyr::group_by(.data$isotopocule, .add = TRUE) |>
          dplyr::mutate(obs.scans = length(unique(.data$scan.no))) |>
          dplyr::mutate(is_weak_isotopocule = .data$obs.scans < min_percent / 100 * .data$max.scans) |>
          dplyr::select(-"obs.scans", -"max.scans"),
      "something went wrong flagging weak isotopocules: "
    )

    # info
    removed_isotopocules <- dataset_out |> dplyr::filter(.data$is_weak_isotopocule) |> count_grouped_distinct("isotopocule")
    if (removed_isotopocules > 0) {
      sprintf(
        "flagged %d/%d isotopocules across all data groups (use `orbi_plot_isotopocule_coverage()` to visualize them)",
        removed_isotopocules, starting_isotopocules) |>
        message_finish(start_time = start_time)
    } else {
      message_finish("confirmed there are no weak isotopocules", start_time = start_time)
    }

    # return with restored groupings from original dataset
    return(dataset_out |> group_by_same_groups(dataset))
  }


#' @title Function replaced by `orbi_flag_outliers()`
#' @param ... parameters passed on to the new function [orbi_flag_outliers()].
#' @param outlier_percent outlier_percent needs to be between 0 and 10, flags extreme scans based on TIC x injection time (i.e., ion intensity)
#' @export
orbi_filter_scan_intensity <- function(..., outlier_percent) {
  lifecycle::deprecate_warn("1.2.0", "orbi_filter_scan_intensity()", "orbi_flag_outliers()", always = TRUE)
  lifecycle::deprecate_warn("1.2.0", "orbi_filter_scan_intensity(outlier_percent)", details = "the argument `outlier_percent` has been superseded by `agc_window`", always = TRUE)
  orbi_flag_outliers(..., agc_window = c(outlier_percent, 100 - outlier_percent)) |>
    orbi_filter_flagged_data()
}


#' @title Flag outlier scans
#' @description The function `orbi_flag_outliers()` flags outliers using one of the different methods provided by the parameters (to use multiple, please call this function several times sequentially). Note that this function evaluates outliers within each "filename", "block", "segment" and "injection" (if these columns exist), in addition to any groupings already defined before calling this function using dplyr's `group_by()`. It restores the original groupings in the returned data frame.
#' 
#' @param agc_window flags scans with a critically low or high number of ions in the Orbitrap analyzer. Provide a vector with 2 numbers `c(x,y)` flagging the lowest x percent and highest y percent. TIC multiplied by injection time serves as an estimate for the number of ions in the Orbitrap.
#' @param agc_fold_cutoff flags scans with a fold cutoff based on the average number of ions in the Orbitrap analyzer. For example, `agc_fold_cutoff = 2` flags scans that have more than 2 times, or less than 1/2 times the average. TIC multiplied by injection time serves as an estimate for the number of ions in the Orbitrap. 
#' @param dataset Simplified IsoX dataset to have outliers flagged
#' @details Function is intended to flag scans that are outliers.
#'
#' The input `dataset` is expected to have at least these 8 columns: `filename`, `scan.no`, `time.min`, `compound`, `isotopocule`, `ions.incremental`, `tic`, `it.ms`.
#'
#' @examples
#' fpath <- system.file("extdata", "testfile_flow.isox", package = "isoorbi")
#' df <-
#'   orbi_read_isox(file = fpath) |>
#'   orbi_simplify_isox() |>
#'   orbi_flag_outliers(agc_window = c(1,99))
#'
#' @return A data frame with new columns `is_outlier` and `outlier_type` (if they don't already exist) that flags outliers identified by the method and provides the type of outlier (e.g. "2 fold agc cutoff"), respectively.
#' @export
orbi_flag_outliers <- function(dataset, agc_fold_cutoff = NA_real_, agc_window = c()) {

  # safety checks
  cols <- c("filename", "compound", "scan.no", "tic", "it.ms")
  stopifnot(
    "need a `dataset` data frame" = !missing(dataset) && is.data.frame(dataset),
    "`dataset` requires columns `filename`, `compound`, `scan.no`, `tic` and `it.ms`" =
      all(cols %in% names(dataset)),
    "if provided, `agc_fold_cutoff` needs to be a single number" = (length(agc_fold_cutoff) == 1L && is.na(agc_fold_cutoff)) || is_scalar_double(agc_fold_cutoff),
    "if provided, `agc_window` needs to be a vector of two numbers (low and high filter) between 0 and 100" = length(agc_window) == 0L || (is.numeric(agc_window) && length(agc_window) == 2L && agc_window[1] < agc_window[2])
  )

  # check filter to apply
  method <- c(
    agc_window = length(agc_window) == 2L,
    agc_fold_cutoff = !is.na(agc_fold_cutoff)
  )
  if (sum(method) > 1L) {
    sprintf("can only use one method at a time, please call this function sequentially for each of these parameters: '%s'",
            paste(names(method)[method], collapse = "', '")) |>
    abort()
  } else if (sum(method) == 0L) {
    sprintf("need to define at least one of these parameters for identifying outliers: '%s'",
            paste(names(method), collapse = "', '")) |>
      abort()
  }
  method <- names(method)[method]
  
  # method message
  method_msg <- ""
  method_type <- ""
  if (method == "agc_window") {
    method_msg <- sprintf(
      "%s %% of scans with the lowest and above %s %% of scans with the highest number of ions (`tic` * `it.ms`) in the Orbitrap analyzer",
      agc_window[1], agc_window[2]
    )
    method_type <- sprintf("AGC window (%s to %s %%)", agc_window[1], agc_window[2])
  } else if (method == "agc_fold_cutoff") {
    method_msg <- sprintf(
      "scans below 1/%s and above %s times the average number of ions (`tic` * `it.ms`) in the Orbitrap analyzer",
      agc_fold_cutoff, agc_fold_cutoff
    )
    method_type <- sprintf("%s fold AGC cutoff", agc_fold_cutoff)
  }
  
  # optional groupings
  dataset_out <- dataset |> group_if_exists(c("filename", "block", "segment", "injection"), add = TRUE)
  
  # info
  start_time <-
    sprintf(
      "orbi_flag_outliers() is flagging the %s in %d data group(s) (based on '%s')... ",
      method_msg,
      dplyr::n_groups(dataset_out), paste(dplyr::group_vars(dataset_out), collapse = "', '")) |>
    message_start()
  n_scans <- dataset_out |> count_grouped_distinct("scan.no")

  # calculation
  dataset_out <- try_catch_all(
    if (method == "agc_window") {
      dataset_out |>
        dplyr::mutate(
          TICxIT = .data$tic * .data$it.ms,
          is_new_outlier =
            .data$TICxIT < stats::quantile(.data$TICxIT, agc_window[1] / 100) |
            .data$TICxIT > stats::quantile(.data$TICxIT, agc_window[2] / 100)
        ) |>
        dplyr::select(-"TICxIT")
    } else if (method == "agc_fold_cutoff") {
      dataset_out |>
        dplyr::mutate(
          TICxIT = .data$tic * .data$it.ms,
          TICxIT_mean = mean(.data$TICxIT),
          is_new_outlier =
            .data$TICxIT < 1/agc_fold_cutoff * .data$TICxIT_mean |
            .data$TICxIT > agc_fold_cutoff * .data$TICxIT_mean
        ) |>
        dplyr::select(-"TICxIT", -"TICxIT_mean")
    },
    "something went wrong flagging outliers: "
  )

  # dataset outlier and type update
  if (!"is_outlier" %in% names(dataset_out)) {
    dataset_out$is_outlier <- FALSE
    dataset_out$outlier_type <- NA_character_
  }
  dataset_out <- dataset_out |>
    dplyr::mutate(
      is_outlier = ifelse(.data$is_new_outlier, TRUE, .data$is_outlier),
      outlier_type = ifelse(.data$is_new_outlier, method_type, .data$outlier_type)
    ) |>
    dplyr::select(-"is_new_outlier")
  
  # info
  n_scans_removed <- dataset_out |> dplyr::filter(.data$is_outlier) |> count_grouped_distinct("scan.no")
  if(n_scans_removed > 0){
    sprintf(
      "flagged %d/%d scans (%.1f%%) as outliers (use `orbi_plot_raw_data(y = tic * it.ms)` to visualize them) across all data groups",
      n_scans_removed, n_scans, n_scans_removed/n_scans * 100) |>
      message_finish(start_time = start_time)
  } else {
    message_finish("confirmed there are no outliers based on this method", start_time = start_time)
  }
 

  # return with restored groupings from original dataset
  return(dataset_out |> group_by_same_groups(dataset))
}

# filter flagged data (fast internal function)
filter_flagged_data <- function(dataset, filter_satellite_peaks = TRUE, filter_weak_isotopocules = TRUE, filter_outliers = TRUE) {
  if (filter_satellite_peaks && "is_satellite_peak" %in% names(dataset)) {
    dataset <- dataset |> filter(!.data$is_satellite_peak)
  }
  if (filter_weak_isotopocules && "is_weak_isotopocule" %in% names(dataset)) {
    dataset <- dataset |> filter(!.data$is_weak_isotopocule)
  }
  if (filter_outliers && "is_outlier" %in% names(dataset)) {
    dataset <- dataset |> filter(!.data$is_outlier)
  }
  return(dataset |> droplevels())
}

#' @title Filter out flagged data
#' @description This function filters out data that have been previously flagged using functions `orbi_flag_satellite_peaks()`, `orbi_flag_weak_isotopocules()`, and/or `orbi_flag_outliers()`. Note that this function is no longer necessary to call explicitly as `orbi_analyze_shot_noise()` and `orbi_summarize_results()` automatically exclude flagged data.
#' @param dataset a tibble with previously flagged data from `orbi_flag_satellite_peaks()`, `orbi_flag_weak_isotopocules()`, and/or `orbi_flag_outliers()`
#' @return a dataset with the flagged data filtered out
#' @export
orbi_filter_flagged_data <- function(dataset) {

  # safety checks
  stopifnot(
    "need a `dataset` data frame" = !missing(dataset) && is.data.frame(dataset)
  )

  # deprecation
  lifecycle::deprecate_warn(
    "1.3.0", "orbi_filter_flagged_data()", 
    details = "filtering flagged data is no longer necessary as orbi_summarize_results() and other functions take flagged data into consideration and treat it appropriately", 
    always = TRUE
  )
  
  # original n
  nall <- nrow(dataset)
  nsat_peaks <- 0L
  nweak_isos <- 0L
  noutliers <- 0L

  # remove flagged
  start_time <- Sys.time()
  details <- c()
  if ("is_satellite_peak" %in% names(dataset)) {
    dataset <- dataset |> filter(!.data$is_satellite_peak)
    nsat_peaks <- nall - nrow(dataset)
    details <- c(details, sprintf("%d satellite peaks", nsat_peaks))
  }
  if ("is_weak_isotopocule" %in% names(dataset)) {
    dataset <- dataset |> filter(!.data$is_weak_isotopocule)
    nweak_isos <- nall - nsat_peaks - nrow(dataset)
    details <- c(details, sprintf("%d weak isotopocule peaks", nweak_isos))
  }
  if ("is_outlier" %in% names(dataset)) {
    dataset <- dataset |> filter(!.data$is_outlier)
    noutliers <- nall - nsat_peaks - nweak_isos - nrow(dataset)
    details <- c(details, sprintf("%d outlier peaks", noutliers))
  }

  # info
  sprintf(
    "orbi_filter_flagged_data() removed %d flagged records (%.1f%%)%s",
    nsat_peaks + nweak_isos + noutliers, 100 * (nsat_peaks + nweak_isos + noutliers)/nall,
    if(length(details) > 0) sprintf(" - %s", paste(details, collapse = ", ")) else ""
  ) |>
    message_standalone(start_time = start_time)

  # return
  return(dataset |> droplevels())
}


#' @title Define the denominator for ratio calculation
#' @description `orbi_define_basepeak()` sets one isotopocule in the data frame as the base peak (ratio denominator) and calculates the instantaneous isotope ratios against it.
#' @param dataset A tibble from a `IsoX` output. Needs to contain columns for `filename`, `compound`, `scan.no`, `isotopocule`, and `ions.incremental`.
#' @param basepeak_def The isotopocule that gets defined as base peak, i.e. the denominator to calculate ratios
#'
#' @examples
#' fpath <- system.file("extdata", "testfile_flow.isox", package = "isoorbi")
#' df <- orbi_read_isox(file = fpath) |>
#'   orbi_simplify_isox() |>
#'   orbi_define_basepeak(basepeak_def = "M0")
#'
#' @returns Input data frame without the rows of the basepeak isotopocule and instead three new columns called `basepeak`, `basepeak_ions`, and `ratio` holding the basepeak information and the isotope ratios vs. the base peak
#' @export
orbi_define_basepeak <- function(dataset, basepeak_def) {

  # safety checks
  stopifnot(
    "need a `dataset` data frame" = !missing(dataset) && is.data.frame(dataset),
    "`dataset` requires columns `filename`, `compound`, `scan.no`, `isotopocule`, and `ions.incremental`" =
      all(c("filename", "compound", "scan.no", "isotopocule", "ions.incremental") %in% names(dataset)),
    "`basepeak_def` needs to be a single text value identifying the isotopocule to use as the basepeak" =
      !missing(basepeak_def) && rlang::is_scalar_character(basepeak_def),
    "`basepeak_def` is not an isotopocule in the dataset" = basepeak_def %in% levels(dataset$isotopocule)
  )

  # ensure factors
  dataset <- dataset |> factorize_dataset("isotopocule")

  # info message
  start_time <-
    sprintf(
      "orbi_define_basepeak() is setting the '%s' isotopocule as the ratio denominator... ",
      basepeak_def) |>
    message_start()

  # identify `basepeak` for each scan
  df.out <-
    try_catch_all({
      df.out <- dataset |>
        dplyr::mutate(basepeak = !!basepeak_def) |>
        dplyr::group_by(.data$filename, .data$compound, .data$scan.no)
      # add basepeak ions
      if ("is_satellite_peak" %in% names(dataset)) {
        # with satellite peak defined
        df.out <- df.out |>
          dplyr::mutate(
            basepeak_ions = .data$ions.incremental[.data$isotopocule == !!basepeak_def & !.data$is_satellite_peak]
          )
      } else {
        # without satellite peak defined
        df.out <- df.out |>
          dplyr::mutate(
            basepeak_ions = .data$ions.incremental[.data$isotopocule == !!basepeak_def]
          )
      }
      # return
      df.out |>
        dplyr::ungroup()
    },
    # error catching
    function(p) {
      # analyze scans (is slow so only done if error)
      df_summary <-
        dataset |>
        dplyr::summarize(
          n_bp = sum(.data$isotopocule == !!basepeak_def),
          .by = c("filename", "compound", "scan.no")
        ) |>
        dplyr::summarize(
          n_scans = dplyr::n(),
          n_too_few = sum(.data$n_bp == 0L),
          n_too_many = sum(.data$n_bp > 1L),
          .by = c("filename", "compound")
        )

      # check abundances
      too_many_bps <- df_summary |>
        dplyr::filter(.data$n_too_many > 0)
      too_few_bps <- df_summary |>
        dplyr::filter(.data$n_too_few > 0)

      if (nrow(too_many_bps) > 0 && !"is_satellite_peak" %in% names(dataset)) {
        # too many base peaks
        sprintf("the %s isotopocule exists multiple times in some scans, make sure to run orbi_flag_satellite_peaks() first",
                basepeak_def) |>
          abort()
      } else if (nrow(too_few_bps) > 0) {
        info <-
          too_few_bps |>
          dplyr::mutate(
            label = sprintf("basepeak '%s' is missing in %d scans (%.1f%%) of compound '%s' in file '%s'",
                            basepeak_def, .data$n_too_few, .data$n_too_few/.data$n_scans * 100, .data$compound, .data$filename)
          ) |>
          dplyr::pull(.data$label)

        sprintf("the '%s' isotopocule does not exist in some scans, consider using `orbi_filter_isox()` to focus on specific file(s) and/or compound(s): \n - %s", basepeak_def,
                paste(info, collapse = "\n - ")) |>
          abort()
      } else {
        # some other error
        abort("something went wrong identifying the base peak for each scan", parent = p)
      }
    }
    )

  # remove basepeak from isotopocule list
  df.out <-
    try_catch_all(
      df.out |> dplyr::filter(.data$isotopocule != basepeak_def) |> droplevels(),
      "something went wrong removing the base peak isotopocule"
    )

  # calculate ratios
  df.out <-
    try_catch_all(
      df.out |>
        dplyr::mutate(
          ratio = .data$ions.incremental / .data$basepeak_ions,
          .after = "ions.incremental"
        ),
      "something went wrong calculating ratios: "
    )
  
  # info message
  sprintf(
    "set base peak and calculated %d ratios for %d isotopocules/base peak (%s)",
    nrow(df.out),
    length(levels(df.out$isotopocule)),
    paste(levels(df.out$isotopocule), collapse = ", ")
  ) |> message_finish(start_time = start_time)

  # return
  return(df.out)
}