R/pick_spectra.R
In maldipickr: Dereplicate and Cherry-Pick Mass Spectrometry Spectra

Documented in pick_spectra

# WARNING - Generated by {fusen} from dev/dereplicate-spectra.Rmd: do not edit by hand

#' Cherry-pick Bruker MALDI Biotyper spectra
#'
#' Using the clusters information, and potential additional metadata as external
#' criteria, spectra are labeled as to be picked for each cluster. Note that some
#' spectra and therefore clusters can be explicitly removed (_masked_)
#' from the picking decision if they have been previously picked
#' or should be discarded, using logical columns in the metadata table.
#' If no metadata are provided, the reference spectra of each cluster will be picked.
#'
#' @param cluster_df A tibble with clusters information
#' from the [delineate_with_similarity] or the [import_spede_clusters] function.
#' @param metadata_df Optional tibble with relevant metadata to guide the
#' picking process (e.g., OD600).
#' @param criteria_column Optional character indicating the column in `metadata_df`
#' to be used as a criteria.
#' @param hard_mask_column Column name in the `cluster_df` or `metadata_df` tibble indicating whether the spectra, **and the clusters to which they belong** should be discarded (`TRUE`) or not (`FALSE`) before the picking decision.
#' @param soft_mask_column Column name in the `cluster_df` or `metadata_df` tibble indicating whether the spectra should be discarded (`TRUE`) or not (`FALSE`) before the picking decision.
#' @param is_descending_order Optional logical indicating whether to sort the `criteria_column` from the highest-to-lowest value (`TRUE`) or lowest-to-highest (`FALSE`).
#' @param is_sorted Optional logical to indicate that the `cluster_df` is
#' already sorted by cluster based on (usually multiple) internal criteria to
#' pick the first of each cluster. This flag is **overridden** if a `metadata_df`
#' is provided.
#'
#' @return A tibble with as many rows as `cluster_df` with an additional logical
#' column named `to_pick` to indicate whether the colony associated to the spectra
#' should be picked. If `metadata_df` is provided, then additional columns from
#' this tibble are added to the returned tibble.
#'
#' @seealso [delineate_with_similarity], [set_reference_spectra]. For a useful utility function to soft-mask specific spectra: [is_well_on_edge].
#'
#' @export
#' @examples
#' # 0. Load a toy example of a tibble of clusters created by
#' #   the `delineate_with_similarity` function.
#' clusters <- readRDS(
#'   system.file("clusters_tibble.RDS",
#'     package = "maldipickr"
#'   )
#' )
#' # 1. By default and if no other metadata are provided,
#' #   the function picks reference spectra for each clusters.
#' #
#' # N.B: The spectra `name` and `to_pick` columns are moved to the left
#' # only for clarity using the `relocate()` function.
#' #
#' pick_spectra(clusters) %>%
#'   dplyr::relocate(name, to_pick) # only for clarity
#'
#' # 2.1 Simulate OD600 values with uniform distribution
#' #  for each of the colonies we measured with
#' #  the Bruker MALDI Biotyper
#' set.seed(104)
#' metadata <- dplyr::transmute(
#'   clusters,
#'   name = name, OD600 = runif(n = nrow(clusters))
#' )
#' metadata
#'
#' # 2.2 Pick the spectra based on the highest
#' #   OD600 value per cluster
#' pick_spectra(clusters, metadata, "OD600") %>%
#'   dplyr::relocate(name, to_pick) # only for clarity
#'
#' # 3.1 Say that the wells on the right side of the plate are
#' #   used for negative controls and should not be picked.
#' metadata <- metadata %>% dplyr::mutate(
#'   well = gsub(".*[A-Z]([0-9]{1,2}$)", "\\1", name) %>%
#'     strtoi(),
#'   is_edge = is_well_on_edge(
#'     well_number = well, plate_layout = 96, edges = "right"
#'   )
#' )
#'
#' # 3.2 Pick the spectra after discarding (or soft masking)
#' #   the spectra indicated by the `is_edge` column.
#' pick_spectra(clusters, metadata, "OD600",
#'   soft_mask_column = "is_edge"
#' ) %>%
#'   dplyr::relocate(name, to_pick) # only for clarity
#'
#' # 4.1 Say that some spectra were picked before
#' #   (e.g., in the column F) in a previous experiment.
#' # We do not want to pick clusters with those spectra
#' #   included to limit redundancy.
#' metadata <- metadata %>% dplyr::mutate(
#'   picked_before = grepl("_F", name)
#' )
#' # 4.2 Pick the spectra from clusters without spectra
#' #   labeled as `picked_before` (hard masking).
#' pick_spectra(clusters, metadata, "OD600",
#'   hard_mask_column = "picked_before"
#' ) %>%
#'   dplyr::relocate(name, to_pick) # only for clarity
pick_spectra <- function(
    cluster_df,
    metadata_df = NULL, criteria_column = NULL,
    hard_mask_column = NULL, soft_mask_column = NULL,
    is_descending_order = TRUE,
    is_sorted = FALSE) {
  # Check that:
  #  - the mandatory column describing the cluster are present
  #  - clusters have a reference spectra to be picked if no metadata are provided
  #  - when metadata (resp. a criteria column) is provided,
  #     a criteria column (resp. metadata) should be provided as well
  #  - when masking columns are indicated, the metadata tibble should be added
  #  - the spectra name in the metadata table are also present in the clusters table
  if (any(!c("name", "membership") %in% colnames(cluster_df))) {
    stop(
      "'cluster_df' lacks one of the following columns: name or membership"
    )
  }
  if (is.null(metadata_df) & !"is_reference" %in% colnames(cluster_df)) {
    stop(
      "No additional metadata are provided ('metadata_df' tibble is NULL)",
      " and there is no 'is_reference' column in the 'cluster_df' tibble.\n",
      "Please do one of the following strategy:\n",
      "    1. Either run the 'set_reference_spectra' or 'import_spede_clusters'",
      "       function beforehand to indicate a reference spectra\n",
      "    2. Provide a 'metadata' tibble\n"
    )
  }
  if (
    (!is.null(metadata_df) & is.null(criteria_column)) ||
      (is.null(metadata_df) & !is.null(criteria_column))
  ) {
    stop(
      "Additional metadata ('metadata_df' tibble) should be provided",
      " with a criteria column ('criteria_column') to be sorted.",
      "Please provide the two together."
    )
  }
  if (
    (is.null(metadata_df) & !is.null(hard_mask_column)) ||
      (is.null(metadata_df) & !is.null(soft_mask_column))
  ) {
    stop(
      "Masking column(s) ('hard'/'soft') require an additional",
      " metadata ('metadata_df')"
    )
  }
  if (any(!metadata_df$name %in% cluster_df$name)) {
    stop(
      "The spectra names in the metadata ('metadata_df' tibble) table",
      " do not match the names in the clusters table ('cluster_df')."
    )
  }
  # Warn that metadata table are prioritize over the 'is_sorted' flag
  if (is_sorted & !is.null(metadata_df)) {
    message(
      "The 'is_sorted' flag is set to TRUE, while a metadata table is",
      " provided. ",
      "Please note that the metadata table takes precedence and the cherry-picking",
      " step will rely on the 'criteria_column'."
    )
    # Set the flag to FALSE for consistency
    is_sorted <- FALSE
  }

  # Preparing the table by merging with the metadata if present
  #   and checking the presence of the columns

  # Keep the initial order of the rows
  cluster_df <- tibble::rowid_to_column(cluster_df, "rowid")
  # And a copy of the unchanged data.frame
  # orig_cluster_df <- dplyr::select(cluster_df, c("rowid", "name"))
  orig_cluster_df <- cluster_df

  if (!is.null(metadata_df)) {
    cluster_df <- cluster_df %>%
      dplyr::left_join(metadata_df, by = "name")
    # Update the copy of the unchanged data.frame
    orig_cluster_df <- cluster_df
    if (!is.null(criteria_column) & !criteria_column %in% colnames(cluster_df)) {
      stop(
        "The 'criteria_column' is not present in the merged tibble."
      )
    }
    if (!is.null(soft_mask_column)) {
      if (!soft_mask_column %in% colnames(cluster_df)) {
        stop(
          "The 'soft_mask_column' is not present in the merged tibble."
        )
      }
    }
    if (!is.null(hard_mask_column)) {
      if (!hard_mask_column %in% colnames(cluster_df)) {
        stop(
          "The 'hard_mask_column' is not present in the merged tibble."
        )
      }
    }

    # Hard mask: discard the spectra *and* their clusters
    # Soft mask: discard the spectra only

    # Soft mask
    if (!is.null(soft_mask_column)) {
      cluster_df <- cluster_df %>% filter(!.data[[soft_mask_column]])
    }
    # Hard masking is used to label a group that should be
    # discarded in the dereplication process and not be chosen
    # from:
    #  e.g. 24h in the 24h vs 48h comparison
    #  e.g. fast in the fast vs slow growers comparison
    if (!is.null(hard_mask_column)) {
      clusters_to_keep <- cluster_df %>%
        dplyr::select(tidyselect::all_of(c("membership", hard_mask_column))) %>%
        # Internally, we will label the clusters to be kept,
        #  meaning the clusters that:
        #   do NOT contain spectra from the discard group
        #   AND contain only spectra from one group (either all discard, or all keep)
        dplyr::distinct() %>%
        dplyr::add_count(.data$membership) %>%
        dplyr::filter(n == 1 & !.data[[hard_mask_column]]) %>%
        dplyr::pull(.data$membership)

      # Remove the clusters where picking is forbidden
      cluster_df <- dplyr::filter(cluster_df, .data$membership %in% clusters_to_keep)
    }
  }


  # Sort the spectra within each cluster using the provided metadata
  if (!is.null(metadata_df) & !is_sorted) {
    cluster_df <- cluster_df %>%
      dplyr::group_by(.data$membership)
    if (is_descending_order) {
      cluster_df <- dplyr::arrange(
        cluster_df, dplyr::desc(.data[[criteria_column]]),
        .by_group = TRUE
      )
    } else {
      cluster_df <- dplyr::arrange(
        cluster_df, .data[[criteria_column]],
        .by_group = TRUE
      )
    }
  }

  # Pick the reference spectra if no metadata are present
  if (is.null(metadata_df)) {
    cluster_df <- cluster_df %>%
      dplyr::mutate(
        "to_pick" = .data$is_reference
      )
  } else {
    # first() gives the name of the spectra so needs for an
    # extra step to convert to a logical vector
    cluster_df <- cluster_df %>%
      dplyr::mutate(
        "to_pick" = dplyr::first(.data$name),
        "to_pick" = .data$to_pick == .data$name
      )
  }
  # Merge with original tibble to keep potential masked/discarded spectra
  # and same row number and order
  # NA in 'to_pick' are replaced with FALSE in the case of discarded clusters

  cols <- base::intersect(
    colnames(cluster_df),
    colnames(orig_cluster_df)
  )
  cluster_df %>%
    dplyr::ungroup() %>%
    dplyr::right_join(orig_cluster_df, by = cols) %>%
    dplyr::mutate(
      "to_pick" = tidyr::replace_na(.data$to_pick, FALSE)
    ) %>%
    # Sort the tibble in the original order
    dplyr::arrange(.data$rowid) %>%
    dplyr::select(-c("rowid")) %>%
    return()
}