# File: R/delineate_with_identification.R
# In maldipickr: Dereplicate and Cherry-Pick Mass Spectrometry Spectra
#
# Documented in delineate_with_identification

# WARNING - Generated by {fusen} from dev/dereplicate-spectra.Rmd: do not edit by hand

#' Delineate clusters from taxonomic identifications
#'
#' From the report of taxonomic identification produced by the Bruker MALDI
#' Biotyper, spectra sharing the same identification are labeled in the same
#' cluster. Spectra with unknown identification (e.g., due to database
#' completeness) are each set in their own unique cluster.
#'
#' @param tibble_report A tibble of *n* rows, with *n* the number of spectra,
#' produced by [read_biotyper_report()] or [read_many_biotyper_reports()].
#' The long format and the best hits options are expected to be used in these
#' functions to produce a compliant input tibble.
#'
#' @details As all unknown identifications are considered unique clusters
#' _within one input tibble_, it is important to consider whether the
#' taxonomic identifications come from a single report or multiple reports,
#' depending on the research question. A message is displayed to confirm from
#' which type of reports the delineation was done.
#'
#' An identification is considered unknown when `bruker_species` is either
#' `"not reliable identification"` or missing (`NA`).
#'
#' The column names of `tibble_report` must match, in order, the columns of a
#' single report or of multiple reports, otherwise an error is raised.
#' A warning is emitted when the values of the `name` column are not unique,
#' which suggests that more than one hit per spectrum is present.
#'
#' @return A tibble of *n* rows, one per spectrum, and 3 columns:
#' * `name`: the spectra names from the `name` column from the output of
#'   either [read_biotyper_report()] or [read_many_biotyper_reports()].
#' * `membership`: integers stating the cluster number to which the spectra
#'   belong. It ranges from 1 to _c_, the total number of clusters.
#' * `cluster_size`: integers indicating the total number of spectra in the
#'   corresponding cluster.
#'
#' @seealso [delineate_with_similarity]
#'
#' @export
#' @examples
#' report_unknown <- read_biotyper_report(
#'   system.file("biotyper_unknown.csv", package = "maldipickr")
#' )
#' delineate_with_identification(report_unknown)
delineate_with_identification <- function(tibble_report) {
  # Species label flagging an identification as unknown
  unknown_label <- "not reliable identification"

  # check correct names and number of columns
  single_report_cols <- c(
    "name", "sample_name",
    "hit_rank", "bruker_quality",
    "bruker_species", "bruker_taxid",
    "bruker_hash", "bruker_log"
  )
  # Multiple reports carry an extra leading `name` column, and the per-report
  # `name` column is stored as `original_name`
  many_reports_cols <- c(
    "name",
    gsub("^name$", "original_name", single_report_cols)
  )
  report_cols <- base::colnames(tibble_report)
  if (identical(report_cols, single_report_cols)) {
    message("Generating clusters from single report")
  } else if (identical(report_cols, many_reports_cols)) {
    message("Generating clusters from multiple reports")
  } else {
    stop(
      "Unexpected format of Biotyper report.\n",
      "  Please ensure the report is imported in R correctly with either:\n",
      "    - `read_biotyper_report(long_format = TRUE)`\n",
      "    - `read_many_biotyper_reports()`"
    )
  }
  # In both accepted layouts the spectra identifiers are in the first column
  id_column <- "name"

  # Checking that best_hits = TRUE was used
  n_target_names <- dplyr::n_distinct(tibble_report[[id_column]])
  if (n_target_names != nrow(tibble_report)) {
    warning(
      "There is more than one spectrum identification per target!\n",
      "  Did you forget to use best_hits = TRUE or to filter the hits beforehand?"
    )
  }

  # Make unknown identification unique
  tibble_report %>%
    dplyr::mutate(
      # A missing identification is handled like an unreliable one, otherwise
      # it would propagate as a single cluster with an `NA` membership
      "membership" = dplyr::coalesce(
        as.character(.data$bruker_species),
        unknown_label
      ),
      # `make.unique()` suffixes every duplicated label, but only the
      # suffixed unknown labels are kept: known species stay shared
      "membership" = dplyr::if_else(
        .data$membership == unknown_label,
        make.unique(.data$membership),
        .data$membership
      ),
      # Cluster numbers follow the sorted order of the distinct labels
      "membership" = as.integer(as.factor(.data$membership))
    ) %>%
    dplyr::group_by(.data$membership) %>%
    dplyr::mutate(
      "cluster_size" = dplyr::n()
    ) %>%
    dplyr::ungroup() %>%
    dplyr::select(
      dplyr::all_of(c(id_column, "membership", "cluster_size"))
    )
}