# R/set_reference_spectra.R
# In maldipickr: Dereplicate and Cherry-Pick Mass Spectrometry Spectra
#
# Documented in set_reference_spectra

# WARNING - Generated by {fusen} from dev/dereplicate-spectra.Rmd: do not edit by hand

#' Set a reference spectrum for each cluster
#'
#' Define a high-quality spectrum as the representative
#' spectrum of each cluster, based on the highest median signal-to-noise ratio
#' and, in case of ties, the highest number of detected peaks.
#'
#'
#' @param cluster_df A tibble of *n* rows, one for each spectrum, produced by the [delineate_with_similarity] function with at least the following columns:
#' * `name`: the rownames of the similarity matrix indicating the spectra names
#' * `membership`: integers stating the cluster number to which each spectrum belongs. It ranges from 1 to _c_, the total number of clusters.
#' * `cluster_size`: integers indicating the total number of spectra in the corresponding cluster.
#' @param metadata_df A tibble of *n* rows, one for each spectrum, produced by the [process_spectra] function with median signal-to-noise ratio (`SNR`), peaks number (`peaks`), and spectra names in the `name` column.
#'
#'
#' @return A merged tibble in the same order as `cluster_df` with both the columns of `cluster_df` and `metadata_df`, as well as a logical column `is_reference` indicating if the spectrum is the reference spectrum of the cluster.
#'
#' @seealso [delineate_with_similarity], [pick_spectra]
#'
#' @export
#' @examples
#' # Get an example directory of six Bruker MALDI Biotyper spectra
#' # Import the six spectra and
#' # Transform the spectra signals according to Strejcek et al. (2018)
#' processed <- system.file(
#'   "toy-species-spectra",
#'   package = "maldipickr"
#' ) %>%
#'   import_biotyper_spectra() %>%
#'   process_spectra()
#'
#' # Toy similarity matrix between the six example spectra of
#' #  three species. The cosine metric is used and a value of
#' #  zero indicates dissimilar spectra and a value of one
#' #  indicates identical spectra.
#' cosine_similarity <- matrix(
#'   c(
#'     1, 0.79, 0.77, 0.99, 0.98, 0.98,
#'     0.79, 1, 0.98, 0.79, 0.8, 0.8,
#'     0.77, 0.98, 1, 0.77, 0.77, 0.77,
#'     0.99, 0.79, 0.77, 1, 1, 0.99,
#'     0.98, 0.8, 0.77, 1, 1, 1,
#'     0.98, 0.8, 0.77, 0.99, 1, 1
#'   ),
#'   nrow = 6,
#'   dimnames = list(
#'     c(
#'       "species1_G2", "species2_E11", "species2_E12",
#'       "species3_F7", "species3_F8", "species3_F9"
#'     ),
#'     c(
#'       "species1_G2", "species2_E11", "species2_E12",
#'       "species3_F7", "species3_F8", "species3_F9"
#'     )
#'   )
#' )
#' # Delineate clusters based on a 0.92 threshold applied
#' #  to the similarity matrix
#' clusters <- delineate_with_similarity(
#'   cosine_similarity,
#'   threshold = 0.92
#' )
#'
#' # Set reference spectra with the toy example
#' set_reference_spectra(clusters, processed$metadata)
set_reference_spectra <- function(cluster_df, metadata_df) {
  # Flag, within each cluster, the spectrum with the highest median
  # signal-to-noise ratio (ties broken by the highest number of peaks) as
  # the reference, and return the joined table in the row order of
  # `cluster_df` with an extra logical column `is_reference`.
  #
  # Errors when the two tables differ in their number of rows, or when a
  # required column is absent from either table.

  # Checking the tibbles sizes
  if (base::nrow(cluster_df) != base::nrow(metadata_df)) {
    # stop() pastes its arguments together without any separator, so each
    # fragment carries its own trailing space to keep the message readable.
    stop(
      "The tibbles do not have the same number of rows! ",
      "Note: if multiple batches are included in 'cluster_df' ",
      "consider combining the multiple associated metadata tables ",
      "using 'dplyr::bind_rows()'."
    )
  }
  # Checking the tibbles columns
  cluster_columns <- c("name", "membership", "cluster_size")
  if (!all(cluster_columns %in% colnames(cluster_df))) {
    stop(
      "'cluster_df' lacks one of the following columns: ",
      "name, membership, cluster_size"
    )
  }
  metadata_columns <- c("name", "SNR", "peaks")
  if (!all(metadata_columns %in% colnames(metadata_df))) {
    stop(
      "'metadata_df' lacks one of the following columns: ",
      "name, SNR, peaks"
    )
  }
  # Merging cluster information and spectra/peaks metadata to select the
  # reference spectra
  cluster_df %>%
    # Keep track of the row order via a numeric id
    tibble::rowid_to_column() %>%
    dplyr::inner_join(metadata_df, by = "name") %>%
    # Make sure the sorting is done within groups
    dplyr::group_by(.data$membership) %>%
    dplyr::arrange(
      dplyr::desc(.data$SNR),
      dplyr::desc(.data$peaks),
      .by_group = TRUE
    ) %>%
    # After sorting, the first row of each group is the best spectrum:
    # a row is the reference when its name matches that first name
    dplyr::mutate(
      is_reference = .data$name == dplyr::first(.data$name)
    ) %>%
    dplyr::ungroup() %>%
    # Sort the tibble in the original order and drop the helper id
    dplyr::arrange(.data$rowid) %>%
    dplyr::select(-c("rowid"))
}