R/import_spede_clusters.R
In maldipickr: Dereplicate and Cherry-Pick Mass Spectrometry Spectra

Documented in import_spede_clusters

# WARNING - Generated by {fusen} from dev/dereplicate-spectra.Rmd: do not edit by hand

#' Import clusters results generated by SPeDE
#'
#' Reformat the table output from the analysis of raw Bruker MALDI Biotyper
#' spectra by the SPeDE tool from Dumolin et al. (2019) to be consistent with
#' the Strejcek et al. (2018) procedure followed in the [maldipickr] package.
#'
#' @param path Path to the comma separated table generated by SPeDE
#'
#' @return A tibble with the following columns:
#'
#' * `name`: a character denoting the spectra name (all spaces, dashes and dots are replaced by underscores "_" in SPeDE)
#' * `membership`: integers stating the cluster number to which the spectra belong. It ranges from 1 to _c_, the total number of clusters.
#' * `cluster_size`: integers indicating the total number of spectra in the corresponding cluster.
#' * `quality`: a character indicating the spectra quality category by SPeDE, out of GREEN, ORANGE and RED.
#' * `is_reference`: a logical indicating whether the corresponding spectrum is a reference spectrum of the cluster.
#'
#' @seealso <https://github.com/LM-UGent/SPeDE>
#' @references Dumolin C, Aerts M, Verheyde B, Schellaert S, Vandamme T, Van Der Jeugt F, De Canck E, Cnockaert M, Wieme AD, Cleenwerck I, Peiren J, Dawyndt P, Vandamme P, & Carlier A. (2019). "Introducing SPeDE: High-Throughput Dereplication and Accurate Determination of Microbial Diversity from Matrix-Assisted Laser Desorption–Ionization Time of Flight Mass Spectrometry Data". *MSystems* 4(5). <doi:10.1128/msystems.00437-19>.
#' @export
#' @examples
#' # Reformat the output from SPeDE table
#' # https://github.com/LM-UGent/SPeDE
#' import_spede_clusters(
#'   system.file("spede.csv", package = "maldipickr")
#' )
import_spede_clusters <- function(path) {
  # Import the SPeDE csv table with only the columns planned to be used
  utils::read.csv(path) %>%
    dplyr::select(
      "SOURCE_FILE",
      "QUALITY",
      "REFERENCE",
      "REFERENCE_NUMBER"
    ) %>%
    # Discard the trailing "Not matched:" and "Rejected spectra:" rows at the
    # end of the csv: they are recognised by their missing REFERENCE_NUMBER
    dplyr::filter(!is.na(.data$REFERENCE_NUMBER)) %>%
    tibble::as_tibble() %>%
    # Sanitize the column names and variable types to the package convention
    dplyr::rename(
      "name" = "SOURCE_FILE",
      "membership" = "REFERENCE_NUMBER",
      "quality" = "QUALITY"
    ) %>%
    dplyr::mutate(
      # Shift the cluster numbers by one so that the membership starts from 1
      "membership" = .data$membership + 1,
      # The comparison itself is the logical flag (NA stays NA), so no
      # if_else() wrapper is needed
      "is_reference" = .data$REFERENCE == "Yes"
    ) %>%
    # Add the cluster_size, i.e. the number of spectra sharing a membership
    dplyr::group_by(.data$membership) %>%
    dplyr::mutate("cluster_size" = dplyr::n()) %>%
    dplyr::ungroup() %>%
    # Keep only the documented columns, in the documented order
    dplyr::select(
      "name", "membership", "cluster_size", "quality", "is_reference"
    )
}