R/read_biotyper_report.R
In maldipickr: Dereplicate and Cherry-Pick Mass Spectrometry Spectra

Documented in read_biotyper_report

# WARNING - Generated by {fusen} from dev/import-data.Rmd: do not edit by hand

#' Importing Bruker MALDI Biotyper CSV report
#'
#'
#' The header-less table exported by the Compass software in the Bruker MALDI
#' Biotyper device is separated by semi-colons and has empty columns which prevent
#' an easy import in R. This function reads the report correctly as a tibble.
#'
#' @details
#' The header-less table contains identification information for each target processed by
#' the Biotyper device and once processed by the `read_biotyper_report`,
#' the following seven columns are available in the tibble, _when using the `best_hits = TRUE` option_:
#' * `name`: a character indicating the name of the spot of the MALDI target (i.e., plate)
#' * `sample_name`: the character string provided during the preparation of the MALDI target (i.e., plate)
#' * `hit_rank`: an integer indicating the rank of the hit for the corresponding target and identification
#' * `bruker_quality`: a character encoding the quality of the identification with potentially multiple "+" symbol or only one "-"
#' * `bruker_species`: the species name associated with the MALDI spectrum analyzed.
#' * `bruker_taxid`: the NCBI Taxonomy Identifier of the species name in the column species
#' * `bruker_hash`: a hash from an undocumented checksum function probably to encode the database entry.
#' * `bruker_log`: the log-score of the identification.
#'
#' When all hits are returned (with `best_hits = FALSE`), the default output format is the long format (`long_format = TRUE`), meaning that the previous columns remain
#' unchanged, but all hits are now returned, thus increasing the number of rows.
#'
#' When all hits are returned (with `best_hits = FALSE`) _using the wide format_ (`long_format = FALSE), the two columns `name` and `sample_name`
#' remains unchanged, but the five columns prefixed by `bruker_` contain the hit rank, **creating a tibble of 52 columns**:
#'
#' * `bruker_01_quality`
#' * `bruker_01_species`
#' * `bruker_01_taxid`
#' * `bruker_01_hash`
#' * `bruker_01_log`
#' * `bruker_02_quality`
#' * ...
#' * `bruker_10_species`
#' * `bruker_10_taxid`
#' * `bruker_10_hash`
#' * `bruker_10_log`
#'
#' @note A report that contains only spectra with no peaks found will return a tibble of 0 rows and a warning message.
#'
#' @param path Path to the semi-colon separated table
#' @param best_hits A logical indicating whether to return only the best hits for each target analyzed
#' @param long_format A logical indicating whether the table is in the long format (many rows) or wide format (many columns) when showing all the hits. This option has no effect when `best_hits = TRUE`.
#'
#' @return
#' A tibble of 7 columns (`best_hits = TRUE`) or 52 columns (`best_hits = FALSE`). See Details for the description of the columns.
#'
#' @seealso [read_many_biotyper_reports]
#'
#' @export
#'
#' @examples
#' # Get a example Bruker report
#' biotyper <- system.file("biotyper.csv", package = "maldipickr")
#' # Import the report as a tibble
#' report_tibble <- read_biotyper_report(biotyper)
#' # Display the tibble
#' report_tibble
read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
  # Prepare the columns names, because 10 hits are reported by default
  prep_names <- tidyr::expand_grid(
    "prefix" = "bruker",
    "iteration" = sprintf("%02d", 1:10), # Because 10 hits per spot with each 5 columns
    "variables" = c("quality", "species", "taxid", "hash", "log")
  ) %>% dplyr::mutate(
    "type" = dplyr::if_else(.data$variables == "log", "d", "c"),
    "col_names" = paste(.data$prefix, .data$iteration, .data$variables, sep = "_")
  )

  # Read in the report, usually warnings about problems and
  #  inconsistent number of columns are triggered
  # Having name as first column always is to enable
  #  taxonomic identification cherry-picking
  breport <- utils::read.delim(
    path,
    col.names = c("name", "sample_name", prep_names$col_names),
    sep = ";", header = FALSE,
    na = c("NA", "E1", "E2", "") # Added E1 identification in taxid as NA
  )
  no_peak_lgl <- breport$bruker_01_species == "no peaks found"

  # Remove the spot name for which no peaks were detected, and warn the user
  breport <- tibble::as_tibble(breport) %>%
    # Empty sample_name are considered logical and this is undesirable
    dplyr::mutate("sample_name" = as.character(.data$sample_name)) %>%
    dplyr::filter(.data$bruker_01_species != "no peaks found")
  if (sum(no_peak_lgl) > 0) {
    warning(
      "Remove ", sum(no_peak_lgl), " row(s) out of ", length(no_peak_lgl),
      " due to no peaks found"
    )
  }

  # Fix issue with empty tibble that could not run the LONG/WIDE procedure
  # Otherwise exit with
  # Error in `dplyr::relocate()`:
  # Can't subset columns that don't exist (quality for instance)
  if (nrow(breport) == 0) {
    tibble::tibble(
      "name" = character(), "sample_name" = character(), "hit_rank" = integer(),
      "bruker_quality" = character(), "bruker_species" = character(),
      "bruker_taxid" = numeric(), "bruker_hash" = character(),
      "bruker_log" = numeric()
    ) %>% return()
  } else {
    # Format the table in WIDE (many columns) or LONG format (many rows)
    # By design, the table is wide.
    # But the default tibble rendering is long

    # styler: off
  if ( (long_format & best_hits)  |
       (long_format & !best_hits) |
       (!long_format & best_hits)) {
      # styler: on


      # The tibble has different types meaning
      # a naive approach with `pivot_longer()` directly would raise:
      # Error in `pivot_longer()`:
      # ! Can't combine `bruker_01_quality` <character> and `bruker_01_taxid` <integer>.

      # Subset the table with only the character variables
      report_chr <- breport %>%
        dplyr::select(
          c("name", "sample_name") |
            tidyselect::contains("bruker") & tidyselect::where(is.character)
        ) %>%
        tidyr::pivot_longer(
          !c("name", "sample_name"),
          names_to = c("hit_rank", "type"),
          names_pattern = "bruker_(.*)_(.*)"
        ) %>%
        tidyr::pivot_wider(names_from = "type", values_from = "value")

      report_num <- breport %>%
        dplyr::select(
          tidyselect::all_of(c("name", "sample_name")) |
            tidyselect::contains("bruker") & tidyselect::where(is.numeric)
        ) %>%
        tidyr::pivot_longer(
          !tidyselect::all_of(c("name", "sample_name")),
          names_to = c("hit_rank", "type"),
          names_pattern = "bruker_(.*)_(.*)"
        ) %>%
        tidyr::pivot_wider(names_from = "type", values_from = "value")

      # Combine the two sub-tables and convert hit rank to integer for further filtering.
      breport <- dplyr::full_join(
        report_chr,
        report_num,
        by = c("name", "sample_name", "hit_rank")
      ) %>%
        dplyr::mutate("hit_rank" = strtoi(.data$hit_rank, base = 10L)) %>%
        dplyr::relocate(
          c(
            "name", "sample_name", "hit_rank",
            "quality", "species", "taxid", "hash", "log"
          )
        ) %>%
        dplyr::rename_with(
          ~ paste0("bruker_", .x),
          !c("name", "sample_name", "hit_rank")
        )
    }
    # when all hits are used, pivot the wide table
    # to have the name sample_name hit_number and the rest of the column
    if (best_hits) {
      breport %>%
        dplyr::filter(.data$hit_rank == 1) %>%
        return()
    } else {
      return(breport)
    }
  }
}