Nothing
# WARNING - Generated by {fusen} from dev/import-data.Rmd: do not edit by hand
#' Importing Bruker MALDI Biotyper CSV report
#'
#'
#' The header-less table exported by the Compass software in the Bruker MALDI
#' Biotyper device is separated by semi-colons and has empty columns which prevent
#' an easy import in R. This function reads the report correctly as a tibble.
#'
#' @details
#' The header-less table contains identification information for each target processed by
#' the Biotyper device and once processed by the `read_biotyper_report`,
#' the following seven columns are available in the tibble, _when using the `best_hits = TRUE` option_:
#' * `name`: a character indicating the name of the spot of the MALDI target (i.e., plate)
#' * `sample_name`: the character string provided during the preparation of the MALDI target (i.e., plate)
#' * `hit_rank`: an integer indicating the rank of the hit for the corresponding target and identification
#' * `bruker_quality`: a character encoding the quality of the identification with potentially multiple "+" symbol or only one "-"
#' * `bruker_species`: the species name associated with the MALDI spectrum analyzed.
#' * `bruker_taxid`: the NCBI Taxonomy Identifier of the species name in the column species
#' * `bruker_hash`: a hash from an undocumented checksum function probably to encode the database entry.
#' * `bruker_log`: the log-score of the identification.
#'
#' When all hits are returned (with `best_hits = FALSE`), the default output format is the long format (`long_format = TRUE`), meaning that the previous columns remain
#' unchanged, but all hits are now returned, thus increasing the number of rows.
#'
#' When all hits are returned (with `best_hits = FALSE`) _using the wide format_ (`long_format = FALSE), the two columns `name` and `sample_name`
#' remains unchanged, but the five columns prefixed by `bruker_` contain the hit rank, **creating a tibble of 52 columns**:
#'
#' * `bruker_01_quality`
#' * `bruker_01_species`
#' * `bruker_01_taxid`
#' * `bruker_01_hash`
#' * `bruker_01_log`
#' * `bruker_02_quality`
#' * ...
#' * `bruker_10_species`
#' * `bruker_10_taxid`
#' * `bruker_10_hash`
#' * `bruker_10_log`
#'
#' @note A report that contains only spectra with no peaks found will return a tibble of 0 rows and a warning message.
#'
#' @param path Path to the semi-colon separated table
#' @param best_hits A logical indicating whether to return only the best hits for each target analyzed
#' @param long_format A logical indicating whether the table is in the long format (many rows) or wide format (many columns) when showing all the hits. This option has no effect when `best_hits = TRUE`.
#'
#' @return
#' A tibble of 7 columns (`best_hits = TRUE`) or 52 columns (`best_hits = FALSE`). See Details for the description of the columns.
#'
#' @seealso [read_many_biotyper_reports]
#'
#' @export
#'
#' @examples
#' # Get a example Bruker report
#' biotyper <- system.file("biotyper.csv", package = "maldipickr")
#' # Import the report as a tibble
#' report_tibble <- read_biotyper_report(biotyper)
#' # Display the tibble
#' report_tibble
read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
# Prepare the columns names, because 10 hits are reported by default
prep_names <- tidyr::expand_grid(
"prefix" = "bruker",
"iteration" = sprintf("%02d", 1:10), # Because 10 hits per spot with each 5 columns
"variables" = c("quality", "species", "taxid", "hash", "log")
) %>% dplyr::mutate(
"type" = dplyr::if_else(.data$variables == "log", "d", "c"),
"col_names" = paste(.data$prefix, .data$iteration, .data$variables, sep = "_")
)
# Read in the report, usually warnings about problems and
# inconsistent number of columns are triggered
# Having name as first column always is to enable
# taxonomic identification cherry-picking
breport <- utils::read.delim(
path,
col.names = c("name", "sample_name", prep_names$col_names),
sep = ";", header = FALSE,
na = c("NA", "E1", "E2", "") # Added E1 identification in taxid as NA
)
no_peak_lgl <- breport$bruker_01_species == "no peaks found"
# Remove the spot name for which no peaks were detected, and warn the user
breport <- tibble::as_tibble(breport) %>%
# Empty sample_name are considered logical and this is undesirable
dplyr::mutate("sample_name" = as.character(.data$sample_name)) %>%
dplyr::filter(.data$bruker_01_species != "no peaks found")
if (sum(no_peak_lgl) > 0) {
warning(
"Remove ", sum(no_peak_lgl), " row(s) out of ", length(no_peak_lgl),
" due to no peaks found"
)
}
# Fix issue with empty tibble that could not run the LONG/WIDE procedure
# Otherwise exit with
# Error in `dplyr::relocate()`:
# Can't subset columns that don't exist (quality for instance)
if (nrow(breport) == 0) {
tibble::tibble(
"name" = character(), "sample_name" = character(), "hit_rank" = integer(),
"bruker_quality" = character(), "bruker_species" = character(),
"bruker_taxid" = numeric(), "bruker_hash" = character(),
"bruker_log" = numeric()
) %>% return()
} else {
# Format the table in WIDE (many columns) or LONG format (many rows)
# By design, the table is wide.
# But the default tibble rendering is long
# styler: off
if ( (long_format & best_hits) |
(long_format & !best_hits) |
(!long_format & best_hits)) {
# styler: on
# The tibble has different types meaning
# a naive approach with `pivot_longer()` directly would raise:
# Error in `pivot_longer()`:
# ! Can't combine `bruker_01_quality` <character> and `bruker_01_taxid` <integer>.
# Subset the table with only the character variables
report_chr <- breport %>%
dplyr::select(
c("name", "sample_name") |
tidyselect::contains("bruker") & tidyselect::where(is.character)
) %>%
tidyr::pivot_longer(
!c("name", "sample_name"),
names_to = c("hit_rank", "type"),
names_pattern = "bruker_(.*)_(.*)"
) %>%
tidyr::pivot_wider(names_from = "type", values_from = "value")
report_num <- breport %>%
dplyr::select(
tidyselect::all_of(c("name", "sample_name")) |
tidyselect::contains("bruker") & tidyselect::where(is.numeric)
) %>%
tidyr::pivot_longer(
!tidyselect::all_of(c("name", "sample_name")),
names_to = c("hit_rank", "type"),
names_pattern = "bruker_(.*)_(.*)"
) %>%
tidyr::pivot_wider(names_from = "type", values_from = "value")
# Combine the two sub-tables and convert hit rank to integer for further filtering.
breport <- dplyr::full_join(
report_chr,
report_num,
by = c("name", "sample_name", "hit_rank")
) %>%
dplyr::mutate("hit_rank" = strtoi(.data$hit_rank, base = 10L)) %>%
dplyr::relocate(
c(
"name", "sample_name", "hit_rank",
"quality", "species", "taxid", "hash", "log"
)
) %>%
dplyr::rename_with(
~ paste0("bruker_", .x),
!c("name", "sample_name", "hit_rank")
)
}
# when all hits are used, pivot the wide table
# to have the name sample_name hit_number and the rest of the column
if (best_hits) {
breport %>%
dplyr::filter(.data$hit_rank == 1) %>%
return()
} else {
return(breport)
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.