Nothing
# WARNING - Generated by {fusen} from dev/dereplicate-spectra.Rmd: do not edit by hand
#' Delineate clusters from taxonomic identifications
#'
#' From the report of taxonomic identification produced by the Bruker MALDI
#' Biotyper, spectra sharing the same identification are labeled in the same
#' cluster. Spectra with unknown identification (e.g., due to database
#' completeness) are each set in their own unique cluster.
#'
#' @param tibble_report A tibble of *n* rows, with *n* the number of spectra,
#' produced by [read_biotyper_report()] or [read_many_biotyper_reports()].
#' The long format and the best hits options are expected to be used in these
#' functions to produce a compliant input tibble.
#'
#' @details As all unknown identifications are considered unique clusters
#' _within one input tibble_, it is important to consider whether the taxonomic
#' identifications come from a single report or multiple reports, depending on
#' the research question. A message is displayed to confirm from which type of
#' reports the delineation was done.
#'
#' An identification is considered unknown when the `bruker_species` column is
#' either `"not reliable identification"` or missing (`NA`).
#'
#' A warning is raised when the spectra names are not unique, as this indicates
#' that more than one hit per spectrum is present in the input.
#'
#' @return A tibble of *n* rows, one per spectrum, and 3 columns:
#' * `name`: the spectra names from the `name` column from the output of either
#'   [read_biotyper_report()] or [read_many_biotyper_reports()].
#' * `membership`: integers stating the cluster number to which the spectra
#'   belong. It ranges from 1 to _c_, the total number of clusters.
#' * `cluster_size`: integers indicating the total number of spectra in the
#'   corresponding cluster.
#'
#' @seealso [delineate_with_similarity]
#'
#' @export
#' @examples
#' report_unknown <- read_biotyper_report(
#'   system.file("biotyper_unknown.csv", package = "maldipickr")
#' )
#' delineate_with_identification(report_unknown)
delineate_with_identification <- function(tibble_report) {
  # Label found in `bruker_species` for spectra without a reliable match
  unknown_label <- "not reliable identification"

  # Expected column layouts (names and order) of the two supported importers
  single_report_cols <- c(
    "name", "sample_name",
    "hit_rank", "bruker_quality",
    "bruker_species", "bruker_taxid",
    "bruker_hash", "bruker_log"
  )
  many_reports_cols <- c(
    "name",
    gsub("^name$", "original_name", single_report_cols)
  )

  report_cols <- base::colnames(tibble_report)
  if (identical(report_cols, single_report_cols)) {
    message("Generating clusters from single report")
  } else if (identical(report_cols, many_reports_cols)) {
    message("Generating clusters from multiple reports")
  } else {
    stop(
      "Unexpected format of Biotyper report.\n",
      " Please ensure the report is imported in R correctly with either:\n",
      " - `read_biotyper_report(long_format = TRUE)`\n",
      " - `read_many_biotyper_reports()`"
    )
  }
  # Both layouts store the spectra identifier in their first column, `name`
  id_column <- single_report_cols[1]

  # Checking that best_hits = TRUE was used: one row per spectrum is expected
  n_target_names <- dplyr::n_distinct(tibble_report[[id_column]])
  if (n_target_names != nrow(tibble_report)) {
    warning(
      "There is more than one spectrum identification per target!\n",
      " Did you forget to use best_hits = TRUE or to filter the hits beforehand?"
    )
  }

  labelled <- dplyr::mutate(
    tibble_report,
    # Missing identifications are unknown identifications too; without this
    # step they would all end up with an `NA` membership
    "membership" = replace(
      .data$bruker_species,
      is.na(.data$bruker_species),
      unknown_label
    ),
    # Make unknown identification unique so each one forms its own cluster
    "membership" = dplyr::if_else(
      .data$membership == unknown_label,
      make.unique(.data$membership),
      .data$membership
    ),
    # Cluster numbers follow the sorted order of the identification labels
    "membership" = as.integer(as.factor(.data$membership))
  )

  grouped <- dplyr::group_by(labelled, .data$membership)
  sized <- dplyr::ungroup(
    dplyr::mutate(grouped, "cluster_size" = dplyr::n())
  )

  dplyr::select(
    sized,
    dplyr::all_of(c(id_column, "membership", "cluster_size"))
  )
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.