Nothing
# WARNING - Generated by {fusen} from dev/dereplicate-spectra.Rmd: do not edit by hand
#' Cherry-pick Bruker MALDI Biotyper spectra
#'
#' Using the clusters information, and potential additional metadata as external
#' criteria, spectra are labeled as to be picked for each cluster. Note that some
#' spectra and therefore clusters can be explicitly removed (_masked_)
#' from the picking decision if they have been previously picked
#' or should be discarded, using logical columns in the metadata table.
#' If no metadata are provided, the reference spectra of each cluster will be picked.
#'
#' @param cluster_df A tibble with clusters information
#' from the [delineate_with_similarity] or the [import_spede_clusters] function.
#' @param metadata_df Optional tibble with relevant metadata to guide the
#' picking process (e.g., OD600).
#' @param criteria_column Optional character indicating the column in `metadata_df`
#' to be used as a criteria.
#' @param hard_mask_column Column name in the `cluster_df` or `metadata_df` tibble indicating whether the spectra, **and the clusters to which they belong** should be discarded (`TRUE`) or not (`FALSE`) before the picking decision.
#' @param soft_mask_column Column name in the `cluster_df` or `metadata_df` tibble indicating whether the spectra should be discarded (`TRUE`) or not (`FALSE`) before the picking decision.
#' @param is_descending_order Optional logical indicating whether to sort the `criteria_column` from the highest-to-lowest value (`TRUE`) or lowest-to-highest (`FALSE`).
#' @param is_sorted Optional logical to indicate that the `cluster_df` is
#' already sorted by cluster based on (usually multiple) internal criteria to
#' pick the first of each cluster. This flag is **overridden** if a `metadata_df`
#' is provided.
#'
#' @return A tibble with as many rows as `cluster_df` with an additional logical
#' column named `to_pick` to indicate whether the colony associated to the spectra
#' should be picked. If `metadata_df` is provided, then additional columns from
#' this tibble are added to the returned tibble.
#'
#' @seealso [delineate_with_similarity], [set_reference_spectra]. For a useful utility function to soft-mask specific spectra: [is_well_on_edge].
#'
#' @export
#' @examples
#' # 0. Load a toy example of a tibble of clusters created by
#' # the `delineate_with_similarity` function.
#' clusters <- readRDS(
#' system.file("clusters_tibble.RDS",
#' package = "maldipickr"
#' )
#' )
#' # 1. By default and if no other metadata are provided,
#' # the function picks reference spectra for each clusters.
#' #
#' # N.B: The spectra `name` and `to_pick` columns are moved to the left
#' # only for clarity using the `relocate()` function.
#' #
#' pick_spectra(clusters) %>%
#' dplyr::relocate(name, to_pick) # only for clarity
#'
#' # 2.1 Simulate OD600 values with uniform distribution
#' # for each of the colonies we measured with
#' # the Bruker MALDI Biotyper
#' set.seed(104)
#' metadata <- dplyr::transmute(
#' clusters,
#' name = name, OD600 = runif(n = nrow(clusters))
#' )
#' metadata
#'
#' # 2.2 Pick the spectra based on the highest
#' # OD600 value per cluster
#' pick_spectra(clusters, metadata, "OD600") %>%
#' dplyr::relocate(name, to_pick) # only for clarity
#'
#' # 3.1 Say that the wells on the right side of the plate are
#' # used for negative controls and should not be picked.
#' metadata <- metadata %>% dplyr::mutate(
#' well = gsub(".*[A-Z]([0-9]{1,2}$)", "\\1", name) %>%
#' strtoi(),
#' is_edge = is_well_on_edge(
#' well_number = well, plate_layout = 96, edges = "right"
#' )
#' )
#'
#' # 3.2 Pick the spectra after discarding (or soft masking)
#' # the spectra indicated by the `is_edge` column.
#' pick_spectra(clusters, metadata, "OD600",
#' soft_mask_column = "is_edge"
#' ) %>%
#' dplyr::relocate(name, to_pick) # only for clarity
#'
#' # 4.1 Say that some spectra were picked before
#' # (e.g., in the column F) in a previous experiment.
#' # We do not want to pick clusters with those spectra
#' # included to limit redundancy.
#' metadata <- metadata %>% dplyr::mutate(
#' picked_before = grepl("_F", name)
#' )
#' # 4.2 Pick the spectra from clusters without spectra
#' # labeled as `picked_before` (hard masking).
#' pick_spectra(clusters, metadata, "OD600",
#' hard_mask_column = "picked_before"
#' ) %>%
#' dplyr::relocate(name, to_pick) # only for clarity
pick_spectra <- function(
cluster_df,
metadata_df = NULL, criteria_column = NULL,
hard_mask_column = NULL, soft_mask_column = NULL,
is_descending_order = TRUE,
is_sorted = FALSE) {
# Check that:
# - the mandatory column describing the cluster are present
# - clusters have a reference spectra to be picked if no metadata are provided
# - when metadata (resp. a criteria column) is provided,
# a criteria column (resp. metadata) should be provided as well
# - when masking columns are indicated, the metadata tibble should be added
# - the spectra name in the metadata table are also present in the clusters table
if (any(!c("name", "membership") %in% colnames(cluster_df))) {
stop(
"'cluster_df' lacks one of the following columns: name or membership"
)
}
if (is.null(metadata_df) & !"is_reference" %in% colnames(cluster_df)) {
stop(
"No additional metadata are provided ('metadata_df' tibble is NULL)",
" and there is no 'is_reference' column in the 'cluster_df' tibble.\n",
"Please do one of the following strategy:\n",
" 1. Either run the 'set_reference_spectra' or 'import_spede_clusters'",
" function beforehand to indicate a reference spectra\n",
" 2. Provide a 'metadata' tibble\n"
)
}
if (
(!is.null(metadata_df) & is.null(criteria_column)) ||
(is.null(metadata_df) & !is.null(criteria_column))
) {
stop(
"Additional metadata ('metadata_df' tibble) should be provided",
" with a criteria column ('criteria_column') to be sorted.",
"Please provide the two together."
)
}
if (
(is.null(metadata_df) & !is.null(hard_mask_column)) ||
(is.null(metadata_df) & !is.null(soft_mask_column))
) {
stop(
"Masking column(s) ('hard'/'soft') require an additional",
" metadata ('metadata_df')"
)
}
if (any(!metadata_df$name %in% cluster_df$name)) {
stop(
"The spectra names in the metadata ('metadata_df' tibble) table",
" do not match the names in the clusters table ('cluster_df')."
)
}
# Warn that metadata table are prioritize over the 'is_sorted' flag
if (is_sorted & !is.null(metadata_df)) {
message(
"The 'is_sorted' flag is set to TRUE, while a metadata table is",
" provided. ",
"Please note that the metadata table takes precedence and the cherry-picking",
" step will rely on the 'criteria_column'."
)
# Set the flag to FALSE for consistency
is_sorted <- FALSE
}
# Preparing the table by merging with the metadata if present
# and checking the presence of the columns
# Keep the initial order of the rows
cluster_df <- tibble::rowid_to_column(cluster_df, "rowid")
# And a copy of the unchanged data.frame
# orig_cluster_df <- dplyr::select(cluster_df, c("rowid", "name"))
orig_cluster_df <- cluster_df
if (!is.null(metadata_df)) {
cluster_df <- cluster_df %>%
dplyr::left_join(metadata_df, by = "name")
# Update the copy of the unchanged data.frame
orig_cluster_df <- cluster_df
if (!is.null(criteria_column) & !criteria_column %in% colnames(cluster_df)) {
stop(
"The 'criteria_column' is not present in the merged tibble."
)
}
if (!is.null(soft_mask_column)) {
if (!soft_mask_column %in% colnames(cluster_df)) {
stop(
"The 'soft_mask_column' is not present in the merged tibble."
)
}
}
if (!is.null(hard_mask_column)) {
if (!hard_mask_column %in% colnames(cluster_df)) {
stop(
"The 'hard_mask_column' is not present in the merged tibble."
)
}
}
# Hard mask: discard the spectra *and* their clusters
# Soft mask: discard the spectra only
# Soft mask
if (!is.null(soft_mask_column)) {
cluster_df <- cluster_df %>% filter(!.data[[soft_mask_column]])
}
# Hard masking is used to label a group that should be
# discarded in the dereplication process and not be chosen
# from:
# e.g. 24h in the 24h vs 48h comparison
# e.g. fast in the fast vs slow growers comparison
if (!is.null(hard_mask_column)) {
clusters_to_keep <- cluster_df %>%
dplyr::select(tidyselect::all_of(c("membership", hard_mask_column))) %>%
# Internally, we will label the clusters to be kept,
# meaning the clusters that:
# do NOT contain spectra from the discard group
# AND contain only spectra from one group (either all discard, or all keep)
dplyr::distinct() %>%
dplyr::add_count(.data$membership) %>%
dplyr::filter(n == 1 & !.data[[hard_mask_column]]) %>%
dplyr::pull(.data$membership)
# Remove the clusters where picking is forbidden
cluster_df <- dplyr::filter(cluster_df, .data$membership %in% clusters_to_keep)
}
}
# Sort the spectra within each cluster using the provided metadata
if (!is.null(metadata_df) & !is_sorted) {
cluster_df <- cluster_df %>%
dplyr::group_by(.data$membership)
if (is_descending_order) {
cluster_df <- dplyr::arrange(
cluster_df, dplyr::desc(.data[[criteria_column]]),
.by_group = TRUE
)
} else {
cluster_df <- dplyr::arrange(
cluster_df, .data[[criteria_column]],
.by_group = TRUE
)
}
}
# Pick the reference spectra if no metadata are present
if (is.null(metadata_df)) {
cluster_df <- cluster_df %>%
dplyr::mutate(
"to_pick" = .data$is_reference
)
} else {
# first() gives the name of the spectra so needs for an
# extra step to convert to a logical vector
cluster_df <- cluster_df %>%
dplyr::mutate(
"to_pick" = dplyr::first(.data$name),
"to_pick" = .data$to_pick == .data$name
)
}
# Merge with original tibble to keep potential masked/discarded spectra
# and same row number and order
# NA in 'to_pick' are replaced with FALSE in the case of discarded clusters
cols <- base::intersect(
colnames(cluster_df),
colnames(orig_cluster_df)
)
cluster_df %>%
dplyr::ungroup() %>%
dplyr::right_join(orig_cluster_df, by = cols) %>%
dplyr::mutate(
"to_pick" = tidyr::replace_na(.data$to_pick, FALSE)
) %>%
# Sort the tibble in the original order
dplyr::arrange(.data$rowid) %>%
dplyr::select(-c("rowid")) %>%
return()
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.