Nothing
#' Bridge selection function
#'
#' @description
#' The bridge selection function will select a number of bridge samples based
#' on the input data. It selects samples with good detection that pass QC
#' and cover a good range of the data. If possible, Olink recommends 8-16
#' bridge samples. When running the selector, Olink recommends starting at
#' sample_missing_freq = 0.10 which represents a maximum of 10\% data below LOD
#' per sample. If there are not enough samples output, increase to 20\%. \cr\cr
#' The function accepts NPX Excel files with data < LOD replaced.
#'
#' @details
#' `olink_bridgeselector()` is a synonym of `olink_bridge_selector()` .
#'
#' @param df Tibble/data frame in long format such as produced by the
#' Olink Analyze read_npx function.
#' @param sample_missing_freq The threshold for sample wise missingness.
#' @param n Number of bridge samples to be selected.
#' @param check_log A named list returned by [`check_npx()`]. If `NULL`,
#' [`check_npx()`] will be run internally using `df`.
#'
#' @return A "tibble" with sample IDs and mean NPX for a defined number of
#' bridging samples. Columns include:
#' \itemize{
#' \item{SampleID:} Sample ID
#' \item{PercAssaysBelowLOD:} Percent of Assays that are below LOD for
#' the sample
#' \item{MeanNPX:} Mean NPX for the sample
#' }
#'
#' @aliases
#' olink_bridgeselector
#'
#' @export
#'
#' @examples
#' \donttest{
#' check_log <- OlinkAnalyze::check_npx(df = npx_data1)
#'
#' bridge_samples <- OlinkAnalyze::olink_bridge_selector(
#' df = npx_data1,
#' sample_missing_freq = 0.1,
#' n = 20L,
#' check_log = check_log
#' )
#' }
#'
olink_bridge_selector <- function(df,
sample_missing_freq,
n,
check_log = NULL) {
# ---- STEP 0: Input checks --------------------------------------------------
## dataset ----
check_is_dataset(x = df, error = TRUE)
## sample_missing_freq ----
# Validate that sample_missing_freq is a proportion in [0, 1]
if (missing(sample_missing_freq) || is.na(sample_missing_freq)) {
cli::cli_abort(
"{.arg sample_missing_freq} is a required, non 'NA', argument!"
)
}
check_is_scalar_numeric(x = sample_missing_freq, error = TRUE)
if (!dplyr::between(x = sample_missing_freq, left = 0, right = 1)) {
cli::cli_abort(
"Please provide a value for {.arg sample_missing_freq} between 0 and 1!"
)
}
## n ----
if (missing(n) || is.na(n)) {
cli::cli_abort(
"{.arg n} is a required, non 'NA', argument!"
)
}
check_is_scalar_numeric(x = n, error = TRUE)
if ((n <= 0L) || (n %% 1L != 0L)) {
cli::cli_abort(
"Please provide a positive non-decimal value for {.arg n}!"
)
}
# ---- STEP 1: Remove invalid OlinkIDs & control samples ---------------------
check_log <- run_check_npx(df = df, check_log = check_log)
df_clean <- run_clean_npx(
df = df,
check_log = check_log,
remove_qc_warning = FALSE,
remove_assay_warning = FALSE,
verbose = FALSE
)
check_log_clean <- run_check_npx(df = df_clean, check_log = NULL) |>
suppressMessages() |>
suppressWarnings()
if (!("sample_type" %in% names(check_log_clean$col_names))) {
cli::cli_inform(
"No sample type column detected in the input dataset {.arg df}! Ensure
that control samples have been filtered out!"
)
}
# ---- STEP 2: Outlier metrics per (Panel, SampleID) -------------------------
qc_outliers <- df_clean |>
dplyr::group_by(
dplyr::across(
dplyr::all_of(
c(check_log_clean$col_names$panel,
check_log_clean$col_names$sample_id)
)
)
) |>
dplyr::summarise(
IQR = stats::IQR(
x = .data[[check_log_clean$col_names$quant]],
na.rm = TRUE
),
sample_median = stats::median(
x = .data[[check_log_clean$col_names$quant]],
na.rm = TRUE
),
.groups = "drop"
) |>
dplyr::group_by(
dplyr::across(
dplyr::all_of(check_log_clean$col_names$panel)
)
) |>
dplyr::mutate(
median_low = mean(.data[["sample_median"]], na.rm = TRUE) - 3 *
stats::sd(.data[["sample_median"]], na.rm = TRUE),
median_high = mean(.data[["sample_median"]], na.rm = TRUE) + 3 *
stats::sd(.data[["sample_median"]], na.rm = TRUE),
iqr_low = mean(.data[["IQR"]], na.rm = TRUE) - 3 *
stats::sd(.data[["IQR"]], na.rm = TRUE),
iqr_high = mean(.data[["IQR"]], na.rm = TRUE) + 3 *
stats::sd(.data[["IQR"]], na.rm = TRUE)
) |>
dplyr::ungroup() |>
dplyr::mutate(
Outlier = dplyr::if_else(
(.data[["sample_median"]] < .data[["median_high"]]) &
(.data[["sample_median"]] > .data[["median_low"]]) &
(.data[["IQR"]] > .data[["iqr_low"]]) &
(.data[["IQR"]] < .data[["iqr_high"]]),
0, 1
)
) |>
dplyr::select(
dplyr::all_of(
c(check_log_clean$col_names$sample_id,
check_log_clean$col_names$panel,
"Outlier")
)
)
# ---- STEP 3: Handle LOD variations ----------------------------------------
if (!("lod" %in% names(check_log_clean$col_names))) {
df_clean <- df_clean |>
dplyr::mutate(
LOD = -Inf
)
check_log_clean$col_names[["lod"]] <- "LOD"
cli::cli_inform(
"LOD not available, hence not filtering by LOD."
)
} else if (length(check_log_clean$col_names$lod) > 1L) {
check_log_clean$col_names$lod <- check_log_clean$col_names$lod |>
unique() |>
sort() |>
utils::head(n = 1L)
cli::cli_inform(
"Multiple LOD columns detected. Will be using
{.val {check_log_clean$col_names$lod}} as filter criteria."
)
}
# ---- STEP 4: Sample-level QC and filtering --------------------------------
df_ready <- df_clean |>
dplyr::left_join(
qc_outliers,
by = c(check_log_clean$col_names$sample_id,
check_log_clean$col_names$panel),
relationship = "many-to-one"
) |>
dplyr::mutate(
quant_na = dplyr::if_else(
.data[[check_log_clean$col_names$quant]] <=
.data[[check_log_clean$col_names$lod]],
NA_real_,
.data[[check_log_clean$col_names$quant]]
)
) |>
dplyr::group_by(
dplyr::across(
dplyr::all_of(check_log_clean$col_names$sample_id)
)
) |>
dplyr::mutate(
qc_warn = dplyr::if_else(
all(toupper(.data[[check_log_clean$col_names$qc_warning]]) == "PASS"),
"PASS",
"WARNING"
),
outliers = sum(.data[["Outlier"]], na.rm = TRUE),
PercAssaysBelowLOD = sum(is.na(.data[["quant_na"]])) / dplyr::n(),
MeanNPX = mean(x = .data[["quant_na"]], na.rm = TRUE)
) |>
dplyr::ungroup() |>
dplyr::filter(
.data[["qc_warn"]] == "PASS" &
.data[["outliers"]] == 0L &
.data[["PercAssaysBelowLOD"]] < .env[["sample_missing_freq"]]
) |>
dplyr::distinct(
.data[[check_log_clean$col_names$sample_id]],
.data[["PercAssaysBelowLOD"]],
.data[["MeanNPX"]]
) |>
dplyr::rename(
"SampleID" = !!check_log_clean$col_names$sample_id
)
# ---- STEP 5: Select evenly spread bridge samples ---------------------------
# if fewer available samples than required by user
if (nrow(df_ready) < n) {
cli::cli_abort(
c(
"x" = "Only {.val {nrow(df_ready)}} samples eligible. Increase
{.arg sample_missing_freq} and/or decrease {.arg n}."
),
call = rlang::caller_env(),
wrap = TRUE
)
}
# if available exactly as many samples as required by user
if (nrow(df_ready) == n) {
return(df_ready)
}
# if more available samples than required by user, select n samples that are
# evenly spread across the range of mean NPX values.
df_ready <- df_ready |>
dplyr::arrange(
dplyr::desc(
.data[["MeanNPX"]]
)
) |>
dplyr::mutate(
order = dplyr::row_number()
)
bridge_samples <- floor(
x = seq(
from = 1L,
to = nrow(df_ready),
length.out = n + 2L
)[c(-1L, -(n + 2L))]
)
selected_bridges <- df_ready |>
dplyr::filter(
.data[["order"]] %in% .env[["bridge_samples"]]
) |>
# random order
dplyr::slice_sample(
prop = 1
) |>
dplyr::select(
-dplyr::all_of("order")
)
return(selected_bridges)
}
#' @rdname olink_bridge_selector
#' @param ... Additional arguments. Currently only accepts `sampleMissingFreq`
#' for backward compatibility. Please use `sample_missing_freq` instead of
#' `sampleMissingFreq` in the future.
#' @export
olink_bridgeselector <- function(df, # nolint: object_name_linter
...,
n,
check_log = NULL) {
# Accept either spelling for sample_missing_freq for backward compatibility
dots <- list(...)
# Validate names in `...` to avoid silently ignoring unexpected arguments
dot_names <- names(dots)
if (!is.null(dot_names)) {
allowed_dot_args <- c("sample_missing_freq", "sampleMissingFreq")
unexpected_args <- setdiff(
x = setdiff(x = dot_names,
y = allowed_dot_args),
y = ""
)
if (length(unexpected_args) > 0L) {
cli::cli_warn(
"Unexpected argument name{?s} in `...` will be ignored:
{.val {unexpected_args}}!"
)
}
}
if (is.null(dots$sample_missing_freq)) {
sampleMissingFreq <- dots$sampleMissingFreq # nolint: object_name_linter
} else {
sampleMissingFreq <- dots$sample_missing_freq # nolint: object_name_linter
}
if (is.null(sampleMissingFreq)) {
cli::cli_abort(
c(
"x" = "Please provide a value for either {.arg sampleMissingFreq} or
{.arg sample_missing_freq}."
),
call = rlang::caller_env(),
wrap = TRUE
)
}
return(
olink_bridge_selector(
df = df,
sample_missing_freq = sampleMissingFreq,
n = n,
check_log = check_log
)
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.