Nothing
#' Clean proteomics data quantified with Olink's PEA technology
#'
#' @description
#' This function applies a series of cleaning steps to a data set exported by
#' Olink Software and imported in R by [`read_npx()`]. Some of the steps of this
#' function rely on results from [`check_npx()`].
#'
#' This function removes samples and assays that are not suitable for downstream
#' statistical analysis. Some of the data records that are removed include
#' duplicate sample identifiers, external controls samples, internal control
#' assays, and samples or assays with quality control flags.
#'
#' @details
#' The pipeline performs the following steps:
#'
#' 1. **Remove assays with invalid identifiers**: assays flagged as having
#' invalid identifiers from [`check_npx()`]. Occurs when the original data set
#' provided by Olink Software has been modified.
#' 2. **Remove assays with `NA` quantification values**: assays lacking
#' quantification data are reported with `NA` as quantification. These assays
#' are identified in [`check_npx()`].
#' 3. **Remove samples with duplicate identifiers**: samples with identical
#' identifiers detected by [`check_npx()`]. Instances of duplicate sample
#' identifiers cause errors in the downstream analysis of data with, and it is
#' highly discouraged.
#' 4. **Remove external control samples**:
#' - Uses column marking sample type (e.g. `SampleType`) to exclude external
#' control samples.
#' - Uses column marking sample identifier (e.g. `SampleID`) to remove
#' external control samples, or samples that ones wants to exclude from the
#' downstream analysis.
#' 5. **Remove samples failing quality control**: samples with QC status `FAIL`.
#' 6. **Remove internal control assays**: Uses column marking assay type (e.g.
#' `AssayType`) to exclude internal control assays.
#' 7. **Remove assays with quality controls warnings**: assays with QC status
#' `WARN`.
#' 8. **Correct column data type**: ensure that certain columns have the
#' expected data type (class). These columns are identified in [`check_npx()`].
#' 9. **Resolve multiple UniProt mappings per assay**: ensure that each assay
#' identifier (e.g., `OlinkID`) maps uniquely to a single UniProt ID.
#'
#' **Important:**
#'
#' - When data set lacks a column marking sample type (e.g. `SampleType`), one
#' should remove external control samples based on their sample identifiers.
#' This function does not auto-detect external control samples based on their
#' sample identifiers. *Please ensure external control samples have been*
#' *removed prior to downstream statistical analysis.*
#' - When data set lacks a column marking assay type (e.g. `AssayType`), one
#' should remove internal control assays manually. This function does not
#' auto-detect internal control assays. *Please ensure internal control assays*
#' *have been removed prior to downstream statistical analysis.*
#'
#' @author
#' Kang Dong
#' Klev Diamanti
#'
#' @inherit .downstream_fun_args params
#' @inherit .read_npx_args params return
#' @param remove_assay_na Logical. If `FALSE`, skips filtering assays with all
#' quantified values `NA`. Defaults to `TRUE`.
#' @param remove_invalid_oid Logical. If `FALSE`, skips filtering assays with
#' invalid identifiers. Defaults to `TRUE`.
#' @param remove_dup_sample_id Logical. If `FALSE`, skips filtering samples with
#' duplicate sample identifiers. Defaults to `TRUE`.
#' @param remove_control_sample If `FALSE`, all control samples are retained. If
#' `TRUE`, all control samples are removed. Alternatively, a character vector
#' with one or more of `r ansi_collapse_quot(x = names(olink_sample_types))`
#' indicating the sample types to remove.
#' @param remove_control_assay If `FALSE`, all internal control assays are
#' retained. If `TRUE`, all internal control assays are removed. Alternatively,
#' a character vector with one or more of
#' `r ansi_collapse_quot(x = names(olink_assay_types))` indicating the assay
#' types to remove.
#' @param remove_qc_warning Logical. If `FALSE`, retains samples flagged as
#' `FAIL` in QC warning. Defaults to `TRUE`.
#' @param remove_assay_warning Logical. If `FALSE`, retains assays flagged as
#' `WARN` in assay warning. Defaults to `TRUE`.
#' @param control_sample_ids character vector of sample identifiers of control
#' samples. Default `NULL`, to mark no samples to be removed.
#' @param convert_df_cols Logical. If `FALSE`, retains columns of `df` as are.
#' Defaults to `TRUE`, were columns required for downstream analysis are
#' converted to the expected format.
#' @param convert_nonunique_uniprot Logical. If `FALSE`, retains non-unique
#' OlinkID - UniProt mapping. Defaults to `TRUE`.
#' @param verbose Logical. If `FALSE` (default), silences step-wise messages.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # run check_npx
#' check_log <- check_npx(
#' df = npx_data1
#' )
#'
#' # run clean_npx
#' clean_npx(
#' df = npx_data1,
#' check_log = check_log
#' )
#'
#' # run clean_npx with messages for all steps
#' clean_npx(
#' df = npx_data1,
#' check_log = check_log,
#' verbose = TRUE
#' )
#' }
#'
clean_npx <- function(df,
check_log = NULL,
remove_assay_na = TRUE,
remove_invalid_oid = TRUE,
remove_dup_sample_id = TRUE,
remove_control_assay = TRUE,
remove_control_sample = TRUE,
remove_qc_warning = TRUE,
remove_assay_warning = TRUE,
control_sample_ids = NULL,
convert_df_cols = TRUE,
convert_nonunique_uniprot = TRUE,
out_df = "tibble",
verbose = FALSE) {
# Validate input dataset
check_is_dataset(x = df, error = TRUE)
check_is_scalar_boolean(x = verbose, error = TRUE)
check_log <- run_check_npx(df = df, check_log = check_log)
if (verbose) cli::cli_h2("Starting {.fn clean_npx} pipeline.")
# Clean invalid Olink IDs
if (verbose) cli::cli_h3("Removing assays with invalid identifiers.")
df <- clean_invalid_oid(
df = df,
check_log = check_log,
remove_invalid_oid = remove_invalid_oid,
verbose = verbose
)
# Clean assays with all NA values
if (verbose) cli::cli_h3("Removing assays missing all quantified values.")
df <- clean_assay_na(
df = df,
check_log = check_log,
remove_assay_na = remove_assay_na,
verbose = verbose
)
# Clean duplicate sample IDs
if (verbose) cli::cli_h3("Removing duplicated sample identifiers.")
df <- clean_duplicate_sample_id(
df = df,
check_log = check_log,
remove_dup_sample_id = remove_dup_sample_id,
verbose = verbose
)
# Clean control samples based on sample type
if (verbose) cli::cli_h3("Removing control samples based on sample type.")
df <- clean_sample_type(
df = df,
check_log = check_log,
remove_control_sample = remove_control_sample,
verbose = verbose
)
# Clean control samples based on Sample ID
if (verbose) cli::cli_h3("Removing samples based on sample identifiers.")
df <- clean_control_sample_id(
df = df,
check_log = check_log,
control_sample_ids = control_sample_ids,
verbose = verbose
)
# Clean Samples with QC Status 'FAIL'
if (verbose) cli::cli_h3("Removing samples with QC status 'FAIL'.")
df <- clean_qc_warning(
df = df,
check_log = check_log,
remove_qc_warning = remove_qc_warning,
verbose = verbose
)
# Clean internal control assays
if (verbose) cli::cli_h3("Removing internal control assays.")
df <- clean_assay_type(
df = df,
check_log = check_log,
remove_control_assay = remove_control_assay,
verbose = verbose
)
# Clean assays flagged by assay warning
if (verbose) cli::cli_h3("Removing assays flagged with assays warning.")
df <- clean_assay_warning(
df = df,
check_log = check_log,
remove_assay_warning = remove_assay_warning,
verbose = verbose
)
# Correct column class
if (verbose) cli::cli_h3("Converting data types of selected columns.")
df <- clean_col_class(
df = df,
check_log = check_log,
convert_df_cols = convert_df_cols,
verbose = verbose
)
# Correct non-unique Uniprot IDs
if (verbose) cli::cli_h3("Converting non-unique OlinkID - UniProt mapping.")
df <- clean_nonunique_uniprot(
df = df,
check_log = check_log,
convert_nonunique_uniprot = convert_nonunique_uniprot,
verbose = verbose
)
# Check for absolute quantification and apply log2 transformation
if (grepl(pattern = "quantified",
x = check_log$col_names$quant,
ignore.case = TRUE)) {
q_col <- check_log$col_names$quant # nolint: object_usage_linter
q_log2 <- paste0(check_log$col_names$quant, "_log2") # nolint: object_usage_linter
cli::cli_inform(
c(
"Detected data in absolute quantification in column
{.field {check_log$col_names$quant}}.",
"i" = "We recommend you apply a logarithmic transformation to the
data:",
"{.code df_log2 <- df |> dplyr::mutate({q_log2} = log2(x = {q_col}))}"
), wrap = TRUE
)
}
# Final output
if (verbose) {
cli::cli_h2("Completed {.fn clean_npx}. Returning clean dataset.")
}
return(
convert_read_npx_output(
df = df,
out_df = out_df
)
)
}
# Help Functions ----------------------------------------------------------
#' Quietly clean proteomics data quantified with Olink's PEA technology
#'
#' @description
#' Internal wrapper around [`clean_npx()`] that runs the same cleaning pipeline
#' while suppressing messages and warnings emitted during processing.
#'
#' Unlike [`clean_npx()`], this function is intended for internal use in cases
#' where a quiet cleaning step is needed. If rows are removed, a single
#' [`cli::cli_inform()`] message is shown. If no rows are removed, no output is
#' printed.
#'
#' @details
#' This function forwards all arguments to [`clean_npx()`] and forces
#' `verbose = FALSE`.
#'
#' If any rows are removed during cleaning, the function prints a single
#' informational message indicating how many entries were removed and
#' instructing the user to run [`clean_npx()`] directly to inspect which rows
#' were removed.
#'
#' @inheritParams clean_npx
#' @inherit clean_npx params return
#'
#' @author
#' Kang Dong
#' Klev Diamanti
#'
#' @keywords internal
#' @noRd
#'
run_clean_npx <- function(df, ...) {
dots <- list(...)
# remove user-supplied verbose if present
dots$verbose <- NULL
valid_args <- names(formals(clean_npx))
unknown_args <- setdiff(x = names(dots), y = valid_args)
if (length(unknown_args) > 0L) {
cli::cli_abort(
c(
"x" = "{cli::qty(unknown_args)} Unknown argument{?s}:
{.val {unknown_args}}.",
"i" = "Check the documentation of {.fn clean_npx} for valid arguments."
),
call = rlang::caller_env(),
wrap = TRUE
)
}
n_before <- nrow(df)
cleaned_df <- withCallingHandlers(
expr = do.call(clean_npx, c(list(df = df), dots, list(verbose = FALSE))),
message = function(m) invokeRestart("muffleMessage"),
warning = function(w) invokeRestart("muffleWarning")
)
n_after <- nrow(cleaned_df)
if (n_after != n_before) {
n_removed <- n_before - n_after # nolint: object_usage_linter
cli::cli_inform(
c("{.val {n_removed}} entr{?y/ies} removed by {.fn clean_npx} from the
input dataset {.arg df}. Run {.fn clean_npx} on your dataset with
{.arg verbose = TRUE} to inspect which rows were removed."),
wrap = FALSE
)
}
return(cleaned_df)
}
#' Help function removing assays with all quantified values `NA`.
#'
#' @description
#' This function filters out rows from a `tibble` or `arrow` object where the
#' assay identifier (one of
#' `r ansi_collapse_quot(x = column_name_dict$col_names$olink_id, sep = "or")`)
#' matches those listed in `check_log$assay_na`, which contains assays
#' composed entirely of `NA` values in their quantification column.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_assay_na <- function(df,
check_log,
remove_assay_na = TRUE,
verbose = FALSE) {
# input check
check_is_scalar_boolean(
x = remove_assay_na,
error = TRUE
)
# If assays with all NA values are retained by the user
if (remove_assay_na == FALSE) {
if (verbose == TRUE) {
cli::cli_inform(
c("Skipping exclusion of assays with all quantified values {.val NA} as
per user input: {.field remove_assay_na} = {.val {FALSE}}.",
"i" = "Returning original dataset.")
)
}
return(df)
}
# Id not assays with all values NA
if (length(check_log$assay_na) == 0L) {
if (verbose == TRUE) {
cli::cli_inform(
c("No assays with only {.val NA} values.",
"i" = "Returning original dataset.")
)
}
return(df)
}
# CLI message listing excluded assays
cli::cli_inform(
c(
"Excluding {.val {length(check_log$assay_na)}} assay{?s} with only
{.val NA} values: {.val {check_log$assay_na}}.",
"v" = "Returning cleaned dataset."
)
)
# Exclude assays with only NA values
df_cleaned <- df |>
dplyr::filter(
!(.data[[check_log$col_names$olink_id]] %in% check_log$assay_na)
)
# Convert output to desired format (tibble or arrow)
return(df_cleaned)
}
#' Help function removing assays with invalid identifiers.
#'
#' @description
#' This function filters out rows from a `tibble` or `arrow` object where the
#' assay identifier (one of
#' `r ansi_collapse_quot(x = column_name_dict$col_names$olink_id, sep = "or")`)
#' matches values listed in `check_log$oid_invalid`, which identifies
#' invalid or malformed assay identifiers.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_invalid_oid <- function(df,
check_log,
remove_invalid_oid = TRUE,
verbose = FALSE) {
# input check
check_is_scalar_boolean(
x = remove_invalid_oid,
error = TRUE
)
# Keep invalid assay identifiers if user indicates so
if (remove_invalid_oid == FALSE) {
if (verbose == TRUE) {
cli::cli_inform(
c("Skipping exclusion of assays with invalid identifiers as per user
input: {.field remove_invalid_oid} = {.val {FALSE}}.",
"i" = "Returning original dataset.")
)
}
return(df)
}
# Check if there are any invalid assay identifiers to remove
if (length(check_log$oid_invalid) == 0L) {
if (verbose == TRUE) {
cli::cli_inform(
c("No invalid assay identifiers.",
"i" = "Returning original dataset.")
)
}
return(df)
}
# Inform user of which assays will be excluded
cli::cli_inform(
c(
"Excluding {.val {length(check_log$oid_invalid)}} assay{?s} with invalid
identifier{?s}: {.val {check_log$oid_invalid}}.",
"v" = "Returning cleaned dataset."
)
)
# Remove rows where the OlinkID is invalid
df_cleaned <- df |>
dplyr::filter(
!(.data[[check_log$col_names$olink_id]] %in% check_log$oid_invalid)
)
# Return cleaned data frame in desired format
return(df_cleaned)
}
#' Help function removing samples with duplicate identifiers.
#'
#' @description
#' This function filters out rows from a `tibble` or `arrow` object where the
#' sample identifier (one of
#' `r ansi_collapse_quot(x = column_name_dict$col_names$sample_id, sep = "or")`)
#' matches values listed in `check_log$sample_id_dups`, which identifies
#' samples with duplicated identifiers.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_duplicate_sample_id <- function(df,
check_log,
remove_dup_sample_id = TRUE,
verbose = FALSE) {
# input check
check_is_scalar_boolean(
x = remove_dup_sample_id,
error = TRUE
)
# Retain samples with duplicate identifiers
if (remove_dup_sample_id == FALSE) {
if (verbose == TRUE) {
cli::cli_inform(
c("Skipping exclusion of samples with duplicate identifiers as per user
input: {.field remove_dup_sample_id} = {.val {FALSE}}.",
"i" = "Returning original dataset.")
)
}
return(df)
}
# Check if there are any samples with duplicate identifiers to remove
if (length(check_log$sample_id_dups) == 0L) {
if (verbose == TRUE) {
cli::cli_inform(
c("No duplicate sample identifiers.",
"i" = "Returning original dataset.")
)
}
return(df)
}
# Inform user about excluded SampleIDs
cli::cli_inform(
c(
"Excluding {.val {length(check_log$sample_id_dups)}} sample{?s} with
duplicate identifier{?s}: {.val {check_log$sample_id_dups}}.",
"v" = "Returning cleaned dataset."
)
)
# Filter out rows with duplicate SampleIDs
df_cleaned <- df |>
dplyr::filter(
!(.data[[check_log$col_names$sample_id]] %in% check_log$sample_id_dups)
)
# Convert and return the output in the desired format
return(df_cleaned)
}
#' Help function removing control samples based on sample type.
#'
#' @description
#' This function filters out rows from a dataset where the sample type column
#' matches known control sample types:
#' `r ansi_collapse_quot(x = unlist(olink_sample_types[names(olink_sample_types) != "sample"]), sep = "or")`. # nolint: line_length_linter
#' If `remove_control_sample` is set to `FALSE`, or if the sample type column is
#' not present in the `check_log`, the function returns the original data
#' unchanged.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_sample_type <- function(df,
check_log,
remove_control_sample = TRUE,
verbose = FALSE) {
# if remove_control_sample is boolean then we either remove all control
# samples (when TRUE), or we keep samples (when FALSE).
# When remove_control_sample is a character vector, then we remove all user
# designated controls.
if (check_is_scalar_boolean(x = remove_control_sample, error = FALSE)) {
if (remove_control_sample == TRUE) {
ctrl_sample_type <- olink_sample_types[
!(names(olink_sample_types) %in% c("sample"))
] |>
unlist() |>
unname()
} else {
ctrl_sample_type <- character(0L)
}
} else if (check_is_character(x = remove_control_sample, error = TRUE)) {
if (all(remove_control_sample %in% names(olink_sample_types))) {
ctrl_sample_type <- olink_sample_types[names(olink_sample_types)
%in% remove_control_sample] |>
unlist() |>
unname()
} else if (any(remove_control_sample %in% names(olink_sample_types))) {
ctrl_sample_type <- olink_sample_types[names(olink_sample_types)
%in% remove_control_sample] |>
unlist() |>
unname()
olink_sampless_compl <- setdiff(x = remove_control_sample, # nolint: object_usage_linter
y = names(olink_sample_types))
cli::cli_inform(
c("Unexpected entries {.val {olink_sampless_compl}} in
{.arg remove_control_sample}. Expected values:
{.val {names(olink_sample_types)}}.",
"i" = "Proceeding with entries: {.val {ctrl_sample_type}}.")
)
} else {
cli::cli_abort(
c(
"x" = "{cli::qty(remove_control_sample)} No overlap of value{?s} from
{.arg remove_control_sample} to expected values.",
"i" = "Ensure {.arg remove_control_sample} is a scalar boolean or
contains one or more of {.val {names(olink_sample_types)}}!"
),
call = rlang::caller_env(),
wrap = TRUE
)
}
}
# Return original data if user chooses to keep control samples
if (length(ctrl_sample_type) == 0L) {
if (verbose == TRUE) {
cli::cli_inform(
c("Skipping exclusion of control samples as per user input:
{.field remove_control_sample} = {.val {remove_control_sample}}.",
"i" = "Returning original dataset.")
)
}
return(df)
}
# Check if 'sample_type' column name is available
if (!("sample_type" %in% names(check_log$col_names))) {
cli::cli_inform(
c("No column marking control samples in dataset.",
"i" = "Ensure exclusion of control samples for downstream analysis!",
"i" = "Returning original dataset."
)
)
return(df)
}
# detect how many samples are to be removed
df_sid_stype <- df |>
dplyr::distinct(
.data[[check_log$col_names$sample_id]],
.data[[check_log$col_names$sample_type]]
) |>
dplyr::collect()
uniq_sample_type <- df_sid_stype |>
dplyr::pull(
.data[[check_log$col_names$sample_type]]
) |>
unique()
# message that we are excluding control samples
if (any(ctrl_sample_type %in% uniq_sample_type)) {
uniq_sid <- df_sid_stype |> # nolint: object_usage_linter
dplyr::filter(
.data[[check_log$col_names$sample_type]] %in% .env[["ctrl_sample_type"]]
) |>
dplyr::pull(
.data[[check_log$col_names$sample_id]]
)
cli::cli_inform(
c(
"Excluding {.val {length(uniq_sid)}} control sample{?s}:
{.val {uniq_sid}}.",
"v" = "Returning cleaned dataset."
)
)
}
# Filter out control samples
df_cleaned <- df |>
dplyr::filter(
!(.data[[check_log$col_names$sample_type]]
%in% .env[["ctrl_sample_type"]])
)
# Format and return output
return(df_cleaned)
}
#' Help function removing control assays based on assay type.
#'
#' @description
#' This function filters out internal control assays (`ext_ctrl`, `inc_ctrl`,
#' `amp_ctrl`) from the dataset, unless user specified to retain them. The
#' function uses column mapping provided by `check_log`.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_assay_type <- function(df,
check_log,
remove_control_assay = TRUE,
verbose = FALSE) {
# if remove_control_assay is boolean then we either remove all control assays
# (when TRUE), or we keep all assays (when FALSE).
# When remove_control_assay is a character vector, then we remove all user
# designated controls.
if (check_is_scalar_boolean(x = remove_control_assay, error = FALSE)) {
if (remove_control_assay == TRUE) {
ctrl_assay_type <- olink_assay_types[
!(names(olink_assay_types) %in% c("assay"))
] |>
unlist() |>
unname()
} else {
ctrl_assay_type <- character(0L)
}
} else if (check_is_character(x = remove_control_assay, error = TRUE)) {
if (all(remove_control_assay %in% names(olink_assay_types))) {
ctrl_assay_type <- olink_assay_types[names(olink_assay_types)
%in% remove_control_assay] |>
unlist() |>
unname()
} else if (any(remove_control_assay %in% names(olink_assay_types))) {
ctrl_assay_type <- olink_assay_types[names(olink_assay_types)
%in% remove_control_assay] |>
unlist() |>
unname()
olink_assays_compl <- setdiff(x = remove_control_assay, # nolint: object_usage_linter
y = names(olink_assay_types))
cli::cli_inform(
c("Unexpected entries {.val {olink_assays_compl}} in
{.arg remove_control_assay}. Expected values:
{.val {names(olink_assay_types)}}.",
"i" = "Proceeding with entries: {.val {ctrl_assay_type}}.")
)
} else {
cli::cli_abort(
c(
"x" = "{cli::qty(remove_control_assay)} No overlap of value{?s} from
{.arg remove_control_assay} to expected values.",
"i" = "Ensure {.arg remove_control_assay} is a scalar boolean or
contains one or more of {.val {names(olink_assay_types)}}!"
),
call = rlang::caller_env(),
wrap = TRUE
)
}
}
# Return original data if user chooses to keep control samples
if (length(ctrl_assay_type) == 0L) {
if (verbose == TRUE) {
cli::cli_inform(
c("Skipping exclusion of control assays as per user input:
{.field remove_control_assay} = {.val {remove_control_assay}}.",
"i" = "Returning original dataset.")
)
}
return(df)
}
# Check if 'assay_type' column name is available
if (!("assay_type" %in% names(check_log$col_names))) {
cli::cli_inform(
c("No column marking control assays in dataset.",
"i" = "Ensure exclusion of control assays for downstream analysis!",
"i" = "Returning original dataset."
)
)
return(df)
}
# detect how many samples are to be removed
df_oid_atype <- df |>
dplyr::distinct(
.data[[check_log$col_names$olink_id]],
.data[[check_log$col_names$assay_type]]
) |>
dplyr::collect()
uniq_atype <- df_oid_atype |>
dplyr::pull(
.data[[check_log$col_names$assay_type]]
) |>
unique()
# message that we are excluding control samples
if (any(ctrl_assay_type %in% uniq_atype)) {
uniq_oid <- df_oid_atype |> # nolint: object_usage_linter
dplyr::filter(
.data[[check_log$col_names$assay_type]] %in% .env[["ctrl_assay_type"]]
) |>
dplyr::pull(
.data[[check_log$col_names$olink_id]]
)
cli::cli_inform(
c(
"Excluding {.val {length(uniq_oid)}} control assay{?s}:
{.val {uniq_oid}}.",
"v" = "Returning cleaned dataset."
)
)
}
# Filter out control assays
df_cleaned <- df |>
dplyr::filter(
!(.data[[check_log$col_names$assay_type]] %in% .env[["ctrl_assay_type"]])
)
# Format and return output
return(df_cleaned)
}
#' Help function removing instances of samples that failed QC.
#'
#' @description
#' This function uses the column marking QC warnings identified by
#' `check_log` to remove samples flagged `FAIL` in the dataset.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_qc_warning <- function(df,
check_log,
remove_qc_warning = TRUE,
verbose = FALSE) {
# input check
check_is_scalar_boolean(
x = remove_qc_warning,
error = TRUE
)
if (remove_qc_warning == FALSE) {
if (verbose == TRUE) {
cli::cli_inform(
c("Skipping exclusion of samples flagged {.val {'FAIL'}} as per user
input {.field remove_qc_warning} = {.val {FALSE}}.",
"i" = "Returning original dataset.")
)
}
return(df)
}
if (nrow(dplyr::filter(.data = df,
grepl(pattern = "fail",
x = .data[[check_log$col_names$qc_warning]],
ignore.case = TRUE))) > 0L) {
df_fail_sample <- df |>
dplyr::filter(
grepl(
pattern = "fail",
x = .data[[check_log$col_names$qc_warning]],
ignore.case = TRUE
)
) |>
dplyr::collect()
fail_sample_n <- df_fail_sample |> # nolint: object_usage_linter
dplyr::pull(
.data[[check_log$col_names$sample_id]]
) |>
unique()
# Inform user about failed SampleIDs
cli::cli_inform(
c(
"Excluding {.val {nrow(df_fail_sample)}} datapoint{?s} from
{.val {length(fail_sample_n)}} sample{?s} flagged with
{.field {check_log$col_names$qc_warning}} = {.val {'FAIL'}}:
{.val {fail_sample_n}}.",
"v" = "Returning cleaned dataset."
)
)
df_cleaned <- df |>
dplyr::filter(
!grepl(
pattern = "fail",
x = .data[[check_log$col_names$qc_warning]],
ignore.case = TRUE
)
)
return(df_cleaned)
} else {
if (verbose == TRUE) {
cli::cli_inform(
c("No samples flagged with {.field {check_log$col_names$qc_warning}}
= {.val {'FAIL'}}.",
"i" = "Returning original dataset.")
)
}
return(df)
}
}
#' Help function removing instances of assays flagged with warnings.
#'
#' @description
#' The function is used to remove assay-level QC warnings from the dataset
#' before analysis. It uses the column marking assay QC warnings identified by
#' `check_log` to remove assays flagged as `WARN` in the dataset.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_assay_warning <- function(df,
check_log,
remove_assay_warning = TRUE,
verbose = FALSE) {
# input check
check_is_scalar_boolean(
x = remove_assay_warning,
error = TRUE
)
# retain assays marked with assay warning
if (remove_assay_warning == FALSE) {
if (verbose == TRUE) {
cli::cli_inform(
c("Skipping exclusion of assays flagged with {.val {'WARN'}} as per user
input {.field remove_assay_warning} = {.val {FALSE}}.",
"i" = "Returning original dataset.")
)
}
return(df)
}
# Check if assay_warn column name is defined
if (!("assay_warn" %in% names(check_log$col_names))) {
cli::cli_inform(
c("No column marking assay warnings in dataset.",
"i" = "Ensure assays with QC warnings are removed prior to downstream
analysis!",
"i" = "Returning original dataset."
)
)
return(df)
}
if (nrow(dplyr::filter(.data = df,
grepl(pattern = "warn",
x = .data[[check_log$col_names$assay_warn]],
ignore.case = TRUE))) > 0L) {
df_warn_assay <- df |>
dplyr::filter(
grepl(
pattern = "warn",
x = .data[[check_log$col_names$assay_warn]],
ignore.case = TRUE
)
) |>
dplyr::collect()
warn_assay_n <- df_warn_assay |> # nolint: object_usage_linter
dplyr::pull(
.data[[check_log$col_names$olink_id]]
) |>
unique()
# Filter out failed assays and return cleaned data
cli::cli_inform(
c(
"Excluding {.val {nrow(df_warn_assay)}} datapoint{?s} from
{.val {length(warn_assay_n)}} assay{?s} flagged with
{.field {check_log$col_names$assay_warn}} = {.val {'WARN'}} or
{.val {'Warning'}}: {.val {warn_assay_n}}.",
"v" = "Returning cleaned dataset."
)
)
df_cleaned <- df |>
dplyr::filter(
!grepl(
pattern = "warn",
x = .data[[check_log$col_names$assay_warn]],
ignore.case = TRUE
)
)
return(df_cleaned)
} else {
if (verbose == TRUE) {
cli::cli_inform(
c("No assays flagged with {.field {check_log$col_names$assay_warn}}
= {.val {'WARN'}} or {.val {'Warning'}}.",
"i" = "Returning original dataset.")
)
}
return(df)
}
}
#' Help function removing a set of control samples from the dataset.
#'
#' @description
#' This function removes rows from NPX data where the sample identifiers, as
#' defined in `check_log`, match samples provided in
#' \var{control_sample_ids}. Primary goal of the function is to serve for
#' filtering out technical replicates or control samples prior to downstream
#' analysis.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
#' @examples
#' \dontrun{
#' # use npx_data1 to check that clean_control_sample_id() works
#' log <- OlinkAnalyze::check_npx(
#' df = OlinkAnalyze::npx_data1
#' ) |>
#' suppressWarnings() |>
#' suppressMessages()
#'
#' out <- OlinkAnalyze:::clean_control_sample_id(
#' df = npx_data1,
#' check_npx_log = log,
#' control_sample_id = c("CONTROL_SAMPLE_AS 1", "CONTROL_SAMPLE_AS 2")
#' )
#' }
#'
clean_control_sample_id <- function(df,
check_log,
control_sample_ids = NULL,
verbose = FALSE) {
# Check if sample_id column exist in the data table
if (is.null(control_sample_ids)) {
if (verbose == TRUE) {
cli::cli_inform(
c("Skipping exclusion of control samples based on
{.arg control_sample_ids}.",
"i" = "Returning original dataset.")
)
}
return(df)
}
check_is_character(x = control_sample_ids, error = TRUE)
if (nrow(dplyr::filter(.data = df,
.data[[check_log$col_names$sample_id]]
%in% .env[["control_sample_ids"]])) > 0L) {
sid <- df |>
dplyr::distinct(
.data[[check_log$col_names$sample_id]]
) |>
dplyr::collect() |>
dplyr::pull(
.data[[check_log$col_names$sample_id]]
)
df_cleaned <- df |>
dplyr::filter(
!(.data[[check_log$col_names$sample_id]]
%in% .env[["control_sample_ids"]])
)
# Filter out control samples and return cleaned data
if (all(control_sample_ids %in% sid)) {
cli::cli_inform(
"Excluding sample{?s}: {.val {control_sample_ids}}.",
"v" = "Returning cleaned dataset."
)
} else {
ctrl_sid_shared <- intersect(x = control_sample_ids, y = sid) # nolint: object_usage_linter
ctrl_sid_setdiff <- setdiff(x = control_sample_ids, y = sid) # nolint: object_usage_linter
cli::cli_inform(
"{cli::qty(ctrl_sid_shared)} Excluding sample{?s}:
{.val {ctrl_sid_shared}}. {cli::qty(ctrl_sid_setdiff)}Sample{?s} not in
dataset: {.val {ctrl_sid_setdiff}}.",
"v" = "Returning cleaned dataset."
)
}
return(df_cleaned)
} else {
cli::cli_inform(
c("None of the sample identifiers in {.arg control_sample_ids} was present
in the dataset {.arg df}.",
"i" = "Returning original dataset.")
)
return(df)
}
}
#' Help function converting types of columns to the expected ones.
#'
#' @description
#' This function checks for mismatches between actual and expected column
#' classes in the input data frame and coerces those columns to the expected
#' class using information from `check_log$col_class`.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_col_class <- function(df,
check_log,
convert_df_cols = TRUE,
verbose = FALSE) {
# input check
check_is_scalar_boolean(
x = convert_df_cols,
error = TRUE
)
# check if user wants df columns to be converted
if (convert_df_cols == FALSE) {
if (verbose == TRUE) {
cli::cli_inform(
c("Skipping conversion of columns with non-expected format as per user
input {.field convert_df_cols} = {.val {FALSE}}.",
"i" = "Returning original dataset.")
)
}
return(df)
}
# Early return if no corrections needed
if (nrow(check_log$col_class) == 0L) {
if (verbose == TRUE) {
cli::cli_inform(
c("Columns are in the correct format.",
"i" = "Returning original dataset.")
)
}
return(df)
}
# convert columns
df_cleaned <- df |>
dplyr::mutate(
dplyr::across(
check_log$col_class |>
dplyr::filter(
.data[["expected_col_class"]] == "numeric"
) |>
dplyr::pull(
.data[["col_name"]]
),
~ suppressWarnings(as.numeric(.x))
)
) |>
dplyr::mutate(
dplyr::across(
check_log$col_class |>
dplyr::filter(
.data[["expected_col_class"]] == "character"
) |>
dplyr::pull(
.data[["col_name"]]
),
~ suppressWarnings(as.character(.x))
)
)
col_class_msg <- paste0(
"* \"", check_log$col_class$col_name, "\": ",
"from \"", check_log$col_class$col_class, "\" converted to ",
"\"", check_log$col_class$expected_col_class, "\"."
)
cli::cli_inform(
c("{cli::qty(col_class_msg)}Converted class{?es} of column{?s}:",
col_class_msg,
"v" = "Returning cleaned dataset.")
)
return(df_cleaned)
}
#' Help function unifying pairs of OlinkID and UniProt identifiers.
#'
#' @description
#' This function checks the non-unique "OlinkID - UniProt" mappings, as defined
#' in `check_log`. It selects the first instance of UniProt ID per OlinkID and
#' replaces the original UniProt column with the unified mapping.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_nonunique_uniprot <- function(df,
check_log,
convert_nonunique_uniprot = TRUE,
verbose = TRUE) {
# input check
check_is_scalar_boolean(
x = convert_nonunique_uniprot,
error = TRUE
)
# check if user wants non-unique uniprot IDs to be converted
if (convert_nonunique_uniprot == FALSE) {
if (verbose == TRUE) {
cli::cli_inform(
c(
"Skipping unification of non-unique
{.val {check_log$col_names$olink_id}} -
{.val {check_log$col_names$uniprot}} mappings as per user input
{.arg convert_nonunique_uniprot}.",
"i" = "Returning original dataset."
)
)
}
return(df)
}
# early return if no corrections needed
if (length(check_log$non_unique_uniprot) == 0L) {
if (verbose == TRUE) {
cli::cli_inform(c(
"Each {.val {check_log$col_names$olink_id}} maps to a unique
{.val {check_log$col_names$uniprot}} identifier.",
"i" = "Returning original dataset."
))
}
return(df)
} else {
# Map Olink ID - UniProt ID
oid_uniprot_map <- df |> # nolint: object_usage_linter
dplyr::filter(
.data[[check_log$col_names$olink_id]] %in% check_log$non_unique_uniprot
) |>
dplyr::distinct(
.data[[check_log$col_names$olink_id]],
.data[[check_log$col_names$uniprot]]
) |>
dplyr::collect() |>
dplyr::group_by(
dplyr::pick(
dplyr::all_of(check_log$col_names$olink_id)
)
) |>
dplyr::summarise(
uniprot_keep = utils::head(
x = .data[[check_log$col_names$uniprot]],
n = 1L
),
uniprot_extra = utils::tail(
x = .data[[check_log$col_names$uniprot]],
n = -1L
) |>
ansi_collapse_quot(),
.groups = "drop"
)
# Convert non-unique UniProt ID, and return all columns from input df
df_cleaned <- df |>
dplyr::left_join(
oid_uniprot_map,
by = check_log$col_names$olink_id
) |>
dplyr::mutate(
uniprot_keep = dplyr::if_else(
is.na(.data[["uniprot_keep"]]),
.data[[check_log$col_names$uniprot]],
.data[["uniprot_keep"]]
)
) |>
dplyr::select(
-dplyr::all_of(
c(check_log$col_names$uniprot, "uniprot_extra")
)
) |>
dplyr::rename_with(
.fn = ~ check_log$col_names$uniprot,
.cols = "uniprot_keep"
) |>
dplyr::select(
dplyr::all_of(
names(df)
)
)
cli::cli_inform(
c(
"{nrow(oid_uniprot_map)} assay identifier{?s} map multiple UniProt
identifiers. The first instance will be used for downstream analysis.",
paste0(
"* ", oid_uniprot_map$uniprot_extra, " will be replaced with ",
"\"", oid_uniprot_map$uniprot_keep, "\" for ",
"\"", oid_uniprot_map$OlinkID, "\"."
)
)
)
return(df_cleaned)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.