R/clean_npx.R
In OlinkAnalyze: Facilitate Analysis of Proteomic Data from Olink

Documented in clean_npx

#' Clean proteomics data quantified with Olink's PEA technology
#'
#' @description
#' This function applies a series of cleaning steps to a data set exported by
#' Olink Software and imported in R by [`read_npx()`]. Some of the steps of this
#' function rely on results from [`check_npx()`].
#'
#' This function removes samples and assays that are not suitable for downstream
#' statistical analysis. Some of the data records that are removed include
#' duplicate sample identifiers, external controls samples, internal control
#' assays, and samples or assays with quality control flags.
#'
#' @details
#' The pipeline performs the following steps:
#'
#' 1. **Remove assays with invalid identifiers**: assays flagged as having
#' invalid identifiers from [`check_npx()`]. Occurs when the original data set
#' provided by Olink Software has been modified.
#' 2. **Remove assays with `NA` quantification values**: assays lacking
#' quantification data are reported with `NA` as quantification. These assays
#' are identified in [`check_npx()`].
#' 3. **Remove samples with duplicate identifiers**: samples with identical
#' identifiers detected by [`check_npx()`]. Instances of duplicate sample
#' identifiers cause errors in the downstream analysis of data with, and it is
#' highly discouraged.
#' 4. **Remove external control samples**:
#'    - Uses column marking sample type (e.g. `SampleType`) to exclude external
#'    control samples.
#'    - Uses column marking sample identifier (e.g. `SampleID`) to remove
#'    external control samples, or samples that ones wants to exclude from the
#'    downstream analysis.
#' 5. **Remove samples failing quality control**: samples with QC status `FAIL`.
#' 6. **Remove internal control assays**: Uses column marking assay type (e.g.
#' `AssayType`) to exclude internal control assays.
#' 7. **Remove assays with quality controls warnings**: assays with QC status
#' `WARN`.
#' 8. **Correct column data type**: ensure that certain columns have the
#' expected data type (class). These columns are identified in [`check_npx()`].
#' 9. **Resolve multiple UniProt mappings per assay**: ensure that each assay
#' identifier (e.g., `OlinkID`) maps uniquely to a single UniProt ID.
#'
#' **Important:**
#'
#' - When data set lacks a column marking sample type (e.g. `SampleType`), one
#' should remove external control samples based on their sample identifiers.
#' This function does not auto-detect external control samples based on their
#' sample identifiers. *Please ensure external control samples have been*
#' *removed prior to downstream statistical analysis.*
#' - When data set lacks a column marking assay type (e.g. `AssayType`), one
#' should remove internal control assays manually. This function does not
#' auto-detect internal control assays. *Please ensure internal control assays*
#' *have been removed prior to downstream statistical analysis.*
#'
#' @author
#'   Kang Dong
#'   Klev Diamanti
#'
#' @inherit .downstream_fun_args params
#' @inherit .read_npx_args params return
#' @param remove_assay_na Logical. If `FALSE`, skips filtering assays with all
#' quantified values `NA`. Defaults to `TRUE`.
#' @param remove_invalid_oid Logical. If `FALSE`, skips filtering assays with
#' invalid identifiers. Defaults to `TRUE`.
#' @param remove_dup_sample_id Logical. If `FALSE`, skips filtering samples with
#' duplicate sample identifiers. Defaults to `TRUE`.
#' @param remove_control_sample If `FALSE`, all control samples are retained. If
#' `TRUE`, all control samples are removed. Alternatively, a character vector
#' with one or more of `r ansi_collapse_quot(x = names(olink_sample_types))`
#' indicating the sample types to remove.
#' @param remove_control_assay If `FALSE`, all internal control assays are
#' retained. If `TRUE`, all internal control assays are removed. Alternatively,
#' a character vector with one or more of
#' `r ansi_collapse_quot(x = names(olink_assay_types))` indicating the assay
#' types to remove.
#' @param remove_qc_warning Logical. If `FALSE`, retains samples flagged as
#' `FAIL` in QC warning. Defaults to `TRUE`.
#' @param remove_assay_warning Logical. If `FALSE`, retains assays flagged as
#' `WARN` in assay warning. Defaults to `TRUE`.
#' @param control_sample_ids character vector of sample identifiers of control
#' samples. Default `NULL`, to mark no samples to be removed.
#' @param convert_df_cols Logical. If `FALSE`, retains columns of `df` as are.
#' Defaults to `TRUE`, were columns required for downstream analysis are
#' converted to the expected format.
#' @param convert_nonunique_uniprot Logical. If `FALSE`, retains non-unique
#' OlinkID - UniProt mapping. Defaults to `TRUE`.
#' @param verbose Logical. If `FALSE` (default), silences step-wise messages.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # run check_npx
#' check_log <- check_npx(
#'   df = npx_data1
#' )
#'
#' # run clean_npx
#' clean_npx(
#'   df = npx_data1,
#'   check_log = check_log
#' )
#'
#' # run clean_npx with messages for all steps
#' clean_npx(
#'   df = npx_data1,
#'   check_log = check_log,
#'   verbose = TRUE
#' )
#' }
#'
clean_npx <- function(df,
                      check_log = NULL,
                      remove_assay_na = TRUE,
                      remove_invalid_oid = TRUE,
                      remove_dup_sample_id = TRUE,
                      remove_control_assay = TRUE,
                      remove_control_sample = TRUE,
                      remove_qc_warning = TRUE,
                      remove_assay_warning = TRUE,
                      control_sample_ids = NULL,
                      convert_df_cols = TRUE,
                      convert_nonunique_uniprot = TRUE,
                      out_df = "tibble",
                      verbose = FALSE) {

  # Validate input dataset
  check_is_dataset(x = df, error = TRUE)
  check_is_scalar_boolean(x = verbose, error = TRUE)
  check_log <- run_check_npx(df = df, check_log = check_log)

  if (verbose) cli::cli_h2("Starting {.fn clean_npx} pipeline.")

  # Clean invalid Olink IDs
  if (verbose) cli::cli_h3("Removing assays with invalid identifiers.")
  df <- clean_invalid_oid(
    df = df,
    check_log = check_log,
    remove_invalid_oid = remove_invalid_oid,
    verbose = verbose
  )

  # Clean assays with all NA values
  if (verbose) cli::cli_h3("Removing assays missing all quantified values.")
  df <- clean_assay_na(
    df = df,
    check_log = check_log,
    remove_assay_na = remove_assay_na,
    verbose = verbose
  )

  # Clean duplicate sample IDs
  if (verbose) cli::cli_h3("Removing duplicated sample identifiers.")
  df <- clean_duplicate_sample_id(
    df = df,
    check_log = check_log,
    remove_dup_sample_id = remove_dup_sample_id,
    verbose = verbose
  )

  # Clean control samples based on sample type
  if (verbose) cli::cli_h3("Removing control samples based on sample type.")
  df <- clean_sample_type(
    df = df,
    check_log = check_log,
    remove_control_sample = remove_control_sample,
    verbose = verbose
  )

  # Clean control samples based on Sample ID
  if (verbose) cli::cli_h3("Removing samples based on sample identifiers.")
  df <- clean_control_sample_id(
    df = df,
    check_log = check_log,
    control_sample_ids = control_sample_ids,
    verbose = verbose
  )

  # Clean Samples with QC Status 'FAIL'
  if (verbose) cli::cli_h3("Removing samples with QC status 'FAIL'.")
  df <- clean_qc_warning(
    df = df,
    check_log = check_log,
    remove_qc_warning = remove_qc_warning,
    verbose = verbose
  )

  # Clean internal control assays
  if (verbose) cli::cli_h3("Removing internal control assays.")
  df <- clean_assay_type(
    df = df,
    check_log = check_log,
    remove_control_assay = remove_control_assay,
    verbose = verbose
  )

  # Clean assays flagged by assay warning
  if (verbose) cli::cli_h3("Removing assays flagged with assays warning.")
  df <- clean_assay_warning(
    df = df,
    check_log = check_log,
    remove_assay_warning = remove_assay_warning,
    verbose = verbose
  )

  # Correct column class
  if (verbose) cli::cli_h3("Converting data types of selected columns.")
  df <- clean_col_class(
    df = df,
    check_log = check_log,
    convert_df_cols = convert_df_cols,
    verbose = verbose
  )

  # Correct non-unique Uniprot IDs
  if (verbose) cli::cli_h3("Converting non-unique OlinkID - UniProt mapping.")
  df <- clean_nonunique_uniprot(
    df = df,
    check_log = check_log,
    convert_nonunique_uniprot = convert_nonunique_uniprot,
    verbose = verbose
  )

  # Check for absolute quantification and apply log2 transformation
  if (grepl(pattern = "quantified",
            x = check_log$col_names$quant,
            ignore.case = TRUE)) {
    q_col <- check_log$col_names$quant # nolint: object_usage_linter
    q_log2 <- paste0(check_log$col_names$quant, "_log2") # nolint: object_usage_linter
    cli::cli_inform(
      c(
        "Detected data in absolute quantification in column
        {.field {check_log$col_names$quant}}.",
        "i" = "We recommend you apply a logarithmic transformation to the
        data:",
        "{.code df_log2 <- df |> dplyr::mutate({q_log2} = log2(x = {q_col}))}"
      ), wrap = TRUE
    )
  }

  # Final output
  if (verbose) {
    cli::cli_h2("Completed {.fn clean_npx}. Returning clean dataset.")
  }

  return(
    convert_read_npx_output(
      df = df,
      out_df = out_df
    )
  )
}

# Help Functions ----------------------------------------------------------

#' Quietly clean proteomics data quantified with Olink's PEA technology
#'
#' @description
#' Internal wrapper around [`clean_npx()`] that runs the same cleaning pipeline
#' while suppressing messages and warnings emitted during processing.
#'
#' Unlike [`clean_npx()`], this function is intended for internal use in cases
#' where a quiet cleaning step is needed. If rows are removed, a single
#' [`cli::cli_inform()`] message is shown. If no rows are removed, no output is
#' printed.
#'
#' @details
#' This function forwards all arguments to [`clean_npx()`] and forces
#' `verbose = FALSE`.
#'
#' If any rows are removed during cleaning, the function prints a single
#' informational message indicating how many entries were removed and
#' instructing the user to run [`clean_npx()`] directly to inspect which rows
#' were removed.
#'
#' @inheritParams clean_npx
#' @inherit clean_npx params return
#'
#' @author
#'   Kang Dong
#'   Klev Diamanti
#'
#' @keywords internal
#' @noRd
#'
run_clean_npx <- function(df, ...) {

  dots <- list(...)

  # remove user-supplied verbose if present
  dots$verbose <- NULL

  valid_args <- names(formals(clean_npx))
  unknown_args <- setdiff(x = names(dots), y = valid_args)

  if (length(unknown_args) > 0L) {
    cli::cli_abort(
      c(
        "x" = "{cli::qty(unknown_args)} Unknown argument{?s}:
        {.val {unknown_args}}.",
        "i" = "Check the documentation of {.fn clean_npx} for valid arguments."
      ),
      call = rlang::caller_env(),
      wrap = TRUE
    )
  }

  n_before <- nrow(df)

  cleaned_df <- withCallingHandlers(
    expr = do.call(clean_npx, c(list(df = df), dots, list(verbose = FALSE))),
    message = function(m) invokeRestart("muffleMessage"),
    warning = function(w) invokeRestart("muffleWarning")
  )

  n_after <- nrow(cleaned_df)

  if (n_after != n_before) {
    n_removed <- n_before - n_after # nolint: object_usage_linter
    cli::cli_inform(
      c("{.val {n_removed}} entr{?y/ies} removed by {.fn clean_npx} from the
      input dataset {.arg df}. Run {.fn clean_npx} on your dataset with
      {.arg verbose = TRUE} to inspect which rows were removed."),
      wrap = FALSE
    )
  }

  return(cleaned_df)
}

#' Help function removing assays with all quantified values `NA`.
#'
#' @description
#' This function filters out rows from a `tibble` or `arrow` object where the
#' assay identifier (one of
#' `r ansi_collapse_quot(x = column_name_dict$col_names$olink_id, sep = "or")`)
#' matches those listed in `check_log$assay_na`, which contains assays
#' composed entirely of `NA` values in their quantification column.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_assay_na <- function(df,
                           check_log,
                           remove_assay_na = TRUE,
                           verbose = FALSE) {
  # input check
  check_is_scalar_boolean(
    x = remove_assay_na,
    error = TRUE
  )

  # If assays with all NA values are retained by the user
  if (remove_assay_na == FALSE) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("Skipping exclusion of assays with all quantified values {.val NA} as
        per user input: {.field remove_assay_na} = {.val {FALSE}}.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  # Id not assays with all values NA
  if (length(check_log$assay_na) == 0L) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("No assays with only {.val NA} values.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  # CLI message listing excluded assays
  cli::cli_inform(
    c(
      "Excluding {.val {length(check_log$assay_na)}} assay{?s} with only
      {.val NA} values: {.val {check_log$assay_na}}.",
      "v" = "Returning cleaned dataset."
    )
  )

  # Exclude assays with only NA values
  df_cleaned <- df |>
    dplyr::filter(
      !(.data[[check_log$col_names$olink_id]] %in% check_log$assay_na)
    )

  # Convert output to desired format (tibble or arrow)
  return(df_cleaned)
}

#' Help function removing assays with invalid identifiers.
#'
#' @description
#' This function filters out rows from a `tibble` or `arrow` object where the
#' assay identifier (one of
#' `r ansi_collapse_quot(x = column_name_dict$col_names$olink_id, sep = "or")`)
#' matches values listed in `check_log$oid_invalid`, which identifies
#' invalid or malformed assay identifiers.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_invalid_oid <- function(df,
                              check_log,
                              remove_invalid_oid = TRUE,
                              verbose = FALSE) {
  # input check
  check_is_scalar_boolean(
    x = remove_invalid_oid,
    error = TRUE
  )

  # Keep invalid assay identifiers if user indicates so
  if (remove_invalid_oid == FALSE) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("Skipping exclusion of assays with invalid identifiers as per user
          input: {.field remove_invalid_oid} = {.val {FALSE}}.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  # Check if there are any invalid assay identifiers to remove
  if (length(check_log$oid_invalid) == 0L) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("No invalid assay identifiers.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  # Inform user of which assays will be excluded
  cli::cli_inform(
    c(
      "Excluding {.val {length(check_log$oid_invalid)}} assay{?s} with invalid
      identifier{?s}: {.val {check_log$oid_invalid}}.",
      "v" = "Returning cleaned dataset."
    )
  )

  # Remove rows where the OlinkID is invalid
  df_cleaned <- df |>
    dplyr::filter(
      !(.data[[check_log$col_names$olink_id]] %in% check_log$oid_invalid)
    )

  # Return cleaned data frame in desired format
  return(df_cleaned)
}

#' Help function removing samples with duplicate identifiers.
#'
#' @description
#' This function filters out rows from a `tibble` or `arrow` object where the
#' sample identifier (one of
#' `r ansi_collapse_quot(x = column_name_dict$col_names$sample_id, sep = "or")`)
#' matches values listed in `check_log$sample_id_dups`, which identifies
#' samples with duplicated identifiers.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_duplicate_sample_id <- function(df,
                                      check_log,
                                      remove_dup_sample_id = TRUE,
                                      verbose = FALSE) {
  # input check
  check_is_scalar_boolean(
    x = remove_dup_sample_id,
    error = TRUE
  )

  # Retain samples with duplicate identifiers
  if (remove_dup_sample_id == FALSE) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("Skipping exclusion of samples with duplicate identifiers as per user
          input: {.field remove_dup_sample_id} = {.val {FALSE}}.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  # Check if there are any samples with duplicate identifiers to remove
  if (length(check_log$sample_id_dups) == 0L) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("No duplicate sample identifiers.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  # Inform user about excluded SampleIDs
  cli::cli_inform(
    c(
      "Excluding {.val {length(check_log$sample_id_dups)}} sample{?s} with
      duplicate identifier{?s}: {.val {check_log$sample_id_dups}}.",
      "v" = "Returning cleaned dataset."
    )
  )

  # Filter out rows with duplicate SampleIDs
  df_cleaned <- df |>
    dplyr::filter(
      !(.data[[check_log$col_names$sample_id]] %in% check_log$sample_id_dups)
    )

  # Convert and return the output in the desired format
  return(df_cleaned)
}

#' Help function removing control samples based on sample type.
#'
#' @description
#' This function filters out rows from a dataset where the sample type column
#' matches known control sample types:
#' `r ansi_collapse_quot(x = unlist(olink_sample_types[names(olink_sample_types) != "sample"]), sep = "or")`. # nolint: line_length_linter
#' If `remove_control_sample` is set to `FALSE`, or if the sample type column is
#' not present in the `check_log`, the function returns the original data
#' unchanged.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_sample_type <- function(df,
                              check_log,
                              remove_control_sample = TRUE,
                              verbose = FALSE) {
  # if remove_control_sample is boolean then we either remove all control
  # samples (when TRUE), or we keep samples (when FALSE).
  # When remove_control_sample is a character vector, then we remove all user
  # designated controls.
  if (check_is_scalar_boolean(x = remove_control_sample, error = FALSE)) {

    if (remove_control_sample == TRUE) {
      ctrl_sample_type <- olink_sample_types[
        !(names(olink_sample_types) %in% c("sample"))
      ] |>
        unlist() |>
        unname()
    } else {
      ctrl_sample_type <- character(0L)
    }

  } else if (check_is_character(x = remove_control_sample, error = TRUE)) {

    if (all(remove_control_sample %in% names(olink_sample_types))) {
      ctrl_sample_type <- olink_sample_types[names(olink_sample_types)
                                             %in% remove_control_sample] |>
        unlist() |>
        unname()
    } else if (any(remove_control_sample %in% names(olink_sample_types))) {
      ctrl_sample_type <- olink_sample_types[names(olink_sample_types)
                                             %in% remove_control_sample] |>
        unlist() |>
        unname()
      olink_sampless_compl <- setdiff(x = remove_control_sample, # nolint: object_usage_linter
                                      y = names(olink_sample_types))

      cli::cli_inform(
        c("Unexpected entries {.val {olink_sampless_compl}} in
          {.arg remove_control_sample}. Expected values:
          {.val {names(olink_sample_types)}}.",
          "i" = "Proceeding with entries: {.val {ctrl_sample_type}}.")
      )
    } else {
      cli::cli_abort(
        c(
          "x" = "{cli::qty(remove_control_sample)} No overlap of value{?s} from
          {.arg remove_control_sample} to expected values.",
          "i" = "Ensure {.arg remove_control_sample} is a scalar boolean or
          contains one or more of {.val {names(olink_sample_types)}}!"
        ),
        call = rlang::caller_env(),
        wrap = TRUE
      )
    }

  }

  # Return original data if user chooses to keep control samples
  if (length(ctrl_sample_type) == 0L) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("Skipping exclusion of control samples as per user input:
          {.field remove_control_sample} = {.val {remove_control_sample}}.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  # Check if 'sample_type' column name is available
  if (!("sample_type" %in% names(check_log$col_names))) {
    cli::cli_inform(
      c("No column marking control samples in dataset.",
        "i" = "Ensure exclusion of control samples for downstream analysis!",
        "i" = "Returning original dataset."
      )
    )
    return(df)
  }

  # detect how many samples are to be removed
  df_sid_stype <- df |>
    dplyr::distinct(
      .data[[check_log$col_names$sample_id]],
      .data[[check_log$col_names$sample_type]]
    ) |>
    dplyr::collect()
  uniq_sample_type <- df_sid_stype |>
    dplyr::pull(
      .data[[check_log$col_names$sample_type]]
    ) |>
    unique()

  # message that we are excluding control samples
  if (any(ctrl_sample_type %in% uniq_sample_type)) {
    uniq_sid <- df_sid_stype |> # nolint: object_usage_linter
      dplyr::filter(
        .data[[check_log$col_names$sample_type]] %in% .env[["ctrl_sample_type"]]
      ) |>
      dplyr::pull(
        .data[[check_log$col_names$sample_id]]
      )

    cli::cli_inform(
      c(
        "Excluding {.val {length(uniq_sid)}} control sample{?s}:
        {.val {uniq_sid}}.",
        "v" = "Returning cleaned dataset."
      )
    )
  }

  # Filter out control samples
  df_cleaned <- df |>
    dplyr::filter(
      !(.data[[check_log$col_names$sample_type]]
        %in% .env[["ctrl_sample_type"]])
    )

  # Format and return output
  return(df_cleaned)
}

#' Help function removing control assays based on assay type.
#'
#' @description
#' This function filters out internal control assays (`ext_ctrl`, `inc_ctrl`,
#' `amp_ctrl`) from the dataset, unless user specified to retain them. The
#' function uses column mapping provided by `check_log`.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_assay_type <- function(df,
                             check_log,
                             remove_control_assay = TRUE,
                             verbose = FALSE) {
  # if remove_control_assay is boolean then we either remove all control assays
  # (when TRUE), or we keep all assays (when FALSE).
  # When remove_control_assay is a character vector, then we remove all user
  # designated controls.
  if (check_is_scalar_boolean(x = remove_control_assay, error = FALSE)) {

    if (remove_control_assay == TRUE) {
      ctrl_assay_type <- olink_assay_types[
        !(names(olink_assay_types) %in% c("assay"))
      ] |>
        unlist() |>
        unname()
    } else {
      ctrl_assay_type <- character(0L)
    }

  } else if (check_is_character(x = remove_control_assay, error = TRUE)) {

    if (all(remove_control_assay %in% names(olink_assay_types))) {
      ctrl_assay_type <- olink_assay_types[names(olink_assay_types)
                                           %in% remove_control_assay] |>
        unlist() |>
        unname()
    } else if (any(remove_control_assay %in% names(olink_assay_types))) {
      ctrl_assay_type <- olink_assay_types[names(olink_assay_types)
                                           %in% remove_control_assay] |>
        unlist() |>
        unname()
      olink_assays_compl <- setdiff(x = remove_control_assay, # nolint: object_usage_linter
                                    y = names(olink_assay_types))

      cli::cli_inform(
        c("Unexpected entries {.val {olink_assays_compl}} in
          {.arg remove_control_assay}. Expected values:
          {.val {names(olink_assay_types)}}.",
          "i" = "Proceeding with entries: {.val {ctrl_assay_type}}.")
      )
    } else {
      cli::cli_abort(
        c(
          "x" = "{cli::qty(remove_control_assay)} No overlap of value{?s} from
          {.arg remove_control_assay} to expected values.",
          "i" = "Ensure {.arg remove_control_assay} is a scalar boolean or
          contains one or more of {.val {names(olink_assay_types)}}!"
        ),
        call = rlang::caller_env(),
        wrap = TRUE
      )
    }

  }

  # Return original data if user chooses to keep control samples
  if (length(ctrl_assay_type) == 0L) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("Skipping exclusion of control assays as per user input:
          {.field remove_control_assay} = {.val {remove_control_assay}}.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  # Check if 'assay_type' column name is available
  if (!("assay_type" %in% names(check_log$col_names))) {
    cli::cli_inform(
      c("No column marking control assays in dataset.",
        "i" = "Ensure exclusion of control assays for downstream analysis!",
        "i" = "Returning original dataset."
      )
    )
    return(df)
  }

  # detect how many samples are to be removed
  df_oid_atype <- df |>
    dplyr::distinct(
      .data[[check_log$col_names$olink_id]],
      .data[[check_log$col_names$assay_type]]
    ) |>
    dplyr::collect()
  uniq_atype <- df_oid_atype |>
    dplyr::pull(
      .data[[check_log$col_names$assay_type]]
    ) |>
    unique()

  # message that we are excluding control samples
  if (any(ctrl_assay_type %in% uniq_atype)) {
    uniq_oid <- df_oid_atype |> # nolint: object_usage_linter
      dplyr::filter(
        .data[[check_log$col_names$assay_type]] %in% .env[["ctrl_assay_type"]]
      ) |>
      dplyr::pull(
        .data[[check_log$col_names$olink_id]]
      )

    cli::cli_inform(
      c(
        "Excluding {.val {length(uniq_oid)}} control assay{?s}:
        {.val {uniq_oid}}.",
        "v" = "Returning cleaned dataset."
      )
    )
  }

  # Filter out control assays
  df_cleaned <- df |>
    dplyr::filter(
      !(.data[[check_log$col_names$assay_type]] %in% .env[["ctrl_assay_type"]])
    )

  # Format and return output
  return(df_cleaned)
}

#' Help function removing instances of samples that failed QC.
#'
#' @description
#' This function uses the column marking QC warnings identified by
#' `check_log` to remove samples flagged `FAIL` in the dataset.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_qc_warning <- function(df,
                             check_log,
                             remove_qc_warning = TRUE,
                             verbose = FALSE) {
  # input check
  check_is_scalar_boolean(
    x = remove_qc_warning,
    error = TRUE
  )

  if (remove_qc_warning == FALSE) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("Skipping exclusion of samples flagged {.val {'FAIL'}} as per user
          input {.field remove_qc_warning} = {.val {FALSE}}.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  if (nrow(dplyr::filter(.data = df,
                         grepl(pattern = "fail",
                               x = .data[[check_log$col_names$qc_warning]],
                               ignore.case = TRUE))) > 0L) {

    df_fail_sample <- df |>
      dplyr::filter(
        grepl(
          pattern = "fail",
          x = .data[[check_log$col_names$qc_warning]],
          ignore.case = TRUE
        )
      ) |>
      dplyr::collect()

    fail_sample_n <- df_fail_sample |> # nolint: object_usage_linter
      dplyr::pull(
        .data[[check_log$col_names$sample_id]]
      ) |>
      unique()

    # Inform user about failed SampleIDs
    cli::cli_inform(
      c(
        "Excluding {.val {nrow(df_fail_sample)}} datapoint{?s} from
        {.val {length(fail_sample_n)}} sample{?s} flagged with
        {.field {check_log$col_names$qc_warning}} = {.val {'FAIL'}}:
        {.val {fail_sample_n}}.",
        "v" = "Returning cleaned dataset."
      )
    )

    df_cleaned <- df |>
      dplyr::filter(
        !grepl(
          pattern = "fail",
          x = .data[[check_log$col_names$qc_warning]],
          ignore.case = TRUE
        )
      )

    return(df_cleaned)

  } else {

    if (verbose == TRUE) {
      cli::cli_inform(
        c("No samples flagged with {.field {check_log$col_names$qc_warning}}
          = {.val {'FAIL'}}.",
          "i" = "Returning original dataset.")
      )
    }

    return(df)

  }
}

#' Help function removing instances of assays flagged with warnings.
#'
#' @description
#' The function is used to remove assay-level QC warnings from the dataset
#' before analysis. It uses the column marking assay QC warnings identified by
#' `check_log` to remove assays flagged as `WARN` in the dataset.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_assay_warning <- function(df,
                                check_log,
                                remove_assay_warning = TRUE,
                                verbose = FALSE) {
  # input check
  check_is_scalar_boolean(
    x = remove_assay_warning,
    error = TRUE
  )

  # retain assays marked with assay warning
  if (remove_assay_warning == FALSE) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("Skipping exclusion of assays flagged with {.val {'WARN'}} as per user
          input {.field remove_assay_warning} = {.val {FALSE}}.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  # Check if assay_warn column name is defined
  if (!("assay_warn" %in% names(check_log$col_names))) {
    cli::cli_inform(
      c("No column marking assay warnings in dataset.",
        "i" = "Ensure assays with QC warnings are removed prior to downstream
          analysis!",
        "i" = "Returning original dataset."
      )
    )
    return(df)
  }

  if (nrow(dplyr::filter(.data = df,
                         grepl(pattern = "warn",
                               x = .data[[check_log$col_names$assay_warn]],
                               ignore.case = TRUE))) > 0L) {

    df_warn_assay <- df |>
      dplyr::filter(
        grepl(
          pattern = "warn",
          x = .data[[check_log$col_names$assay_warn]],
          ignore.case = TRUE
        )
      ) |>
      dplyr::collect()

    warn_assay_n <- df_warn_assay |> # nolint: object_usage_linter
      dplyr::pull(
        .data[[check_log$col_names$olink_id]]
      ) |>
      unique()

    # Filter out failed assays and return cleaned data
    cli::cli_inform(
      c(
        "Excluding {.val {nrow(df_warn_assay)}} datapoint{?s} from
        {.val {length(warn_assay_n)}} assay{?s} flagged with
        {.field {check_log$col_names$assay_warn}} = {.val {'WARN'}} or
        {.val {'Warning'}}: {.val {warn_assay_n}}.",
        "v" = "Returning cleaned dataset."
      )
    )

    df_cleaned <- df |>
      dplyr::filter(
        !grepl(
          pattern = "warn",
          x = .data[[check_log$col_names$assay_warn]],
          ignore.case = TRUE
        )
      )

    return(df_cleaned)

  } else {

    if (verbose == TRUE) {
      cli::cli_inform(
        c("No assays flagged with {.field {check_log$col_names$assay_warn}}
          = {.val {'WARN'}} or {.val {'Warning'}}.",
          "i" = "Returning original dataset.")
      )
    }

    return(df)

  }
}

#' Help function removing a set of control samples from the dataset.
#'
#' @description
#' This function removes rows from NPX data where the sample identifiers, as
#' defined in `check_log`, match samples provided in
#' \var{control_sample_ids}. Primary goal of the function is to serve for
#' filtering out technical replicates or control samples prior to downstream
#' analysis.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
#' @examples
#' \dontrun{
#' # use npx_data1 to check that clean_control_sample_id() works
#' log <- OlinkAnalyze::check_npx(
#'   df = OlinkAnalyze::npx_data1
#' ) |>
#'   suppressWarnings() |>
#'   suppressMessages()
#'
#' out <- OlinkAnalyze:::clean_control_sample_id(
#'   df = npx_data1,
#'   check_npx_log = log,
#'   control_sample_id = c("CONTROL_SAMPLE_AS 1", "CONTROL_SAMPLE_AS 2")
#' )
#' }
#'
clean_control_sample_id <- function(df,
                                    check_log,
                                    control_sample_ids = NULL,
                                    verbose = FALSE) {

  # Check if sample_id column exist in the data table
  if (is.null(control_sample_ids)) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("Skipping exclusion of control samples based on
        {.arg control_sample_ids}.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  check_is_character(x = control_sample_ids, error = TRUE)

  if (nrow(dplyr::filter(.data = df,
                         .data[[check_log$col_names$sample_id]]
                         %in% .env[["control_sample_ids"]])) > 0L) {

    sid <- df |>
      dplyr::distinct(
        .data[[check_log$col_names$sample_id]]
      ) |>
      dplyr::collect() |>
      dplyr::pull(
        .data[[check_log$col_names$sample_id]]
      )

    df_cleaned <- df |>
      dplyr::filter(
        !(.data[[check_log$col_names$sample_id]]
          %in% .env[["control_sample_ids"]])
      )

    # Filter out control samples and return cleaned data
    if (all(control_sample_ids %in% sid)) {
      cli::cli_inform(
        "Excluding sample{?s}: {.val {control_sample_ids}}.",
        "v" = "Returning cleaned dataset."
      )
    } else {
      ctrl_sid_shared <- intersect(x = control_sample_ids, y = sid) # nolint: object_usage_linter
      ctrl_sid_setdiff <- setdiff(x = control_sample_ids, y = sid) # nolint: object_usage_linter
      cli::cli_inform(
        "{cli::qty(ctrl_sid_shared)} Excluding sample{?s}:
        {.val {ctrl_sid_shared}}. {cli::qty(ctrl_sid_setdiff)}Sample{?s} not in
        dataset: {.val {ctrl_sid_setdiff}}.",
        "v" = "Returning cleaned dataset."
      )
    }

    return(df_cleaned)

  } else {

    cli::cli_inform(
      c("None of the sample identifiers in {.arg control_sample_ids} was present
        in the dataset {.arg df}.",
        "i" = "Returning original dataset.")
    )

    return(df)
  }
}

#' Help function converting types of columns to the expected ones.
#'
#' @description
#' This function checks for mismatches between actual and expected column
#' classes in the input data frame and coerces those columns to the expected
#' class using information from `check_log$col_class`.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_col_class <- function(df,
                            check_log,
                            convert_df_cols = TRUE,
                            verbose = FALSE) {
  # input check
  check_is_scalar_boolean(
    x = convert_df_cols,
    error = TRUE
  )

  # check if user wants df columns to be converted
  if (convert_df_cols == FALSE) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("Skipping conversion of columns with non-expected format as per user
          input {.field convert_df_cols} = {.val {FALSE}}.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  # Early return if no corrections needed
  if (nrow(check_log$col_class) == 0L) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c("Columns are in the correct format.",
          "i" = "Returning original dataset.")
      )
    }
    return(df)
  }

  # convert columns
  df_cleaned <- df |>
    dplyr::mutate(
      dplyr::across(
        check_log$col_class |>
          dplyr::filter(
            .data[["expected_col_class"]] == "numeric"
          ) |>
          dplyr::pull(
            .data[["col_name"]]
          ),
        ~ suppressWarnings(as.numeric(.x))
      )
    ) |>
    dplyr::mutate(
      dplyr::across(
        check_log$col_class |>
          dplyr::filter(
            .data[["expected_col_class"]] == "character"
          ) |>
          dplyr::pull(
            .data[["col_name"]]
          ),
        ~ suppressWarnings(as.character(.x))
      )
    )

  col_class_msg <- paste0(
    "* \"", check_log$col_class$col_name, "\": ",
    "from \"", check_log$col_class$col_class, "\" converted to ",
    "\"", check_log$col_class$expected_col_class, "\"."
  )

  cli::cli_inform(
    c("{cli::qty(col_class_msg)}Converted class{?es} of column{?s}:",
      col_class_msg,
      "v" = "Returning cleaned dataset.")
  )

  return(df_cleaned)
}

#' Help function unifying pairs of OlinkID and UniProt identifiers.
#'
#' @description
#' This function checks the non-unique "OlinkID - UniProt" mappings, as defined
#' in `check_log`. It selects the first instance of UniProt ID per OlinkID and
#' replaces the original UniProt column with the unified mapping.
#'
#' @inherit clean_npx params return author
#'
#' @keywords internal
#' @noRd
#'
clean_nonunique_uniprot <- function(df,
                                    check_log,
                                    convert_nonunique_uniprot = TRUE,
                                    verbose = TRUE) {

  # input check
  check_is_scalar_boolean(
    x = convert_nonunique_uniprot,
    error = TRUE
  )

  # check if user wants non-unique uniprot IDs to be converted
  if (convert_nonunique_uniprot == FALSE) {
    if (verbose == TRUE) {
      cli::cli_inform(
        c(
          "Skipping unification of non-unique
          {.val {check_log$col_names$olink_id}} -
          {.val {check_log$col_names$uniprot}} mappings as per user input
          {.arg convert_nonunique_uniprot}.",
          "i" = "Returning original dataset."
        )
      )
    }
    return(df)
  }

  # early return if no corrections needed
  if (length(check_log$non_unique_uniprot) == 0L) {
    if (verbose == TRUE) {
      cli::cli_inform(c(
        "Each {.val {check_log$col_names$olink_id}} maps to a unique
        {.val {check_log$col_names$uniprot}} identifier.",
        "i" = "Returning original dataset."
      ))
    }
    return(df)
  } else {

    # Map Olink ID - UniProt ID
    oid_uniprot_map <- df |> # nolint: object_usage_linter
      dplyr::filter(
        .data[[check_log$col_names$olink_id]] %in% check_log$non_unique_uniprot
      ) |>
      dplyr::distinct(
        .data[[check_log$col_names$olink_id]],
        .data[[check_log$col_names$uniprot]]
      ) |>
      dplyr::collect() |>
      dplyr::group_by(
        dplyr::pick(
          dplyr::all_of(check_log$col_names$olink_id)
        )
      ) |>
      dplyr::summarise(
        uniprot_keep = utils::head(
          x = .data[[check_log$col_names$uniprot]],
          n = 1L
        ),
        uniprot_extra = utils::tail(
          x = .data[[check_log$col_names$uniprot]],
          n = -1L
        ) |>
          ansi_collapse_quot(),
        .groups = "drop"
      )

    # Convert non-unique UniProt ID, and return all columns from input df
    df_cleaned <- df |>
      dplyr::left_join(
        oid_uniprot_map,
        by = check_log$col_names$olink_id
      ) |>
      dplyr::mutate(
        uniprot_keep = dplyr::if_else(
          is.na(.data[["uniprot_keep"]]),
          .data[[check_log$col_names$uniprot]],
          .data[["uniprot_keep"]]
        )
      ) |>
      dplyr::select(
        -dplyr::all_of(
          c(check_log$col_names$uniprot, "uniprot_extra")
        )
      ) |>
      dplyr::rename_with(
        .fn = ~ check_log$col_names$uniprot,
        .cols = "uniprot_keep"
      ) |>
      dplyr::select(
        dplyr::all_of(
          names(df)
        )
      )

    cli::cli_inform(
      c(
        "{nrow(oid_uniprot_map)} assay identifier{?s} map multiple UniProt
        identifiers. The first instance will be used for downstream analysis.",
        paste0(
          "* ", oid_uniprot_map$uniprot_extra, " will be replaced with ",
          "\"", oid_uniprot_map$uniprot_keep, "\" for ",
          "\"", oid_uniprot_map$OlinkID, "\"."
        )
      )
    )

    return(df_cleaned)

  }

}