R/check_npx.R

Defines functions check_darid check_npx_nonunique_uniprot check_npx_qcwarn_assays check_npx_col_class check_npx_all_na_sample check_npx_duplicate_sample_ids check_npx_all_na_assays check_npx_olinkid check_npx_update_col_names check_npx_col_names run_check_npx check_npx

Documented in check_darid check_npx check_npx_all_na_assays check_npx_all_na_sample check_npx_col_class check_npx_col_names check_npx_duplicate_sample_ids check_npx_nonunique_uniprot check_npx_olinkid check_npx_qcwarn_assays check_npx_update_col_names run_check_npx

#' Check NPX data format
#'
#' @description
#' This function performs various checks on NPX data, including checking
#' column names, validating Olink identifiers, identifying assays with \emph{NA}
#' values for all samples and detecting duplicate sample identifiers.
#'
#' @details
#' OlinkAnalyze uses pre-defined names of columns of data frames to perform
#' downstream analyses. At the same time, different Olink platforms export data
#' with different column names (e.g. different protein quantification metric).
#' This function aims to instruct each function of OlinkAnalyze on the column it
#' should be using for the downstream analysis. This should be seamless for data
#' exported from Olink Software and imported to R using the read_npx function.
#'
#' However, in certain cases the columns of interest might be named differently.
#' This function allows assigning custom-named columns of a data frame to
#' internally expected variables that will in turn instruct Olink Analyze
#' functions to use them for downstream analysis. For example, if one wished to
#' use the column \var{PCNormalizedNPX} for their analysis instead of the
#' column \var{NPX}, then they can assign this new name to the internal
#' variable \var{quant} to inform the package that in the downstream analysis
#' \var{PCNormalizedNPX} should be used. See example 3.
#'
#' Similarly, in case of multiple matches (e.g. the data frame contains both
#' columns \var{LOD} and \var{PlateLOD}) the ties will need to be resolved by
#' the user using the argument \var{preferred_names} from this function.  See
#' example 4.
#'
#' The argument \var{preferred_names} is a named character vector with internal
#' column names as names and column names of the current data set as values.
#' Names of the input vector can be one or more of the following:
#' `r ansi_collapse_quot(x = column_name_dict$col_key)`
#'
#' @author
#'   Masoumeh Sheikhi
#'   Klev Diamanti
#'
#' @inheritParams .downstream_fun_args
#' @param preferred_names A named character vector where names are internal
#' column names and values are column names to be selected from the input data
#' frame. Read the \emph{description} for further information.
#'
#' @return A list containing the following elements:
#' \itemize{
#'   \item \strong{col_names} List of column names from the input data frame
#'   marking the columns to be used in downstream analyses.
#'   \item \strong{oid_invalid} Character vector of invalid \var{OlinkID}.
#'   \item \strong{assay_na} Character vector of assays with all samples having
#'   \emph{NA} values.
#'   \item \strong{sample_id_dups} Character vector of duplicate \var{SampleID}.
#'   \item \strong{sample_id_na} Character vector containing \var{SampleID} of
#'   samples with quantified values \emph{NA} for all assays.
#'   \item \strong{col_class} Data frame with columns of incorrect type
#'   including column key \var{col_key}, column name \var{col_name}, detected
#'   column type \var{col_class} and expected column type
#'   \var{expected_col_class}.
#'   \item \strong{assay_qc} Character vector containing \var{OlinkID} of assays
#'   with at least one assay warning.
#'   \item\strong{non_unique_uniprot} Character vector of \var{OlinkID} mapped
#'   to more than one \var{UniProt} ID.
#'   \item \strong{darid_invalid} Character vector containing outdated
#'   combinations of \var{DataAnalysisRefID} and \var{PanelDataArchiveVersion}.
#' }
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Example 0: Use npx_data1 to check that check_npx works
#' check_npx_result <- OlinkAnalyze::npx_data1 |>
#'   OlinkAnalyze::check_npx() |>
#'   suppressWarnings()
#'
#' # read NPX data
#' npx_file <- system.file("extdata",
#'                         "npx_data_ext.parquet",
#'                         package = "OlinkAnalyze")
#' npx_df <- OlinkAnalyze::read_npx(filename = npx_file)
#'
#' # Example 1: run df as is
#' OlinkAnalyze::check_npx(df = npx_df)
#'
#' # Example 2: SampleType missing from data frame
#' npx_df |>
#'   dplyr::select(
#'     -dplyr::all_of(
#'       c("SampleType")
#'     )
#'   ) |>
#'   OlinkAnalyze::check_npx()
#'
#' # Example 3: Use PCNormalizedNPX instead on NPX
#' OlinkAnalyze::check_npx(
#'   df = npx_df,
#'   preferred_names = c("quant" = "PCNormalizedNPX")
#' )
#'
#' # Example 4: Use PCNormalizedNPX instead on NPX, and PlateLOD instead of LOD
#' npx_df |>
#'   dplyr::mutate(
#'     LOD = 1L,
#'     PlateLOD = 2L
#'   ) |>
#'   OlinkAnalyze::check_npx(
#'     preferred_names = c("quant" = "PCNormalizedNPX",
#'                         "lod" = "PlateLOD")
#'   )
#' }
#'
check_npx <- function(df,
                      preferred_names = NULL) {

  # check input ----

  check_is_dataset(x = df,
                   error = TRUE)

  # check functions ----

  check_npx_out_lst <- list()

  # column names
  check_npx_out_lst$col_names <- check_npx_col_names(
    df = df,
    preferred_names = preferred_names
  )

  # check Olink IDs
  check_npx_out_lst$oid_invalid <- check_npx_olinkid(
    df = df,
    col_names = check_npx_out_lst$col_names
  )

  # assays with all NA values
  check_npx_out_lst$assay_na <- check_npx_all_na_assays(
    df = df,
    col_names = check_npx_out_lst$col_names
  )

  # duplicate sample IDs
  check_npx_out_lst$sample_id_dups <- check_npx_duplicate_sample_ids(
    df = df,
    col_names = check_npx_out_lst$col_names
  )

  # samples with all NA values
  check_npx_out_lst$sample_id_na <- check_npx_all_na_sample(
    df = df,
    col_names = check_npx_out_lst$col_names
  )

  # column classes
  check_npx_out_lst$col_class <- check_npx_col_class(
    df = df,
    col_names = check_npx_out_lst$col_names
  )

  # assay QC
  check_npx_out_lst$assay_qc <- check_npx_qcwarn_assays(
    df = df,
    col_names = check_npx_out_lst$col_names
  )

  # non-unique uniprot id
  check_npx_out_lst$non_unique_uniprot <- check_npx_nonunique_uniprot(
    df = df,
    col_names = check_npx_out_lst$col_names
  )

  # check Data Analysis Reference ID and Panel Archive Version combination
  check_npx_out_lst$darid_invalid <- check_darid(
    df = df,
    col_names = check_npx_out_lst$col_names
  )

  # return results ----

  return(check_npx_out_lst)

}

#' Check and run [`check_npx()`] if not provided.
#'
#' @details
#' This function acts as a wrapper for [`check_npx()`]. It will check if the
#' input `check_log` provided by the user is valid. If not, it will throw
#' relevant errors or warnings. Alternatively, if `check_log` was not provided
#' by the user, it will run [`check_npx()`] to provide `check_log` to enable
#' downstream functions to run.#'
#'
#' @inherit .downstream_fun_args params author
#' @inherit check_npx return
#'
#' @keywords internal
#'
run_check_npx <- function(df,
                          check_log = NULL) {
  # generate check_log if not provided ----
  if (is.null(check_log)) {
    cli::cli_inform(
      c(
        "{.arg check_log} not provided. Running {.fn check_npx}.",
        "i" = "It is recommended that the user runs {.fn check_npx} to get a
        full picture of the results from the data validity check!"
      )
    )

    check_log <- check_npx(df = df)
    return(check_log)
  }

  # checks if check_log was provided ----

  check_is_list(x = check_log, error = TRUE)

  ## check that check_log has all expected output names ----

  # check that check_log has names
  if (is.null(names(check_log))) {
    cli::cli_abort(
      c(
        "x" = "{.arg check_log} is a list with no names!",
        "i" = "Ensure that {.arg check_log} is the output of {.fn check_npx}
        for dataset {.arg df}!"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )
  }

  # check that all expected elements in check_log are in place
  check_log_missing <- setdiff(
    x = check_npx_lst_names,
    y = names(check_log)
  )
  if (length(check_log_missing) > 0L) {
    cli::cli_abort(
      c(
        "x" = "Element{?s} {.val {check_log_missing}} are missing from
        {.arg check_log}!",
        "i" = "Ensure that {.arg check_log} is the output of {.fn check_npx}
        for dataset {.arg df}!"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )
  }

  # check if check_log contains additional elements
  check_log_additional <- setdiff(
    x = names(check_log),
    y = check_npx_lst_names
  )
  if (length(check_log_additional) > 0L) {
    cli::cli_warn(
      c(
        "Additional element{?s} {.val {check_log_additional}} detected in
        {.arg check_log}!",
        "i" = "Ensure that {.arg check_log} is the output of {.fn check_npx}
        for dataset {.arg df}!"
      )
    )
  }

  ## check that df column names are in place ----

  # missing required column keys
  check_log_cnames_missing <- setdiff(
    x = column_name_dict |> # required column names
      dplyr::filter(
        .data[["col_miss"]] == FALSE
      ) |>
      dplyr::pull(
        .data[["col_key"]]
      ),
    y = names(check_log$col_names)
  )
  if (length(check_log_cnames_missing) > 0L) {
    df_req_cols_miss <- column_name_dict |>
      dplyr::filter(
        .data[["col_miss"]] == FALSE
        & .data[["col_key"]] %in% .env[["check_log_cnames_missing"]]
      )

    miss_cols <- paste0(
      "* \"", df_req_cols_miss$col_key, "\": One of ",
      sapply(df_req_cols_miss$col_names,
             ansi_collapse_quot,
             sep = "or"), "."
    )

    cli::cli_abort(
      c(
        "x" = "{cli::qty(df_req_cols_miss$col_key)} There {?is/are} no column
        name{?s} associated with the following key{?s}:",
        miss_cols,
        "i" = "Ensure that {.arg check_log} is the output of {.fn check_npx}
        for dataset {.arg df}!"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )
  }

  # additional unexpected column names
  check_log_cnames_additional <- setdiff(
    x = names(check_log$col_names),
    y = column_name_dict |> # all column names
      dplyr::pull(
        .data[["col_key"]]
      )
  )
  if (length(check_log_cnames_additional) > 0L) {
    cli::cli_warn(
      c(
        "Unexpected key{?s} {.val {check_log_cnames_additional}} corresponding
        to column names detected in {.arg check_log$col_names}!",
        "i" = "Ensure that {.arg check_log} is the output of {.fn check_npx}
        for dataset {.arg df}!"
      )
    )
  }

  # check that actual column names are in place - sort of security check that
  # check_log corresponds to the current df
  check_log_cols_miss <- setdiff(
    x = unlist(x = check_log$col_names,
               recursive = TRUE,
               use.names = FALSE),
    y = names(df)
  )
  if (length(check_log_cols_miss) > 0L) {
    cli::cli_abort(
      c(
        "x" = "Column name{?s} {.val {check_log_cols_miss}} from
        {.arg check_log} {?is/are} missing from the dataset {.arg df}!",
        "i" = "Ensure that {.arg check_log} is the output of {.fn check_npx}
        for dataset {.arg df}!"),
      call = rlang::caller_env(),
      wrap = FALSE
    )
  }

  return(check_log)
}

#' Check, update and define column names used in downstream analyses
#'
#' @description
#' OlinkAnalyze uses pre-defined names of columns of data frames to perform
#' downstream analyses. At the same time, different Olink platforms export data
#' with different column names (e.g. different protein quantification metric).
#' This function aims to instruct each function of OlinkAnalyze on the column it
#' should be using for the downstream analysis. This should be seamless for data
#' exported from Olink Software and imported to R using the read_npx function.
#'
#' However, in certain cases the columns of interest might be named differently.
#' This function allows assigning custom-named columns of a data frame to
#' internally expected variables that will in turn instruct Olink Analyze
#' functions to use them for downstream analysis. For example, if one wished to
#' use the column \var{PCNormalizedNPX} for their analysis instead of the
#' column \var{NPX}, then they can assign this new name to the internal
#' variable \var{quant} to inform the package that in the downstream analysis
#' \var{PCNormalizedNPX} should be used. See example 3.
#'
#' Similarly, in case of multiple matches (e.g. the data frame contains both
#' columns \var{LOD} and \var{PlateLOD}) the ties will need to be resolved by
#' the user using the argument \var{preferred_names} from this function.  See
#' example 4.
#'
#' The argument \var{preferred_names} is a named character vector with internal
#' column names as names and column names of the current data set as values.
#' Names of the input vector can be one or more of the following:
#' `r cli::ansi_collapse(x = column_name_dict$col_key)`
#'
#' @author
#'  Klev Diamanti
#'  Masoumeh Sheikhi
#'
#' @inheritParams check_npx
#'
#' @keywords internal
#'
#' @return List of column names from the input data frame marking the columns to
#' be used in downstream analyses.
#'
#' @examples
#' \donttest{
#' # read NPX data
#' npx_file <- system.file("extdata",
#'                         "npx_data_ext.parquet",
#'                         package = "OlinkAnalyze")
#' npx_df <- OlinkAnalyze::read_npx(filename = npx_file)
#'
#' # Example 1: run df as is
#' OlinkAnalyze:::check_npx_col_names(df = npx_df)
#'
#' # Example 2: SampleType missing from data frame
#' npx_df |>
#'   dplyr::select(
#'     -dplyr::all_of(
#'       c("SampleType")
#'     )
#'   ) |>
#'   OlinkAnalyze:::check_npx_col_names()
#'
#' # Example 3: Use PCNormalizedNPX instead on NPX
#' OlinkAnalyze:::check_npx_col_names(
#'   df = npx_df,
#'   preferred_names = c("quant" = "PCNormalizedNPX")
#' )
#'
#' # Example 4: Use PCNormalizedNPX instead on NPX, and PlateLOD instead of LOD
#' npx_df |>
#'   dplyr::mutate(
#'     LOD = 1L,
#'     PlateLOD = 2L
#'   ) |>
#'   OlinkAnalyze:::check_npx_col_names(
#'     preferred_names = c("quant" = "PCNormalizedNPX",
#'                         "lod" = "PlateLOD")
#'   )
#' }
#'
check_npx_col_names <- function(df,
                                preferred_names = NULL) {

  # if not NULL, preferred_names is checked in check_npx_update_col_names
  if (!is.null(preferred_names)) {

    column_name_dict_updated <- check_npx_update_col_names(
      preferred_names = preferred_names
    )

  } else {

    column_name_dict_updated <- column_name_dict |>
      dplyr::mutate(
        col_name_mod = FALSE
      )

  }

  # Intersect expected names to column names ----

  column_name_dict_updated <- column_name_dict_updated |>
    dplyr::mutate(
      col_df = lapply(
        .data[["col_names"]],
        function(x) {
          intersect( # nolint: return_linter
            x = x,
            y = names(df)
          )
        }
      ),
      col_df_len = sapply(.data[["col_df"]], length)
    )

  # Check correctness of preferred_names ----

  # is user's input correct?
  # check if the user input has no matches to the column names of the data frame
  df_custom_names <- column_name_dict_updated |>
    dplyr::filter(
      .data[["col_name_mod"]] == TRUE
      & .data[["col_df_len"]] == 0L
    )

  if (!is.null(preferred_names) && nrow(df_custom_names) > 0L) {

    cli::cli_abort(
      c(
        "x" = "{cli::qty(df_custom_names$col_key)} Value{?s}
        {.val {unlist(df_custom_names$col_names)}} from {.arg preferred_names}
        corresponding to key{?s} {.val {df_custom_names$col_key}} {?is/are}
        missing from the input dataset {.arg df}.",
        "i" = "Please ensure all provided column names are present in the data!"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  # check presence of required columns ----

  # keep all required cols for which there is no matching column in dataset
  df_req_cols <- column_name_dict_updated |>
    dplyr::filter(
      .data[["col_miss"]] == FALSE
      & .data[["col_df_len"]] == 0L
    )

  if (nrow(df_req_cols) > 0L) {

    miss_cols <- paste0(
      "* \"", df_req_cols$col_key, "\": One of ",
      sapply(df_req_cols$col_names,
             ansi_collapse_quot,
             sep = "or"), "."
    )

    cli::cli_abort(
      c("x" = "{cli::qty(df_req_cols$col_key)} There {?is/are} no column
        name{?s} associated with the following key{?s}:",
        miss_cols,
        "i" = "Please ensure presence of columns above in dataset {.arg df}. If
        columns are present and the column name does not match one of the
        expected ones, please use argument {.arg preferred_names} to point to
        the correct column."),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  # break ties for multi-matches by order ----

  # keep all required cols for which there is no matching column in dataset
  df_multi_ties_cols <- column_name_dict_updated |>
    dplyr::filter(
      .data[["col_multi"]] == FALSE
      & .data[["col_df_len"]] > 1L
      & .data[["col_order"]] == TRUE
    ) |>
    dplyr::rename(
      "col_df_tmp" = "col_df"
    ) |>
    dplyr::mutate(
      col_df = lapply(.data[["col_df_tmp"]], utils::head, n = 1L),
      col_df_len = sapply(.data[["col_df"]], length)
    )

  if (nrow(df_multi_ties_cols) > 0L) {

    # update column_name_dict_updated
    column_name_dict_updated <- column_name_dict_updated |>
      dplyr::filter(
        !(.data[["col_key"]] %in% df_multi_ties_cols$col_key)
      ) |>
      dplyr::bind_rows(
        df_multi_ties_cols |>
          dplyr::select(
            -dplyr::all_of("col_df_tmp")
          )
      ) |>
      dplyr::arrange(
        match(x = .data[["col_key"]], table = column_name_dict$col_key)
      )

    # inform message string
    multi_ties_cols <- paste0(
      "* \"", df_multi_ties_cols$col_key, "\": \"",
      unlist(df_multi_ties_cols$col_df), "\" was selected. Options were ",
      sapply(df_multi_ties_cols$col_df_tmp,
             ansi_collapse_quot,
             sep = "or"), "."
    )

    cli::cli_inform(
      c("i" = "{cli::qty(df_multi_ties_cols$col_key)} More than one column names
      in {.arg df} was associated with certain key{?s}. One was selected based
      on an ordered list:",
        multi_ties_cols,
        "Please use {.arg preferred_names} to select a different column
        name."),
      wrap = FALSE
    )

  }


  # check multi-matches in non-multi cols columns ----

  # keep all cols that are not allowed to have multiple matches and have more
  # than one matching columns in dataset
  df_multi_cols <- column_name_dict_updated |>
    dplyr::filter(
      .data[["col_multi"]] == FALSE
      & .data[["col_df_len"]] > 1L
    )

  if (nrow(df_multi_cols) > 0L) {

    multi_cols <- paste0(
      "* \"", df_multi_cols$col_key, "\": ",
      sapply(df_multi_cols$col_names,
             ansi_collapse_quot,
             sep = "or"), "."
    )

    cli::cli_abort(
      c("x" = "{cli::qty(df_multi_cols$col_key)} There is more than one column
      names in {.arg df} associated with the following key{?s}:",
        multi_cols,
        "i" = "Please use {.arg preferred_names} to break ties of column
        names."),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  # check if no columns from the data frame match the same key from
  # column_name_dict


  # return ----

  # remove any nullable columns
  column_name_df <- column_name_dict_updated |>
    dplyr::filter(
      .data[["col_df_len"]] >= 1L
    ) |>
    dplyr::pull(
      .data[["col_df"]]
    )

  return(column_name_df)
}

#' Update column names to be used in downstream analyses
#'
#' @description
#' OlinkAnalyze uses pre-defined names of columns of data frames to perform
#' downstream analyses. However, in certain cases the columns of interest might
#' be named differently. The aim of this function is to assign custom-named
#' columns of a data frame to internally expected variables that will in turn
#' enable analysis of Olink data. For example, if one wished to #' use the
#' column \var{PCNormalizedNPX} for their analysis instead of the column
#' \var{NPX}, then they can assign this new name to the internal variable
#' \var{quant} to inform the package that in the downstream analysis
#' \var{PCNormalizedNPX} should be used.
#'
#' This function takes as input a named character vector with internal column
#' names as names and column names of the current data set as values. Names of
#' the input vector can be one or more of the following:
#' `r cli::ansi_collapse(x = column_name_dict$col_key)`
#'
#' @author
#'  Klev Diamanti
#'  Masoumeh Sheikhi
#'
#' @inheritParams check_npx
#'
#' @keywords internal
#'
#' @return \var{column_name_dict} updated based on \var{preferred_names}.
#'
check_npx_update_col_names <- function(preferred_names) {

  # check input ----

  # Check if preferred_names is character
  check_is_character(x = preferred_names,
                     error = TRUE)

  # check for names not matching expected ----

  # Check valid names
  if (!all(names(preferred_names) %in% column_name_dict$col_key)) {

    # identify names of the vector preferred_names that do not match names from
    # column_name_dict. Names should match to be able to update the field.
    missing_names <- setdiff(x = names(preferred_names), # nolint: object_usage_linter
                             y = column_name_dict$col_key)

    cli::cli_abort(
      c("x" = "Unexpected name{?s} in {.arg preferred_names}:
        {.val {missing_names}}!",
        "i" = "Expected one or more of the following names:
        {.val {column_name_dict$col_key}}"),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  # check for duplicated names ----

  dup_names <- names(preferred_names)[duplicated(names(preferred_names))]

  if (length(dup_names) > 0L) {

    cli::cli_abort(
      c("x" = "Duplicated name{?s} in {.arg preferred_names}:
        {.val {dup_names}}!",
        "i" = "Expected unique names for each column."),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }


  # update column names ----

  # Do not update entries that are not specified in `preferred_names`
  column_name_dict_keep <- column_name_dict |>
    dplyr::filter(
      !(.data[["col_key"]] %in% names(preferred_names))
    ) |>
    dplyr::mutate(
      col_name_mod = FALSE
    )
  # Update entries that are specified in `preferred_names`
  column_name_dict_change <- column_name_dict |>
    dplyr::filter(
      .data[["col_key"]] %in% names(preferred_names)
    ) |>
    dplyr::arrange(
      match(
        x = .data[["col_key"]],
        table = names(preferred_names)
      )
    ) |>
    dplyr::mutate(
      col_names = as.list(.env[["preferred_names"]]),
      col_name_mod = TRUE
    )
  # Merge the entries to a new updated dictionary
  column_name_dict_updated <- column_name_dict_keep |>
    dplyr::bind_rows(
      column_name_dict_change
    ) |>
    dplyr::arrange(
      match(
        x = .data[["col_key"]],
        table = column_name_dict$col_key
      )
    )

  # return ----

  return(column_name_dict_updated)

}

#' Help function checking whether df contains invalid Olink identifiers
#'
#' @description
#' This function checks if Olink identifiers (\var{OlinkID}) match the pattern
#' of a prefix "OID" followed by 5 integer numbers.
#'
#' @author
#'  Masoumeh Sheikhi
#'
#' @inheritParams check_npx
#' @param col_names A list of matched column names. This is the output of the
#' \var{check_npx_col_names} function.
#'
#' @keywords internal
#'
#' @return A character vector with invalid \var{OlinkID}.
#'
check_npx_olinkid <- function(df,
                              col_names) {

  # identify Olink identifiers ----

  # extract invalid Olink IDs
  invalid_oid <- df |>
    dplyr::distinct(
      .data[[col_names$olink_id]]
    )  |>
    dplyr::filter(
      !grepl(
        pattern = "^OID\\d{5}$|^OID\\d{5}_OID\\d{5}$",
        x = .data[[col_names$olink_id]]
      )
    )  |>
    dplyr::collect() |>
    dplyr::pull(
      .data[[col_names$olink_id]]
    )

  # warning if there are invalid Olink identifiers ----

  # warning if there is any invalid Olink ID
  if (length(invalid_oid) > 0L) {
    cli::cli_warn(
      c(
        "Unrecognized OlinkID{?s} detected: {.val {invalid_oid}}",
        "i" = "Consider running {.fn clean_npx} next!"
      )
    )
  }

  # return ----

  return(invalid_oid)
}

#' Help function to identify Olink assays with all quantified values \emph{NA}
#'
#' @description
#' This function checks if there are assays with the quantified values for all
#' samples \emph{NA}.
#'
#' @details
#' We have added the tags importFrom for "dbplyr" and "duckdb" because
#' "devtools::check()" would complain with a note that the two libraries are
#' imported but never used. To avoid that we used solutions taken from here:
#' 1. https://github.com/hadley/r-pkgs/issues/203
#' 2. https://github.com/pbs-software/pbs-modelling/issues/95
#'
#' @author
#'  Simon Forsberg
#'  Masoumeh Sheikhi
#'
#' @inheritParams check_npx
#' @inheritParams check_npx_olinkid
#'
#' @keywords internal
#'
#' @return A character vector containing \var{OlinkID} of assays with quantified
#' values \emph{NA} for all samples, otherwise returns \emph{character(0)}.
#'
#' @importFrom duckdb duckdb
#' @importFrom dbplyr memdb_frame
#'
check_npx_all_na_assays <- function(df, col_names) {

  # Identify assays with only NAs
  all_nas <- df |>
    dplyr::select(
      dplyr::all_of(
        c(col_names$olink_id,
          col_names$quant)
      )
    ) |>
    dplyr::group_by(
      .data[[col_names$olink_id]]
    ) |>
    dplyr::mutate(
      is_na = dplyr::if_else(is.na(.data[[col_names$quant]]), 1L, 0L)
    ) |>
    arrow::to_duckdb() |>
    dplyr::summarise(
      n = dplyr::n(),
      n_na = sum(.data[["is_na"]], na.rm = TRUE),
      .groups = "drop"
    ) |>
    dplyr::filter(
      .data[["n"]] == .data[["n_na"]]
    ) |>
    dplyr::collect() |>
    dplyr::pull(
      .data[[col_names$olink_id]]
    ) |>
    sort()

  # Issue warning if any assays with only NAs are found
  if (length(all_nas) > 0L) {
    cli::cli_warn(
      c(
        "{.val {all_nas}} ha{?s/ve} {.val {col_names$quant}} = NA for all
        samples.",
        "i" = "Consider running {.fn clean_npx} next!"
      )
    )
  }

  return(all_nas)
}

#' Help function checking for duplicate sample identifiers in data.
#'
#' @description
#' This function checks if there are duplicate sample identifiers for any assay.
#'
#' @author
#'  Masoumeh Sheikhi
#'
#' @inheritParams check_npx
#' @inheritParams check_npx_olinkid
#'
#' @keywords internal
#'
#' @return A character vector of duplicate \var{SampleID} found in the data.
#'
check_npx_duplicate_sample_ids <- function(df, col_names) {

  # Select relevant columns
  sample_summary <- df  |>
    dplyr::select(dplyr::all_of(c(
      col_names$sample_id,
      col_names$olink_id
    ))) |>
    dplyr::group_by(
      .data[[col_names$sample_id]],
      .data[[col_names$olink_id]]
    ) |>
    dplyr::summarise(freq = dplyr::n(),
                     .groups = "drop") |>
    dplyr::collect()

  # Find duplicates
  duplicates <- character(0L)
  duplicates <- sample_summary |>
    dplyr::filter(.data[["freq"]] > 1) |>
    dplyr::collect() |>
    dplyr::pull(.data[[col_names$sample_id]]) |>
    unique()

  # Warn if duplicates are found
  if (length(duplicates) > 0L) {
    cli::cli_warn(
      c(
        "Duplicate SampleID{?s} detected: {.val {duplicates}}",
        "i" = "Consider running {.fn clean_npx} next!"
      )
    )
  }

  return(duplicates)
}

#' Help function to identify Olink samples with all quantified values \emph{NA}
#'
#' @description
#' This function checks if there are samples with the quantified values for all
#' assays \emph{NA}.
#'
#' @details
#' We have added the tags importFrom for "dbplyr" and "duckdb" because
#' "devtools::check()" would complain with a note that the two libraries are
#' imported but never used. To avoid that we used solutions taken from here:
#' 1. https://github.com/hadley/r-pkgs/issues/203
#' 2. https://github.com/pbs-software/pbs-modelling/issues/95
#'
#' @author
#'  Simon Forsberg
#'  Masoumeh Sheikhi
#'  Klev Diamanti
#'
#' @inheritParams check_npx
#' @inheritParams check_npx_olinkid
#'
#' @keywords internal
#'
#' @return A character vector containing \var{SampleID} of samples with
#' quantified values \emph{NA} for all assays, otherwise returns
#' \emph{character(0)}.
#'
#' @importFrom duckdb duckdb
#' @importFrom dbplyr memdb_frame
#'
check_npx_all_na_sample <- function(df, col_names) {

  # Identify assays with only NAs
  all_na_sample <- df |>
    dplyr::select(
      dplyr::all_of(
        c(col_names$sample_id,
          col_names$quant)
      )
    ) |>
    dplyr::group_by(
      .data[[col_names$sample_id]]
    ) |>
    dplyr::mutate(
      is_na = dplyr::if_else(is.na(.data[[col_names$quant]]), 1L, 0L)
    ) |>
    arrow::to_duckdb() |>
    dplyr::summarise(
      n = dplyr::n(),
      n_na = sum(.data[["is_na"]], na.rm = TRUE),
      .groups = "drop"
    ) |>
    dplyr::filter(
      .data[["n"]] == .data[["n_na"]]
    ) |>
    dplyr::collect() |>
    dplyr::pull(
      .data[[col_names$sample_id]]
    ) |>
    sort()

  # Issue warning if any assays with only NAs are found
  if (length(all_na_sample) > 0L) {
    cli::cli_warn(
      c(
        "{.val {all_na_sample}} ha{?s/ve} {.val {col_names$quant}} = NA for all
        assays.",
        "i" = "Consider running {.fn clean_npx} next!"
      )
    )
  }

  return(all_na_sample)
}

#' Help function checking types of columns in data.
#'
#' @description
#' This function checks if certain columns from \var{df} have the correct type
#' to enable downstream analysis. Columns to be checked are marked as such in
#' the columns \var{col_class} and \var{col_class_check} of
#' \var{column_name_dict}.
#'
#' @author
#'  Klev Diamanti
#'
#' @inheritParams check_npx
#' @inheritParams check_npx_olinkid
#'
#' @keywords internal
#'
#' @returns A data frame with the columns \var{col_name}, \var{col_key},
#' \var{col_class} and \var{expected_col_class} marking columns with the
#' incorrect type.
#'
check_npx_col_class <- function(df, col_names) {

  # we first convert 'col_names' into a data frame with 'col_key' the names of
  # 'col_names', and 'col_df' the elements of the list of 'col_names'.
  # Basically, 'col_key' is used to match to 'column_name_dict', and 'col_df' is
  # used to match to the actual column names of the dataset 'df'.
  df_col_names <- dplyr::tibble(
    col_key = names(col_names),
    col_df = unname(col_names)
  ) |>
    tidyr::unnest(
      cols = dplyr::all_of(
        c("col_key", "col_df")
      )
    )

  # we select only the columns for which we want to confirm they column class.
  # Then we select only 'col_key' and 'col_class', which we rename to
  # 'expected_col_class'. 'col_key' is used to match to 'df_col_names' to allow
  # checking only the column names of the dataset 'df' in question.
  # 'expected_col_class' is used to retain the expected column classes. The
  # goal is that this data frame will contain 'col_class' that will allow us to
  # check if it matches the dataset, and 'col_df' that will match the column
  # name of the dataset.
  col_keys_check <- column_name_dict |>
    dplyr::filter(
      .data[["col_class_check"]] == TRUE
    ) |>
    dplyr::select(
      dplyr::all_of(
        c("col_key", "expected_col_class" = "col_class")
      )
    ) |>
    dplyr::inner_join(
      df_col_names,
      by = "col_key",
      relationship = "one-to-many"
    )

  # Check column class for each column in dataset 'df' using the internal
  # functions 'check_is_numeric' and 'check_is_character'.
  col_class_numeric <- df |>
    dplyr::slice_head(
      n = 100L
    ) |>
    dplyr::collect() |>
    lapply(
      is.numeric
    ) |>
    as.matrix()
  col_class_character <- df |>
    dplyr::slice_head(
      n = 100L
    ) |>
    dplyr::collect() |>
    lapply(
      is.character
    ) |>
    as.matrix()

  # combine the checks from numeric and character from above before, and
  # ultimately, check if expected column class matches the column class found in
  # the dataset 'df'.
  df_col_class <- dplyr::tibble(
    col_name = rownames(col_class_numeric),
    is_numeric = col_class_numeric[, 1L]
  ) |>
    dplyr::left_join(
      dplyr::tibble(
        col_name = rownames(col_class_character),
        is_character = col_class_character[, 1L]
      ),
      by = "col_name",
      relationship = "one-to-one"
    ) |>
    dplyr::mutate(
      col_class = dplyr::case_when(
        .data[["is_numeric"]] == TRUE ~ "numeric",
        .data[["is_character"]] == TRUE ~ "character",
        .data[["is_numeric"]] == TRUE
        & .data[["is_character"]] == TRUE ~ "unknown",
        TRUE ~ "other",
        .default = "unknown"
      )
    ) |>
    dplyr::select(
      -dplyr::all_of(
        c("is_numeric", "is_character")
      )
    ) |>
    dplyr::inner_join(
      col_keys_check,
      by = c("col_name" = "col_df"),
      relationship = "one-to-one"
    ) |>
    dplyr::filter(
      .data[["col_class"]] != .data[["expected_col_class"]]
    )

  if (nrow(df_col_class) > 0L) {

    col_class_msg <- paste0("* \"", df_col_class$col_name, "\"",
                            ": Expected \"", df_col_class$expected_col_class,
                            "\". Detected \"", df_col_class$col_class, "\".")

    cli::cli_warn(
      c(
        "{cli::qty(col_class_msg)} Detected column{?s} with incorrect data
        type{?s}:",
        col_class_msg,
        "i" = "{cli::qty(col_class_msg)} Use the function {.fn clean_npx} or
        manually convert the column{?s} to the expected type prior to downstream
        analyses."
      )
    )

  }

  return(df_col_class)

}

#' Help function checking data for assay QC warnings.
#'
#' @author
#'  Klev Diamanti
#'
#' @inheritParams check_npx
#' @inheritParams check_npx_olinkid
#'
#' @keywords internal
#'
#' @returns A character vector containing \var{OlinkID} of assays with at least
#' one QC warning, otherwise a \emph{character(0)}.
#'
check_npx_qcwarn_assays <- function(df, col_names) {

  if ("assay_warn" %in% names(col_names)) {

    qc_warn_assays <- df |>
      dplyr::select(
        dplyr::all_of(
          c(col_names$olink_id, col_names$assay_warn)
        )
      ) |>
      dplyr::filter(
        grepl(
          pattern = "warn",
          x = .data[[col_names$assay_warn]],
          ignore.case = TRUE
        )
      ) |>
      dplyr::distinct(
        .data[[col_names$olink_id]]
      ) |>
      dplyr::collect() |>
      dplyr::pull(
        .data[[col_names$olink_id]]
      ) |>
      unique() |>
      sort()

    if (length(qc_warn_assays) > 0L) {
      cli::cli_inform(
        c(
          "{.val {length(qc_warn_assays)}} assay{?s} exhibited assay QC warnings
          in column {.arg {unname(col_names$assay_warn)}} of the dataset:
          {.val {qc_warn_assays}}.",
          "i" = "Consider running {.fn clean_npx} next!"
        )
      )
    }

  } else {

    qc_warn_assays <- character(0L)

  }

  return(qc_warn_assays)
}

#' Help function checking for assays mapping to multiple UniProt identifiers.
#'
#' @author
#'  Kathleen Nevola
#'  Kang Dong
#'  Klev Diamanti
#'
#' @description
#' Occasionally, updates in panel versions include updates in \var{UniProt}
#' identifiers (e.g. change in formatting). This function identifies cases where
#' an assay identifier \var{OlinkID} maps to multiple \var{UniProt} identifiers.
#'
#' @inheritParams check_npx
#' @inheritParams check_npx_olinkid
#'
#' @keywords internal
#'
#' @return A character vector of assay identifiers \var{OlinkID} that map to
#' more than one \var{UniProt} identifiers.
#'
check_npx_nonunique_uniprot <- function(df, col_names) {

  # Group by OlinkID and count distinct UniProt entries, and identify OlinkIDs
  # linked to multiple UniProt IDs
  oid_uniprot_dups <- df |>
    dplyr::distinct( # Ensure uniqueness of OlinkID-UniProt pairs
      .data[[col_names$olink_id]],
      .data[[col_names$uniprot]]
    ) |>
    dplyr::group_by(
      .data[[col_names$olink_id]]
    ) |>
    dplyr::summarise(
      freq = dplyr::n(),
      .groups = "drop"
    ) |>
    dplyr::filter(
      .data[["freq"]] > 1L
    ) |>
    dplyr::collect() |>
    dplyr::pull(
      .data[[col_names$olink_id]]
    )

  # Emit a warning if any duplicates are found
  if (length(oid_uniprot_dups) > 0L) {
    cli::cli_warn(
      c(
        "Detected multiple UniProt identifiers for assay{?s}:
        {.val {oid_uniprot_dups}}.",
        "i" = "Consider running {.fn clean_npx} next!"
      )
    )
  } else {
    oid_uniprot_dups <- character(0L)
  }

  return(oid_uniprot_dups)
}

#' Help function checking for DARID and PanelDataArchiveVersion combinations
#'
#' @author
#'  Kathleen Nevola
#'  Kang Dong
#'  Klev Diamanti
#'
#' @description
#' DarIDs D.07, 08, 10, and 14 need to exported with Panel Data Archive Version
#' 1.5 or later. This function identifies cases where \var{DataAnalysisRefID}
#' are paired with earlier \var{PanelDataArchiveVersion}.
#'
#' @inheritParams check_npx
#'
#' @keywords internal
#'
#' @return A warning message if any invalid combinations are found.
#'
check_darid <- function(df, col_names) {

  # Return empty string and no warning when no qc_version is present
  if (!("qc_version" %in% names(col_names))) {
    return(
      dplyr::tibble(
        !!col_names$panel_version := character(0L)
      )
    )
  }

  # Identify invalid panel_version and qc_version combinations
  invalid_darid <- df |>
    dplyr::distinct(
      .data[[col_names$panel_version]],
      .data[[col_names$qc_version]]
    ) |>
    dplyr::collect() |>
    dplyr::inner_join(
      outdated_darid_panel_archive,
      by = stats::setNames("darid_list",
                           col_names$panel_version)
    ) |>
    dplyr::filter(
      as.numeric_version(.data[[col_names$qc_version]]) <
        as.numeric_version(.data[["min_version"]])
    ) |>
    dplyr::select(
      dplyr::all_of(c(
        col_names$panel_version,
        col_names$qc_version
      ))
    )

  # Emit a warning if any invalid combinations are found
  if (nrow(invalid_darid) > 0L) {

    invalid_darid_msg <- paste0(
      col_names$panel_version, ": ",
      paste(unique(invalid_darid[[col_names$panel_version]]),
            collapse = ", "),
      "; ", col_names$qc_version, ": ",
      paste(unique(invalid_darid[[col_names$qc_version]]),
            collapse = ", "),
      "."
    )

    cli::cli_warn(
      c(
        "i" = "Outdated Data Analysis Reference ID and
        Panel Archive Version combination detected.",
        "*" = invalid_darid_msg,

        ">" = "Re-export data using Panel Archive Version 1.5.0+ and
        use the newest version of the Fixed LOD file
        when calculating LOD (Version 6+).",

        "!" = "Failure to re-export may result in
        incorrect PC normalization across lots and Fixed LOD
        calculations."
      )
    )

    return(invalid_darid)

  } else {

    return(
      dplyr::tibble(
        !!col_names$panel_version := character(0L),
        !!col_names$qc_version := character(0L)
      )
    )

  }

}

Try the OlinkAnalyze package in your browser

Any scripts or data that you put into this service are public.

OlinkAnalyze documentation built on June 24, 2026, 1:06 a.m.