R/olink_normalization_format.R

Defines functions olink_format_oid_no_overlap olink_format_rm_ext_ctrl olink_normalization_format

Documented in olink_format_oid_no_overlap olink_format_rm_ext_ctrl olink_normalization_format

#' Format the output of olink_normalization for seamless use with downstream
#' analysis functions.
#'
#' @author
#'   Danai G. Topouza
#'   Klev Diamanti
#'
#' @description
#' For within-product bridging and subset normalization:
#' * Adds non-overlapping assays between projects to the bridged file without
#' adjustment.
#' * Removes external controls, except sample controls.
#'
#' For cross-product bridging:
#' * Adds non-overlapping assays between projects and not bridgeable assays to
#' the bridged file without adjustment.
#' * Removes external controls, except sample controls.
#' * Replaces the NPX values of the non-reference project by the Median Centered
#' or QS Normalized NPX, according to the Bridging Recommendation.
#' * Edits the BridgingRecommendation column to indicate whether an assay is
#' NotBridgeable, NotOverlapping, MedianCentering, or QuantileSmoothing bridged.
#' * Replaces OlinkID by the concatenation of each product's OlinkIDs to
#' record the OlinkIDs from both projects for bridgeable assays. Assays that
#' are NotBridgeable or NotOverlapping retain their original OlinkIDs and NPX
#' values.
#' * Replaces Panel by the concatenation of each product panel per assay. Assays
#'  that are NotBridgeable or NotOverlapping retain their original Panel value.
#' * Removes MedianCenteredNPX, QSNormalizedNPX, OlinkID_E3072 columns.
#'
#' For reference median normalization:
#' * Adds non-overlapping assays from the dataset, but not from the reference
#' medians, to the bridged file without adjustment.
#' * Removes external controls, except sample controls.
#'
#' In all cases, normalization and formatting changes are applied to the NPX
#' column. The contents of the Count and PCNormalizedNPX columns remain
#' unchanged.
#'
#' @param df_norm A "tibble" of Olink data in long format resulting from the
#' `olink_normalization` function.
#' @param lst_check Normalization input list checks generated by
#' `olink_norm_input_check`.
#'
#' @return A "tibble" of Olink data in long format containing both input
#' datasets with the bridged NPX quantifications, with the above
#' modifications.
#'
#' @examples
#' \donttest{
#' # bridge samples
#' bridge_samples <- intersect(
#'   x = unique(OlinkAnalyze:::data_ht_small$SampleID),
#'   y = unique(OlinkAnalyze:::data_3k_small$SampleID)
#' ) |>
#'   (\(x) x[!grepl("CONTROL", x)])()
#'
#' # run olink_normalization
#' df_norm <- olink_normalization(
#'   df1 = OlinkAnalyze:::data_ht_small,
#'   df2 = OlinkAnalyze:::data_3k_small,
#'   overlapping_samples_df1 = bridge_samples,
#'   df1_project_nr = "Explore HT",
#'   df2_project_nr = "Explore 3072",
#'   reference_project = "Explore HT"
#' )
#'
#' # generate lst_check
#' lst_check <- OlinkAnalyze:::olink_norm_input_check(
#'   df1 = OlinkAnalyze:::data_3k_small,
#'   df2 = OlinkAnalyze:::data_ht_small,
#'   overlapping_samples_df1 = bridge_samples,
#'   overlapping_samples_df2 = NULL,
#'   df1_project_nr = "P1",
#'   df2_project_nr = "P2",
#'   reference_project = "P2",
#'   reference_medians = NULL
#' )
#'
#' # format output
#' OlinkAnalyze:::olink_normalization_format(
#'   df_norm = df_norm,
#'   lst_check = lst_check
#' )
#' }
#'
olink_normalization_format <- function(df_norm,
                                       lst_check) {

  # Extract data from non-overlapping assays ----

  # rows of assays found in only one of the inputs; NULL means there is
  # nothing to append further down
  if (!is.null(lst_check$non_overlapping_oid)
      && length(unlist(lst_check$non_overlapping_oid)) > 0L) {
    no_overlap_oid <- olink_format_oid_no_overlap(
      lst_check = lst_check
    )
  } else {
    no_overlap_oid <- NULL
  }

  # Combine data with df_norm ----

  if (lst_check$norm_mode %in% c(olink_norm_modes$bridge,
                                 olink_norm_modes$subset,
                                 olink_norm_modes$ref_median)) {

    # within-product bridging, subset and reference median normalization only
    # need the non-overlapping assays appended
    if (!is.null(no_overlap_oid)) {
      df_combo <- df_norm |>
        dplyr::bind_rows(
          no_overlap_oid
        )
    } else {
      df_combo <- df_norm
    }

  } else if (lst_check$norm_mode == olink_norm_modes$norm_cross_product) {

    # Extract data for assays = "NotBridgeable" ----

    oid_col_name <- lst_check$ref_cols$olink_id
    # column holding the OlinkID of the non-reference product
    not_ref_oid_col_name <- paste0(lst_check$ref_cols$olink_id,
                                   "_", lst_check$not_ref_product)

    quant_col_name <- lst_check$ref_cols$quant

    # not bridgeable assays keep the OlinkID of the product each row came from
    df_not_bridgeable <- df_norm |>
      dplyr::filter(
        # keep only assays that are not bridgeable
        .data[["BridgingRecommendation"]] == "NotBridgeable"
      ) |>
      dplyr::mutate(
        !!oid_col_name := dplyr::if_else(
          .data[["Project"]] == lst_check$ref_name,
          .data[[oid_col_name]],
          .data[[not_ref_oid_col_name]]
        )
      )

    # number of distinct OlinkIDs among the not bridgeable rows; the name is
    # referenced from the message template below
    not_bridgeable_assays <- df_not_bridgeable |>
      dplyr::distinct(
        .data[[oid_col_name]]
      ) |>
      nrow()

    if (not_bridgeable_assays > 0L) {
      cli::cli_inform(
        c("i" = "{.val {not_bridgeable_assays}} not bridgeable assays are
        included in the bridged dataset without adjustment.")
      )
    }

    # Keep the data following BridgingRecommendation for bridgeable assays:
    # OlinkID becomes "<ref OlinkID>_<non-ref OlinkID>" and the quantification
    # column takes the value of the recommended normalization method
    df_bridgeable <- df_norm |>
      dplyr::filter(
        # keep only assays that are bridgeable
        .data[["BridgingRecommendation"]] != "NotBridgeable"
      ) |>
      dplyr::mutate(
        !!oid_col_name := paste0(.data[[oid_col_name]], "_",
                                 .data[[not_ref_oid_col_name]]),
        !!quant_col_name := dplyr::case_when(
          .data[["BridgingRecommendation"]] == "MedianCentering" ~
            .data[["MedianCenteredNPX"]],
          .data[["BridgingRecommendation"]] == "QuantileSmoothing" ~
            .data[["QSNormalizedNPX"]],
          .default = .data[[quant_col_name]]
        )
      )

    # combine data

    df_combo <- df_bridgeable |>
      dplyr::bind_rows(
        df_not_bridgeable
      )

    # add missing assays
    if (!is.null(no_overlap_oid)) {
      df_combo <- df_combo |>
        dplyr::bind_rows(
          no_overlap_oid
        )
    }

    # concatenate panel column
    # keep reference product first
    ref_product_panels <- c("HT" = "Explore_HT",
                            "Reveal" = "Reveal")

    # reorder Panel to keep reference product first
    # then concatenate panels per OlinkID
    # NotBridgeable or NotOverlapping assays retain their original panel
    df_combo <- df_combo |>
      dplyr::mutate(
        Panel_order = forcats::fct_relevel(
          .data[["Panel"]],
          ref_product_panels[[lst_check$ref_product]]
        )
      ) |>
      dplyr::group_by(.data[[oid_col_name]]) |>
      dplyr::mutate(Panel = paste(sort(unique(.data[["Panel_order"]])),
                                  collapse = "_")) |>
      # drop the grouping again so that the returned tibble is ungrouped, as it
      # is for the other normalization modes
      dplyr::ungroup() |>
      # all_of() instead of the .data pronoun, which is deprecated in
      # tidyselect contexts
      dplyr::select(-dplyr::all_of("Panel_order"))


    # clean up
    df_combo <- df_combo |>
      dplyr::select( # Remove extra columns
        -dplyr::any_of(
          c("MedianCenteredNPX", "QSNormalizedNPX", not_ref_oid_col_name)
        )
      )

  } else {

    # fail with an explicit message instead of "object 'df_combo' not found"
    cli::cli_abort(
      c("x" = "Unsupported normalization mode: {.val {lst_check$norm_mode}}")
    )

  }

  # Remove external controls and sort by project
  df_full <- df_combo |>
    olink_format_rm_ext_ctrl(lst_check = lst_check) |>
    dplyr::arrange(
      .data[["Project"]], .data[[lst_check$ref_cols$sample_id]]
    )

  return(df_full)
}

#' Remove negative controls and plate controls from dataset
#'
#' @description
#' For use in `olink_normalization_format` function. Generates a message stating
#' which control samples were removed.
#'
#' When a sample type column is available for both datasets, rows whose sample
#' type is `NEGATIVE_CONTROL` or `PLATE_CONTROL` identify the control samples.
#' Otherwise control samples are identified by matching common patterns in the
#' sample identifiers, and an additional message asks the user to verify that
#' no other samples were removed unintentionally. Sample identifiers are
#' collected regardless of any grouping of `df`, so each removed control is
#' reported once.
#'
#' @author
#'   Danai G. Topouza
#'   Klev Diamanti
#'
#' @param df NPX dataset to be processed.
#' @param lst_check Normalization input list checks generated by
#' `olink_norm_input_check`.
#'
#' @return A "tibble" of Olink data in long format containing the input dataset
#' with negative controls and plate controls removed.
#'
olink_format_rm_ext_ctrl <- function(df,
                                     lst_check) {

  # if sample_type is present in data for both datasets, use it to
  # identify NCs and PCs
  if (length(lst_check$ref_cols$sample_type) > 0L
      && length(lst_check$not_ref_cols$sample_type) > 0L) {

    # unique sample identifiers of rows with sample type `ext_ctrl_type` in
    # any of the sample type columns
    exclude_ext_ctrl_sampletype <- function(df,
                                            lst_check,
                                            ext_ctrl_type) {
      ext_ctrl_sid <- df |>
        dplyr::filter(
          dplyr::if_any(
            dplyr::any_of(
              c(lst_check$ref_cols$sample_type,
                lst_check$not_ref_cols$sample_type)
            ),
            ~ .x %in% .env[["ext_ctrl_type"]]
          )
        ) |>
        dplyr::pull(
          .data[[lst_check$ref_cols$sample_id]]
        ) |>
        unique()
      return(ext_ctrl_sid)
    }

    nc_sid <- exclude_ext_ctrl_sampletype(df = df,
                                          lst_check = lst_check,
                                          ext_ctrl_type = "NEGATIVE_CONTROL")
    pc_sid <- exclude_ext_ctrl_sampletype(df = df,
                                          lst_check = lst_check,
                                          ext_ctrl_type = "PLATE_CONTROL")
    ext_ctrl_regexp <- FALSE

  } else {

    # Set variable to capture Negative Controls and Plate Controls
    neg_ctrl <- c("Negative", "NEGATIVE", "NEG", "Neg")
    neg_ctrl_regex <- paste(neg_ctrl, collapse = "|")

    pc_ctrl <- c("PLATE", "IPC", "Plate", "plate", "Plate Control",
                 "plate control", "plate_control", "Plate_Control")
    pc_ctrl_regex <- paste(pc_ctrl, collapse = "|")

    # unique sample identifiers matching the regular expression
    exclude_ext_ctrl_sampleid <- function(df,
                                          lst_check,
                                          ext_ctrl_regex) {
      ext_ctrl_sid <- df |>
        # distinct() keeps the grouping columns of grouped input, which would
        # repeat each identifier once per group and inflate the reported counts
        dplyr::ungroup() |>
        dplyr::distinct(
          .data[[lst_check$ref_cols$sample_id]]
        ) |>
        dplyr::filter(
          stringr::str_detect(
            string = .data[[lst_check$ref_cols$sample_id]],
            pattern = ext_ctrl_regex
          )
        ) |>
        dplyr::pull(
          .data[[lst_check$ref_cols$sample_id]]
        )
      return(ext_ctrl_sid)
    }

    nc_sid <- exclude_ext_ctrl_sampleid(df = df,
                                        lst_check = lst_check,
                                        ext_ctrl_regex = neg_ctrl_regex)
    pc_sid <- exclude_ext_ctrl_sampleid(df = df,
                                        lst_check = lst_check,
                                        ext_ctrl_regex = pc_ctrl_regex)
    ext_ctrl_regexp <- TRUE
  }

  # nc_sid and pc_sid are referenced by name from the message templates below
  if (length(c(nc_sid, pc_sid)) > 0L) {
    # remove NCs and PCs for the dataset
    df <- df |>
      dplyr::filter(
        !(.data[[lst_check$ref_cols$sample_id]] %in% c(nc_sid, pc_sid))
      )

    if (length(nc_sid) > 0L) {
      cli::cli_inform(
        c("i" = "{.val {length(nc_sid)}} Negative Control{?s} {?was/were}
        removed from dataset: {.val {nc_sid}}")
      )
    }
    if (length(pc_sid) > 0L) {
      cli::cli_inform(
        c("i" = "{.val {length(pc_sid)}} Plate Control{?s} {?was/were} removed
        from dataset: {.val {pc_sid}}")
      )
    }
    # pattern matching may catch regular samples, so ask the user to verify
    if (ext_ctrl_regexp) {
      cli::cli_inform(
        c("!" = "Negative Control and Plate Control samples were identified and
          removed based on common patterns in sample identifiers. Please verify
          that no other samples were removed unintentionally!"
        )
      )
    }
  }

  return(df)
}

#' Collect the assays that do not overlap between two NPX datasets
#'
#' @description
#' Helper of the `olink_normalization_format` function. Filters the original
#' input dataset(s) stored in `lst_check` down to the assays listed in
#' `lst_check$non_overlapping_oid`, labels the rows with the name of their
#' project and emits a message stating how many assays were not overlapping.
#' Columns are appended depending on the normalization mode to match the
#' normalized data output: `Adj_factor = 0` for reference median normalization,
#' within-product bridging and subset normalization, and
#' `BridgingRecommendation = "NotOverlapping"` for cross-product normalization.
#' Concatenated OlinkIDs are additionally split at the underscore so that each
#' individual identifier is matched.
#'
#' @author
#'   Danai Topouza
#'   Klev Diamanti
#'
#' @param lst_check Normalization input list checks generated by
#' `olink_norm_input_check`.
#'
#' @return A combined "tibble" of Olink data in long format containing only the
#' non-overlapping assays from each input dataset. For reference median
#' normalization only the rows of the reference dataset are returned.
#'
olink_format_oid_no_overlap <- function(lst_check) {

  # OlinkID column name; the reference dataset's name is applied to both inputs
  df_oid <- lst_check$ref_cols$olink_id

  # Subset `df` to the non-overlapping assays recorded under `df_name` and tag
  # the rows with the project they originate from
  subset_no_overlap <- function(df, df_name) {

    # without an entry for this dataset there are no assays to keep
    df_assays <- character(0L)

    if (df_name %in% names(lst_check$non_overlapping_oid)) {
      oid_listed <- lst_check$non_overlapping_oid[[df_name]]

      # identifiers combined as "<id>_<id>" are also matched piece by piece
      oid_pieces <- unlist(
        stringr::str_split(
          stringr::str_subset(oid_listed, "_"),
          pattern = "_"
        )
      )

      df_assays <- c(unlist(oid_listed), oid_pieces)
    }

    dplyr::mutate(
      dplyr::filter(df, .data[[df_oid]] %in% df_assays),
      Project = df_name
    )
  }

  # Reference dataset ----

  ref_df_no_overlap <- subset_no_overlap(
    df = lst_check$ref_original_df,
    df_name = lst_check$ref_name
  )

  # Reference median normalization ----

  if (lst_check$norm_mode == olink_norm_modes$ref_median) {

    # identifiers are only counted inside the message template
    num_non_overlap <- unique( # nolint object_usage_linter
      dplyr::pull(ref_df_no_overlap, .data[[lst_check$ref_cols$olink_id]])
    )

    cli::cli_inform(
      c("i" = "{.val {length(num_non_overlap)}} non-overlapping assay{?s} found
        in the dataset but not in the reference medians. Assay{?s} {?is/are}
        included in the normalized dataset without adjustment.")
    )

    # only the dataset contributes rows, never the reference medians; an
    # adjustment factor of 0 marks them as not adjusted
    return(dplyr::mutate(ref_df_no_overlap, Adj_factor = 0))
  }

  # All other normalization modes ----

  not_ref_df_no_overlap <- subset_no_overlap(
    df = lst_check$not_ref_original_df,
    df_name = lst_check$not_ref_name
  )

  cli::cli_inform(
    c("i" = "{.val {length(unlist(lst_check$non_overlapping_oid))}}
      non-overlapping assay{?s} {?is/are} included in the normalized dataset
      without adjustment. Assays found in only one project will have decreased
      statistical power due to the lower number of samples.")
  )

  # stack the rows of the reference dataset on top of the other dataset
  df_non_overlapping <- dplyr::bind_rows(
    ref_df_no_overlap,
    not_ref_df_no_overlap
  )

  is_within_product <- lst_check$norm_mode %in% c(olink_norm_modes$bridge,
                                                  olink_norm_modes$subset)

  if (is_within_product) {
    # within-product bridging and subset normalization: no adjustment applied
    df_non_overlapping <- dplyr::mutate(
      df_non_overlapping,
      Adj_factor = 0
    )
  } else if (lst_check$norm_mode == olink_norm_modes$norm_cross_product) {
    # cross-product bridging: flag the assays in the recommendation column
    df_non_overlapping <- dplyr::mutate(
      df_non_overlapping,
      BridgingRecommendation = "NotOverlapping"
    )
  }

  return(df_non_overlapping)
}

Try the OlinkAnalyze package in your browser

Any scripts or data that you put into this service are public.

OlinkAnalyze documentation built on Jan. 29, 2026, 1:08 a.m.