R/read_npx_wide.R

Defines functions read_npx_wide_bottom read_npx_wide_bottom_version red_npx_wide_top_mid_long read_npx_wide_panel_version read_npx_wide_middle read_npx_wide_top read_npx_wide_check_top read_npx_wide_npxs_version read_npx_wide_split_row read_npx_wide

Documented in read_npx_wide read_npx_wide_bottom read_npx_wide_bottom_version read_npx_wide_check_top read_npx_wide_middle read_npx_wide_npxs_version read_npx_wide_panel_version read_npx_wide_split_row read_npx_wide_top red_npx_wide_top_mid_long

#' Convert Olink data in wide format with
#' `r ansi_collapse_quot(get_olink_data_types(broad_platform = "qPCR"))` data to
#' long format.
#'
#' @author
#'   Klev Diamanti
#'
#' @inherit .read_npx_args params return
#' @param df A tibble containing the full Olink dataset in wide format.
#' @param file Path to Olink software output file in wide format. Expected one
#' of file extensions
#' `r ansi_collapse_quot(get_file_ext(name_sub = c("excel", "delim")), "or")`.
#' @param olink_platform Olink platform used to generate the input file.
#' One of `NULL` (default) for auto-detection,
#' `r ansi_collapse_quot(get_olink_platforms(broad_platform = "qPCR"), "or")`.
#'
#' @keywords internal
#'
#' @seealso
#'   \code{\link{read_npx_format}}
#'   \code{\link{read_npx_wide_split_row}}
#'   \code{\link{read_npx_wide_npxs_version}}
#'   \code{\link{read_npx_wide_top}}
#'   \code{\link{read_npx_wide_middle}}
#'   \code{\link{read_npx_wide_bottom}}
#'
read_npx_wide <- function(df,
                          file,
                          data_type,
                          olink_platform) {

  # initial checks ----

  check_is_tibble(x = df,
                  error = TRUE)

  check_file_exists(file = file,
                    error = TRUE)

  broad_platform <- "qPCR"

  check_olink_data_type(x = data_type,
                        broad_platform = broad_platform)

  check_olink_platform(x = olink_platform,
                       broad_platform = broad_platform)

  # get expected format specifications ----

  format_spec <- olink_wide_spec |>
    dplyr::filter(
      .data[["data_type"]] == .env[["data_type"]]
    )

  # split the file into sub-data frames ----

  df_split_row <- read_npx_wide_split_row(
    df = df,
    file = file,
    data_type = data_type,
    format_spec = format_spec
  )

  # header matrix NPXS version ----

  npxs_v <- read_npx_wide_npxs_version(
    df = df_split_row$df_head
  )

  # top list of df to long ----

  df_top_list <- read_npx_wide_top(
    df = df_split_row$df_top,
    file = file,
    olink_platform = olink_platform,
    format_spec = format_spec
  )

  col_names <- sapply(df_top_list, function(x) x$col_index)

  # middle list of df to long ----

  df_middle_list <- read_npx_wide_middle(
    df = df_split_row$df_mid,
    file = file,
    data_type = data_type,
    col_names = col_names
  )

  # combine top and middle matrices ----

  df_long <- red_npx_wide_top_mid_long(
    df_top_list = df_top_list,
    df_middle_list = df_middle_list,
    data_type = data_type,
    format_spec = format_spec
  )

  # add bottom df to long ----

  if (format_spec$has_qc_data == TRUE) {

    df_bottom <- read_npx_wide_bottom(
      df = df_split_row$df_bottom,
      file = file,
      olink_platform = olink_platform,
      data_type = data_type,
      col_names = col_names,
      format_spec = format_spec,
      df_plate_panel = df_long |>
        dplyr::select(
          dplyr::all_of(c("col_index", "PlateID", "Panel"))
        ) |>
        dplyr::distinct()
    )

    # add columns from bottom matrix
    if ("PlateID" %in% colnames(df_bottom$df_bottom_oid)) {

      df_long <- df_long |>
        dplyr::left_join(
          dplyr::bind_rows(df_bottom),
          by = c("col_index", "PlateID"),
          relationship = "many-to-one"
        )

    } else {

      df_long <- df_long |>
        dplyr::left_join(
          dplyr::bind_rows(df_bottom),
          by = "col_index",
          relationship = "many-to-one"
        )

    }

  }

  # modify output df ----

  df_long <- df_long |>
    dplyr::mutate(
      `Olink NPX Signature Version` = npxs_v
    ) |>
    # remove col_index
    dplyr::select(
      -dplyr::all_of("col_index")
    )

  # rename columns ----

  olink_wide_rename_npxs_tmp <- olink_wide_rename_npxs |>
    dplyr::filter(
      .data[["OA_internal"]] %in% colnames(df_long)
    )

  df_long <- df_long |>
    dplyr::rename_with(
      .fn = ~olink_wide_rename_npxs_tmp$NPXS,
      .cols = dplyr::all_of(olink_wide_rename_npxs_tmp$OA_internal)
    )

  # return ----

  return(df_long)
}

#' Split Olink wide files to sub-matrices.
#'
#' @description
#' Olink datasets in wide format contain 2 or 3 rows with all columns `NA`
#' marking sub-matrices of the data. This function takes advantage of that
#' feature and splits the dataset into 3 or 4 sub-matrices. Each sub-matrix is
#' used downstream to assemble a long data frame.
#'
#' Specifically:
#' \itemize{
#'   \item \strong{Head matrix} consists of the first 2 rows of the wide
#'   dataset. This matrix contains the project name, the NPX Signature version
#'   that was used to generate the wide dataset and the quantification method.
#'   \item \strong{Top matrix} consists of the next 4 or 5 rows of the wide
#'   dataset, depending on the quantification method. This matrix contains data
#'   on assays, panels, columns with plate identifiers, columns with sample QC
#'   warnings and column with deviations from the internal controls. Note that
#'   not all the columns are present in all datasets and for all quantification
#'   methods. The local environment variable \var{olink_wide_spec} marks all the
#'   expected configurations.
#'   \item \strong{Middle matrix} is marked by rows with all columns `NA` above
#'   and below. This matrix contains sample identifiers, quantification
#'   measurements for all assays, plate identifiers, sample QC warnings and
#'   deviations from the internal controls.
#'   \item \strong{Bottom matrix} is located below the middle matrix and
#'   contains information of LOD, missing frequency, assay warning and data
#'   normalization approach. Note that this matrix is not available for all
#'   quantification methods.
#' }
#'
#' @author
#'   Klev Diamanti
#'
#' @inheritParams read_npx_wide
#' @param format_spec A tibble derived from \var{olink_wide_spec} in the local
#' environment containing the expected format of the Olink wide file based on
#' the \var{olink_platform} and \var{data_type}.
#'
#' @return A named list of tibbles containing the sub-matrices of the Olink wide
#' format file split on:
#' \itemize{
#'   \item \strong{Head matrix} as \var{df_head}
#'   \item \strong{Top matrix} as \var{df_top}
#'   \item \strong{Middle} as \var{df_mid}
#'   \item \strong{Bottom matrix} as \var{df_bottom}
#' }
#'
#' @keywords internal
#'
#' @seealso
#'   \code{\link{read_npx_wide}}
#'   \code{\link{read_npx_wide_npxs_version}}
#'   \code{\link{read_npx_wide_top}}
#'   \code{\link{read_npx_wide_middle}}
#'   \code{\link{read_npx_wide_bottom}}
#'
read_npx_wide_split_row <- function(df,
                                    file,
                                    data_type,
                                    format_spec) {

  # help function to fill NA ----

  # we are not using tidyr::fill because it still depends on dplyr::mutate_at
  # which is superseded, and throws error in test
  fill_na_with_previous <- function(x) {
    last_non_na <- NA
    for (i in seq_along(x)) {
      if (!is.na(x[i])) {
        last_non_na <- x[i]
      } else {
        x[i] <- last_non_na
      }
    }
    return(x)
  }

  # detect rows with all NA columns ----

  # Use the rows full of NAs in the file to compute the number of rows
  # that contain data about assays.
  na_row_index <- df |>
    # total number of columns
    dplyr::mutate(
      total_col = {
        dplyr::pick(dplyr::everything()) |>
          ncol()
      }
    ) |>
    # count number of NA in each row
    dplyr::mutate(
      row_number = dplyr::row_number(),
      num_na = {
        dplyr::pick(dplyr::everything()) |>
          is.na() |>
          rowSums()
      }
    ) |>
    # filter rows with all cols NA
    dplyr::filter(
      .data[["num_na"]] == .data[["total_col"]]
    )

  if (nrow(na_row_index) > 0L) {
    na_row_index <- na_row_index |>
      # allow for consecutive all NA rows
      dplyr::mutate(
        dif_nxt_r = dplyr::lead(.data[["row_number"]]) - .data[["row_number"]],
        dif_nxt_r = fill_na_with_previous(x = .data[["dif_nxt_r"]])
      ) |>
      # create groups of consecutive rows with all cols NA
      dplyr::mutate(
        all_na_g = dplyr::case_when(
          dplyr::row_number() == 1L ~ paste0("g", .data[["row_number"]]),
          dplyr::lag(.data[["dif_nxt_r"]]) == 1L ~ NA_character_,
          TRUE ~ paste0("g", .data[["row_number"]]),
          .default = NA_character_
        ),
        all_na_g = fill_na_with_previous(x = .data[["all_na_g"]])
      ) |>
      # extract start and end of groups
      dplyr::group_by(.data[["all_na_g"]]) |>
      dplyr::summarise(
        start_g = min(.data[["row_number"]]),
        end_g = max(.data[["row_number"]]),
        .groups = "drop"
      ) |>
      dplyr::arrange(
        .data[["start_g"]], .data[["end_g"]]
      )
  }

  # check that row indexes are correct ----

  # check that there are 1 or 2 rows with all NA
  if (nrow(na_row_index) != format_spec$n_na_rows) {

    cli::cli_abort(
      message = c(
        "x" = "We identified
        {.val { ifelse(identical(na_row_index, integer(0L)),
                       0L, nrow(na_row_index))}}
        rows with all columns `NA` in file {.file {file}}, while we expected
        {.val {format_spec$n_na_rows}}!",
        "i" = "Has the file been modified manually?"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  # split df from start and stop row indexes ----

  # mark starting and ending row indexes of each sub-matrix
  df_split_index <- dplyr::tibble(
    start_row = c(
      1L, # header
      3L, # skip first two rows
      na_row_index$end_g + 1L # all NA rows +1
    ),
    end_row = c(
      2L, # header
      na_row_index$start_g - 1L, # all NA rows -1
      nrow(df) # last row of file
    )
  )

  # extract sub-matrices
  list_df_split <- lapply(
    seq_len(nrow(df_split_index)),
    function(i) {
      df_s_i <- df_split_index |>
        dplyr::slice(
          i
        )

      df_split <- df |>
        dplyr::slice(
          df_s_i$start_row:df_s_i$end_row
        )

      return(df_split)
    }
  )

  # output is a list of 2 or 3 dataframes, depending on data_type
  # name each data frame
  if (nrow(na_row_index) == 1L) {
    names(list_df_split) <- c("df_head", "df_top", "df_mid")
  } else {
    names(list_df_split) <- c("df_head", "df_top", "df_mid", "df_bottom")
  }

  # return list of data frames ----

  return(list_df_split)

}

#' Extract version of NPX Signature from the head matrix of Olink datasets in
#' wide format.
#'
#' @author
#'   Klev Diamanti
#'
#' @param df Head matrix of Olink dataset in wide format \var{df_head}.
#'
#' @return The version of the NPX Signature software.
#'
#' @keywords internal
#'
#' @seealso
#'   \code{\link{read_npx_wide}}
#'   \code{\link{read_npx_wide_split_row}}
#'   \code{\link{read_npx_wide_npxs_version}}
#'   \code{\link{read_npx_wide_top}}
#'   \code{\link{read_npx_wide_middle}}
#'   \code{\link{read_npx_wide_bottom}}
#'
read_npx_wide_npxs_version <- function(df) {

  # check necessary columns ----

  check_columns(df = df, col_list = list("V1", "V2"))

  # extract NPXS sw version ----

  npxs_sw_v <- df |>
    dplyr::slice_head(
      n = 1L
    ) |>
    dplyr::pull(
      .data[["V2"]]
    ) |>
    as.character() |>
    strsplit(split = " ", fixed = TRUE) |>
    lapply(utils::tail, 1L) |>
    unlist()

  # return ----

  return(npxs_sw_v)

}

#' Additional checks of the top matrix of Olink dataset in wide format.
#'
#' @author
#'   Klev Diamanti
#'
#' @inheritParams read_npx_wide_top
#'
#' @return NULL unless an inconsistency is spotted.
#'
#' @keywords internal
#'
#' @seealso
#'   \code{\link{read_npx_wide_top}}
#'
read_npx_wide_check_top <- function(df,
                                    file,
                                    format_spec) {

  # checks ----

  ## column 1 contains the expected values ----

  # check that df contains "V1"
  check_columns(df = df, col_list = list("V1"))

  # rows containing metadata about assays and panels based on the quantification
  # method in the top dataset.
  top_mat_v1 <- format_spec |>
    dplyr::pull(
      .data[["top_matrix_v1"]]
    ) |>
    unlist()

  if (!identical(dplyr::pull(df, .data[["V1"]]), top_mat_v1)) {

    top_v1_miss <- top_mat_v1[!(top_mat_v1 %in% df$V1)] # nolint: object_usage_linter

    cli::cli_abort(
      message = c(
        "x" = "Column 1 of the top matrix with assay data in file
        {.file {file}} does not contain: {.val {top_v1_miss}}",
        "i" = "Has the file been modified manually?"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  ## row 2 contains Plate ID (and QC Warning) ----

  # rows containing information on plate id and qc warning
  top_mat_assay_labels <- format_spec |>
    dplyr::pull(
      .data[["top_matrix_assay_labels"]]
    ) |>
    unlist()

  if (!all(top_mat_assay_labels %in% df[2L, ])) {

    top_mat_assay_miss <- top_mat_assay_labels[ # nolint: object_usage_linter
      !(top_mat_assay_labels %in% df[2L, ])
    ]

    cli::cli_abort(
      message = c(
        "x" = "Row 2 of the top matrix with assay data in file {.file {file}}
        does not contain: {.val {top_mat_assay_miss}}",
        "i" = "Has the file been modified manually?"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  ## row 2 contains the correct number of internal controls ----

  # list of expected names of internal control assays
  int_ctrl_list <- format_spec |>
    dplyr::pull(
      .data[["top_matrix_assay_int_ctrl"]]
    ) |>
    unlist()

  panel_list <- df[1L, 2L:ncol(df)] |>
    as.character() |>
    unique()

  # expected combos of int ctrl and panels
  panel_int_ctrl_exp <- expand.grid(
    "panel" = panel_list,
    "int_ctrl" = int_ctrl_list
  ) |>
    dplyr::as_tibble() |>
    dplyr::mutate(
      dplyr::across(
        dplyr::everything(),
        ~ as.character(.x)
      )
    ) |>
    dplyr::mutate(
      int_ctrl_strip = strsplit(x = .data[["int_ctrl"]],
                                split = " ",
                                fixed = TRUE) |>
        lapply(utils::head, 2L) |>
        lapply(paste, collapse = " ") |>
        unlist()
    )

  # check for internal control assays
  int_ctrl_df_col <- colnames(df)[which(df[2L, ] %in% int_ctrl_list)]

  # run only if there are internal controls
  if (length(int_ctrl_df_col) > 0L) {

    # data frame of internal controls present
    int_ctrl_df <- df |>
      dplyr::select(
        dplyr::all_of(int_ctrl_df_col)
      ) |>
      dplyr::slice(
        1L:2L
      ) |>
      t() |>
      `colnames<-`(c("panel", "int_ctrl")) |>
      dplyr::as_tibble(
        .name_repair = "minimal"
      ) |>
      dplyr::mutate(
        in_df = TRUE
      )

    # intersect real with expected datasets
    int_ctrl_df_present <- panel_int_ctrl_exp |>
      dplyr::inner_join(
        int_ctrl_df,
        by = c("panel", "int_ctrl")
      )

    # missing internal controls
    int_ctrl_df_missing <- panel_int_ctrl_exp |>
      dplyr::anti_join(
        int_ctrl_df,
        by = c("panel", "int_ctrl")
      ) |>
      dplyr::filter(
        .data[["int_ctrl"]] == .data[["int_ctrl_strip"]]
      ) |>
      dplyr::anti_join(
        int_ctrl_df_present,
        by = c("panel", "int_ctrl_strip")
      )

    if (nrow(int_ctrl_df_missing) > 0L) {

      cli::cli_abort(
        message = c(
          "x" = "{cli::qty(unique(int_ctrl_df_missing$panel))} Panel{?s}
          {.val {unique(int_ctrl_df_missing$panel)}} {?is/are} missing one or
          more of the internal control assays
          {.val {unique(int_ctrl_df_missing$int_ctrl)}} from row 2 of the top
          matrix with assay data in file {.file {file}}!",
          "i" = "Has the file been modified manually?"
        ),
        call = rlang::caller_env(),
        wrap = FALSE
      )

    }

  }

  ## row 3 contains the correct number of deviation from internal controls ----

  # list of expected tags of deviations from internal controls
  dev_int_ctrl_tag <- format_spec |>
    dplyr::pull(
      .data[["top_matrix_assay_dev_int_ctrl"]]
    ) |>
    unlist()

  if (length(dev_int_ctrl_tag) > 0L) {

    # names of deviation from internal control assays
    dev_int_ctrl_list <- format_spec |>
      dplyr::pull(
        .data[["top_matrix_uniprot_dev_int_ctrl"]]
      ) |>
      unlist()

    panel_list <- df[1L, 2L:ncol(df)] |>
      as.character() |>
      unique()

    # expected combos of int ctrl and panels
    panel_dev_int_ctrl_exp <- expand.grid(
      "panel" = panel_list,
      "dev_int_ctrl" = dev_int_ctrl_list
    ) |>
      dplyr::as_tibble() |>
      dplyr::mutate(
        dplyr::across(
          dplyr::everything(),
          ~ as.character(.x)
        )
      ) |>
      dplyr::mutate(
        dev_int_ctrl_strip = strsplit(x = .data[["dev_int_ctrl"]],
                                      split = " ",
                                      fixed = TRUE) |>
          lapply(utils::head, 2L) |>
          lapply(paste, collapse = " ") |>
          unlist()
      )

    # check for deviation from internal control assays
    dev_int_ctrl_df_col <- colnames(df)[which(df[3L, ] %in% dev_int_ctrl_list)]

    # run only if there are deviations from internal controls
    if (length(dev_int_ctrl_df_col) > 0L) {

      # data frame of internal controls present
      dev_int_ctrl_df <- df |>
        dplyr::select(
          dplyr::all_of(dev_int_ctrl_df_col)
        ) |>
        dplyr::slice(
          c(1L, 3L)
        ) |>
        t() |>
        `colnames<-`(c("panel", "dev_int_ctrl")) |>
        dplyr::as_tibble(
          .name_repair = "minimal"
        ) |>
        dplyr::mutate(
          in_df = TRUE
        )

      # intersect real with expected datasets
      dev_int_ctrl_df_present <- panel_dev_int_ctrl_exp |>
        dplyr::inner_join(
          dev_int_ctrl_df,
          by = c("panel", "dev_int_ctrl")
        )

      # missing internal controls
      dev_int_ctrl_df_missing <- panel_dev_int_ctrl_exp |>
        dplyr::anti_join(
          dev_int_ctrl_df,
          by = c("panel", "dev_int_ctrl")
        ) |>
        dplyr::filter(
          .data[["dev_int_ctrl"]] == .data[["dev_int_ctrl_strip"]]
        ) |>
        dplyr::anti_join(
          dev_int_ctrl_df_present,
          by = c("panel", "dev_int_ctrl_strip")
        )

      if (nrow(dev_int_ctrl_df_missing) > 0L) {

        cli::cli_abort( # nolint: return_linter
          message = c(
            "x" = "{cli::qty(unique(int_ctrl_df_missing$panel))} Panel{?s}
            {.val {unique(dev_int_ctrl_df_missing$panel)}} {?is/are} missing one
            or more of the deviations from the internal control assays
            {.val {unique(dev_int_ctrl_df_missing$dev_int_ctrl)}} from row 3 of
            the top matrix with assay data in file {.file {file}}!",
            "i" = "Has the file been modified manually?"
          ),
          call = rlang::caller_env(),
          wrap = FALSE
        )

      }

    }

  }

}

#' Split the top matrix from Olink dataset in wide format.
#'
#' @description
#' The function splits the top matrix \var{df_top} into chunks of columns, each
#' of which contains separate information that will be combined with matching
#' chunks from \var{df_mid} to convert the wide dataset into a long one.
#'
#' @author
#'   Klev Diamanti
#'
#' @inheritParams read_npx_wide
#' @inheritParams read_npx_wide_split_row
#' @param df Top matrix of Olink dataset in wide format \var{df_top}.
#'
#' @return A list of data frames from top matrix in long format:
#' \itemize{
#'   \item Data frame containing Olink assays \var{df_top_oid}
#'   \item Data frame containing plate identifiers \var{df_top_pid}
#'   \item Data frame containing QC warnings \var{df_top_qc_warn}
#'   \item Data frame containing internal control assays \var{df_top_int_ctrl}
#'   \item Data frame containing deviation from internal control assays
#'   \var{df_top_dev_int_ctrl}
#' }
#'
#' @keywords internal
#'
#' @seealso
#'   \code{\link{read_npx_wide}}
#'   \code{\link{read_npx_wide_split_row}}
#'   \code{\link{read_npx_wide_npxs_version}}
#'   \code{\link{read_npx_wide_middle}}
#'   \code{\link{read_npx_wide_bottom}}
#'
read_npx_wide_top <- function(df,
                              file,
                              olink_platform,
                              format_spec) {

  # check input and top matrix ----

  read_npx_wide_check_top(
    df = df,
    file = file,
    format_spec = format_spec
  )

  # transpose df to long ----

  df_t <- t(df) # transpose the wide df
  colnames(df_t) <- df_t[1L, ] # add colnames
  df_t <- df_t |>
    dplyr::as_tibble(
      rownames = "col_index"
    ) |>
    dplyr::slice(
      2L:dplyr::n()
    )

  # we have checked in read_npx_wide_check_top that all expected columns based
  # on data_type are present

  # split df_t into its parts ----

  # extract customer assays from assay column
  df_top_oid <- df_t |>
    dplyr::filter(
      !is.na(.data[["OlinkID"]])
    )

  # extract plate_id and qc_warning from Assay column
  df_pid_qcw <- lapply(
    unlist(format_spec$top_matrix_assay_labels),
    function(x) {
      df_t |> # nolint: return_linter
        dplyr::filter(
          is.na(.data[["OlinkID"]])
          & .data[["Assay"]] %in% .env[["x"]]
        ) |>
        dplyr::select(
          -dplyr::any_of(c("Uniprot ID", "OlinkID", "Unit"))
        ) |>
        dplyr::rename(
          "Var" = "Assay"
        )
    }
  )
  names(df_pid_qcw) <- paste("df_top", names(df_pid_qcw), sep = "_")

  # extract internal control from Assay column
  df_top_int_ctrl <- df_t |>
    dplyr::filter(
      is.na(.data[["OlinkID"]])
      & .data[["Assay"]] %in% unlist(format_spec$top_matrix_assay_int_ctrl)
    )

  # extract deviation from internal controls from Assay column
  df_top_dev_int_ctrl <- df_t |>
    dplyr::filter(
      is.na(.data[["OlinkID"]])
      & .data[["Assay"]] %in% unlist(format_spec$top_matrix_assay_dev_int_ctrl)
    ) |>
    dplyr::select(
      -dplyr::any_of(c("OlinkID", "Unit"))
    )

  # checks ----

  ## check sum of rows ----

  if (nrow(df_t) != (nrow(df_top_oid)
                     + sapply(df_pid_qcw, nrow) |> sum()
                     + nrow(df_top_int_ctrl)
                     + nrow(df_top_dev_int_ctrl))) {

    top_mat_unknown_cols <- setdiff(df_t$col_index, # nolint: object_usage_linter
                                    c(df_top_oid$col_index,
                                      sapply(df_pid_qcw, \(x) x$col_index) |>
                                        unname() |>
                                        unlist(),
                                      df_top_int_ctrl$col_index,
                                      df_top_dev_int_ctrl$col_index))

    cli::cli_abort(
      message = c(
        "x" = "The top matrix with the assay data in file {.file {file}} in row
        `Assay` contains unrecognized values in columns:
        {.val {top_mat_unknown_cols}}!",
        "i" = "Has the file been modified manually?"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  ## check df_top_oid ----

  # no NAs are allowed in df_top_oid in any column other than "Uniprot ID"
  # the latter because the assay NT-proBNP does not have a Uniprot ID
  if (any(is.na(dplyr::select(df_top_oid, -dplyr::all_of("Uniprot ID"))))) {

    cli::cli_abort(
      message = c(
        "x" = "The top matrix with the assay data in file {.file {file}} expects
        no empty cells for assays other than internal controls. Identified
        {.val {sum(is.na(df_top_oid))}} empty cells!",
        "i" = "Has the file been modified manually?"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  # we will check number of assays only for Target 96 and 48 Olink platforms.
  # Other Olink platforms such as Flex and Focus allow a varying number of
  # assays, which does not allow us to check number of assays.
  expected_num_assays <- accepted_olink_platforms |>
    dplyr::filter(
      .data[["name"]] == .env[["olink_platform"]]
    ) |>
    dplyr::mutate(
      total_n = .data[["base_index"]] * length(unique(df_top_oid$Panel))
    ) |>
    dplyr::pull(
      .data[["total_n"]]
    )

  if (!is.na(expected_num_assays)
      && nrow(df_top_oid) != expected_num_assays) {

    cli::cli_abort(
      message = c(
        "x" = "Detected {.val {nrow(df_top_oid)}} assays in
        {.val {length(unique(df_top_oid$Panel))}} panels in file {.file {file}},
        but expected {.val {expected_num_assays}}!",
        "i" = "Has the file been modified manually?"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  ## check df_plate & df_qc_warn ----

  # when both elements of the list with required labels are present in the top
  # wide matrix, expect identical dimensions
  if (all(sapply(df_pid_qcw, nrow) > 0L)) {

    # check that that dimensions of the dfs in the list are identical
    if (sapply(df_pid_qcw, dim) |> unique(MARGIN = 2L) |> ncol()  != 1L) {

      cli::cli_abort(
        message = c(
          "x" = "Expected equal number of
          {.val {unlist(format_spec$top_matrix_assay_labels)}} columns in the
          top matrix with the assay data in file {.file {file}}!",
          "i" = "Has the file been modified manually?"
        ),
        call = rlang::caller_env(),
        wrap = FALSE
      )

    }

  }

  # return ----

  list_df_top <- append(x = df_pid_qcw,
                        values = list(df_top_oid = df_top_oid))

  # remove df with no rows
  list_df_top <- list_df_top[which(lapply(list_df_top, nrow) != 0L)]

  # add df_top_int_ctrl if not empty
  if (nrow(df_top_int_ctrl) != 0L) {
    list_df_top <- append(x = list_df_top,
                          values = list(df_top_int_ctrl = df_top_int_ctrl))
  }

  # add df_top_dev_int_ctrl if not empty
  if (nrow(df_top_dev_int_ctrl) != 0L) {
    list_df_top <- append(
      x = list_df_top,
      values = list(df_top_dev_int_ctrl = df_top_dev_int_ctrl)
    )
  }

  return(list_df_top)

}

#' Split the middle matrix from Olink dataset in wide format.
#'
#' @description
#' Use chunks of columns from \code{\link{read_npx_wide_top}} to split the
#' middle matrix \var{df_mid} into corresponding chunks of columns.
#'
#' @author
#'   Klev Diamanti
#'
#' @inheritParams read_npx_wide
#' @param df Middle matrix of Olink dataset in wide format \var{df_mid}.
#' @param col_names Names list of character vectors containing column names from
#' each chunk of columns \var{df_top} was split on in function.
#' \code{\link{read_npx_wide_top}}.
#'
#' @return A list of data frames (df_oid, df_pid, df_qc_warn and df_int_ctrl) in
#' long format from the middle matrix of an Olink wide file:
#' \itemize{
#'   \item Data frame containing measurements of Olink assays \var{df_mid_oid}
#'   \item Data frame containing plate identifiers \var{df_mid_pid}
#'   \item Data frame containing QC warnings \var{df_mid_qc_warn}
#'   \item Data frame containing measurements of internal control assays
#'   \var{df_mid_int_ctrl}
#'   \item Data frame containing measurements of deviations from internal
#'   control assays \var{df_mid_dev_int_ctrl}
#' }
#'
#' @keywords internal
#'
#' @seealso
#'   \code{\link{read_npx_wide}}
#'   \code{\link{read_npx_wide_split_row}}
#'   \code{\link{read_npx_wide_npxs_version}}
#'   \code{\link{read_npx_wide_top}}
#'   \code{\link{read_npx_wide_bottom}}
#'
read_npx_wide_middle <- function(df,
                                 file,
                                 data_type,
                                 col_names) {

  # check columns ----

  check_columns(df = df, col_list = list("V1"))

  # check unique sample identifiers ----

  n_uniq_sample <- dplyr::pull(df, .data[["V1"]]) |> unique() |> length()
  if (nrow(df) != n_uniq_sample) {

    cli::cli_inform(
      message = c(
        "i" = "The middle matrix in file {.file {file}} does not contain unique
        sample identifiers. Identified {.val {nrow(df) - n_uniq_sample}}
        duplicates!"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  # add a new column `rid` the will be used as a proxy to allow duplicated
  # sample identifiers as input.
  # Down stream we will be using SampleID and rid to match matrices.
  df <- df |>
    dplyr::mutate(
      rid = dplyr::row_number()
    )

  # check that all relevant columns exist ----

  check_columns(df = df,
                col_list = col_names |>
                  unlist() |>
                  unname() |>
                  (\(.x) c(.x, "rid"))() |> # adding rid to checked columns
                  as.list())

  # split datasets ----

  ## split assays and pivot to longer ----

  list_df_mid <- list()

  list_df_mid$df_mid_oid <- df |>
    dplyr::select(
      dplyr::all_of(c("V1", "rid", col_names$df_top_oid))
    ) |>
    dplyr::rename(
      "SampleID" = dplyr::all_of("V1")
    ) |>
    tidyr::pivot_longer(
      -dplyr::all_of(c("SampleID", "rid")),
      names_to = "col_index",
      values_to = data_type
    )

  ## split plates and pivot to longer ----

  list_df_mid$df_mid_plate <- df |>
    dplyr::select(
      dplyr::all_of(c("V1", "rid", col_names$df_top_plate))
    ) |>
    dplyr::rename(
      "SampleID" = dplyr::all_of("V1")
    ) |>
    tidyr::pivot_longer(
      -dplyr::all_of(c("SampleID", "rid")),
      names_to = "col_index",
      values_to = "PlateID"
    )

  ## split qc_warnings and pivot to longer ----

  # done only if the are columns with QC Warning
  if ("df_top_qc_warn" %in% names(col_names)) {

    list_df_mid$df_mid_qc_warn <- df |>
      dplyr::select(
        dplyr::all_of(c("V1", "rid", col_names$df_top_qc_warn))
      ) |>
      dplyr::rename(
        "SampleID" = dplyr::all_of("V1")
      ) |>
      tidyr::pivot_longer(
        -dplyr::all_of(c("SampleID", "rid")),
        names_to = "col_index",
        values_to = "QC_Warning"
      )

  }

  ## split internal_controls and pivot to longer ----

  # done only if the are columns with QC Warning
  if ("df_top_int_ctrl" %in% names(col_names)) {

    list_df_mid$df_mid_int_ctrl <- df |>
      dplyr::select(
        dplyr::all_of(c("V1", "rid", col_names$df_top_int_ctrl))
      ) |>
      dplyr::rename(
        "SampleID" = dplyr::all_of("V1")
      ) |>
      tidyr::pivot_longer(
        -dplyr::all_of(c("SampleID", "rid")),
        names_to = "col_index",
        values_to = data_type
      )
  }

  ## split deviation from internal_controls and pivot to longer ----

  # done only if the are columns with QC Warning
  if ("df_top_dev_int_ctrl" %in% names(col_names)) {

    list_df_mid$df_mid_dev_int_ctrl <- df |>
      dplyr::select(
        dplyr::all_of(c("V1", "rid", col_names$df_top_dev_int_ctrl))
      ) |>
      dplyr::rename(
        "SampleID" = dplyr::all_of("V1")
      ) |>
      tidyr::pivot_longer(
        -dplyr::all_of(c("SampleID", "rid")),
        names_to = "col_index",
        values_to = data_type
      )
  }

  # checks ---------

  ## check number of rows in plate and qc_warning data frames ----

  # done only if the are columns with QC Warning
  if ("df_mid_qc_warn" %in% names(list_df_mid)) {

    if (nrow(list_df_mid$df_mid_plate) != nrow(list_df_mid$df_mid_qc_warn)) {

      cli::cli_abort(
        message = c(
          "x" = "Uneven number of entries of \"Plate ID\" and \"QC Warning\" in
          the middle matrix of the Olink wide format file {.file {file}}!",
          "i" = "Has the file been modified manually?"
        ),
        call = rlang::caller_env(),
        wrap = FALSE
      )

    }

  }

  ## check if all cols from df_mid were consumed ----

  col_names_mid <- sapply(list_df_mid, function(x) x$col_index) |>
    unlist() |>
    unname() |>
    unique() |>
    (\(.x) c("V1", "rid", .x))() |>
    sort()

  if (!identical(x = colnames(df) |> sort(),
                 y = col_names_mid)) {

    col_mid_missing <- colnames(df)[!(colnames(df) %in% col_names_mid)]
    col_mid_missing <- sub(pattern = "V", replacement = "", x = col_mid_missing)

    cli::cli_abort(
      message = c(
        "x" = "Unable to assign {cli::qty(col_mid_missing)} column{?s}
        {.val {col_mid_missing}} from the Olink wide format file
        {.file {file}}!",
        "i" = "Has the file been modified manually?"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  # return ----

  return(list_df_mid)

}

#' Help function to extract Panel_Version from Panel column.
#'
#' @param df A tibble containing the column \var{Panel}.
#'
#' @return Same tibble as input with additional column \var{Panel_Version}.
#'
#' @keywords internal
#'
read_npx_wide_panel_version <- function(df) {

  # check columns ----

  check_columns(df = df,
                col_list = list("Panel"))

  # extarct Panel_Version and modify Panel ----

  panel_v <- df |>
    # add Panel_Version
    dplyr::mutate(
      # if else allows us to have Panel_Version NA when the pattern (v.X) is not
      # present
      Panel_Version = dplyr::if_else(
        grepl(pattern = "(", x = .data[["Panel"]], fixed = TRUE),
        strsplit(x = .data[["Panel"]],
                 split = "(",
                 fixed = TRUE) |>
          lapply(utils::tail, 1L) |>
          unlist() |>
          (\(x) {
            sub(pattern = ")", # nolint: return_linter
                replacement = "",
                x = x,
                fixed = TRUE)
          })(),
        NA_character_
      )
    ) |>
    # modify Panel
    dplyr::mutate(
      # if else allows us to have Panel unchanged when the pattern (v.X) is not
      # present
      Panel = dplyr::if_else(
        grepl(pattern = "(", x = .data[["Panel"]], fixed = TRUE),
        strsplit(x = .data[["Panel"]],
                 split = "(",
                 fixed = TRUE) |>
          lapply(utils::head, -1L) |>
          lapply(paste, collapse = "(") |>
          unlist(),
        .data[["Panel"]]
      )
    )

  return(panel_v)
}

#' Combine top and middle matrices in long format.
#'
#' @description
#' Combined corresponding chunks of columns from the top and middle matrix
#' that were computed from \code{\link{read_npx_wide_top}} and
#' \code{\link{read_npx_wide_middle}}, respectively.
#'
#' @author
#'   Klev Diamanti
#'
#' @inheritParams read_npx_wide
#' @inheritParams read_npx_wide_split_row
#' @param df_top_list List of data frames from the top matrix. Output of
#' function \code{\link{read_npx_wide_top}}.
#' @param df_middle_list List of data frames from the middle matrix. Output of
#' function \code{\link{read_npx_wide_middle}}.
#'
#' @return Tibble in long format combining the top and middle matrices.
#'
#' @keywords internal
#'
#' @seealso
#'   \code{\link{read_npx_wide_top}}
#'   \code{\link{read_npx_wide_middle}}
#'
red_npx_wide_top_mid_long <- function(df_top_list,
                                      df_middle_list,
                                      data_type,
                                      format_spec) {
  # prepare components of long df ----

  ## df oid ----

  df_long <- df_middle_list$df_mid_oid |>
    dplyr::left_join(
      df_top_list$df_top_oid,
      by = "col_index",
      relationship = "many-to-one"
    ) |>
    read_npx_wide_panel_version()

  ## df internal controls ----

  if ("df_mid_int_ctrl" %in% names(df_middle_list)
      && "df_top_int_ctrl" %in% names(df_top_list)) {

    df_int_ctrl <- df_middle_list$df_mid_int_ctrl |>
      dplyr::left_join(
        df_top_list$df_top_int_ctrl,
        by = "col_index",
        relationship = "many-to-one"
      ) |>
      read_npx_wide_panel_version()

    df_long <- df_long |>
      dplyr::bind_rows(
        df_int_ctrl
      )
    rm(df_int_ctrl)
  }

  ## df plate id ----

  df_plate <- df_middle_list$df_mid_plate |>
    dplyr::left_join(
      df_top_list$df_top_plate,
      by = "col_index",
      relationship = "many-to-one"
    ) |>
    dplyr::select(
      -dplyr::any_of(c("Var", "Unit", "col_index"))
    ) |>
    read_npx_wide_panel_version() |>
    dplyr::select(
      -dplyr::all_of("Panel_Version")
    )

  df_long <- df_long |>
    dplyr::left_join(
      df_plate,
      by = c("SampleID", "rid", "Panel"),
      relationship = "many-to-one"
    )
  rm(df_plate)

  ## df qc warning ----

  if (format_spec$has_qc_data == TRUE) {

    df_qc_warn <- df_middle_list$df_mid_qc_warn |>
      dplyr::left_join(
        df_top_list$df_top_qc_warn,
        by = "col_index",
        relationship = "many-to-one"
      ) |>
      dplyr::select(
        -dplyr::any_of(c("Var", "Unit", "col_index"))
      ) |>
      read_npx_wide_panel_version() |>
      dplyr::select(
        -dplyr::all_of("Panel_Version")
      )

    df_long <- df_long |>
      dplyr::left_join(
        df_qc_warn,
        by = c("SampleID", "rid", "Panel"),
        relationship = "many-to-one"
      )
    rm(df_qc_warn)
  }

  ## df deviation internal controls ----

  if ("df_mid_dev_int_ctrl" %in% names(df_middle_list)
      && "df_top_dev_int_ctrl" %in% names(df_top_list)) {

    df_dev_int_ctrl <- df_middle_list$df_mid_dev_int_ctrl |>
      dplyr::left_join(
        df_top_list$df_top_dev_int_ctrl,
        by = "col_index",
        relationship = "many-to-one"
      ) |>
      dplyr::select(
        -dplyr::all_of(c("Assay", "col_index"))
      ) |>
      tidyr::pivot_wider(
        id_cols = dplyr::all_of(c("SampleID", "rid", "Panel")),
        names_from = dplyr::all_of("Uniprot ID"),
        values_from = dplyr::all_of(data_type)
      ) |>
      read_npx_wide_panel_version() |>
      dplyr::select(
        -dplyr::all_of("Panel_Version")
      )

    df_long <- df_long |>
      dplyr::left_join(
        df_dev_int_ctrl,
        by = c("SampleID", "rid", "Panel"),
        relationship = "many-to-one"
      )
    rm(df_dev_int_ctrl)
  }

  # remove rid ----

  df_long <- df_long |>
    dplyr::select(
      -dplyr::all_of("rid")
    )

  # return ----

  return(df_long)
}

#' Additional checks of the bottom matrix of Olink dataset in wide format.
#'
#' @description
#' The rows included in the bottom matrix have evolved through the years. For us
#' to be able to support as many such versions as possible we have used the
#' local environment variable \var{olink_wide_bottom_matrix} to mark these
#' different versions. This function extract these version and allows us to
#' check the validity of the data.
#'
#' @author
#'   Klev Diamanti
#'
#' @inheritParams read_npx_wide_bottom
#'
#' @return Tibble with the bottom matrix specifications for the Olink wide file.
#'
#' @keywords internal
#'
#' @seealso
#'   \code{\link{read_npx_wide_bottom}}
#'
read_npx_wide_bottom_version <- function(df,
                                         file,
                                         data_type,
                                         olink_platform) {

  # extract all possible variable names from the global matrix
  format_spec_bottom <- olink_wide_bottom_matrix |>
    dplyr::filter(
      .data[["olink_platform"]] == .env[["olink_platform"]]
      & .data[["data_type"]] == .env[["data_type"]]
    )

  # number of unique versions of names
  format_spec_bottom_v <- format_spec_bottom$version |> unique()

  # list dfs to store possible combinations of names in V1 of bottom matrix
  list_bottom_v <- list()

  # return if 0 is the only available version
  if (length(format_spec_bottom_v) == 1L
      && format_spec_bottom_v == 0L) {

    list_bottom_v$`0` <- format_spec_bottom |>
      dplyr::select(
        dplyr::all_of("plate_specific"),
        dplyr::starts_with("variable")
      )

  } else {

    # if 0 is not the only version, then create a list of df with each element
    # containing one version merged with 0 version
    format_spec_bottom_v <- format_spec_bottom_v[format_spec_bottom_v != 0L]

    list_bottom_v <- lapply(format_spec_bottom_v,
                            function(x) {
                              format_spec_bottom |> # nolint: return_linter
                                dplyr::filter(
                                  .data[["version"]] %in% c(0L, x)
                                ) |>
                                dplyr::select(
                                  dplyr::all_of("plate_specific"),
                                  dplyr::starts_with("variable")
                                )
                            })
    names(list_bottom_v) <- format_spec_bottom_v

  }

  # list with all possible combinations
  format_spec_bottom <- list_bottom_v

  # check first column ----

  # check that column "V1" exists in the df
  check_columns(df = df, col_list = list("V1"))

  # check that at least one of the alternatives combinations of names
  # contains all names in V1
  format_spec_bottom <- lapply(format_spec_bottom, function(x) {
    name_in_df <- lapply(x$variable_alt_names,
                         \(y) (y[y %in% df$V1])) |> # nolint: return_linter
      lapply(\(y) (ifelse(length(y) == 0L, NA_character_, y))) |>
      unlist()

    x |> # nolint: return_linter
      dplyr::mutate(
        variable_name_in_df = name_in_df,
        in_df = dplyr::if_else(
          is.na(.data[["variable_name_in_df"]]),
          FALSE,
          TRUE
        )
      )
  })

  names_in_v1 <- lapply(format_spec_bottom, function(.x) {
    .x |> # nolint: return_linter
      dplyr::mutate(
        total_n = dplyr::n(),
        true_n = sum(.data[["in_df"]])
      ) |>
      dplyr::select(
        dplyr::all_of(c("total_n", "true_n"))
      ) |>
      dplyr::distinct()
  }) |>
    dplyr::bind_rows(
      .id = "combo"
    ) |>
    dplyr::filter(
      .data[["total_n"]] == .data[["true_n"]]
    ) |>
    dplyr::arrange(
      dplyr::desc(.data[["total_n"]])
    ) |>
    dplyr::slice_head(
      n = 1L
    ) |>
    dplyr::pull(
      .data[["combo"]]
    )

  # if none or multiple combinations match
  # or, if df$V1 is a superset of the expected columns
  if (length(names_in_v1) != 1L
      || nrow(format_spec_bottom[[names_in_v1]]) != length(unique(df$V1))) {

    bottom_mat_v1_expected <- sapply(
      format_spec_bottom,
      function(x) {
        sapply(x$variable_alt_names, utils::head, 1L) |> # nolint: return_linter
          cli::ansi_collapse() |>
          (\(.x) paste("*", .x))()
      }
    )

    cli::cli_abort(
      message = c(
        "x" = "Unexpected values in column 1 of the bottom matrix with QC data
        in file {.file {file}}.",
        "Expected one of the combos:",
        bottom_mat_v1_expected,
        "i" = "Has the file been modified manually?"
      ),
      call = rlang::caller_env(),
      wrap = FALSE
    )

  }

  format_spec_bottom[[names_in_v1]] <- format_spec_bottom[[names_in_v1]] |>
    dplyr::mutate(
      version = .env[["names_in_v1"]]
    )

  return(format_spec_bottom[[names_in_v1]])
}

#' Convert the bottom matrix from Olink dataset in wide format to long.
#'
#' @description
#' Use chunks of columns from \code{\link{read_npx_wide_top}} to covert the
#' bottom matrix \var{df_bottom} into a long format tibble.
#'
#' @author
#'   Klev Diamanti
#'
#' @inheritParams read_npx_wide
#' @inheritParams read_npx_wide_split_row
#' @inheritParams read_npx_wide_middle
#' @param df Bottom matrix of Olink dataset in wide format \var{df_bottom}.
#' @param df_plate_panel Tibble with unique combinations of panels and plates
#' from the combination of top and middle data frames.
#'
#' @return A tibble with the bottom matrix of an Olink wide file in long format.
#'
#' @keywords internal
#'
#' @seealso
#'   \code{\link{read_npx_wide}}
#'   \code{\link{read_npx_wide_split_row}}
#'   \code{\link{read_npx_wide_npxs_version}}
#'   \code{\link{read_npx_wide_top}}
#'   \code{\link{read_npx_wide_middle}}
#'
read_npx_wide_bottom <- function(df,
                                 file,
                                 olink_platform,
                                 data_type,
                                 col_names,
                                 format_spec,
                                 df_plate_panel) {
  # get first column options ----

  # clean up format_spec_bottom for downstream use
  format_spec_bottom_df <- read_npx_wide_bottom_version(
    df = df,
    file = file,
    data_type = data_type,
    olink_platform = olink_platform
  ) |>
    dplyr::select(
      -dplyr::all_of(c("variable_name", "variable_alt_names", "in_df"))
    )

  # keep necessary columns ----

  # columns expected to be present in the df
  expected_cols <- c("V1", col_names$df_top_oid)
  if ("df_top_int_ctrl" %in% names(col_names)) {
    expected_cols <- c(expected_cols, col_names$df_top_int_ctrl)
  }
  if (any(format_spec_bottom_df$plate_specific == TRUE)) {
    expected_cols <- c(expected_cols, col_names$df_top_plate)
  }
  # check that columns in expected_cols exist in the df
  check_columns(df = df, col_list = as.list(expected_cols))

  df <- df |>
    # keep only columns absolutely required:
    # V1 that contains names of variables
    # columns that contain info for customer assays
    # columns that contain info for internal control assays
    # columns that might contain plate names in case of plate-specific QC data
    dplyr::select(
      dplyr::all_of(expected_cols)
    )

  # per-plate metrics ----

  if (any(format_spec_bottom_df$plate_specific == TRUE)) {

    # plate specific qc metrics
    format_spec_bottom_plate_spec <- format_spec_bottom_df |>
      dplyr::filter(
        .data[["plate_specific"]] == TRUE
      ) |>
      dplyr::pull(
        .data[["variable_name_in_df"]]
      )

    # extract rows with plate-specific metrics
    df_plate_spec <- df |>
      # keep only rows with -plate-specific info
      dplyr::filter(
        .data[["V1"]] %in% format_spec_bottom_plate_spec
      ) |>
      remove_all_na_cols()

    # equal number of rows for each QC metric at bottom matrix
    df_plate_spec_n_row <- df_plate_spec |>
      dplyr::pull(
        .data[["V1"]]
      ) |>
      table() |>
      unname() |>
      unique() |>
      length()

    if (df_plate_spec_n_row != 1L) {

      cli::cli_abort(
        message = c(
          "x" = "Column 1 of the bottom matrix does not contain the same number
          of rows for plate-specific QC
          {cli::qty(format_spec_bottom_plate_spec)} measurement{?s}
          {.val {format_spec_bottom_plate_spec}} in file {.file {file}}!",
          "i" = "Has the file been modified manually?"
        ),
        call = rlang::caller_env(),
        wrap = FALSE
      )

    }

    # for each variable in V1 do a pivot_longer
    df_plate_spec <- lapply(
      format_spec_bottom_plate_spec,
      function(x) {
        df_plate_spec |> # nolint: return_linter
          # keep only one Vq variable at a time
          dplyr::filter(
            .data[["V1"]] == .env[["x"]]
          ) |>
          # remove V1 columns to allow for pivoting
          dplyr::select(
            -dplyr::all_of("V1")
          ) |>
          # make the wide matrix into a long one
          # each PlateID will become a column
          tidyr::pivot_longer(
            -dplyr::all_of(col_names$df_top_plate),
            names_to = "col_index",
            values_to = x
          ) |>
          # one more pivot longer to make PlateID into a single column
          tidyr::pivot_longer(
            -dplyr::all_of(c("col_index", x)),
            names_to = "col_index_pid",
            values_to = "PlateID"
          ) |>
          # remove the PlateID index column
          dplyr::select(
            -dplyr::all_of("col_index_pid")
          )
      }
    )

    # left join all data frames from the list
    df_plate_spec <- Reduce(f = function(df_1, df_2) {
      dplyr::left_join(x = df_1, # nolint: return_linter
                       y = df_2,
                       by = c("PlateID", "col_index"),
                       relationship = "one-to-one")
    },
    x = df_plate_spec)

  }

  # plates-shared metrics ----

  # plate shared qc metrics
  format_spec_bottom_plate_share <- format_spec_bottom_df |>
    dplyr::filter(
      .data[["plate_specific"]] == FALSE
    ) |>
    dplyr::pull(
      .data[["variable_name_in_df"]]
    )

  # remove rows processed earlier as plate-specific
  df_plate_shared <- df |>
    # keep only rows with plate-shared info
    dplyr::filter(
      .data[["V1"]] %in% format_spec_bottom_plate_share
    ) |>
    remove_all_na_cols()

  df_plate_shared <- t(df_plate_shared)
  colnames(df_plate_shared) <- df_plate_shared[1, ]
  df_plate_shared <- df_plate_shared |>
    dplyr::as_tibble(
      rownames = "col_index"
    ) |>
    dplyr::slice(
      2L:dplyr::n()
    )

  # join df_plate_shared and df_plate_spec ----

  if (exists("df_plate_spec")) {

    df_long <- df_plate_shared |>
      dplyr::full_join(
        y = df_plate_spec,
        by = "col_index",
        relationship = "one-to-many"
      ) |>
      # remove duplicates from the pivot_longer in PlateID
      dplyr::inner_join(
        df_plate_panel,
        by = c("col_index", "PlateID")
      ) |>
      dplyr::select(
        -dplyr::all_of(c("Panel"))
      )

  } else {

    df_long <- df_plate_shared

  }

  # output list_df ----

  # split into internal controls and assays
  if ("df_top_int_ctrl" %in% names(col_names)) {

    list_df <- list(
      df_bottom_oid = df_long |>
        dplyr::filter(
          .data[["col_index"]] %in% col_names$df_top_oid
        ),
      df_bottom_int_ctrl = df_long |>
        dplyr::filter(
          .data[["col_index"]] %in% col_names$df_top_int_ctrl
        )
    )

  } else {

    list_df <- list(
      df_bottom_oid = df_long
    )

  }

  return(list_df)

}

Try the OlinkAnalyze package in your browser

Any scripts or data that you put into this service are public.

OlinkAnalyze documentation built on June 24, 2026, 1:06 a.m.