R/operations_utils.R

Defines functions test_receptor_schema assert_receptor_schema make_receptor_schema to_sym annotate_tbl_regex annotate_tbl_distance make_pattern_columns check_seq_options make_seq_options

Documented in assert_receptor_schema make_receptor_schema make_seq_options test_receptor_schema

#' @title Build a `seq_options` list for sequence‑based receptor filtering
#'
#' @description
#' A convenience wrapper that validates the common arguments for
#' **`filter_receptors()`** and returns them in the required list form.
#'
#' @param query_col Character(1). Name of the receptor column to compare
#'   (e.g. `"cdr3_aa"`).
#' @param patterns  Character vector of sequences or regular expressions to
#'   search for.
#' @param method    One of `"exact"`, `"regex"`, `"lev"` (Levenshtein), or
#'   `"hamm"` (Hamming).  Defaults to `"exact"`.
#' @param max_dist  Numeric distance threshold for `"lev"` / `"hamm"`
#'   filtering.  Use `NA` (default) to keep all rows after annotation.
#' @param name_type Passed straight to `annotate_tbl_distance()`; either
#'   `"index"` (default) or `"pattern"`.
#'
#' @return A named list suitable for the `seq_options` argument of
#'   [filter_receptors()].
#'
#' @seealso [filter_receptors()], [annotate_receptors()]
#'
#' @concept utils
#' @export
make_seq_options <- function(query_col,
                             patterns,
                             method = c("exact", "lev", "hamm", "regex"),
                             max_dist = NA,
                             name_type = c("index", "pattern")) {
  checkmate::assert_character(query_col, len = 1)
  checkmate::assert_character(patterns, min.len = 1)

  list(
    query_col = query_col,
    patterns  = patterns,
    method    = match.arg(method),
    max_dist  = max_dist,
    name_type = match.arg(name_type)
  )
}

check_seq_options <- function(seq_options, mode = NULL) {
  checkmate::check_list(seq_options, null.ok = FALSE)
  checkmate::check_choice(mode, choices = c("filter", "mutate"), null.ok = FALSE)

  if (!is.null(seq_options$patterns) &&
    length(seq_options$patterns) > 0 &&
    !is.null(seq_options$query_col)) {
    defaults <- list(method = "exact", max_dist = NA, name_type = "index")

    seq_options <- utils::modifyList(defaults, seq_options)

    seq_options$method <- match.arg(seq_options$method, c("exact", "regex", "lev", "hamm"))

    if (mode == "filter" &&
      is.na(seq_options$max_dist) &&
      seq_options$method %in% c("lev", "hamm")) {
      cli::cli_abort("You passed `seq_options` to `filter`, but didn't provide `max_dist` for filtering. Either provide `max_dist` or use `left_join` to annotate receptors with distances to patterns.")
    }

    seq_options
  } else {
    cli::cli_abort("Missing fields in `seq_options`, please use {.run immundata::make_seq_options()} to create the options")
  }
}


make_pattern_columns <- function(patterns,
                                 col_prefix,
                                 name_type = c("pattern", "index")) {
  checkmate::assert_character(patterns, min.len = 1)
  checkmate::assert_character(col_prefix, max.len = 1)
  name_type <- match.arg(name_type)

  sapply(seq_along(patterns), function(p_index) {
    p_seq <- patterns[[p_index]]

    if (name_type == "pattern") {
      safe_name <- gsub("[^A-Za-z0-9]", "_", p_seq) # just in case
      col_name_out <- paste0(col_prefix, safe_name)
    } else if (name_type == "index") {
      col_name_out <- paste0(col_prefix, p_index)
    } else {
      # TODO: what the heck
      stop("!")
    }

    col_name_out
  })
}


#' @keywords internal
annotate_tbl_distance <- function(tbl_data,
                                  query_col,
                                  patterns,
                                  method = c("lev", "hamm"),
                                  max_dist = NA,
                                  name_type = c("pattern", "index")) {
  checkmate::assert_character(query_col, len = 1)
  checkmate::assert_character(patterns, min.len = 1)

  method <- match.arg(method)
  name_type <- match.arg(name_type)

  uniq <- tbl_data |>
    distinct(!!rlang::sym(query_col)) |>
    as_tbl()

  # TODO: settings for kmers
  if (!is.na(max_dist)) {
    uniq <- uniq |>
      mutate(
        kmer_left = dbplyr::sql(cli::format_inline("{query_col}[:3]")),
        kmer_right = dbplyr::sql(cli::format_inline("{query_col}[-2:]"))
      )
  }

  if (method == "lev") {
    col_prefix <- imd_schema("sim_lev")
  } else if (method == "hamm") {
    col_prefix <- imd_schema("sim_hamm")
  }

  dist_cols <- make_pattern_columns(
    patterns = patterns,
    col_prefix = col_prefix,
    name_type = name_type
  )

  # TODO: Optimize it via SQL instead of cycles - if it is even needed...
  # TODO: lump together multiple patterns in batches
  for (i in seq_along(patterns)) {
    p <- patterns[[i]]
    col_name_out <- dist_cols[i]

    #
    # 1) Levenshtein distance
    #
    if (method == "lev") {
      if (!is.na(max_dist)) {
        len_p <- nchar(p)
        sql_expr <- cli::format_inline(
          "CASE WHEN ",
          " kmer_left = {query_col}[:3] AND kmer_right = {query_col}[-2:] AND",
          " length({query_col}) >= {len_p - max_dist} AND length({query_col}) <= {len_p + max_dist}",
          " THEN levenshtein({query_col}, '{p}')",
          " ELSE NULL END"
        )
      } else {
        len_p <- nchar(p)
        sql_expr <- cli::format_inline(
          "levenshtein({query_col}, '{p}')"
        )
      }

      uniq <- uniq |>
        mutate({{ col_name_out }} := dbplyr::sql(sql_expr))
    }

    #
    # 2) Hamming distance
    #
    else {
      len_p <- nchar(p)
      sql_expr <- cli::format_inline(
        "CASE WHEN length({query_col}) = {len_p}",
        " THEN hamming({query_col}, '{p}')",
        " ELSE NULL END"
      )

      uniq <- uniq |>
        mutate({{ col_name_out }} := dbplyr::sql(sql_expr))
    }
  }

  #
  # TODO: In case of max_dist, pre-optimize for levenshtein by filtering out too short or too long distances
  # TODO: benchmark 1 - distinct vs no distinct
  # TODO: benchmark 2 - pre-optimize vs no optimize

  # TODO: fun experiment - compute for patterns, then filter out, then compute again, and so on.
  # filter out -> filter out those who has <= max_dist (!) because we already found them and just need to store

  # TODO: Benchmarks
  # 1) distinct vs non-distinct
  # 2) pre-optimize vs no optimization for levenshtein
  # 3) step-by-step filtering out "good" sequences
  # 4) precompute sequence length before (!) any filtering, on data loading, and don't compute it here

  # TODO: max dist. Left join - compute. Right join - filter

  if (is.na(max_dist)) {
    uniq <- uniq |>
      as_duckdb_tibble()
  } else {
    sql_expr <- sprintf("LEAST(%s) <= %d", paste(dist_cols, collapse = ", "), max_dist)

    uniq <- uniq |>
      filter(dbplyr::sql(sql_expr)) |>
      as_duckdb_tibble()
  }

  uniq |> compute()
}


#' @keywords internal
annotate_tbl_regex <- function(tbl_data,
                               query_col,
                               patterns,
                               filter_out = FALSE,
                               name_type = c("index", "pattern")) {
  checkmate::assert_character(query_col, len = 1)
  checkmate::assert_character(patterns, min.len = 1)
  checkmate::assert_logical(filter_out)

  name_type <- match.arg(name_type)

  uniq <- tbl_data |> distinct(!!rlang::sym(query_col))

  col_prefix <- imd_schema("sim_regex")

  dist_cols <- make_pattern_columns(
    patterns = patterns,
    col_prefix = col_prefix,
    name_type = name_type
  )

  # TODO: Optimize it via SQL instead of cycles - if it is even needed...
  for (i in seq_along(patterns)) {
    p <- patterns[[i]]
    col_name_out <- dist_cols[i]

    # annotate with DuckDB regexp_matches()
    uniq <- uniq |>
      mutate(!!col_name_out := dd$regexp_matches(!!rlang::sym(query_col), p))
  }

  tbl_data <- tbl_data |> left_join(uniq, by = query_col)
  if (filter_out) {
    # TODO: need to replace it with if_else when it is available in duckplyr
    sql_expr <- paste(dist_cols, collapse = " OR ")

    tbl_data |>
      as_tbl() |>
      filter(dbplyr::sql(sql_expr)) |>
      as_duckdb_tibble() |>
      compute() # TODO: We need a compute here because sometimes duckplyr can't find the table
  } else {
    tbl_data
  }
}


to_sym <- function(val) {
  if (length(val) == 1) {
    rlang::sym(val)
  } else {
    rlang::syms(val)
  }
}

#' @title Create or validate a receptor schema object
#'
#' @description
#' Helper functions for defining and validating the `schema` used by
#' [agg_receptors()] to identify unique receptors.
#'
#' `make_receptor_schema()` creates a schema list object.
#' `assert_receptor_schema()` checks if an object is a valid schema list and throws
#'   an error if not.
#' `test_receptor_schema()` checks if an object is a valid schema list or a
#'   character vector (which `agg_receptors` can also accept) and returns `TRUE`
#'   or `FALSE`.
#'
#' @param features Character vector. Column names defining the features of a
#'   single receptor chain (e.g., V gene, J gene, CDR3 sequence).
#' @param chains Optional character vector (max length 2). Locus names (e.g.,
#'   `"TRA"`, `"TRB"`) to filter by or pair. If `NULL` or length 1, only
#'   filtering occurs. If length 2, pairing logic is enabled in [agg_receptors()].
#'   Default: `NULL`.
#' @param schema An object to test or assert as a valid schema. Can be a list
#'   created by `make_receptor_schema` or a character vector (for `test_receptor_schema`).
#'
#' @return
#' `make_receptor_schema` returns a list with elements `features` and `chains`.
#' `assert_receptor_schema` returns `TRUE` invisibly if valid, or stops execution.
#' `test_receptor_schema` returns `TRUE` or `FALSE`.
#'
#' @rdname make_receptor_schema
#' @concept utils
#' @export
make_receptor_schema <- function(features, chains = NULL) {
  checkmate::check_character(features, min.len = 1)
  checkmate::check_character(chains, max.len = 2, null.ok = TRUE)

  list(features = features, chains = chains)
}


#' @rdname make_receptor_schema
#' @export
assert_receptor_schema <- function(schema) {
  # TODO: globals.R with schema list

  checkmate::assert(
    checkmate::check_character(schema, min.len = 1),
    checkmate::check_list(schema, len = 2, null.ok = FALSE) &&
      checkmate::check_names(names(schema), must.include = c("features", "chains"))
  )

  receptor_chains <- imd_receptor_chains(schema)

  if (!is.null(receptor_chains)) {
    # Validate chain syntax rules
    if (length(receptor_chains) > 2) {
      cli::cli_abort("Schema can have at most 2 chain elements. Found {length(receptor_chains)}: [{paste(receptor_chains, collapse=', ')}]")
    }

    if (length(receptor_chains) >= 1) {
      # Check first chain doesn't contain pipe
      if (grepl("\\|", receptor_chains[1])) {
        cli::cli_abort("The first chain in the schema cannot contain '|' character. Found: '{receptor_chains[1]}'. The OR syntax is only allowed in the second chain.")
      }
    }

    if (length(receptor_chains) == 2) {
      # Check if second chain contains OR syntax (|)
      if (grepl("\\|", receptor_chains[2])) {
        # Split and validate the alternatives
        relaxed_chain_alternatives <- trimws(unlist(strsplit(receptor_chains[2], "\\|")))

        # Validate the alternatives
        if (length(relaxed_chain_alternatives) != 2) {
          cli::cli_abort("Relaxed pairing syntax requires exactly 2 alternatives separated by '|'. Found {length(relaxed_chain_alternatives)} in '{receptor_chains[2]}'")
        }

        # Check for empty alternatives
        if (any(relaxed_chain_alternatives == "")) {
          cli::cli_abort("Empty chain name found in '{receptor_chains[2]}'. Both alternatives must be valid chain names.")
        }

        # Check for duplicate alternatives
        if (length(unique(relaxed_chain_alternatives)) != length(relaxed_chain_alternatives)) {
          cli::cli_abort("Duplicate chain names found in '{receptor_chains[2]}'. Alternatives must be different.")
        }

        # Check that alternatives are different from the required chain
        if (receptor_chains[1] %in% relaxed_chain_alternatives) {
          cli::cli_abort("The required chain '{receptor_chains[1]}' cannot also be an alternative in '{receptor_chains[2]}'")
        }
      } else {
        # Strict pairing - check for accidental spaces or typos
        if (grepl("[\\s,;]", receptor_chains[2])) {
          cli::cli_warn("Found potential separator characters in '{receptor_chains[2]}'. For relaxed pairing, use the pipe character '|' to separate alternatives (e.g., 'IGL|IGK')")
        }
      }
    }
  }

  TRUE
}


#' @rdname make_receptor_schema
#' @export
test_receptor_schema <- function(schema) {
  checkmate::test_character(schema, min.len = 1) || (
    checkmate::test_list(schema, len = 2, null.ok = FALSE) &&
      checkmate::test_subset(names(schema), c("features", "chains"), empty.ok = FALSE)
  )
}

Try the immundata package in your browser

Any scripts or data that you put into this service are public.

immundata documentation built on April 4, 2026, 9:09 a.m.