R/searchSeq.R

Defines functions findSeq searchSeq

Documented in findSeq searchSeq

#' Search for a sequence
#'
#' Search for one or more amino acid or junction CDR3 sequences in a study
#' tibble.
#'
#' @param study_table A tibble generated by the LymphoSeq2 functions
#' [readImmunoSeq()] or [productiveSeq()].  "junction_aa" or "junction",
#' "duplicate_frequency", and "duplicate_count" are required columns.
#' @param sequence A character vector of one ore more amino acid or junction
#' CDR3 sequences to search.
#' @param seq_type A character vector specifying the type of sequences to be
#' searched.  Available options are "junction_aa" or "junction".
#' @param edit_distance An integer giving the minimum edit distance that the
#' sequence must be less than or equal to.  See details below.
#' @param match A string indicating the type of sequence matching to perform.
#' Acceptable values are `"global"` and `"partial"`. See details below.
#' @details An exact partial match means the searched sequence is contained
#' within target sequence.  An exact global match means the searched sequence is
#' identical to the target sequence.
#'
#' Edit distance is a way of quantifying how dissimilar two sequences
#' are to one another by counting the minimum number of operations required to
#' transform one sequence into the other.  For example, an edit distance of 0
#' means the sequences are identical and an edit distance of 1 indicates that
#' the sequences different by a single amino acid or junction.
#' @return Returns the rows for every instance in the list of data frames where
#' the searched sequence(s) appeared.
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing",
#'  package = "LymphoSeq2")
#' study_table <- LymphoSeq2::readImmunoSeq(path = file_path, threads = 1)
#' study_table <- LymphoSeq2::topSeqs(study_table, top = 100)
#' aa1 <- "CASSPVSNEQFF"
#' aa2 <- "CASSQEVPPYQAFF"
#' LymphoSeq2::searchSeq(
#'   study_table = study_table,
#'   sequence = aa1,
#'   seq_type = "junction_aa",
#'   edit_distance = 0,
#'   match = "global"
#' )
#' LymphoSeq2::searchSeq(
#'   study_table = study_table,
#'   sequence = c(aa1, aa2),
#'   seq_type = "junction_aa",
#'   edit_distance = 0,
#'   match = "global"
#' )
#' LymphoSeq2::searchSeq(
#'   study_table = study_table,
#'   sequence = aa1,
#'   seq_type = "junction_aa",
#'   edit_distance = 1,
#'   match = "global"
#' )
#' nt <- "CTGATTCTGGAGTCCGCCAGCACCAACCAGACATCTATGTACCTCTGTGCCAGCAGTCCGGTAAGCAATGAGCAGTTCTTCGGGCCA"
#' LymphoSeq2::searchSeq(
#'   study_table = study_table,
#'   sequence = nt,
#'   seq_type = "junction",
#'   edit_distance = 3,
#'   match = "global"
#' )
#' LymphoSeq2::searchSeq(
#'   study_table = study_table,
#'   sequence = "CASSPVS",
#'   seq_type = "junction_aa",
#'   edit_distance = 0,
#'   match = "global"
#' )
#' LymphoSeq2::searchSeq(
#'   study_table = study_table,
#'   sequence = nt,
#'   seq_type = "junction",
#'   edit_distance = 0,
#'   match = "global"
#' )
#' @export
searchSeq <- function(study_table,
                      sequence,
                      seq_type = "junction",
                      edit_distance = 0,
                      match = "global") {
  query_list <- study_table |>
    dplyr::filter(!is.na(!!base::as.symbol(seq_type))) |>
    dplyr::pull(!!base::as.symbol(seq_type))
  search_tables <- sequence |>
    purrr::map(~ findSeq(.x, query_list, edit_distance, seq_type, match)) |>
    dplyr::bind_rows()
  search_tables <- dplyr::left_join(study_table, search_tables,
      by = stats::setNames(nm = seq_type)) |>
    dplyr::filter(!is.na(edit_distance))
  return(search_tables)
}

#' Find sequences of interest
#'
#' @describeIn searchSeq Find all sequences below edit distance threshold from
#'  query list.
#' @param query_list List of productive CDR3 nucleotide or amino acid sequences
#' @return Tibble of sequences that differ from the input sequence by the
#'  edit distance threshold provided
findSeq <- function(sequence, query_list, edit_distance, seq_type, match) {
  if (match == "global") {
    partial <- FALSE
  } else if (match == "partial") {
    partial <- TRUE
  }
  edist <- utils::adist(sequence, query_list, partial = partial)
  match_list <- query_list[(edist <= edit_distance)]
  edist_list <- edist[(edist <= edit_distance)]
  sequence_table <- tibble::tibble(
    c1 = match_list,
    c2 = edist_list,
    c3 = sequence,
    .name_repair = ~ c(seq_type, "edit_distance", "searchSequence")
  ) |>
    dplyr::filter(!is.na(!!base::as.symbol(seq_type))) |>
    dplyr::distinct()
  return(sequence_table)
}
shashidhar22/LymphoSeq2 documentation built on Jan. 16, 2024, 4:29 a.m.