R/searchSeq.R

Defines functions findSeq searchSeq

Documented in findSeq searchSeq

#' Search for a sequence
#' 
#' Search for one or more amino acid or junction CDR3 sequences in a study tibble.
#' 
#' @param study_table A tibble generated by the LymphoSeq2 functions readImmunoSeq 
#' or productiveSeq.  "junction_aa" or "junction", "duplicate_frequency", and 
#' "duplicate_count" are required columns.
#' @param sequence A character vector of one ore more amino acid or junction 
#' CDR3 sequences to search.
#' @param seq_type A character vector specifying the type of sequence(s) to be 
#' searched.  Available options are "junction_aa" or "junction". 
#' @param edit_distance An integer giving the minimum edit distance that the 
#' sequence must be less than or equal to.  See details below.
#' @param match A string indicating the type of sequence matching to perform. 
#' Acceptable values are "global" and "partial". See details below.
#' @details An exact partial match means the searched sequence is contained within 
#' target sequence.  An exact global match means the searched sequence is identical to 
#' the target sequence.
#' 
#' Edit distance is a way of quantifying how dissimilar two sequences 
#' are to one another by counting the minimum number of operations required to 
#' transform one sequence into the other.  For example, an edit distance of 0 
#' means the sequences are identical and an edit distance of 1 indicates that 
#' the sequences different by a single amino acid or junction.
#' @return Returns the rows for every instance in the list of data frames where 
#' the searched sequence(s) appeared.
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq2
#' stable <- readImmunoSeq(path = file_path)
#' aa1 <- "CASSPVSNEQFF"
#' aa2 <- "CASSQEVPPYQAFF"
#' searchSeq(study_table = stable, 
#'           sequence = aa1, 
#'           seq_type = "junction_aa", 
#'           edit_distance = 0)
#' searchSeq(study_table = stable, 
#'           sequence = c(aa1, aa2), 
#'           seq_type = "junction_aa", 
#'           edit_distance = 0)
#' searchSeq(study_table = stable, 
#'           sequence = aa1, 
#'           seq_type = "junction_aa", 
#'           edit_distance = 1)
#' nt <- "CTGATTCTGGAGTCCGCCAGCACCAACCAGACATCTATGTACCTCTGTGCCAGCAGTCCGGTAAGCAATGAGCAGTTCTTCGGGCCA"
#' searchSeq(study_table = stable,
#'           sequence = nt,
#'           seq_type = "junction",
#'           edit_distance = 3)
#' searchSeq(study_table = stable,
#'           sequence = "CASSPVS",
#'           seq_type = "junction_aa",
#'           edit_distance = 0)
#' searchSeq(study_table = study_table,
#'           sequence = nt,
#'           seq_type = "junction",
#'           edit_distance = 0)
#' @export
#' @import tidyverse
searchSeq <- function(study_table, sequence, seq_type = "junction", edit_distance = 0, match = "global") {
    query_list <- study_table %>% 
                  dplyr::filter(!is.na(!!base::as.symbol(seq_type))) %>%
                  dplyr::pull(!!base::as.symbol(seq_type))
    search_tables <- sequence %>% 
                     purrr::map(~findSeq(.x, query_list, edit_distance, seq_type, match)) %>% 
                     dplyr::bind_rows()
    search_tables <- dplyr::left_join(study_table, search_tables, by = stats::setNames(nm = seq_type)) %>% 
                     dplyr::filter(!is.na(edit_distance))
    return(search_tables)
}

#' Find sequences of interest
#' 
#' @describeIn searchSeq Find all sequences below edit distance threshold from query list
#' 
#' @inheritParams searchSeq
findSeq <- function(sequence, query_list, edit_distance, seq_type, match){
    if (match == "global") {
        partial = FALSE
    } else if (match == "partial") {
        partial = TRUE
    }
    edist <- utils::adist(sequence, query_list, partial = partial)
    match_list <- query_list[(edist <= edit_distance)]
    edist_list <- edist[(edist <= edit_distance)]
    sequence_table <- tibble::tibble(c1 = match_list, 
                                     c2 = edist_list, 
                                     c3 = sequence, 
                                     .name_repair= ~ c(seq_type, "edit_distance", "searchSequence")) %>%
                      dplyr::filter(!is.na(!!base::as.symbol(seq_type))) %>% 
                      dplyr::distinct()
    return(sequence_table)
}
elulu3/LymphoSeqTest documentation built on Aug. 27, 2022, 5:47 a.m.