R/searchSeq.R
In LymphoSeq: Analyze high-throughput sequencing of T and B cell receptors

Documented in searchSeq

#' Search for a sequence
#' 
#' Search for one or more amino acid or nucleotide CDR3 sequences in a list of 
#' data frames.
#' 
#' @param list A list of data frames generated by the LymphoSeq functions readImmunoSeq 
#' or productiveSeq.  "aminoAcid" or "nucleotide", "frequencyCount", and 
#' "count" are required columns.
#' @param sequence A character vector of one ore more amino acid or nucleotide 
#' CDR3 sequences to search.
#' @param type A character vector specifying the type of sequence(s) to be 
#' searched.  Available options are "aminoAcid" or "nucleotide". 
#' @param match A character vector specifying whether an exact partial or exact global
#'  match of the searched sequence(s) is desired.  Available options are 
#' "partial" and "global".
#' @param editDistance An integer giving the minimum edit distance that the 
#' sequence must be less than or equal to.  See details below.
#' @details An exact partial match means the searched sequence is contained within 
#' target sequence.  An exact global match means the searched sequence is identical to 
#' the target sequence.
#' 
#' Edit distance is a way of quantifying how dissimilar two sequences 
#' are to one another by counting the minimum number of operations required to 
#' transform one sequence into the other.  For example, an edit distance of 0 
#' means the sequences are identical and an edit distance of 1 indicates that 
#' the sequences different by a single amino acid or nucleotide.
#' @return Returns the rows for every instance in the list of data frames where 
#' the searched sequence(s) appeared.
#' @examples
#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
#' 
#' file.list <- readImmunoSeq(path = file.path)
#' 
#' aa1 <- "CASSPVSNEQFF"
#' 
#' aa2 <- "CASSQEVPPYQAFF"
#' 
#' searchSeq(list = file.list, sequence = aa1, type = "aminoAcid", 
#'    match = "global", editDistance = 0)
#' 
#' searchSeq(list = file.list, sequence = c(aa1, aa2), 
#'    type = "aminoAcid", match = "global", editDistance = 0)
#' 
#' searchSeq(list = file.list, sequence = aa1, type = "aminoAcid", editDistance = 1)
#' 
#' nt <- "CTGATTCTGGAGTCCGCCAGCACCAACCAGACATCTATGTACCTCTGTGCCAGCAGTCCGGTAAGCAATGAGCAGTTCTTCGGGCCA"
#' 
#' searchSeq(list = file.list, sequence = nt, type = "nucleotide", editDistance = 3)
#' 
#' searchSeq(list = file.list, sequence = "CASSPVS", type = "aminoAcid", 
#'    match = "partial", editDistance = 0)
#' 
#' searchSeq(list = file.list, sequence = nt, type = "nucleotide", editDistance = 0)
#' @export
#' @importFrom plyr llply
#' @importFrom utils adist
searchSeq <- function(list, sequence, type = "aminoAcid", match = "global", editDistance = 0) {
    if (editDistance >= 1) {
        merged.results <- data.frame()
        i <- 1
        for (i in 1:length(list)) {
            file <- list[[i]]
            ed <- adist(sequence, file[, type])
            rownames(ed) <- sequence
            colnames(ed) <- file[, type]
            ed.index <- which(ed <= editDistance, arr.ind = TRUE)
            if (nrow(ed.index) >= 1) {
                ed.subset <- as.integer()
                j <- 1
                for (j in 1:nrow(ed.index)) {
                    ed.j <- ed[ed.index[j, "row"], ed.index[j, "col"]]
                    ed.subset <- c(ed.subset, ed.j)
                }
                results <- data.frame(searchSequnece = rownames(ed)[ed.index[, 1]], 
                                      foundSequnece = colnames(ed)[ed.index[, 2]], 
                                      editDistance = ed.subset)
                results <- results[!duplicated(results), ]
                search <- plyr::llply(list[i], function(x) 
                    x[which(x[, type] %in% results$foundSequnece), ])
                found <- NULL
                k <- 1
                for (k in 1:length(search)) {
                    if (nrow(search[[k]]) != 0) {
                        search[[k]] <- cbind(rep(names(search)[k], nrow(search[[k]])), 
                                             search[[k]])
                        colnames(search[[k]])[1] <- "sample"
                        found <- rbind(found, search[[k]])
                    }
                }
                names(found)[which(names(found) == type)] <- "foundSequnece"
                merged.search <- merge(results, found)
                merged.results <- rbind(merged.results, merged.search)
                merged.results <- merged.results[c("sample", setdiff(names(merged.results), "sample"))]
            }
        }
        if (nrow(merged.results) >= 1) {
            merged.results$foundSequnece = as.character(merged.results$foundSequnece)
            merged.results$searchSequnece = as.character(merged.results$searchSequnece)
            merged.results = merged.results[order(merged.results$frequencyCount, decreasing = TRUE),]
            rownames(merged.results) = NULL
            return(merged.results)
        } else {
            cat("No sequences found.")
        }
    } else {
        if (match == "global") {
            search <- plyr::llply(list, function(x) 
                x[which(x[, type] %in% sequence), ])
        }
        if (match == "partial") {
            search <- plyr::llply(list, function(x) 
                x[grep(paste(sequence, collapse = "|"), x[, type]), ])
        }
        found <- NULL
        l <- 1
        for (l in 1:length(search)) {
            if (nrow(search[[l]]) != 0) {
                search[[l]] <- cbind(rep(names(search)[l], nrow(search[[l]])), search[[l]])
                colnames(search[[l]])[1] <- "sample"
                found <- rbind(found, search[[l]])
            }
        }
        if (is.null(found)) {
            message("No sequences found.")
        } else {
            found = found[order(found$frequencyCount, decreasing = TRUE),]
            rownames(found) = NULL
            return(found)
        }
    }
}