packFinder: de novo Annotation of Pack-TYPE Transposable Elements

Documented in identifyTirMatches

#' @title
#' Identify Terminal Inverted Repeat Matches
#'
#' @description
#' Searches a \code{\link[Biostrings:XStringSet-class]{DNAStringSet}} 
#' for potential TIRs based on sequence similarity.
#'
#' @param tirSeq 
#' A \code{\link[Biostrings:DNAString-class]{DNAString}} 
#' object to be searched for.
#'
#' @param Genome
#' A \code{\link[Biostrings:XStringSet-class]{DNAStringSet}} 
#' object containing the 
#' \code{\link[Biostrings:DNAString-class]{DNAString}} 
#' objects to be searched.
#'
#' @param mismatch
#' The allowable mismatch between \code{tirSeq} and a 
#' given slice of \code{Genome}. Includes indels.
#'
#' @param strand
#' The directionality of the search string ("+" or "-"). Note
#' that this does affect the search for tirSeqs, if you wish 
#' to search the reverse strand you should use the reverse 
#' complement of your sequence. 
#'
#' @param tsdLength
#' Integer referring to the length of the flanking TSD region.
#' 
#' @param fixed
#' Logical that will be passed to the `fixed` argument of 
#' \code{\link[Biostrings]{matchPattern}}. Determines the behaviour of IUPAC
#' ambiguity codes when searching for TIR sequences.
#'
#' @details 
#' Called by \code{\link{packSearch}}. Used by 
#' \code{\link{packSearch}} as an initial filtering stage. 
#' \code{\link[Biostrings]{matchPattern}} from Biostrings 
#' is used for pattern matching. It is recommended to use 
#' the general pipeline function \code{\link{packSearch}} 
#' for identification of potential pack elements, however 
#' each stage may be called individually.
#'
#' @return
#' A dataframe, \code{tirMatches}, containing identified 
#' matches. The dataframe is in the format generated by 
#' \code{\link{packSearch}}.
#' 
#' @seealso 
#' \code{\link[Biostrings:XStringSet-class]{DNAStringSet}},
#' \code{\link{packSearch}}, \code{\link[Biostrings]{matchPattern}},
#' \code{\link[Biostrings:DNAString-class]{DNAString}} 
#' 
#' @examples
#' data(arabidopsisThalianaRefseq)
#' 
#' forwardMatches <- identifyTirMatches(
#'     Biostrings::DNAString("CACTACAA"),
#'     arabidopsisThalianaRefseq,
#'     tsdLength = 3,
#'     strand = "+"
#' )
#' 
#' @author
#' Jack Gisby
#'
#' @export

identifyTirMatches <- function(tirSeq, Genome, mismatch = 0, strand = "*", 
                               tsdLength, fixed=TRUE) {
    if (strand != "-" & strand != "+") {
        stop("Argument 'strand' must be specified as '-' or '+'")
    }
    
    tirMatches <- initialisePackMatches()
    
    # get tir matches
    for (i in seq_len(length(Genome))) {

        matches <- Biostrings::matchPattern(
            tirSeq,
            Genome[[i]],
            max.mismatch = mismatch,
            with.indels = TRUE,
            fixed = fixed
        )
        
        if (length(matches) > 0) {
            tirMatches <- rbind(tirMatches, data.frame(
                seqnames = names(Genome)[i],
                start = GenomicRanges::start(matches),
                end = GenomicRanges::start(matches) + 
                    GenomicRanges::width(matches) - 1,
                width = GenomicRanges::width(matches),
                strand = strand
            ))
        }
    }
    
    # remove matches whose TSD sequences do not exist (index range)
    if (strand == "-") {
        removeMatch <- mapply(function(end, seqnames, tsdLength, Genome) {
            seq <- Genome[names(Genome) == seqnames][[1]]
            return((end + tsdLength) > length(seq))
        },
        tirMatches$end,
        tirMatches$seqnames,
        MoreArgs = list(tsdLength, Genome)
        )
        tirMatches <- tirMatches[removeMatch == FALSE, ]
    } else if (strand == "+") {
        tirMatches <- tirMatches[tirMatches$start > tsdLength, ]
    }
    
    return(tirMatches)
}