# R/RcppExports.R
# In AhoCorasickTrie: Fast Searching for Multiple Keywords in Multiple Texts
#
# Documented in AhoCorasickSearch AhoCorasickSearchList

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' Fast searching for one or more keywords in a list of texts
#'
#' @param keywords Character vector of one or more keywords
#' @param textList List of lists, each sublist with one or more texts to search
#' @param alphabet Alphabet to use; one of \code{ascii}, \code{aminoacid}, or \code{nucleicacid}
#' @param groupByKeyword If true, matches are grouped by keyword (instead of by text)
#' @param iterationFeedback When set to a positive integer \code{i}, console output will indicate when searching every \code{i}th text
#' @return List of lists of matches, grouped by either text or by keyword (each list of texts gets its own list of matches)
#' @description Builds an Aho-Corasick trie from one or more keywords and uses it to search a list of
#'   one or more texts. For a large number of keywords, Aho-Corasick is much faster
#'   than a naive approach (such as \code{lapply(keywords, gregexpr, text)}).
#'
#'   Use \code{\link{AhoCorasickSearchList}} instead of \code{\link{AhoCorasickSearch}} when you want to keep the matches
#'   of each input sublist separate. If the sublists of the input list have names, the resulting list of lists
#'   will use those names, but sublists with no matches will still be in the resulting list.
#'   If the texts of the sublists have names, the resulting sublists of matches will use
#'   those names, and the texts with no matches will be dropped. If the input texts do
#'   not have names, then the resulting sublists of matches will be in the same order as the
#'   input texts, and non-matched texts will be kept to preserve that order. Thus, it is more
#'   efficient to use named input texts (so non-matched texts can be dropped).
#'
#'   The default alphabet allows all 128 ASCII characters in the keywords and the texts.
#'   Characters outside this range will cause an error. A more efficient trie is possible
#'   if the alphabet size can be reduced. For example, DNA sequences use at most 19 distinct
#'   characters and usually only 4; protein sequences use at most 26 distinct characters and
#'   usually only 20. Set the \code{alphabet} parameter if a reduced alphabet is appropriate.
#'
#'   UTF-8 (Unicode) matching is not currently supported.
#' @seealso
#' \itemize{
#' \item \href{https://www.codeproject.com/Articles/12383/Aho-Corasick-string-matching-in-C}{Aho-Corasick string matching in C#} for the article this package is based on
#' \item \code{Biostrings matchPDict} for a more memory efficient, but DNA-only, implementation of the algorithm
#' }
#' @examples
#' listEquals = function(a, b) { is.null(unlist(a)) && is.null(unlist(b)) ||
#'                               !is.null(a) && !is.null(b) && all(unlist(a) == unlist(b)) }
#' keywords = c("Abra", "cadabra", "is", "the", "Magic", "Word")
#'
#' # 1. Search a list of lists without names
#' # * sublists are accessed by index
#' # * texts are accessed by index
#' # * non-matched texts are kept (input index order is preserved)
#' listSearch = AhoCorasickSearchList(keywords,
#'                                    list(c("What in", "the world"),
#'                                         c("is"),
#'                                         "secret about",
#'                                         "the Magic Word?"))
#' stopifnot(listEquals(listSearch[[1]][[1]], list()))
#' stopifnot(listEquals(listSearch[[1]][[2]][[1]], list(keyword="the", offset=1)))
#' stopifnot(listEquals(listSearch[[2]][[1]][[1]], list(keyword="is", offset=1)))
#' stopifnot(listEquals(listSearch[[3]], list()))
#' stopifnot(listEquals(listSearch[[4]][[1]][[1]], list(keyword="the", offset=1)))
#' stopifnot(listEquals(listSearch[[4]][[1]][[2]], list(keyword="Magic", offset=5)))
#' stopifnot(listEquals(listSearch[[4]][[1]][[3]], list(keyword="Word", offset=11)))
#'
#' # 2. Search a named list of named lists
#' # * sublists are accessed by name
#' # * matched texts are accessed by name
#' # * non-matched texts are dropped
#' namedSearch = AhoCorasickSearchList(keywords,
#'                                     list(subject=c(phrase1="What in", phrase2="the world"),
#'                                          verb=c(phrase1="is"),
#'                                          predicate1=c(phrase1="secret about"),
#'                                          predicate2=c(phrase1="the Magic Word?")))
#' stopifnot(listEquals(namedSearch$subject$phrase2[[1]], list(keyword="the", offset=1)))
#' stopifnot(listEquals(namedSearch$verb$phrase1[[1]], list(keyword="is", offset=1)))
#' stopifnot(listEquals(namedSearch$predicate1, list()))
#' stopifnot(listEquals(namedSearch$predicate2$phrase1[[1]], list(keyword="the", offset=1)))
#' stopifnot(listEquals(namedSearch$predicate2$phrase1[[2]], list(keyword="Magic", offset=5)))
#' stopifnot(listEquals(namedSearch$predicate2$phrase1[[3]], list(keyword="Word", offset=11)))
#' @export
AhoCorasickSearchList <- function(keywords,
                                  textList,
                                  alphabet = "ascii",
                                  groupByKeyword = FALSE,
                                  iterationFeedback = 0L) {
    # Thin R-level entry point: the five arguments are passed through,
    # unchanged and in order, to the native routine named below. PACKAGE
    # confines the symbol lookup to the AhoCorasickTrie shared library.
    # Whatever the native routine returns is returned to the caller as-is.
    .Call(
        "_AhoCorasickTrie_AhoCorasickSearchList",
        keywords,
        textList,
        alphabet,
        groupByKeyword,
        iterationFeedback,
        PACKAGE = "AhoCorasickTrie"
    )
}

#' Fast searching for one or more keywords in one or more texts
#'
#' @param text Character vector of one or more texts to search
#' @inheritParams AhoCorasickSearchList
#' @return List of matches, grouped by either text or by keyword
#' @description Builds an Aho-Corasick trie from one or more keywords and uses it to
#'   search one or more texts. For a large number of keywords, Aho-Corasick is much faster
#'   than a naive approach (such as \code{lapply(keywords, gregexpr, text)}).
#'
#'   Use \code{\link{AhoCorasickSearchList}} instead of \code{\link{AhoCorasickSearch}} when the texts are grouped into
#'   sublists and the matches of each sublist should be kept separate. If the input texts have names, the resulting list of matches will include those
#'   names and non-matched texts will be excluded from the results. If the input texts do
#'   not have names, then the resulting list of matches will be in the same order as the
#'   input texts, and non-matched texts will be kept to preserve that order. Thus, it is more
#'   efficient to use named input texts (so non-matched texts can be dropped).
#'
#'   The default alphabet allows all 128 ASCII characters in the keywords and the texts.
#'   Characters outside this range will cause an error. A more efficient trie is possible
#'   if the alphabet size can be reduced. For example, DNA sequences use at most 19 distinct
#'   characters and usually only 4; protein sequences use at most 26 distinct characters and
#'   usually only 20. Set the \code{alphabet} parameter if a reduced alphabet is appropriate.
#'
#'   UTF-8 (Unicode) matching is not currently supported.
#' @seealso
#' \itemize{
#' \item \href{https://www.codeproject.com/Articles/12383/Aho-Corasick-string-matching-in-C}{Aho-Corasick string matching in C#} for the article this package is based on
#' \item \code{Biostrings matchPDict} for a more memory efficient, but DNA-only, implementation of the algorithm
#' }
#' @examples
#' listEquals = function(a, b) { is.null(unlist(a)) && is.null(unlist(b)) ||
#'                               !is.null(a) && !is.null(b) && all(unlist(a) == unlist(b)) }
#'
#' # 1. Search for multiple keywords in a single text
#' keywords = c("Abra", "cadabra", "is", "the", "Magic", "Word")
#' oneSearch = AhoCorasickSearch(keywords, "Is Abracadabra the Magic Word?")
#' stopifnot(listEquals(oneSearch[[1]][[1]], list(keyword="Abra", offset=4)))
#' stopifnot(listEquals(oneSearch[[1]][[2]], list(keyword="cadabra", offset=8)))
#' stopifnot(listEquals(oneSearch[[1]][[3]], list(keyword="the", offset=16)))
#' stopifnot(listEquals(oneSearch[[1]][[4]], list(keyword="Magic", offset=20)))
#' stopifnot(listEquals(oneSearch[[1]][[5]], list(keyword="Word", offset=26)))
#'
#' # 2. Search multiple named texts in a named list with keyword grouping and aminoacid alphabet
#' # * all matches to a keyword are accessed by name
#' # * non-matched keywords are dropped
#' proteins = c(protein1="PEPTIDEPEPTIDEDADADARARARARAKEKEKEKEPEPTIDE",
#'              protein2="DERPADERPAPEWPEWPEEPEERAWRAWWARRAGTAGPEPTIDEKESEQUENCE")
#' peptides = c("PEPTIDE", "DERPA", "SEQUENCE", "KEKE", "PEPPIE")
#'
#' peptideSearch = AhoCorasickSearch(peptides, proteins, alphabet="aminoacid", groupByKeyword=TRUE)
#' stopifnot(listEquals(peptideSearch$PEPTIDE, list(list(keyword="protein1", offset=1),
#'                                                  list(keyword="protein1", offset=8),
#'                                                  list(keyword="protein1", offset=37),
#'                                                  list(keyword="protein2", offset=38))))
#' stopifnot(listEquals(peptideSearch$DERPA, list(list(keyword="protein2", offset=1),
#'                                                list(keyword="protein2", offset=6))))
#' stopifnot(listEquals(peptideSearch$SEQUENCE, list(list(keyword="protein2", offset=47))))
#' stopifnot(listEquals(peptideSearch$KEKE, list(list(keyword="protein1", offset=29),
#'                                               list(keyword="protein1", offset=31),
#'                                               list(keyword="protein1", offset=33))))
#' stopifnot(listEquals(peptideSearch$PEPPIE, NULL))
#'
#' # 3. Grouping by keyword without text names: offsets are given without reference to the text
#' names(proteins) = NULL
#' peptideSearch = AhoCorasickSearch(peptides, proteins, groupByKeyword=TRUE)
#' stopifnot(listEquals(peptideSearch$PEPTIDE, list(1, 8, 37, 38)))
#' stopifnot(listEquals(peptideSearch$DERPA, list(1, 6)))
#' stopifnot(listEquals(peptideSearch$SEQUENCE, list(47)))
#' stopifnot(listEquals(peptideSearch$KEKE, list(29, 31, 33)))
#' @export
AhoCorasickSearch <- function(keywords,
                              text,
                              alphabet = "ascii",
                              groupByKeyword = FALSE,
                              iterationFeedback = 0L) {
    # Thin R-level entry point: the five arguments are passed through,
    # unchanged and in order, to the native routine named below. PACKAGE
    # confines the symbol lookup to the AhoCorasickTrie shared library.
    # Whatever the native routine returns is returned to the caller as-is.
    .Call(
        "_AhoCorasickTrie_AhoCorasickSearch",
        keywords,
        text,
        alphabet,
        groupByKeyword,
        iterationFeedback,
        PACKAGE = "AhoCorasickTrie"
    )
}