R/model_to_regex.R

#' Build a regular expression to fit chosen parameters
#' @param model pattern that a word should match. Consists of letters and
#'   unknown characters specifications. Dot \code{.} stands for unknown
#'   character. It may be followed by \code{\{...\}} repetition quantifier (i.e.
#'   \code{.\{n\}}, \code{.\{n,\}}, \code{.\{n,m\}}). Asterisk \code{*} stands
#'   for unknown number of unknown characters. See examples.\cr By default
#'   \code{model} is set to \code{"*"}.
#' @param allow characters allowed to fill gaps in a word. Can be listed in a
#'   single string or in a vector. By default is set to \code{letters}.
#' @param ban characters not allowed to fill gaps in a word.
#' @param type can be \code{"usual"}, \code{"scrabble"}, or \code{"anagram"}.
#'   Abbreviated input is allowed: e.g. \code{"u"}, \code{"s"}, or
#'   \code{"a"}.\cr \code{type} defines how often allowed characters can be used
#'   to fill the gaps. Say, character appears \code{n} times in \code{allow} and
#'   \code{m} times in \code{ban}. If \code{d = n - m} is less or equal to zero,
#'   whatever the \code{type} is, this character won't be used to fill the gaps.
#'   For the case when \code{d > 0}:\cr \itemize{ \item If \code{type} is
#'   \code{"usual"} then the character is allowed to fill the gaps
#'   \strong{unlimited} number of times. \cr \item If \code{type} is
#'   \code{"scrabble"} then the character is allowed to fill the gaps \strong{no
#'   more} than d times. \item If \code{type} is \code{"anagram"} then the
#'   character should be used \strong{exactly} d times. }
#'
#' @section Warning: If \code{type = "scrabble"} or \code{"anagram"}, output
#'   regex will contain perl-like syntax. So, to use it in \code{grep} or
#'   \code{gsub} for example, set \code{perl} parameter to \code{TRUE}.
#'
#' @examples
#' ## Regular expression to match all the 5-letter words starting with "c".
#' model_to_regex("c.{4}")
#' ## Disallow "a" and "b".
#' model_to_regex("c.{4}", ban = "ab")
#' ## Allow only "a" and "b" to fill the gap.
#' model_to_regex("c.{4}", allow = "ab")
#' ## Allow "a", "b", and "c", but then ban "c" (result is the same as the previous example)
#' model_to_regex("c.{4}", allow = "abc", ban = "c")
#'
#' ## Regex to match all words that start with "p" and end with "zed".
#' model_to_regex("p*zed")
#'
#' ## Regex to match all the words that can be constructed of the word "thing".
#' model_to_regex(allow = "thing", type = "scrabble")
#' ## Get at lest 4-letter words.
#' model_to_regex(".{4,}", allow = "thing", type = "scrabble")
#'
#' ## Regex to match anagrams of the word "thing"
#' model_to_regex(allow = "thing", type = "anagram")
#'
#' @seealso \code{\link{find_word}}, \code{\link{scrabble}},
#'   \code{\link{anagram}}
#'
#' @export
model_to_regex <- function(model = "*", allow = letters, ban = character(0),
                           type = "usual"){
    ## to use magrittr with no notes from CRAN check
    . <- NULL
    ## to use dplyr
    char <- max <- look.ahead <- NULL

    check_model(model)
    type <- match.arg(type, c("usual", "scrabble", "anagram"))

    allow.count <- char_count(allow)
    ban.count <- char_count(ban)
    names(allow.count) <- c("char", "allow")
    names(ban.count) <- c("char", "ban")
    char.spec <- merge(allow.count, ban.count, all.x = T)
    char.spec[is.na(char.spec)] <- 0

    if (type == "usual"){
        char.spec$max <- char.spec$allow & !char.spec$ban
    }else{
        char.spec$max <- pmax(char.spec$allow - char.spec$ban, 0)
    }
    char.spec <- char.spec[, c("char", "max")]
    char.spec <- char.spec[char.spec$max > 0, ]

    chars.r <- paste(char.spec$char, collapse = "") %>%
        paste0("[", ., "]")
    regex <- model %>%
        gsub("\\.", chars.r, .) %>%
        gsub("\\*", paste0(chars.r, "*"), .) %>%
        paste0("^", ., "$")
    if (type != "usual"){
        model.char.spec <- char_count(model) %>%
            filter(!grepl("\\*|\\.|,|\\{|\\}|\\d", char))
        names(model.char.spec) <- c("char", "max")
        char.spec %<>% rbind(model.char.spec) %>%
            dplyr::group_by(char) %>%
            dplyr::summarise(max = sum(max)) %>%
            dplyr::ungroup()
        char.spec %<>% dplyr::mutate(
            look.ahead = paste0("([^", char, "]*", char, "[^", char, "]*)"))
    }
    switch (type,
        scrabble = {
            char.spec %<>% dplyr::mutate(
                look.ahead = paste0("(?=^((", look.ahead, "{", 1, ",", max,
                                    "})|([^", char, "]*))$)"))
        },
        anagram = {
            char.spec %<>% dplyr::mutate(
                look.ahead = paste0("(?=^", look.ahead, "{", max, "}$)"))
        }
    )
    regex <- paste(char.spec$look.ahead, collapse = "") %>% paste0(regex)
    regex
}

check_model <- function(model){
    model.msg <- "Wrong model. Model should be a single string with following characters allowed:
    '.' possibly followed by {...} repetition quantifier, lowercase letters and '*'."
    model.regex <- "^([a-z]|\\*|(\\.(\\{[0-9]*,?[0-9]*\\})?))*$"
    if (!is.character(model)) stop(model.msg)
    if (length(model) != 1)     stop(model.msg)
    if (!grepl(model.regex, model)) stop(model.msg)
}

Try the wfindr package in your browser

Any scripts or data that you put into this service are public.

wfindr documentation built on May 2, 2019, 12:20 p.m.