R/which_pattern.R

#' Which regex pattern first matches string
#'
#' For each element in the `string` vector, this function finds which `pattern`
#' is the first to match it using `grepl`. Thus pattern order influences the results.
#'
#' @param string vector of strings to match against patterns
#' @param pattern vector of patterns to match strings
#'
#' @return Numeric index of the pattern to first match each string or NA if no match found.
#'
#' @examples
#' which_pattern(month.name, c("^J", "[A,a]", "[O,o]"))
#' # [1]  1  2  2  2  2  1  1  2 NA  3  3 NA
#'
#' which_pattern(month.name, c("[A,a]", "^J", "[O,o]"))
#' #[1]  1  1  1  1  1  2  2  1 NA  3  3 NA
#'
#' @export
which_pattern <- function(string, pattern) {
  if (length(string) == 0 | length(pattern) == 0) return(NULL)
  l <- sapply(pattern, function(p) if (is.na(p)) rep(FALSE, length(string)) else grepl(p, string),
              simplify = "matrix", USE.NAMES = FALSE)
  m <- matrix(unlist(l), nrow = length(string), ncol = length(pattern))
  apply(as.matrix(m), 1, function(x) c(which(x), NA_integer_)[[1]])
}

#' Get first matching pattern band
#'
#' For each element in the `string` vector, this function finds the name (or number)
#' of a collection of regex patterns --- "band" --- one of which was the first to match
#' the string element. Order of bands in the `bandlist` influences the results.
#' The function is useful when classifying character variables based on content.
#'
#' @param string vector of strings to match against patterns
#' @param bandlist list of pattern bands: <name> = <vector of applicable patterns>
#' @param as.factor if FALSE, return band names, not factors; otherwise if TRUE
#'
#' @return Vector of strings (or factors) of pattern "bands" that matched each string first,
#' numeric pattern index if no band names given.
#'
#' @examples
#' # In the following examples patterns should match as narrow as possible
#' # because the first match wins.
#' seasons <- list(
#'   Wi = c("De", "Ja", "Fe"),
#'   Sp = c("Ma", "Ap"),
#'   Su = c("Ju", "Au"),
#'   Au = c("Se", "Oc", "No")
#' )
#' patband(month.name, seasons, as.factor = TRUE)
#' # [1] Wi Wi Sp Sp Sp Su Su Su Au Au Au Wi
#' # Levels: Wi Sp Su Au
#'
#' quarters <- list(
#'   c("Ja", "Fe", "Mar"),
#'   c("Ap", "May", "Jun"),
#'   c("Jul", "Au", "Se"),
#'   c("Oc", "No", "De")
#'   )
#' patband(month.name, quarters)
#' # [1] 1 1 1 2 2 2 3 3 3 4 4 4
#'
#' @seealso [which_pattern()] for the underlying pattern matching behaviour.
#' @export
patband <- function(string, bandlist, as.factor = FALSE) {
  bandlist <- as.list(bandlist)
  if (is.null(names(bandlist))) {
    num <- seq(1, length.out = length(bandlist))
    out <- rep(num, sapply(bandlist, length))
    if (as.factor) out <- factor(out, levels = num)
  } else {
    out <- rep(names(bandlist), sapply(bandlist, length))
    if (as.factor) out <- factor(out, levels = names(bandlist))
  }
  out[which_pattern(string, Reduce(c, bandlist))]
}
avidclam/amxtra documentation built on May 17, 2019, 12:01 p.m.