R/locate.R

Defines functions invert_match str_locate_all str_locate location

Documented in invert_match str_locate str_locate_all

location <- function(x, all = FALSE) {
  start <- as.vector(x)
  if (all && identical(start, -1L)) {
    return(cbind(start = integer(), end = integer()))
  }

  end <- as.vector(x) + attr(x, "match.length") - 1L

  no_match <- start == -1L
  start[no_match] <- NA_integer_
  end[no_match] <- NA_integer_

  cbind(start = start, end = end)
}

#' Locate the position of patterns in a string
#'
#' @description Vectorised over `string` and `pattern`, though using vectorised
#' patterns is relatively slow compared to `stringr`. If the match is of length
#' 0, (e.g. from a special match like `$`) end will be one character less
#' than start.
#'
#' @inheritParams str_detect
#' @return For `str_locate()`, an `integer matrix`. First column gives start
#'   postion of match, and second column gives end position. For
#'   `str_locate_all()` a `list` of integer matrices.
#' @seealso
#'   [str_extract()] for a convenient way of extracting matches.
#'
#' @examples
#' fruit <- c("apple", "banana", "pear", "pineapple")
#' str_locate(fruit, "$")
#' str_locate(fruit, "a")
#' str_locate(fruit, "e")
#' str_locate("apple", c("a", "b", "p", "p"))
#'
#' str_locate_all(fruit, "a")
#' str_locate_all(fruit, "e")
#'
#' # Find location of every character
#' str_locate_all(fruit, "")
#' @export
str_locate <- function(string, pattern) {
  check_lengths(string, pattern)

  if (length(pattern) > 1) {
    out <- mapply(
      function(p, s) {
        regexpr(p, s,
                fixed = is_fixed(p),
                perl = is_perl(p),
                ignore.case = ignore_case(p))
      },
      pattern,
      string,
      SIMPLIFY = FALSE,
      USE.NAMES = FALSE
    )
    do.call(rbind, lapply(out, location))

  } else {
    out <- regexpr(pattern, string,
                   fixed = is_fixed(pattern),
                   perl = is_perl(pattern),
                   ignore.case = ignore_case(pattern))
    location(out)
  }
}

#' @rdname str_locate
#' @export
str_locate_all <- function(string, pattern) {
  check_lengths(string, pattern)

  if (length(pattern) > 1) {
    out <- mapply(
      function(p, s) {
        gregexpr(p, s,
                 fixed = is_fixed(p),
                 perl = is_perl(p),
                 ignore.case = ignore_case(p))
      },
      pattern,
      string,
      SIMPLIFY = TRUE,
      USE.NAMES = FALSE
    )
    lapply(out, function(x) do.call(rbind, lapply(x, location, all = TRUE)))

  } else {
    out <- gregexpr(pattern, string,
                    fixed = is_fixed(pattern),
                    perl = is_perl(pattern),
                    ignore.case = ignore_case(pattern))
    lapply(out, location, all = TRUE)
  }
}

#' Switch location of matches to location of non-matches.
#'
#' @description Invert a matrix of match locations to match the opposite of
#' what was previously matched.
#'
#' @param loc `numeric matrix` of match locations, as from [str_locate_all()]
#' @return Returns a `numeric matrix` giving locations of non-matches.
#' @export
#' @examples
#' numbers <- "1 and 2 and 4 and 456"
#' and_loc <- str_locate_all(numbers, "and")[[1]]
#' str_sub(numbers, and_loc[, "start"], and_loc[, "end"])
#'
#' num_loc <- invert_match(and_loc)
#' str_sub(numbers, num_loc[, "start"], num_loc[, "end"])
invert_match <- function(loc) {
  cbind(
    start = c(0L, loc[, "end"] + 1L),
    end = c(loc[, "start"] - 1L, -1L)
  )
}
csdaw/stringrb documentation built on Aug. 13, 2022, 10:55 p.m.