R/dplyr-funcs-string.R
In arrow: Integration to 'Apache' 'Arrow'

Documented in contains_regex get_stringr_pattern_options

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# String function helpers

#' Get `stringr` pattern options
#'
#' This function assigns definitions for the `stringr` pattern modifier
#' functions (`fixed()`, `regex()`, etc.) inside itself, and uses them to
#' evaluate the quoted expression `pattern`, returning a list that is used
#' to control pattern matching behavior in internal `arrow` functions.
#'
#' @param pattern Unevaluated expression containing a call to a `stringr`
#' pattern modifier function
#'
#' @return List containing elements `pattern`, `fixed`, and `ignore_case`
#' @keywords internal
get_stringr_pattern_options <- function(pattern) {
  fixed <- function(pattern, ignore_case = FALSE, ...) {
    check_dots(...)
    list(pattern = pattern, fixed = TRUE, ignore_case = ignore_case)
  }
  regex <- function(pattern, ignore_case = FALSE, ...) {
    check_dots(...)
    list(pattern = pattern, fixed = FALSE, ignore_case = ignore_case)
  }
  coll <- function(...) {
    arrow_not_supported("Pattern modifier `coll()`")
  }
  boundary <- function(...) {
    arrow_not_supported("Pattern modifier `boundary()`")
  }
  check_dots <- function(...) {
    dots <- list(...)
    if (length(dots)) {
      warning(
        "Ignoring pattern modifier ",
        ngettext(length(dots), "argument ", "arguments "),
        "not supported in Arrow: ",
        oxford_paste(names(dots)),
        call. = FALSE
      )
    }
  }

  ensure_opts <- function(opts) {
    # default options for the simple cases
    if (is.character(opts)) {
      opts <- list(pattern = opts, fixed = FALSE, ignore_case = FALSE)
    }
    opts
  }

  pattern_expr <- clean_pattern_namespace(quo_get_expr(pattern))
  pattern <- quo_set_expr(pattern, pattern_expr)

  ensure_opts(eval_tidy(pattern, data = list(regex = regex, coll = coll, boundary = boundary, fixed = fixed)))
}

# Ensure that e.g. stringr::regex and regex both work within patterns
clean_pattern_namespace <- function(pattern) {
  modifier_funcs <- c("fixed", "regex", "coll", "boundary")
  if (is_call(pattern, modifier_funcs, ns = "stringr")) {
    function_called <- call_name(pattern[1])
    pattern[1] <- call2(function_called)
  }

  pattern
}

#' Does this string contain regex metacharacters?
#'
#' @param string String to be tested
#' @keywords internal
#' @return Logical: does `string` contain regex metacharacters?
contains_regex <- function(string) {
  grepl("[.\\|()[{^$*+?]", string)
}

# format `pattern` as needed for case insensitivity and literal matching by RE2
format_string_pattern <- function(pattern, ignore.case, fixed) {
  # Arrow lacks native support for case-insensitive literal string matching and
  # replacement, so we use the regular expression engine (RE2) to do this.
  # https://github.com/google/re2/wiki/Syntax
  if (ignore.case) {
    if (fixed) {
      # Everything between "\Q" and "\E" is treated as literal text.
      # If the search text contains any literal "\E" strings, make them
      # lowercase so they won't signal the end of the literal text:
      pattern <- gsub("\\E", "\\e", pattern, fixed = TRUE)
      pattern <- paste0("\\Q", pattern, "\\E")
    }
    # Prepend "(?i)" for case-insensitive matching
    pattern <- paste0("(?i)", pattern)
  }
  pattern
}

# format `replacement` as needed for literal replacement by RE2
format_string_replacement <- function(replacement, ignore.case, fixed) {
  # Arrow lacks native support for case-insensitive literal string
  # replacement, so we use the regular expression engine (RE2) to do this.
  # https://github.com/google/re2/wiki/Syntax
  if (ignore.case && fixed) {
    # Escape single backslashes in the regex replacement text so they are
    # interpreted as literal backslashes:
    replacement <- gsub("\\", "\\\\", replacement, fixed = TRUE)
  }
  replacement
}

# Currently, Arrow does not supports a locale option for string case conversion
# functions, contrast to stringr's API, so the 'locale' argument is only valid
# for stringr's default value ("en"). The following are string functions that
# take a 'locale' option as its second argument:
#   str_to_lower
#   str_to_upper
#   str_to_title
#
# Arrow locale will be supported with ARROW-14126
stop_if_locale_provided <- function(locale) {
  if (!identical(locale, "en")) {
    arrow_not_supported(
      "Providing a value for 'locale' other than the default ('en')",
      body = c(">" = "To change locale, use 'Sys.setlocale()'")
    )
  }
}


# Split up into several register functions by category to satisfy the linter
register_bindings_string <- function() {
  register_bindings_string_join()
  register_bindings_string_regex()
  register_bindings_string_other()
}

register_bindings_string_join <- function() {
  arrow_string_join_function <- function(null_handling, null_replacement = NULL) {
    # the `binary_join_element_wise` Arrow C++ compute kernel takes the separator
    # as the last argument, so pass `sep` as the last dots arg to this function
    function(...) {
      args <- lapply(list(...), function(arg) {
        # handle scalar literal args, and cast all args to string for
        # consistency with base::paste(), base::paste0(), and stringr::str_c()
        if (!inherits(arg, "Expression")) {
          if (length(arg) != 1) {
            arrow_not_supported(
              "Literal vectors of length != 1 in string concatenation"
            )
          }
          Expression$scalar(as.character(arg))
        } else {
          call_binding("as.character", arg)
        }
      })
      Expression$create(
        "binary_join_element_wise",
        args = args,
        options = list(
          null_handling = null_handling,
          null_replacement = null_replacement
        )
      )
    }
  }

  register_binding(
    "base::paste",
    function(..., sep = " ", collapse = NULL, recycle0 = FALSE) {
      if (!is.null(collapse)) {
        arrow_not_supported("`collapse` argument")
      }
      if (!inherits(sep, "Expression") && is.na(sep)) {
        validation_error("Invalid separator")
      }
      arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., sep)
    },
    notes = "the `collapse` argument is not yet supported"
  )

  register_binding(
    "base::paste0",
    function(..., collapse = NULL, recycle0 = FALSE) {
      if (!is.null(collapse)) {
        arrow_not_supported("`collapse` argument")
      }
      arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., "")
    },
    notes = "the `collapse` argument is not yet supported"
  )

  register_binding(
    "stringr::str_c",
    function(..., sep = "", collapse = NULL) {
      if (!is.null(collapse)) {
        arrow_not_supported("`collapse` argument")
      }
      if (!inherits(sep, "Expression") && is.na(sep)) {
        validation_error("`sep` must be a single string, not `NA`.")
      }
      arrow_string_join_function(NullHandlingBehavior$EMIT_NULL)(..., sep)
    },
    notes = "the `collapse` argument is not yet supported"
  )

  register_binding("base::strrep", function(x, times) {
    Expression$create("binary_repeat", x, times)
  })

  register_binding("stringr::str_dup", function(string, times) {
    Expression$create("binary_repeat", string, times)
  })
}

register_bindings_string_regex <- function() {
  create_string_match_expr <- function(arrow_fun, string, pattern, ignore_case) {
    out <- Expression$create(
      arrow_fun,
      string,
      options = list(pattern = pattern, ignore_case = ignore_case)
    )
  }

  register_binding("base::grepl", function(pattern,
                                           x,
                                           ignore.case = FALSE,
                                           fixed = FALSE) {
    arrow_fun <- ifelse(fixed, "match_substring", "match_substring_regex")
    out <- create_string_match_expr(
      arrow_fun,
      string = x,
      pattern = pattern,
      ignore_case = ignore.case
    )
    call_binding("if_else", call_binding("is.na", out), FALSE, out)
  })


  register_binding("stringr::str_detect", function(string, pattern, negate = FALSE) {
    opts <- get_stringr_pattern_options(enquo(pattern))
    arrow_fun <- ifelse(opts$fixed, "match_substring", "match_substring_regex")
    out <- create_string_match_expr(arrow_fun,
      string = string,
      pattern = opts$pattern,
      ignore_case = opts$ignore_case
    )
    if (negate) {
      out <- !out
    }
    out
  })

  register_binding(
    "stringr::str_like",
    function(string, pattern, ignore_case = TRUE) {
      Expression$create(
        "match_like",
        string,
        options = list(pattern = pattern, ignore_case = ignore_case)
      )
    }
  )

  register_binding(
    "stringr::str_count",
    function(string, pattern) {
      opts <- get_stringr_pattern_options(enquo(pattern))
      if (!is.string(pattern)) {
        arrow_not_supported("`pattern` must be a length 1 character vector; other values")
      }
      arrow_fun <- ifelse(opts$fixed, "count_substring", "count_substring_regex")

      Expression$create(
        arrow_fun,
        string,
        options = list(pattern = opts$pattern, ignore_case = opts$ignore_case)
      )
    },
    notes = "`pattern` must be a length 1 character vector"
  )

  register_binding("base::startsWith", function(x, prefix) {
    Expression$create(
      "starts_with",
      x,
      options = list(pattern = prefix)
    )
  })

  register_binding("base::endsWith", function(x, suffix) {
    Expression$create(
      "ends_with",
      x,
      options = list(pattern = suffix)
    )
  })

  register_binding("stringr::str_starts", function(string, pattern, negate = FALSE) {
    opts <- get_stringr_pattern_options(enquo(pattern))
    if (opts$fixed) {
      out <- call_binding("startsWith", x = string, prefix = opts$pattern)
    } else {
      out <- create_string_match_expr(
        arrow_fun = "match_substring_regex",
        string = string,
        pattern = paste0("^", opts$pattern),
        ignore_case = opts$ignore_case
      )
    }
    if (negate) {
      out <- !out
    }
    out
  })

  register_binding("stringr::str_ends", function(string, pattern, negate = FALSE) {
    opts <- get_stringr_pattern_options(enquo(pattern))
    if (opts$fixed) {
      out <- call_binding("endsWith", x = string, suffix = opts$pattern)
    } else {
      out <- create_string_match_expr(
        arrow_fun = "match_substring_regex",
        string = string,
        pattern = paste0(opts$pattern, "$"),
        ignore_case = opts$ignore_case
      )
    }
    if (negate) {
      out <- !out
    }
    out
  })

  # Encapsulate some common logic for sub/gsub/str_replace/str_replace_all
  arrow_r_string_replace_function <- function(max_replacements) {
    function(pattern, replacement, x, ignore.case = FALSE, fixed = FALSE) {
      if (length(pattern) != 1) {
        validation_error("`pattern` must be a length 1 character vector")
      }
      if (length(replacement) != 1) {
        validation_error("`replacement` must be a length 1 character vector")
      }
      Expression$create(
        ifelse(fixed && !ignore.case, "replace_substring", "replace_substring_regex"),
        x,
        options = list(
          pattern = format_string_pattern(pattern, ignore.case, fixed),
          replacement = format_string_replacement(replacement, ignore.case, fixed),
          max_replacements = max_replacements
        )
      )
    }
  }

  arrow_stringr_string_replace_function <- function(max_replacements) {
    force(max_replacements)
    function(string, pattern, replacement) {
      opts <- get_stringr_pattern_options(enquo(pattern))
      arrow_r_string_replace_function(max_replacements)(
        pattern = opts$pattern,
        replacement = replacement,
        x = string,
        ignore.case = opts$ignore_case,
        fixed = opts$fixed
      )
    }
  }

  arrow_stringr_string_remove_function <- function(max_replacements) {
    force(max_replacements)
    function(string, pattern) {
      opts <- get_stringr_pattern_options(enquo(pattern))
      arrow_r_string_replace_function(max_replacements)(
        pattern = opts$pattern,
        replacement = "",
        x = string,
        ignore.case = opts$ignore_case,
        fixed = opts$fixed
      )
    }
  }

  register_binding("base::sub", arrow_r_string_replace_function(1L))
  register_binding("base::gsub", arrow_r_string_replace_function(-1L))
  register_binding("stringr::str_replace", arrow_stringr_string_replace_function(1L))
  register_binding("stringr::str_replace_all", arrow_stringr_string_replace_function(-1L))
  register_binding("stringr::str_remove", arrow_stringr_string_remove_function(1L))
  register_binding("stringr::str_remove_all", arrow_stringr_string_remove_function(-1L))

  register_binding("base::strsplit", function(x, split, fixed = FALSE, perl = FALSE,
                                              useBytes = FALSE) {
    assert_that(is.string(split))

    arrow_fun <- ifelse(fixed, "split_pattern", "split_pattern_regex")
    # warn when the user specifies both fixed = TRUE and perl = TRUE, for
    # consistency with the behavior of base::strsplit()
    if (fixed && perl) {
      warning("Argument 'perl = TRUE' will be ignored", call. = FALSE)
    }
    # since split is not a regex, proceed without any warnings or errors regardless
    # of the value of perl, for consistency with the behavior of base::strsplit()
    Expression$create(
      arrow_fun,
      x,
      options = list(pattern = split, reverse = FALSE, max_splits = -1L)
    )
  })

  register_binding(
    "stringr::str_split",
    function(string,
             pattern,
             n = Inf,
             simplify = FALSE) {
      opts <- get_stringr_pattern_options(enquo(pattern))
      arrow_fun <- ifelse(opts$fixed, "split_pattern", "split_pattern_regex")
      if (opts$ignore_case) {
        arrow_not_supported("Case-insensitive string splitting")
      }
      if (n == 0) {
        arrow_not_supported("Splitting strings into zero parts")
      }
      if (identical(n, Inf)) {
        n <- 0L
      }
      if (simplify) {
        warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE)
      }
      # The max_splits option in the Arrow C++ library controls the maximum number
      # of places at which the string is split, whereas the argument n to
      # str_split() controls the maximum number of pieces to return. So we must
      # subtract 1 from n to get max_splits.
      Expression$create(
        arrow_fun,
        string,
        options = list(
          pattern = opts$pattern,
          reverse = FALSE,
          max_splits = n - 1L
        )
      )
    },
    notes = "Case-insensitive string splitting and splitting into 0 parts not supported"
  )
}

register_bindings_string_other <- function() {
  register_binding(
    "base::nchar",
    function(x, type = "chars", allowNA = FALSE, keepNA = NA) {
      if (allowNA) {
        arrow_not_supported("allowNA = TRUE")
      }
      if (is.na(keepNA)) {
        keepNA <- !identical(type, "width")
      }
      if (!keepNA) {
        # TODO: I think there is a fill_null kernel we could use, set null to 2
        arrow_not_supported("keepNA = TRUE")
      }
      if (identical(type, "bytes")) {
        Expression$create("binary_length", x)
      } else {
        Expression$create("utf8_length", x)
      }
    },
    notes = "`allowNA = TRUE` and `keepNA = TRUE` not supported"
  )

  register_binding("stringr::str_to_lower", function(string, locale = "en") {
    stop_if_locale_provided(locale)
    Expression$create("utf8_lower", string)
  })

  register_binding("stringr::str_to_upper", function(string, locale = "en") {
    stop_if_locale_provided(locale)
    Expression$create("utf8_upper", string)
  })

  register_binding("stringr::str_to_title", function(string, locale = "en") {
    stop_if_locale_provided(locale)
    Expression$create("utf8_title", string)
  })

  register_binding("stringr::str_trim", function(string, side = c("both", "left", "right")) {
    side <- match.arg(side)
    trim_fun <- switch(side,
      left = "utf8_ltrim_whitespace",
      right = "utf8_rtrim_whitespace",
      both = "utf8_trim_whitespace"
    )
    Expression$create(trim_fun, string)
  })

  register_binding(
    "base::substr",
    function(x, start, stop) {
      if (length(start) != 1) {
        arrow_not_supported("`start` must be length 1 - other lengths")
      }
      if (length(stop) != 1) {
        arrow_not_supported("`stop` must be length 1 - other lengths")
      }

      # substr treats values as if they're on a continuous number line, so values
      # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics
      # this behavior
      if (start <= 0) {
        start <- 1
      }

      # if `stop` is lower than `start`, this is invalid, so set `stop` to
      # 0 so that an empty string will be returned (consistent with base::substr())
      if (stop < start) {
        stop <- 0
      }

      # if the input is a string we use "utf8_slice_codeunits"; if the
      # input is binary we use "binary_slice". This does not consider
      # a binary Scalar.
      x_is_binary <- inherits(x, "ArrowObject") &&
        x$type_id() %in% c(Type$BINARY, Type$LARGE_BINARY, Type$FIXED_SIZE_BINARY)
      if (x_is_binary) {
        fun <- "binary_slice"
      } else {
        fun <- "utf8_slice_codeunits"
      }

      Expression$create(
        fun,
        x,
        # we don't need to subtract 1 from `stop` as C++ counts exclusively
        # which effectively cancels out the difference in indexing between R & C++
        options = list(start = start - 1L, stop = stop)
      )
    },
    notes = "`start` and `stop` must be length 1"
  )

  register_binding("base::substring", function(text, first, last) {
    call_binding("substr", x = text, start = first, stop = last)
  })

  register_binding("stringr::str_sub", function(string, start = 1L, end = -1L) {
    if (length(start) != 1) {
      arrow_not_supported("`start` must be length 1 - other lengths")
    }
    if (length(end) != 1) {
      arrow_not_supported("`end` must be length 1 - other lengths")
    }

    # In stringr::str_sub, an `end` value of -1 means the end of the string, so
    # set it to the maximum integer to match this behavior
    if (end == -1) {
      end <- .Machine$integer.max
    }

    # strings returned by utf8_slice_codeunits are exclusive of the `end` position.
    # stringr::str_sub returns strings inclusive of the `end` position, so add 1 to `end`.
    # NOTE:this isn't necessary for positive values of `end`, because utf8_slice_codeunits
    # is 0-based while R is 1-based, which cancels out the effect of the exclusive `end`
    if (end < -1) {
      end <- end + 1L
    }

    # subtract 1 from `start` because C++ is 0-based and R is 1-based
    # str_sub treats a `start` value of 0 or 1 as the same thing so don't subtract 1 when `start` == 0
    # when `start` < 0, both str_sub and utf8_slice_codeunits count backwards from the end
    if (start > 0) {
      start <- start - 1L
    }

    Expression$create(
      "utf8_slice_codeunits",
      string,
      options = list(start = start, stop = end)
    )
  },
  notes = "`start` and `end` must be length 1"
  )


  register_binding("stringr::str_pad", function(string,
                                                width,
                                                side = c("left", "right", "both"),
                                                pad = " ") {
    assert_that(is_integerish(width))
    side <- match.arg(side)
    assert_that(is.string(pad))

    if (side == "left") {
      pad_func <- "utf8_lpad"
    } else if (side == "right") {
      pad_func <- "utf8_rpad"
    } else if (side == "both") {
      pad_func <- "utf8_center"
    }

    Expression$create(
      pad_func,
      string,
      options = list(width = width, padding = pad)
    )
  })
}