tests/testthat/test-fixed_regex_linter.R

# NB: escaping is confusing. We have to double-escape everything -- the first
#   escape creates a string that will be parse()d, the second escape is normal
#   escaping that would be done in R code. E.g. in "\\\\.", the R code would
#   read like "\\.", but in order to create those two slashes, we need to write
#   "\\\\." in the string here.

test_that("fixed_regex_linter skips allowed usages", {
  linter <- fixed_regex_linter()

  expect_lint("gsub('^x', '', y)", NULL, linter)
  expect_lint("grep('x$', '', y)", NULL, linter)
  expect_lint("sub('[a-zA-Z]', '', y)", NULL, linter)
  expect_lint("grepl(fmt, y)", NULL, linter)
  expect_lint("regexec('\\\\s', '', y)", NULL, linter)
  expect_lint("grep('a(?=b)', x, perl = TRUE)", NULL, linter)
  expect_lint("grep('0+1', x, perl = TRUE)", NULL, linter)
  expect_lint("grep('1*2', x)", NULL, linter)
  expect_lint("grep('a|b', x)", NULL, linter)
  expect_lint("grep('\\\\[|\\\\]', x)", NULL, linter)

  # if fixed=TRUE is already set, regex patterns don't matter
  expect_lint("gsub('\\\\.', '', y, fixed = TRUE)", NULL, linter)

  # ignore.case=TRUE implies regex interpretation
  expect_lint("gsub('abcdefg', '', y, ignore.case = TRUE)", NULL, linter)

  # char classes starting with [] might contain other characters -> not fixed
  expect_lint("sub('[][]', '', y)", NULL, linter)
  expect_lint("sub('[][ ]', '', y)", NULL, linter)
  expect_lint("sub('[],[]', '', y)", NULL, linter)

  # wrapper functions don't throw
  expect_lint("gregexpr(pattern = pattern, data, perl = TRUE, ...)", NULL, linter)
})

test_that("fixed_regex_linter blocks simple disallowed usages", {
  linter <- fixed_regex_linter()
  lint_msg <- rex::rex("This regular expression is static")

  expect_lint("gsub('\\\\.', '', x)", lint_msg, linter)
  expect_lint("grepl('abcdefg', x)", lint_msg, linter)
  expect_lint("gregexpr('a-z', y)", lint_msg, linter)
  expect_lint("regexec('\\\\$', x)", lint_msg, linter)
  expect_lint("grep('\n', x)", lint_msg, linter)

  # naming the argument doesn't matter (if it's still used positionally)
  expect_lint("gregexpr(pattern = 'a-z', y)", lint_msg, linter)
})

patrick::with_parameters_test_that(
  "fixed_regex_linter is robust to unrecognized escapes error",
  {
    expect_lint(
      sprintf("grep('\\\\%s', x)", char),
      rex::rex("This regular expression is static"),
      fixed_regex_linter()
    )

    expect_lint(
      sprintf("strsplit('a%sb', '\\\\%s')", char, char),
      rex::rex("This regular expression is static"),
      fixed_regex_linter()
    )
  },
  .cases = local({
    char <- c(
      "^", "$", "{", "}", "(", ")", ".", "*", "+", "?",
      "|", "[", "]", "\\\\", "<", ">", "=", ":", ";", "/",
      "_", "-", "!", "@", "#", "%", "&", "~"
    )
    data.frame(char = char, .test_name = char, stringsAsFactors = FALSE)
  })
)

test_that("fixed_regex_linter catches regex like [.] or [$]", {
  linter <- fixed_regex_linter()
  lint_msg <- rex::rex("This regular expression is static")

  expect_lint("grep('[.]', x)", lint_msg, linter)
  expect_lint("grepl('a[*]b', x)", lint_msg, linter)

  # also catch char classes for [ and ]
  expect_lint("gregexpr('[]]', x)", lint_msg, linter)
})

test_that("fixed_regex_linter catches null calls to strsplit as well", {
  linter <- fixed_regex_linter()

  expect_lint("strsplit(x, '^x')", NULL, linter)
  expect_lint("strsplit(x, '\\\\s')", NULL, linter)
  expect_lint("strsplit(x, 'a(?=b)', perl = TRUE)", NULL, linter)
  expect_lint("strsplit(x, '0+1', perl = TRUE)", NULL, linter)
  expect_lint("strsplit(x, 'a|b')", NULL, linter)

  expect_lint("tstrsplit(x, '1*2')", NULL, linter)
  expect_lint("tstrsplit(x, '[a-zA-Z]')", NULL, linter)
  expect_lint("tstrsplit(x, fmt)", NULL, linter)

  # if fixed=TRUE is already set, regex patterns don't matter
  expect_lint("strsplit(x, '\\\\.', fixed = TRUE)", NULL, linter)
  expect_lint("strsplit(x, '\\\\.', fixed = T)", NULL, linter)
})

test_that("fixed_regex_linter catches calls to strsplit as well", {
  linter <- fixed_regex_linter()
  lint_msg <- rex::rex("This regular expression is static")

  expect_lint("strsplit(x, '\\\\.')", lint_msg, linter)
  expect_lint("strsplit(x, '[.]')", lint_msg, linter)

  expect_lint("tstrsplit(x, 'abcdefg')", lint_msg, linter)
})

test_that("fixed_regex_linter is more exact about distinguishing \\s from \\:", {
  linter <- fixed_regex_linter()
  lint_msg <- rex::rex("This regular expression is static")

  expect_lint("grep('\\\\s', '', x)", NULL, linter)
  expect_lint("grep('\\\\:', '', x)", lint_msg, linter)
})

## tests for stringr functions
test_that("fixed_regex_linter skips allowed stringr usages", {
  linter <- fixed_regex_linter()

  expect_lint("str_replace(y, '[a-zA-Z]', '')", NULL, linter)
  expect_lint("str_replace_all(y, '^x', '')", NULL, linter)
  expect_lint("str_detect(y, fmt)", NULL, linter)
  expect_lint("str_extract(y, '\\\\s')", NULL, linter)
  expect_lint("str_extract_all(y, '\\\\s')", NULL, linter)
  expect_lint("str_which(x, '1*2')", NULL, linter)

  # if fixed() is already set, regex patterns don't matter
  expect_lint("str_replace(y, fixed('\\\\.'), '')", NULL, linter)

  # namespace qualification doesn't matter
  expect_lint("stringr::str_replace(y, stringr::fixed('abcdefg'), '')", NULL, linter)
})

test_that("fixed_regex_linter blocks simple disallowed usages of stringr functions", {
  linter <- fixed_regex_linter()
  lint_msg <- rex::rex("This regular expression is static")

  expect_lint("str_replace_all(x, '\\\\.', '')", lint_msg, linter)
  expect_lint("str_detect(x, 'abcdefg')", lint_msg, linter)
  expect_lint("str_locate(y, 'a-z')", lint_msg, linter)
  expect_lint("str_subset(x, '\\\\$')", lint_msg, linter)
  expect_lint("str_which(x, '\n')", lint_msg, linter)

  # named, positional arguments are still caught
  expect_lint("str_locate(y, pattern = 'a-z')", lint_msg, linter)
  # nor do other named arguments throw things off
  expect_lint("str_starts(x, '\\\\.', negate = TRUE)", lint_msg, linter)
})

test_that("fixed_regex_linter catches calls to str_split as well", {
  linter <- fixed_regex_linter()
  lint_msg <- rex::rex("This regular expression is static")

  expect_lint("str_split(x, '^x')", NULL, linter)
  expect_lint("str_split(x, fmt)", NULL, linter)

  # if fixed() is already set, regex patterns don't matter
  expect_lint("str_split(x, fixed('\\\\.'))", NULL, linter)
  expect_lint("str_split(x, '\\\\.')", lint_msg, linter)
  expect_lint("str_split(x, '[.]')", lint_msg, linter)
})

test_that("str_replace_all's multi-replacement version is handled", {
  linter <- fixed_regex_linter()

  # While each of the replacements is fixed, and this _could_ in principle be replaced by
  #   a pipeline where each step does one of the replacements and fixed() is used, this is overkill.
  #   Instead, ensure that no lint is returned for this case
  expect_lint('str_replace_all(x, c("one" = "1", "two" = "2", "three" = "3"))', NULL, linter)
  expect_lint('grepl(c("a" = "b"), x)', NULL, linter)
})

test_that("1- or 2-width octal escape sequences are handled", {
  linter <- fixed_regex_linter()
  lint_msg <- rex::rex("This regular expression is static")

  expect_lint('strsplit(x, "\\1")', lint_msg, linter)
})

test_that("one-character character classes with escaped characters are caught", {
  linter <- fixed_regex_linter()
  lint_msg <- rex::rex("This regular expression is static")

  expect_lint("gsub('[\\n]', '', x)", lint_msg, linter)
  expect_lint("gsub('[\\\"]', '', x)", lint_msg, linter)
  expect_lint('gsub("\\\\<", "x", x, perl = TRUE)', lint_msg, linter)

  expect_lint("str_split(x, '[\\1]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\12]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\123]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\xa]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\x32]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\uF]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\u01]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\u012]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\u0123]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\U8]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\U1d4d7]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\u{1}]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\U{F7D5}]')", lint_msg, linter)
  expect_lint("str_split(x, '[\\U{1D4D7}]')", lint_msg, linter)
})

test_that("bracketed unicode escapes are caught", {
  linter <- fixed_regex_linter()
  lint_msg <- rex::rex("This regular expression is static")

  expect_lint('gsub("\\u{A0}", " ", out, useBytes = TRUE)', lint_msg, linter)
  expect_lint('gsub("abc\\U{A0DEF}ghi", " ", out, useBytes = TRUE)', lint_msg, linter)
  expect_lint('gsub("\\u{A0}\\U{0001d4d7}", " ", out, useBytes = TRUE)', lint_msg, linter)
})

test_that("escaped characters are handled correctly", {
  linter <- fixed_regex_linter()
  expect_lint("gsub('\\n+', '', sql)", NULL, linter)
  expect_lint('gsub("\\n{2,}", "\n", D)', NULL, linter)
  expect_lint('gsub("[\\r\\n]", "", x)', NULL, linter)
  expect_lint('gsub("\\n $", "", y)', NULL, linter)
  expect_lint('gsub("```\\n*```r*\\n*", "", x)', NULL, linter)
  expect_lint('strsplit(x, "(;|\n)")', NULL, linter)
  expect_lint('strsplit(x, "(;|\\n)")', NULL, linter)
  expect_lint('grepl("[\\\\W]", x, perl = TRUE)', NULL, linter)
  expect_lint('grepl("[\\\\W]", x)', NULL, linter)
})

# make sure the logic is properly vectorized
test_that("single expression with multiple regexes is OK", {
  expect_lint('c(grep("^a", x), grep("b$", x))', NULL, fixed_regex_linter())
})

test_that("fixed replacements vectorize and recognize str_detect", {
  linter <- fixed_regex_linter()
  # properly vectorized
  expect_lint(
    trim_some("
      c(
        grepl('abcdefg', x),
        grepl('a[.]\\\\.b\\n', x)
      )
    "),
    list(
      rex::rex('Here, you can use "abcdefg" with fixed = TRUE'),
      rex::rex('Here, you can use "a..b\\n" with fixed = TRUE')
    ),
    linter
  )

  # stringr hint works
  expect_lint(
    "str_detect(x, 'abc')",
    rex::rex('Here, you can use stringr::fixed("abc") as the pattern'),
    linter
  )
})

test_that("fixed replacement is correct with UTF-8", {
  skip_if(
    .Platform$OS.type == "windows" && !hasName(R.Version(), "crt"),
    message = "UTF-8 support is required"
  )

  expect_lint(
    "grepl('[\\U{1D4D7}]', x)",
    rex::rex('Here, you can use "\U1D4D7" with fixed = TRUE'),
    fixed_regex_linter()
  )
})

# TODO(michaelchirico): one difference for stringr functions vs. base is that
#   stringr is much friendlier to piping, so that
#   > str %>% str_replace_all("x$", "y")
#   actually doesn't need fixed(), but the logic now is only looking at "y"
#   since it's the second argument and a non-regex string. Similarly,
#   > str %>% str_detect("x")
#   is a false negative. thankfully there appear to be few false positives here

# TODO(michaelchirico): we could in principle build in logic to detect whether
#   perl=TRUE and interpret "regex or not" accordingly. One place
#   up in practice is for '\<', which is a special character in default
#   regex but not in PCRE. Empirically relevant for HTML-related regex e.g. \\<li\\>

#' Generate a string with a non-printable Unicode entry robust to test environment
#'
#' Non-printable unicode behaves wildly different with `encodeString()`
#'   across R versions and platforms. Nonetheless, all of these expressions
#'   are valid replacements.
#' @noRd
robust_non_printable_unicode <- function() {
  if (getRversion() < "4.1.0") {
    "abc\\U000a0defghi"
  } else if (.Platform$OS.type == "windows") {
    "abc\U{0a0def}ghi"
  } else {
    "abc\\U{0a0def}ghi"
  }
}

# styler: off
patrick::with_parameters_test_that("fixed replacements are correct", {
  skip_if(
    regex_expr %in% c("abc\\U{A0DEF}ghi", "[\\U1d4d7]", "[\\U{1D4D7}]", "\\u{A0}\\U{0001d4d7}") &&
      .Platform$OS.type == "windows" &&
      !hasName(R.Version(), "crt"),
    message = "UTF-8 support is required"
  )
  expect_lint(
    sprintf("grepl('%s', x)", regex_expr),
    rex::rex(sprintf('Here, you can use "%s" with fixed = TRUE', fixed_expr)),
    fixed_regex_linter()
  )
}, .cases = tibble::tribble(
  ~.test_name,            ~regex_expr,            ~fixed_expr,
  "[.]",                  "[.]",                  ".",
  '[\\\"]',               '[\\\"]',               '\\"',
  "[]]",                  "[]]",                  "]",
  "\\\\.",                "\\\\.",                ".",
  "\\\\:",                "\\\\:",                ":",
  "\\\\<",                "\\\\<",                "<",
  "\\\\$",                "\\\\$",                "$",
  "[\\1]",                "[\\1]",                "\\001",
  "\\1",                  "\\1",                  "\\001",
  "[\\12]",               "[\\12]",               "\\n",
  "[\\123]",              "[\\123]",              "S",
  "a[*]b",                "a[*]b",                "a*b",
  "abcdefg",              "abcdefg",              "abcdefg",
  "abc\\U{A0DEF}ghi",     "abc\\U{A0DEF}ghi",     robust_non_printable_unicode(),
  "a-z",                  "a-z",                  "a-z",
  "[\\n]",                "[\\n]",                "\\n",
  "\\n",                  "\n",                   "\\n",
  "[\\u01]",              "[\\u01]",              "\\001",
  "[\\u012]",             "[\\u012]",             "\\022",
  "[\\u0123]",            "[\\u0123]",            "\u0123",
  "[\\u{1}]",             "[\\u{1}]",             "\\001",
  "[\\U1d4d7]",           "[\\U1d4d7]",           "\U1D4D7",
  "[\\U{1D4D7}]",         "[\\U{1D4D7}]",         "\U1D4D7",
  "[\\U8]",               "[\\U8]",               "\\b",
  "\\u{A0}",              "\\u{A0}",              "\uA0",
  "\\u{A0}\\U{0001d4d7}", "\\u{A0}\\U{0001d4d7}", "\uA0\U1D4D7",
  "[\\uF]",               "[\\uF]",               "\\017",
  "[\\U{F7D5}]",          "[\\U{F7D5}]",          "\UF7D5",
  "[\\x32]",              "[\\x32]",              "2",
  "[\\xa]",               "[\\xa]",               "\\n"
))
# styler: on

test_that("'unescaped' regex can optionally be skipped", {
  linter <- fixed_regex_linter(allow_unescaped = TRUE)

  expect_lint("grepl('a', x)", NULL, linter)
  expect_lint("str_detect(x, 'a')", NULL, linter)
  expect_lint("grepl('[$]', x)", rex::rex('Here, you can use "$"'), linter)
})

local({
  pipes <- pipes(exclude = c("%$%", "%T>%"))
  patrick::with_parameters_test_that(
    "linter is pipe-aware",
    {
      linter <- fixed_regex_linter()
      lint_msg <- "This regular expression is static"

      expect_lint(paste("x", pipe, "grepl(pattern = 'a')"), lint_msg, linter)
      expect_lint(paste("x", pipe, "grepl(pattern = '^a')"), NULL, linter)
      expect_lint(paste("x", pipe, "grepl(pattern = 'a', fixed = TRUE)"), NULL, linter)
      expect_lint(paste("x", pipe, "str_detect('a')"), lint_msg, linter)
      expect_lint(paste("x", pipe, "str_detect('^a')"), NULL, linter)
      expect_lint(paste("x", pipe, "str_detect(fixed('a'))"), NULL, linter)

      expect_lint(paste("x", pipe, "gsub(pattern = 'a', replacement = '')"), lint_msg, linter)
      expect_lint(paste("x", pipe, "gsub(pattern = '^a', replacement = '')"), NULL, linter)
      expect_lint(paste("x", pipe, "gsub(pattern = 'a', replacement = '', fixed = TRUE)"), NULL, linter)
      expect_lint(paste("x", pipe, "str_replace('a', '')"), lint_msg, linter)
      expect_lint(paste("x", pipe, "str_replace('^a', '')"), NULL, linter)
      expect_lint(paste("x", pipe, "str_replace(fixed('a'), '')"), NULL, linter)
    },
    pipe = pipes,
    .test_name = names(pipes)
  )
})

Try the lintr package in your browser

Any scripts or data that you put into this service are public.

lintr documentation built on Nov. 7, 2023, 5:07 p.m.