R/regex.R

Defines functions regexplain_dependencies result_page deprecate_knitr_option view_regex wrap_regex collapse_span_inserts wrap_result max_match_index expand_matches regex

Documented in regex view_regex wrap_regex wrap_result

#' Extract matched groups from regexp
#'
#' @param text Text to search
#' @param pattern regexp
#' @param global If `TRUE`, enables global pattern matching
#' @inheritParams base::regexec
regex <- function(
  text,
  pattern,
  ignore.case = FALSE,
  perl = FALSE,
  fixed = FALSE,
  useBytes = FALSE,
  global = TRUE
) {
  # Use regex to get matches by group, gives start index and length
  m <- regexec(pattern, text, ignore.case, perl, fixed, useBytes)
  m <- purrr::map2(text, m, ~ list(text = .x, idx = expand_matches(.y)))

  attr(m, "global") <- global
  if (!global) {
    return(m)
  }

  mmi <- max_match_index(m)
  if (any(!is.na(mmi))) {
    subtext <- purrr::map_chr(m, "text") %>% purrr::map2_chr(mmi, substring)
    sub_idx <- which(!is.na(subtext))
    m2 <- regex(subtext[sub_idx], pattern, ignore.case, perl, fixed, useBytes)
    for (i in seq_along(m2)) {
      if (is.null(m2[[i]]$idx[[1]])) next
      m_idx <- sub_idx[i]
      m2[[i]]$idx[, c(1, 2)] <- m2[[i]]$idx[, c(1, 2)] + mmi[m_idx] - 1L
      m2[[i]]$idx$pass <- m2[[i]]$idx$pass + 1L
      m[[m_idx]]$idx <- rbind(m[[m_idx]]$idx, m2[[i]]$idx)
    }
  }
  m
}

expand_matches <- function(m) {
  if (m[1] == -1) {
    return(NULL)
  }
  m_length <- attr(m, "match.length")
  if (identical(as.vector(m[[1]]), 1L) && m_length == 0) {
    return(NULL)
  }
  x <- purrr::map2(m, m_length, ~ c(.x, .x + .y))
  x <- as.data.frame(do.call(rbind, x))
  names(x) <- c("start", "end")
  x$start <- ifelse(x$start == 0L, NA_integer_, x$start)
  x$end <- ifelse(x$end == 0L, NA_integer_, x$end)
  x$group <- 1:nrow(x) - 1L
  x$pass <- 1L
  x
}

max_match_index <- function(m) {
  purrr::map(m, "idx") %>%
    purrr::map_int(function(idx) {
      if (!is.null(idx)) {
        max(idx$start, idx$end, na.rm = TRUE)
      } else {
        NA
      }
    })
}


#' Wrap matches in HTML span tags to colorize via CSS
#'
#' @param x Individual list item in list returned by [regex()]
#' @inheritParams view_regex
#' @keywords internal
wrap_result <- function(x, escape = FALSE, exact = FALSE) {
  if (is.null(x$idx[[1]])) {
    return(if (escape) escape_html(x$text) else x$text)
  }
  text <- x$text

  inserts <- x$idx
  inserts$class <- sprintf("group g%02d", inserts$group)
  inserts$pad <- 0L
  names(inserts)[which(names(inserts) == "group")] <- "i"

  for (j in seq_len(nrow(inserts))) {
    if (inserts$i[j] == 0) next
    if (is.na(inserts$start[j]) || is.na(inserts$end[j])) next
    overlap <- filter(
      inserts[1:(j - 1), ],
      .data$i != 0,
      .data$start <= !!inserts$start[j] & .data$end >= !!inserts$end[j]
    )
    inserts[j, "pad"] <- inserts$pad[j] + nrow(overlap)
  }
  inserts <- dplyr::bind_rows(
    inserts %>% select(-.data$end, dplyr::everything(), loc = .data$start) %>% mutate(type = "start"),
    inserts %>% select(-.data$start, dplyr::everything(), loc = .data$end) %>% mutate(type = "end")
  ) %>%
    filter(!is.na(.data$loc)) %>%
    dplyr::arrange(loc, class, dplyr::desc(type)) %>%
    mutate(
      class = ifelse(.data$pad > 0, sprintf("%s pad%02d", .data$class, .data$pad), .data$class),
      insert = ifelse(.data$type == "start", sprintf('<span class="%s">', .data$class), "</span>")
    )

  inserts <- if (max(inserts$pass) == 1) {
    collapse_span_inserts(inserts)
  } else {
    split(inserts, inserts$pass) %>%
      purrr::map(collapse_span_inserts) %>%
      dplyr::bind_rows() %>%
      group_by(.data$loc, .data$type) %>%
      summarize(insert = paste(.data$insert, collapse = "")) %>%
      dplyr::ungroup()
  }

  # inserts now gives html (span open and close) to insert and loc
  # first split text at inserts$loc locations,
  # then recombine by zipping with inserts$insert text
  # start at 0, unless there's a hit on first character
  # end at nchar(text) + 1 because window is idx[k] to idx[k+1]-1
  idx_split <- c(0 - (inserts$loc[1] == 0), inserts$loc)
  if (!(nchar(text) + 1) %in% idx_split) {
    idx_split <- c(idx_split, nchar(text) + 1)
  }
  text_split <- c()
  for (k in seq_along(idx_split[-1])) {
    text_split <- c(text_split, substr(text, idx_split[k], idx_split[k + 1] - 1))
  }
  out <- c()
  for (j in seq_along(text_split)) {
    out <- c(
      out,
      ifelse(escape, escape_html(text_split[j]), text_split[j]),
      if (!is.na(inserts$insert[j])) inserts$insert[j]
    )
  }
  if (exact) out <- escape_backslash(out)
  paste(out, collapse = "")
}

collapse_span_inserts <- function(inserts) {
  inserts_g0 <- filter(inserts, class == "group g00")
  inserts_other <- filter(inserts, class != "group g00")
  dplyr::bind_rows(
    filter(inserts_g0, type == "start"),
    inserts_other,
    filter(inserts_g0, type == "end")
  ) %>%
    mutate(type = sprintf("%05d%s", dplyr::row_number(), type)) %>%
    group_by(.data$loc, .data$type) %>%
    summarize(insert = paste(.data$insert, collapse = "")) %>%
    dplyr::ungroup() %>%
    mutate(type = sub("^\\d{5}", "", type))
}

#' Wraps capture groups in regex pattern in span tags to colorize with CSS
#'
#' @inheritParams view_regex
#' @keywords internal
wrap_regex <- function(pattern, escape = TRUE, exact = TRUE) {
  stopifnot(length(pattern) == 1)
  if (escape) pattern <- escape_html(pattern)
  # 1. walk characters in pattern
  # 2. if current is open parens
  #    1. walk backwards, counting number of "\\" until first non-"\\" char
  #    2. If odd, then not an opening group
  #    3. Look forward, if followed by "?" then not a capturing group
  #    4. If capturing group then add opening "<span...>(" to out and
  #       add TRUE for valid capture group to parens stack
  #    5. If non-capturing group, add "(" to out and FALSE for non-valid to paren stack
  # 3. if close parens, add closing "</span>" to out
  out <- c()
  paren_stack <- c()
  group <- 0
  pattern_chars <- strsplit(pattern, "")[[1]]
  for (i in seq_along(pattern_chars)) {
    is_capture_group <- FALSE
    if (pattern_chars[i] == "(") {
      backslash_count <- 0
      if (i != 1) {
        j <- i - 1
        while (pattern_chars[j] == "\\" && j > 0) {
          backslash_count <- backslash_count + 1
          j <- j - 1
        }
      }
      if (backslash_count %% 2 == 0) {
        if (i != length(pattern_chars) && pattern_chars[i + 1] != "?") {
          is_capture_group <- TRUE
        }
      }
      if (is_capture_group) {
        group <- group + 1
        paren_stack <- c(TRUE, paren_stack) # push
        out <- c(out, paste0('<span class="g', sprintf("%02d", group), '">('))
      } else {
        paren_stack <- c(FALSE, paren_stack) # push
        out <- c(out, "(")
      }
    } else if (pattern_chars[i] == ")") {
      closes_capture_group <- paren_stack[1]
      paren_stack <- paren_stack[-1] # pop
      if (closes_capture_group) {
        out <- c(out, ")</span>")
      } else {
        out <- c(out, ")")
      }
    } else {
      out <- c(out, pattern_chars[i])
    }
  }
  if (exact) out <- escape_backslash(out)
  paste(out, collapse = "")
}

#' View grouped regex results
#'
#' View the result of the regular expression when applied to the given text.
#' The default behavior renders the result as HTML and opens the file in
#' the RStudio viewer pane. If `render` is `FALSE`, the HTML itself is returned.
#' If the output is destined for a [knitr] document, set `knitr` to `TRUE`.
#'
#' @examples
#' view_regex("example", "amp", render = FALSE)
#' @param text Text to search
#' @param pattern Regex pattern to look for
#' @param render Render results as HTML?
#' @param escape Escape HTML-related characters in `text`?
#' @param exact Should the regex pattern be displayed as entered by the user
#'   into R console or source (default)? When `TRUE`, regex is displayed with
#'   the double `\\\\` required for escaping backslashes in R. When `FALSE`,
#'   regex is displayed as interpreted by the regex engine (i.e. double `\\\\`
#'   as a single `\\`).
#' @param result_only Should only the result be displayed? If `FALSE`, then
#'   the colorized regular expression is also displayed in the output.
#' @inheritDotParams base::regexec ignore.case perl fixed useBytes
#' @export
view_regex <- function(
  text,
  pattern,
  ...,
  render = TRUE,
  escape = render,
  exact = escape,
  result_only = FALSE
) {
  knitr <- isTRUE(getOption("knitr.in.progress"))
  if (knitr) {
    render <- FALSE
    escape <- TRUE
  }
  regex_opts <- deprecate_knitr_option(...)
  regex_opts$text <- text
  regex_opts$pattern <- pattern
  res <- do.call(regex, regex_opts)
  res <- purrr::map_chr(res, wrap_result, escape = escape, exact = exact)
  res <- purrr::map_chr(res, function(resi) {
    result_pad <- ""
    if (grepl("pad\\d{2}", resi)) {
      max_pad <- max(stringi::stri_extract_all_regex(resi, "pad\\d{2}")[[1]])
      max_pad_level <- as.integer(stringi::stri_extract_all_regex(max_pad, "\\d{2}"))
      if (max_pad_level - 3 > 0) {
        result_pad <- sprintf("pad%02d", max_pad_level - 3)
      }
    }
    paste('<p class="regexplain', result_pad, '">', resi, "</p>")
  })
  res <- paste(res, collapse = "")
  if (!nchar(pattern)) res <- paste("<p class='regexplain'>", text, "</p>")
  if (knitr) {
    return(
      htmltools::tagList(
        htmltools::HTML(res),
        regexplain_dependencies(full = FALSE)
      )
    )
  }
  if (!render) {
    return(res)
  }
  page <- result_page(wrap_regex(pattern, escape, exact), res, "View Regex")
  htmltools::browsable(page)
}

deprecate_knitr_option <- function(...) {
  regex_opts <- list(...)
  if ("knitr" %in% names(regex_opts)) {
    warning("The `knitr` parameter of `view_regex()` has been removed. Running `view_regex()` in R Markdown is automatically detected.")
  }
  regex_opts[setdiff(names(regex_opts), "knitr")]
}

result_page <- function(pattern, result, title = NULL) {
  pattern <- htmltools::HTML(pattern)
  result <- htmltools::HTML(result)
  page_title <- title
  htmltools::tagList(
    htmltools::div(
      class = "regexplain__result",
      if (!is.null(title)) htmltools::tags$head(htmltools::tags$title(page_title)),
      htmltools::h2("Pattern"),
      htmltools::pre(class = "regexplain__pattern", pattern),
      htmltools::h2("Matches"),
      result,
      regexplain_dependencies(full = TRUE)
    )
  )
}

regexplain_dependencies <- function(full = TRUE) {
  htmltools::htmlDependency(
    name = "regexplain",
    version = packageVersion("regexplain"),
    package = "regexplain",
    src = "styles",
    stylesheet = c(
      "groups.css",
      if (full) c("skeleton.css", "view_regex.css")
    ),
    all_files = FALSE
  )
}
gadenbuie/regexhelp documentation built on June 1, 2021, 4:24 p.m.