R/parse.R

Defines functions lines_and_cols_match is_insufficiently_parsed_number is_insufficiently_parsed_string ensure_correct_txt add_id_and_short get_parse_data tokenize has_crlf_as_first_line_sep parse_safely

Documented in add_id_and_short ensure_correct_txt get_parse_data has_crlf_as_first_line_sep is_insufficiently_parsed_string parse_safely tokenize

#' Save parsing from text
#'
#' Parses text safely, i.e. throws an informative error if EOL style does not
#' match LF or indicates the exact position where the parsing failed. Note
#' that we can only detect wrong EOL style if it occurs on the first line
#' already.
#' @param text Text to parse.
#' @param ... Parameters passed to [base::parse()].
#' @keywords internal
#' @examples
#' try(styler:::parse_safely("a + 3 -4 -> x\r\n glück + 1"))
#' # This cannot be detected as a EOL style problem because the first
#' # line ends as expected with \n
#' try(styler:::parse_safely("a + 3 -4 -> x\nx + 2\r\n glück + 1"))
#'
#' styler:::parse_safely("a + 3 -4 -> \n glück + 1")
parse_safely <- function(text, ...) {
  tried_parsing <- withCallingHandlers(
    parse(text = text, ...),
    error = function(e) {
      if (has_crlf_as_first_line_sep(e$message, text)) {
        msg <- c(
          x = "The code to style seems to use Windows style line endings (CRLF).",
          `!` = "styler currently only supports Unix style line endings (LF). ",
          i = "Please change the EOL character in your editor to Unix style
                 and try again."
        )
      } else {
        msg <- c(x = "Styling failed")
      }
      cli::cli_abort(msg, parent = e, call = NULL)
    },
    warning = function(w) {
      cli::cli_warn(w$message)
      w
    }
  )
  tried_parsing
}

#' Check if a string uses CRLF EOLs
#'
#' @param message A message returned with `tryCatch()`.
#' @param initial_text The initial text to style.
#' @keywords internal
has_crlf_as_first_line_sep <- function(message, initial_text) {
  split <- strsplit(message, ":", fixed = TRUE)[[1L]]
  if (length(split) > 1L && split[1L] == "<text>") {
    start_char <- as.numeric(split[3L])
    offending_line <- initial_text[as.integer(split[2L])]
    if (!is.na(offending_line) && substr(offending_line, start_char, start_char + 1L) == "\r\n") {
      return(TRUE)
    }
  }
  FALSE
}
#' Obtain token table from text
#'
#' [utils::getParseData()] is used to obtain a flat parse table from `text`.
#'
#' Apart from the columns provided by `utils::getParseData()`, the following
#' columns are added:
#'
#'   * A column "short" with the first five characters of "text".
#'   * A column "pos_id" for (positional id) which can be used for sorting
#'     (because "id" cannot be used in general). Note that the nth value of this
#'     column corresponds to n as long as no tokens are inserted.
#'   * A column "child" that contains *nest*s.
#'
#' @inheritParams get_parse_data
#' @return A flat parse table
#'
#' @keywords internal
tokenize <- function(text) {
  get_parse_data(text, include_text = TRUE) %>%
    ensure_correct_txt(text) %>%
    enhance_mapping_special()
}

#' Obtain robust parse data
#'
#' Wrapper around `utils::getParseData(parse(text = text))` that returns a flat
#' parse table. When caching information should be added, make sure that
#' the cache is activated with `cache_activate()` and both `transformers` and
#' `cache_dir` are non-`NULL`.
#' @param text The text to parse.
#' @param include_text Passed to [utils::getParseData()] as `includeText`.
#' @param ... Other arguments passed to [utils::getParseData()].
#' @keywords internal
get_parse_data <- function(text, include_text = TRUE, ...) {
  parsed <- parse_safely(text, keep.source = TRUE)
  pd <- utils::getParseData(parsed, includeText = include_text) %>%
    styler_df()
  if (getRversion() < "4.2") {
    is_unicode_parsing_error <- grepl("^\"<U\\+[0-9]+>\"$", pd$text)
    if (any(is_unicode_parsing_error)) {
      cli::cli_abort(c(
        "Can't parse input due to unicode restriction in base R.",
        i = "Please upgrade R to >= 4.2 to style this input.",
        "Context: {.url https://github.com/r-lib/styler/issues/847}"
      ))
    }
  }
  pd <- add_id_and_short(pd)

  pd
}

#' Add column `pos_id` and `short`
#'
#' Adds column `pos_id` and `short` to a flat parse table.
#' @param pd A flat parse table
#' @keywords internal
add_id_and_short <- function(pd) {
  pd$pos_id <- seq2(1L, nrow(pd))
  pd$short <- substr(pd$text, 1L, 5L)
  pd
}


#' Ensure a correct `text` of all strings and numeric constants
#'
#' Make sure `text` of the tokens `STR_CONST` and `NUM_CONST` is correct and
#' adapt if necessary. We replace offending `text` in the terminal expressions
#' with the text of their parents if their line / col position matches and
#' return an error otherwise.
#' @param pd A parse table.
#' @keywords internal
ensure_correct_txt <- function(pd, text) {
  is_problematic_text <- magrittr::or(
    is_insufficiently_parsed_string(pd),
    is_insufficiently_parsed_number(pd)
  )
  if (!any(is_problematic_text)) {
    return(pd)
  }
  problematic_text <- vec_slice(pd, is_problematic_text)
  is_parent_of_problematic_string <- pd$id %in% problematic_text$parent

  is_unaffected_token <- !magrittr::or(
    is_problematic_text, is_parent_of_problematic_string
  )

  pd_with_all_text <- get_parse_data(text, include_text = TRUE)
  parent_cols_for_merge <- c("id", "text", "short", line_col_names())
  parent_of_problematic_text <-
    pd_with_all_text[is_parent_of_problematic_string, parent_cols_for_merge]
  problematic_text$text <- NULL
  problematic_text$short <- NULL
  new_text <- merge(problematic_text, parent_of_problematic_text,
    by.x = "parent",
    by.y = "id",
    suffixes = c("", "parent")
  ) %>%
    styler_df()

  if (!lines_and_cols_match(new_text)) {
    abort(paste(
      "Error in styler:::ensure_correct_txt()."
    ), .internal = TRUE)
  }
  names_to_keep <- setdiff(
    names(new_text),
    paste0(line_col_names(), "parent")
  )
  vec_rbind(
    new_text[, names_to_keep],
    vec_slice(pd, is_unaffected_token),
    vec_slice(pd, is_parent_of_problematic_string)
  ) %>%
    arrange_pos_id()
}


#' Identify strings that were not fully parsed
#'
#' Identifies strings that were not fully parsed due to their vast length.
#' @details
#' The meaning of the variable `is_problematic_string` in the source code
#' changes from "all strings" to "all problematic strings", is partly
#' misleading and this approach was chosen for performance reasons only.
#' @param pd A parse table.
#' @keywords internal
is_insufficiently_parsed_string <- function(pd) {
  grepl("^\\[", pd$text) & pd$token == "STR_CONST"
}

is_insufficiently_parsed_number <- function(pd) {
  grepl("^0x", pd$text) & pd$token == "NUM_CONST"
}

#' Check whether columns match
#' @keywords internal
#' @noRd
lines_and_cols_match <- function(data) {
  left <- paste0(line_col_names(), "")
  right <- paste0(line_col_names(), "parent")
  identical(
    unlist(data[left], use.names = FALSE),
    unlist(data[right], use.names = FALSE)
  )
}
r-lib/styler documentation built on Sept. 12, 2024, 1:11 p.m.