Nothing
#' Parsed sourced file from a filename
#'
#' This object is given as input to each linter.
#'
#' @details
#' The file is read using the `encoding` setting.
#' This setting is found by taking the first valid result from the following locations
#'
#' 1. The `encoding` key from the usual lintr configuration settings.
#' 2. The `Encoding` field from a Package `DESCRIPTION` file in a parent directory.
#' 3. The `Encoding` field from an R Project `.Rproj` file in a parent directory.
#' 4. `"UTF-8"` as a fallback.
#'
#' @param filename the file to be parsed.
#' @param lines a character vector of lines.
#' If `NULL`, then `filename` will be read.
#' @return A `list` with three components:
#' \describe{
#' \item{expressions}{a `list` of
#' `n+1` objects. The first `n` elements correspond to each expression in
#' `filename`, and consist of a list of 9 elements:
#' \itemize{
#' \item{`filename` (`character`)}
#' \item{`line` (`integer`) the line in `filename` where this expression begins}
#' \item{`column` (`integer`) the column in `filename` where this expression begins}
#' \item{`lines` (named `character`) vector of all lines spanned by this
#' expression, named with the line number corresponding to `filename`}
#' \item{`parsed_content` (`data.frame`) as given by [utils::getParseData()] for this expression}
#' \item{`xml_parsed_content` (`xml_document`) the XML parse tree of this
#' expression as given by [xmlparsedata::xml_parse_data()]}
#' \item{`content` (`character`) the same as `lines` as a single string (not split across lines)}
#' }
#'
#' The final element of `expressions` is a list corresponding to the full file
#' consisting of 6 elements:
#' \itemize{
#' \item{`filename` (`character`)}
#' \item{`file_lines` (`character`) the [readLines()] output for this file}
#' \item{`content` (`character`) for .R files, the same as `file_lines`;
#' for .Rmd or .qmd scripts, this is the extracted R source code (as text)}
#' \item{`full_parsed_content` (`data.frame`) as given by
#' [utils::getParseData()] for the full content}
#' \item{`full_xml_parsed_content` (`xml_document`) the XML parse tree of all
#' expressions as given by [xmlparsedata::xml_parse_data()]}
#' \item{`terminal_newline` (`logical`) records whether `filename` has a terminal
#' newline (as determined by [readLines()] producing a corresponding warning)}
#' }
#' }
#' \item{error}{A `Lint` object describing any parsing error.}
#' \item{lines}{The [readLines()] output for this file.}
#' }
#'
#' @examplesIf requireNamespace("withr", quietly = TRUE)
#' tmp <- withr::local_tempfile(lines = c("x <- 1", "y <- x + 1"))
#' get_source_expressions(tmp)
#' @export
get_source_expressions <- function(filename, lines = NULL) {
source_expression <- srcfile(filename, encoding = settings$encoding)
# Ensure English locale for terminal newline and zero-length variable warning messages
old_lang <- set_lang("en")
on.exit(reset_lang(old_lang))
source_expression$lines <- if (is.null(lines)) {
read_lines(filename)
} else {
lines
}
# Only regard explicit attribute terminal_newline=FALSE as FALSE and all other cases (e.g. NULL or TRUE) as TRUE.
# We don't use isFALSE since it is introduced in R 3.5.0.
terminal_newline <- !identical(attr(source_expression$lines, "terminal_newline", exact = TRUE), FALSE)
e <- NULL
source_expression$lines <- extract_r_source(
filename = source_expression$filename,
lines = source_expression$lines,
error = function(e) lint_rmd_error(e, source_expression)
)
names(source_expression$lines) <- seq_along(source_expression$lines)
source_expression$content <- get_content(source_expression$lines)
parsed_content <- get_source_expression(source_expression, error = function(e) lint_parse_error(e, source_expression))
if (inherits(e, "lint") && (is.na(e$line) || !nzchar(e$line) || e$message == "unexpected end of input")) {
# Don't create expression list if it's unreliable (invalid encoding or unhandled parse error)
expressions <- list()
} else {
top_level_map <- generate_top_level_map(parsed_content)
xml_parsed_content <- safe_parse_to_xml(parsed_content)
expressions <- lapply(
X = top_level_expressions(parsed_content),
FUN = get_single_source_expression,
parsed_content,
source_expression,
filename,
top_level_map
)
if (!is.null(xml_parsed_content) && !is.na(xml_parsed_content)) {
expression_xmls <- lapply(
xml_find_all(xml_parsed_content, "/exprlist/*"),
function(top_level_expr) xml2::xml_add_parent(xml2::xml_new_root(top_level_expr), "exprlist")
)
for (i in seq_along(expressions)) {
expressions[[i]]$xml_parsed_content <- expression_xmls[[i]]
}
}
# add global expression
expressions[[length(expressions) + 1L]] <- list(
filename = filename,
file_lines = source_expression$lines,
content = source_expression$lines,
full_parsed_content = parsed_content,
full_xml_parsed_content = xml_parsed_content,
terminal_newline = terminal_newline
)
}
list(expressions = expressions, error = e, lines = source_expression$lines)
}
lint_parse_error <- function(e, source_expression) {
# R >= 4.3.0
if (inherits(e, "parseError")) {
return(lint_parse_error_r43(e, source_expression))
}
message_info <- re_matches(
e$message,
rex(
except_some_of(":"),
":",
capture(name = "line", digits),
":",
capture(name = "column", digits),
":",
space,
capture(name = "message", anything),
"\n"
)
)
if (!is.na(message_info$line)) {
return(lint_parse_error_r42(message_info, source_expression))
}
# an error that does not use R_ParseErrorMsg
lint_parse_error_nonstandard(e, source_expression)
}
#' Ensure a string is valid for printing
#'
#' Helper to ensure a valid string is provided as line where necessary.
#' Handles two cases:
#'
#' 1. NA lines, as generated by running outside of code blocks in Rmd documents
#' 2. invalid string lines, as generated by invalid encoding settings
#'
#' [nchar()] can detect both cases: 1. returns `NA_integer_`, 2. errors
#'
#' @param line Possibly misencoded or NA line
#'
#' @return `line` if it's a valid non-NA string, otherwise an empty string.
#'
#' @noRd
fixup_line <- function(line) {
nchars <- tryCatch(nchar(line, type = "chars"), error = function(e) NA_integer_)
if (is.na(nchars)) {
""
} else {
line
}
}
#' Convert an R >= 4.3.0 classed parseError with metadata into a lint
#'
#' @param e A parse error of class `parseError` generated by R >= 4.3.0
#' @param source_expression The source expression that generated the parse error
#'
#' @return A [Lint()] based on the metadata attached to `e`.
#'
#' @noRd
lint_parse_error_r43 <- function(e, source_expression) {
msg <- re_substitutes(e$message, rex(" (", except_some_of(")"), ")", end), "")
line_number <- e$lineno
column <- e$colno
substr(msg, 1L, 1L) <- toupper(substr(msg, 1L, 1L))
msg <- paste0(msg, ".")
if (inherits(e, "invalidMBCS")) {
msg <- paste(msg, "Is the encoding correct?")
}
if (column == 0L) {
line_number <- line_number - 1L
}
if (line_number < 1L || line_number > length(source_expression$lines)) {
# Safely handle invalid location info
line_number <- 1L
column <- 1L
}
line <- fixup_line(source_expression$lines[[line_number]])
if (column == 0L || column > nchar(line) + 1L) {
column <- nchar(line) + 1L
}
Lint(
filename = source_expression$filename,
line_number = line_number,
column_number = column,
type = "error",
message = msg,
line = line
)
}
#' Convert a R < 4.3.0 standard parse error message into a lint
#'
#' @param message_info Match of the structured parse error message regex, matched in [lint_parse_error()]
#' @param source_expression The source expression that generated the parse error
#'
#' @return A [Lint()] based on text mining of the error message captured by `message_info`,
#'
#' @noRd
lint_parse_error_r42 <- function(message_info, source_expression) {
line_number <- as.integer(message_info$line)
column_number <- as.integer(message_info$column)
# If the column number is zero it means the error really occurred at the
# end of the previous line
if (column_number %==% 0L) {
line_number <- line_number - 1L
line <- fixup_line(source_expression$lines[[line_number]])
column_number <- nchar(line) + 1L
} else {
line <- fixup_line(source_expression$lines[[line_number]])
}
Lint(
filename = source_expression$filename,
line_number = line_number,
column_number = column_number,
type = "error",
message = message_info$message,
line = line
)
}
#' Convert a R < 4.3.0 non-standard parse error message into a lint
#'
#' Uses a hand-crafted regex and some heuristics to find location information, falling back to Line 1, Column 1 if all
#' attempts fail.
#'
#' @param e A parse error that doesn't fit the standard `message_info` regex used for most parse errors in R < 4.3.0
#' @param source_expression The source expression that generated the parse error
#'
#' @return A [Lint()] based on trying to extract information from the error message of `e`.
#'
#' @noRd
lint_parse_error_nonstandard <- function(e, source_expression) {
if (grepl("invalid multibyte character in parser at line", e$message, fixed = TRUE)) {
# nocov start: platform-specific
l <- as.integer(re_matches(
e$message,
rex("invalid multibyte character in parser at line ", capture(name = "line", digits))
)$line)
# Invalid encoding in source code
return(
Lint(
filename = source_expression$filename,
line_number = l,
column_number = 1L,
type = "error",
message = "Invalid multibyte character in parser. Is the encoding correct?",
line = fixup_line(source_expression$lines[[l]])
)
)
# nocov end
} else if (grepl("invalid multibyte string, element", e$message, fixed = TRUE)) {
# Invalid encoding, will break even re_matches() below, so we need to handle this first.
return(
Lint(
filename = source_expression$filename,
line_number = 1L,
column_number = 1L,
type = "error",
message = "Invalid multibyte string. Is the encoding correct?",
line = ""
)
)
} else if (grepl("repeated formal argument", e$message, fixed = TRUE)) {
# nocov start: only throws on certain versions; tested on L389
matches <- re_matches(
e$message,
rex(
"repeated formal argument '",
capture(name = "symbol", anything),
"' on line ",
capture(name = "line", digits)
)
)
sym <- matches$symbol
l <- as.integer(matches$line)
# Repeated formal argument 'sym' on line l
return(
Lint(
filename = source_expression$filename,
line_number = l,
column_number = 1L,
type = "error",
message = sprintf("Repeated formal argument '%s'.", sym),
line = fixup_line(source_expression$lines[[l]])
)
)
# nocov end
}
# Hand-crafted regex to parse all error messages generated by the R parser code for R < 4.3.
# The error messages can be found in src/main/gram.c and src/main/character.c in the R code.
# This code produces a list of all possible error messages:
#
# nolint start: commented_code_linter.
# parser_files <- c("src/main/gram.c", "src/main/character.c")
#
# lines <- unlist(lapply(
# file.path("https://raw.githubusercontent.com/wch/r-source/trunk", parser_files),
# readLines
# ))
# error_calls <- grep("error(_(", lines, fixed = TRUE, value = TRUE)
# error_formats <- trimws(gsub("^.*error\\(_\\(\"(.+)\".+", "\\1", error_calls))
# error_formats <- unique(error_formats)
# nolint end
parse_error_rx <- rex(
start,
capture(anything, name = "msg_1"),
or(" at ", " on ", " ("),
"line ",
capture(digits, name = "line"),
maybe(")"),
capture(anything, name = "msg_2"),
end
)
if (grepl(parse_error_rx, e$message, perl = TRUE)) {
# nocov start: only throws on certain versions. Tested on L396: Nul character not allowed
rx_match <- re_matches(
e$message,
parse_error_rx
)
l <- as.integer(rx_match$line)
# Sometimes the parser "line" runs one past the last line
l <- pmin(l, length(source_expression$lines))
msg <- paste0(rx_match$msg_1, if (nzchar(rx_match$msg_2)) " ", rx_match$msg_2, ".")
substr(msg, 1L, 1L) <- toupper(substr(msg, 1L, 1L))
return(
Lint(
filename = source_expression$filename,
line_number = l,
column_number = 1L,
type = "error",
message = msg,
line = source_expression$lines[[l]]
)
)
# nocov end
}
message_info <- re_matches(
e$message,
rex(
single_quotes, capture(name = "name", anything), single_quotes,
anything,
double_quotes, capture(name = "starting", anything), double_quotes
)
)
loc <- re_matches(source_expression$content, rex(message_info$starting), locations = TRUE)
line_location <- loc[!is.na(loc$start) & !is.na(loc$end), ]
if (nrow(line_location) == 0L) {
if (grepl("attempt to use zero-length variable name", e$message, fixed = TRUE)) {
# empty symbol: ``, ``(), ''(), ""(), fun(''=42), fun(""=42), fun(a=1,""=42)
loc <- re_matches(
source_expression$content,
rex(
"``" %or%
list(or("''", '""'), any_spaces, "(") %or%
list(or("(", ","), any_spaces, or("''", '""'), any_spaces, "=")
),
options = "multi-line",
locations = TRUE
)
loc <- loc[!is.na(loc$start) & !is.na(loc$end), ]
if (nrow(loc) > 0L) {
line_location <- loc[1L, ]
}
} else {
# nocov start
return(
Lint(
filename = source_expression$filename,
line_number = 1L,
column_number = 1L,
type = "error",
message = e$message,
line = ""
)
)
# nocov end
}
}
newline_locs <- get_newline_locs(source_expression$content)
line_number <- which(newline_locs >= line_location$start)[1L] - 1L
column_number <- line_location$start - newline_locs[line_number]
Lint(
filename = source_expression$filename,
line_number = line_number,
column_number = column_number,
type = "error",
message = e$message,
line = source_expression$lines[[line_number]]
)
}
lint_rmd_error <- function(e, source_expression) {
message_info <- re_matches(
e$message,
rex(
except_some_of(":"),
":",
capture(name = "line", digits),
":",
capture(name = "column", digits),
":",
space,
capture(name = "message", anything),
"\n"
)
)
line_number <- as.integer(message_info$line)
column_number <- as.integer(message_info$column)
Lint(
filename = source_expression$filename,
line_number = line_number,
column_number = column_number,
type = "error",
message = message_info$message,
line = source_expression$lines[line_number]
)
}
get_single_source_expression <- function(loc,
parsed_content,
source_expression,
filename,
top_level_map) {
line_nums <- parsed_content$line1[loc]:parsed_content$line2[loc]
expr_lines <- source_expression$lines[line_nums]
names(expr_lines) <- line_nums
content <- get_content(source_expression$lines, parsed_content[loc, ])
id <- parsed_content$id[loc]
pc <- parsed_content[which(top_level_map == id), ]
list(
filename = filename,
line = parsed_content[loc, "line1"],
column = parsed_content[loc, "col1"],
lines = expr_lines,
parsed_content = pc,
xml_parsed_content = xml2::xml_missing(),
content = content
)
}
get_source_expression <- function(source_expression, error = identity) {
parse_error <- FALSE
parsed_content <- tryCatch(
parse(
text = source_expression$content,
srcfile = source_expression,
keep.source = TRUE
),
error = error
)
# TODO: Remove when minimum R version is bumped to > 3.5
#
# This needs to be done twice to avoid a bug fixed in R 3.4.4
# https://bugs.r-project.org/bugzilla/show_bug.cgi?id=16041
parsed_content <- tryCatch(
parse(
text = source_expression$content,
srcfile = source_expression,
keep.source = TRUE
),
error = error
)
if (inherits(parsed_content, c("error", "lint"))) {
assign("e", parsed_content, envir = parent.frame())
parse_error <- TRUE
}
# Triggers an error if the lines contain invalid characters.
parsed_content <- tryCatch(
nchar(source_expression$content, type = "chars"),
error = error
)
if (inherits(parsed_content, c("error", "lint"))) {
# Let parse errors take precedence over encoding problems
if (!parse_error) assign("e", parsed_content, envir = parent.frame())
return() # parsed_content is unreliable if encoding is invalid
}
source_expression$parsed_content <- parsed_content
fix_octal_escapes(fix_eq_assigns(fix_tab_indentations(source_expression)), source_expression$lines)
}
get_newline_locs <- function(x) {
newline_search <- re_matches(x, rex("\n"), locations = TRUE, global = TRUE)[[1L]]$start
c(
0L,
if (!is.na(newline_search[1L])) newline_search,
nchar(x) + 1L
)
}
# Fix column numbers when there are tabs
# getParseData() counts 1 tab as a variable number of spaces instead of one:
# https://github.com/wch/r-source/blame/e7401b68ab0e032fce3e376aaca9a5431619b2b4/src/main/gram.y#L512
# The number of spaces is so that the code is brought to the next 8-character indentation level e.g.:
# "1\t;" --> "1 ;"
# "12\t;" --> "12 ;"
# "123\t;" --> "123 ;"
# "1234\t;" --> "1234 ;"
# "12345\t;" --> "12345 ;"
# "123456\t;" --> "123456 ;"
# "1234567\t;" --> "1234567 ;"
# "12345678\t;" --> "12345678 ;"
# "123456789\t;" --> "123456789 ;"
# "1234567890\t;" --> "1234567890 ;"
fix_tab_indentations <- function(source_expression) {
parse_data <- getParseData(source_expression)
if (is.null(parse_data)) {
return(NULL)
}
tab_cols <- gregexpr("\t", source_expression[["lines"]], fixed = TRUE)
names(tab_cols) <- seq_along(tab_cols)
matched_lines <- vapply(tab_cols, function(line_match) !is.na(line_match[1L]) && line_match[1L] > 0L, logical(1L))
if (!any(matched_lines)) {
return(parse_data)
}
tab_cols <- tab_cols[matched_lines]
fix_columns <- c("line1", "line2", "col1", "col2")
parse_data[, fix_columns] <- fix_tab_columns(parse_data[, fix_columns], tab_cols)
parse_data
}
fix_tab_columns <- function(parse_content, tab_cols) {
dat <- cbind(
c(parse_content$line1, parse_content$line2),
c(parse_content$col1, parse_content$col2)
)
lines <- as.integer(names(tab_cols))
for (i in seq_along(tab_cols)) {
is_curr_line <- dat[, 1L] == lines[[i]]
if (!any(is_curr_line)) {
next
}
tab_col <- tab_cols[[i]]
line_tab_offsets <- tab_offsets(tab_col)
for (j in seq_along(tab_col)) {
is_line_to_change <- is_curr_line & dat[, 2L] > tab_col[[j]]
if (any(is_line_to_change)) {
dat[is_line_to_change, 2L] <- dat[is_line_to_change, 2L] - line_tab_offsets[[j]]
}
}
}
dat
}
tab_offsets <- function(tab_columns) {
cum_offset <- 0L
vapply(
tab_columns - 1L,
function(tab_idx) {
offset <- 7L - (tab_idx + cum_offset) %% 8L # using a tab width of 8 characters
cum_offset <<- cum_offset + offset
offset
},
integer(1L),
USE.NAMES = FALSE
)
}
# This function wraps equal assign expressions in a parent expression so they
# are the same as the corresponding <- expression
fix_eq_assigns <- function(pc) {
if (is.null(pc) || any(c("equal_assign", "expr_or_assign_or_help") %in% pc$token)) {
return(pc)
}
eq_assign_locs <- which(pc$token == "EQ_ASSIGN")
# check whether the equal-assignment is the final entry
if (length(eq_assign_locs) == 0L || tail(eq_assign_locs, 1L) == nrow(pc)) {
return(pc)
}
prev_locs <- vapply(eq_assign_locs, prev_with_parent, pc = pc, integer(1L))
next_locs <- vapply(eq_assign_locs, next_with_parent, pc = pc, integer(1L))
expr_locs <- prev_locs != lag(next_locs)
expr_locs[is.na(expr_locs)] <- TRUE
id_itr <- max(pc$id)
true_locs <- which(expr_locs)
n_expr <- length(true_locs)
supplemental_content <- data.frame(
line1 = integer(n_expr),
col1 = integer(n_expr),
line2 = integer(n_expr),
col2 = integer(n_expr),
id = integer(n_expr),
parent = integer(n_expr),
token = character(n_expr),
terminal = logical(n_expr),
text = character(n_expr),
stringsAsFactors = FALSE
)
for (i in seq_len(n_expr)) {
start <- true_locs[i]
# TODO(michaelchirico): vectorize this loop away. the tricky part is,
# this loop doesn't execute on most R versions (we tried 3.6.3 and 4.2.0).
# so it likely requires some GHA print debugging -- tedious :)
end <- true_locs[i]
j <- end + 1L
# nocov start: only runs on certain R versions
while (j <= length(expr_locs) && !expr_locs[j]) {
end <- j
j <- j + 1L
}
# nocov end
prev_loc <- prev_locs[start]
next_loc <- next_locs[end]
id_itr <- id_itr + 1L
supplemental_content[i, ] <- list(
pc[prev_loc, "line1"],
pc[prev_loc, "col1"],
pc[next_loc, "line2"],
pc[next_loc, "col2"],
id_itr,
pc[eq_assign_locs[true_locs[i]], "parent"],
"expr", # R now uses "equal_assign"
FALSE,
""
)
new_parent_locs <- c(
prev_locs[start:end],
eq_assign_locs[start:end],
next_locs[start:end],
next_loc
)
pc[new_parent_locs, "parent"] <- id_itr
}
rownames(supplemental_content) <- supplemental_content$id
res <- rbind(pc, supplemental_content)
res[order(res$line1, res$col1, res$line2, res$col2, res$id), ]
}
step_with_parent <- function(pc, loc, offset) {
id <- pc$id[loc]
parent_id <- pc$parent[loc]
with_parent <- pc[pc$parent == parent_id, ]
with_parent <- with_parent[order(with_parent$line1, with_parent$col1, with_parent$line2, with_parent$col2), ]
loc <- which(with_parent$id == id)
which(pc$id == with_parent$id[loc + offset])
}
prev_with_parent <- function(pc, loc) step_with_parent(pc, loc, offset = -1L)
next_with_parent <- function(pc, loc) step_with_parent(pc, loc, offset = 1L)
top_level_expressions <- function(pc) {
if (is.null(pc)) {
return(integer(0L))
}
which(pc$parent <= 0L)
}
# workaround for bad parse data bug for octal escapes
# https://bugs.r-project.org/show_bug.cgi?id=18323
fix_octal_escapes <- function(pc, lines) {
# subset first to prevent using nchar() on MBCS input
is_str_const <- pc$token == "STR_CONST"
str_const <- pc[is_str_const, ]
str_const_mismatch <- str_const$col2 - str_const$col1 != nchar(str_const$text) - 1L
if (!any(str_const_mismatch)) {
return(pc)
}
str_const <- str_const[str_const_mismatch, ]
out <- character(nrow(str_const))
single_line <- str_const$line1 == str_const$line2
out[single_line] <- substr(
lines[str_const$line1[single_line]],
str_const$col1[single_line],
str_const$col2[single_line]
)
for (ii in which(!single_line)) {
out[ii] <- paste(
c(
substring(lines[str_const$line1[ii]], str_const$col1[ii]),
if (str_const$line1[ii] < str_const$line2[ii] - 1L) {
lines[(str_const$line1[ii] + 1L):(str_const$line2[ii] - 1L)]
},
substr(lines[str_const$line2[ii]], 1L, str_const$col2[ii])
),
collapse = "\n"
)
}
pc$text[is_str_const][str_const_mismatch] <- out
pc
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.