Nothing
# Tokenizer: Lexes Arl source code into tokens (LPAREN, RPAREN, SYMBOL, NUMBER, STRING,
# QUOTE, QUASIQUOTE, etc.) with line/col. Delimiters control what counts as a token boundary.
#
# @field delimiters Character vector of single-character delimiters.
#
#' @keywords internal
#' @noRd
# Module-level regex pattern for tokenization (compiled once at package load)
# Ordered alternation: PCRE tries each branch in order, first match wins.
.TOKEN_PATTERN <- paste0(
'"(?:[^"\\\\]|\\\\.)*"', # (1) Complete string
'|"(?:[^"\\\\]|\\\\.)*', # (2) Unterminated string (error detect)
'|;[^\\n]*', # (3) Comment
'|,@', # (4) Unquote-splicing
'|[()]', # (5) Parens
"|'", # (6) Quote
'|`', # (7) Quasiquote
'|,', # (8) Unquote
'|:::', # (9) Triple colon (before ::)
'|::', # (10) Double colon (before :keyword)
"|:[^ \\t\\n\\r()'`,;@\":]+", # (11) Keyword
"|[^ \\t\\n\\r()'`,;@\":]+(?::{2,3}[^ \\t\\n\\r()'`,;@\":]+)*", # (12) Atom
'|:' # (13) Lone colon
)
# Process string escape sequences using fixed gsub (C-level)
# Sentinel approach ensures \\n -> \ + n (not newline)
.process_string_escapes <- function(s) {
s <- gsub("\\\\", "\001", s, fixed = TRUE) # \\ -> sentinel
s <- gsub("\\n", "\n", s, fixed = TRUE) # \n -> newline
s <- gsub("\\t", "\t", s, fixed = TRUE) # \t -> tab
s <- gsub("\\r", "\r", s, fixed = TRUE) # \r -> CR
s <- gsub("\\\"", "\"", s, fixed = TRUE) # \" -> quote
s <- gsub("\001", "\\", s, fixed = TRUE) # sentinel -> backslash
s
}
# Reverse of .process_string_escapes: convert literal special chars to escape sequences
unescape_string <- function(s) {
s <- gsub("\\", "\001", s, fixed = TRUE) # backslash -> sentinel
s <- gsub("\"", "\\\"", s, fixed = TRUE) # quote -> \"
s <- gsub("\n", "\\n", s, fixed = TRUE) # newline -> \n
s <- gsub("\t", "\\t", s, fixed = TRUE) # tab -> \t
s <- gsub("\r", "\\r", s, fixed = TRUE) # CR -> \r
s <- gsub("\001", "\\\\", s, fixed = TRUE) # sentinel -> \\
s
}
# Classify an atom token text into its type and value
.classify_atom <- function(text) {
# Standalone dot
if (text == ".") {
return(list(type = "DOT", value = "."))
}
# Booleans
if (text == "#t") return(list(type = "BOOLEAN", value = TRUE))
if (text == "#f") return(list(type = "BOOLEAN", value = FALSE))
if (text == "TRUE") return(list(type = "BOOLEAN", value = TRUE))
if (text == "FALSE") return(list(type = "BOOLEAN", value = FALSE))
# Nil
if (text == "#nil") return(list(type = "NIL", value = NULL))
if (text == "NULL") return(list(type = "NIL", value = NULL))
# Special numeric constants
if (text == "Inf") return(list(type = "NUMBER", value = Inf))
if (text == "-Inf") return(list(type = "NUMBER", value = -Inf))
if (text == "NaN") return(list(type = "NUMBER", value = NaN))
# NA variants
if (text == "NA") return(list(type = "NA", value = NA))
if (text == "NA_real_") return(list(type = "NA", value = NA_real_))
if (text == "NA_integer_") return(list(type = "NA", value = NA_integer_))
if (text == "NA_character_") return(list(type = "NA", value = NA_character_))
if (text == "NA_complex_") return(list(type = "NA", value = NA_complex_))
# Fast pre-check: only attempt numeric parsing if the first character suggests a number
nc <- nchar(text)
ch1 <- substr(text, 1L, 1L)
is_sign <- (ch1 == "+" || ch1 == "-")
ch_after_sign <- if (is_sign && nc > 1L) substr(text, 2L, 2L) else ""
starts_numeric <- (ch1 >= "0" && ch1 <= "9") || (is_sign && ch_after_sign >= "0" && ch_after_sign <= "9")
if (starts_numeric) {
last_ch <- substr(text, nc, nc)
# Integer literal (e.g., 4L, 42L, -10L)
if (last_ch == "L" && grepl("^[-+]?\\d+L$", text)) {
num_str <- substr(text, 1L, nc - 1L)
return(list(type = "NUMBER", value = as.integer(num_str)))
}
# Complex/imaginary (ends with 'i')
if (last_ch == "i") {
# Full complex number (e.g., 2+4i, 3.14-2.5i)
if (grepl("^[-+]?\\d+(\\.\\d+)?([eE][-+]?\\d+)?[+-]\\d+(\\.\\d+)?([eE][-+]?\\d+)?i$", text)) {
start_pos <- 1L
if (is_sign) start_pos <- 2L
sep_pos <- NULL
for (j in start_pos:nc) {
ch <- substr(text, j, j)
if (ch == "+" || ch == "-") {
sep_pos <- j
break
}
}
real_str <- substr(text, 1L, sep_pos - 1L)
imag_str <- substr(text, sep_pos, nc - 1L)
return(list(type = "NUMBER", value = complex(real = as.numeric(real_str), imaginary = as.numeric(imag_str))))
}
# Pure imaginary number (e.g., 4i, 3.14i, -2.5i)
if (grepl("^[-+]?\\d+(\\.\\d+)?([eE][-+]?\\d+)?i$", text)) {
num_str <- substr(text, 1L, nc - 1L)
return(list(type = "NUMBER", value = complex(real = 0, imaginary = as.numeric(num_str))))
}
}
# Regular number (including scientific notation)
if (grepl("^[-+]?\\d+(\\.\\d+)?([eE][-+]?\\d+)?$", text)) {
return(list(type = "NUMBER", value = as.numeric(text)))
}
}
# Default: symbol
list(type = "SYMBOL", value = text)
}
Tokenizer <- R6::R6Class(
"ArlTokenizer",
public = list(
delimiters = NULL,
# @description Create tokenizer with default or custom delimiters.
# @param delimiters Optional character vector; default includes space, parens, quote, comma, etc.
initialize = function(delimiters = NULL) {
if (is.null(delimiters)) {
delimiters <- c(" ", "\t", "\n", "\r", "(", ")", "'", "`", ",", ";", "@", '"', ":")
}
self$delimiters <- delimiters
},
# @description Lex source string into a list of tokens (each list(type, value, line, col)).
# @param source Character string of Arl source code.
# @return List of token lists.
tokenize = function(source) {
if (nchar(source) == 0L) return(list())
# Phase 1: Single C-level regex pass
m <- gregexpr(.TOKEN_PATTERN, source, perl = TRUE)[[1]]
# No matches
if (length(m) == 1L && m[1L] == -1L) return(list())
match_starts <- as.integer(m)
match_lengths <- attr(m, "match.length")
n_matches <- length(match_starts)
# Extract matched texts
texts <- substring(source, match_starts, match_starts + match_lengths - 1L)
# Phase 2: Vectorized line/col computation
newline_pos <- gregexpr("\n", source, fixed = TRUE)[[1]]
if (length(newline_pos) == 1L && newline_pos[1L] == -1L) {
# No newlines - everything is on line 1
lines <- rep(1L, n_matches)
cols <- match_starts
} else {
line_starts <- c(1L, newline_pos + 1L)
lines <- findInterval(match_starts, line_starts)
cols <- match_starts - line_starts[lines] + 1L
}
# Phase 3: Classification loop
tokens <- vector("list", n_matches) # pre-allocate
tok_idx <- 0L
for (k in seq_len(n_matches)) {
txt <- texts[k]
ln <- lines[k]
cl <- cols[k]
ch1 <- substr(txt, 1L, 1L)
if (ch1 == ";") {
# Comment - skip
next
}
if (ch1 == "\"") {
# String - check if terminated
if (substr(txt, nchar(txt), nchar(txt)) != "\"" || nchar(txt) < 2L) {
stop(sprintf("Unterminated string at line %d, column %d", ln, cl))
}
# Strip quotes and process escapes
inner <- substr(txt, 2L, nchar(txt) - 1L)
val <- .process_string_escapes(inner)
tok_idx <- tok_idx + 1L
tokens[[tok_idx]] <- list(type = "STRING", value = val, line = ln, col = cl)
next
}
if (ch1 == "(") {
tok_idx <- tok_idx + 1L
tokens[[tok_idx]] <- list(type = "LPAREN", value = "(", line = ln, col = cl)
next
}
if (ch1 == ")") {
tok_idx <- tok_idx + 1L
tokens[[tok_idx]] <- list(type = "RPAREN", value = ")", line = ln, col = cl)
next
}
if (ch1 == "'") {
tok_idx <- tok_idx + 1L
tokens[[tok_idx]] <- list(type = "QUOTE", value = "'", line = ln, col = cl)
next
}
if (ch1 == "`") {
tok_idx <- tok_idx + 1L
tokens[[tok_idx]] <- list(type = "QUASIQUOTE", value = "`", line = ln, col = cl)
next
}
if (txt == ",@") {
tok_idx <- tok_idx + 1L
tokens[[tok_idx]] <- list(type = "UNQUOTE_SPLICING", value = ",@", line = ln, col = cl)
next
}
if (ch1 == ",") {
tok_idx <- tok_idx + 1L
tokens[[tok_idx]] <- list(type = "UNQUOTE", value = ",", line = ln, col = cl)
next
}
if (ch1 == ":") {
# :: or ::: operators
if (txt == "::") {
tok_idx <- tok_idx + 1L
tokens[[tok_idx]] <- list(type = "SYMBOL", value = "::", line = ln, col = cl)
next
}
if (txt == ":::") {
tok_idx <- tok_idx + 1L
tokens[[tok_idx]] <- list(type = "SYMBOL", value = ":::", line = ln, col = cl)
next
}
# Lone colon (empty keyword)
if (txt == ":") {
stop(sprintf("Empty keyword at line %d, column %d", ln, cl))
}
# Keyword - strip leading colon
tok_idx <- tok_idx + 1L
tokens[[tok_idx]] <- list(type = "KEYWORD", value = substr(txt, 2L, nchar(txt)), line = ln, col = cl)
next
}
# Atom (everything else) - classify
atom <- .classify_atom(txt)
tok_idx <- tok_idx + 1L
tokens[[tok_idx]] <- list(type = atom$type, value = atom$value, line = ln, col = cl)
}
# Trim pre-allocated list to actual size
if (tok_idx == 0L) return(list())
tokens[seq_len(tok_idx)]
}
)
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.