R/parser.R

# Parser: Converts token lists from Tokenizer into Arl S-expressions (R calls). Expands
# quote/quasiquote sugar into explicit forms. Uses source_tracker for source locations.

# Module-level constant for quote sugar mapping (allocated once at package load)
.SUGAR_MAP <- list(
  QUOTE = "quote",
  QUASIQUOTE = "quasiquote",
  UNQUOTE = "unquote",
  UNQUOTE_SPLICING = "unquote-splicing"
)
#
# @field source_tracker SourceTracker for attaching source info to expressions.
#
#' @keywords internal
#' @noRd
Parser <- R6::R6Class(
  "ArlParser",
  public = list(
    source_tracker = NULL,
    # @description Create parser with optional source tracker.
    # @param source_tracker Optional SourceTracker; used for src on parsed expressions.
    initialize = function(source_tracker = NULL) {
      self$source_tracker <- source_tracker
    },
    # @description Parse a list of tokens into a list of Arl expressions (R calls/symbols/atoms).
    # @param tokens List of tokens from Tokenizer$tokenize().
    # @param source_name Optional name for error reporting and source attachment.
    # @return List of expressions.
    parse = function(tokens, source_name = NULL) {
      pos <- 1
      expressions <- list()
      tracker <- self$source_tracker
      if (is.null(tracker)) {
        tracker <- SourceTracker$new()
      }

      parse_expr <- function() {
        if (pos > length(tokens)) {
          stop("Unexpected end of input")
        }

        token <- tokens[[pos]]

        make_src <- function(start_token, end_src = NULL) {
          end_line <- start_token$line
          end_col <- start_token$col
          if (!is.null(end_src)) {
            if (!is.null(end_src$end_line)) {
              end_line <- end_src$end_line
            }
            if (!is.null(end_src$end_col)) {
              end_col <- end_src$end_col
            }
          }
          tracker$src_new(source_name, start_token$line, start_token$col, end_line, end_col)
        }

        # Quote/quasiquote/unquote sugar
        if (token$type %in% names(.SUGAR_MAP)) {
          pos <<- pos + 1
          quoted <- parse_expr()
          op <- .SUGAR_MAP[[token$type]]
          expr <- as.call(list(as.symbol(op), quoted))
          return(tracker$src_set(expr, make_src(token, tracker$src_get(quoted))))
        }

        # Atoms
        if (token$type == "NUMBER") {
          pos <<- pos + 1
          return(tracker$src_set(token$value, make_src(token)))
        }

        if (token$type == "STRING") {
          pos <<- pos + 1
          return(tracker$src_set(token$value, make_src(token)))
        }

        if (token$type == "BOOLEAN") {
          pos <<- pos + 1
          return(tracker$src_set(token$value, make_src(token)))
        }

        if (token$type == "NIL") {
          pos <<- pos + 1
          # Use a sentinel symbol so #nil survives in (as.call elements); NULL would be dropped.
          return(tracker$src_set(as.symbol(".__nil"), make_src(token)))
        }

        if (token$type == "NA") {
          pos <<- pos + 1
          return(tracker$src_set(token$value, make_src(token)))
        }

        if (token$type == "SYMBOL") {
          pos <<- pos + 1

          # Handle / qualified access: math/sin -> (module-ref math sin)
          # Bare "/" stays as division symbol
          # R's %op% infix operators (e.g. %/%, %*%) are exempt
          if (token$value != "/" && grepl("/", token$value, fixed = TRUE) &&
              !grepl("^%.*%$", token$value)) {
            parts <- strsplit(token$value, "/", fixed = TRUE)[[1]]
            # Guard: all parts must be non-empty (reject "foo/" or "foo//bar")
            if (length(parts) >= 2 && all(nzchar(parts))) {
              # Build nested (module-ref ...) calls: a/b/c -> (module-ref (module-ref a b) c)
              expr <- as.symbol(parts[1])
              for (i in seq(2, length(parts))) {
                expr <- as.call(list(as.symbol("module-ref"), expr, as.symbol(parts[i])))
              }
              return(tracker$src_set(expr, make_src(token)))
            }
            # Invalid format (empty parts), fall through to return as-is
          }

          # Handle :: and ::: syntactic sugar
          # Check for ::: first (3 colons) as it's more specific
          if (grepl(":::", token$value, fixed = TRUE)) {
            parts <- strsplit(token$value, ":::", fixed = TRUE)[[1]]
            if (length(parts) == 2 && nzchar(parts[1]) && nzchar(parts[2])) {
              # Convert pkg:::name to (::: pkg name)
              expr <- as.call(list(as.symbol(":::"), as.symbol(parts[1]), as.symbol(parts[2])))
              return(tracker$src_set(expr, make_src(token)))
            }
            # Invalid format, fall through to return as-is
          } else if (grepl("::", token$value, fixed = TRUE)) {
            parts <- strsplit(token$value, "::", fixed = TRUE)[[1]]
            if (length(parts) == 2 && nzchar(parts[1]) && nzchar(parts[2])) {
              # Convert pkg::name to (:: pkg name)
              expr <- as.call(list(as.symbol("::"), as.symbol(parts[1]), as.symbol(parts[2])))
              return(tracker$src_set(expr, make_src(token)))
            }
            # Invalid format, fall through to return as-is
          }

          return(tracker$src_set(as.symbol(token$value), make_src(token)))
        }

        if (token$type == "KEYWORD") {
          pos <<- pos + 1
          # Store keywords as a special structure
          # We'll use this in the compiler/runtime to convert to named arguments
          keyword <- structure(token$value, class = "arl_keyword")
          return(tracker$src_set(keyword, make_src(token)))
        }

        # Standalone dot: treat as symbol . (e.g. lambda (. args) rest parameter)
        if (token$type == "DOT") {
          pos <<- pos + 1
          return(tracker$src_set(as.symbol("."), make_src(token)))
        }

        # Lists (S-expressions)
        if (token$type == "LPAREN") {
          pos <<- pos + 1
          start_token <- token
          # Use c() with list() to preserve NULL values (elements[[i]] <- NULL doesn't work)
          # But collect in chunks to avoid O(n^2) behavior
          elements <- list()
          chunk <- vector("list", 32)  # Collect in chunks of 32
          chunk_idx <- 1
          seen_dot <- FALSE
          dot_cdr <- NULL

          dotted_heads <- NULL

          while (pos <= length(tokens) && tokens[[pos]]$type != "RPAREN") {
            elem <- parse_expr()
            # After first element(s), DOT means dotted pair: (a . b) or (a b . c)
            if (pos <= length(tokens) && tokens[[pos]]$type == "DOT") {
              # (a .) with no cdr: parse as list (a .) so lambda/macro can error on invalid formals
              if (pos + 1 <= length(tokens) && tokens[[pos + 1]]$type == "RPAREN") {
                chunk[[chunk_idx]] <- elem
                chunk_idx <- chunk_idx + 1
                next
              }
              pos <<- pos + 1  # consume DOT
              if (pos > length(tokens) || tokens[[pos]]$type == "RPAREN") {
                tok <- tokens[[pos - 1]]
                stop(sprintf("Missing cdr after '.' at line %d, column %d", tok$line, tok$col))
              }
              dot_cdr <- parse_expr()
              if (chunk_idx > 1) {
                elements <- c(elements, chunk[1:(chunk_idx - 1)])
              }
              dotted_heads <- c(elements, list(elem))
              seen_dot <- TRUE
              break
            }
            chunk[[chunk_idx]] <- elem
            chunk_idx <- chunk_idx + 1

            # Flush chunk when full
            if (chunk_idx > 32) {
              elements <- c(elements, chunk)
              chunk <- vector("list", 32)
              chunk_idx <- 1
            }
          }

          # Flush remaining chunk
          if (chunk_idx > 1) {
            elements <- c(elements, chunk[1:(chunk_idx - 1)])
          }

          if (pos > length(tokens)) {
            stop(sprintf("Unclosed parenthesis at line %d, column %d", token$line, token$col))
          }

          end_token <- tokens[[pos]]
          pos <<- pos + 1  # Skip RPAREN

          span <- tracker$src_new(
            source_name, start_token$line, start_token$col,
            end_token$line, end_token$col)

          # Dotted pair: (a . b) or (a b . c)
          if (seen_dot && !is.null(dot_cdr) && length(dotted_heads) > 0) {
            result <- dot_cdr
            for (j in rev(seq_along(dotted_heads))) {
              result <- Cons$new(dotted_heads[[j]], result)
            }
            return(tracker$src_set(result, span))
          }

          # Empty list
          if (length(elements) == 0) {
            return(tracker$src_set(list(), span))
          }

          # Convert to R call
          expr <- as.call(elements)
          return(tracker$src_set(expr, span))
        }

        if (token$type == "RPAREN") {
          stop(sprintf("Unexpected ')' at line %d, column %d", token$line, token$col))
        }

        stop(sprintf("Unexpected token type: %s", token$type))
      }

      chunk <- vector("list", 32)
      chunk_idx <- 1L

      while (pos <= length(tokens)) {
        expr <- parse_expr()
        chunk[[chunk_idx]] <- expr
        chunk_idx <- chunk_idx + 1L

        if (chunk_idx > 32L) {
          expressions <- c(expressions, chunk)
          chunk <- vector("list", 32)
          chunk_idx <- 1L
        }
      }

      # Flush remaining chunk
      if (chunk_idx > 1L) {
        expressions <- c(expressions, chunk[seq_len(chunk_idx - 1L)])
      }

      expressions
    },

    # @description Convert an Arl expression back to its string representation.
    # Inverse of parse(): the output can be parsed back with read().
    # @param expr A Arl expression (symbol, call, atomic, Cons, keyword, etc.).
    # @return Character string.
    write = function(expr) {
      # Strip source tracking attribute so deparse() is clean
      attr(expr, "arl_src") <- NULL

      # NULL -> #nil
      if (is.null(expr)) return("#nil")

      # Cons (dotted pair) -> (a b . c)
      if (inherits(expr, "ArlCons")) {
        p <- expr$parts()
        elems <- vapply(p$prefix, function(e) self$write(e), character(1))
        tail_str <- self$write(p$tail)
        return(paste0("(", paste(elems, collapse = " "), " . ", tail_str, ")"))
      }

      # Keyword -> :foo
      if (inherits(expr, "arl_keyword")) {
        return(paste0(":", as.character(expr)))
      }

      # Symbol
      if (is.symbol(expr)) {
        name <- as.character(expr)
        if (name == ".__nil") return("#nil")
        return(name)
      }

      # Logical (#t, #f, NA)
      if (is.logical(expr) && length(expr) == 1L) {
        if (is.na(expr)) return("NA")
        return(if (expr) "#t" else "#f")
      }

      # String (including NA_character_)
      if (is.character(expr) && length(expr) == 1L) {
        if (is.na(expr)) return("NA_character_")
        return(paste0('"', unescape_string(expr), '"'))
      }

      # Integer (including NA_integer_)
      if (is.integer(expr) && length(expr) == 1L) {
        if (is.na(expr)) return("NA_integer_")
        return(paste0(as.character(expr), "L"))
      }

      # Double (NaN, Inf, -Inf, NA_real_, regular)
      if (is.double(expr) && length(expr) == 1L) {
        if (is.nan(expr)) return("NaN")
        if (is.infinite(expr)) return(if (expr > 0) "Inf" else "-Inf")
        if (is.na(expr)) return("NA_real_")
        return(deparse(expr))
      }

      # Complex (including NA_complex_)
      if (is.complex(expr) && length(expr) == 1L) {
        if (is.na(expr)) return("NA_complex_")
        return(deparse(expr))
      }

      # Empty list -> ()
      if (is.list(expr) && length(expr) == 0L) {
        return("()")
      }

      # Call (S-expression)
      if (is.call(expr)) {
        if (is.symbol(expr[[1L]])) {
          head_name <- as.character(expr[[1L]])

          # Sugar: (quote x) -> 'x, etc. (2-element forms)
          if (length(expr) == 2L) {
            sugar <- switch(head_name,
              "quote" = "'",
              "quasiquote" = "`",
              "unquote" = ",",
              "unquote-splicing" = ",@",
              NULL
            )
            if (!is.null(sugar)) {
              return(paste0(sugar, self$write(expr[[2L]])))
            }
          }

          # Sugar: (:: pkg name) -> pkg::name, (::: pkg name) -> pkg:::name
          if (head_name %in% c("::", ":::") && length(expr) == 3L &&
              is.symbol(expr[[2L]]) && is.symbol(expr[[3L]])) {
            return(paste0(
              as.character(expr[[2L]]), head_name, as.character(expr[[3L]])
            ))
          }

          # Sugar: (module-ref lhs rhs) -> lhs/rhs
          # Nested module-ref naturally produces a/b/c via recursion
          if (head_name == "module-ref" && length(expr) == 3L &&
              is.symbol(expr[[3L]])) {
            lhs <- self$write(expr[[2L]])
            rhs <- as.character(expr[[3L]])
            return(paste0(lhs, "/", rhs))
          }
        }

        # General call: (f a b ...)
        elems <- vapply(
          seq_along(expr),
          function(i) self$write(expr[[i]]),
          character(1)
        )
        return(paste0("(", paste(elems, collapse = " "), ")"))
      }

      stop(sprintf("write: cannot serialize object of class %s",
                   paste(class(expr), collapse = "/")), call. = FALSE)
    }
  )
)

Try the arl package in your browser

Any scripts or data that you put into this service are public.

arl documentation built on March 19, 2026, 5:09 p.m.