# Vignette chunk defaults: show code + output together, prefix output with "#>"
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(rly)
{rly} consists of two files: lex.R and yacc.R
. There's quite a bit "going on" in those two files, and they may be helpful examples of how to structure {R6} classes.
The inst/examples
directory contains several different examples. Here are some more.
We can build a "lexer" to pull out URLs from text input. Sure, we could just use stringi::stri_match_all_regex()
for this particular example, but the intent is to work with a known, straightforward domain to see how "lexing" works:
# The only token type we care about in this example: URLs.
TOKENS <- c("URL")

# The "lexing" rules live in an {R6} class. If {R6} is new to you,
# see <https://cran.rstudio.com/web/packages/R6/> and browse packages
# that depend on it -- reading real uses helps it click.
URLexer <- R6::R6Class(

  classname = "Lexer",

  public = list(

    # declare the token types this lexer can emit
    tokens = TOKENS,

    # Rules use a `t_` prefix plus the token name. The `re` default
    # holds the regular expression for the token; here it's a toy URL
    # matcher: "http" or "https" up to the next whitespace character.
    #
    # `t` is the full token-parser context at the moment this token is
    # recognized. We just announce the match and hand the (possibly
    # modified) token back, but arbitrary logic is possible here.
    t_URL = function(re = 'http[s]*://[^[:space:]]+', t) {
      message("Found URL: ", t$value) # be chatty about each hit
      return(t)                       # must return the token to keep it
    },

    # Bump the line counter on every run of newlines so errors can be
    # reported with line context. Returning NULL discards the token.
    t_newline = function(re='\\n+', t) {
      t$lexer$lineno <- t$lexer$lineno + nchar(t$value)
      return(NULL)
    },

    # A well-behaved lexer validates all input, but since we only want
    # URLs we skip past anything we don't recognize, one char at a time.
    t_error = function(t) {
      t$lexer$skip(1)
      return(t)
    }

  )

)

# Build the lexer from the rule class
lexer <- rly::lex(URLexer)

# Hand it the text to scan
lexer$input(s = " http://google.com https://rstudio.org/ Not a URL https://rud.is/b Another non-URL https://r-project.org/ https://one.more.url/with/some/extra/bits.html ")

# accumulator for matched URLs ("rly" inefficient, but fine for a demo)
found_urls <- character(0)

# accumulator for info about each invalid token (also inefficient)
invalid <- list()

# Pull tokens one at a time until the stream is exhausted
repeat {

  tok <- lexer$token()

  if (is.null(tok)) break # NULL token == end of input

  if (tok$type == "URL") {
    # a token our URL rule matched
    found_urls <- c(found_urls, tok$value)
  } else if (tok$type == "error") {
    # something the lexer could not classify; record where it happened
    invalid <- c(invalid, list(data.frame(
      bad_thing = tok$value,
      stream_pos = tok$lexpos,
      line = tok$lineno,
      stringsAsFactors = FALSE
    )))
  }

}

invalid <- do.call(rbind.data.frame, invalid)

nrow(invalid) # number of errors

head(invalid, 10) # it'll be clear we never told it about whitespace

found_urls # the good stuff
We can extend this to do things with different types of URIs (not necessarily "http"-ish URLs).
# Distinct token types for HTTP URLs, HTTPS URLs, and MAILTO URIs
TOKENS <- c("HTTP_URL", "HTTPS_URL", "MAILTO_URL")

URLexer <- R6::R6Class(

  classname = "Lexer",

  public = list(

    tokens = TOKENS,

    # One matcher per token type. Each just announces the hit and
    # returns the token so it flows through to the caller.
    t_HTTPS_URL = function(re = 'https://[^[:space:]]+', t) {
      message("Found HTTPS URL: ", t$value)
      return(t)
    },

    t_MAILTO_URL = function(re = 'mailto:[^[:space:]]+', t) {
      message("Found MAILTO URL: ", t$value)
      return(t)
    },

    t_HTTP_URL = function(re = 'http://[^[:space:]]+', t) {
      message("Found HTTP URL: ", t$value)
      return(t)
    },

    # Skip anything we don't match; without skip(1) the lexer would
    # error out on unrecognized tokens (which is usually what we want).
    t_error = function(t) {
      t$lexer$skip(1)
      return(t)
    }

  )

)

# Build the lexer from the rule class
lexer <- rly::lex(URLexer)

# Hand it the text to scan
lexer$input(s = " http://google.com https://rstudio.org/ Not a URL https://rud.is/b Another non-URL mailto:fred@example.com?subject=Hello https://r-project.org/ mailto:steve@example.com https://one.more.url/with/some/extra/bits.html ")

# one accumulator per token type
http_urls <- character(0)
https_urls <- character(0)
mailto_urls <- character(0)

# Consume tokens until the stream runs dry, routing by type
repeat {

  tok <- lexer$token()

  if (is.null(tok)) break # NULL token == end of input

  switch(
    tok$type,
    HTTP_URL = http_urls <- c(http_urls, tok$value),
    HTTPS_URL = https_urls <- c(https_urls, tok$value),
    MAILTO_URL = mailto_urls <- c(mailto_urls, tok$value)
  )

}

http_urls

https_urls

mailto_urls
Here is an example showing a {rly} implementation of a calculator with variables.
# Relax R's length-1 condition/logic checks for the examples below.
# NOTE: these check variables start with a SINGLE leading underscore
# ("_R_CHECK_..."); the doubled-underscore spelling used previously
# ("__R_CHECK_...") is not recognized by R, so the settings silently
# had no effect.
Sys.setenv("_R_CHECK_LENGTH_1_CONDITION_" = FALSE)
Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = FALSE)
Let's bootstrap our tokenizer/lexer:
# Named tokens the lexer will emit
TOKENS <- c('NAME', 'NUMBER')

# Single-character "lexemes" passed through literally
# (ref: https://stackoverflow.com/questions/14954721/what-is-the-difference-between-a-token-and-a-lexeme)
LITERALS <- c('=', '+', '-', '*', '/', '(', ')')

Lexer <- R6::R6Class(

  classname = "Lexer",

  public = list(

    tokens = TOKENS,
    literals = LITERALS,

    # identifiers: letter/underscore then letters/digits/underscores
    t_NAME = '[a-zA-Z_][a-zA-Z0-9_]*',

    # numbers: runs of digits, converted to integer before emitting
    t_NUMBER = function(re='\\d+', t) {
      t$value <- strtoi(t$value)
      return(t)
    },

    # whitespace we silently drop
    t_ignore = " \t",

    # track line numbers for better error context; drop the token
    t_newline = function(re='\\n+', t) {
      t$lexer$lineno <- t$lexer$lineno + nchar(t$value)
      return(NULL)
    },

    # report the offending character, then skip past it
    t_error = function(t) {
      cat(sprintf("Illegal character '%s'", t$value[1]))
      t$lexer$skip(1)
      return(t)
    }

  )

)
Now, we write our expression parser. Note that we use TOKENS
and LITERALS
from the lexer we just wrote so the parser has some context for what gets passed to it as the tokenizer emits bits to it:
Parser <- R6::R6Class(

  classname = "Parser",

  public = list(

    # same token/literal vocabulary as the lexer
    tokens = TOKENS,
    literals = LITERALS,

    # Operator precedence, lowest first; UMINUS is a fictitious token
    # used only to give unary minus a higher precedence level.
    precedence = list(
      c('left', '+', '-'),
      c('left', '*', '/'),
      c('right', 'UMINUS')
    ),

    # variable store for assignments (an env is handy, if not speedy)
    names = new.env(hash=TRUE),

    # statement form #1: NAME = expression (an assignment)
    p_statement_assign = function(doc='statement : NAME "=" expression', p) {
      self$names[[as.character(p$get(2))]] <- p$get(4)
    },

    # statement form #2: a bare expression -- print its value
    p_statement_expr = function(doc='statement : expression', p) {
      cat(p$get(2))
      cat('\n')
    },

    # the four binary arithmetic operators; p$get(3) holds the operator
    # character, p$get(2)/p$get(4) the left/right operand values
    p_expression_binop = function(doc="expression : expression '+' expression | expression '-' expression | expression '*' expression | expression '/' expression", p) {
      lhs <- p$get(2)
      rhs <- p$get(4)
      p$set(1, switch(
        p$get(3),
        '+' = lhs + rhs,
        '-' = lhs - rhs,
        '*' = lhs * rhs,
        '/' = lhs / rhs
      ))
    },

    # Unary minus needs special handling; %prec UMINUS borrows the
    # higher precedence level declared above. See
    # https://www.ibm.com/support/knowledgecenter/en/SSLTBW_2.3.0/com.ibm.zos.v2r3.bpxa600/bpxa698.htm
    # for a %prec explanation. Note: rule order matters somewhat in
    # both lexer and parser specifications.
    p_expression_uminus = function(doc="expression : '-' expression %prec UMINUS", p) {
      p$set(1, -p$get(3))
    },

    # parenthesized expression: value is whatever is inside the parens
    p_expression_group = function(doc="expression : '(' expression ')'", p) {
      p$set(1, p$get(3))
    },

    # a bare number is already a value
    p_expression_number = function(doc='expression : NUMBER', p) {
      p$set(1, p$get(2))
    },

    # a bare name looks its value up in the variable store
    p_expression_name = function(doc='expression : NAME', p) {
      p$set(1, self$names[[as.character(p$get(2))]])
    },

    # syntax-error reporting; p is NULL at end-of-input
    p_error = function(p) {
      if(is.null(p)) {
        cat("Syntax error at EOF")
      } else {
        cat(sprintf("Syntax error at '%s'", p$value))
      }
    }

  )

)

lexer <- lex(Lexer)
parser <- yacc(Parser)

# Each parse ends with `NULL` -- that's how the parser signals it's done

parser$parse("3", lexer)

parser$parse("3 + 5", lexer)

parser$parse("3 + 5 * 10 - 100", lexer)

# lexes fine but the names hold no data yet; a real calculator would error here
parser$parse("A + B * C - D", lexer)

# not valid grammar: can't assign to an expression
parser$parse("A + B * C - D = E", lexer)

# valid syntax; no printed value because it's just an assignment
parser$parse("A = 1 + 2", lexer)

parser$parse("A", lexer)

# invisible() used only to suppress the useless trailing NULLs
invisible(parser$parse("B = 5", lexer))
invisible(parser$parse("C = 10", lexer))
invisible(parser$parse("D = 100", lexer))

parser$parse("A + B * C - D", lexer)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.