R/parse.R

Defines functions comm_reset comm_and_call_extract comm_extract symb_mark_rem prsdat_fix_exprlist prsdat_find_paren prsdat_fix_while prsdat_fix_if prsdat_fix_simple prsdat_fix_for prsdat_fix_fun prsdat_reduce parse_tests parse_dat_get parse_with_comments comments_assign ancestry_descend top_level_parse_parents

# Copyright (C) Brodie Gaslam
#
# This file is part of "unitizer - Interactive R Unit Tests"
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.


# Strategy for parsing comments:
#
# Look up all entries with parent == 0; these are the top level entries
# All the comments that are -id are top level comments. The id refers to the
# next statement. Comments don't have parents when they are the last thing in
# the file.  So logic, using getParseData():
#
#
# - Get all top level ids
# - Get all comments that have -ids or 0 ids
# - For each comment with -id, check whether comment line is same as the previous
#   top level comment end line
#   + if yes, associate with that statement
#   + if not, associate with the -id statement
#  - For zero ids, just check if on same line as last top level statement
#
# After this, need to break up the data frame into pieces based on what the
# top level parent is.  This is fine and well, though we need to reset the
# top level parents so we can do this recursively for unitizer_sect
#
# Unfortunately, this gets seriously complicated by the fact that comments
# inside calls have for parent the call, irrespective of where they are.
# So we need to figure out (ONLY FOR TOP LEVEL COMMENTS):
# - is a comment on it's own line?
# - if yes, find the next "top level" item
# - if no, find previous "top level" item
#
# Basically, the process is as follows:
# 1. define top level (starts of at zero, and for zero, make sure
#    any negative parents are zero)
# 2. Split data frame by top level, and:
#    - assign comments to each top level object
#    - assign data frame chunks to each top level object
# 3. Recurse through all the objects until we get to terminals
#
# Net result should be an expression that, for each non terminal
# object will have a piece of the original parse data frame attached,
# as well as comments (do we need the original parse data frame, maybe
# not if we process it all in one go?)


# Searches Through Generations Until it Finds Top Level
#
# Returns the id of the ancestor that is just before \code{`top.level`},
# or \code{`top.level`} if the parent already is \code{`top.level`}. The
# idea is to reduce the set of parents for all elements to just the top
# level parents as this allows us to split the parse data into sections,
# including the calls that were direct children of top level, as well as
# the children to those sections.
#
# @param ids integer the ids to look through
# @param par.ids integer the parenthood relationships
# @param top.level the id of the top level
# @return integer the top level parent ids for \code{`ids`}

top_level_parse_parents <- function(ids, par.ids, top.level=0L) {
  if(
    !is.integer(ids) || !is.integer(par.ids) ||
    !identical(length(ids), length(par.ids))
  )
    stop("Arguments `ids` and `par.ids` must be equal length integer vectors")
  if(!identical(length(setdiff(abs(par.ids), c(ids, top.level))), 0L))
    stop("Argument `par.ids` contains ids not in `ids`.")
  if(!is.integer(top.level) && !identical(length(top.level), 1L))
    stop("Argument `top.level` must be a one length integer")
  if(identical(top.level, 0L)) {
    par.ids <- pmax(0L, par.ids)
  } else if (any(par.ids) < 0) {
    # nocov start
    stop(
      "Advanced Parse Error: Argument `par.ids` contains values less than ",
      "zero, but that is only allowed when `top.level` == 0L."
    )
    # nocov end
  }
  # Create lookup matrix so we can look up ids directly.  This will be a slightly
  # sparse matrix to the extend `ids` doesn't contain every number between
  # range(ids).  The idea is to be able to lookup id-par pairs by direct index
  # access.  This is not super efficient since we keep recalculating some of the
  # data over and over with each recursion.

  id.range <- range(ids)
  if(id.range[[1L]] < 1L)
    stop("Advanced Parse Error: Expected only strictly positive unique ids")
  par.full <- rep(NA_integer_, id.range[[2L]])
  par.full[ids] <- par.ids
  res <- rep(NA_integer_, length(ids))

  for(i in seq_along(par.ids)) {
    cur.id <- new.id <- par.ids[[i]]
    while(cur.id != top.level) {
      new.id <- par.full[[cur.id]]
      if(is.na(new.id)) break;
      if(new.id == top.level) {
        new.id <- cur.id
        break;
      }
      cur.id <- new.id
    }
    res[[i]] <- new.id
  }
  res
}
# For Each ID Determines Generation
#
# @param ids integer() the object ids
# @param par.ids integer() the parents of each \code{ids}
# @param id integer() the first parent
# @return matrix containing ids and corresponding generation for the ids

ancestry_descend <- function(ids, par.ids, id, level=0L) {
  # Initialize result matrix, can be no bigger than ids

  max.size <- length(ids)
  res <- matrix(
    rep(NA_integer_, max.size * 2L), ncol=2L,
    dimnames=list(NULL, c("children", "level"))
  )
  ind.start <- 1L
  par.idx <- 1L
  par.list <- id
  id.split <- list2env(split(ids, par.ids))

  repeat {
    if(!length(par.list)) break
    child.len <-
      length(children <- id.split[[as.character(par.list[[par.idx]])]])
    if(child.len) {
      ind.end <- ind.start + child.len - 1L
      inds <- ind.start:ind.end
      res[inds, 1L] <- children
      res[inds, 2L] <- level
      ind.start <- ind.end + 1L
    }
    par.idx <- par.idx + 1L
    if(par.idx > length(par.list)) {
      par.list <- res[which(res[, 2L] == level), 1L]
      level <- level + 1L
      par.idx <- 1L
    }
  }
  res[!is.na(res[, 1L]), ]
}
# Need this to pass R CMD check; problems likely caused by `transform` and
# `subset`.

if(getRversion() >= "2.15.1")  utils::globalVariables(c("token", "col1", "line1"))

# Assign Comments From Parse Data to Expression Elements
#
# Based on parse data from \code{`\link{getParseData}`}, figures
# out what comments belong to what expression.  If a comment is
# on the same line as an expression, the comment is assigned to that
# expression (or whatever the nearest expression is on that line if
# there is more than one).  If a comment is on it's own line,
# then the match is done to the next expression.
#
# The expectation is that only "top level" expressions will
# be submitted as part of `comment.dat` (i.e. they all have
# the same parent, they don't strictly have to be top.level).
#
# @param expr and expression
# @param comment.dat a data frame derived from \code{`\link{getParseData}`}
# @return an expression with comments attached as attributes to each
#   expression component

comments_assign <- function(expr, comment.dat) {
  if(!identical(length(unique(comment.dat$parent)), 1L))
    stop( # nocov start
      "Advanced Parse Error: there were multiple parent ids in argument ",
      "`comment.dat`; this should not happen."
    )     # nocov end
  if(!length(expr) || !length(which(comment.dat$token == "COMMENT")))
    return(expr)

  # Make sure `comment.dat` is in format we understand
  # Theory: everything not "COMMENT" should be included, except:
  # - opening parens on second row (these denote a function call)
  # - closing braces of any kind on last row
  # Additionally, in order for stuff to match up properly, anything that is not
  # "expr" needs to be moved to the front (in theory, should be at most one thing
  # and should be an infix operator of some sort)

  if(
    !tail(comment.dat$token, 1L) %in%
    c("COMMENT", "expr", tk.lst$non.exps, tk.lst$brac.close, "';'")
  )
    # nocov start
    stop(
      "Advanced Parse Error: unexpected ending token in parse data."
    )
    # nocov end
  if(
    length(which(comment.dat$token %in% tk.lst$brac.open)) > 1L ||
    length(which(comment.dat$token %in% tk.lst$brac.close)) > 1L
  ) {
    stop( # nocov start
      "Advanced Parse Error: more than one bracket at top level."
    )     # nocov end
  }
  if(
    length(brac.pos <- which(comment.dat$token %in% tk.lst$brac.close)) &&
    !identical(brac.pos, nrow(comment.dat))
  ) {
    # nocov start
    # shouldn't happen, can't test
    if(
      !identical(comment.dat$token[brac.pos], "')'") ||
      !identical(brac.pos, nrow(comment.dat) - 1L) ||
      !identical(comment.dat$token[[1L]], "FUNCTION")
    ) {
      stop(
        "Advanced Parse Error: closing brackets may only be on last row, ",
        "unless a paren and part of a functions formal definition."
      )
    }
    # nocov end
  }
  if(
    !all(match(comment.dat$token, tk.lst$brac.open[-3L], 0L) <= 1L) ||
    !all(match(comment.dat$token, tk.lst$brac.open[3L], 0L) <= 2L)
  ) {
    stop(   # nocov start
      "Advanced Parse Error: opening brackets may only be on first row, ",
      "or second if paren."
    )       # nocov end
  }
  if(
    !identical(
      which(tk.lst$brac.open %in% comment.dat$token),
      which(tk.lst$brac.close %in% comment.dat$token)
    )
  )
    stop("Advanced Parse Error: mismatched brackets.") # nocov
  # extra.toks <- if(any(brac.open %in% comment.dat$token)) 2L else 1L
  # Trim our data to just what matters:

  comm.notcomm <- prsdat_reduce(comment.dat)
  if(!identical(nrow(comm.notcomm), length(expr))) {
    # nocov start
    stop(
      "Advanced Parse Error: Argument `expr` length cannot be matched with  ",
      "values in `comment.dat`"
    )
    # nocov end
  }
  # for the purposes of this process, constants and symbols are basically
  # expressions

  comm.notcomm <- transform(
    comm.notcomm,
    token=ifelse(
      token %in% c(
        tk.lst$exps, tk.lst$non.exps, tk.lst$non.exps.extra, tk.lst$ops
      ),
      "expr", token
  ) )
  # what comments are on same line as something else

  comm.comm <- subset(comment.dat, token=="COMMENT")
  comm.expr <- subset(comm.notcomm, token=="expr")

  # identify whether a token is the first or last on it's line.  Values mean
  # - 3L is only item on line (we think)
  # - 2L is last item on line (we think)

  comm.expr$first.last.on.line <- with(
    comm.expr,
    ave(
      col1, line1,
      FUN=function(x)
        if(length(x) == 1L) 3L
        else ifelse(x == max(x), 2L, ifelse(x == min(x), 1L, 0L))
  ) )
  # For each comment on a line that also has an expression, find the expression
  # that is also on that line

  comm.comm$assign.to.prev <- with(
    comm.expr, line2[match(comm.comm$line1, line2)]
  )
  comm.comm$match <- with(comm.expr,  {
    last.or.only <- first.last.on.line %in% 2L:3L
    id[last.or.only][match(comm.comm$assign.to.prev, line1[last.or.only])]
  } )
  # For each comment on its own line, find the expression that follows it

  first.or.only <- comm.expr$first.last.on.line %in% c(1L, 3L)
  comm.comm$assign.to.next <- vapply(
    comm.comm$line1,
    function(x)
      if(any(idx <- (comm.expr$line1 > x))) min(comm.expr$line1[idx])
      else NA_integer_,
    integer(1L)
  )
  comm.comm$match <- ifelse(
    is.na(comm.comm$match),
    comm.expr$id[first.or.only][
      match(comm.comm$assign.to.next, comm.expr$line1[first.or.only])
    ],
    comm.comm$match
  )
  # Assign comments to matching expression in attributes

  for(i in seq_along(comm.comm$match)) {
    if(is.na(comm.comm$match[[i]])) next
    expr.pos <- which(comm.notcomm$id == comm.comm$match[[i]])
    if(!identical(length(expr.pos), 1L))
      stop("Advanced Parse Error.")  # nocov
    if(!is.null(expr[[expr.pos]])) {
      # names are registered in global pool, so you can only attach attributes
      # to as single unique in memory instance, irrespective of where or how
      # many times a name occurs in an expression.  Because of this, we must
      # turn names that we want to attach comments to into simple language by
      # adding parens.  Note this changes structure of expression but hopefully
      # doesn't mess anything up later on...
      #
      # Also, constants, NULL, etc.

      if(length(expr[[expr.pos]]) < 2) {
        expr[[expr.pos]] <- call("(", expr[[expr.pos]])
        attr(expr[[expr.pos]], "unitizer_parse_symb") <- TRUE
      }
      attr(expr[[expr.pos]], "comment") <-
        c(attr(expr[[expr.pos]], "comment"), comm.comm$text[[i]])
    }
  }
  expr
}
# Need this to pass R CMD check; problems likely caused by `transform` and
# `subset`.

if(getRversion() >= "2.15.1")  utils::globalVariables(c("id", "parent", "token", "line2"))

# Recursively Descends Through a Parsed Expression and Assigns Comments
#
# In order to implement this we had to make several assumptions about the
# behaviour of \code{`\link{getParseData}`}.  In particular:
# \itemize{
#   \item Top level comments show up with negative ids, but are top level
#     for all intents and purposes
#   \item All content tokens (i.e. anything other than brackets, commas,
#     etc.) are contained inside an \code{`expr`}, even if the only thing the
#     `expr` contains is a simple constant (note some exceptions exist to
#     this (search for FUCK in the source).
#   \item Comments are not content tokens and can exist on the top level
#     without being wrapped in an \code{`expr`}
#   \item The only tokens that count as elements in an expression are
#     opening brackets and \code{`expr`}; this assumption is necessary
#     to allow mapping the parsed data back to the expression.  What
#     confuses the issue a bit is that operators show up at the top level,
#     but you can actually
#     ignore them.  Also, parentheses should only be kept if they are the
#     topmost item, as otherwise they are part of a function call and
#     should be ignored.
#   \item Comments inside function formals are not assigned to the formals
#     proper
#   \item `exprlist` tokens are removed completely b/c as far as we can
#     tell they are not part of the parsed object (but exist in parse
#     data).
#   \item known issue: comments in formals after a line break are assigned
#     to the body of the function as opposed to \code{`function`}, but this
#     should not be apparent in common use.
#   \item you cannot attach comments to \code{`NULL`}, if you must use
#     \code{`(NULL)`}.  This is a feature, as it proivdes a way to put
#     comments in the file without them showing up during \code{`unitizer`}
#     use.
# }
# Note that as a result of this trial and error interpretation of
# \code{`\link{getParseData}`} it is likely that comment parsing is
# not 100 percent robust.
#
# Due to some reference weirdness going on when dealing directly with
# expressions had to change this function to accept text/file rather
# than an expression as an input (but even that didn't fix it!!!)
#
# @keywords internal
# @aliases parse_tests
# @seealso comments_assign, getParseData, parse
# @param file containing code to parse with comments
# @param text optional, text to parse if \code{`file`} is not specified
# @param comment logical(1L) whether to try to get comments
# @return an expression with comments retrieved from the parse attached
#   to the appropriate sub-expressions/calls as a \dQuote{comment} \code{`\link{attr}`}

parse_with_comments <- function(file, text=NULL) {
  # Looping to deal with issue #41

  res <- parse_dat_get(file, text)
  parse.dat.raw <- res$dat
  expr <- res$expr
  if(!length(expr)) return(expr)

  # Now proceed with actual parsing

  # hack to deal with issues with expressions retaining previous assigned
  # comments (need to examine this further)

  expr <- comm_reset(expr)

  # set negative ids to be top level parents
  parse.dat.raw.1 <- transform(
    parse.dat.raw, parent=ifelse(parent < 0, 0L, parent)
  )
  ancestry <- with(parse.dat.raw.1, ancestry_descend(id, parent, 0L))
  parse.dat <- prsdat_fix_exprlist(parse.dat.raw.1, ancestry)

  if(is.null(parse.dat))
    stop("Argument `expr` did not contain any parse data")
  if(!is.data.frame(parse.dat))
    stop("Argument `expr` produced parse data that is not a data frame")
  if(!nrow(parse.dat)) return(expr)
  if(!
    identical(
      names(parse.dat),
      c(
        "line1", "col1", "line2", "col2", "id", "parent", "token",  "terminal",
        "text"
  ) ) )
    stop("Argument `expr` produced parse data with unexpected column names")
  if(!identical(unname(vapply(parse.dat, class, "")), c("integer", "integer", "integer", "integer", "integer", "integer",  "character", "logical", "character")))
    stop("Argument `expr` produced data with unexpected column data types")
  if(!all(parse.dat$token %in% unlist(tk.lst))) {
    # nocov start
    # shouldn't happen, can't test
    stop(
      "Advanced Parse Error: unexpected tokens in parse data (",
      paste0(parse.dat$token[!parse.dat$token %in% unlist(tk.lst)]) ,
      ")."
    )
    # nocov end
  }

  prsdat_recurse <- function(expr, parse.dat, top.level) {
    if(identical(parse.dat$token[[1L]], "FUNCTION"))
      parse.dat <- prsdat_fix_fun(parse.dat)
    if(identical(parse.dat$token[[1L]], "FOR"))
      parse.dat <- prsdat_fix_for(parse.dat)
    if(identical(parse.dat$token[[1L]], "IF"))
      parse.dat <- prsdat_fix_if(parse.dat)
    if(identical(parse.dat$token[[1L]], "WHILE"))
      parse.dat <- prsdat_fix_while(parse.dat)

    par.ids <- with(parse.dat, top_level_parse_parents(id, parent, top.level))
    parse.dat.split <- split(parse.dat, par.ids)
    prsdat.par <- parse.dat.split[[as.character(top.level)]]
    prsdat.children <-
      parse.dat.split[names(parse.dat.split) != as.character(top.level)]

    # Check that the parse data doesn't break the assumptions we've made,
    # particularly, that for any child section, there are no overlapping
    # sections at the top level

    line.dat <- vapply(
      prsdat.children,
      function(x) with(x, c(max=max(line2), min=min(line1))), c(max=0L, min=0L)
    )
    col.dat <- vapply(
      seq_along(prsdat.children),
      function(i)
        with(
          prsdat.children[[i]],
          {
            c(
              max=max(col2[which(line2 == line.dat["max", i])]),
              min=min(col1[which(line1 == line.dat["min", i])])
        ) } ),
      c(max=0L, min=0L)
    )
    if(
      any(head(line.dat["max", ], -1L) > tail(line.dat["min", ], -1L)) ||
      any(
        head(line.dat["max", ], -1L) == tail(line.dat["min", ], -1L) &
        head(col.dat["max", ], -1L) >= tail(col.dat["min", ], -1L)
    ) ) {
      # nocov start
      # shouldn't happen, can't test

      stop("Advanced Parse Error: expression parse data overlapping.")
      # nocov end
    }
    # For each parent expression, assign comments; parent expressions that
    # include a function definition have to exclude the formals part (which is a
    # pairlist) because the `getParseData` output does not produce a parent
    # element for the formals; in practice this shouldn't have any impact
    # because test items will never be at such a low level (i.e. any comments at
    # this level would never be shown anyway).

    assignable.elems <- vapply(
      expr,
      function(x)
        !identical(typeof(x), "pairlist") && !any("srcref" == class(x)),
      logical(1L)
    )
    if(!is.call(expr) && !is.expression(expr) && !is.null(expr)) {
      if(!length(assignable.elems) %in% c(1L)) {
        stop(  # nocov start
          "Advanced Parse Error: expression is terminal token yet multiple ",
          "assignable elems."
        )      # nocov end
      }
      if(isTRUE(assignable.elems)) expr <- comments_assign(expr, prsdat.par)
    } else if (!is.null(expr)) {
      expr[assignable.elems] <-
        comments_assign(expr[assignable.elems], prsdat.par)
    }
    # Now do the same for the child expression by recursively calling this
    # function until there are no children left, but need to be careful here
    # because we only need to call this for non-terminal leaves of the parse
    # tree.  Simply removing non terminal leaves from call should leave
    # everything in correct order because the only time there are order
    # mismatches are with infix operators and those are terminal leaves anyway.

    if(
      !any(
        vapply(
          prsdat.children,
          function(child) with(child, "COMMENT" %in% token),
          logical(1L)
    ) ) ) return(expr)

    # stuff that corresponds to elements in `expr`, will re-order to match `expr`
    prsdat.par.red <- prsdat_reduce(prsdat.par)
    if(!identical(nrow(prsdat.par.red), length(which(assignable.elems)))) {
      # nocov start
      stop(
        "Advanced Parse Error: mismatch between expression and parse data."
      )
      # nocov end
    }
    j <- 1
    if(!is.expression(expr) && !is.call(expr)) {
      # nocov start
      # shouldn't happen, can't test

      if(term.len <- length(which(!prsdat.par.red$terminal)) > 1L) {
        stop(
          "Advanced Parse Error: terminal expression has more than one token."
        )
      } else if (term.len) {
        expr <- Recall(
          expr, prsdat.children[[j]], as.integer(names(prsdat.children)[[j]])
        )
      }
      # nocov end
    } else {
      for(i in 1:nrow(prsdat.par.red)) {
        if(prsdat.par.red$terminal[[i]]) next
        new.val <- Recall(
          expr[assignable.elems][[i]], prsdat.children[[j]],
          as.integer(names(prsdat.children)[[j]])
        )
        if(!is.null(new.val)) expr[assignable.elems][[i]] <- new.val
        j <- j + 1
    } }
    expr
  }
  prsdat_recurse(expr, parse.dat, top.level=0L)
}
# Handle the issues with needing to run parse twice due to weird getParseData
# output

parse_dat_get <- function(file, text) {
  parse.dat.raw <- NULL
  for(i in 1:2) {
    if(!is.null(text)) {
      if(!missing(file))
        # nocov start
        stop("Internal Error: cannot specify both `file` and `text` arguments.")
        # nocov end
      expr <- try(parse(text=text, keep.source=TRUE))
    } else {
      expr <- try(parse(file, keep.source=TRUE))
    }
    if(inherits(expr, "try-error")) stop("parsing failed")
    if(!length(expr)) break
    parse.dat.raw <- getParseData(expr)
    if(is.null(parse.dat.raw)) break

    if(!nrow(parse.dat.raw))
      stop("Advanced Parse Error: parse data mismatch.") # nocov
    parse.dat.check <- cbind(
      parse.dat.raw[
        match(parse.dat.raw$parent, parse.dat.raw$id), c("line1", "col1")
      ],
      setNames(
        parse.dat.raw[, c("line1", "col1")], c("line1.child", "col1.child")
      )
    )
    if(
      length(
        with(parse.dat.check,
          which(
            line1.child < line1 |
            (line1.child == line1) & col1.child < col1
      ) ) )
    ) {
      # Parsing is not self consistent; some child items have for parents items
      # that are lexically posterior
      if(identical(i, 1L))  # Try again once to see if that fixes it
        next
      # nocov start
      stop("Advanced Parse Error: cannot retrieve self consistent parse data")
      # nocov end
    }
    break  # Parsing worked as expected
  }
  list(expr=expr, dat=parse.dat.raw)
}

parse_tests <- function(file, comments=TRUE, text=NULL) {

  if(!isTRUE(comments) && !identical(comments, FALSE))
    stop("Argument `comments` must be TRUE or FALSE")
  if(!is.null(text) && !missing(file))
    stop("If Argument `text` is specified, argument `file` must be missing")

  parsed <- NULL
  if(comments) {
    parsed <- tryCatch(
      parse_with_comments(file, text),
      error=function(e) {
        if(identical(conditionMessage(e), "parsing failed"))
          stop("Unable to parse test file; see previous messages")
        warning(
          "Unable to recover comments in advanced parse because:\n\n",
          paste0(
            "    ",
            strwrap(conditionMessage(e), getOption('width') - 10),
            "\n"
          ),
          "\nFalling back to simple parse.",
          immediate.=TRUE, call.=FALSE
        )
        NULL
  } ) }
  # Either no comment mode, or couldn't extract in comment mode

  if(is.null(parsed)) {
    if(is.null(text)) {
      parse(file, keep.source=FALSE)
    } else parse(text=text, keep.source=FALSE)
  } else parsed
}
# Need this to pass R CMD check; problems likely caused by `transform` and
# `subset`.

if(getRversion() >= "2.15.1")  utils::globalVariables(c("token"))

# Reduce Parsed Data to Just the Things That should Exist In Expression
#
# additionally, special handling due to function and formals not getting wrapped
# in their own `expr` (why the FUCK!!!!)
#
# @aliases prsdat_remove_fun
# @param parse.dat top level parse data
# @return parse data reduced to key elements, ordered so that infix operators
#   show up first instead of in middle

prsdat_reduce <- function(parse.dat) {
  parse.dat.red <- subset(
    parse.dat,
    !token %in% c(tk.lst$brac.close, tk.lst$unassign, tk.lst$seps, "COMMENT") &
    !(token == "'('" & 1L:length(token) == 2L)
  )
  # at this point, must be all expressions, an opening bracket, or an operator of some
  # sort, and iff the operator is @ or $, or if there is only one item in the data frame
  # then it can be NUM_CONST or STR_CONST or symbol for the second one
  if(any(c("'$'", "'@'") %in% parse.dat.red$token)) {
    if(!identical(nrow(parse.dat.red), 3L)) {
      # nocov start
      stop(
        "Advanced Parse Error: top level statement with `@` or `$` must be ",
        "three elements long"
      )
      # nocov end
    }
    if(!identical(parse.dat.red$token[[1L]], "expr")) {
      # nocov start
      stop(
        "Advanced Parse Error: left argument to `@` or `$` must be an  ",
        " expression"
      )
      # nocov end
    }
    if(
      identical(parse.dat.red$token, "'@'") &&
      !identical(parse.dat.red$token[[3L]], "SLOT")
    ) {
      # nocov start
      stop("Advanced Parse Error: right argument to `@` must be SLOT")
      # nocov end
    }
    if(
      identical(parse.dat.red$token, "'$'") &&
      !identical(parse.dat.red$token[[3L]], "SYMBOL")
    ) {
      # nocov start
      stop("Advanced Parse Error: right argument to `$` must be SYMBOL")
      # nocov end
    }
  } else if (nrow(parse.dat.red) == 1L) {
    if(
      !parse.dat.red$token[[1L]] %in%
      c("expr", tk.lst$non.exps, tk.lst$non.exps.extra, tk.lst$brac.open)
    ) {
      # nocov start
      stop(
        "Advanced Parse Error: single element parent levels must be symbol or ",
        "constant or expr"
      )
      # nocov end
    }
  } else if (
    length(
      which(
        parse.dat.red$token %in%
        c(tk.lst$exps, tk.lst$non.exps, tk.lst$non.exps.extra)
    ) ) < nrow(parse.dat.red) - 1L
  ) {
    # nocov start
    stop(
      "Advanced Parse Error: in most cases all but at most one token must be ",
      "type `expr` or `exprlist`."
    )
    # nocov end
  }
  parse.dat.red[
    order(
      parse.dat.red$token %in%
      c(tk.lst$exps, tk.lst$non.exps, tk.lst$non.exps.extra)
  ), ]
}
# Need this to pass R CMD check; problems likely caused by `transform` and
# `subset`.

if(getRversion() >= "2.15.1")  utils::globalVariables(c("id", "token"))

# Functions to Adjust Parse Data To Match Expression
#
# \itemize{
#   \item \code{`prsdat_fix_fun`} extract all comments from formals and brings them
#     up a level, and then removes formals
#   \item \code{`prsdat_fix_for`} brings contents of `forcond` to same level as
#     `for` to match up with expression
#   \item \code{`prsdat_fix_for`} extracts expression from the condition (though
#     apparently not from `ifcond`)
#   \item \code{`prsdat_fix_exprlist`} excises the \code{`exprlist`} portions of
#     \code{`exprlist`} as those don't exist in the expressions proper; they
#     don't do anything, and have extraneous semi colons.  We need to remove
#     them, and then make sure all their children become children of the
#     parent of the exprlist
# parent
#   \item \code{`prsdat_find_paren`} returns locations of first set
#     of open and close parens
# }
# @aliases prsdat_fix_for, prsdat_find_paren, prsdat_fix_exprlist
# @param parse.dat a data frame of the type produced by \code{`\link{getParseData}`}
# @return \itemize{
#   \item for \code{`parsdat_fix*`}, a data frame of the type produced by \code{`\link{getParseData}`}
#   \item for \code{`parsdat_find_paren`}, a length two integer vector with the ids of the parens
# }

prsdat_fix_fun <- function(parse.dat) {
  if(!identical(parse.dat$token[[1L]], "FUNCTION"))
    stop("Argument `parse.dat` must start with a 'FUNCTION' token.")
  subset(
    parse.dat,
    1L:nrow(parse.dat) > which(id == prsdat_find_paren(parse.dat)[[2]]) | token == "COMMENT" | 1L:nrow(parse.dat) == 1L
  )
}
# Need this to pass R CMD check; problems likely caused by `transform` and
# `subset`.

if(getRversion() >= "2.15.1")  utils::globalVariables(c("id", "parent", "token"))

prsdat_fix_for <- function(parse.dat) {
  if(!identical(parse.dat$token[[1L]], "FOR"))
    stop("Argument `parse.dat` must start with a 'FOR' token.")
  if(!identical(parse.dat$token[parse.dat$token != "COMMENT"][[2]], "forcond"))
    stop("Argument `parse.dat` does not have token `forcond` in expected location")
  if(!identical(length(which(parse.dat$token == "forcond")), 1L))
    stop("Argument `parse.dat` should have exactly one `forcond` token")
  par.range <- prsdat_find_paren(parse.dat)
  par.level <- subset(parse.dat, id == par.range[[1]])$parent
  tokens <- tail(head(subset(parse.dat, parent==par.level)$token, -1L), -1L)
  tokens.no.comm <- tokens[tokens != "COMMENT"]
  if(!identical(length(tokens.no.comm), 3L))
    stop("Logic error: `forcond` should have three elements")
  if(!identical(which(tokens.no.comm == "IN"), 2L))
    stop("Logic error: `forcond` should have exactly one 'IN' in position 2L")
  parse.dat.mod <- subset(parse.dat, !token %in% c("forcond", "IN") & ! id %in% par.range)
  `[<-`(parse.dat.mod, parse.dat.mod$parent == par.level, "parent", parse.dat[1L, "parent"])
}
# Need this to pass R CMD check; problems likely caused by `transform` and
# `subset`.

if(getRversion() >= "2.15.1")  utils::globalVariables(c("id", "parent", "token"))

prsdat_fix_simple <- function(parse.dat, tok) {
  if(! tok %in% c("IF", "WHILE"))
    # nocov start
    stop(
      "Advanced Parse Error: this function only supports 'IF' and 'WHILE' ",
      "tokens"
    )
    # nocov end
  if(!identical(parse.dat$token[[1L]], tok))
    stop("Argument `parse.dat` must start with an '", tok, "' token.")
  par.id <- parse.dat$parent[[1L]]
  par.range <- prsdat_find_paren(parse.dat)
  early.tokens <- parse.dat$token[1L:(which(parse.dat$id == par.range[[1L]]) - 1L)]
  if(
    any(! early.tokens %in% c(tok, "COMMENT")) ||
    !identical(length(which(early.tokens == tok)), 1L)
  )
    # nocov start
    stop(
      "Advanced Parse Error: could not parse ", tok, " statement."
    )
    # nocov end
  parse.delete <-
    subset(parse.dat, parent == par.id & token %in% c("'('", "')'", "ELSE"))
  if(!nrow(parse.delete) %in% c(2L, 3L))
    # nocov start
    stop(
      "Advanced Parse Error: unexpected number of ", tok,
      " statement sub-components; contact maintainer."
    )
    # nocov end
  if(any(parse.dat$parent %in% parse.delete$id))
    # nocov start
    stop(
      "Advanced Parse Error: unexpected parent relationships in ", tok,
      " statement."
    )
    # nocov end
  subset(parse.dat, ! id %in% parse.delete$id)
}
prsdat_fix_if <- function(parse.dat) prsdat_fix_simple(parse.dat, "IF")
prsdat_fix_while <- function(parse.dat) prsdat_fix_simple(parse.dat, "WHILE")

prsdat_find_paren <- function(parse.dat) {
  par.clos.pos <- match("')'", parse.dat$token)
  if(is.na(par.clos.pos))
    # nocov start
    stop(
      "Advanced Parse Error: failed attempting to parse function block."
    )
    # nocov end
  par.op.pos <- match("'('", parse.dat$token[1:par.clos.pos])
  if(is.na(par.op.pos))
  if(
    !identical(par.op.pos, 2L) &&
    !identical(unique(parse.dat$token[2L:(par.op.pos - 1L)]), "COMMENT")
  )
    # nocov start
    stop(
      "Advanced Parse Error: failed attempting to `for` function block."
    )
    # nocov end
  c(open=parse.dat$id[[par.op.pos]], close=parse.dat$id[[par.clos.pos]])
}
prsdat_fix_exprlist <- function(parse.dat, ancestry) {

  z <- ancestry
  z[, "level"] <- z[match(parse.dat$id, z[, "children"]), "level"]
  # order by level to make sure we remove exprlists in correct order
  lev.ord <- order(z[, "level"])
  dat.ord <- parse.dat[lev.ord, ]
  ind.all <- seq.int(nrow(dat.ord))
  # map parents vs. position in ordered list
  par.map <- list2env(split(ind.all, dat.ord[["parent"]]))

  dat.exprlist <- which(dat.ord[["token"]] == "exprlist")
  ind.exp <- seq_along(dat.exprlist)
  ind.exclude <- logical(length(ind.all))

  if(length(dat.exprlist)) {
    dat.ord <- within(
      dat.ord,
      {
        for(exprlist.ind in dat.exprlist) {

          # Find first `exprlist`

          exprlist.par <- parent[[exprlist.ind]]

          # Promote all children of exprlist and remove semi-colons and actual
          # exprlist.  This requires updating the value of the parent column in
          # `dat.ord`, and then re-assigning the parent ship relationship

          exprlist.par.chr <- as.character(exprlist.par)
          exprlist.id.chr <- as.character(id[[exprlist.ind]])
          exprlist.children <- par.map[[exprlist.id.chr]]

          # semi colons with exprlist as parent need to be discarded

          semicol.ind <- exprlist.children[which(token[exprlist.children] == "';'")]

          # change exprlist children parent to exprlist parent

          parent[exprlist.children] <- exprlist.par

          # Update mapping to reflect new parentship

          par.map[[exprlist.par.chr]] <<- c(
            par.map[[exprlist.par.chr]],
            par.map[[exprlist.id.chr]]
          )
          par.map[[exprlist.id.chr]] <<- NULL

          # extend exclusion list

          ind.exclude[c(exprlist.ind, semicol.ind)] <<- TRUE
        }
        rm(
          exprlist.par, exprlist.par.chr, exprlist.id.chr, exprlist.children,
          exprlist.ind, semicol.ind
  ) } ) }
  # Now actually remove the exprlist and semi colons, and re-order

  parse.dat.mod <-
    dat.ord[order(lev.ord), ][which(!ind.exclude[order(lev.ord)]), ]
  if(!all(parse.dat.mod$parent %in% c(0, parse.dat.mod$id)))
    stop("Advanced Parse Error: `exprlist` excision did not work!")  # nocov
  parse.dat.mod
}
# Removes Symbol Marker Used To Hold Comments

symb_mark_rem <- function(x) {
  if(isTRUE(attr(x, "unitizer_parse_symb"))) {
    if(length(x) != 2L || x[[1L]] != as.name("(") || length(x[[2L]]) != 1L) {
      stop(  # nocov start
        "Advanced Parse Error: Unexpected structure for object with language ",
        "with 'unitizer_parse_symb' attribute attached."
    ) }      # nocov end
    x <- x[[2L]]
  }
  x
}
# Utility Function to Extract Comments From Expression
#
# Note that when dealing with expressions the very first item will typically
# be NULL to allow for logic that works with nested structures.
#
# \code{comm_and_call_extract} also pulls out a cleaned up version of the call
# along with the comments, but the comments come out in a vector instead of a
# list showing the structure where the comments were pulled from.
#
# Used mostly for testing purposes.

comm_extract <- function(x) {

  if(missing(x)) return(list(NULL))
  comm <- attr(x, "comment")
  x <- symb_mark_rem(x)             # get rid of comment container
  if(missing(x)) return(list(NULL)) # need to do this twice because missing args that are parsed aren't necessarily recognized as missing immediately

  if(is.expression(x) || length(x) > 1L) {
    return(c(list(comm), lapply(x, comm_extract)))
  } else {
    return(list(comm))
} }

comm_and_call_extract <- function(x) {

  comments <- character()

  rec <- function(call) {
    if(missing(call)) return(call)
    comm <- attr(call, "comment")
    if(!is.null(comm)) {
      comments <<- c(comments, comm)
      attr(call, "comment") <- NULL
    }
    call.clean <- symb_mark_rem(call)             # get rid of comment container
    if(missing(call.clean)) return(call.clean)    # need to do this twice because missing args that are parsed aren't necessarily recognized as missing immediately

    if(is.expression(call.clean) || length(call.clean) > 1L) {
      for(i in seq_along(call.clean)) {
        call.sub <- call.clean[[i]]
        if(!missing(call.sub) && !is.null(call.sub)) call.clean[[i]] <- rec(call.clean[[i]])
    } }
    call.clean
  }
  list(call=rec(x), comments=comments)
}
# Utility Function to Reset Comments
#
# Required due to bizarre behavior (bug?) where some expression attributes
# appear to have reference like behavior even when they are re-generated
# from scratch from a text expression (wtf, really).

comm_reset <- function(x) {
  if(is.null(x) || is.name(x) && !nchar(x)) return(x)
  attr(x, "comment") <- NULL
  if(is.pairlist(x)) return(x)
  if(length(x) > 1L || is.expression(x))
    for(i in seq_along(x)) if(!is.null(x[[i]])) x[[i]] <- Recall(x[[i]])
  x
}
# Listing on known tokens
#
# As of this writing, the following tokens from \file{src/main/gram.c} are
# not handled:
#
#      [,1]           [,2]             [,3]                [,4]
# [1,] "'\\n'"        "cr"             "ifcond"            "sub"
# [2,] "'%'"          "END_OF_INPUT"   "INCOMPLETE_STRING" "sublist"
# [3,] "$accept"      "equal_assign"
# [4,] "$end"         "error"          "LINE_DIRECTIVE"    "TILDE"
# [5,] "$undefined"   "ERROR"          "LOW"               "UMINUS"
# [6,] "COLON_ASSIGN" "expr_or_assign" "NOT"               "UNOT"
# [7,] "cond"         "formlist"       "prog"              "UPLUS"
#
# So far, we have not been able to produce \code{`getParseData`} data frames
# that contain them.  It may not be possible to do so for all of them.  For
# example, \code{`INCOMPLETE_STRING`} shows up during a parse error, so could
# never be part of a fully parsed expression.
#
# Updates 11/2020: We've now seen "expr_or_assign_or_help", and according to
# Kalibera "equal_assign" is now in use, although the example he gave `a = 1`
# produces "expr_or_assign_or_help".
#
# Updates 11/2020: Turns out these are generated in 3.6.3, but apparently not in
# 4.0+.

tk.lst <- list(
  comment="COMMENT",
  brac.close=c("'}'", "']'", "')'"),
  brac.open=c("'{'", "'['", "'('", "LBB"),
  exps=c("expr", "exprlist", "equal_assign", "expr_or_assign_or_help"),
  # no comments on these as they are just removed
  seps=c("','", "';'"),
  # in addition to `expr`, these are the ones that can get comments attached
  non.exps=c(
    "SYMBOL", "STR_CONST", "NUM_CONST", "NULL_CONST",
    "SLOT", "NEXT", "BREAK", "SYMBOL_FUNCTION_CALL"
  ),
  # these can also get comments attached, but shouldn't be at the end of a
  # parse data block
  non.exps.extra=c(
    "FUNCTION", "FOR",
    # not 100% sure SYMBOL_PACKAGE belongs here; it can't possibly have
    # comments right after it on the same line
    "IF", "REPEAT", "WHILE", "SYMBOL_PACKAGE"
  ),
  ops=c(
    paste0(
      "'",
      c("-", "+", "!", "~", "?", ":", "*", "/", "^", "$", "@"),
      "'"
    ),
    "SPECIAL", "GT", "GE", "LT", "LE", "EQ", "NE", "AND", "AND2",
    "OR", "OR2", "LEFT_ASSIGN", "RIGHT_ASSIGN", "EQ_ASSIGN"
  ),
  # note these should never show up at top level
  ops.other=c("NS_GET", "NS_GET_INT"),
  # these cannot have comments attached to them
  unassign=c(
    "EQ_SUB", "SYMBOL_SUB", "EQ_FORMALS", "SYMBOL_FORMALS",
    "IN", "forcond", "ELSE"
  )
)

Try the unitizer package in your browser

Any scripts or data that you put into this service are public.

unitizer documentation built on Oct. 8, 2023, 5:06 p.m.