RDataTracker: Provenance Collector for R

# Copyright (C) President and Fellows of Harvard College and 
# Trustees of Mount Holyoke College, 2014, 2015, 2016, 2017.

# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public
#   License along with this program.  If not, see
#   <http://www.gnu.org/licenses/>.

########################## DDGStatement.R ############################

# This file contains definitions of S4 classes to manage information about
# individual R statements and functions that operate on individual statements
#
# All of these functions are internal to the RDataTracker / provR library and
# not called from user code.

# Needed to work with S4 classes.  Normally, this library is automatically
# loaded.  However, it is not loaded when running non-interactively, as
# in our test cases or if a user uses RScript to run R files.


#' .ddg.init.statements initialize the data used to manage the statements
#' @return nothing
#' @noRd

.ddg.init.statements <- function() {
  .ddg.set("ddg.statement.num", 0)
  .ddg.set("ddg.statements", list())
}

#' .ddg.statement.num returns the number of DDG Statements created
#' @return the number of DDG statements created
#' @noRd

.ddg.statement.num <- function() {
  return(.ddg.get("ddg.statement.num"))
}

#' .ddg.statements returns a list of DDG Statements created
#' @return the list of DDG statements created
#' @noRd

.ddg.statements <- function() {
  return(.ddg.get("ddg.statements"))
}

#' .ddg.statement returns the ith DDG Statement
#' @param i the index of the statement to return
#' @return the ith DDGStatement
#' @noRd

.ddg.statement <- function(i) {
  ddg.statements <- .ddg.statements()
  return(ddg.statements[[i]])
}

#' .ddg.add.ddgstatement adds a DDGStatement to the end of the list
#' @param stmt a DDGStatement object 
#' @return nothing
#' @noRd

.ddg.add.ddgstatement <- function(stmt) {
  ddg.statements <- c(.ddg.statements(), stmt)
  .ddg.set("ddg.statements", ddg.statements)
}

# Information about where in the source code this statement appears.
methods::setClass("DDGStatementPos",
    slots = list(
        startLine = "numeric",
        startCol = "numeric",
        endLine = "numeric",
        endCol = "numeric")
)

# This is called automatically when there is a call to create a new
# DDGStatementPos object.
methods::setMethod ("initialize",
    "DDGStatementPos",
    function(.Object, parseData){
      # If the parse data is missing, we set all the fields to -1
      if (length(parseData) == 1 && is.na(parseData)) {
        .Object@startLine <- -1
        .Object@startCol <- -1
        .Object@endLine <- -1
        .Object@endCol<- -1
      }

      # If we have parseData, we extract the information into oure
      # object.
      else {
        # print (paste ("pos =", parseData$line1, parseData$col1, parseData$line2, parseData$col2))
        .Object@startLine <- parseData$line1
        .Object@startCol <- parseData$col1
        .Object@endLine <- parseData$line2
        .Object@endCol<- parseData$col2
      }
      #print(.Object)
      return (.Object)
    }
)

# This class contains all the information that we need when building a ddg.
# We create this when we parse the statement so that it is only done once
# and then look up the information we need when the statement executes.
methods::setClass("DDGStatement",
    slots = list(
        text = "character",     # The original text in the file
        parsed = "expression",  # The parse tree for the statement
        abbrev = "character",   # A shortened version of the text to use in node names
        annotated = "expression",  # An annotated version of the statement.  This is
                                   # what we actually execute.
        
        # Note that vars.used through has.dev.off do not apply to a situation where
        # the statement is a function declaration, since declaring the statement
        # does not read from files, etc.  That happens when the function is called,
        # at which point we will refer to the information in the contained statements.

        vars.used = "character", # A list of the variables that are used in the statement
        vars.set = "character",  # If this is an assignment statement, this is the 
                                 # variable assigned
        vars.possibly.set = "character",  # If this contains any internal
                                          # assignment statements, like an 
                                          # if-statement might, for example, these 
                                          # are the variables assigned within the 
                                          # statement.
        isDdgFunc = "logical",   # True if this is a call to a ddg function
        pos = "DDGStatementPos",  # The location of this statement in the source code.
                                  # Has the value null.pos() if it is not available.
        script.num = "numeric",   # The number for the script this statement comes from.
                                  # Has the value -1 if it is not available
        functions.called = "list" # A list of the statement's function calls and 
                                  # potential function calls.]
      )
)

# This is called when a new DDG Statement is created.  It initializes all of the slots.
methods::setMethod ("initialize",
  "DDGStatement",
    function(.Object, parsed, pos, script.num){
      .Object@parsed <- parsed

      # deparse can return a vector of strings.  We convert that into
      # one long string.
      .Object@text <- paste(deparse(.Object@parsed[[1]]), collapse="")
      if (.ddg.debug.lib()) print(paste ("Parsing", .Object@text))

      .Object@abbrev <-
          # If this is a call to .ddg.eval, we only want the argument to .ddg.eval
          # (which is a string) to appear in the node label
          if (grepl("^.ddg.eval", .Object@text)) {
            .ddg.abbrev.cmd(.Object@parsed[[1]][[2]])
          }
          else {
            .ddg.abbrev.cmd(.Object@text)
          }

      .Object@annotated <- parsed
      
      vars.used <- .ddg.find.var.uses(.Object@parsed[[1]])

      # Remove index variable in for statement (handled separately in .ddg.forloop).
      if (length(parsed) > 0 && !is.symbol(parsed[[1]]) && parsed[[1]][[1]] == "for") {
        index.var <- c(parsed[[1]][[2]])
        vars.used <- vars.used[! vars.used %in% index.var]
      }

      .Object@vars.used <- vars.used

      .Object@vars.set <- .ddg.find.simple.assign(.Object@parsed[[1]])

      .Object@vars.possibly.set <- .ddg.find.assign(.Object@parsed[[1]])
      
      # If this is a function declaration, record information
      # about non-locals used or set in the function.
      if (length (.Object@parsed[[1]]) >= 3 && 
          .ddg.is.assign (.Object@parsed[[1]]) &&
          .ddg.is.functiondecl (.Object@parsed[[1]][[3]])) {
        .ddg.save.func.decl.info(.Object@parsed[[1]][[2]], .Object@parsed[[1]][[3]])
      }
        
      # .ddg.eval is treated differently than other calls to ddg functions since
      # we will execute the parameter as a command and want a node for it.
      .Object@isDdgFunc <- (grepl("^ddg|^.ddg|^prov", .Object@text) 
        && !grepl("^.ddg.eval", .Object@text))

      .Object@pos <-
          if (is.object(pos)) {
            pos
          }
          else {
            .ddg.null.pos()
          }

      .Object@script.num <-
          if (is.na(script.num)) -1
          else script.num

      # find the list of the names of the function calls in the statement
    .Object@functions.called <- .ddg.find.calls( .Object@parsed[[1]] )

      return(.Object)
    }
)

#' .ddg.create.DDGStatements creates the DDGStatement list for a list of parsed expressions.
#' @param exprs - a list of parsed expressions
#' @param script.name - the name of the script the expressions come from
#' @param script.num - the number of the script the expressions came from
#' @param parseData - information provided by the parser that we use to find line numbers
#' @param enclosing.pos - if exprs are statements within a function definition, enclosing.pos
#' is the source position information of the entire function declaration
#' @return a list of DDGStatement objects
#' @noRd

.ddg.create.DDGStatements <- function (exprs, script.name, script.num, 
                                       parseData = NULL, enclosing.pos = NULL) {
  # The parse data gives us line number information
  if (is.null(parseData)) {
    parseData <- utils::getParseData(exprs, includeText=TRUE)
    
    if (is.null(parseData)) {
      # In this case there is no line number information available
      cmds <- vector("list", (length(exprs)))
      for (i in 1:length(exprs)) {
        expr <- as.expression(exprs[i])
        cmds[[i]] <- .ddg.construct.DDGStatement(expr, NA, script.name, 
                                                 script.num, parseData)
      }
      return(cmds)
    }
    
    non.comment.parse.data <- parseData[parseData$token != "COMMENT", ]
    if (nrow(non.comment.parse.data) == 0) {
      return(list())
    }
    
    # Start at the first non-comment expression in parseData
    next.parseData <- 1
  }
  
  else {
    non.comment.parse.data <- parseData[parseData$token != "COMMENT", ]
    
    # Start at the first entry in parse data that begins after the enclosing 
    # function begins, ends before the enclosing function ends, and matches the 
    # text of the first expression.
    next.parseData <- 
      which(non.comment.parse.data$line1 >= enclosing.pos@startLine & 
            non.comment.parse.data$line2 <= enclosing.pos@endLine & 
            non.comment.parse.data$text == paste(deparse(exprs[[1]]), collapse="\n") )[1]
  }
  
  # Create the DDGStatements
  cmds <- vector("list", (length(exprs)))
  next.cmd <- 1
  for (i in 1:length(exprs)) {
    expr <- as.expression(exprs[i][[1]])
    next.expr.pos <- methods::new (Class = "DDGStatementPos", 
                                   non.comment.parse.data[next.parseData, ])
    cmds[[next.cmd]] <- .ddg.construct.DDGStatement(expr, next.expr.pos, 
                                                    script.name, script.num, 
                                                    parseData)
    next.cmd <- next.cmd + 1
    
    # If there are more expressions, determine where to look next in the parseData
    if (i < length(exprs)) {
      last.ending.line <- non.comment.parse.data[next.parseData, ]$line2
      last.parent <- non.comment.parse.data[next.parseData, "parent"]
      last.id <- non.comment.parse.data[next.parseData, "id"]
      
      # Find the first entry in parseData that has the same parent as the
      # previous expression and starts after the previous expression.
      next.parseData <- which(non.comment.parse.data$parent == last.parent & 
                              non.comment.parse.data$line1 >= last.ending.line & 
                              non.comment.parse.data$id > last.id) [1]
    }
  }
  
  return (cmds)
}

#' .ddg.find.calls finds the function calls and potential function calls in an expression.
#' This function wraps the last two vectors in the returned value of 
#' .ddg.find.calls.rec into a data frame, keeping the other two vectors the same,
#' before returning the resulting list.
#' @param expr The parse tree for the statement
#' @return A list containing the following:
#' [1]: functions from unknown libraries (character vector)
#' [2]: variable names, which may refer to functions (character vector)
#' [3]: known function calls with their respective libraries (data frame)
#' @noRd

.ddg.find.calls <- function(expr) 
{
  # The returned list of .ddg.find.calls.rec(expr) contains:
  #   [1]: functions from unknown libraries
  #   [2]: variable names, which may refer to functions
  #   [3]: functions with known libraries
  #   [4]: libraries which the functions in [3] are from
  result <- .ddg.find.calls.rec(expr)

  # [3] and [4] of the returned value of .ddg.find.calls.rec(expr) are paired.
  # As this function wraps [3] and [4] into a data frame before returning the 
  # resulting list, if [3] is null (the statement does not contain `::` or `:::` 
  # operators), then we can just return the result from the recursive function 
  # without [4].
  if( is.null(result[[3]]) )
    return( result[-4] )
  
  # wraps [3] and [4] into a data frame, changing the column names to match
  # the column names for ddg.function.nodes to enable rbind.
  fn.known.lib <- data.frame(result[[3]], result[[4]], stringsAsFactors = FALSE)
  names(fn.known.lib) <- c("ddg.fun", "ddg.lib")
  
  return( list(result[[1]], result[[2]], unique(fn.known.lib)) )
}


#' .dgg.find.calls.rec is a recursive helper function for .ddg.find.calls.
#' This function WILL FAIL if the user overwrites `::` or `:::`
#' @param expr an expression
#' @return A named list of the following character vectors:
#' [1]: functions from unknown libraries
#' [2]: variable names, which may refer to functions
#' [3]: functions with known libraries
#' [4]: libraries which the functions in [3] are from
#' @noRd

.ddg.find.calls.rec <- function(expr)
{
  # base case: a name or a constant
  if( ! is.call(expr) || .ddg.is.functiondecl(expr) )
  {
    elem <- toString(expr)
    
    # parameter names could be "", as is the case in a[2, ]
    # the if branch places such function parameter names that are not ""
    # into the var.names list for checking if they are function names at runtime.
    if( is.name(expr) && ! identical(elem, "") )
      return( list(NULL, elem, NULL, NULL) )
    else
      return( list(NULL, NULL, NULL, NULL) )
  }
  
  # expr is a call && expr[[1]] is a call: recurse on all parts of expr
  if( is.call(expr[[1]]) )
  {
    recursion.result <- lapply(expr, .ddg.find.calls.rec)
    
    fn.unknown.lib <- unlist( mapply(`[`, recursion.result, 1) )
    var.names <- unlist( mapply(`[`, recursion.result, 2) )
    
    fn.known.lib <- unlist( mapply(`[`, recursion.result, 3) )
    libraries <- unlist( mapply(`[`, recursion.result, 4) )
  }
  else
  {
    elem1 <- toString(expr[[1]])

    # general case for `::` or `:::`
    # e.g. stringi::stri_join
    # The parse tree is:  `::`, stringi, stri_join
    if( identical(elem1, "::") || identical(elem1, ":::") )
{
      fn.unknown.lib <- elem1
      var.names <- NULL
      
      fn.known.lib <- toString(expr[[3]])
      libraries <- toString(expr[[2]])
    }
    else  # general case
  {
      # If expr is a call, expr[[1]] is not a call and not `::` or `:::`,
      # then expr[[1]] is a function name.
      #
      # This recurses on all parts of expr but the first element, then
      # appending expr[[1]] to the list of functions with unknown libraries
      # (fn.unknown.lib) after combining the result of the recursive calls.
      #
      # e.g. 
      # let expr be 'as.character(a)'
      # expr is a call, its parse tree is:  as.character, a
      # expr[[1]], as.character, is the function name (not a call)
      
      # edge case: function call with no parameters
      if( is.null(expr[-1]) )
        return( list(elem1, NULL, NULL, NULL) )
      
      recursion.result <- lapply(expr[-1], .ddg.find.calls.rec)
      
      fn.unknown.lib <- unlist( mapply(`[`, recursion.result, 1) )
      fn.unknown.lib <- append(elem1, fn.unknown.lib)
      
      var.names <- unlist( mapply(`[`, recursion.result, 2) )
    
      fn.known.lib <- unlist( mapply(`[`, recursion.result, 3) )
      libraries <- unlist( mapply(`[`, recursion.result, 4) )
  }
}

  fn.unknown.lib <- unique(fn.unknown.lib)
  var.names <- unique(var.names)
  
  return( list(fn.unknown.lib, var.names, fn.known.lib, libraries) )
}

# .ddg.null.pos provides a special null value for when source code position 
#' information is missing.
#' @return a special null value
#' @noRd

.ddg.null.pos <- function() {
  return (methods::new (Class = "DDGStatementPos", NA))
}


#' .ddg.abbrev.cmd abbreviates a command to the specified length.
#' Default is 60 characters.
#' @param cmd - command string.
#' @param len (optional) - number of characters.
#' @return abbreviated command
#' @noRd

.ddg.abbrev.cmd <- function(cmd, len=60) {
  if (length(cmd) > 1) {
    cmd <- paste (cmd, collapse = " ")
  }

  if (file.exists(cmd)) basename(cmd)
  else if (nchar(cmd) <= len) cmd
  else if (substr(cmd, len, len) != "\\") substr(cmd, 1, len)
  else if (substr(cmd, len-1, len) == "\\\\") substr(cmd, 1, len)
  else substr(cmd, 1, len-1)
}

#' .ddg.find.var.uses returns a vector containing all the variables
#' used in an expression.  Each value is unique in the returned
#' vector, so that if a variable is used more than once, it
#' only appears once.
#' @param main.object - input expression.
#' @return vector of variables used in the expression
#' @noRd

.ddg.find.var.uses <- function(main.object) {
  # Recursive helper function.
  .ddg.find.var.uses.rec <- function(obj) {

    # Base cases.
    if (is.atomic(obj)) {
      return(character())  # A name is not atomic!
    }

    if (is.name(obj)) {
      if (nchar(obj) == 0) return (character())

      # Operators also pass the is.name test.  Make sure that if it is a
      # single character, then it is alpha-numeric.
      if (nchar(obj) == 1 && !grepl("[[:alpha:]]", obj)) return (character())
      # print(paste(".ddg.find.var.uses found name", deparse(obj)))
      return (deparse(obj))
    }

    if (!is.recursive(obj)) return(character())

    if (.ddg.is.functiondecl(obj)) return(character())

    tryCatch(
      {
        if (.ddg.is.assign(obj)) {

          # If assigning to a simple variable, recurse on the right
          # hand side of the assignment.

          # covers cases: '=', '<-', '<<-' for simple variable assignments
          # e.g.  a <- 2
          if (is.symbol(obj[[2]])) {
            unique(unlist(.ddg.find.var.uses.rec(obj[[3]])))
          }

          # If assigning to an expression (like a[b]), recurse on the
          # indexing part of the lvalue as well as on the expression.
          # covers cases:
          # storage.mode(z)
          # a[1] <- 2, a[b] <- 3
          else if (is.call(obj[[2]])) {
            variables <- c( .ddg.find.var.uses.rec(obj[[2]][[2]]), 
                            unlist(.ddg.find.var.uses.rec(obj[[3]])) )

            # for array index cases like a[b] <- 3,
            # where there could be a variable in the brackets
            if( obj[[2]][[1]] == "[" || obj[[2]][[1]] == "[[" ) {
              variables <- c( variables, unlist (.ddg.find.var.uses.rec(obj[[2]][[3]]) ))
            }
            
            unique( variables )
          }

          # covers cases where there is a string literal.
          # for assign function
          else if (is.character(obj[[2]])) {
            unique( c(unlist(.ddg.find.var.uses.rec(parse(text = obj[[2]])[[1]])), 
                      unlist(.ddg.find.var.uses.rec(parse(text = obj[[3]])[[1]]))) )
          }

          # not entirely sure what this catches
          else {
            unique(c (.ddg.find.var.uses.rec(obj[[2]]), 
                      unlist(.ddg.find.var.uses.rec(obj[[3]]))))
          }
        }

        # Not an assignment.  Recurse on all parts of the expression
        # except the operator.
        else {
          unique(unlist(lapply(obj[2:length(obj)], .ddg.find.var.uses.rec)))
        }
      },
      error = function(e)
      {
        print (paste(".ddg.find.var.uses.rec:  Error analyzing", deparse(obj)))
        print (e)
        character()
      }
    )
  }

  return(.ddg.find.var.uses.rec(main.object))
}


#' .ddg.find.simple.assign returns the name of the variable assigned
#' to if the object passed in is an expression representing an
#' assignment statement.  Otherwise, it returns NULL.
#' @param obj - input expression.
#' @return name of variable assigned to
#' @noRd

.ddg.find.simple.assign <- function(obj)
{
  if (.ddg.is.assign(obj)) {
    .ddg.get.var(obj[[2]])
  }
  else {
    ""
  }
}


#' .ddg.is.assign returns TRUE if the object passed is an expression
#' object containing an assignment statement.
#' @param expr - a parsed expression.
#' @param globals.only If TRUE only return TRUE if the assignment
#'    is non-local.
#' @return True if an expression object containing an assignment
#' @noRd

.ddg.is.assign <- function (expr, globals.only = FALSE)
{
  if (is.call(expr))
  {
    # This also finds uses of ->.
    if (!globals.only && identical(expr[[1]], as.name("<-"))) {
      return (TRUE)
    }

    # This also finds uses of ->>.
    else if (identical(expr[[1]], as.name("<<-"))) {
      return (TRUE)
    }

    else if (!globals.only && identical(expr[[1]], as.name("="))) {
      return (TRUE)
    }

    else if (identical(expr[[1]], as.name("assign"))) {
      return (TRUE)
    }
  }

  return (FALSE)
}

#' .ddg.get.var returns the variable being referenced in an
#' expression. It should be passed an expression object that is
#' either a variable, a vector access (like a[1]), a list member
#' (like a[[i]]) or a data frame access (like a$foo[i]).  For all of
#' these examples, it would return "a".
#' @param lvalue - a parsed expression.
#' @return name of variable referenced in the expression
#' @noRd

.ddg.get.var <- function(lvalue)
{
  if (is.symbol(lvalue))
    deparse(lvalue)

  # for string literals
  # e.g. when the assign function is used
  else if ( is.character(lvalue) )
    .ddg.get.var( parse(text = lvalue)[[1]] )

  else
    .ddg.get.var(lvalue[[2]])
}


#' .ddg.find.assign returns a vector containing the names of all
#' the variables assigned in an expression.  The parameter should
#' be an expression object. For example, if obj represents the
#' expression "a <- (b <- 2) * 3", the vector returned will contain
#' both a and b.
#' @param obj - a parsed expression.
#' @param globals.only If TRUE, only look for assignments to non-locals.
#' @return a vector containing all variables assigned in the expression
#' @noRd

.ddg.find.assign <- function(obj, globals.only = FALSE) {
  # Base case.
  if (!is.recursive(obj)) return(character())

  # Assignment statement.  Add the variable being assigned to the
  # vector and recurse on the expression being assigned.
  if (.ddg.is.assign(obj, globals.only)) {
    var <- .ddg.get.var(obj[[2]])

    # Don't look for assignments in the body of a function as those
    # won't happen until the function is called.
    # Don't recurse on NULL.
    if (!(is.null(obj[[3]]))) {
      if (.ddg.is.functiondecl(obj[[3]])) var
      else c(var, unlist(.ddg.find.assign (obj[[3]], globals.only)))
    }
    else var
  }

  # Not an assignment statement.  Recurse on the parts of the
  # expression.
  else {
    unique(unlist(lapply(obj, .ddg.find.assign, globals.only)))
  }
}

#' ddg.is.functiondecl tests to see if an expression is a function
#' declaration.
#' @param expr - a parsed expression.
#' @return True if expression is a function declaration
#' @noRd

.ddg.is.functiondecl <- function(expr) {
  if (is.symbol(expr) || !is.language(expr)) return (FALSE)
  if (is.null(expr[[1]]) || !is.language(expr[[1]])) return (FALSE)
  return (expr[[1]] == "function")
}

#' .ddg.get.statement.type returns the control type (if applicable) of a
#' parsed statement.
#' @param parsed.command a parsed statement
#' @return the control type of the statement
#' @noRd

.ddg.get.statement.type <- function(parsed.command) {
  if (length(parsed.command) > 1) return(as.character(parsed.command[[1]]))
  return("")
}

#' .ddg.is.call.to returns TRUE if the parsed expression passed
#' in is a call to the specified function.
#' @param parsed.expr - a parse tree
#' @param func.name - the name of a function
#' @return True if the expression is a call to the specified function
#' @noRd

.ddg.is.call.to <- function(parsed.expr, func.name) {
  # Check if a function call.
  if (is.call(parsed.expr)) {
    # Check if the function called is the specified function.
    if (parsed.expr[[1]] == func.name) {
      return (TRUE)
    }
  }
  return (FALSE)
}

#' .ddg.has.call.to returns TRUE if the parsed expression passed
#' in contains a call to the specified function.
#' @param parsed.expr - a parse tree
#' @param func.name - the name of a function
#' @return True if the parsed expression contains a call to the specified function
#' @noRd

.ddg.has.call.to <- function(parsed.expr, func.name) {
  # Base case.
  if (!is.recursive(parsed.expr)) return(FALSE)

  # If this is a function declaration, skip it
  if (.ddg.is.functiondecl(parsed.expr)) return(FALSE)

  # A call to the specified function.
  if (.ddg.is.call.to(parsed.expr, func.name)) {
    return (TRUE)
  }
  # Not a call to the specified function.  Recurse on the parts of
  # the expression.
  else {
    return (any(sapply(parsed.expr, 
                       function(parsed.expr) {
                         return(.ddg.has.call.to(parsed.expr, func.name))
                       }
                       )))
  }
}