provParseR: Pulls Information from Prov.Json Files

Documented in get.args get.data.nodes get.data.proc get.environment get.error.nodes get.func.lib get.func.nodes get.func.proc get.libs get.preexisting get.proc.data get.proc.nodes get.proc.proc get.scripts get.stdout.nodes get.tool.info get.val.type prov.parse

# Copyright (C) President and Fellows of Harvard College and 
# Trustees of Mount Holyoke College, 2018, 2019, 2020.

# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public
#   License along with this program.  If not, see
#   <http://www.gnu.org/licenses/>.

# Created by Orenna Brand & Joe Wonsil, summer 2018
# Updated by Barbara Lerner, fall 2018

#' Collection of information gathered from parsing a PROV file
#' 
#' This is the class that stores provenance information.  It is created by 
#' prov.parse.  Rather than access the slots directly, it is better to use
#' the access functions the package provides.
#' 
#' @slot proc.nodes the procedure nodes
#' @slot data.nodes the data nodes
#' @slot func.nodes the function nodes
#' @slot proc.proc.edges control flow edges
#' @slot proc.data.edges output data edges
#' @slot data.proc.edges input data edges
#' @slot func.proc.edges function use edges
#' @slot func.lib.edges function library edges
#' @slot agents tool that created the provenance
#' @slot args arguments passed when provenance was created
#' @slot envi environmental information
#' @slot libs libraries
#' @slot scripts scripts executed 
#' 
#' @seealso The parse function, which creates the ProvInfo object, \code{\link{prov.parse}}
#' @seealso The access functions, including \code{\link{get.environment}}
#' @import methods
#' @exportClass ProvInfo
ProvInfo <- methods::setClass("ProvInfo",
    slots = list(
        proc.nodes = "data.frame", 
        data.nodes = "data.frame", 
        func.nodes = "data.frame", 
        proc.proc.edges = "data.frame", 
        proc.data.edges = "data.frame", 
        data.proc.edges = "data.frame", 
        func.proc.edges = "data.frame", 
        func.lib.edges = "data.frame", 
        agents = "data.frame", 
        args = "list",
        envi = "data.frame", 
        libs = "data.frame", 
        scripts = "data.frame")
)

# This is called when a new ProvInfo object is created.  It initializes all of the slots.
methods::setMethod ("initialize",
    "ProvInfo",
    function(.Object, prov){
      
      # Removing 'rdt:' prefix for legibility of data.
      if (length(prov) == 0) warning ("Provenance is empty")
      prov <- gsub("rdt:", "", prov)
      prov <- gsub("prov:", "", prov)
      
      # Converting to an R-useable data structure.
      prov.data <- jsonlite::fromJSON(prov)
      
      # Creating the master list by unlisting the nodes/edges.
      master.list <- unlist(prov.data, recursive = F)
      
      # This removes the appended prefixes created by unlisting.
      # This leaves the nodes with their original names.
      names(master.list) <- gsub("^.*\\.", "", names(master.list))
      
      
      # These nodes cannot be parsed with the generalized function.
      # Therefore they are done separately and appended later.
      .Object@agents <- parse.agents(master.list)
      .Object@args <- parse.args(master.list)
      .Object@envi <- parse.envi(master.list)
      .Object@libs <- parse.libs(master.list)
      .Object@scripts <- parse.scripts(master.list)
      
      # This list represents the characters codes for the different
      # possible objects.
      obj.chars <- c("p", "d", "f", "pp", "pd", "dp", "fp", "m")
      
      # Utilizes char codes to produce the list of data frames.
      prov.df <- lapply(obj.chars, parse.from.identifier, m.list = master.list)
      
      .Object@proc.nodes <- prov.df[[1]]
      .Object@data.nodes <- .add.snapshot.paths(prov.df[[2]], .Object@envi)
      .Object@func.nodes <- prov.df[[3]]
      .Object@proc.proc.edges <- prov.df[[4]]
      .Object@proc.data.edges <- prov.df[[5]]
      .Object@data.proc.edges <- prov.df[[6]]
      .Object@func.proc.edges <- prov.df[[7]]
      .Object@func.lib.edges <- prov.df[[8]]
      
      # Check the type of the elapsedTime column.
      # Convert to a column of doubles if it is a column of strings.
      if(length(.Object@proc.nodes) > 0) {
        elapsedTime <- .Object@proc.nodes["elapsedTime"][ , 1]
        
        if(typeof(elapsedTime) == "character") {
          elapsedTime <- parse.elapsed.time(elapsedTime)
          .Object@proc.nodes["elapsedTime"] <- elapsedTime
        }
      }
      
      # Complete
      return (.Object)
    }
)
      
#' Changes the value of snapshot nodes to include a full path to the snapshot.
#' @param data.nodes the data frame containing all data nodes
#' @param env the data frame containing environment information
#' @return the modified data nodes data frame
#' @noRd
.add.snapshot.paths <- function (data.nodes, env) {
  if (any(data.nodes$type %in% c("Snapshot", "StandardOutputSnapshot"))) {
    prov.dir <- env[env$label == "provDirectory", ]$value
    data.nodes[data.nodes$type %in% c("Snapshot", "StandardOutputSnapshot"), ]$value <- paste (prov.dir, data.nodes[data.nodes$type %in% c("Snapshot", "StandardOutputSnapshot"), ]$value, sep="/")
  }
  return (data.nodes)
}

# Generalized parser - parses the nodes with the requested node identifier
parse.from.identifier <- function(requested, m.list) {
  
	# Constructs pattern to match to using the grep function.
  grep.arg <- paste("^", requested, "[[:digit:]]", sep = "")
  
  # Using the pattern to pull out only the requested
  # nodes/edges from the master list.
  # This list had data stored in rows not columns
  nodes <- m.list[grep(grep.arg, names(m.list))]
  
  # Calls helper function to convert nodes to a data frame
  return(parse.general(nodes))
}

# Generalized parser - parses the given nodes into a data frame
parse.general <- function(nodes) {
  
  # case: nothing to parse
  if (length(nodes) == 0) return(data.frame())
  
  # The num of columns are stored in each row in the list
  # Pull out how many columns so we can index through each
  # row and receive the columns
  col.length <- 1:length(nodes[[1]])
  
  # Use index nums to pull out each column so they are
  # no longer stored as rows but columns
  col.list <- lapply(col.length, function(x) {
    return(mapply(`[`, nodes, x))
  })
  
  # To each column, replace the string "NA" with
  # an actual R value of NA, the extract the column 
  # as a vector to coerce into one type
  node.vec <- lapply(col.length, function(x) {
    col <- mapply(`[`, nodes, x)
    col[col=="NA"] <- NA
    return(mapply(`[`, col, 1))
  })
  
  # Convert the data frame, we do not have factors
  # in data so keep them as strings
  nodes.df <- data.frame(node.vec, stringsAsFactors = F)
  colnames(nodes.df) <- names(nodes[[1]])
  nodes.df <- cbind.data.frame(names(nodes), nodes.df, stringsAsFactors = F)
  names(nodes.df)[names(nodes.df) == "names(nodes)"] <- "id"
  rownames(nodes.df) <- NULL
  
  return(nodes.df)
}

# Agent parser
parse.agents <- function(m.list) {
	
	# Extract nodes from master list
  nodes <- m.list[grep("^a[[:digit:]]", names(m.list))]
  
  # Remove entries regarding arguments, if any
  nodes <- lapply(nodes, function(x) {
              
              indices <- grep("args", names(x))
                            
              if(identical(indices, integer(0)))
                return(x)
              else
                return(x[-indices])
           })
  
  # Parse into data frame and return
  return(parse.general(nodes))
}

# Parser for argumnents.
# Should I assume there to be only 1 set of this?
# The general parser and the parsing of the agent node doesn't assume that though
#
# Extracts the arguments in each agent into a named list, where:
# 1. Each element is the value of an argument, converted into its 
# corresponding type listed in the property args.types.
# 2. The name of each element is the name of the argument.
# The name of each list is the agent number.
parse.args <- function(m.list) {
  
  # Extract nodes from master list
  nodes <- m.list[grep("^a[[:digit:]]", names(m.list))]
  
  # If no agent nodes, return empty list
  if(length(nodes) == 0)
    return(list())
  
  # For each agent, extract entries regarding arguments, if any.
  # Replace with NULL if there are none.
  # Unfortunately, the NULL will be in its own list.
  # This results in each list having the 3 lists of argument information,
  # or 1 list of NULL.
  nodes <- lapply(nodes, function(x) {
              
              indices <- grep("args", names(x))
              
              if(identical(indices, integer(0)))
                return(NULL)
              else
                x[indices]
           })
  
  # Then, for each agent (loop through indices of main list),
  # figure out which indices of the main list do not have a list of NULL.
  nodes.indices <- 1:length(nodes)
  nodes.indices <- lapply(nodes.indices, function(i)
                   {
                      if(is.null(nodes[[i]]))
                        return(NULL)
                      else
                        return(i)
                   })
  nodes.indices <- unlist(nodes.indices, recursive = TRUE, use.names = FALSE)
  
  # If null, that means there are no agents have arguments. Return empty list.
  if(is.null(nodes.indices))
    return(list())
  
  # Extract nodes with arguments.
  nodes <- nodes[nodes.indices]
  
  # For each agent, convert to a named list such that
  # each element is the value of an argument converted back to its original type,
  # the name of each element is the name of the argument
  # This results in a list of named lists.
  args <- lapply(nodes, function(node) {
    
    # Extract the argument values and types for easy reference
    vals <- node[[grep("values", names(node))]]
    types <- node[[grep("types", names(node))]]
    
    # Obtain the indices. 
    # This is so that we could use the lapply function instead of a loop.
    indices <- c(1:length(vals))
    
    # The alternative to using a loop.
    # For each index, i, convert the argument value into its original argument type.
    # Types supported: logical, integer, numeric
    # Anything else remains as a string.
    vals <- lapply(indices, function(i) {
      if(types[i] == "logical")
        return(as.logical(vals[i]))
      else if(types[i] == "integer")
        return(as.integer(vals[i]))
      else if(types[i] == "numeric")
        return(as.numeric(vals[i]))
      else
        return(vals[i])
    })
    
    # Before returning, name each argument value with its corresponding argument name
    names(vals) <- node[[grep("names", names(node))]]
    return(vals)
  }) # end type conversion for each agent
  
  return(args)
}

# Environment parser
parse.envi <- function(m.list) {
  
  env <- m.list$environment
  
  # Remove nested lists which are parsed in a different
  # function
  env <- env[-which(names(env) == "sourcedScripts")]
  env <- env[-which(names(env) == "sourcedScriptTimeStamps")]
  if ("sourcedScriptHashes" %in% names(env)) {
    env <- env[-which(names(env) == "sourcedScriptHashes")]
  }
  
  # Swap rows and columns for clarity and apply name the column
  environment <- t(as.data.frame(env))
  environment <- data.frame(environment, stringsAsFactors = F)
  colnames(environment) <- c("value")
  environment <- cbind.data.frame(rownames(environment), environment, stringsAsFactors = F)
  names(environment)[names(environment) == "rownames(environment)"] <- "label"
  rownames(environment) <- NULL
  
  return(environment)
}

# Libraries parser
parse.libs <- function(m.list) {
  # Use the general function, however it will 
  # add unneeded columns
  libraries <- parse.from.identifier("l", m.list)
  
  # Pull out columns of info wanted.  The "whereLoaded" column was
  # added with rdtLite 1.4
  columns <- colnames(libraries)
  if ("whereLoaded" %in% columns) {
  	libraries <- libraries[,c("id", "name", "version", "whereLoaded")]
  }
  else {
    libraries <- libraries[,c("id", "name", "version")]
    libraries$whereLoaded <- "unknown"
  }
  return(libraries)
}

# Source scripts parser
parse.scripts <- function(m.list) {
  
  # The source scripts are nested in the environment object
  env <- m.list$environment
  
  # Put the main script in the table
  scripts.df <- 
    if ("scriptHash" %in% names(env)) 
      data.frame (script = env$script[1], 
                  timestamp = env$scriptTimeStamp[1],
                  hash = env$scriptHash[1], stringsAsFactors = F)
    else 
      data.frame (script = env$script[1], 
                  timestamp = env$scriptTimeStamp[1],
                  hash = "", stringsAsFactors = F)
  
  # Grab the sourced script names
  scripts <- env$`sourcedScripts`
  
  if (length(scripts) > 0 && scripts[1] != "") {
    # Append the sourced scripts
    scripts.df <- 
      if ("sourcedScriptHashes" %in% names(env)) 
        rbind (scripts.df, cbind(script = scripts, 
                                 timestamp = env$`sourcedScriptTimeStamps`, 
                                 hash = env$`sourcedScriptHashes`))
      else 
        rbind (scripts.df, cbind(script = scripts, 
                                 timestamp = env$`sourcedScriptTimeStamps`, 
                                 hash = rep("", length(scripts))))
  }
  
  return(scripts.df)
}

# Parse a vector of elapsedTime values from strings to a vector of doubles.
# elapsedTime strings can have ',' and/or '.' for digit grouping and/or as a decimal separator.
# elapsedTime values will always have at least a decimal separator.
parse.elapsed.time <- function(vector) {
	
	vector <- sapply(
		vector,
		function(str) {
			# Try to parse the string normally as a double.
			# If and when it fails, a warning will be thrown and NA will be returned.
			val <- suppressWarnings(as.double(str))
			
			# If parsing fails, manipulate the string into a format where it will parse.
			# e.g. When ',' and/or '.' are used to group digits or as a decimal separator
			if(is.na(val[1])) {
				
				# split string into array where ',' or '.' occurs
				# there will always be at least a decimal separator
				regex <- '(,|\\.)'
				parts <- strsplit(str, regex)[[1]]
				
				# as the last part is always the part after the decimal separator,
				# add a '.' before it before combining the parts back into a single string
				parts[length(parts)] <- paste('.', parts[length(parts)], sep='')
				str <- paste(parts, collapse='')
				
				val <- as.double(str)
			}
			
			return(val)
		})
	
	return(unname(vector))
}

#' Provenance parser
#' 
#' The prov.parse function parses the provenance collected by rdt or rdtLite.  This 
#' provenance can be stored in a prov.json file or passed to prov.parse as a string.  
#' The provParseR package also defines
#' a number of functions that extract and return information from the parsed
#' provenance.
#'
#' @param prov.input A path to a json file that has been created by rdt
#'   or rdtLite or a string that is in prov.json format.
#' @param isFile A logical value that indicates whether the provenance information 
#'   is stored in a file (isFile=T) or in a string (isFile=F).
#' @return A ProvInfo object that can be passed to the access functions provided
#'   by the provParseR package.
#' @export
#' @examples
#' prov <- prov.parse(system.file ("testdata", "prov.json", package="provParseR", mustWork=TRUE))
#' @seealso The access functions, including \code{\link{get.environment}}
prov.parse <- function(prov.input, isFile = T) {

  # If the input is a string does not need to be read from the file system
  if (isFile) {
    prov <- readLines(prov.input)
  } else {
    prov <- prov.input
  }
  
  return (methods::new (Class = "ProvInfo", prov))
}

## ==Simple wrapper functions for calling data frames==##


    
#' Provenance access functions
#' 
#' These functions extract information from a ProvInfo object created by the prov.parse function 
#' and return this information as a data frame.
#' 
#' @param prov a ProvInfo object created by calling \code{\link{prov.parse}}.
#' @param only.files If true, the output of get.input.files contains just files.  If false,
#'    it contains both files and URLs.
#' @param var.name a string containing the name of a variable used in the script the
#'  provenance is for
#' 
#' @examples
#' prov <- prov.parse(system.file ("testdata", "prov.json", package="provParseR", mustWork=TRUE))
#' get.proc.nodes(prov)
#' get.input.files(prov)
#' get.urls(prov)
#' get.output.files(prov)
#' get.variables.set(prov)
#' get.variables.used(prov)
#' get.variable.named(prov, "z")
#' get.data.nodes(prov)
#' get.error.nodes(prov)
#' get.func.nodes(prov)
#' get.proc.proc(prov)
#' get.data.proc(prov)
#' get.proc.data(prov)
#' get.func.proc(prov)
#' get.func.lib(prov)
#' get.libs(prov)
#' get.scripts(prov)
#' get.environment(prov)
#' get.val.type(prov, "d1")
#' get.tool.info(prov)
#' get.args(prov)
#' get.stdout.nodes(prov)
#' 
#' @return All access functions return NULL if there is no parsed provenance.  If parsed provenance
#'   exists, but there is no provenance for the type of information requested, such as no input 
#'   files, an empty data frame is returned.
#' 
#' @seealso \code{\link{prov.parse}}

#' @return get.environment returns a data frame containing information about how the provenance was collected.
#'    The data frame has 2 columns:  label and value.  The labels are: 
#'    \itemize{
#'    \item {name} {- whose value will always be "environment"}
#'    \item {architecture}
#'    \item {operatingSystem}
#'    \item {language}
#'    \item {langVersion}
#'    \item {script} {- the absolute path to the script executed}
#'    \item {scriptTimeStamp} {- when the script was last modified}
#'    \item {workingDirectory}
#'    \item {provDirectory} {- where the provenance is stored}
#'    \item {provTimeStamp} {- when the provenance was collected}
#'    \item {hashAlgorithm}
#'    }
#' @rdname access
#' @export
get.environment <- function(prov) {
  return(if (!is.null(prov)) {
    prov@envi
  } else {
    NULL
  })
}

#' @return get.libs returns a data frame describing the libraries used by the 
#'   script.  It contains 3 columns:  id, name, and version.
#' @rdname access
#' @export
get.libs <- function(prov) {
  return(if (!is.null(prov)) {
    prov@libs
  } else {
    NULL
  })
}

#' @return get.tool.info returns a data frame describing the tool that 
#'   collected the provenance.  It contains 3 columns:  tool.name, tool.version
#'   and json.version.  
#' @rdname access
#' @export
get.tool.info <- function(prov) {
  return(if (!is.null(prov)) {
            prov@agents[c("tool.name", "tool.version", "json.version")]
          } else {
            NULL
          })
}

#' @return get.args returns a named list describing the arguments that were passed
#'    to prov.run or prov.init when the provenance was collected.
#'    Each element is the value of an argument in its original type, 
#'    each element name is the name of the arguemnt the value corresponds to.
#' @rdname access
#' @export
get.args <- function(prov) {
  if(is.null(prov)) {
    return(NULL)
  }
  else {
    # removes the 'a' identifier from the id of the agent each list corresponds to
    args <- prov@args
    names(args) <- grep("[[:digit:]]", names(args))
    
    return(args)
  }
}

#' @return get.scripts returns a data frame identifying all the scripts executed.  The main script
#'    will be first, followed by all sourced scripts.  The data frame contains 
#'    2 columns:  name and timestamp (when the script was last modified).  
#' @rdname access
#' @export
get.scripts <- function(prov) {
  return(if (!is.null(prov)) {
    prov@scripts
  } else {
    NULL
  })
}

#' @return get.saved.scripts returns a data frame identifying the location of saved copies
#'    of all the scripts executed.  The main script
#'    will be first, followed by all sourced scripts.  The data frame contains 
#'    2 columns:  name and timestamp (when the script was last modified).  
#' @rdname access
#' @export
get.saved.scripts <- function (prov) {
  scripts <- get.scripts(prov)
  env <- get.environment(prov)
  prov.dir <- env[env$label == "provDirectory", ]$value
  
  script.names <- paste0(prov.dir, "/scripts/", basename(scripts$script))
  script.timestamps <- unname(scripts$timestamp)
  
  return(data.frame(script = script.names, timestamp = script.timestamps, stringsAsFactors = FALSE))
}

#' @return get.proc.nodes returns a data frame identifying all the procedural nodes executed.  
#'    These are represented in PROV-JSON as activities and include nodes
#'    corresponding to lines of code, start or finish nodes that surround
#'    blocks of code, and nodes to represent the binding of function arguments
#'    to parameters.  The data frame contains 
#'    8 columns:  
#'    \itemize{
#'      \item{id} {- a unique id}
#'      \item{name} {- a description of what the node represents.  Often this is a line of code from
#'        the script, perhaps shortened}
#'      \item{type} {- one of Operation, Binding, Start, Finish, or Incomplete}
#'      \item{elapsedTime} {- when this executed relative to the start of the script}
#'      \item {scriptNum} {- a number identifing the script it comes from, with script 1 being the main
#'        script}
#'      \item {startLine} {- the line in the script this corresponds to, which may be NA, and the following
#'        other position infofmation}
#'      \item {startCol}
#'      \item {endLine}
#'      \item {endCol}
#'    }
#' @rdname access
#' @export
get.proc.nodes <- function(prov) {
  if (!is.null(prov)) {
    proc.nodes <- prov@proc.nodes
    if (ncol(proc.nodes) == 0) {
      #   id      name      type elapsedTime scriptNum startLine startCol endLine endCol
      # 1 p1 Issue10.R     Start       0.441        NA        NA       NA      NA     NA
      proc.nodes <- data.frame (id=character(), name=character(), type=character(), 
          elapsedTime=character(), scriptNum=integer(), startLine=integer(), 
          startCol=integer(), endLine=integer(), endCol=integer(), stringsAsFactors=FALSE)
    }
    return (proc.nodes)
  } else {
    return (NULL)
  }
}

#' @return get.data.nodes returns a data frame with an entry for each data node
#'   in the provenance.  The data frame contains the following columns:
#'   \itemize{
#'      \item {id} {- a unique id}
#' 			\item {name} {- the descriptive name for the node, which is generally a variable name, file name, or URL}
#' 			\item {value} {- either a text value (possible shortened) or the name of a file where the value is stored}
#' 			\item {valType} {- a description of the value's type, including its container (such as list, vector, etc.), 
#'         dimensions and member types (such as character, numeric, etc.)}
#' 			\item {type} {- the type of the node, one of Data, Snapshot, File, URL, Exception, Device, 
#'         StandardOutput, or StandardOutputSnapshot}
#' 			\item {scope} {- a hex number identifying the scope.  This is only used for node's with type Data or Snapshot}
#' 			\item {fromEnv} {- a logical value.  If true, it means the variable had a value before the script began execution}
#' 			\item {hash} {- the hash value for File nodes}
#' 			\item {timestamp} {- the time at which the node was created}
#' 			\item {location} {- for file nodes, the absolute path to the file}
#'   }
#' @rdname access
#' @export
get.data.nodes <- function(prov) {
  if (!is.null(prov)) {
    data.nodes <- prov@data.nodes
    if (ncol (data.nodes) == 0) {
      #   id  name   value    valType                                                 type   scope
      # 1 d1   y       5  {"container":"vector", "dimension":[1], "type":["numeric"]} Data R_GlobalEnv
            
      # fromEnv      hash    timestamp   location
      #  FALSE
      
      data.nodes <- data.frame (id=character(), name=character(), value=character(), valType=character(),
          type=character(), scope=character(), fromEnv=logical(), hash=character(), timestamp=character(), 
          location=character(), stringsAsFactors=FALSE)
    }
    return (data.nodes)
    
  } else {
    return (NULL)
  }
}

#' @return get.stdout.nodes returns a data frame with an entry for each standard output node
#'   in the provenance.  The data frame contains the following columns:
#'   \itemize{
#'      \item {id} {- a unique id}
#' 			\item {value} {- either a text value (possible shortened) or the name of a file where the value is stored}
#' 			\item {timestamp} {- the time at which the node was created}
#'   }
#' @rdname access
#' @export
get.stdout.nodes <- function(prov) {
  data.nodes <- get.data.nodes(prov)
  stdout.nodes <- data.nodes[data.nodes$type %in% c("StandardOutput","StandardOutputSnapshot"),]
  stdout.table <- subset (stdout.nodes, select=c("id", "value", "timestamp"))
  return (stdout.table)
}
  
#' @return get.error.nodes returns a data frame with an entry for each error node
#'   in the provenance.  The data frame contains the following columns:
#'   \itemize{
#'      \item {id} {- a unique id}
#' 			\item {value} {- either a text value (possible shortened) or the name of a file where the value is stored}
#' 			\item {timestamp} {- the time at which the node was created}
#'   }
#' @rdname access
#' @export
get.error.nodes <- function(prov) {
  data.nodes <- get.data.nodes(prov)
  error.nodes <- data.nodes[data.nodes$type=="Exception",]
  error.table <- subset (error.nodes, select=c("id", "value", "timestamp"))
  return (error.table)
}

#' @return get.func.nodes returns a data frame containing information about the functions
#'   used from other libraries within the script.  The data frame has 2 columns:  id 
#'   (a unique id) and name (the name of the function called).  
#' @rdname access
#' @export
get.func.nodes <- function(prov) {
  if (!is.null(prov)) {
    func.nodes <- prov@func.nodes
    if (ncol(func.nodes) == 0) {
      #   id    name
      # 1 f1 str_to_upper
      func.nodes <- data.frame(id=character(), name=character(), stringsAsFactors=FALSE)
    }
    return(func.nodes)
  } else {
    return (NULL)
  }
}

#' @return get.proc.proc returns a data frame containing information about the edges
#'   that go between two procedural nodes.  These edges indicate a control-flow relationship
#'   between the two activities.  The data frame has 3 columns:  id 
#'   (a unique id), informant (the tail of the edge), and informed (the head of the edge).  
#' @rdname access
#' @export
get.proc.proc <- function(prov) {
  if (!is.null(prov)) {
    proc.proc.edges <- prov@proc.proc.edges
    if (ncol(proc.proc.edges) == 0) {
        #     id  informant informed
        # 1   pp1    p1       p2
      proc.proc.edges <- data.frame (id=character(), informant=character(), 
          informed=character(), stringsAsFactors=FALSE)
    }
    return (proc.proc.edges)
    
  } else {
    return (NULL)
  }
}

#' @return get.data.proc returns a data frame containing information about the edges
#'   that go from data nodes to procedural nodes.  These edges indicate an input relationship
#'   where the data is used by the activity.  The data frame has 3 columns:  id 
#'   (a unique id), entity (the input data), and activity (the procedural node that uses the
#'   data).  
#' @rdname access
#' @export
get.data.proc <- function(prov) {
  if (!is.null(prov)) {
    data.proc.edges <- prov@data.proc.edges
    if (ncol (data.proc.edges) == 0) {
      #     id    entity  activity
      # 1   dp1     d1       p7
      data.proc.edges <- data.frame (id=character(), entity=character(), 
          activity=character(), stringsAsFactors=FALSE)
    }
    return (data.proc.edges)
  } else {
    return (NULL)
  }
}

#' @return get.proc.data returns a data frame containing information about the edges
#'   that go from procedural nodes to data nodes.  These edges indicate an output relationship
#'   where the data is produed by the activity.  The data frame has 3 columns:  id 
#'   (a unique id), entity (the output data), and activity (the procedural node that produces the
#'   data).  
#' @rdname access
#' @export
get.proc.data <- function(prov) {
  if (!is.null(prov)) {
    proc.data.edges <- prov@proc.data.edges
    if (ncol(proc.data.edges) == 0) {
      #     id  activity entity
      # 1   pd1   p6       d1
      proc.data.edges <- data.frame (id=character(), activity=character(), 
          entity=character(), stringsAsFactors=FALSE)
    }
    return (proc.data.edges)
  } else {
    return (NULL)
  }
}

#' @return get.proc.func returns a data frame containing information about where externally-defined
#'   functions are used in the script.  The data frame has 3 columns:  func_id (the id of the
#'   function node), activity (the procedural node 
#'   that calls the function) and function (the function's name).  
#' @rdname access
#' @export
get.func.proc <- function(prov) {
  if (is.null (prov)) return (NULL)
  
  else {
    func.proc.edges <- prov@func.proc.edges
    if (nrow (func.proc.edges) == 0) return (data.frame("func_id"=vector(), "function"=vector(), "activity"=vector()))
    func.nodes <- get.func.nodes (prov)
    func.proc.df <- merge (func.proc.edges, func.nodes, by.x = "entity", by.y = "id")
    colnames(func.proc.df)[1] <- "func_id"
    colnames(func.proc.df)[4] <- "function"
    return (func.proc.df[c("func_id", "function", "activity")])
  } 
}

#' @return get.func.lib returns a data frame containing information about what
#'   libraries externally-defined
#'   functions come from.  The data frame has 3 columns:  func_id (the id of the
#'   function node), library (a library node)
#'   and function (the name of a function).  
#' @rdname access
#' @export
get.func.lib <- function(prov) {
  if (is.null (prov)) return (NULL)
  
  else {
    func.lib.edges <- prov@func.lib.edges
    if (nrow (func.lib.edges) == 0) return (data.frame("func_id"=vector(), "library"=vector(), "function"=vector()))
    func.nodes <- get.func.nodes (prov)
    func.lib.df <- merge (func.lib.edges, func.nodes, by.x = "entity", by.y = "id")
    colnames(func.lib.df)[1] <- "func_id"
    colnames(func.lib.df)[3] <- "library"
    colnames(func.lib.df)[4] <- "function"
    return (func.lib.df[c("func_id", "function", "library")])
  } 
}

get.libs.needed <- function (prov, proc_ids) {
    func.proc.edges <- prov@func.proc.edges
    if (nrow (func.proc.edges) == 0) return (vector())
    func.nodes <- get.func.nodes (prov)
    print (func.nodes)
    matching <- func.nodes[func.nodes$activity %in% proc_ids, ]
    print (matching)
    return (matching)
}
	

#' @rdname access
#' @return get.input.files returns a data frame containing a subset of the data nodes that correspond to files that are 
#'   read by the script.  If only.files is False, the data frame contains information about both input files and URLs.
#' @export
get.input.files <- function (prov, only.files=FALSE) {
  data.nodes <- get.data.nodes(prov)
  if (is.null (data.nodes)) return (NULL)
  
  if (only.files) {
    file.nodes <- data.nodes[data.nodes$type == "File", ]
  }
  else {
    file.nodes <- data.nodes[data.nodes$type %in% c("File","URL"), ]
  }
  if (nrow (file.nodes) == 0) {
    return (file.nodes)
  }
  
  input.data <- get.data.proc(prov)$entity
  input.files <- file.nodes[file.nodes$id %in% input.data, ]
  return (input.files)
}

#' @rdname access
#' @return get.urls returns a data frame containing a subset of the data nodes that correspond to urls used 
#'   in the script.  
#' @export
get.urls <- function (prov) {
  data.nodes <- get.data.nodes(prov)
  if (is.null (data.nodes)) return (NULL)
  
  url.nodes <- data.nodes[data.nodes$type == "URL", ]
  return (url.nodes)
}

#' @rdname access
#' @return get.output.files returns a data frame containing a subset of the data nodes that correspond to files that are 
#'   written by the script.  
#' @export
get.output.files <- function (prov) {
  data.nodes <- get.data.nodes(prov)
  if (is.null (data.nodes)) return (NULL)
  
  file.nodes <- data.nodes[data.nodes$type == "File", ]
  if (nrow (file.nodes) == 0) {
    return (file.nodes)
  }
  
  output.data <- get.proc.data(prov)$entity
  output.files <- file.nodes[file.nodes$id %in% output.data, ]
  return (output.files)
}

#' @rdname access
#' @return get.preexisting returns a data frame containing variables in the global environment 
#' that are used but not set by a script or a console session.
#' @export
get.preexisting <- function(prov) {
  data.nodes <- get.data.nodes(prov)
  if (is.null (data.nodes)) return (NULL)

  vars <- data.nodes[data.nodes$fromEnv == TRUE, "name"]
  vars <- as.data.frame(vars, stringsAsFactors = FALSE)
  colnames(vars) <- "name"
  
  return(vars)
}

#' @rdname access
#' @return get.variables.set returns a data frame containing a subset of the data nodes that correspond to variables
#'   assigned to in the script.  
#' @export
get.variables.set <- function (prov) {
  data.nodes <- get.data.nodes(prov)
  if (is.null (data.nodes)) return (NULL)
  
  data.nodes <- data.nodes[data.nodes$type %in% c ("Data", "Snapshot"), ]
  if (nrow (data.nodes) == 0) {
    return (data.nodes)
  }
  
  output.data <- get.proc.data(prov)$entity
  variables.set <- data.nodes[data.nodes$id %in% output.data, ]
  return (variables.set)
}

#' @rdname access
#' @return get.variables.used returns a data frame containing a subset of the data nodes that correspond to variables
#'   whose values are used in the script.  
#' @export
get.variables.used <- function (prov) {
  data.nodes <- get.data.nodes(prov)
  if (is.null (data.nodes)) return (NULL)
  
  data.nodes <- data.nodes[data.nodes$type %in% c ("Data", "Snapshot"), ]
  if (nrow (data.nodes) == 0) {
    return (data.nodes)
  }
  
  input.data <- get.data.proc(prov)$entity
  variables.used <- data.nodes[data.nodes$id %in% input.data, ]
  return (variables.used)
}

#' @rdname access
#' @return get.variable.named returns a data frame containing a subset of the data nodes that correspond to variables
#'   with the specified name.  
#' @export
get.variable.named <- function (prov, var.name) {
  data.nodes <- get.data.nodes(prov)
  if (is.null (data.nodes)) return (NULL)
  
  variable.nodes <- data.nodes[data.nodes$type %in% c ("Data", "Snapshot"), ]
  variable.nodes <- variable.nodes[variable.nodes$name == var.name, ]
  return (variable.nodes)
}

#' get.val.type parses the valTypes of each data node in the given provenance,
#'	or the valType of the specified node, and returns it in a data frame.
#'
#' @param node.id A vector of node id.
#' @return A data frame containing the valType of the specified data node, 
#'	or the valTypes of all data nodes if no data node is specified. Return NULL
#'	if there are no data nodes or if the specified data node is not found.
#'  If not NULL, the data frame will contain 4 columns in the following order:
#'   \itemize{
#'      \item {id} {- The data node id.}
#'      \item {container} {- The type for the data's container, such as list, vector, etc. 
#'         NA in cases such as environment and function where the original valType is not a json object.}
#'      \item {dimension} {- The size of the data, represented as a string list when there are 2 or more dimensions.
#'         NA in cases such as environment and function where the original valType is not a json object.}
#'      \item {type} {- The type(s) contained within the container, represented as a string list in containers such as 
#'         data frames when there are multiple types. NA in cases like lists where the type of each element can be complex.}
#'   }
#' @rdname access
#' @export
get.val.type <- function(prov, node.id = NULL) {
	
	data.nodes <- get.data.nodes(prov)[ , c("id", "valType")]
	
	# extract row for specified node, if applicable
	if(! is.null(node.id))
		data.nodes <- data.nodes[data.nodes$id %in% node.id, ]
	
	# node not found, return null.
	if(nrow(data.nodes) == 0)
		return(NULL)
	
	# use sapply to parse val.type into a matrix with 3 columns
	# since it's a matrix, can query each column or just put into df!!
	parsed.val.type <- sapply(data.nodes[ , "valType"], function(val.type) {
		# a string vector to store the parsed valType
		# keep all terms as strings in order for sapply to be able to convert
		# the resulting list of character vectors into a matrix
		# container, dim, type
		arr = vector(mode = "character", length = 3L)
		
		# there are 2 types of valType:
		# a json object as a string, or
		# a simple string
		if(grepl("^\\{(.+)\\}$", val.type)) {
			
			# Type is string parsed from entity valType
			val.type <- jsonlite::fromJSON(val.type)
			
			arr[1] <- val.type$container
			
			# format dimension and type into a list
			# so that we can put it in a single element of a data frame
			arr[2] <- paste(val.type$dimension, collapse = ",")
			
			# type could be null (e.g. list)
			if(is.null(val.type$type))
				arr[3] <- NA
			else
				arr[3] <- paste(val.type$type, collapse= ", ")
			
		} else {
			arr[1] <- NA
			arr[2] <- NA
			arr[3] <- val.type
		}
		
		return(arr)
	}, USE.NAMES = FALSE)
	
	# form result data frame and return
	result <- data.frame("id" = data.nodes[ , "id"],
						 "container" = parsed.val.type[1, ],
						 "dimension" = parsed.val.type[2, ],
						 "type" = parsed.val.type[3, ],
						 stringsAsFactors = FALSE)
	return(result)
}

## ==== ##