# Copyright (C) President and Fellows of Harvard College and
# Trustees of Mount Holyoke College, 2014, 2015, 2016, 2017, 2018.
# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program. If not, see
# <http://www.gnu.org/licenses/>.
############################ API_rdt.R #############################
#' Provenance Collection Functions
#'
#' prov.init intializes a new provenance graph. Called by the user
#' in console mode.
#'
#' RDataTracker is an R package that collects provenance as an R script
#' executes. The resulting provenance provides a detailed record of the
#' execution of the script and includes information on the steps that were
#' performed and the intermediate data values that were created. The
#' resulting provenance can be used for a wide variety of applications
#' that include debugging scripts, cleaning code, and reproducing results.
#'
#' There are two ways in which a user can collect provenance. To collect
#' provenance from commands stored in a script file, use prov.run. This
#' will execute the commands that are in the script, collecting provenance
#' as it does so.
#'
#' The user can also collect provenance while executing commands in the
#' console. To do this, first execute prov.init. Then enter console
#' commands as normal. When done with the commands for which you want
#' provenance, use prov.quit. If you want to save the current provenance
#' without turning off provenance collection, call prov.save instead of
#' prov.quit. You can call prov.save multiple times before calling prov.quit.
#' Each call will append to the same provenance file.
#'
#' The provenance is stored in PROV-JSON format. For immediate use it may
#' be retrieved from memory using the prov.json function. For later use
#' the provenance is also written to the file prov.json. This file and
#' associated files are written by default to the R session temporary
#' directory. The user can change this location by (1) using the optional
#' parameter prov.dir in the prov.run or prov.init functions, or (2) setting
#' the prov.dir option (e.g. by using the R options command or editing the
#' Rprofile.site or .Rprofile file). If prov.dir is set to ".", the current working
#' directory is used.
#'
#' The level of detail collected by RDataTracker may be set using parameters
#' of the prov.run and prov.init functions. Options include collecting
#' provenance inside functions and inside control constructs and saving
#' snapshots of large intermediate values as separate files. These
#' features are turned off by default to optimize performance. Common
#' settings for the level of detail can also be set and managed using the
#' prov.set.detail and related functions.
#' @param prov.dir the directory where the provenance graph will be
#' saved. If not provided, the directory specified by the prov.dir
#' option is used. Otherwise the R session temporary directory
#' is used.
#' @param overwrite if FALSE, includes a time stamp in the provenance
#' graph directory name.
#' @param annotate.inside.functions if TRUE, provenance is collected
#' inside functions.
#' @param first.loop the first loop to collect provenance in a for,
#' while, or repeat statement.
#' @param max.loops the maximum number of loops to collect
#' provenance in a for, while, or repeat statement. If max.loops = -1,
#' there is no limit. If max.loops = 0, no loops are annotated.
#' If non-zero, it indicates the number of iterations of each loop for
#' which provenance should be collected. If max.loops is non-zero, provenance
#' is also collected inside if-statements.
#' @param snapshot.size the maximum size for snapshot files. If 0,
#' no snapshots are saved. If Inf, the complete state of an object is stored
#' in the snapshot file. For other values, the head of the object, truncated
#' to a size near the specified limit, is saved. The size is in kilobytes.
#' @param hash.algorithm the hash algorithm to use for files.
#' Choices are md5 (default), sha1, crc32, sha256, sha512, xxhash32,
#' xxhash64 and murmur32. This feature uses the digest function from
#' the digest package.
#' @param save.debug If TRUE, debug files are saved to the debug directory.
#' This is intended for developers of the RDataTracker / provR package.
#' @return prov.init initializes the provenance collector. The prov.init
#' function does not return a value.
#' @export
#' @rdname prov.run
#' @seealso \code{\link{prov.json}} for access to the JSON text of the provenance,
#' \code{\link{prov.display}} to view the provenance graphically.
#' \code{\link{prov.set.detail}} to see an alternative way to set the amount of
#' provenance collected.
#' \code{\link{prov.annotate.on}} and \code{\link{prov.annotate.off}} to see how to control
#' annotation of individual functions
prov.init <- function(prov.dir = NULL, overwrite = TRUE, annotate.inside.functions =
FALSE, first.loop = 1, max.loops = 0, snapshot.size = 0, hash.algorithm = "md5",
save.debug = FALSE) {
if (.ddg.is.set("ddg.initialized") && .ddg.get ("ddg.initialized") == TRUE) {
stop ("Provenance collection is already started.
Call prov.quit() to stop the current collection before starting a new one.")
return()
}
# Save name of provenance collection tool.
.ddg.set("ddg.tool.name", "RDataTracker")
# Save hash algorithm
.ddg.set("ddg.hash.algorithm", hash.algorithm)
# Initialize list of input & output file nodes
.ddg.init.filenodes()
# Initialize hash table
.ddg.init.hashtable()
# Set environment constants.
.ddg.set.details.omitted(FALSE)
# If ddg.detail is not set, use values of annotate.inside, max.loops
# and snapshot.size.
if (is.null(prov.get.detail())) {
# Store value of annotate.inside.
.ddg.set("ddg.annotate.inside", annotate.inside.functions)
# Store maximum number of loops to annotate.
if (max.loops < 0) max.loops <- 10^10
# Store maximum snapshot size.
.ddg.set("ddg.snapshot.size", snapshot.size)
}
# Intialize loops
.ddg.init.loops (first.loop, max.loops)
# Set functions to be used for script annotation
.ddg.set.annotation.functions()
# Functions to be annotated
.ddg.set("ddg.annotate.on", NULL)
# Functions not to be annotated
.ddg.set("ddg.annotate.off", NULL)
# Intialize return values
.ddg.init.return.values()
# Set DGD Explorer port
.ddg.set("ddg.explorer.port", 6096)
# Initialize the stack of commands and environments being executed in active functions
.ddg.set("ddg.cur.expr.stack", vector())
.ddg.init (prov.dir, overwrite, save.debug)
}
#' prov.save
#'
#' prov.save saves the current provenance graph to a prov-json file.
#' If more R statements are executed, the provenance for these statements
#' is added to the graph. The graph is finalized with prov.quit.
#' Called by the user in console mode.
#' @return prov.save writes the current provenance to a file but does not
#' return a value.
#' @export
#' @rdname prov.run
prov.save <- function(save.debug = FALSE) {
.ddg.save (save.debug)
}
#' prov.quit
#'
#' prov.quit saves and closes the current provenance graph.
#' Called by the user in console mode.
#' @return prov.quit writes the current provenance to a file but does not
#' return a value.
#' @export
#' @rdname prov.run
prov.quit <- function(save.debug = FALSE) {
.ddg.quit (save.debug)
}
#' prov.run
#'
#' prov.run initiates execution of a script and collects provenance as
#' the script executes.
#' @param r.script.path the full path to the R script file that is being
#' executed. A copy of the script will be saved with the provenance graph.
#' @param display if TRUE, the provenance graph is displayed in DDG Explorer
#' @return prov.run runs a script, collecting provenance as it does so.
#' It does not return a value.
#' @export
#' @rdname prov.run
#' @examples
#' \dontrun{prov.run ("script.R")}
#' prov.init()
#' a <- 1
#' b <- 2
#' prov.save()
#' ab <- a + b
#' prov.quit()
prov.run <- function(r.script.path, prov.dir = NULL, overwrite = TRUE,
annotate.inside.functions = FALSE, first.loop = 1, max.loops = 0,
snapshot.size = 0, hash.algorithm = "md5", save.debug = FALSE, display = FALSE) {
# Stop & display message if R script path is missing
if (missing(r.script.path)) {
stop("Please provide the name of the R script to execute. If the script
is not in the working directory, please include the full path.")
}
# Stop & display message if R script file is not found
if (!file.exists(r.script.path)) {
stop("R script file not found.")
}
# Store R script path
.ddg.set("ddg.r.script.path", r.script.path)
# Set script mode to True
.ddg.set("ddg.script.mode", TRUE)
# Initialize the provenance graph
prov.init(prov.dir, overwrite, annotate.inside.functions, first.loop, max.loops,
snapshot.size, hash.algorithm, save.debug)
# Execute the script
.ddg.run(r.script.path)
# Display the graph in DDG Explorer
if (display == TRUE) prov.display()
}
#' prov.source
#'
#' prov.source reads and executes an R script. RDataTracker
#' automatically collects provenance for sourced scripts.
#' This function is provided for compatibility with scripts
#' that have been annotated for use with provR.
#' @param file the name of the R script file to source.
#' @return The prov.source function does not return a value.
#' @export
#' @rdname prov.run
prov.source <- function(file) {
# Stop & display message if argument is missing or not in script mode
if (missing(file) || !.ddg.script.mode()) {
stop("The prov.source function is for script annotation only.
Please use prov.run to execute a script and collect provenance.")
}
.ddg.source(file)
}
#' Provenance Access Functions
#'
#' prov.json returns the current provenance graph as a prov-json string.
#'
#' RDataTracker collects provenance as a script executes. Once collected,
#' prov.json can be called to access the provenance as a JSON string.
#' This is useful for applications that operate on the provenance. The
#' JSON is consistent with the PROV-JSON standard.
#'
#' One such application is a graphic visualizer built into RDataTracker.
#' To view the provenance graphically, call prov.display. In the provenance
#' graph, the nodes are data values and operations, with edges connecting
#' them to show data and control flow dependencies. The visualizer also
#' allows the user to view intermediate values of variables, and to graphically
#' view the lineage of how a value was computed, or to look at how a value
#' is used moving forward in the computation. The user can also search for specific
#' data or operation nodes, files, or error messages in the provenance.
#'
#' @return prov.json returns the current provenance graph as a prov-json
#' string
#' @export
#' @rdname prov.json
#' @seealso \code{\link{prov.init}} and \code{\link{prov.run}} for functions to collect provenance
#' @references PROV-JSON standard: \url{https://www.w3.org/Submission/2013/SUBM-prov-json-20130424/}
#' @references RDataTracker PROV-JSON output: \url{https://github.com/End-to-end-provenance/RDataTracker/blob/export/JSON-format.md}
#' @examples
#' prov.init()
#' a <- 1
#' b <- 2
#' ab <- a + b
#' prov.quit()
#' str <- prov.json()
#' pdir <- prov.dir()
#' \dontrun{prov.display()}
prov.json <- function() {
# This is a wrapper function.
# Calls and returns the function with the bulk of the code in OutputJSON.R
return(.ddg.json.string())
}
#' prov.dir returns the current provenance directory.
#' @return prov.dir returns the current provenance directory.
#' @export
#' @rdname prov.json
prov.dir <- function() {
# Display a message if the provenance directory has not been not set
if (is.null(.ddg.path())) {
cat("The provenance directory has not been set. It will be
set when prov.init or prov.run is called.\n")
}
return(.ddg.path())
}
#' prov.display
#'
#' prov.display displays the current provenance as a graph.
#' @return prov.display loads and displays the current provenance graph
#' in DDG Explorer. The prov.display function does not return a value.
#' @export
#' @rdname prov.json
prov.display <- function () {
provViz::prov.visualize(tool="RDataTracker")
invisible()
}
#' .ddg.set.annotation.functions sets the names of the functions used for script
#' annotation in the user's environment. This is a workaround to exporting
#' these functions and would not be allowed by CRAN.
#' @return nothing
#' @noRd
.ddg.set.annotation.functions <- function () {
assign(".ddg.details.omitted", RDataTracker:::.ddg.details.omitted,
envir = globalenv())
assign(".ddg.eval", RDataTracker:::.ddg.eval,
envir = globalenv())
assign(".ddg.finish", RDataTracker:::.ddg.finish,
envir = globalenv())
assign(".ddg.first.loop", RDataTracker:::.ddg.first.loop,
envir = globalenv())
assign(".ddg.forloop", RDataTracker:::.ddg.forloop,
envir = globalenv())
assign(".ddg.function", RDataTracker:::.ddg.function,
envir = globalenv())
assign(".ddg.loop.annotate.off", RDataTracker:::.ddg.loop.annotate.off,
envir = globalenv())
assign(".ddg.loop.annotate.on", RDataTracker:::.ddg.loop.annotate.on,
envir = globalenv())
assign(".ddg.loop.count", RDataTracker:::.ddg.loop.count,
envir = globalenv())
assign(".ddg.loop.count.inc", RDataTracker:::.ddg.loop.count.inc,
envir = globalenv())
assign(".ddg.max.loops", RDataTracker:::.ddg.max.loops,
envir = globalenv())
assign(".ddg.snapshot.size", RDataTracker:::.ddg.snapshot.size,
envir = globalenv())
assign(".ddg.not.inside.loop", RDataTracker:::.ddg.not.inside.loop,
envir = globalenv())
assign(".ddg.reset.loop.count", RDataTracker:::.ddg.reset.loop.count,
envir = globalenv())
assign(".ddg.return.value", RDataTracker:::.ddg.return.value,
envir = globalenv())
assign(".ddg.set.inside.loop", RDataTracker:::.ddg.set.inside.loop,
envir = globalenv())
assign(".ddg.source", RDataTracker:::.ddg.source,
envir = globalenv())
assign(".ddg.start", RDataTracker:::.ddg.start,
envir = globalenv())
assign(".ddg.should.run.annotated", RDataTracker:::.ddg.should.run.annotated,
envir = globalenv())
invisible()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.