# Copyright (C) President and Fellows of Harvard College and
# Trustees of Mount Holyoke College, 2020.
# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program. If not, see
# <http://www.gnu.org/licenses/>.
###############################################################################
# The provTraceR package displays information about files used or created
# by an R script or a series of R scripts.
# The prov.trace function uses existing provenance to trace file
# lineage. The prov.trace.run function runs the specified script(s),
# collects provenance, and uses the provenance to trace file lineage.
# The scripts parameter may contain a single script name, a vector
# of script names, or a text file of script names. The text file must have
# extension .txt and one script name per line. Blank lines are ignored.
# For prov.trace only: If more than one script is specified, the order
# of the scripts must match the order of execution as recorded in the
# provenance; otherwise an error message is displayed. For console sessions,
# set scripts = "console".
# For prov.trace.run only: The provenance collection tool is specified by the
# parameter prov.tool and must be "rdtLite" or "rdt". The default is "rdtLite".
# Scripts are executed in the order listed.
# It is assumed that provenance for each script is stored under a single
# provenance directory set by the prov.dir option. If not, the provenance
# directory must be specified with the prov.dir parameter. Timestamped
# provenance and provenance in scattered locations are not currently supported.
# Files are matched by hash value. INPUTS lists files that are required
# to run the script or scripts. These include files read by a script and not
# written by an earlier script or previously written by the same script.
# OUTPUTS lists files written by the script or scripts. EXCHANGES lists
# files with the same hash value that were written by one script and read
# by a later script.
# If file.details = TRUE, additional details are displayed, including execution
# timestamps, file timestamps, file hash values, and saved file names.
# Both functions return results as a string. If console = TRUE (the default),
# results are displayed in the console. If save = TRUE, results are saved to
# the file prov-trace.txt.
# The parameter save.dir determines where the results file will be saved.
# If save.dir is NULL (the default), the R session temporary directory is used.
# If save.dir = ".", the current working directory is used. Otherwise the
# directory specified by save.dir is used.
# If check = TRUE, each file recorded in the provenance is checked against the
# user's file system. A dash (-) in the output indicates that the file no longer
# exists, a plus (+) indicates that the file exists but the hash value has changed,
# and a colon (:) indicates that the file exists and the hash value is unchanged.
# If check = FALSE, no comparison is made.
###############################################################################
#' @title
#' File lineage functions
#' @description
#' prov.trace traces file lineage from existing provenance.
#' @param scripts a script name, a vector of script names, a text file of script
#' names (file extension = .txt), or "console"
#' @param prov.dir provenance directory
#' @param file.details whether to display file details
#' @param console whether to display results in the console
#' @param save whether to save results to the file prov-trace.txt
#' @param save.dir where to save the results file. If NULL, use the R session
#' temporary directory. If a period (.), use the current working directory.
#' Otherwise use save.dir.
#' @param check whether to check against the user's file system
#' @return string containing file lineage
#' @export
#' @examples
#' prov.dir <- system.file("testdata", package="provTraceR")
#' prov.trace(c("script-1.R", "script-2.R"), prov.dir=prov.dir)
#' @rdname lineage
prov.trace <- function(scripts, prov.dir=NULL, file.details=FALSE, console=TRUE,
save=FALSE, save.dir=NULL, check=TRUE) {
if (length(scripts) == 1 && tools::file_ext(scripts) == "txt") {
scripts <- get.scripts.from.file(scripts)
}
check.scripts(scripts)
prov <- get.provenance(scripts, prov.dir)
verify.order.of.execution(prov, scripts)
lineage <- trace.files(prov, scripts, file.details, check)
if (console == TRUE) {
cat("\n")
cat(lineage)
}
if (save == TRUE) {
save.to.text.file(lineage, save.dir)
}
invisible(lineage)
}
#' @description
#' prov.trace.run runs the specified script(s), collects provenance, and uses
#' the provenance to trace file lineage.
#' @param scripts a script name, a vector of script names, or a text file of
#' script names (file extension = .txt)
#' @param prov.dir provenance directory
#' @param file.details whether to display file details
#' @param console whether to display results in the console
#' @param save whether to save results to the file prov-trace.txt
#' @param save.dir where to save the results file. If NULL, use the R session
#' temporary directory. If a period (.), use the current working directory.
#' Otherwise use save.dir.
#' @param check whether to check against the user's file system
#' @param prov.tool provenance collection tool (rdtLite or rdt)
#' @param details whether to collect fine-grained provenance
#' @param ... other parameters passed to the provenance collector
#' @return string containing file lineage
#' @export
#' @rdname lineage
prov.trace.run <- function(scripts, prov.dir=NULL, file.details=FALSE, console=TRUE,
save=FALSE, save.dir=NULL, check=TRUE, prov.tool="rdtLite", details=FALSE, ...) {
if (length(scripts) == 1 && tools::file_ext(scripts) == "txt") {
scripts <- get.scripts.from.file(scripts)
}
check.scripts(scripts)
run.scripts(scripts, prov.tool, details, ...)
prov <- get.provenance(scripts, prov.dir)
lineage <- trace.files(prov, scripts, file.details, check)
if (console == TRUE) {
cat("\n")
cat(lineage)
}
if (save == TRUE) {
save.to.text.file(lineage, save.dir)
}
invisible(lineage)
}
#' get.scripts.from.file reads one or more script names from a text file
#' (file extension must be .txt). Blank lines are ignored.
#' @param scripts a text file containing script names
#' @return a vector of script names
#' @noRd
get.scripts.from.file <- function(scripts) {
if (!file.exists(scripts)) {
stop(scripts, "not found")
}
script.names <- readLines(scripts, warn=FALSE)
# empty file
if (length(script.names) == 0) {
stop("Text file is empty")
}
ss <- vector()
# skip blank lines
for (i in 1:length(script.names)) {
sname <- trimws(script.names[i])
if (sname != "") {
ss <- append(ss, sname)
}
}
return(ss)
}
#' check.scripts stops execution if scripts is empty.
#' @param scripts a vector of script names
#' @return no return value
#' @noRd
check.scripts <- function(scripts) {
snum <- length(scripts)
if (snum == 0) {
stop("Vector of script names is empty")
}
for (i in 1:snum) {
sname <- scripts[i]
if (nchar(sname) == 0) {
stop("Script name is empty")
}
}
}
#' check.prov.tool checks that the specfied provenance collector
#' is rdtLite or rdt and checks that the tool is installed.
#' @param prov.tool a provenance collection tool
#' @return no return value
#' @noRd
check.prov.tool <- function(prov.tool) {
# check tool
if (prov.tool != "rdtLite" && prov.tool != "rdt") {
stop("Provenance collector must be rdtLite or rdt")
}
# check installation
if (!requireNamespace(prov.tool, quietly=TRUE)) {
stop(prov.tool, " is not installed")
}
}
#' run.scripts run scripts in the order specified and collects provenance.
#' @param scripts a vector of script names
#' @param prov.tool provenance collection tool (rdtLite or rdt)
#' @param details whether to collect fine-grained provenance
#' @param ... other parameters passed to the provenance collector
#' @return no return value
#' @noRd
run.scripts <- function(scripts, prov.tool, details, ...) {
# check provenance collector
check.prov.tool(prov.tool)
# select tool
if (prov.tool == "rdtLite") {
prov.run <- rdtLite::prov.run
} else {
prov.run <- rdt::prov.run
}
# run each script in turn
snum <- length(scripts)
for (i in 1:snum) {
sname <- scripts[i]
if (sname == "console") {
stop("Use prov.trace for console sessions")
} else if (!file.exists(sname)) {
stop(sname, "not found")
}
tryCatch(prov.run(sname, details=details, ...), error = function(x) {print (x)})
}
}
#' get.provenance returns a list containing provenance for each script.
#' @param scripts a vector of script names
#' @param prov.dir provenance directory
#' @return a list of provenance for each script
#' @noRd
get.provenance <- function(scripts, prov.dir) {
snum <- length(scripts)
prov <- list()
# get provenance directory
if (is.null(prov.dir)) {
prov.dir <- getOption("prov.dir")
} else {
prov.dir <- normalizePath(prov.dir, winslash="/", mustWork=FALSE)
}
if (!dir.exists(prov.dir)) {
stop(prov.dir, "not found")
}
# get provenance for each script
for (i in 1:snum) {
sname <- scripts[i]
if (sname == "console") {
file.name <- "console"
} else if (toupper(substr(sname, nchar(sname)-1, nchar(sname))) != ".R") {
stop(sname, "must end in .R or .r")
} else {
file.name <- substr(sname, 1, nchar(sname)-2)
}
prov.file <- paste(prov.dir, "/prov_", file.name, "/prov.json", sep="")
if (!file.exists(prov.file)) {
stop(prov.file, "not found")
}
prov[[i]] <- provParseR::prov.parse(prov.file)
}
return(prov)
}
#' verify.order.of.execution compares the specified order with the order
#' of execution recorded in the provenance.
#' @param prov a list of provenance for each script
#' @param scripts a vector of script names
#' @return no return value
#' @noRd
verify.order.of.execution <- function(prov, scripts) {
snum <- length(scripts)
# not relevant for a single script
if (snum > 1) {
ts <- vector(length=snum)
for (i in 1:snum) {
ee <- provParseR::get.environment(prov[[i]])
ts[i] <- ee[ee$label=="provTimestamp", "value"]
}
for (i in 1:(snum-1)) {
if (ts[i] > ts[i+1]) {
stop("Scripts were not run in this order")
}
}
}
}
#' trace.files returns a string containing file lineage
#' @param prov a list of provenance for each script
#' @param scripts a vector of script names
#' @param file.details whether to display file details
#' @param check whether to check against the user's file system
#' @return string containing file lineage
#' @noRd
trace.files <- function(prov, scripts, file.details, check) {
# get input & output files
infiles <- get.infiles(prov, scripts)
outfiles <- get.outfiles(prov, scripts)
# get string for each category
scripts.st <- list.scripts(prov, scripts, file.details, check)
inputs.st <- list.input.files(prov, infiles, outfiles, file.details, check)
outputs.st <- list.output.files(prov, outfiles, file.details, check)
exchanges.st <- list.exchanged.files(prov, scripts, infiles, outfiles, file.details, check)
# create file lineage string
lineage <- paste(scripts.st, inputs.st, outputs.st, exchanges.st, sep="")
return(lineage)
}
#' save.to.text.file saves the file lineage string to the file prov-trace.txt
#' on the save.dir directory.
#' @param lineage file lineage string
#' @param save.dir where to save the results file
#' @return no return value
#' @noRd
save.to.text.file <- function(lineage, save.dir) {
sdir <- get.save.dir(save.dir)
trace.file <- paste(sdir, "/prov-trace.txt", sep="")
cat(lineage, file=trace.file)
message(paste("\nSaving results in", trace.file, "\n"))
}
#' get.save.dir returns the directory where the results file (prov-trace.txt)
#' will be saved. If NULL, the R session temporary directory is used. If a
#' period (.), the current working directory is used. Otherwise save.dir is used.
#' @param save.dir where to save the results file
#' @return the result file directory
#' @noRd
get.save.dir <- function(save.dir) {
if (is.null(save.dir)) {
sdir <- normalizePath(tempdir(), winslash = "/", mustWork = FALSE)
} else if (save.dir == ".") {
sdir <- getwd()
} else {
sdir <- normalizePath(save.dir, winslash="/", mustWork=FALSE)
if (!dir.exists(sdir)) {
stop(sdir, "not found")
}
}
return(sdir)
}
#' check.file.system checks if the specified file exists in its original
#' location and if the hash value has changed.
#' @param location original file path and name
#' @param hash hash value
#' @param algorithm hash algorithm
#' @param check whether to check against the user's file system
#' @return a coded string value
#' @noRd
check.file.system <- function(location, hash, algorithm, check) {
if (check == TRUE) {
if (!file.exists(location)) {
tag <- "-"
} else if (hash != digest::digest(file=location, algo=algorithm)) {
tag <- "+"
} else {
tag <- ":"
}
} else {
tag <- " "
}
return(tag)
}
#' get.provenance.dir returns the provenance directory for a particular script.
#' @param script.prov provenance for a script
#' @return the provenance directory
#' @noRd
get.prov.dir <- function(script.prov) {
ee <- provParseR::get.environment(script.prov)
prov.dir <- ee[ee$label=="provDirectory", "value"]
prov.dir <- normalizePath(prov.dir, winslash="/", mustWork=FALSE)
return(prov.dir)
}
#' get.infiles returns a data frame of all input files.
#' @param prov a list of provenance for each script
#' @param scripts a vector of script names
#' @return a data frame of input files
#' @noRd
get.infiles <- function(prov, scripts) {
snum <- length(scripts)
infiles.list <- list()
for (i in 1:snum) {
infiles.list[[i]] <- provParseR::get.input.files(prov[[i]])
if (nrow(infiles.list[[i]]) > 0) {
# add script number
infiles.list[[i]]$script <- i
# add hash algorithm
ee <- provParseR::get.environment(prov[[i]])
infiles.list[[i]]$hash.algorithm <- ee[ee$label=="hashAlgorithm", "value"]
}
}
infiles <- infiles.list[[1]]
if (snum > 1) {
for (i in 2:snum) {
infiles <- rbind(infiles, infiles.list[[i]])
}
}
return(infiles)
}
#' get.outfiles returns a data frame of all output files.
#' @param prov a list of provenance for each script
#' @param scripts a vector of script names
#' @return a data frame of output files
#' @noRd
get.outfiles <- function(prov, scripts) {
snum <- length(scripts)
outfiles.list <- list()
for (i in 1:snum) {
outfiles.list[[i]] <- provParseR::get.output.files(prov[[i]])
if (nrow(outfiles.list[[i]]) > 0) {
# add script number
outfiles.list[[i]]$script <- i
# add hash algorithm
ee <- provParseR::get.environment(prov[[i]])
outfiles.list[[i]]$hash.algorithm <- ee[ee$label=="hashAlgorithm", "value"]
}
}
outfiles <- outfiles.list[[1]]
if (snum > 1) {
for (i in 2:snum) {
outfiles <- rbind(outfiles, outfiles.list[[i]])
}
}
return(outfiles)
}
#' list.scripts returns a string containing information for each script
#' @param prov a list of provenance for each script
#' @param scripts a vector of script names
#' @param file.details whether to display file details
#' @param check whether to check against the user's file system
#' @return string containing scripts information
#' @noRd
list.scripts <- function(prov, scripts, file.details, check) {
st <- "SCRIPTS:\n\n"
snum <- length(scripts)
for (i in 1:snum) {
ee <- provParseR::get.environment(prov[[i]])
script.name <- ee[ee$label=="script", "value"]
hash <- ee[ee$label=="scriptHash", "value"]
if (length(hash) == 0) {
hash <- "NA"
}
algorithm <- ee[ee$label=="hashAlgorithm", "value"]
if (script.name == "Console.R" || hash == "NA") {
tag <- " "
} else {
tag <- check.file.system(script.name, hash, algorithm, check)
}
st <- paste(st, i, " ", tag, " ", script.name, "\n", sep="")
# file details
if (file.details == TRUE) {
prov.dir <- get.prov.dir(prov[[i]])
saved.file <- paste(prov.dir, "/scripts/", basename(script.name), sep="")
timestamp <- ee[ee$label=="scriptTimeStamp", "value"]
executed <- ee[ee$label=="provTimestamp", "value"]
st <- paste(st, " Timestamp: ", timestamp, "\n", sep="")
st <- paste(st, " Hash: ", hash, "/", algorithm, "\n", sep="")
st <- paste(st, " Saved: ", saved.file, "\n", sep="")
st <- paste(st, " Executed: ", executed, "\n\n", sep="")
}
}
st <- paste(st, "\n", sep="")
return(st)
}
#' list.input.files returns a string containing information for each input file
#' @param prov a list of provenance for each script
#' @param infiles a data frame of input files
#' @param outfiles a data frame of output files
#' @param file.details whether to display file details
#' @param check whether to check against the user's file system
#' @return string containing input file information
#' @noRd
list.input.files <- function(prov, infiles, outfiles, file.details, check) {
st <- "INPUTS:\n\n"
count <- 0
if (nrow(infiles) > 0) {
# check for preceding or current output file with same hash value
infiles$match <- FALSE
for (i in 1:nrow(infiles)) {
if (nrow(outfiles) > 0) {
for (j in 1:nrow(outfiles)) {
if (infiles$hash[i] == outfiles$hash[j]) {
# output file from preceding script
if (outfiles$script[j] < infiles$script[i]) {
infiles$match[i] <- TRUE
# output file from current script with same data node
} else if (outfiles$script[j] == infiles$script[i]) {
if (infiles$id[i] == outfiles$id[j]) {
infiles$match[i] <- TRUE
}
}
}
}
}
}
# exclude package .rds files
for (i in 1:nrow(infiles)) {
if (tools::file_ext(infiles$name[i]) == "rds") {
infiles$match[i] <- TRUE
}
}
# display input files without a match
index <- which(infiles$match == FALSE)
count <- length(index)
if (count > 0) {
ii <- infiles[index, ]
# order by script number and location
ii <- ii[order(ii$script, ii$location), ]
for (i in 1:nrow(ii)) {
if (ii$location[i] != "") {
# display location for files
tag <- check.file.system(ii[i, "location"], ii[i, "hash"], ii[i, "algorithm"], check)
st <- paste(st, ii$script[i], " ", tag, " ", ii$location[i], "\n", sep="")
# file details
if (file.details == TRUE) {
prov.dir <- get.prov.dir(prov[[ii$script[i]]])
saved.file <- paste(prov.dir, "/", ii$value[i], sep="")
st <- paste(st, " Timestamp: ", ii$timestamp[i], "\n", sep="")
st <- paste(st, " Hash: ", ii$hash[i], "/", ii$hash.algorithm[i], "\n", sep="")
st <- paste(st, " Saved: ", saved.file, "\n\n", sep="")
}
} else {
# display name for urls
st <- paste(st, ii$script[i], " ", ii$name[i], "\n", sep="")
if (file.details == TRUE) {
st <- paste(st, "\n", sep="")
}
}
}
}
}
if (count == 0) {
st <- paste(st, "None\n", sep="")
}
st <- paste(st, "\n", sep="")
return(st)
}
#' list.output.files returns a string containing information for each output file.
#' @param prov a list of provenance for each script
#' @param outfiles a data frame of output files
#' @param file.details whether to display file details
#' @param check whether to check against the user's file system
#' @return string containing output file information
#' @noRd
list.output.files <- function(prov, outfiles, file.details, check) {
st <- "OUTPUTS:\n\n"
if (nrow(outfiles) > 0) {
oo <- outfiles
# order by script number and location
oo <- oo[order(oo$script, oo$location), ]
for (i in 1:nrow(oo)) {
tag <- check.file.system(oo[i, "location"], oo[i, "hash"], oo[i, "algorithm"], check)
st <- paste(st, oo$script[i], " ", tag, " ", oo$location[i], "\n", sep="")
# file details
if (file.details == TRUE) {
prov.dir <- get.prov.dir(prov[[oo$script[i]]])
saved.file <- paste(prov.dir, "/", oo$value[i], sep="")
st <- paste(st, " Timestamp: ", oo$timestamp[i], "\n", sep="")
st <- paste(st, " Hash: ", oo$hash[i], "/", oo$hash.algorithm[i], "\n", sep="")
st <- paste(st, " Saved: ", saved.file, "\n\n", sep="")
}
}
} else {
st <- paste(st, "None\n", sep="")
}
st <- paste(st, "\n", sep="")
return(st)
}
#' list.exchanged.files returns a string containing information for each file written
#' by one script and read by a subsequent script.
#' @param prov a list of provenance for each script
#' @param scripts a vector of script names
#' @param infiles a data frame of input files
#' @param outfiles a data frame of output files
#' @param file.details whether to display file details
#' @param check whether to check against the user's file system
#' @return string containing exchanged file information
#' @noRd
list.exchanged.files <- function(prov, scripts, infiles, outfiles, file.details, check) {
snum <- length(scripts)
# not relevant for a single script
if (snum == 1) {
st <- ""
} else {
st <- "EXCHANGES:\n\n"
count <- 0
for (i in 2:snum) {
for (j in 1:nrow(infiles)) {
for (k in 1:nrow(outfiles)) {
if (infiles$script[j] == i && outfiles$script[k] < i && infiles$hash[j] == outfiles$hash[k]) {
# display infile location if hash values match
tag <- check.file.system(infiles[j, "location"], infiles[j, "hash"], infiles[j, "algorithm"], check)
st <- paste(st, outfiles$script[k], " > ", infiles$script[j], " ", tag, " ", infiles$location[j], "\n", sep="")
# display outfile location if different
if (infiles$location[j] != outfiles$location[k]) {
st <- paste(st, " ", outfiles$location[k], "\n", sep="")
}
# file details
if (file.details == TRUE) {
prov.dir.in <- get.prov.dir(prov[[infiles$script[j]]])
saved.file.in <- paste(prov.dir.in, "/", infiles$value[j], sep="")
prov.dir.out <- get.prov.dir(prov[[outfiles$script[k]]])
saved.file.out <- paste(prov.dir.out, "/", outfiles$value[j], sep="")
st <- paste(st, " Timestamp: ", infiles$timestamp[j], "\n", sep="")
st <- paste(st, " Hash: ", infiles$hash[j], "/", infiles$hash.algorithm[j], "\n", sep="")
st <- paste(st, " Saved out: ", saved.file.out, "\n", sep="")
st <- paste(st, " Saved in: ", saved.file.in, "\n\n", sep="")
}
count <- count + 1
}
}
}
}
if (count == 0) {
st <- paste(st, "None\n\n", sep="")
}
}
return(st)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.