Nothing
# Copyright (C) President and Fellows of Harvard College and
# Trustees of Mount Holyoke College, 2019, 2020, 2021, 2022.
# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program. If not, see
# <http://www.gnu.org/licenses/>.
###############################################################################
#' Provenance comparison functions
#'
#' prov.explain reads two provenance collections and finds differences between these two versions.
#'
#' prov.explain and prov.diff.script are intended to help a user determine what
#' has changed if multiple executions of a script lead to different results. prov.explain
#' does this by comparing provenance collected using the rdtLite or rdt packages.
#' prov.diff.script compares copies of the R scripts saved in provenance directories
#' at the time that the scripts were executed.
#'
#' The types of differences that prov.explain can find include:
#' \itemize{
#' \item Environmental information identifying when the scripts were executed, the version of R,
#' the computing systems, the tool and version used to collect the provenance, the location
#' of the provenance file, and the hash algorithm used to hash data files.
#' \item Versions of libraries loaded
#' \item Versions of provenance tools
#' \item Contents and names of main and sourced scripts
#' }
#'
#' @param dir1 path of first provenance directory
#' @param dir2 path of second provenance directory
#' @param save if true saves the report to the file prov-explain.txt in the first directory
#' @export
#' @examples
#' \dontrun{prov.explain("first.test.dir", "second.test.dir")}
#' @rdname explain
prov.explain <- function (dir1, dir2, save = FALSE){
# check the existence of two given directories
check.dir.existence(dir1, dir2)
# case: two directories are the same
if (dir1 == dir2){
warning (paste(dir1, "and", dir2, "are the same directories\n"))
return (NA)
}
# detecting changes
if(save == TRUE){
save.to.text.file(dir1, dir2)
}else{
detect.changes(dir1, dir2)
}
}
#' Provenance Script Diff function
#'
#' prov.diff.script visualizes the differences between two versions of a script
#' that were previously executed.
#'
#' The prov.diff.script compares two versions of a script.
#' Users must specify the name of the first script,
#' the provenance directory associated with the first execution of
#' the script, and the provenance directory associated with the second execution of
#' the script. The name of the second script is optional. If it
#' is omitted, the same script name is looked for in the second provenance
#' directory
#' @param dir1 path to first provenance directory
#' @param dir2 path to second provenance directory
#' @param first.script name of first script. If no value is passed in, it will use the
#' main script
#' @param second.script name of second script. If both first and second script name
#' are NULL, it will use the main script form the second directory. If second
#' script name is NULL, but first script name is not, it will use first script name.
#' @export
#' @rdname explain
prov.diff.script <- function(dir1, dir2, first.script = NULL, second.script = NULL) {
# check the existence of two given directories
check.dir.existence(dir1, dir2)
# extract script name and change paths to first script saved in prov folders
if (FALSE == is.null(first.script)){
use.main.script <- FALSE
first.script <- basename(first.script)
}
else {
use.main.script <- TRUE
first.prov.info <- get.prov.info.object(dir1)
environment <- provParseR::get.environment(first.prov.info)
script.path <- environment[environment$label == "script", ]$value
first.script <- basename(script.path)
}
first.full.script <- paste(dir1, "/scripts/", first.script, sep = "")
# extract script name and change paths to second script saved in prov folders
if(FALSE == is.null(second.script)){
second.script <- basename(second.script)
}else{
if (use.main.script) {
second.prov.info <- get.prov.info.object(dir2)
environment <- provParseR::get.environment(second.prov.info)
script.path <- environment[environment$label == "script", ]$value
second.script <- basename(script.path)
}
else {
second.script <- first.script
}
}
second.full.script <- paste(dir2, "/scripts/", second.script, sep = "")
# check existence of 2 scripts
if(FALSE == file.exists(first.full.script)){
stop(paste(first.script, "not found in", dir1, "\n"))
}
if(FALSE == file.exists(second.full.script)){
stop(paste(second.script, "not found in", dir2, "\n"))
}
# show the diff
diffobj::diffFile(target = first.full.script, current = second.full.script, mode = "sidebyside")
}
#' detect.changes gets ProvInfo objects from provParseR
#' and calls other methods to find changes in different aspects
#' @param dir1 path to first prov directory
#' @param dir2 path to second prov directory
#' @noRd
detect.changes <- function (dir1, dir2){
cat("\nYou entered:\ndir1 =", dir1, "\ndir2 =", dir2)
# gets the ProvInfo objects
first.prov.info <- get.prov.info.object(dir1)
second.prov.info <- get.prov.info.object(dir2)
# detect changes in different aspects
get.script.changes (provParseR::get.scripts(first.prov.info), provParseR::get.scripts(second.prov.info), dir1, dir2)
get.library.changes (provParseR::get.libs(first.prov.info), provParseR::get.libs(second.prov.info))
get.input.files.changes (provParseR::get.input.files(first.prov.info), provParseR::get.input.files(second.prov.info))
get.environment.changes (provParseR::get.environment(first.prov.info), provParseR::get.environment(second.prov.info))
get.prov.tool.changes (provParseR::get.tool.info(first.prov.info), provParseR::get.tool.info(second.prov.info))
}
#' save.to.text.file outputs comparison results to the console
#' and saves them into a text file named prov-explain.txt located
#' in the first provenance directory
#' @param dir1 first provenance directory
#' @param dir2 second provenance directory
#' @noRd
save.to.text.file <- function(dir1, dir2) {
# gets the full path of first provenance directory
explain.file <- paste(dir1, "/prov-explain.txt", sep = "")
sink(explain.file, split=TRUE)
detect.changes(dir1, dir2)
sink()
message(paste("\n\nSaving comparison results in", explain.file))
}
#' get.library.changes gets changes in library by calling a helper
#' method find.library.changes, and prints out the result
#' @param first.lib.df first library data frame
#' @param second.lib.df second library data frame
#' @noRd
get.library.changes <- function (first.lib.df, second.lib.df){
cat("\nLIBRARY CHANGES: ")
# get the list of changes
lib.change.list <- find.library.changes(first.lib.df, second.lib.df)
# if list is null, R returns empty (non-NULL) data frames
lib.difference.df <- as.data.frame(lib.change.list[1])
lib.dir2.df <- as.data.frame(lib.change.list[2])
lib.dir1.df <- as.data.frame(lib.change.list[3])
cat("\nLibrary version differences:\n")
if(nrow(lib.difference.df) == 0){
cat("No differences in library versions have been detected")
}else{
print.data.frame(lib.difference.df, row.names = FALSE)
}
cat("\nLibraries in dir2 but not in dir1:\n")
if(nrow(lib.dir2.df) == 0){
cat("No such libraries were found")
}else{
print.data.frame(lib.dir2.df, row.names = FALSE)
}
cat("\nLibraries in dir1 but not in dir2:")
if(nrow(lib.dir1.df) == 0){
cat("No such libraries were found")
}else{
print.data.frame(lib.dir1.df, row.names = FALSE)
}
}
#' find.library.changes detects changes in libraries used based on the
#' collected provenance from two provenance folders.
#' The method returns a list of 3 data frames: library version differences,
#' libraries in dir2 but not in dir1, libraries in dir1 but not in dir2
#' @param first.lib.df first library data frame
#' @param second.lib.df second library data frame
#' @noRd
find.library.changes <- function (first.lib.df, second.lib.df) {
# case: input data frame(s) do(es) not exist, stop the function immediately
if (FALSE == check.df.existence("Library", first.lib.df, second.lib.df)) {
return(NULL)
}
# rare case: no libraries were recorded by provParseR
if (FALSE == check.df.empty("library", first.lib.df, second.lib.df)) {
return(NULL)
}
# keep the 3 columns we are interested in: id, name, version
first.lib.df <- subset(first.lib.df, select = c("id", "name", "version"))
second.lib.df <- subset(second.lib.df, select = c("id", "name", "version"))
# removes unneccesary id rows
first.lib.df <- subset(first.lib.df, select = -1)
second.lib.df <- subset(second.lib.df, select = -1)
# find library differences
# join two data frames by same "name"
same.name.libs.df <- dplyr::inner_join(first.lib.df, second.lib.df, by = "name")
# detect differences in corresponding version values of each lib name
lib.difference.df <- same.name.libs.df[same.name.libs.df$version.x != same.name.libs.df$version.y, ]
# rename the columns for easier reading
colnames(lib.difference.df) <- c("name", "dir1.version", "dir2.version")
# find libraries in dir2 but not in dir1
# get rows in 2nd df but not in 1st df
lib.dir2.df <- dplyr::anti_join(second.lib.df, first.lib.df, by = "name")
# find libraries in dir1 but not in dir2
# get rows in 1st df but not in 2nd df
lib.dir1.df <- dplyr::anti_join(first.lib.df, second.lib.df, by = "name")
return (list(lib.difference.df, lib.dir2.df, lib.dir1.df))
}
#' get.environment.changes gets environment changes by calling
#' helper method find.environment.changes and prints out the result
#' @param first.env.df first environment data frame
#' @param second.env.df second environment data frame
#' @noRd
get.environment.changes <- function(first.env.df, second.env.df) {
cat("\nENVIRONMENT CHANGES: ")
env.change.list <- find.environment.changes(first.env.df, second.env.df)
# as.data.frame returns an empty data frame if the given data frame is null
# so no need to handle null case here
env.difference.df <- as.data.frame(env.change.list[1])
env.dir2.df <- as.data.frame(env.change.list[2])
env.dir1.df <- as.data.frame(env.change.list[3])
cat("Value differences: \n")
# prints out the update
if(nrow(env.difference.df) == 0){
cat("No differences have been detected")
}else{
for(i in 1:nrow(env.difference.df)){
cat("Attribute:", env.difference.df$label[i], "\n")
cat("### dir1 value:", env.difference.df$dir1.value[i], "\n")
cat("### dir2 value:", env.difference.df$dir2.value[i], "\n")
}
}
# rare case: environment factors in dir2 but not in dir1,
# only prints out when found such factor
if(nrow(env.dir2.df) != 0){
cat("Attributes in dir2 but not in dir1:\n")
print.data.frame(env.dir2.df, row.names = FALSE)
}
# rare case: environment factors in dir1 but not in dir2,
# only prints out when found such factor
if(nrow(env.dir1.df) != 0){
cat("Attributes in dir1 but not in dir2:\n")
print.data.frame(env.dir1.df, row.names = FALSE)
}
}
#' find.environment.changes detects changes in the environment in which
#' the provenance was collected.
#' The method returns a list of 3 main information:
#' environment value changes, any environment factors only in dir2 and
#' any environment factors only in dir1
#' @param first.env.df first environment data frame
#' @param second.env.df second environment data frame
#' @noRd
find.environment.changes <- function (first.env.df, second.env.df) {
# case: input data frame(s) do(es) not exist
if (FALSE == check.df.existence("Environment", first.env.df, second.env.df)) {
return(NULL)
}
# rare case: no environment factors were recorded by provParseR, returns immediately
if (FALSE == check.df.empty("environment factor", first.env.df, second.env.df)) {
return(NULL)
}
# clean data frames of environment factors
first.env.df <- clean.environment.df(first.env.df)
second.env.df <- clean.environment.df(second.env.df)
# find environment changes
# join two data frames by "label"
same.label.env.df <- dplyr::inner_join(first.env.df, second.env.df, by = "label")
env.difference.df <- same.label.env.df[same.label.env.df$value.x != same.label.env.df$value.y, ]
# rename the columns for easier reading
colnames(env.difference.df) <- c("label", "dir1.value", "dir2.value")
# find environment values in dir2 but not in dir1
dir2.env.df <- dplyr::anti_join(second.env.df, first.env.df, by = "label")
# find environment values in dir1 but not in dir2
dir1.env.df <- dplyr::anti_join(first.env.df, second.env.df, by = "label")
return (list(env.difference.df, dir2.env.df, dir1.env.df))
}
#' clean.environment.df reads an environment data frame, remove
#' rows with name "script" and "scriptTimeStamp", then rename
#' some environment factors into helpful message for later printing
#' @param env.df environment data frame
#' @noRd
clean.environment.df <- function(env.df) {
# remove script and script timestamp rows
env.df <- env.df[env.df$label != "script", ]
env.df <- env.df[env.df$label != "scriptTimeStamp", ]
# rename some of the environment factors
env.df[env.df$label == "operatingSystem", "label"] <- "operating system"
env.df[env.df$label == "langVersion", "label"] <- "language version"
env.df[env.df$label == "workingDirectory", "label"] <- "working directory"
env.df[env.df$label == "provDirectory", "label"] <- "provenance directory"
env.df[env.df$label == "provTimestamp", "label"] <- "provenance collection time"
env.df[env.df$label == "hashAlgorithm", "label"] <- "hash algorithm"
env.df[env.df$label == "totalElapsedTime", "label"] <- "total elapsed time"
return(env.df)
}
#' print.prov.tool.changes gets prov tool changes by calling
#' a helper method find.prov.tool.changes and prints out the result
#' @param first.tool.df first tool data frame
#' @param second.tool.df second tool data frame
#' @noRd
get.prov.tool.changes <- function (first.tool.df, second.tool.df) {
cat("\nPROVENANCE TOOL CHANGES: ")
tool.change.list <- find.prov.tool.changes(first.tool.df, second.tool.df)
# if the list returned is null, as.data.frame creates an empty data frame,
# so no need to handle null case here
tool.difference.df <- as.data.frame(tool.change.list[1])
dir2.tool.df <- as.data.frame(tool.change.list[2])
dir1.tool.df <- as.data.frame(tool.change.list[3])
cat("Tool differences: ")
# prints out the update
if(nrow(tool.difference.df) == 0){
cat("No differences have been detected")
}else{
for(i in 1:nrow(tool.difference.df)){
cat("Name:", tool.difference.df$tool.name[i])
cat("### dir1 tool version:", tool.difference.df$dir1.tool.version[i],
"; dir1 json version:", tool.difference.df$dir1.json.version[i])
cat("### dir2 tool version:", tool.difference.df$dir2.tool.version[i],
"; dir2 json version:", tool.difference.df$dir2.json.version[i])
}
}
# case: tool in dir2 but not in dir1 (for example one used rdt, the other used rdtLite)
if(nrow(dir2.tool.df) != 0){
cat("Tool in dir2 but not in dir1:\n")
print.data.frame(dir2.tool.df, row.names = FALSE)
}
# case: tool in dir1 but not in dir2
if(nrow(dir1.tool.df) != 0){
cat("Tool in dir1 but not in dir2:\n")
print.data.frame(dir1.tool.df, row.names = FALSE)
}
}
#' prov.tool.changes checks for changes in provenance tool:
#' currently rdt or rdtLite
#' @param first.tool.df first tool data frame
#' @param second.tool.df second tool data frame
#' @noRd
find.prov.tool.changes <- function (first.tool.df, second.tool.df) {
# case: input data frame(s) do(es) not exist, returns immediately
if (FALSE == check.df.existence("Provenance tool", first.tool.df, second.tool.df)){
return (NULL)
}
# rare case: no provenance tools are shown in the data frame, returns immediately
if (FALSE == check.df.empty("provenance tool", first.tool.df, second.tool.df)){
return(NULL)
}
# finds the tool and json versions in each provenance
same.tool.df <- dplyr::inner_join(first.tool.df, second.tool.df, by = "tool.name")
colnames(same.tool.df) <- c("tool.name", "dir1.tool.version", "dir1.json.version",
"dir2.tool.version", "dir2.json.version")
same.tool.df <- same.tool.df[ , c(1, 2, 4, 3, 5)] # swaps column for nicer output
# case: if there are no updates, returns an empty data frame
if (same.tool.df$dir1.tool.version == same.tool.df$dir2.tool.version
&& same.tool.df$dir1.json.version == same.tool.df$dir2.json.version){
same.tool.df <- data.frame()
}
# case: for future updates, show tools that are in dir2 but not in dir1
dir2.tool.df <- dplyr::anti_join(second.tool.df, first.tool.df, by = "tool.name")
# case: for future updates, show tools that are in dir1 but not in dir2
dir1.tool.df <- dplyr::anti_join(first.tool.df, second.tool.df, by = "tool.name")
return (list(same.tool.df, dir2.tool.df, dir1.tool.df))
}
#' print.script.changes finds differences between 2 R scripts
#' and calls other helper printing functions to display the changes
#' @param first.script.df first script data frame
#' @param second.script.df second script data frame
#' @param dir1 path of first provenance directory
#' @param dir2 path of second provenance directory
#' @noRd
get.script.changes <- function(first.script.df, second.script.df, dir1, dir2) {
cat("\nSCRIPT CHANGES: ")
# check the existence of the 2 data frames
if(FALSE == check.df.existence("Script", first.script.df, second.script.df)){
cat("NA")
return()
}
# rare case : no scripts are recorded in the data frame
if(FALSE == check.df.empty("script", first.script.df, second.script.df)){
cat("\nNA")
return()
}
script.change.list <- find.script.changes(first.script.df, second.script.df, dir1, dir2)
main.script.change.result <- as.double(script.change.list[1])
sourced.script.change.list <- script.change.list[[2]]
# prints out the result
get.main.script.change(main.script.change.result, first.script.df[1, ], second.script.df[1, ])
get.same.name.sourced.scripts(same.name.script.df = sourced.script.change.list[[1]])
get.renamed.sourced.scripts(renamed.script.df = sourced.script.change.list[[2]])
get.unmatched.sourced.scripts(status = "dir1", sourced.script.change.list[[3]])
get.unmatched.sourced.scripts(status = "dir2", sourced.script.change.list[[4]])
}
#' get.main.script.change prints out changes in main script
#' based on the status value returned by compare.main.script method.
#' The message should display the both name if the script got
#' renamed, whether the content of the script has changed, and the
#' timestamp for each script version
#' @param main.script.change.result given status value
#' @param first.main.script.df first data frame with only main script
#' @param second.main.script.df second data frame with only main script
#' @noRd
get.main.script.change <- function(main.script.change.result, first.main.script.df, second.main.script.df){
# extract the name of main scripts from a full path
first.main.script.df$script <- basename(first.main.script.df$script)
second.main.script.df$script <- basename(second.main.script.df$script)
renamed <- FALSE
msg = character()
# case: script got renamed
if(main.script.change.result == 1 || main.script.change.result == 2){
msg = c(msg, "Main script has different name")
msg = c(msg, paste("### dir1 main script name:", first.main.script.df$script))
msg = c(msg, paste("### dir2 main script name:", second.main.script.df$script))
renamed <- TRUE
}
# case: the content of script changed
if(main.script.change.result == 1 || main.script.change.result == 0){
# case: if script was not renamed, prints out the name of the script along with the message
if(FALSE == renamed){
msg = c(msg, paste ("The content of the main script", second.main.script.df$script, "has changed"))
}else{
msg = c(msg, "The content of the main script has changed")
}
msg = c(msg, "Run prov.diff.script to see the changes.")
}else{ # case: the content is not changed (value 2 or 3)
# case: if script was not renamed, prints out the name of the script along with the message
if(renamed){
msg = c(msg, "No change detected in the content of the main script")
}
else {
msg = c(msg, paste ("No change detected in the content of the main script", second.main.script.df$script))
}
}
msg = c(msg, paste("### dir1 main script", first.main.script.df$script, "was last modified at:", first.main.script.df$timestamp))
msg = c(msg, paste("### dir2 main script", second.main.script.df$script, "was last modified at:", second.main.script.df$timestamp))
writeLines (msg)
return (msg)
}
#' find.script.changes find changes in both main and sourced scripts.
#' The method calls other helper functions to get full path of each
#' script in the provenance directories, generate hash value for each
#' script, find changes in main and sourced scripts, then returns a list
#' containing comparison results of main script and sourced scripts.
#' @param first.script.df first data frame with main and sourced scripts
#' @param second.script.df second data frame with main and sourced scripts
#' @param dir1 path to first provenance directory
#' @param dir2 path to second provenance directory
#' @noRd
find.script.changes <- function(first.script.df, second.script.df, dir1, dir2) {
# get right paths for copied scripts located in the provenance folders
first.script.df <- get.copied.script.path(dir1, first.script.df)
second.script.df <- get.copied.script.path(dir2, second.script.df)
# If there are stored hash values, use those. Otherwise, generate hash value
# for each script to be sure the same algorithm is being used for both.
if (first.script.df$hash != "" || second.script.df$hash != "") {
first.script.df <- compute.script.hash.value(first.script.df)
second.script.df <- compute.script.hash.value(second.script.df)
}
#find script changes
main.script.result <- compare.main.script(first.script.df[1, ], second.script.df[1, ])
sourced.script.result.list <- compare.sourced.scripts(first.script.df[-1, ], second.script.df[-1, ])
return (list(main.script.result, sourced.script.result.list))
}
#' compare.main.script find changes in the content and name of main script
#' The function returns 4 values reprensenting 4 status:
#' 0 = different script, same name
#' 1 = different script, different name
#' 2 = same script, different name
#' 3 = same script, same name
#' @param first.main.script.df first data frame which contains only main script
#' @param second.main.script.df second data frame which contains only main script
#' @noRd
compare.main.script <- function(first.main.script.df, second.main.script.df) {
# extract the name of main scripts from a full path
first.main.script.df$script <- basename(first.main.script.df$script)
second.main.script.df$script <- basename(second.main.script.df$script)
if(first.main.script.df$hash != second.main.script.df$hash
&& first.main.script.df$script == second.main.script.df$script){
return (0)
}
if(first.main.script.df$hash != second.main.script.df$hash
&& first.main.script.df$script != second.main.script.df$script){
return (1)
}
if(first.main.script.df$hash == second.main.script.df$hash
&& first.main.script.df$script != second.main.script.df$script){
return (2)
}
if(first.main.script.df$hash == second.main.script.df$hash
&& first.main.script.df$script == second.main.script.df$script){
return (3)
}
}
#' compare.sourced.scripts detects changes in sourced script data frames.
#' The method returns a list of 4 data frames:
#' 1. data frame containing scripts with same name
#' 2. data frame containing renamed scripts with same hash values
#' 3. data frame containing unmatched scripts in the first prov version
#' 4. data frame containing unmatched scripts in the second prov version
#' Note: this method replaces full script path with script name. In other words,
#' returned data frames now contain only script name, not full script path
#' @param first.sourced.script.df first data frame containing only sourced scripts
#' @param second.sourced.script.df second data frame containing only sourced scripts
#' @noRd
compare.sourced.scripts <- function(first.sourced.script.df, second.sourced.script.df) {
# extract the names of each script from their full path
first.sourced.script.df$script <- basename(first.sourced.script.df$script)
second.sourced.script.df$script <- basename(second.sourced.script.df$script)
# case: no sourced scripts were used in both prov versions
if(nrow(first.sourced.script.df) == 0 && nrow(second.sourced.script.df) == 0){
return (list(data.frame(), data.frame(), data.frame(), data.frame()))
}
# case: scripts with same name (same or different hash values)
same.name.script.df <- dplyr::inner_join(first.sourced.script.df, second.sourced.script.df, by = "script")
colnames(same.name.script.df) <- c("script", "dir1.timestamp", "dir1.hashValue", "dir2.timestamp", "dir2.hashValue")
# case: scripts with different name but with same hash values (scripts got renamed)
first.different.name.script.df <- dplyr::anti_join(first.sourced.script.df, second.sourced.script.df, by = "script")
second.different.name.script.df <- dplyr::anti_join(second.sourced.script.df, first.sourced.script.df, by = "script")
renamed.script.df <- dplyr::inner_join(first.different.name.script.df, second.different.name.script.df, by = "hashValue")
colnames(renamed.script.df) <- c("dir1.script", "dir1.timestamp", "hashValue", "dir2.script", "dir2.timestamp")
# case: scripts with different name and different hash values
first.unmatched.script.df <- dplyr::anti_join(first.different.name.script.df, second.different.name.script.df, by = "hashValue")
second.unmatched.script.df <- dplyr::anti_join(second.different.name.script.df, first.different.name.script.df, by = "hashValue")
return(list(same.name.script.df, renamed.script.df, first.unmatched.script.df, second.unmatched.script.df))
}
#' get.same.name.sourced.scripts takes in a script data frame
#' with same names and reports to users if the scripts has changed
#' with the corresponding timestamp
#' @param same.name.script.df a same-name script data frame
#' @noRd
get.same.name.sourced.scripts <- function(same.name.script.df) {
if(FALSE == is.valid.script.df("same-name scripts", same.name.script.df)) {
cat ("NA")
return ("NA")
}
msg = character()
# case: data frame must be non-empty
if(nrow(same.name.script.df) != 0) {
# extract rows with different hash values
modified.script.df <- dplyr::filter(same.name.script.df, same.name.script.df$dir1.hashValue != same.name.script.df$dir2.hashValue)
if(nrow(modified.script.df) != 0){
for(i in 1:nrow(modified.script.df)){
msg = c(msg, paste("Sourced script", modified.script.df$script[i], "has changed"))
msg = c(msg, "Run prov.diff.script to see the changes.")
msg = c(msg, paste("### dir1", modified.script.df$script[i], "was last modified at:", modified.script.df$dir1.timestamp[i]))
msg = c(msg, paste("### dir2", modified.script.df$script[i], "was last modified at:", modified.script.df$dir2.timestamp[i]))
}
}
# extract rows with same hash values
identical.script.df <- dplyr::filter(same.name.script.df, same.name.script.df$dir1.hashValue == same.name.script.df$dir2.hashValue)
if(nrow(identical.script.df) != 0){
for(i in 1:nrow(identical.script.df)){
msg = c(msg, paste("No change detected in sourced script", identical.script.df$script[i]))
}
}
writeLines (msg)
}
return (msg)
}
#' get.renamed.sourced.scripts takes in a data frame
#' of renamed scripts and reports changes to users
#' with the corresponding timestamp
#' @param renamed.script.df a data frame of renamed scripts
#' @noRd
get.renamed.sourced.scripts <- function(renamed.script.df) {
if(FALSE == is.valid.script.df("renamed scripts", renamed.script.df)) {
cat ("NA")
return("NA")
}
# case : data frame must be non-empty
changes <- character(0)
if(nrow(renamed.script.df) != 0) {
for(i in 1:nrow(renamed.script.df)) {
changes <- c(changes, "Sourced script has same content but different names:")
changes <- c(changes, paste("### dir1 sourced script name:", renamed.script.df$dir1.script[i]))
changes <- c(changes, paste("### dir2 sourced script name:", renamed.script.df$dir2.script[i]))
changes <- c(changes, paste("###", renamed.script.df$dir1.script[i], "was last modified at:", renamed.script.df$dir1.timestamp[i]))
changes <- c(changes, paste("###", renamed.script.df$dir2.script[i], "was last modified at:", renamed.script.df$dir2.timestamp[i]))
changes <- c (changes, "")
}
writeLines(changes)
}
return (changes)
}
#' get.unmatched.sourced.scripts takes in a data frame
#' of unmatched scripts, reports changes to users based on
#' the given status (either dir1 or dir2)
#' @param status dir1 or dir2
#' @param unmatched.script.df data frame of unmatched scripts
#' @noRd
get.unmatched.sourced.scripts <- function(status, unmatched.script.df) {
if(FALSE == is.valid.script.df(aspect = paste(status, "unmatched scripts"), script.df = unmatched.script.df)) {
cat ("NA")
return()
}
# case: data frame must be non-empty
msg <- character()
if(nrow(unmatched.script.df) != 0) {
if(status == "dir1") {
msg <- c(msg, "Sourced scripts in dir1 but not in dir2:")
}else if (status == "dir2"){
msg <- c(msg, "Sourced scripts in dir2 but not in dir1:")
}
for(i in 1:nrow(unmatched.script.df)) {
msg <- c(msg, paste ("### ", unmatched.script.df$script[i], ", which was last modified at: ", unmatched.script.df$timestamp[i], sep = ""))
}
writeLines (msg)
}
return (msg)
}
#' is.valid.script.df is a helper function for script printing functions.
#' This checks the existence and type of the given script data frame
#' @param aspect what the script data frame is about
#' @param script.df the script data frame
#' @noRd
is.valid.script.df <- function(aspect, script.df) {
if(is.null(script.df)){
warning(paste("data frame of", aspect, "is NULL\n"))
return (FALSE)
}
if(FALSE == is.data.frame(script.df)){
warning(paste("argument is not a data frame, aspect = ", aspect, "\n", sep = ""))
return (FALSE)
}
return (TRUE)
}
#' compute.script.hash.value generates hash value for each script
#' based on their path in the provenance directory and store these
#' values in a new column in the given data frame
#' @param script.df data frame with scripts
#' @noRd
compute.script.hash.value <- function(script.df) {
hash.values.vector <- sapply(script.df$script, FUN = function(X){
digest::digest(file = X, algo = "md5")
})
script.df$hash <- hash.values.vector
return (script.df)
}
#' get.copied.script.path takes in the path of the provenance directory and
#' a data frame about the locations of the original scripts, then returns
#' a data frame with exact locations of the scripts in the provenance directory,
#' while preserving all columns in the original data frame
#' @param prov.dir provenance directory
#' @param origin.script.df original script data frame
#' @noRd
get.copied.script.path <- function(prov.dir, origin.script.df) {
# extract script names
origin.script.df$script <- basename(origin.script.df$script)
origin.script.df$script <- sapply(origin.script.df$script, insert.path <- function(script.name){
script.name <- paste(prov.dir, "/scripts/", script.name, sep = "")
})
return (origin.script.df)
}
#' get.input.files.changes checks if input files in two data provenance
#' collections are different based on their hash values.
#' NOTE: this function currently does not work for URL as input.
#' @param input.df1 data frame of input files in first data provenance collection
#' @param input.df2 data frame of input files in second data provenance collection
#' @noRd
get.input.files.changes <- function (input.df1, input.df2) {
# QUESTION: are inputs in sourced scripts recorded?
if(FALSE == check.df.existence("Input file", input.df1, input.df2)) {
return ("")
}
cat("\n\nINPUT FILE CHANGES:\n")
# process input files (not URL)
input.files.df1 <- input.df1[input.df1$type == "File", ]
input.files.df2 <- input.df2[input.df2$type == "File", ]
empty <- FALSE
if(nrow(input.files.df1) == 0) {
cat("No input files were found in dir 1\n")
empty <- TRUE
}
else {
get.input.file.changes(input.files.df1, "1")
}
if(nrow(input.files.df2) == 0) {
cat("No input files were found in dir 2\n")
empty <- TRUE
}
else {
get.input.file.changes(input.files.df2, "2")
}
if(empty) return()
# if reached here, both data frames must have some input files
# compare files with same name
same.name.files.df <- dplyr::inner_join(input.files.df1, input.files.df2, by = "name")
compare.input.files.same.name(same.name.files.df)
# compare files with different name
different.name.files.df1 <- dplyr::anti_join(input.files.df1, input.files.df2, by = "name")
different.name.files.df2 <- dplyr::anti_join(input.files.df2, input.files.df1, by = "name")
compare.input.files.different.name(different.name.files.df1, different.name.files.df2)
}
#' compare.input.files.same.name takes in a data frame of input files
#' of dir1 and dir2 that have the same name. The function then compares
#' the content of these files to see if they have been modified. If yes,
#' corresponding modification times are printed.
#' @param same.name.files.df data frame of input files with same names
#' @noRd
compare.input.files.same.name <- function (same.name.files.df) {
if(FALSE == is.null(same.name.files.df) && nrow(same.name.files.df) != 0) {
# case: same hash value, thus no change detected
same.hash.df <- dplyr::filter(same.name.files.df, same.name.files.df$hash.x == same.name.files.df$hash.y)
if (nrow(same.hash.df) > 0) {
for(i in 1:nrow(same.hash.df)) {
cat("No change detected in the input file", same.hash.df$name[i])
}
}
# case: different hash value, thus content changed
different.hash.df <- dplyr::filter(same.name.files.df, same.name.files.df$hash.x != same.name.files.df$hash.y)
if (nrow(different.hash.df) > 0) {
for(i in 1:nrow(different.hash.df)) {
row <- different.hash.df[i, ]
cat("\nThe content of the input file", row$name, "has changed")
cat("### dir1", row$name, "was last modified at:", row$timestamp.x)
cat("### dir2", row$name, "was last modified at:", row$timestamp.y)
}
}
}
}
#' compare.input.files.different.name takes in 2 data frames of input files:
#' one containing files with names in dir1 but not in dir2, the other containing
#' files with names in dir2 but not in dir1. The function then compares
#' hash values of these files to see if there are some files that are the same but
#' have been renamed. As for files with different names and different hash values,
#' the function simply returns information of the files to user.
#' @param different.name.files.df1 data frame of input files whose names are exclusive to dir1
#' @param different.name.files.df2 data frame of input files whose names are exclusive to dir2
#' @noRd
compare.input.files.different.name <- function(different.name.files.df1, different.name.files.df2) {
# case: files with same content but different name
same.hash.df <- dplyr::inner_join(different.name.files.df1, different.name.files.df2, by = "hash")
if(FALSE == is.null(same.hash.df) && nrow(same.hash.df) != 0) {
for(i in 1:nrow(same.hash.df)) {
cat("Content of two input files", same.hash.df$name.x[i], "(dir1) and", same.hash.df$name.y[i], "(dir 2) is the same")
}
}
# case: files with different hash values and names (dir1)
exclusive.files1 <- dplyr::anti_join(different.name.files.df1, different.name.files.df2, by = "hash")
if(FALSE == is.null(exclusive.files1) && nrow(exclusive.files1) != 0) {
cat("\nInput files in dir1 but not in dir2:")
for(i in 1:nrow(exclusive.files1)) {
cat("### ", exclusive.files1$name[i], ", which was last modified at ", exclusive.files1$timestamp[i], sep = "")
}
}
# case: files with different hash values and names (dir2)
exclusive.files2 <- dplyr::anti_join(different.name.files.df2, different.name.files.df1, by = "hash")
if(FALSE == is.null(exclusive.files2) && nrow(exclusive.files2) != 0) {
cat("\nInput files in dir2 but not in dir1:")
for(i in 1:nrow(exclusive.files2)) {
cat("### ", exclusive.files2$name[i], ", which was last modified at ", exclusive.files2$timestamp[i], sep = "")
}
}
}
#' get.input.file.changes prints out all input files recorded in
#' a given data frame, corresponding to a folder/directory number. If the
#' data frame is empty, a helpful message is returned to user.
#' @param input.df data frame of input files
#' @param dir current working provenance directory
#' @noRd
get.input.file.changes <- function(input.df, dir) {
cat("List of input files in dir", dir, ":\n", sep = "")
for(i in 1:nrow(input.df)) {
row <- input.df[i, ]
cat("### Name:", row$name, "\n")
cat("### Last modification time:", row$timestamp, "\n")
cat("### Location:", row$location, "\n\n")
}
}
#' check.dir.existence checks if two given directories exists
#' and stops the program when the directories are non-existent
#' @param dir1 the first directory
#' @param dir2 the second directory
#' @noRd
check.dir.existence <- function (dir1, dir2) {
error.message <- ""
if (!dir.exists(dir1)) {
error.message <- paste(dir1, "directory not found\n")
}
if (!dir.exists(dir2)) {
error.message <- paste(error.message, dir2, " directory not found\n", sep = "")
}
if (error.message != "") {
stop(error.message)
}
return (error.message)
}
#' get.prov.info.object accesses the JSON file from the given directory,
#' and returns a ProvInfo object from provParseR
#' @param directory the provenance directory
#' @return ProvInfo object
#' @noRd
get.prov.info.object <- function (directory) {
# gets the path of two json files
json.file <- paste(directory, "/prov.json", sep = "")
# case: json file does not exist
if(!file.exists(json.file)){
stop(paste("prov.json file in ", directory, "is not found\n"))
}
# returns the ProvInfo object returned by provParseR
return (provParseR::prov.parse(json.file))
}
#' check.df.existence checks if two given data frames are not null.
#' If one of them are null, outputs a warning, returns false to
#' stop the caller of given aspect and continue the program.
#' Otherwise, returns true
#' @param aspect overview of what the data frames are about
#' @param df1 first data frame
#' @param df2 second data frame
#' @noRd
check.df.existence <- function (aspect, df1, df2) {
if(is.null(df1) || is.null(df2)){
warning (paste(aspect, "data frames returned by provParseR is NULL\n"))
return (FALSE)
}
return (TRUE)
}
#' check.df.empty checks if two given data frames are empty.
#' If one of them is empty, outputs a warning, returns false to
#' stop the caller of given aspect and continue the program.
#' Otherwise, returns true
#' @param aspect overview of what the data frames are about
#' @param df1 first data frame
#' @param df2 second data frame
#' @noRd
check.df.empty <- function (aspect, df1, df2) {
if(nrow(df1) == 0 || nrow(df2) == 0){
warning (paste("no", aspect, "was recorded in data frame returned by provParseR\n"))
return (FALSE)
}
return (TRUE)
}
#####################################################################################
#' IN DEVELOPMENT - PROCEDURE NODES
#' find.changes.proc.nodes inspects procedure nodes of both provenance
#' collections and finds the first point in which the procedure node
#' data frames start diverging.
#' @param prov.info1 prov info object of dir1
#' @param prov.info2 prov info object of dir2
#' @noRd
find.changes.proc.nodes <- function (prov.info1, prov.info2) {
# get the corresponding procedure node data frames
proc.node.df1 <- provParseR::get.proc.nodes(prov.info1)
proc.node.df2 <- provParseR::get.proc.nodes(prov.info2)
# filter out Start and Finish nodes, only consider Operation nodes
proc.node.df1 <- dplyr::filter(proc.node.df1, proc.node.df1$type == "Operation")
proc.node.df2 <- dplyr::filter(proc.node.df2, proc.node.df2$type == "Operation")
# get the script data frames
scripts.df1 <- provParseR::get.saved.scripts(prov.info1)
scripts.df2 <- provParseR::get.saved.scripts(prov.info2)
# check existence and emptiness status of the data frames
if(FALSE == check.df.existence("Procedure node", proc.node.df1, proc.node.df2)
|| FALSE == check.df.empty("procedure node", proc.node.df1, proc.node.df2)
|| FALSE == check.df.existence("Saved script", scripts.df1, scripts.df2)
|| FALSE == check.df.empty("saved script", scripts.df1, scripts.df2)) {
return ()
}
# for each script in each provenance folder, get its array of lines
all.scripts.array1 <- get.array.of.arrays.of.lines(scripts.df1)
all.scripts.array2 <- get.array.of.arrays.of.lines(scripts.df2)
# loop through procedure nodes with the shorter length of the two data frames
for(i in 1:min(nrow(proc.node.df1), nrow(proc.node.df2))){
proc.node.info1 <- proc.node.df1[i, ]
proc.node.info2 <- proc.node.df2[i, ]
# get full info for the current proc nodes
detailed.info1 <- get.proc.node.full.info(proc.node.info1, all.scripts.array1)
detailed.info2 <- get.proc.node.full.info(proc.node.info2, all.scripts.array2)
# compare two pieces of information
if(identical(detailed.info1, detailed.info2) == FALSE){
cat("First procedure node differences found:")
# helpful output here (subject to change)
cat("### Line", proc.node.info1$startLine, "in script", basename(scripts.df1$script[proc.node.info1$scriptNum]))
cat("### versus line", proc.node.info2$startLine, "in script", basename(scripts.df2$script[proc.node.info2$scriptNum]))
return()
}
}
# lines are identical so far. Compare lengths
if(nrow(proc.node.df1) == nrow(proc.node.df2)) {
cat("All procedure nodes of dir1 and dir2 are identical")
return()
}
# lengths are different
cat("First procedure node differences found:")
if(nrow(proc.node.df1) > nrow(proc.node.df2)) {
temp.proc <- proc.node.df1[nrow(proc.node.df2) + 1, ]
temp.script <- scripts.df1
}else{
temp.proc <- proc.node.df2[nrow(proc.node.df1) + 1, ]
temp.script <- scripts.df2
}
cat("### Line", temp.proc$startLine, "in script", basename(temp.script$script[temp.proc$scriptNum]))
return()
}
#' get.proc.node.full.info gets the detailed lines about a given procedure node.
#' If there are many lines within 1 procedure node, all lines will be collapsed into
#' one line. Returned character does not contain any white spaces
#' @param proc.node.info current procedure node extracted from the proc node data frame
#' @param all.scripts.full.line an array of arrays of lines of all scripts in a data provenance collection
#' @noRd
get.proc.node.full.info <- function(proc.node.info, all.scripts.full.line) {
current.script.line.array <- all.scripts.full.line[[proc.node.info$scriptNum]]
if(proc.node.info$startLine - proc.node.info$endLine == 0) {
full.line <- current.script.line.array[proc.node.info$startLine]
}else{
full.line <- current.script.line.array[proc.node.info$startLine:proc.node.info$endLine]
paste(full.line, sep = "", collapse = "")
}
return (stringr::str_replace_all(full.line, pattern = " ", replacement = ""))
}
#' get.array.of.arrays.of.lines returns an array of arrays of lines for each script
#' in the given script data frame
#' @param scripts.df script data frame
#' @noRd
get.array.of.arrays.of.lines <- function(scripts.df) {
array.of.arrays <- lapply(scripts.df$script, function(script) {
file <- file(script, "r")
lines <- readLines(file, warn = FALSE)
close(file)
return(lines)
})
return(array.of.arrays)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.