R/provenance_spy.R

Defines functions .spy.on.read_excel .spy.on.readShapePoly .spy.on.read.csv .spy.on.fread .replace.methods.with.spy .spy.on.methods .get.shp.metadata .format.metadata.date get.metadata write.meta.data write_provenance_section toml_quoting single_toml_item prov.output.file prov.input.file sys_calls_to_string clear.provenance get.output.files get.input.files .append.output.file .append.input.file

Documented in clear.provenance get.input.files get.metadata get.output.files .get.shp.metadata prov.input.file prov.output.file .replace.methods.with.spy write.meta.data

# Not exported. Tracks input files used for those functions we track
# Global state for our package, but not globally available
.input.files <- list()
.output.files <- list()


.append.input.file <- function(metadata) {
  artifact_hash <- openssl::sha256(paste(unlist(metadata), collapse = ""))
  # we need to re-assign this due to how the package value (which is not a global value) is managed by R
  package_name <- utils::packageName()
  if (!is.null(package_name)) {
    .input.files[[artifact_hash]] <- metadata
    utils::assignInNamespace(".input.files", .input.files, ns = package_name)
  } else {
    .input.files[[artifact_hash]] <<- metadata
  }
}


.append.output.file <- function(metadata) {
  artifact_hash <- openssl::sha256(paste(unlist(metadata), collapse = ""))
  # we need to re-assign this due to how the package value (which is not a global value) is managed by R
  package_name <- utils::packageName()
  if (!is.null(package_name)) {
    .output.files[[artifact_hash]] <- metadata
    utils::assignInNamespace(".output.files", .output.files, ns = package_name)
  } else {
    .output.files[[artifact_hash]] <<- metadata
  }
}


#' Get the list of files that were recorded as inputs.
#' @param clear True in order to clear the list of input files.
#' @return A list with one entry for each file.
#' @export
get.input.files <- function(clear = FALSE) {
  result <- .input.files
  if (clear) {
    utils::assignInNamespace(".input.files", list(), ns = utils::packageName())
  }
  return(result)
}


#' Get the list of files that were recorded as outputs
#' @param clear True in order to clear the list of output files.
#' @return A list with one entry for each file.
#' @export
get.output.files <- function(clear = FALSE) {
  result <- .output.files
  if (clear) {
    utils::assignInNamespace(".output.files", list(), ns = utils::packageName())
  }
  return(result)
}


#' Clear provenance data for a new run.
#' @export
clear.provenance <- function() {
  get.input.files(clear = TRUE)
  get.output.files(clear = TRUE)
}


sys_calls_to_string <- function(calls) {
  each_call <- lapply(calls, function(c) {
    string_vector <- as.character(c)
    if (length(string_vector) > 1) {
      arguments <- paste(string_vector[2:length(string_vector)], collapse = ",")
    } else {
      arguments <- ""
    }
    paste(string_vector[1], "(", arguments, ")", sep = "", collapse = "")
  })
  unlist(each_call)
}


#' Record that this code used this input file.
#' @param input The filename of the input file.
#' @param role A string to identify what this file does for this program.
#' @export
prov.input.file <- function(input, role) {
  md <- get.metadata(input)
  md$role <- paste(as.character(role), collapse = " ")
  calls <- sys.calls()
  md$stack <- sys_calls_to_string(calls[1:(length(calls) - 1)])
  if (file.exists(input)) {
    md$sha256 <- as.character(openssl::sha256(file(input)))
  }
  .append.input.file(md)
}


#' Record that this code used this output file.
#' @param output The filename of the output file.
#' @param role A string to identify what this file does for this program.
#' @export
prov.output.file <- function(output, role) {
  md <- get.metadata(output)
  md$role <- paste(as.character(role), collapse = " ")
  calls <- sys.calls()
  md$stack <- sys_calls_to_string(calls[1:(length(calls) - 1)])
  if (file.exists(output)) {
    md$sha256 <- as.character(openssl::sha256(file(output)))
  }
  .append.output.file(md)
}


single_toml_item <- function(value) {
  length(value) == 1 | "POSIXlt" %in% class(value)
}


toml_quoting <- function(value) {
  if (is.numeric(value)) {
    value
  } else if (is.list(value) & "POSIXlt" %in% class(value)) {
    strftime(value, format = "%Y-%m-%dT%H:%M:%S")
  } else {
    if (grepl("\n", value, fixed = TRUE)) {
      paste("'''", value, "'''", sep = "", collapse = "")
    } else {
      paste("'", value, "'", sep = "", collapse = "")
    }
  }
}


write_provenance_section <- function(section_name, artifact_list, connection) {
  writeLines(paste("[", section_name, "]", sep = "", collapse = ""), con = connection)
  for (name in names(artifact_list)) {
    input <- artifact_list[[name]]
    writeLines(paste("  [", section_name, ".", name, "]", sep = ""), con = connection)
    for (key in names(input)) {
      value <- input[[key]]
      if (single_toml_item(value)) {
        writeLines(paste(" ", key, "=", toml_quoting(value), sep = " "), con = connection)
      } else {
        writeLines(paste("  ", key, " = [", sep = ""), con = connection)
        vlen <- length(value)
        for (item_idx in 1:(vlen - 1)) {
          item <- value[item_idx]
          writeLines(paste("    ", toml_quoting(item), ",", sep = ""), con = connection)
        }
        item <- value[vlen]
        writeLines(paste("    ", toml_quoting(item), sep = ""), con = connection)
        writeLines("  ]", con = connection)
      }
    }
    writeLines("", con = connection)
  }
}


#' Write the list of input files to a file on disk.
#' @param path The file path to which to write the information.
#' @export
write.meta.data <- function(path) {
  if (is.character(path)) {
    file_connection <- file(path, open = "wt")
  } else if (is.integer(path)) {
    file_connection <- path
  } else {
    message(paste("Cannot write metadata to", path))
    return
  }
  app_meta <- list(runtime = list(
    commandline_arguments = commandArgs(),
    rversion = getRversion(),
    user = unname(Sys.info()["user"]),
    node = unname(Sys.info()["nodename"])
  ))
  write_provenance_section("application", app_meta, file_connection)
  write_provenance_section("inputs", get.input.files(), file_connection)
  write_provenance_section("outputs", get.output.files(), file_connection)
  writeLines("", con = file_connection)
  flush(file_connection)
  if (is.character(path)) {
    close(file_connection)
  }
}

#' Returns metadata for a file
#' @param path The path to the file from which to get file info.
#' @return a list of file info containing creation time, last modified,
#'     and its path.
get.metadata <- function(path) {
  path <- normalizePath(path)

  info <- file.info(path)
  result <- list(
    creation_time = .format.metadata.date(info$ctime),
    last_modified = .format.metadata.date(info$mtime),
    # TODO:
    # owner? info$uname exists but is NA because we're in a container
    # md5?
    path = path
  )

  # since we know there's only 1 input, unlist result. then get last element
  ext <- utils::tail(
    unlist(strsplit(path, ".", fixed = TRUE)),
    1)

  if (ext == "shp") {
    result$extra <- .get.shp.metadata(path)
  }

  return(result)
}


.format.metadata.date <- function(datelike_double) {
  as.POSIXlt(datelike_double)
}


#' Return extra metadata associated with .shp file
#'
#' @param path The path to the shapefile.
#' @return A list of properties of that data, taken from the file.
#'
#' A "shapefile" is actually a collection of files. At minimum .shp, .shpx, and .dbf
#'
#' In addition, there are at least 13 optional files that may be included.
#'
#' https://en.wikipedia.org/wiki/Shapefile#Overview
.get.shp.metadata <- function(path) {
  # NOTE: we're being lazy and just grabbing everything that has the same prefix.

  # split on "."; unlist the result into a vector; take all but the last element, re-join with "."
  base <- paste(utils::head(unlist(strsplit(path, ".", fixed = TRUE)), n = -1), collapse = ".")
  # list files in directory which start with the filename (without extension)
  related <- list.files(dirname(base), full.names = TRUE, pattern = basename(base))

  result <- list()
  for (rel in related) {
    info <- file.info(rel)
    result[[basename(rel)]] <- .format.metadata.date(info$mtime)
  }
  return(result)
}


# methods we hijack
# NOTE: the fullest solution would be to actually take over the namespaces
# using assignInNamespace. However this is a more complicated solution and may
# introduce other issues, and probably is irrelevant as I don't think anyone
# *ever* uses namespaced calls in the covariate code except where Mike adds them.
#
# Docs for hooks including order of operations, .onLoad, "onLoad" hooks, and "attach" hook
# https://stat.ethz.ch/R-manual/R-patched/library/base/html/userhooks.html
#
# attach vs onLoad events
# https://stackoverflow.com/a/56538266
#
# assigning to other package namespaces
# https://stackoverflow.com/a/58238931
.spy.on.methods <- function() {
  attached.packages <- .packages()

  pkg.watch <- list(
    "data.table" = "fread",
    "maptools" = "readShapePoly",
    "readxl" = "read_excel",
    "utils" = "read.csv"
  )
  # sf::st_read, raster::brick

  for (pkg in names(pkg.watch)) {
    methods <- pkg.watch[[pkg]]

    if (pkg %in% attached.packages) {
      # replace now
      .replace.methods.with.spy(methods)
    } else {
      # replace them right after they're loaded
      setHook(
        packageEvent(pkg, "attach"),
        # parameters included for reader comprehension - we don't need or use them
        function(pkg.name, pkg.path) {
          # WARNING: you cannot use `methods` here. The reason is the value is
          # mutated within this loop and R's lazy evaluation means that ALL
          # hook functions will end up having a `methods` value from the final
          # iteration of the loop
          .replace.methods.with.spy(pkg.watch[[pkg.name]])
        },
        action = "append"
      )
    }
  }
}

#' Replace methods in global namespace with the spying methods defined below
#' @param methods These are names of functions to replace.
.replace.methods.with.spy <- function(methods) {
  # NOTE: google searches reveal that questions and answers are sadly conflating some terms
  # as.environment("package:PKGNAME") returns a namespace of exported values
  # getNamespace("PKGNAME") returns a namespace of all package contents
  this.ns <- getNamespace(utils::packageName())
  for (method in methods) {
    spy.method <- get(sprintf(".spy.on.%s", method), envir = this.ns)
    assign(method, spy.method, envir = globalenv())
  }
}


.spy.on.fread <- function(input, ...) {
  # fread has complex arguments. Punt on callers not using the standard first argument "input"
  if (missing("input")) {
    return(data.table::fread(...))
  }

  tryCatch({
    md <- get.metadata(input)
    md$call <- "fread"
    .append.input.file(md)
  },
  error = function(e) {
    message(sprintf("Errored recording metadata for %s - YOU ARE LACKING PROVENANCE", input))
  }, finally = {
    result <- data.table::fread(input, ...)
  })
  return(result)
}

.spy.on.read.csv <- function(file, ...) {
  tryCatch({
    md <- get.metadata(file)
    md$call <- "read.csv"
    .append.input.file(md)
  },
  error = function(e) {
    message(sprintf("Errored recording metadata for %s - YOU ARE LACKING PROVENANCE", file))
  }, finally = {
    result <- utils::read.csv(file, ...)
  })
  return(result)
}

.spy.on.readShapePoly <- function(fn, ...) {
  tryCatch({
    md <- get.metadata(fn)
    md$call <- "readShapePoly"
    .append.input.file(md)
  },
  error = function(e) {
    message(sprintf("Errored recording metadata for %s - YOU ARE LACKING PROVENANCE", fn))
  }, finally = {
    return(maptools::readShapePoly(fn, ...))
  })
}

.spy.on.read_excel <- function(path, ...) {
  tryCatch({
    md <- get.metadata(path)
    md$call <- "read_excel"
    .append.input.file(md)
  },
  error = function(e) {
    message(sprintf("Errored recording metadata for %s - YOU ARE LACKING PROVENANCE", path))
  }, finally = {
    return(readxl::read_excel(path, ...))
  })
}
dd-harp/rampdata documentation built on Sept. 13, 2021, 9:32 p.m.