# Functions relating to keeping up an inventory of files that exist on the KNB
# and may or may not be copied to another computer and untarred
#' Create an empty inventory data.frame
#'
#' @return (data.frame) An empty data.frame.
#'
#' @noRd
inv_init <- function() {
inventory <- data.frame(stringsAsFactors = FALSE)
inventory
}
#' Load files into the inventory from a text file
#'
#' Load files into the inventory from a text file.
#'
#' Files should be the output of the command:
#'
#' you@server:/path/to/acadis$ find . -type f
#'
#' @param path (character) Path to a file containing a file listing.
#' @param inventory (character) A data.frame.
#' @param filter (logical) Whether or not to filter out versioned datasets.
#'
#' @return (data.frame) An inventory.
#'
#' @noRd
inv_load_files <- function(inventory, path, filter=TRUE) {
stopifnot(file.exists(path))
stopifnot("inventory" %in% ls(),
is.data.frame(inventory))
# Read the filenames from disk
files <- read.delim(path,
col.names = c("file"),
header = FALSE,
stringsAsFactors = FALSE)
stopifnot(is.data.frame(files))
# Filter out files not under with 'acadis-field-projects' or 'acadis-gateway'
# subfolders
size_before <- nrow(files)
files <- files[stringi::stri_startswith_fixed(files$file, "./acadis-field-projects/") |
stringi::stri_startswith_fixed(files$file, "./acadis-gateway/"), "file", drop = FALSE]
size_diff <- size_before - nrow(files)
if (size_diff > 0) { cat("Removed", size_diff, "file(s) that weren't inside acadis-gateway or acadis-field-projects subfolders.\n") }
# Filter out versioned datasets
if (filter) {
size_before <- nrow(files)
files <- files[grep("v_\\d\\.\\d", files$file, invert = TRUE), "file", drop = FALSE]
size_diff <- size_before - nrow(files)
if (size_diff > 0) { cat("Removed", size_diff, "file(s) that were part of versioned datasets.\n") }
}
# If inventory is empty, just make the inventory the same as filenames
if (nrow(inventory) == 0) {
return(files)
}
# Only append rows with new filenames
#
# Merging algorithm:
# Add colums we need to `filenames`
# Remove rows from `filenames` that exist in `inventory`
# Merge the two
# Make `filenames` the same shape as `inventory` by appending columns
for (col_name in names(inventory)) {
if (col_name %in% names(files)) { next }
files[,col_name] <- NA
}
stopifnot(identical(names(inventory), names(files)))
# Remove intersections if we had any
inter_vals <- intersect(inventory$file, files$file)
if (length(inter_vals) > 0 ) {
files <- subset(files, match(file, inter_vals, nomatch = 0) == 0)
}
inventory <- rbind(inventory,
files)
inventory
}
#' Load file sizes into an inventory from a text file
#'
#' Removes the column 'size_bytes' from inventory before doing a left join.
#'
#' @param path (character) Path to a file containing sizes.
#' @param inventory (data.frame) A data.frame.
#'
#' @return (data.frame) An inventory.
#'
#' @noRd
inv_load_sizes <- function(inventory, path) {
stopifnot(file.exists(path))
stopifnot("inventory" %in% ls(),
is.data.frame(inventory),
"file" %in% names(inventory))
# Read the sizes from disk
sizes <- read.delim(path,
col.names = c("size_bytes", "file"),
stringsAsFactors = FALSE,
header = FALSE)
stopifnot(is.data.frame(sizes))
if (nrow(inventory) != nrow(sizes)) {
warning(paste("Inventory and incoming 'sizes' data.frame not of the same number of rows.", nrow(inventory), "vs", nrow(sizes)))
}
# Join the sizes onto existing filenames in the inventory
# First drop the existing sizes
inventory <- inventory[,!(names(inventory) %in% "size_bytes"), drop = FALSE]
inventory <- dplyr::left_join(inventory, sizes, by = "file")
# Check the result
if (any(is.na(inventory$file))) { message("Some values in the 'filename' column were NA.")}
if (any(is.na(inventory$size_bytes))) { message("Some values in the 'size_bytes' column were NA.")}
inventory
}
#' Load checksums into the inventory file from a text file
#'
#' This function removes the column 'checksum_sha256' from inventory before doing a
#' left join.
#'
#' @param path (character) Path to a file containing sizes.
#' @param inventory (data.frame) An inventory.
#'
#' @return (data.frame) An inventory.
#'
#' @noRd
inv_load_checksums <- function(inventory, path) {
stopifnot(file.exists(path))
stopifnot("inventory" %in% ls(),
is.data.frame(inventory),
"file" %in% names(inventory))
# Convert the text file to a TSV before reading and joining
in_file <- readLines(path)
in_file_withtabs <- gsub(" ", "\t", in_file)
out_file <- tempfile()
writeLines(in_file_withtabs, out_file)
# Read in the file we made above
checksums <- read.delim(out_file,
header = FALSE,
col.names = c("checksum_sha256", "file"),
sep = "\t",
stringsAsFactors = FALSE)
stopifnot(is.data.frame(checksums))
if (nrow(inventory) != nrow(checksums)) {
warning(paste("Inventory and incoming 'checksums' data.frame not of the same number of rows.", nrow(inventory), "vs", nrow(checksums)))
}
# Join the checksums onto existing filenames in the inventory
# First drop the existing checksums
inventory <- inventory[,!(names(inventory) %in% "checksum_sha256"), drop = FALSE]
inventory <- dplyr::left_join(inventory, checksums, by = "file")
# Check the result
if (any(is.na(inventory$file))) { message("Some values in the 'file' column were NA.")}
if (any(is.na(inventory$checksum_sha256))) { message("Some values in the 'checksum_sha256' column were NA.")}
inventory
}
#' Load DOIs from a text file into the inventory
#'
#' Load DOIs from a text file into the inventory.
#'
#' @param path (character) Location of a text file with DOIs and file paths.
#' @param inventory (data.frame) An inventory.
#'
#' @return (data.frame) The modified inventory.
#'
#' @noRd
inv_load_dois <- function(inventory, path) {
stopifnot(file.exists(path))
stopifnot(is.data.frame(inventory),
"file" %in% names(inventory))
dois <- read.delim(path,
header = FALSE,
col.names = c("file", "pid"),
stringsAsFactors = FALSE)
stopifnot(is.data.frame(dois),
nrow(dois) > 0,
all(is.character(dois$file)),
all(is.character(dois$pid)))
# Join the identifiers onto existing filenames in the inventory
# First drop the existing identifiers
inventory <- inventory[,!(names(inventory) %in% "pid"), drop = FALSE]
inventory <- dplyr::left_join(inventory, dois, by = "file")
inventory
}
#' Load identifiers into the inventory file(s) from a text file
#'
#' This function removes the column 'identifier' from inventory before doing a
#' left join.
#'
#' @param paths (character) Path(s) to files containing identifiers.
#' @param inventory (data.frame) An inventory.
#'
#' @return (data.frame) An inventory.
#'
#' @noRd
inv_load_identifiers <- function(inventory, paths) {
stopifnot(file.exists(path))
stopifnot(is.data.frame(inventory),
"file" %in% names(inventory))
identifiers <- data.frame()
for (path in paths) {
# Read the identifiers from disk
filename_identifiers <- read.csv(path,
col.names = c("file", "identifier"),
header = TRUE,
stringsAsFactors = FALSE)
# rbind them
identifiers <- rbind(identifiers,
filename_identifiers)
}
stopifnot(is.data.frame(identifiers))
# Join the identifiers onto existing filenames in the inventory
# First drop the existing identifiers
inventory <- inventory[,!(names(inventory) %in% "identifier"), drop = FALSE]
inventory <- dplyr::left_join(inventory, identifiers, by = "file")
inventory
}
#' Add a set of extra columns to the inventory
#'
#' Add a set of extra columns to the inventory that are useful for working
#' with them.
#'
#' @param inventory (data.frame) An inventory.
#'
#' @return (data.frame) An inventory.
#'
#' @noRd
inv_add_extra_columns <- function(inventory) {
stopifnot(is(inventory, "data.frame"), "file" %in% names(inventory))
# Mark metadata files
cat("Adding 'is_metadata' column.\n")
inventory$is_metadata <- stringi::stri_endswith_fixed(inventory$file, "ISO.xml") |
stringi::stri_endswith_fixed(inventory$file, "iso19139.xml")
# Mark which root subfolder each file is under
# cat("Adding 'subfolder' column.\n")
# inventory$subfolder[stringi::stri_startswith_fixed(inventory$file, "./acadis-field-projects")] <- "FP"
# inventory$subfolder[stringi::stri_startswith_fixed(inventory$file, "./acadis-gateway")] <- "G"
# Add a column with filename and folder paths
cat("Adding 'folder' and 'filename' columns.\n")
inventory$folder <- unlist(
lapply(
stringr::str_split(inventory$file, "/"),
function(x) {
paste(x[1:(length(x) - 1)], collapse = "/")
}))
inventory$filename <- unlist(
lapply(
stringr::str_split(inventory$file, "/"),
function(x) {
x[length(x)]
}))
# Add depth column
cat("Adding 'depth' column.\n")
inventory$depth <- unlist(
lapply(
stringr::str_split(inventory$file, "/"), length))
# Add column for whether or not a file is an archive
# cat("Adding 'is_archive' column.\n")
# archive_regex <- ".*\\.(tar|gz|bz2|zip|tgz)$"
# inventory$is_archive <- grepl(archive_regex, inventory$filename)
# Add a column for the format ID
cat("Adding 'format_id' column.\n")
inventory$format_id <- guess_format_id(inventory$filename)
# Mark metadata as ISO, .xml files will just be application/xml after this
# but we can assume all of our metadata is ISO
inventory[inventory$is_metadata,"format_id"] <- "http://www.isotc211.org/2005/gmd"
# Add a column for packages
cat("Adding 'package' column.\n")
inventory$package <- NA
inventory <- as.data.frame(inventory) # Conver to data.frame in case it's a tbl_df
stopifnot("depth" %in% names(inventory))
# Traverse depth-first
for (d in seq(max(inventory$depth), min(inventory$depth))) {
cat(paste0(d, "."))
inv_atdepth_metadata <- which(inventory$depth == d & inventory$is_metadata == TRUE)
folders <- unique(inventory[inv_atdepth_metadata,"folder"])
for (folder in folders) {
# Find all files under this folder's hierarchy that haven't already been
# packaged
# Note we add a trailing slash to the folder name so that matches aren't
# made on partial strings, e.g. ./46.10/ vs ./46.100/
files_in_package <- stringi::stri_startswith_fixed(inventory$file, paste0(folder, "/")) &
is.na(inventory$package)
inventory[files_in_package,"package"] <- digest::digest(folder, algo="sha1")
}
}
cat("\n")
# Calculate statistics related to packages
cat("Adding 'package_nfiles', 'package_size_mb', and 'package_has_archives' columns.\n")
inventory <- dplyr::group_by(inventory, package)
inventory <- dplyr::mutate(inventory, package_nfiles = length(package))
as.data.frame(inventory)
}
#' Add a column for parent packages
#'
#' @param inventory (data.frame) An inventory.
#'
#' @return (data.frame) An inventory.
#'
#' @noRd
inv_add_parent_package_column <- function(inventory) {
stopifnot(all(c("file", "package", "is_metadata", "depth") %in% names(inventory)))
packages <- unique(inventory$package)
stopifnot(is.character(packages),
length(packages) > 0)
if (!("parent_package" %in% names(inventory))) {
inventory$parent_package <- ""
}
metadata_files <- inventory[inventory$is_metadata == TRUE,]
for (package in packages) {
if (is.na(package)) {
next
}
metadata_file <- metadata_files[metadata_files$package == package,]
stopifnot(nrow(metadata_file) == 1)
metadata_file_path <- metadata_file[,"file"]
metadata_file_depth <- metadata_file[,"depth"]
stopifnot(is.character(metadata_file_path),
is.numeric(metadata_file_depth))
path_parts <- stringr::str_split(metadata_file_path, "/")[[1]]
path_parts <- path_parts[1:(length(path_parts) - 1)]
# Three is the cutoff here because no package is higher than...
# ./acadis-X-X/some_folder/something (length four)
while (length(path_parts) > 2) {
joined_path <- paste(path_parts, collapse = "/")
# Note use of X == TRUE, this is to cast each part of the chained &
# statements to bools. This is probably a bug somewhere else in my code
# that I'm working around.
parent_files <- metadata_files[(stringi::stri_startswith_fixed(metadata_files$file, joined_path) == TRUE) &
((metadata_files$depth < metadata_file_depth) == TRUE),]
if (nrow(parent_files) == 1) {
inventory[!is.na(inventory$package) &
!is.na(inventory$parent_package) &
inventory$package == package,"parent_package"] <- parent_files[1,"package"]
# Halt execution of the while() loop
path_parts <- c()
} else if (nrow(parent_files) > 1) {
# warning(paste0("Number of direct parent datasets in package ", package, " was greater than one."))
path_parts <- path_parts[1:(length(path_parts) - 1)]
} else {
# Pluck one level off of path_parts
path_parts <- path_parts[1:(length(path_parts) - 1)]
}
}
}
inventory
}
#' Update an inventory with a new inventory
#'
#' @param inventory (data.frame) The old inventory.
#' @param new_state (data.frame) The new inventory.
#'
#' @noRd
inv_update <- function(inventory, new_state) {
stopifnot(is.data.frame(inventory),
is.data.frame(new_state),
nrow(inventory) > 0,
nrow(new_state) > 0)
stopifnot(all(c("file", "pid", "created") %in% names(inventory)))
# Temporary: Filter NA files from new_state
# Need to fix this elsewhere
new_state <- new_state[!is.na(new_state$file),]
for (row_num in seq_len(nrow(new_state))) {
file <- new_state[row_num,"file"]
pid <- new_state[row_num,"pid"]
created <- new_state[row_num,"created"]
stopifnot(is.character(file))
stopifnot(file %in% inventory$file)
stopifnot(is.character(pid))
stopifnot(is.logical(created))
inventory[which(inventory$file == file),"pid"] <- pid
inventory[which(inventory$file == file),"created"] <- created
}
inventory
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.