Nothing
#
# This work was created by participants in the DataONE project, and is
# jointly copyrighted by participating institutions in DataONE. For
# more information on DataONE, see our web site at http://dataone.org.
#
# Copyright 2011-2015
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#' @title A class representing a data package
#' @description The DataPackage class provides methods for adding and extracting
#' data objects from a data package. The contents of a data package
#' can include arbitrary types of objects, including data files, program code,
#' visualizations and images, animations, and any other type of file. The DataPackage class
#' stores the individual members of the data package along with key system-level metadata
#' about each object, including its size, checksum, identifier, and other key information
#' needed to effectively archive the members of the package. In addition, the
#' DataPackage class can include key provenance metadata about the relationships among
#' the objects in the data package. For example, the data package can document that one object
#' provides documentation for another (\code{cito:documents}), and that one object was
#' derived from another (\code{prov:wasDerivedFrom}) by executing a program that
#' used source data (\code{prov:used}) to create a derived data object
#' {\code{prov:wasGeneratedBy}}. These relationships are integral to the data package,
#' and can be visualized by programs that understand the ProvONE provenance
#' model (see \url{https://purl.dataone.org/provone-v1-dev}).
#'
#' The DataPackage class is an R representation of an underlying Open Archives
#' Initiative ORE model (Object Reuse and Exchange;
#' see \url{https://www.openarchives.org/ore/}), and follows the DataONE Data
#' Packaging model
#' (see \url{https://releases.dataone.org/online/api-documentation-v2.0.1/design/DataPackage.html}).
#' @include dmsg.R
#' @include DataObject.R
#' @include SystemMetadata.R
#' @import uuid
#' @rdname DataPackage-class
#' @aliases DataPackage-class
#' @slot relations A list containing provenance relationships of package objects
#' @slot objects A list containing identifiers for objects in the DataPackage
#' @slot sysmeta A SystemMetadata class instance describing the package
#' @slot externalIds A list containing identifiers for objects associated with the DataPackage
#' @slot resmapId A character string specifying the identifier for the package resource map.
#' This is assigned after a package is uploaded or downloaded from a repository.
#' @section Methods:
#' \itemize{
#' \item{\code{\link[=DataPackage-initialize]{initialize}}}{: Initialize a DataPackage object.}
#' \item{\code{\link{addAccessRule}}}{: Add access rules to DataObjects in a DataPackage.}
#' \item{\code{\link{addMember}}}{: Add a DataObject to a DataPackage.}
#' \item{\code{\link{clearAccessPolicy}}}{: Clear access policies for DataObjects in a DataPackage.}
#' \item{\code{\link{containsId}}}{: Returns true if the specified object is a member of the data package.}
#' \item{\code{\link{describeWorkflow}}}{: Add data derivation information to a DataPackage.}
#' \item{\code{\link{getData}}}{: Get the data content of a specified data object.}
#' \item{\code{\link{getSize}}}{: Get the Count of Objects in the DataPackage.}
#' \item{\code{\link{getIdentifiers}}}{: Get the Identifiers of DataPackage members.}
#' \item{\code{\link{getMember}}}{: Return the DataPackage Member by Identifier.}
#' \item{\code{\link{getRelationships}}}{: Retrieve relationships of data package objects.}
#' \item{\code{\link{getValue}}}{: Get values for selected DataPackage members.}
#' \item{\code{\link{hasAccessRule}}}{: Determine if access rules exists for DataObjects in a DataPackage.}
#' \item{\code{\link{insertRelationship}}}{: Insert relationships between objects in a DataPackage.}
#' \item{\code{\link{removeAccessRule}}}{: Remove an access rule from DataObject in a DataPackage.}
#' \item{\code{\link{removeMember}}}{: Remove the specified DataObject from a DataPackage.}
#' \item{\code{\link{removeRelationships}}}{: Remove relationships of objects in a DataPackage.}
#' \item{\code{\link{replaceMember}}}{: Replace the raw data or file associated with a DataObject.}
#' \item{\code{\link{selectMember}}}{: Select package members based on slot values.}
#' \item{\code{\link{serializePackage}}}{: Create an OAI-ORE resource map from the DataPackage.}
#' \item{\code{\link{serializeToBagIt}}}{: Serialize A DataPackage into a BagIt Archive File.}
#' \item{\code{\link{setPublicAccess}}}{: Set the access policy to readable by anyone for DataObject in a DataPackage.}
#' \item{\code{\link{setValue}}}{: Set values for selected DataPackage members}
#' \item{\code{\link{show}}}{: Print DataPackage information in a formatted view.}
#' \item{\code{\link{updateMetadata}}}{: Update selected elements of the XML content of a DataObject in a DataPackage}
#' \item{\code{\link{updateRelationships}}}{: Update package relationships by replacing an old identifier with a new one.}
#' }
#' @seealso \code{\link{datapack}}
#' @export
setClass("DataPackage", slots = c(
relations = "list",
objects = "list", # key=identifier, value=DataObject
sysmeta = "SystemMetadata", # system metadata about the package
externalIds = "list",
resmapId = "character" # resource map identifier(s)
)
)
###########################
## DataPackage constructors
###########################
## Create a DataPackage object
## @description The DataPackage() method is a shortcut to creating a DataPackage object, as this method does
## not allow specifying any options that the \code{\link[=initialize-DataPackage]{initialize}} method allows (using new("DataPackage"")
## @param ... (Not yet used)
## @export
#setGeneric("DataPackage", function(...) { standardGeneric("DataPackage")} )
#' Initialize a DataPackage object.
#' @rdname DataPackage-initialize
#' @aliases DataPackage-initialize
#' @param .Object The object being initialized
#' @param packageId The package id to assign to the package
#' @examples
#' # Create a DataPackage with undefined package id (to be set manually later)
#' pkg <- new("DataPackage")
#' # Alternatively, manually assign the package id when the DataPackage object is created
#' pkg <- new("DataPackage", "urn:uuid:4f953288-f593-49a1-adc2-5881f815e946")
#' @seealso \code{\link{DataPackage-class}}
setMethod("initialize", "DataPackage", function(.Object, packageId) {
dmsg("DataPackage-class.R initialize")
.Object@sysmeta <- new("SystemMetadata")
if (!missing("packageId")) {
## set the packageId and instantiate a new SystemMetadata
.Object@sysmeta@identifier <- packageId
}
.Object@relations = list()
.Object@relations[['updated']] <- FALSE
.Object@objects = list()
.Object@externalIds = list()
.Object@resmapId <- NA_character_
return(.Object)
})
#' @rdname getData
#' @param id Missing or character: if \code{'x'} is DataPackage, the identifier of the
#' package member to get data from
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do1 <- new("DataObject", id="id1", data, format="text/csv", user="smith", mnNodeId="urn:node:KNB")
#' dp <- addMember(dp, do1)
#' bytes <- getData(dp, "id1")
#' @export
setMethod("getData", signature("DataPackage"), function(x, id) {
databytes <- as.raw(NULL)
if (containsId(x, id)) {
do <- x@objects[[id]]
databytes <- getData(do)
return(databytes)
} else {
return(NULL)
}
})
#' Get the Count of Objects in the Package
#' @param x A DataPackage instance
#' @param ... (not yet used)
#' @return The number of object in the Package
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("getSize", function(x, ...) { standardGeneric("getSize")} )
#' @rdname getSize
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' getSize(dp)
#' @export
setMethod("getSize", "DataPackage", function(x) {
return(length(x@objects))
})
#setMethod("length", "DataPackage", function(x) {
# return(length(x@objects))
#})
#' Get the Identifiers of Package Members
#' @description The identifiers of the objects in the package are retrieved and returned as a list.
#' @param x A DataPackage instance
#' @param ... (not yet used)
#' @return A list of identifiers
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("getIdentifiers", function(x, ...) { standardGeneric("getIdentifiers")} )
#' @rdname getIdentifiers
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' getIdentifiers(dp)
#' @export
setMethod("getIdentifiers", "DataPackage", function(x) {
return(names(x@objects))
})
#' Add a DataObject to the DataPackage
#' @description The DataObject is added to the DataPackage.
#' @param x A DataPackage instance
#' @param do A DataObject instance
#' @param ... (Additional parameters)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("addData", function(x, do, ...) {
.Deprecated("addMember", "datapack")
standardGeneric("addData")
})
#' @rdname addData
#' @details The DataObject \code{"do"} is added to the DataPackage. If the optional \code{"mo"} parameter is specified, then it is
#' assumed that the DataObject \code{"mo"} is a metadata
#' object that describes the science object \code{"do"} that is being added. The \code{addData} function will add a relationship
#' to the DataPackage resource map that indicates that the metadata object describes the science object using the
#' Citation Typing Ontology (CITO).
#' Note: this method updates the passed-in DataPackage object.
#' \code{documents} and \code{isDocumentedBy} relationship.
#' @param mo A DataObject (containing metadata describing \code{"do"} ) to associate with the science object.
#' @return the updated DataPackage object
#' @examples
#' dpkg <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' metadata <- charToRaw("EML or other metadata document text goes here\n")
#' md <- new("DataObject", id="md1", dataobj=metadata, format="text/xml", user="smith",
#' mnNodeId="urn:node:KNB")
#' do <- new("DataObject", id="id1", dataobj=data, format="text/csv", user="smith",
#' mnNodeId="urn:node:KNB")
#' # Associate the metadata object with the science object. The 'mo' object will be added
#' # to the package automatically, since it hasn't been added yet.
#' # This method is now deprecated, so suppress warnings if desired.
#' suppressWarnings(dpkg <- addData(dpkg, do, md))
#' @export
setMethod("addData", signature("DataPackage", "DataObject"), function(x, do, mo=NA_character_) {
x@objects[[do@sysmeta@identifier]] <- do
# If a metadata object identifier is specified on the command line, then add the relationship to this package
# that associates this science object with the metadata object.
if (!missing(mo)) {
# CHeck that the metadata object has already been added to the DataPackage. If it has not
# been added, then add it now.
if (!containsId(x, getIdentifier(mo))) {
moId <- addMember(x, mo)
}
# Now add the CITO "documents" and "isDocumentedBy" relationships
x <- insertRelationship(x, getIdentifier(mo), getIdentifier(do))
}
return(x)
})
#' Add a DataObject to the DataPackage
#' @description The DataObject is added to the DataPackage.
#' @param x A DataPackage instance
#' @param ... (Additional parameters)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("addMember", function(x, ...) {
standardGeneric("addMember")
})
#' @rdname addMember
#' @details The DataObject \code{"do"} is added to the DataPackage. If the optional \code{"mo"} parameter is specified, then it is
#' assumed that the DataObject \code{"mo"} is a metadata
#' object that describes the science object \code{"do"} that is being added. The \code{addMember} function will add a relationship
#' to the DataPackage resource map that indicates that the metadata object describes the science object using the
#' Citation Typing Ontology (CITO).
#' Note: this method updates the passed-in DataPackage object.
#' \code{documents} and \code{isDocumentedBy} relationship.
#' @param do The DataObject to add.
#' @param mo A DataObject (containing metadata describing \code{"do"} ) to associate with the science object. If this DataObject
#' has already been added to the package, the argument can be a \code{"character"} containing the DataObject identifier.
#' @return the updated DataPackage object
#' @examples
#' dpkg <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' metadata <- charToRaw("EML or other metadata document text goes here\n")
#' md <- new("DataObject", id="md1", dataobj=metadata, format="text/xml", user="smith",
#' mnNodeId="urn:node:KNB")
#' do <- new("DataObject", id="id1", dataobj=data, format="text/csv", user="smith",
#' mnNodeId="urn:node:KNB")
#' # Associate the metadata object with the science object. The 'mo' object will be added
#' # to the package automatically, since it hasn't been added yet.
#' dpkg <- addMember(dpkg, do, md)
#' @export
setMethod("addMember", signature("DataPackage"), function(x, do, mo=NA_character_) {
# If only one 'do' is specified, make in into a list. If a list is already specified
# then it can be iterated over.
doList <- list()
# Special case, if the user passed in a single DataObject for sources or derivations,
# convert it to a list to facilitate easier processing in tests below.
if(inherits(do, "DataObject")) {
doList<- list(do)
} else {
doList <- do
}
if(!missing(mo)) {
if(inherits(mo, "character")) {
if (!containsId(x, mo)) {
msg <- sprintf("%s\nPlease specify argument \"mo\" as a \"DataObject\" or first add it to the package using \"addMember\"",
mo)
msg <- sprintf("%s\nThe metadata object with identifier %s has not been added to the package yet", mo)
stop(msg)
}
# Convert 'mo' to a DataObject
mo <- getMember(x, mo)
} else if (!inherits(mo, "DataObject")) {
stop(sprintf("Invalid type \"%s\" for argument \"mo\"\n", mo))
}
}
for (iObj in doList) {
x@objects[[getIdentifier(iObj)]] <- iObj
# If a metadata object identifier is specified on the command line, then add the relationship to this package
# that associates this science object with the metadata object.
if (!missing(mo)) {
# CHeck that the metadata object has already been added to the DataPackage. If it has not
# been added, then add it now.
if (!containsId(x, getIdentifier(mo))) {
x <- addMember(x, mo)
}
# Now add the CITO "documents" and "isDocumentedBy" relationships
x <- insertRelationship(x, getIdentifier(mo), getIdentifier(iObj))
}
}
# If the object's path was documented, add it to the resource map
if (!is.na(iObj@targetPath)){
x <- insertRelationship(x, getIdentifier(iObj), iObj@targetPath, provAtLocation)
}
return(x)
})
#' Record relationships of objects in a DataPackage
#' @description Record a relationship of the form "subject -> predicate -> object", as defined by the Resource Description Framework (RDF), i.e.
#' an RDF triple.
#' @details For use with DataONE, a best practice is to specify the subject and predicate as DataONE persistent identifiers
#' (https://mule1.dataone.org/ArchitectureDocs-current/design/PIDs.html). If the objects are not known to DataONE, then local identifiers can be
#' used, and these local identifiers may be promoted to DataONE PIDs when the package is uploaded to a DataONE member node.
#' The predicate is typically an RDF property (as a IRI) from a schema supported by DataONE, i.e. "http://www.w3.org/ns/prov#wasGeneratedBy"
#' If multiple values are specified for argument objectIDS, a relationship is created for each value in the list "objectIDs". IF a value
#' is not specified for subjectType or objectType, then NA is assigned. Note that if these relationships are fetched via the getRelationships()
#' function, and passed to the createFromTriples() function to initialize a ResourceMap object, the underlying redland package will assign
#' appropriate values for subjects and objects.
#' Note: This method updates the passed-in DataPackage object.
#' @param x A DataPackage object
#' @param subjectID The identifier of the subject of the relationship
#' @param objectIDs A list of identifiers of the object of the relationships (a relationship is recorded for each objectID)
#' @param predicate The IRI of the predicate of the relationship
#' @param ... (Additional parameters)
#' @return the updated DataPackage object
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("insertRelationship", function(x, ...) {
standardGeneric("insertRelationship")
})
#' @rdname insertRelationship
#' @param subjectType the type to assign the subject, values can be 'uri', 'blank'
#' @param objectTypes the types to assign the objects (cal be single value or list), each value can be 'uri', 'blank', or 'literal'
#' @param dataTypeURIs An RDF data type that specifies the type of the object
#' @export
#' @examples
#' dp <- new("DataPackage")
#' # Create a relationship
#' dp <- insertRelationship(dp, "/Users/smith/scripts/genFields.R",
#' "https://knb.ecoinformatics.org/knb/d1/mn/v1/object/doi:1234/_030MXTI009R00_20030812.40.1",
#' "http://www.w3.org/ns/prov#used")
#' # Create a relationshp with the subject as a blank node with an automatically assigned blank
#' # node id
#' dp <- insertRelationship(dp, subjectID=NA_character_, objectIDs="thing6",
#' predicate="http://www.myns.org/wasThing")
#' # Create a relationshp with the subject as a blank node with a user assigned blank node id
#' dp <- insertRelationship(dp, subjectID="urn:uuid:bc9e160e-ca21-47d5-871b-4a4820fe4451",
#' objectIDs="thing7", predicate="http://www.myns.org/hadThing")
#' # Create multiple relationships with the same subject, predicate, but different objects
#' dp <- insertRelationship(dp, subjectID="urn:uuid:95055dc1-b2a0-4a00-bdc2-05c16d048ca2",
#' objectIDs=c("thing4", "thing5"), predicate="http://www.myns.org/hadThing")
#' # Create multiple relationships with subject and object types specified
#' dp <- insertRelationship(dp, subjectID="orcid.org/0000-0002-2192-403X",
#' objectIDs="http://www.example.com/home", predicate="http://www.example.com/hadHome",
#' subjectType="uri", objectType="literal")
setMethod("insertRelationship", signature("DataPackage"),
function(x, subjectID, objectIDs, predicate=NA_character_,
subjectType=NA_character_, objectTypes=NA_character_, dataTypeURIs=NA_character_) {
# Argument has to be character
stopifnot(is.character(subjectID))
stopifnot(is.character(objectIDs))
# If a predicate wasn't provided, then insert the default relationship of
# subjectID -> documents -> objectID; objectID -> documentedBy -> subjectID
if (is.na(predicate)) {
x <- insertRelationship(x, subjectID, objectIDs, citoDocuments)
for (obj in objectIDs) {
x <- insertRelationship(x, obj, subjectID, citoIsDocumentedBy)
}
} else {
# Append new relationships to previously stored ones.
if (!(is.null(x@relations[["relations"]]))) {
relations <- x@relations[["relations"]]
} else {
relations <- data.frame()
}
# If the subjectID or objectIDs were not specified then the user is requesting that these be "anonymous"
# blank nodes, i.e. a blank node identifier is automatically assigned. Assign a uuid now vs having redland
# RDF package assign a node, so that we don't have to query redland for the id that it generated.
# The format of "_<unique identifier>" (no colon included) is used passed W3C RDF
# Validationa (https://www.w3.org/RDF/Validator/). Node ids starting with numeric characters or containing
# colons do not pass validation (note that blank node identifiers such as '_:b1' do not pass validattion).
if (is.na(subjectID)) {
subjectID <- sprintf("_%s", uuid::UUIDgenerate())
subjectType <- "blank"
}
# Add all triples to the data frame. If the length of objectTypes or dataTypeURIs is less
# that the length of objectIDs, then values will be set to NA
i <- 0
for (obj in objectIDs) {
i <- i + 1
# Generate a blank node identifier if id is not specified
if (is.na(obj)) {
obj <- sprintf("_%s", uuid::UUIDgenerate())
objectTypes[i] <- "blank"
}
# Check that the subjectType is a valid type for an RDF subject
if (!is.element(subjectType[i], c("uri", "blank", NA_character_))) {
stop(sprintf("Invalid subject type: %s\n", subjectType[i]))
}
# Check that the objectType is a valid type for an RDF object
if(!is.element(objectTypes[i], c("uri", "literal", "blank", NA_character_))) {
stop(sprintf("Invalid objct type: %s\n", objectTypes[i]))
}
newRels <- data.frame(subject=subjectID, predicate=predicate, object=obj,
subjectType=subjectType, objectType=objectTypes[i],
dataTypeURI=dataTypeURIs[i], row.names = NULL, stringsAsFactors = FALSE)
# Has a relation been added previously?
if (nrow(relations) == 0) {
relations <- newRels
} else {
relations <- rbind(relations, newRels)
}
}
x@relations[["relations"]] <- relations
}
# Remove duplicate relationships
relations <- x@relations[["relations"]]
x@relations[["relations"]] <- unique(relations)
# Set the relationships (resource map) to updated status.
x@relations[["updated"]] <- TRUE
return(x)
})
#' Remove relationships of objects in a DataPackage
#' @description Use this function to remove all or a subset of the relationships that have previously been added in a data package.
#' @details Remove a relationship of the form "subject -> predicate -> object", as defined by the Resource Description Framework (RDF), i.e.
#' an RDF triple. If neither subjectID nor predicate are provided, then all relationships are removed. If one or both
#' are provided, they are used to select matching triples to be removed.
#' Note: This method updates the passed-in DataPackage object.
#' @param x A DataPackage object
#' @param subjectID The identifier of the subject of the relationships to be removed
#' @param predicate The identifier of the predicate of the relationships to be removed
#' @param ... (Additional parameters)
#' @return the updated DataPackage object
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("removeRelationships", function(x, ...) {
standardGeneric("removeRelationships")
})
#' @rdname removeRelationships
#' @export
#' @examples
#' dp <- new("DataPackage")
#' # Create a relationship
#' dp <- insertRelationship(dp, "/Users/smith/scripts/genFields.R",
#' "https://knb.org/data_20030812.40.1",
#' "http://www.w3.org/ns/prov#used")
#' # Create a relationshp with the subject as a blank node with an automatically assigned blank
#' # node id
#' dp <- insertRelationship(dp, subjectID=NA_character_, objectIDs="thing6",
#' predicate="http://myns.org/wasThing")
#' # Create a relationshp with the subject as a blank node with a user assigned blank node id
#' dp <- insertRelationship(dp, subjectID="urn:uuid:bc9e160e-ca21-47d5-871b-4a4820fe4451",
#' objectIDs="thing7", predicate="http://myns.org/hadThing")
#' # Create multiple relationships with the same subject, predicate, but different objects
#' dp <- insertRelationship(dp, subjectID="https://myns.org/subject1",
#' objectIDs=c("thing4", "thing5"), predicate="http://myns.org/hadThing")
#' # Create multiple relationships with subject and object types specified
#' dp <- insertRelationship(dp, subjectID="orcid.org/0000-0002-2192-403X",
#' objectIDs="http://www.example.com/home", predicate="http://myns.org/hadHome",
#' subjectType="uri", objectType="literal")
#' nrow(getRelationships(dp))
#' dp <- removeRelationships(dp, predicate='http://myns.org/wasThing')
#' nrow(getRelationships(dp))
#' dp <- removeRelationships(dp, subjectID='orcid.org/0000-0002-2192-403X')
#' nrow(getRelationships(dp))
#' dp <- removeRelationships(dp, subjectID='https://myns.org/subject1',
#' predicate='http://myns.org/hadThing')
#' nrow(getRelationships(dp))
#' dp <- removeRelationships(dp)
#' nrow(getRelationships(dp))
setMethod("removeRelationships", signature("DataPackage"),
function(x, subjectID=NA_character_, predicate=NA_character_) {
# Argument has to be character
stopifnot(is.character(subjectID))
stopifnot(is.character(predicate))
# Get access to the relations data frame
if (!(is.null(x@relations[["relations"]]))) {
relations <- x@relations[["relations"]]
} else {
# There are no relations, so nothing to be removed
warning("No relationships exist, so can not remove relationships as specified.")
}
# Delete some of the relationships, depending on whether subject and predicate are provided
if (is.na(subjectID) && is.na(predicate)) {
# Delete all relationships because no subject or predicate was indicated
relations <- data.frame()
} else if (is.na(subjectID)) {
# Delete only relationships matching predicate
relations <- relations[!relations$predicate==predicate,]
} else if (is.na(predicate)) {
# Delete only relationships matching subjectID
relations <- relations[!relations$subject==subjectID,]
} else {
# Delete relationships matching both subjectID and predicate
relations <- relations[!(relations$subject==subjectID & relations$predicate==predicate),]
}
# Assign the modified relations data frame to the class slot
x@relations[["relations"]] <- relations
# Set the relationships (resource map) to updated status.
x@relations[["updated"]] <- TRUE
return(x)
})
#' Record derivation relationships between objects in a DataPackage
#' @description Record a derivation relationship that expresses that a target object has been derived from a source object.
#' For use with DataONE, a best practice is to specify the subject and predicate as DataONE persistent identifiers
#' (https://mule1.dataone.org/ArchitectureDocs-current/design/PIDs.html). If the objects are not known to DataONE, then local identifiers can be
#' used, and these local identifiers may be promoted to DataONE PIDs when the package is uploaded to a DataONE member node.
#' @details A derived relationship is created for each value in the list "objectIDs". For each derivedId, one statement will be
#' added expressing that it was derived from the sourceId. The predicate is will be an RDF property (as a IRI) from the W3C PROV
#' specification, namely, "http://www.w3.org/ns/prov#wasDerivedFrom"
#' @param x a DataPackage object
#' @param ... Additional parameters
#' @examples \dontrun{
#' dp <- new("DataPackage")
#' recordDerivation(dp, "doi:1234/_030MXTI009R00_20030812.40.1",
#' "doi:1234/_030MXTI009R00_20030812.45.1")
#' }
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("recordDerivation", function(x, ...) {
.Deprecated("describeWorkflow", "datapack")
standardGeneric("recordDerivation")
})
#' @rdname recordDerivation
#' @param sourceID the identifier of the source object in the relationship
#' @param derivedIDs an identifier or list of identifiers of objects that were derived from the source
setMethod("recordDerivation", signature("DataPackage"), function(x, sourceID, derivedIDs, ...) {
describeWorkflow(x, sources=sourceID, derivations=derivedIDs, ...)
})
#' Retrieve relationships of package objects
#' @description Relationships of objects in a package are defined using the \code{'insertRelationship'} call and retrieved
#' using \code{getRetaionships}. These relationships are returned in a data frame with \code{'subject'}, \code{'predicate'}, \code{'objects'}
#' as the columns, ordered by "subject"
#' @param x A DataPackage object
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("getRelationships", function(x, ...) {
standardGeneric("getRelationships")
})
#' @rdname getRelationships
#' @param condense A logical value, if TRUE then a more easily viewed version of relationships are returned.
#' @examples
#' dp <- new("DataPackage")
#' insertRelationship(dp, "/Users/smith/scripts/genFields.R",
#' "http://www.w3.org/ns/prov#used",
#' "https://knb.ecoinformatics.org/knb/d1/mn/v1/object/doi:1234/_030MXTI009R00_20030812.40.1")
#' rels <- getRelationships(dp)
#' @export
setMethod("getRelationships", signature("DataPackage"), function(x, condense=F, ...) {
# Get the relationships stored by insertRelationship
if (!(is.null(x@relations[["relations"]]))) {
relationships <- x@relations[["relations"]]
# Reorder output data frame by "subject" column
if (nrow(relationships) > 0) {
relationships <- relationships[order(relationships$subject, relationships$predicate, relationships$object),]
}
} else {
relationships <- data.frame()
}
# The user has requested that a 'condensed' version of the package relationships be created.
# This version is intened for display purposes and is not suitable for creating a resource map.
if(nrow(relationships) > 0 && condense) {
consoleWidth <- getOption("width")
if(is.na(consoleWidth)) consoleWidth <- 80
paddingWidth <- 10
nColumns <- 3
# Set the max column width according to the current console width,
# leave enough room for 3 columsn with padding, etc.
# Note: this is only an approximation, as the columns may take less
# width that this.
maxColumnWidth <- as.integer((consoleWidth-paddingWidth)/nColumns)
condensedRels <- apply(relationships, c(1,2), function(term) {
if(is.na(term)) return(term)
for(ins in seq_len(nrow(knownNamespaces))) {
ns <- knownNamespaces[ins, 'namespace']
prefix <- knownNamespaces[ins, 'prefix']
# use namespace in term
if(grepl(ns, term, fixed=TRUE)) {
return(condenseStr(sub(ns, paste0(prefix, ':'), term), maxColumnWidth))
}
}
# Didn't match any known namespace, check if the item is a package member identifier,
# and use the source filename if it exists.
if(is.element(term, getIdentifiers(x))) {
fn <- x@objects[[term]]@filename
fnSysmeta <- x@objects[[term]]@sysmeta@fileName
if(!is.na(fnSysmeta)) {
term <- fnSysmeta
} else if (!is.na(fn)) {
term <- basename(fn)
}
}
return(condenseStr(term, maxColumnWidth))
})
rels <- as.data.frame(condensedRels[,1:3])
return(rels[with(rels, order(subject)),])
}
return(relationships)
})
#' Plot derivation relationships obtained from getRelationships
#' @description Creates graph of dataPackage object generated from getRelationships
#' @param x a DataPackage object
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("plotRelationships", function(x, ...) {
standardGeneric("plotRelationships")
})
#' @rdname plotRelationships
#' @param col vector of colors used for plotting
#' @param ... other options passed to the igraph plot function
#' @importFrom graphics legend plot
#' @export
setMethod("plotRelationships", signature(x="DataPackage"), function(x, col=NULL, ...) {
rels <- getRelationships(x, condense=TRUE)
tmp <- rels[grepl("prov:", rels$predicate),]
type <- rels[grepl("rdf:", rels$predicate),]
if(is.null(col)){
col <- c('dodgerblue', 'forestgreen', 'firebrick',
'orange', 'purple', 'brown', 'pink', 'yellow')
}
type$object <- as.character(type$object)
g <- igraph::graph.edgelist(as.matrix(tmp[,c(1,3)]))
g <- igraph::set_edge_attr(g, "label", value = as.character(tmp[,2]))
colorLabels <- factor(sapply(type$object,
function(x){unlist(strsplit(x, ':'))[2]}))
colorLabels <- colorLabels[match(igraph::V(g)$name, type$subject)]
out <- plot(g, vertex.label.color='black', vertex.frame.color='black',
edge.color='black', edge.arrow.size=0.5,
vertex.color=col[as.numeric(colorLabels)], ...)
out
legend(x=-1.5, y=-1.1, sort(unique(colorLabels)), pch=21,
col="#777777", pt.bg=col[1:max(as.numeric(colorLabels))],
pt.cex=2, cex=0.8, bty="n", ncol=1)
})
#' Returns true if the specified object is a member of the package
#' @param x A DataPackage object
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("containsId", function(x, ...) {
standardGeneric("containsId")
})
#' @rdname containsId
#' @param identifier The DataObject identifier to check for inclusion in the DataPackage
#' @return A logical - a value of TRUE indicates that the DataObject is in the DataPackage
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' id <- "myNewId"
#' do <- new("DataObject", id=id, dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' isInPackage <- containsId(dp, identifier="myNewId")
#' @export
setMethod("containsId", signature("DataPackage"), function(x, identifier) {
obj <- x@objects[[identifier]]
found <- !is.null(obj)
return(found)
})
#' Remove the Specified Member from the Package
#' @description Given the identifier of a DataObject in a DataPackage, delete the DataObject
#' from the DataPackage.
#' @param x a DataPackage object
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("removeMember", function(x, ...) {
standardGeneric("removeMember")
})
#' @rdname removeMember
#' @param do The package member to remove, either as a \code{"DataObject"} or \code{"character"} (for the object identifier)
#' @param removeRelationships A \code{logical} value. If TRUE, package relationships for this package member are removed. Default is FALSE.
#' @details The \code{removeMember} method removes the specified DataObject from the DataPackage. In
#' addition, any package relationships that included the DataObject are removed.
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", id="myNewId", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' # Remove the package member and any provenance relationships that reference it.
#' removeMember(dp, "myNewId", removeRelationships=TRUE)
#' @export
setMethod("removeMember", signature("DataPackage"), function(x, do, removeRelationships=FALSE) {
# Note: the dataone 2.0.1 version did not have a removeRelationships parameter, so in order
# to maintain the same behaviour as the 2.0.1 release, relationships are not deleted by default.
identifier <- NA_character_
if(inherits(do, "DataObject")) {
identifier <- getIdentifier(do)
} else if (inherits(do, "character")) {
identifier <- do
} else {
stop(sprintf("Unknown type \"%s\"for parameter '\"do\""), class(do))
}
# To delete an entry, set it to NULL
x@objects[[identifier]] <- NULL
relations <- data.frame()
# The DataObject is being removed, and the relationships that it appears in
# will also be removed.
if(removeRelationships) {
# Get the current package relationships
if (!(is.null(x@relations[["relations"]]))) {
relations <- x@relations[["relations"]]
} else {
return(x)
}
newRels <- data.frame()
# Remove all occurences of this identifier from the provenance relationships
# when it appears in either the subject or object of a relationship
if(nrow(relations) > 0) {
for(irel in seq_len(nrow(relations))) {
subject <- relations[irel, 'subject']
object <- relations[irel, 'object']
testSubject <- checkIdMatch(subject, pattern='.*%s$', identifier)
if(!is.na(testSubject)) next
testObject <- checkIdMatch(object, pattern='.*%s$', identifier)
if(!is.na(testObject)) next
# Didn't find the identifier, keep this relationship
newRels <- rbind(newRels, relations[irel,])
}
}
x@relations[["relations"]] <- newRels
}
x@relations[["updated"]] <- TRUE
return(x)
})
#' Replace the raw data or file associated with a DataObject
#' @description A DataObject is a container for data that can be either an R raw object or
#' a file on local disk. The \code{replaceMember} method can be used to update the
#' date that a DataObject contains, for a DataObject that is a member of a DataPackage,
#' substituting a new file or raw object in the specified DataObject.
#' @param x A DataPackage instance
#' @param do A DataObject instance
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("replaceMember", function(x, do, ...) {
standardGeneric("replaceMember")
})
#' @rdname replaceMember
#' @details The data that is replacing the existing DataObject data may be of a different
#' format or type than the existing data. Because the data type and format may change, the
#' system metadata that describes the data can be updated as well. The \code{replaceMember}
#' method will update the SystemMetadata \code{size}, \code{checksum} values automatically,
#' but does not update the \code{formatId}, \code{mediaType}, \code{mediaTypeProperty}
#' unless requested, so these should be specified in the call to \code{replaceMember} if necessary.
#' If the \code{newId} argument is used, the specified new identifier will be assigned to the
#' object, otherwise one will be generated if necessary. This new identifier will be used
#' if the DataPackage is uploaded to DataONE, and this object is updating an existing object in DataONE.
#' @param replacement A \code{raw} object or \code{character} (for filename) that will replace the current value in the DataObject \code{do}.
#' @param formatId A value of type \code{"character"}, the DataONE object format for the object.
#' @param mediaType A value of type \code{"character"}, the IANA Media Type (aka MIME-Type) of the object, e.g. "text/csv".
#' @param mediaTypeProperty A value of type \code{"list"} of \code{"character"}, IANA Media Type properties for the \code{"mediaType"} argument.
#' @param newId A value of type \code{"character"} which will replace the identifier for this DataObject.
#' @examples
#' # Create a DataObject and add it to the DataPackage
#' dp <- new("DataPackage")
#' doIn <- new("DataObject", format="text/csv",
#' filename=system.file("./extdata/pkg-example/binary.csv", package="datapack"))
#' dp <- addMember(dp, doIn)
#'
#' # Use the zipped version of the file instead by updating the DataObject
#' dp <- replaceMember(dp, doIn,
#' replacement=system.file("./extdata/pkg-example/binary.csv.zip",
#' package="datapack"),
#' formatId="application/zip")
#' @export
setMethod("replaceMember", signature("DataPackage"), function(x, do, replacement, formatId=NA_character_, mediaType=NA_character_,
mediaTypeProperty=NA_character_,
newId=NA_character_, ...) {
# The DataObject to change argument can be either a DataObject or identifier. Determine which one
# and put the object out of the package so that we can modify it and replace it.
id <- NA_character_
if (inherits(do, "DataObject")) {
id <- getIdentifier(do)
if(! id %in% getIdentifiers(x)) {
stop(sprintf("DataObject for id \"%s\" was not found in the DataPackage", id))
}
} else if (inherits(do, "character")) {
id <- do
if(! id %in% getIdentifiers(x)) {
stop(sprintf("DataObject for id \"%s\" was not found in the DataPackage\n", id))
}
do <- getMember(x, id)
} else {
stop(sprintf("Unknown type \"%s\"for parameter '\"do\"\n", class(do)))
}
# If replacement param is a DataObject, then use it as is, otherwise, use a copy of the
# old DataObject, and revise it.
if(inherits(replacement, "DataObject")) {
newObj <- replacement
} else {
newObj <- getMember(x, id)
}
# Assign a new identifier the first time this routine is called. If this object
# was downloaded from a repository, it will have an '@oldId' assigned.
# If '@oldId' has been assigned, then a new id must be assigned to the sysmeta, as
# this will be required during uploaded/updated to the repo.
if(!is.na(newObj@oldId)) {
# If this is the first time the object is modified, old and current will be the same,
# so assign a new id if they are.
if(newObj@oldId == getIdentifier(newObj)) {
if(is.na(newId)) {
newId <- sprintf("urn:uuid:%s", uuid::UUIDgenerate())
newObj@sysmeta@identifier <- newId
} else {
newObj@sysmeta@identifier <- newId
}
} else {
# This isn't the first time this object has been modified, so only assign a new
# id if the user requested it.
if(!is.na(newId)) {
newObj@sysmeta@identifier <- newId
}
}
} else {
# This object must not have been downloaded from a repo, because @oldId is not assigned,
# so only need to assign a new id if the user specified it.
# If the user specified a new identifier to use ('newId' argument) the assign it now.
if(!is.na(newId)) {
newObj@sysmeta@identifier <- newId
}
}
# If replacement is a DataObject, then replace the existing DataObject 'do' with the
# DataObject 'replacement'
algorithm <- x@sysmeta@checksumAlgorithm
if (is.raw(replacement)) {
newObj@data <- replacement
newObj@filename <- NA_character_
newObj@sysmeta@size <- length(newObj@bytes)
newObj@sysmeta@checksum <- calculateChecksum(newObj, checksumAlgorithm=algorithm)
newObj@sysmeta@checksumAlgorithm <- algorithm
} else if (inherits(replacement, "character")) {
# If 'replacement' is a character string, then it is
# assumed to be a filename that replaces the DataObjects existing filename
if(!file.exists(replacement)) {
stop(sprintf("File %s not found.", replacement))
}
fileinfo <- file.info(replacement)
newObj@filename <- replacement
newObj@sysmeta@fileName <- basename(replacement)
newObj@data <- raw()
newObj@sysmeta@size <- fileinfo$size
newObj@sysmeta@checksum <- calculateChecksum(newObj, checksumAlgorithm=algorithm)
newObj@sysmeta@checksumAlgorithm <- algorithm
} else if (!inherits(replacement, "DataObject")) {
stop(sprintf("Unknown replacement type: %s\n", class(replacement)))
}
# Update these selected sysmeta fields if they were specified in the call.
if(!is.na(formatId)) {
newObj@sysmeta@formatId <- formatId
}
if(!is.na(mediaType)) {
newObj@sysmeta@mediaType <- mediaType
}
if(!is.na(mediaTypeProperty)) {
newObj@sysmeta@mediaTypeProperty <- mediaTypeProperty
}
x <- removeMember(x, do, removeRelationships=FALSE)
newObj@updated[['data']] <- TRUE
newObj@updated[['sysmeta']] <- TRUE
x <- addMember(x, newObj)
if(!is.na(newId)) {
# Update the identifier in the package relationships, replacing the old with the new.
x <- updateRelationships(x, id=id, newId=newId)
}
return(x)
})
#' Update selected elements of the XML content of a DataObject in a DataPackage (aka package member).
#' @param x a DataPackage instance
#' @param do A DataObject instance object, or DataObject identifier
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("updateMetadata", function(x, do, ...) {
standardGeneric("updateMetadata")
})
#' @rdname updateMetadata
#' @description A DataObject that contains an XML document can be edited by specifying a path
#' to the elements to edit (an XPath expression) and a value to replace the text node.
#' @details This method requires some knowledge of the structure of the metadata document as well
#' as facility with the XPath language. If the \code{newId} argument is used, the specified new
#' identifier will be assigned to the object, and the previous identifier will be stored in the \code{oldId} slot,
#' for possible use when updating the DataObject to a repository. If \code{newId} is not used, a new
#' identifier will be generated for the DataObject only the first time that updateMetadata is called for
#' a particular object in a DataPackage.
#' @param xpath A \code{character} value specifying the location in the XML to update.
#' @param replacement A \code{character} value that will replace the elements found with the \code{xpath}.
#' @param newId A value of type \code{"character"} which will replace the identifier for this DataObject.
#' @examples
#' # Create a DataObject and add it to the DataPackage
#' dp <- new("DataPackage")
#' sampleMeta <- system.file("./extdata/sample-eml.xml", package="datapack")
#' id <- "1234"
#' metaObj <- new("DataObject", id="1234", format="eml://ecoinformatics.org/eml-2.1.1",
#' file=sampleMeta)
#' dp <- addMember(dp, metaObj)
#'
#' # In the metadata object, insert the newly assigned data
#' xp <- sprintf("//dataTable/physical/distribution[../objectName/text()=\"%s\"]/online/url",
#' "sample-data.csv")
#' newURL <- sprintf("https://cn.dataone.org/cn/v2/resolve/%s", "1234")
#' dp <- updateMetadata(dp, id, xpath=xp, replacement=newURL)
#' @export
setMethod("updateMetadata", signature("DataPackage"), function(x, do, xpath, replacement,
newId=NA_character_, ...) {
# The DataObject to change argument can be either a DataObject or identifier. Determine which one
# and put the object out of the package so that we can modify it and replace it.
id <- NA_character_
if (inherits(do, "DataObject")) {
id <- getIdentifier(do)
metaObj <- do
if(! id %in% getIdentifiers(x)) {
stop(sprintf("DataObject for id \"%s\" was not found in the DataPackage", id))
}
} else if (inherits(do, "character")) {
id <- do
metaObj <- getMember(x, id)
if(! id %in% getIdentifiers(x)) {
stop(sprintf("DataObject for id \"%s\" was not found in the DataPackage", id))
}
} else {
stop(sprintf("Unknown type \"%s\"for parameter '\"do\""), class(do))
}
if(!inherits(replacement, "character")) {
stop(sprintf("Invalid type \"%s\" for argument \"replacement\", it must be \"character\"",
class(replacement)))
}
if(! is.element(id, getIdentifiers(x))) {
stop(sprintf("Invalid DataObject specified with \"do\", identifier \"%s\" is not a package member of \"x\"", id))
}
# Create a new DataObject with the modified XML
newMetaObj <- updateXML(metaObj, xpath=xpath, replacement=replacement)
x <- replaceMember(x, metaObj, replacement=newMetaObj, newId=newId, ...)
return(x)
})
#' Return the Package Member by Identifier
#' @description Given the identifier of a member of the data package, return the DataObject
#' representation of the member.
#' @param x A DataPackage instance
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("getMember", function(x, ...) {
standardGeneric("getMember")
})
#' @rdname getMember
#' @param identifier A DataObject identifier
#' @return A DataObject if the member is found, or NULL if not
#' @export
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", id="myNewId", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' do2 <- getMember(dp, "myNewId")
setMethod("getMember", signature("DataPackage"), function(x, identifier) {
if (containsId(x, identifier)) {
return(x@objects[[identifier]])
} else {
return(NULL)
}
})
#' Return identifiers for objects that match search criteria
#' @description Return DataObjects or DataObject identifiers that match search terms.
#' @param x A DataPackage instance
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("selectMember", function(x, ...) {
standardGeneric("selectMember")
})
#' @rdname selectMember
#' @details The \code{"selectMember"} method inspects the DataObject slot \code{"name"} for a match with \code{"value"}
#' for each DataObject in a DataPackage. Matching DataObjects are returned as a list containing either package member
#' identifiers (character) or the DataObjects themselves, depending on the value of the \code{as} parameter.
#' @param name The name of the DataObject slot to inspect, for example "sysmeta@formatId".
#' @param value A character or logical value to match. If specified as a character value, PERL style regular expressions can be used (see ?grepl).
#' @param as A character value to specify the return type, either "DataObject" or "character" (the default)
#' @return A list of matching DataObjects or DataObject identifiers. The default is to return a list of
#' DataObject identifiers.
#' @export
#' @examples
#' #' library(datapack)
#' dp <- new("DataPackage")
#' # Add the script to the DataPackage
#' progFile <- system.file("./extdata/pkg-example/logit-regression-example.R", package="datapack")
#' # An 'id' parameter is not specified, so one will be generated automatically.
#' progObj <- new("DataObject", format="application/R", filename=progFile)
#' dp <- addMember(dp, progObj)
#'
#' # Add a script input to the DataPackage
#' inFile <- system.file("./extdata/pkg-example/binary.csv", package="datapack")
#' inObj <- new("DataObject", format="text/csv", filename=inFile)
#' dp <- addMember(dp, inObj)
#'
#' # Add a script output to the DataPackage
#' outFile <- system.file("./extdata/pkg-example/gre-predicted.png", package="datapack")
#' outObj <- new("DataObject", format="image/png", file=outFile)
#' dp <- addMember(dp, outObj)
#'
#' # Now determine the package member identifier for the R script
#' progIds <- selectMember(dp, name="sysmeta@formatId", value="application/R", as="character")
#' inputId <- selectMember(dp, name="sysmeta@fileName", value="binary.csv")
setMethod("selectMember", signature("DataPackage"), function(x, name, value, as="character") {
# First look at the top level slot names for a match with 'field'
valid <- c("character", "DataObject")
if(!as %in% valid) {
stop(sprintf("The value for parameter \"as\" must be one of %s", paste0(valid, collapse=", ")))
}
matches <- list()
if(length(names(x@objects)) > 0) {
for(iKey in names(x@objects)) {
slotStr <- sprintf("x@objects[[\'%s\']]@%s", iKey, as.character(name))
testValue <- eval(parse(text=slotStr))
if(identical(testValue, value) || grepl(as.character(value), testValue, perl=TRUE)) {
if(as == "character") {
matches[[length(matches)+1]] <- iKey
} else {
matches[[length(matches)+1]] <- getMember(x, iKey)
}
}
}
} else {
stop("The specified package has no members")
}
if(length(matches) > 0) {
if(length(matches) == 1) {
return(matches[[1]])
} else {
return(unlist(matches))
}
} else {
return(matches)
}
})
#' Set values for selected DataPackage members.
#' @description The \code{'setValue'} method is used to modify values stored in DataPackage members.
#' Each member in a DataPackage is a DataObject which is an R S4 object that contains a set of values (slots).
#' The available slots are described at \code{help("DataObject-class")}.
#' @param x A DataPackage instance
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("setValue", function(x, ...) {
standardGeneric("setValue")
})
#' @rdname setValue
#' @details If the parameter \code{identifiers} is provided, then DataPackage members that
#' have identifiers specified in the list will be updated. If this parameter is not provided
#' then no members will be updated. To update all members in a package, specify the
#' value of \code{identifiers=getIdentifiers(pkg)} where \code{pkg} is the variable name
#' of the DataPackage to update. Note that this method can be used to update the
#' \code{data} or \code{filenane} slots, but it is instead recommended to us the
#' \code{replaceMember} method to achieve this, as the \code{replaceMember} method assists
#' in properly setting the related SystemMetadata values.
#' @param name A DataObject slot name.
#' @param value A new value to assign to the slot for selected DataPackage members.
#' @param identifiers A list of identifiers of DataPackage members to update.
#' @return A DataPackage with possibly updated DataObjects.
#' @export
#' @examples
#' # First create a package that we can modify.
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' # The next statment sets the format type incorrectly as an example, so we can correct it later
#' do <- new("DataObject", id="myNewId", dataobj=data, format="image/jpg", user="jsmith")
#' dp <- addMember(dp, do)
#' data <- charToRaw("7,8.9\n4,10,11")
#' # This next statement also sets the format type incorrectly
#' do <- new("DataObject", id="myNewId2", dataobj=data, format="image/jpg", user="jsmith")
#' dp <- addMember(dp, do)
#' # Change format types to correct value for both package members
#' # Careful! Specifying 'identifiers=getIdentifiers(dp) will update all package members!
#' dp <- setValue(dp, name="sysmeta@formatId", value="text/csv", identifiers=getIdentifiers(dp))
setMethod("setValue", signature("DataPackage"), function(x, name, value, identifiers=NA_character_, ...) {
# First look at the top level slot names for a match with 'field'
matchingIds <- list()
if(length(names(x@objects)) > 0) {
for(iKey in names(x@objects)) {
if(! iKey %in% identifiers) next
tmpObj <- x@objects[[iKey]]
# If a list is specified, we have to manualy convert it to a string that can be parsed,
# as R strips the 'list' out and we no longer have a list. This technique only works
# for a list of characters
if (inherits(value, "list")) {
saveQuotes <- getOption("useFancyQuotes")
options(useFancyQuotes = FALSE)
strList <- paste(dQuote(value), collapse=",")
strList <- sprintf("list(%s)", strList)
setStr <- sprintf("x@objects[[\'%s\']]@%s <- %s", iKey, as.character(name), strList)
options(useFancyQuotes = saveQuotes)
} else if (inherits(value, "character")) {
if(is.na(value)) {
setStr <- sprintf("x@objects[[\'%s\']]@%s <- as.character(%s)", iKey, as.character(name), value)
} else {
setStr <- sprintf("x@objects[[\'%s\']]@%s <- \"%s\"", iKey, as.character(name), value)
}
} else {
setStr <- sprintf("x@objects[[\'%s\']]@%s <- %s", iKey, as.character(name), value)
}
eval(parse(text=setStr))
if (! identical(tmpObj@sysmeta, x@objects[[iKey]]@sysmeta)) x@objects[[iKey]]@updated[['sysmeta']] <- TRUE
if (length(tmpObj@data) != length(x@objects[[iKey]]@data)) x@objects[[iKey]]@updated[['data']] <- TRUE
if (identical(tmpObj@filename, x@objects[[iKey]]@filename)) x@objects[[iKey]]@updated[['data']] <- TRUE
#if(identical(testValue, value)) matchingIds[[length(matchingIds)+1]] <- iKey
}
} else {
stop("The specified package has no members")
}
return(x)
})
#' Get values for selected DataPackage members.
#' @description Given a slot name and set of package member identifiers, return slot values.
#' @param x A DataPackage instance
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("getValue", function(x, ...) {
standardGeneric("getValue")
})
#' @rdname getValue
#' @details If the parameter \code{identifiers} is provided, then only the DataPackage
#' members that have identifiers in the provided list will have there values fetched.
#' If this parameter is not provided, then the values for all DataPackage members are returned.
#' @param name A name of a DataObject slot.
#' @param identifiers A list of DataPackage member identifiers
#' @return A list of values for matching slot names and included identifiers.
#' @export
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", id="myNewId", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' data <- charToRaw("7,8.9\n4,10,11")
#' do <- new("DataObject", id="myNewId2", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' formats <- getValue(dp, name="sysmeta@formatId")
setMethod("getValue", signature("DataPackage"), function(x, name, identifiers=NA_character_) {
values <- list()
if(is.na(identifiers)) identifiers <- getIdentifiers(x)
if(length(names(x@objects)) > 0) {
for(iKey in names(x@objects)) {
if(! iKey %in% identifiers) next
value <- NA_character_
slotStr <- sprintf("value <- x@objects[[\'%s\']]@%s", iKey, as.character(name))
eval(parse(text=slotStr))
#values[[length(values)+1]] <- value
values[[iKey]] <- value
}
} else {
stop("The specified package has no members")
}
values
})
#' @rdname setPublicAccess
#' @aliases setPublicAccess
#' @param identifiers A list of \code{character} values containing package member identifiers that will be updated (default is all package members).
#' @return A DataPackage with modified access rules.
#' @seealso \code{\link{DataPackage-class}}
#' @examples
#' # First create a sample package with two DataObjects
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", id="id1", dataobj=data, format="text/csv")
#' dp <- addMember(dp, obj)
#' data2 <- charToRaw("7,8,9\n4,10,11\n")
#' obj2 <- new("DataObject", id="id2", dataobj=data2, format="text/csv")
#' dp <- addMember(dp, obj2)
#' # Now add public read to all package members ("id1", "id2")
#' dp <- setPublicAccess(dp)
setMethod("setPublicAccess", signature("DataPackage"), function(x, identifiers=list()) {
if(length(identifiers) == 0) identifiers <- getIdentifiers(x)
if(length(names(x@objects)) > 0) {
for(iKey in names(x@objects)) {
if(! iKey %in% identifiers) next
obj <- getMember(x, identifier=iKey)
obj <- setPublicAccess(obj)
obj@updated[['sysmeta']] <- TRUE
x <- removeMember(x, obj, removeRelationships=FALSE)
x <- addMember(x, obj)
}
} else {
stop("The specified package has no members")
}
return(x)
})
#' @rdname addAccessRule
#' @details Note that when \code{addAccessRule} is called with a `DataPackage` argument, the
#' additional parameter \code{identifiers} can be used:
#' \itemize{
#' \item{identifiers A list of \code{character} values containing package member identifiers that the access rule will be applied to (all members is the default)}.
#' }
#' @return The DataPackage with updated DataObject access policies
#' @seealso \code{\link{DataPackage-class}}
#' @examples
#' # Add an access rule to members of a DataPackage
#' # First create a sample DataPackage
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", id="id1", dataobj=data, format="text/csv")
#' dp <- addMember(dp, obj)
#' data2 <- charToRaw("7,8,9\n4,10,11\n")
#' obj2 <- new("DataObject", id="id2", dataobj=data2, format="text/csv")
#' dp <- addMember(dp, obj2)
#' # Add access rule to all package members
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write", getIdentifiers(dp))
setMethod("addAccessRule", signature("DataPackage"), function(x, y, ...) {
argList <- list(...)
argListLen <- length(argList)
# Check for "identifiers" as named argument, i.e. 'identifiers=c("id1", "id2")'
if (!"identifiers" %in% names(argList)) {
# User has specified "permission=<blah>, identifiers=<blah>"
if(argListLen >= 2) {
identifiers <- argList[[2]]
} else {
identifiers <- list()
}
}
if(length(identifiers) == 0) identifiers <- getIdentifiers(x)
if(length(names(x@objects)) > 0) {
for(iKey in names(x@objects)) {
if(! iKey %in% identifiers) next
obj <- getMember(x, identifier=iKey)
obj <- addAccessRule(obj, y, ...)
obj@updated[['sysmeta']] <- TRUE
x <- removeMember(x, iKey, removeRelationships=FALSE)
x <- addMember(x, obj)
}
}
return(x)
})
#' @rdname clearAccessPolicy
#' @param identifiers A list of \code{character} values containing package member identifiers that the access rule will be applied to.
#' @return The SystemMetadata object with the cleared access policy.
#' @seealso \code{\link{DataPackage-class}}
#' @examples
#' # Clear access policy for a DataPackage
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", dataobj=data, format="text/csv")
#' dp <- addMember(dp, obj)
#' data2 <- charToRaw("7,8,9\n4,10,11\n")
#' obj2 <- new("DataObject", dataobj=data2, format="text/csv")
#' dp <- addMember(dp, obj2)
#'
#' # Add the access rule to all package members
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com",
#' permission="write")
#' # Now clear the access policy for just the second object
#' dp <- clearAccessPolicy(dp, getIdentifier(obj2))
#'
#' @export
setMethod("clearAccessPolicy", signature("DataPackage"), function(x, identifiers=list(), ...) {
if(length(identifiers)==0) identifiers <- getIdentifiers(x)
if(length(names(x@objects)) > 0) {
for(iKey in names(x@objects)) {
if(! iKey %in% identifiers) next
obj <- getMember(x, identifier=iKey)
obj <- clearAccessPolicy(obj, ...)
obj@updated[['sysmeta']] <- TRUE
x <- removeMember(x, iKey, removeRelationships=FALSE)
x <- addMember(x, obj)
}
}
return(x)
})
#
#' @rdname hasAccessRule
#' @description If called for a DataPackage, then the SystemMetadata for DataObjects in the DataPackage are checked.
#' @param identifiers A list of \code{character} values containing package member identifiers for which the access rule will be checked.
#' @seealso \code{\link{DataPackage-class}}
#' @examples
#' #
#' # Check access rules for member DataObjects of a DataPackage.
#' # First create an example DataPackage
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", id="id1", dataobj=data, format="text/csv")
#' dp <- addMember(dp, obj)
#' data2 <- charToRaw("7,8,9\n4,10,11\n")
#' obj2 <- new("DataObject", id="id2", dataobj=data2, format="text/csv")
#' dp <- addMember(dp, obj2)
#' # Add access rules to all package members
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write")
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission")
#' hasWrite <- hasAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write")
#' hasChange <- hasAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission")
#' @return When called for a DataPackage, boolean TRUE if the access rule exists in all specified package members already, FALSE otherwise
setMethod("hasAccessRule", signature("DataPackage"), function(x, subject, permission, identifiers=list(), ...) {
found <- FALSE
# Have to provide a list of member ids to check
if(length(identifiers) == 0) identifiers <- getIdentifiers(x)
if(length(getIdentifiers(x)) > 0) {
for(id in getIdentifiers(x)) {
if(! id %in% identifiers) next
do <- getMember(x, id)
# If even one specified member doesn't have the perms, then fail.
if (!hasAccessRule(do, subject, permission)) return (FALSE)
}
}
return(TRUE)
})
#' @rdname removeAccessRule
#' @return The Datapackage with members having updated access policies.
#' @param identifiers A list of \code{character} values containing package member identifiers that the access rule will be
#' applied to (default is all package members).
#' @seealso \code{\link{DataPackage-class}}
#' @examples
#' #
#' # Remove access rules from a DataPackage.
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", id="id1", dataobj=data, format="text/csv")
#' dp <- addMember(dp, obj)
#' data2 <- charToRaw("7,8,9\n4,10,11\n")
#' obj2 <- new("DataObject", id="id2", dataobj=data2, format="text/csv")
#' dp <- addMember(dp, obj2)
#' # Add access rule to all package members
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write")
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission" )
#' # Now take 'changePermission' away for user 'uid=smith...', specifying parameter 'y'
#' # as a character string containing a 'subject'.
#' dp <- removeAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write")
#' dp <- removeAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission")
#'
#' # Alternatively, parameter "y" can be a data.frame containing one or more access rules:
#' # Add write, changePermission for uid=jones,...
#' dp <- addAccessRule(dp, "uid=jones,ou=Account,dc=example,dc=com", "write")
#' dp <- addAccessRule(dp, "uid=jones,ou=Account,dc=example,dc=com", "changePermission")
#' # Now take privs for uid=jones,... away
#' accessRules <- data.frame(subject=c("uid=jones,ou=Account,dc=example,dc=com",
#' "uid=jones,ou=Account,dc=example,dc=com"),
#' permission=c("write", "changePermission"))
#' dp <- removeAccessRule(dp, accessRules)
setMethod("removeAccessRule", signature("DataPackage"), function(x, y, permission=NA_character_,
identifiers=list(), ...) {
if(length(identifiers) == 0) identifiers <- getIdentifiers(x)
if(length(names(x@objects)) > 0) {
for(iKey in names(x@objects)) {
if(! iKey %in% identifiers) next
obj <- getMember(x, identifier=iKey)
obj <- removeAccessRule(obj, y, permission=permission, ...)
x <- removeMember(x, iKey, removeRelationships=FALSE)
x <- addMember(x, obj)
}
}
return(x)
})
#' Create an OAI-ORE resource map from the package
#' @description The DataPackage is serialized as a OAI-ORE resource map to the specified file.
#' @param x A DataPackage object
#' @param ... Additional arguments
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("serializePackage", function(x, ...) {
standardGeneric("serializePackage")
})
#' @rdname serializePackage
#' @details The resource map that is created is serialized by default as RDF/XML. Other serialization formats
#' can be specified using the \code{syntaxName} and \code{mimeType} parameters. Other available formats
#' include:
#' \tabular{ll}{
#' \strong{syntaxName}\tab \strong{mimeType}\cr
#' json\tab application/json\cr
#' ntriples\tab application/n-triples\cr
#' turtle\tab text/turtle\cr
#' dot\tab text/x-graphviz\cr
#' }
#' Note that the \code{syntaxName} and \code{mimeType} arguments together specify o serialization format.
#'
#' Also, for packages that will be uploaded to the DataONE network, "rdfxml" is the only
#' accepted format.
#'
#' The resolveURI string value is prepended to DataPackage member identifiers in the resulting resource map.
#' If no resolveURI value is specified, then 'https://cn.dataone.org/cn/v1/resolve' is used.
#' @param file The file to which the ResourceMap will be serialized
#' @param id A unique identifier for the serialization. The default value is the id assigned
#' to the DataPackage when it was created.
#' @param syntaxName The name of the syntax to use for serialization - default is "rdfxml"
#' @param mimeType The mimetype of the serialized output - the default is "application/rdf+xml"
#' @param namespaces A data frame containing one or more namespaces and their associated prefix
#' @param syntaxURI URI of the serialization syntax
#' @param resolveURI A character string containing a URI to prepend to datapackage identifiers
#' @param creator A \code{character} string containing the creator of the package.
#' @export
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", id="do1", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' data2 <- charToRaw("7,8,9\n10,11,12")
#' do2 <- new("DataObject", id="do2", dataobj=data2, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do2)
#' dp <- describeWorkflow(dp, sources=do, derivations=do2)
#' \dontrun{
#' td <- tempdir()
#' status <- serializePackage(dp, file=paste(td, "resmap.json", sep="/"), syntaxName="json",
#' mimeType="application/json")
#' status <- serializePackage(dp, file=paste(td, "resmap.xml", sep="/"), syntaxName="rdfxml",
#' mimeType="application/rdf+xml")
#' status <- serializePackage(dp, file=paste(td, "resmap.ttl", sep="/"), syntaxName="turtle",
#' mimeType="text/turtle")
#' }
setMethod("serializePackage", signature("DataPackage"), function(x, file,
id = NA_character_,
syntaxName="rdfxml",
mimeType="application/rdf+xml",
namespaces=data.frame(namespace=character(), prefix=character(), stringsAsFactors=FALSE),
syntaxURI=NA_character_, resolveURI=NA_character_,
creator=NA_character_) {
resMap <- getResourceMap(x, id, creator, resolveURI)
status <- serializeRDF(resMap, file, syntaxName, mimeType, namespaces, syntaxURI)
freeResourceMap(resMap)
rm(resMap)
return(status)
})
#' Serialize A DataPackage into a BagIt Archive File
#' @description The BagIt packaging format \url{https://tools.ietf.org/html/draft-kunze-bagit-08}
#' is used to prepare an archive file that contains the contents of a DataPackage.
#' @details A BagIt Archive File is created by copying each member of a DataPackage, and preparing
#' files that describe the files in the archive, including information about the size of the files
#' and a checksum for each file. An OAI-ORE resource map is automatically created and added to the
#' archive. These metadata files and the data files are then packaged into
#' a single zip file.
#' @param x A DataPackage object
#' @param ... Additional arguments
#' @seealso \code{\link{DataPackage-class}}
#' @examples
#' # Create the first data object
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3,5,6")
#' do <- new("DataObject", id="do1", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' # Create a second data object
#' data2 <- charToRaw("7,8,9,4,10,11")
#' do2 <- new("DataObject", id="do2", dataobj=data2, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do2)
#' # Create a relationship between the two data objects
#' dp <- describeWorkflow(dp, sources="do2", derivations="do2")
#' # Write out the data package to a BagIt file
#' \dontrun{
#' bagitFile <- serializeToBagIt(dp, syntaxName="json", mimeType="application/json")
#' }
#' @export
setGeneric("serializeToBagIt", function(x, ...) {
standardGeneric("serializeToBagIt")
})
#' @rdname serializeToBagIt
#' @importFrom zip zip
#' @import uuid
#' @import digest
#' @param mapId A unique identifier for the package resource map. If not specified, one will be automatically generated.
#' @param syntaxName The name of the syntax to use for the resource map serialization, defaults to "rdfxml"
#' @param mimeType The mimetype for the resource map serialization, defaults to "application/rdf+xml".
#' @param namespaces An optional data frame containing one or more namespaces and their associated prefix for the resource map serialization.
#' @param syntaxURI An optional string specifying the URI for the resource map serialization.
#' @param resolveURI A character string containing a URI to prepend to datapackage identifiers for the resource map.
#' @param creator A \code{character} string containing the creator of the package.
#' @seealso For more information and examples regarding the parameters specifying the creation of the resource map, see \link{serializePackage}.
#' @return The file name that contains the BagIt zip archive.
setMethod("serializeToBagIt", signature("DataPackage"), function(x, mapId=NA_character_,
syntaxName=NA_character_,
namespaces=data.frame(),
mimeType=NA_character_,
syntaxURI=NA_character_,
resolveURI=NA_character_,
creator=NA_character_, ...) {
cwd <- getwd()
on.exit(expr = setwd(cwd))
# Create a temp working area where the BagIt directory structure will be created
tmpDir <- tempdir()
bagDir <- file.path(tmpDir, "bag")
if(file.exists(bagDir)) {
unlink(bagDir, recursive=TRUE)
}
# Create the bag directories
dir.create(bagDir)
payloadDir <- file.path(bagDir, "data")
if(!file.exists(payloadDir)) dir.create(payloadDir)
metadataDir <- file.path(bagDir, "metadata")
if(!file.exists(metadataDir)) dir.create(metadataDir)
sysMetaDir <- file.path(metadataDir, "sysmeta")
if(!file.exists(sysMetaDir)) dir.create(sysMetaDir)
# Create bagit.txt
bagitFileText <- sprintf("BagIt-Version: 1.0\nTag-File-Character-Encoding: UTF-8")
writeLines(bagitFileText, file.path(bagDir, "bagit.txt"))
# Create a ResourceMap object and serialize it
if(is.na(mapId)) {
mapId <- sprintf("urn:uuid:%s", uuid::UUIDgenerate())
}
if(is.na(syntaxName)) {
syntaxName="rdfxml"
}
if(is.na(mimeType)) {
mimeType <- "application/rdf+xml"
}
if(is.na(resolveURI)) {
resolveURI <- ""
}
tmpFile <- tempfile()
serializePackage(x, file=tmpFile, id=mapId, syntaxName=syntaxName, namespaces=namespaces,
mimeType=mimeType, resolveURI=resolveURI, creator=creator)
relFile <- file.path("oai-ore.xml")
resMapFilepath <- file.path(metadataDir, relFile)
file.copy(tmpFile, resMapFilepath)
# Add resource map to the manifest
resMapMd5 <- digest(resMapFilepath, algo="md5", file=TRUE)
manifestFileTextLine <- sprintf("%s %s", resMapMd5, gsub(paste(".*bag", .Platform$file.sep, sep=""), "", resMapFilepath))
writeLines(manifestFileTextLine, file.path(bagDir, "tagmanifest-md5.txt"))
identifiers <- getIdentifiers(x)
# Get a resource map so that it can be queried for science metadata
resMap <- getResourceMap(x)
# Check to see if the user supplied any science metadata
scienceMetadataUris <- getScienceMetadataUris(resMap)
# The number that's appended to the filename if multiple documents exist
scienceMetadataCount <- 1
# URLEncode each URI
metadataIds <- lapply(scienceMetadataUris, utils::URLdecode)
# Determines whether or not an object is in a list of URIs
in_uri <- function(objectId, uriList) {
for (uri in uriList) {
if (grepl(objectId, uri, fixed=TRUE)) {
return (TRUE)
}
}
return (FALSE)
}
#' Recursively determines the name for a science metadata object.
#' The base file name (eml, datacite, science-metadata, etc) should stay the same.
#' Call the method with the base name and the number of existing files to start with.
#' This is most likely 0.
#' If there's a count defined, add it to the end of the file in ()
#' Then call the method again with count += 1
#' Eventually a free file name will be found, and then the function returns that name
getMetadataName <- function(baseName, count) {
if (count > 0) {
scienceMetadataFilename <- sprintf('%s(%d).xml', baseName, count)
} else {
scienceMetadataFilename <- sprintf('%s.xml', baseName)
}
scienceMetadataPath <- file.path(metadataDir, scienceMetadataFilename)
if (file.exists(scienceMetadataPath)) {
# If one was already written, then append (1), (2), (3), etc to the file name
# Add 1 to the count so that the next number is one higher
scienceMetadataCount <- scienceMetadataCount+1
return (getMetadataName(baseName, scienceMetadataCount))
}
return (scienceMetadataFilename)
}
# Get each member of the package and write it to disk
for(idNum in seq_along(identifiers)) {
dataObj <- getMember(x, identifiers[idNum])
systemMetadata <- dataObj@sysmeta
# Determine the file name and path of the data object.
if (!is.na(dataObj@targetPath)) {
dataObjectLocation <- file.path(payloadDir, dataObj@targetPath)
} else if (!is.na(systemMetadata@fileName)) {
# Otherwise, if they specified a file name use that
# The file name slot is a full path, only get the file name portion of it
dataObjectLocation <-file.path(payloadDir, basename(systemMetadata@fileName))
} else {
# If the file name wasn't specified, use the identifier
dataObjectLocation <- file.path(payloadDir, getIdentifier(dataObj))
}
# Check if the object is a science metadata document by checking if its identifier is in the list of
# science metadata identifiers
objectId = getIdentifier(dataObj)
if(in_uri(objectId=objectId, uriList=metadataIds)) {
scienceMetadataFilename <- getFormatFilename(getFormatId(dataObj))
scienceMetadataFilename <- getMetadataName(scienceMetadataFilename, 0)
writeToBag(dataObj, file.path(metadataDir, scienceMetadataFilename), bagDir, sysMetaDir,
isScienceMetadata=TRUE)
} else {
# Otherwise it's a plain data object
writeToBag(dataObj, dataObjectLocation, bagDir, sysMetaDir)
}
}
dataInfo <- file.info(list.files(payloadDir, full.names=TRUE, recursive=TRUE))
payloadBytes <- sum(dataInfo[['size']])
tagInfo <- file.info(list.files(metadataDir, full.names=TRUE, recursive=TRUE))
tagBytes <- sum(tagInfo[['size']])
bagBytes = tagBytes + payloadBytes
# Convert the value returned from file.info (bytes) into a more human readable form.
# Use SI convention for defining size
if(bagBytes < 1024) {
bagSize <- bagBytes
sizeUnits <- "B"
} else if (bagBytes < 1048576) {
# Size is displayed in Kilobytes
bagSize <- bagBytes / 1024
sizeUnits <- "KB"
} else if (bagBytes < 1073741824) {
# Size is displayed in megabytes
bagSize <- bagBytes / 1048576
sizeUnits <- "MB"
} else {
# Size is displayed in gigabytes
bagSize <- bagBytes / 1073741824
sizeUnits <- "GB"
}
bagInfo <- sprintf("Payload-Oxum: %d.%d\nBagging-Date: %s\nBag-Size: %f %s",
bagBytes, length(list.files(payloadDir)),
format(Sys.time(), format="%Y-%m-%d"),
bagSize, sizeUnits)
# Create bag-info.txt
writeLines(bagInfo, file.path(bagDir, "bag-info.txt"))
# Add the minimum required tag files
tagFiles <- c("bag-info.txt", "bagit.txt")
for (i in seq_along(tagFiles)) {
thisFile <- tagFiles[i]
thisMd5 <- digest(file.path(bagDir, thisFile), algo="md5", file=TRUE)
write(sprintf("%s %s", thisMd5, thisFile),file=file.path(bagDir, "tagmanifest-md5.txt"),append=TRUE)
}
zipFile <- tempfile(fileext=".zip")
# Now zip up the directory
setwd(normalizePath(bagDir))
if(normalizePath(getwd()) != normalizePath(bagDir)) {
stop(sprintf("Unable to set working directory to the BagIt dir: %s", bagDir))
}
zip::zip(zipFile, files=list.files(recursive=TRUE))
# Return the zip file name
return(zipFile)
})
#' @title Add data derivation information to a DataPackage
#' @description Add information about the relationships among DataObject members
#' in a DataPackage, retrospectively describing the way in which derived data were
#' created from source data using a processing program such as an R script. These provenance
#' relationships allow the derived data to be understood sufficiently for users
#' to be able to reproduce the computations that created the derived data, and to
#' trace lineage of the derived data objects. The method \code{describeWorkflow}
#' will add provenance relationships between a script that was executed, the files
#' that it used as sources, and the derived files that it generated.
#' @details This method operates on a DataPackage that has had DataObjects for
#' the script, data sources (inputs), and data derivations (outputs) previously
#' added to it, or can reference identifiers for objects that exist in other DataPackage
#' instances. This allows a user to create a standalone package that contains all of
#' its source, script, and derived data, or a set of data packages that are chained
#' together via a set of derivation relationships between the members of those packages.
#'
#' Provenance relationships are described following the the ProvONE data model, which
#' can be viewed at \url{https://purl.dataone.org/provone-v1-dev}. In particular,
#' the following relationships are inserted (among others):
#' \itemize{
#' \item{\code{prov:used}} {indicates which source data was used by a program execution}
#' \item{\code{prov:generatedBy}} {indicates which derived data was created by a program execution}
#' \item{\code{prov:wasDerivedFrom}} {indicates the source data from which derived data were created using the program}
#' }
#'
#' @param x The \code{DataPackage} to add provenance relationships to.
#' @param ... Additional parameters
setGeneric("describeWorkflow", function(x, ...) {
standardGeneric("describeWorkflow")
})
#' @rdname describeWorkflow
#' @param sources A list of DataObjects for files that were read by the program. Alternatively, a list
#' of DataObject identifiers can be specified as a list of character strings.
#' @param program The DataObject created for the program such as an R script. Alternatively the DataObject identifier can
#' be specified.
#' @param derivations A list of DataObjects for files that were generated by the program. Alternatively, a list
#' of DataObject identifiers can be specified as a list of character strings.
#' @param insertDerivations A \code{logical} value. If TRUE then the provenance relationship
#' \code{prov:wasDerivedFrom} will be used to connect every source and derivation. The default value
#' is TRUE.
#' @seealso The R 'recordr' package for run-time recording of provenance relationships.
#' @import uuid
#' @export
#' @examples
#' library(datapack)
#' dp <- new("DataPackage")
#' # Add the script to the DataPackage
#' progFile <- system.file("./extdata/pkg-example/logit-regression-example.R", package="datapack")
#' progObj <- new("DataObject", format="application/R", filename=progFile)
#' dp <- addMember(dp, progObj)
#'
#' # Add a script input to the DataPackage
#' inFile <- system.file("./extdata/pkg-example/binary.csv", package="datapack")
#' inObj <- new("DataObject", format="text/csv", filename=inFile)
#' dp <- addMember(dp, inObj)
#'
#' # Add a script output to the DataPackage
#' outFile <- system.file("./extdata/pkg-example/gre-predicted.png", package="datapack")
#' outObj <- new("DataObject", format="image/png", file=outFile)
#' dp <- addMember(dp, outObj)
#'
#' # Add the provenenace relationshps, linking the input and output to the script execution
#' # Note: 'sources' and 'derivations' can also be lists of "DataObjects" or "DataObject' identifiers
#' dp <- describeWorkflow(dp, sources = inObj, program = progObj, derivations = outObj)
#' # View the results
#' utils::head(getRelationships(dp))
setMethod("describeWorkflow", signature("DataPackage"), function(x, sources=list(),
program=NA_character_,
derivations=list(), insertDerivations=TRUE, ...) {
# Check each "source" list member and check if it is the correct type, either
# DataObject or character (for DataObject id). Build a list of member ids for
# use later.
inIds <- list()
# Special case, if the user passed in a single DataObject for sources or derivations,
# convert it to a list to facilitate easier processing in tests below.
if(inherits(sources, "DataObject")) sources <- list(sources)
if(inherits(derivations, "DataObject")) derivations <- list(derivations)
# Warn user if they haven't provided enough info to insert any prov relationships
if(missing(program)) {
if(length(sources) == 0) {
stop("Both arguments \"program\" and \"sources\" are missing.")
}
if(length(derivations) == 0) {
stop("Both arguments \"program\" and \"derivations\" are missing.")
}
} else {
# Program was specified, but no inputs, outputs
if(length(sources) == 0 && length(derivations) == 0) {
stop("Argument \"program\" is specified, but both \"sources\" and \"derivations\" are missing.")
}
}
if(length(sources) > 0) {
for (isrc in seq_along(sources)) {
obj <- sources[[isrc]]
if(inherits(obj, "DataObject")) {
inIds[[length(inIds)+1]] <- getIdentifier(obj)
} else if (inherits(obj, "character")) {
inIds[[length(inIds)+1]] <- obj
} else {
stop(sprintf("Invalid type \'%s\' for source[[%s]]", class(obj), isrc))
}
}
}
# Check each "derivation" list member and check if it is the correct type, either
# DataObject or character (for DataObject id). Build a list of member ids for use
# later.
# Special case, if the user passed in a single DataObject for inputs, stick it in
# a list.
outIds <- list()
if(length(derivations) > 0) {
for (idst in seq_along(derivations)) {
obj <- derivations[[idst]]
if(inherits(obj, "DataObject")) {
outIds[[length(outIds)+1]] <- getIdentifier(obj)
} else if (inherits(obj, "character")) {
outIds[[length(outIds)+1]] <- obj
} else {
stop(sprintf("Invalid type \'%s\' for derivations[[%s]]", class(obj), idst))
}
}
}
if(inherits(program, "DataObject")) {
scriptId <- getIdentifier(program)
} else if (inherits(program, "character")) {
if(!is.na(program)) {
scriptId <- program
} else {
scriptId <- NA_character_
}
} else {
stop(sprintf("Invalid type \'%s\' for program", class(program)))
}
# Check that pids are in the data package
pkgIds <- getIdentifiers(x)
if(!is.na(scriptId)) {
if(!is.element(scriptId, pkgIds)) {
stop(sprintf("Argument \'program\'is not a package memmber."))
}
}
# Process inputs and outputs separately from the program identifier, as there may not
# have been a program specified.
if(length(inIds) > 0) {
for (iCnt in seq_along(inIds)) {
pid <- inIds[[iCnt]]
# This pid is not a package member, so add it to the list of external pids
if (!is.element(pid, pkgIds)) {
x@externalIds[[length(x@externalIds)+1]] <- pid
}
x <- insertRelationship(x, subjectID=pid, objectIDs=provONEdata, predicate=rdfType, objectTypes="uri")
}
}
if(length(outIds) > 0) {
for (iCnt in seq_along(outIds)) {
pid <- outIds[[iCnt]]
# This pid is not a package member, so add it to the list of external pids
if (!is.element(pid, pkgIds)) {
x@externalIds[[length(x@externalIds)+1]] <- pid
}
x <- insertRelationship(x, subjectID=pid, objectIDs=provONEdata, predicate=rdfType, objectTypes="uri")
}
}
# If a program argument was not specified, then just record derivations from the inputs to the
# outputs
if(!is.na(scriptId)) {
# The script identifier must be for the local package, it is not supported to specify
# a script from another package using this method, primarily because it is the identifier of the Execution
# object that is required to establish the relationships "executionPid -> prov:used -> dataPid" and
# "dataPid -> wasGeneratedBy -> executionPid". We do not know the Execution id for the scriptId, which
# is needed to set the other relationships required for indexing.
if(grepl("\\s*https?:.*", scriptId, perl=TRUE)) {
stop(sprintf("The \"program\" parameter must specify an identifier that is a member of the current package.\nThe identifier %s is not valid", scriptId))
}
# Currently we have to have a prov:execution associated with each R script, so that metacatui will
# render the used and gen files with the R script, via the qualified association and hadPlan, OK!
executionId <- sprintf("urn:uuid:%s", uuid::UUIDgenerate())
# Qualified association to link the execution and each of the program (plan)
associationId <- sprintf("_%s", uuid::UUIDgenerate())
planId <- scriptId
# Qualified association
# Subject id of NA will cause a random blank node identifier to be produced
x <- insertRelationship(x, subjectID=executionId, objectIDs=associationId, predicate=provQualifiedAssociation, objectTypes="blank")
# Execution rdf type
x <- insertRelationship(x, subjectID=executionId, objectIDs=provONEexecution, predicate=rdfType, objectTypes="uri")
# prov:hadPlan
x <- insertRelationship(x, subjectID=associationId, objectIDs=planId, predicate=provHadPlan, subjectType="blank", objectTypes="uri")
# prov rdf type declaration for association
x <- insertRelationship(x, subjectID=associationId, objectIDs=provAssociation, predicate=rdfType, subjectType="blank", objectTypes="uri")
# prov rdf type declaration for program
x <- insertRelationship(x, subjectID=planId, objectIDs=provONEprogram, predicate=rdfType, objectType="uri")
x <- insertRelationship(x, subjectID=executionId, objectIDs=executionId, predicate=DCTERMSidentifier, objectTypes="literal", dataTypeURIs=xsdString)
# Process files used by the script
if(length(inIds) > 0) {
for (iCnt in seq_along(inIds)) {
thisPid <- inIds[[iCnt]]
# Record prov:used relationship between the input dataset and the execution
x <- insertRelationship(x, subjectID=executionId, objectIDs=thisPid, predicate=provUsed)
}
}
# Process files generated by the script
if(length(outIds) > 0) {
for (iCnt in seq_along(outIds)) {
thisPid <- outIds[[iCnt]]
# Record prov:wasGeneratedBy relationship between the output dataset and the execution
x <- insertRelationship(x, subjectID=thisPid, objectIDs=executionId, predicate=provWasGeneratedBy)
}
}
}
if(insertDerivations) {
# Record the 'prov:wasDerivedFrom' relationships, directly linking the output files to the input files.
# This section can be run even if a 'program' argument is not defined.
if(length(outIds) > 0 && length(inIds) > 0) {
for (iOut in seq_along(outIds)) {
outputId <- outIds[[iOut]]
for (iIn in seq_along(inIds)) {
inputId <- inIds[[iIn]]
x <- insertRelationship(x, subjectID=outputId, objectIDs=inputId, predicate=provWasDerivedFrom)
}
}
}
}
return(x)
})
#' Update package relationships by replacing an old identifier with a new one.
#' @description When package members are updated, they receive a new identifier (replaceMember). It is therefor
#' necessary to update the package relationships to update occurrences of the old identifier
#' with the new one when the old identifier appears in the "subject" or "object" of a
#' relationship.
#' @param x A DataPackage object
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("updateRelationships", function(x, ...) {
standardGeneric("updateRelationships")
})
#' @rdname updateRelationships
#' @param id A character value containing the identifier to be replaced.
#' @param newId A character value containing the identifier that will replace the old identifier.
#' @export
setMethod("updateRelationships", signature("DataPackage"), function(x, id, newId, ...) {
relations <- getRelationships(x)
x@relations = list()
if(nrow(relations) > 0) {
for (irow in seq_len(nrow(relations))) {
subject <- relations[irow, 'subject']
predicate <- relations[irow, 'predicate']
object <- relations[irow, 'object']
objectType <- relations[irow, 'objectType']
subjectType <- relations[irow, 'subjectType']
dataTypeURI <- relations[irow, 'dataTypeURI']
testSubject <- checkIdMatch(subject, pattern='.*%s$', id)
if(!is.na(testSubject)) subject <- newId
testObject <- checkIdMatch(object, pattern='.*%s$', id)
if(!is.na(testObject)) object <- newId
x <- insertRelationship(x, subjectID=subject, objectIDs=object, predicate=predicate,
subjectType=subjectType, objectTypes=objectType, dataTypeURIs=dataTypeURI)
}
}
return(x)
})
setMethod("show", "DataPackage",
#function(object)print(rbind(x = object@x, y=object@y))
function(object) {
ids <- getIdentifiers(object)
if(length(ids) == 0) {
cat(sprintf("This package does not contain any DataObjects.\n"))
return()
}
# currentWidth starts as width of combined initial widths, with 80 as
# the start (80 - 6 spaces for padding)
nfields <- 8
nspaces <- nfields - 1
maxWidth <- getOption("width") - nspaces
currentWidth <- 59
fileNameWidth <- 10
formatIdWidth <- 8
mediaTypeWidth <- 10
sizeWidth <- 8
identifierWidth <- 10
# This column width is set for the title width and not the contents, as the title is always longer
updatedWidth <- 8
localWidth <- 5
# Set the minimum field width for each field
fileNameMinWidth <- fileNameWidth
formatIdMinWidth <- formatIdWidth
mediaTypeMinWidth <- mediaTypeWidth
sizeMinWidth <- sizeWidth
identifierMinWidth <- identifierWidth
updatedMinWidth <- updatedWidth
localMinWidth <- localWidth
# Set the max field width for each field based on all values in a column
fileNameMaxWidth <- max(unlist((lapply(ids, function(id) { nchar(as.character(object@objects[[id]]@sysmeta@fileName)) }))))
formatIdMaxWidth <- max(unlist((lapply(ids, function(id) { nchar(as.character(object@objects[[id]]@sysmeta@formatId)) }))))
mediaTypeMaxWidth <- max(unlist((lapply(ids, function(id) { nchar(as.character(object@objects[[id]]@sysmeta@mediaType)) }))))
sizeMaxWidth <- max(unlist((lapply(ids, function(id) { nchar(as.character(object@objects[[id]]@sysmeta@size)) }))))
identifierMaxWidth <- max(unlist((lapply(ids, function(id) { nchar(as.character(object@objects[[id]]@sysmeta@identifier)) }))))
updatedMaxWidth <- 8
localMaxWidth <- 5
done <- FALSE
# Continue until no fields can be increased in width.
# Now that the width of each field and hence total width is known, iteratively adjust each field until
# the max line width is reached.
while(!done) {
updated <- list()
# fieldWidth, totlaWidth, done <- setColumnWidth(current, max, increment, totalWidth)
values <- setColumnWidth(fileNameWidth, min=fileNameMinWidth, max=fileNameMaxWidth, increment=5, currentTotal=currentWidth, displayWidth=maxWidth)
fileNameWidth <- values[[1]]
currentWidth <- values[[2]]
updated[[length(updated)+1]] <- values[[3]]
values <- setColumnWidth(formatIdWidth, min=formatIdMinWidth, max=formatIdMaxWidth, increment=1, currentTotal=currentWidth, displayWidth=maxWidth)
formatIdWidth <- values[[1]]
currentWidth <- values[[2]]
updated[[length(updated)+1]] <- values[[3]]
values <- setColumnWidth(mediaTypeWidth, min=mediaTypeMinWidth, max=mediaTypeMaxWidth, increment=1, currentTotal=currentWidth, displayWidth=maxWidth)
mediaTypeWidth <- values[[1]]
currentWidth <- values[[2]]
updated[[length(updated)+1]] <- values[[3]]
values <- setColumnWidth(sizeWidth, min=sizeMinWidth, max=sizeMaxWidth, increment=1, currentTotal=currentWidth, displayWidth=maxWidth)
sizeWidth <- values[[1]]
currentWidth <- values[[2]]
updated[[length(updated)+1]] <- values[[3]]
values <- setColumnWidth(identifierWidth, min=identifierMinWidth, max=identifierMaxWidth, increment=10, currentTotal=currentWidth, displayWidth=maxWidth)
identifierWidth <- values[[1]]
currentWidth <- values[[2]]
updated[[length(updated)+1]] <- values[[3]]
values <- setColumnWidth(updatedWidth, min=updatedMinWidth, max=updatedMaxWidth, increment=1, currentTotal=currentWidth, displayWidth=maxWidth)
updatedWidth <- values[[1]]
currentWidth <- values[[2]]
updated[[length(updated)+1]] <- values[[3]]
values <- setColumnWidth(localWidth, min=localMinWidth, max=localMaxWidth, increment=1, currentTotal=currentWidth, displayWidth=maxWidth)
localWidth <- values[[1]]
currentWidth <- values[[2]]
updated[[length(updated)+1]] <- values[[3]]
# Only test at the end of the line, as some fields have smaller increments that may have been allowed, but
# larger injcremented fields would have stopped the loop.
updated <- as.logical(updated)
if(!any(updated)) break
}
fmt <- paste0(
"%-", sprintf("%2d", fileNameWidth), "s ",
"%-", sprintf("%2d", formatIdWidth), "s ",
"%-", sprintf("%2d", mediaTypeWidth), "s ",
"%-", sprintf("%2d", sizeWidth), "s ",
"%-", sprintf("%2d", identifierWidth), "s ",
"%-", sprintf("%2d", updatedWidth), "s ",
"%-", sprintf("%2d", localWidth), "s ",
"\n")
cat(sprintf("Members:\n\n"))
cat(sprintf(fmt, "filename", "format", "mediaType", "size", "identifier", "modified", "local"))
lapply(ids, function(id) {
# The objects data has a size from sysmeta, but no data locally, so it must have been
# lazy loaded from a repository. The sysmeta@size could be non-zero but no local data only
# if it was incorrectly set manually or the object was lazyloaded.
hasLocalData <- !is.na(object@objects[[id]]@filename) || (length(object@objects[[id]]@data) > 0)
hasLocalDataStr <- if (isTRUE(hasLocalData)) 'y' else 'n'
fn <- object@objects[[id]]@filename
fnSysmeta <- object@objects[[id]]@sysmeta@fileName
if(!is.na(fnSysmeta)) {
filename <- fnSysmeta
} else if (!is.na(fn)) {
filename <- basename(fn)
} else {
filename <- NA_character_
}
cat(sprintf(fmt,
condenseStr(filename, fileNameWidth),
condenseStr(object@objects[[id]]@sysmeta@formatId, formatIdWidth),
condenseStr(object@objects[[id]]@sysmeta@mediaType, mediaTypeWidth),
condenseStr(as.character(object@objects[[id]]@sysmeta@size), sizeWidth),
condenseStr(object@objects[[id]]@sysmeta@identifier, identifierWidth),
condenseStr(if (object@objects[[id]]@updated[['sysmeta']] || object@objects[[id]]@updated[['data']]) 'y' else 'n', updatedWidth),
condenseStr(hasLocalDataStr, localWidth)))
})
cat(sprintf("\nPackage identifier: %s\n", object@resmapId))
rightsHolders <- unique(getValue(object, name="sysmeta@rightsHolder"))
cat(sprintf("RightsHolder: %s\n\n", paste(rightsHolders, collapse=",")))
relationships <- getRelationships(object, condense=TRUE)
if(nrow(relationships) > 0) {
if(object@relations[['updated']]) {
cat(sprintf("\nRelationships (updated):\n\n"))
} else {
cat(sprintf("\nRelationships:\n"))
}
show(relationships)
} else {
cat(sprintf("\nThis package does not contain any provenance relationships."))
}
rightsHolders <- unique(getValue(object, name="sysmeta@rightsHolder"))
}
)
setColumnWidth <- function(fieldWidth, min, max, increment, currentTotal, displayWidth) {
# Try to increate the field width by the increment amount. If this is too great,
# decrease the amount by 1 and try again. Continue until the field can be incremented
# the increment width can't be decreased.
# Can't have 0 as increment, this could cause an endless loop
if(increment == 0) increment <- 1
for(inc in increment:1) {
# Couldn't determine max field with because no data in any fields
if(is.na(max)) {
return(list(fieldWidth, currentTotal, FALSE))
} else if (max < min) {
# The max value for a field is less that the required min, so return the required min
return(list(min, currentTotal, FALSE))
} else if(fieldWidth >= max) {
# Current field can't be incremented past it's max
# return fieldWidth, current line width, was this width updated?
return(list(max, currentTotal, FALSE))
} else if ((inc + currentTotal) > displayWidth) {
# Skip if inc will increase total width pass display max
# Can we increment less?
next
} else if((fieldWidth + inc) > max) {
# Skip if inc will increase field width pass it's max
next
} else {
# Increment field
fieldWidth <- fieldWidth + inc
currentTotal <- currentTotal + inc
# Return new field width, new current line total, and whether the width was modified (does it still need to be updated with another pass?)
return(list(fieldWidth, currentTotal, TRUE))
}
}
return(list(fieldWidth, currentTotal, FALSE))
}
# Return a shortened version of a string to the specified length. The
# beginning and end of the string is returned, with ellipses inbetween
# to denote the removed portion, e.g.
# condenseStr("/Users/smith/data/urn:uuid:a84c2234-d07f-41d6-8c53-61b570afc79f.csv", 30)
# "/Users/smith...1b570afc79f.csv"
condenseStr <- function(inStr, newLength) {
if(is.na(inStr)) return(inStr)
strLen <- nchar(inStr)[[1]]
if(newLength >= strLen) return(inStr)
# Requested length too short, so return first part of string
if(newLength < 5) return(substr(inStr, 1, newLength))
# Substract space for ellipses
charLen <- as.integer(newLength - 3)
# Get str before ellipses
len1 <- as.integer(charLen / 2)
# Add additional char at end if desired length is odd
len2 <- as.integer(charLen / 2) + charLen %% 2
# Get str after ellipses
str1 <- substr(inStr, 1, len1)
str2 <- substr(inStr, strLen-(len2-1), strLen)
newStr <- sprintf("%s...%s", str1, str2)
return(newStr)
}
# Writes an object and its system metadata to a bag.
writeToBag <- function(objectToWrite, objectPath, bagDir, sysMetaDirectory, isScienceMetadata=FALSE) {
# Make sure that the path works on the target system
objectPath <- getPlatformPath(objectPath)
# Create the directory if it doesn't exist
if(!file.exists(dirname(objectPath))) {
# Use recursive because objectPath can include intermediate paths
# that might not exist. For example, ./data/non-existent-dir/measurements.
dir.create(dirname(objectPath), recursive=TRUE)
}
writeSystemMetadata <- function(sysMetaObject, systemMetadataPath) {
sysmetaXML <- serializeSystemMetadata(sysMetaObject, version="v2")
writeLines(sysmetaXML, systemMetadataPath)
}
objectIdentifier <- NULL
# Determine whether the bytes of the file are on disk or in memory. Each way is handled differently
if(!is.na(objectToWrite@filename)) {
if(file.exists(objectToWrite@filename)) {
file.copy(objectToWrite@filename, objectPath)
objectIdentifier <- getIdentifier(objectToWrite)
} else {
stop(sprintf("Error serializing to BagIt format, data object \"%s\", uses file %s but this file doesn't exist", objectToWrite, objectToWrite@filename))
}
} else {
# Must be an in-memory data object
tf <- tempfile()
con <- file(tf, "wb")
writeBin(getData(objectToWrite), con)
close(con)
file.copy(tf, objectPath)
unlink(tf)
rm(tf)
}
# Write the system metadata
sysMetaToWrite <- objectToWrite@sysmeta
systemMetadataPath <- file.path(sysMetaDirectory,
paste('sysmeta-', sysMetaToWrite@identifier, '.xml', sep=""))
systemMetadataPath<- getPlatformPath(systemMetadataPath)
writeSystemMetadata(sysMetaToWrite, systemMetadataPath)
# Get the relative bag path for each file so that absolute paths aren't included
bagDir <- getPlatformPath(bagDir)
bagDir <- paste(bagDir, .Platform$file.sep, sep="")
# Add this data and metadata package member to the top level bag metadata files
relativeDataPath <- gsub(bagDir, "", objectPath)
# Write the non-system metadata documents to the appropriate bag files
if (isScienceMetadata) {
metaMd5 <- digest(objectPath, algo="md5", file=TRUE)
metadataManifestLine <- sprintf("%s %s", as.character(metaMd5), relativeDataPath)
write(metadataManifestLine,file=file.path(bagDir, "tagmanifest-md5.txt"),append=TRUE)
} else {
objectMd5 <- digest(objectPath, algo="md5", file=TRUE)
relativeMetaPath <- gsub(bagDir, "", systemMetadataPath)
objectManifestLine <- sprintf("%s %s", as.character(objectMd5), relativeDataPath)
write(objectManifestLine,file=file.path(bagDir, "manifest-md5.txt"),append=TRUE)
}
# Write the system metadata associated with the data or science metadata obbject
scienceMetadataMd5 <- digest(systemMetadataPath, algo="md5", file=TRUE)
relativeSysmetaPath <- gsub(bagDir, "", systemMetadataPath)
sysmetatadataManifestLine <- sprintf("%s %s", as.character(scienceMetadataMd5), relativeSysmetaPath)
write(sysmetatadataManifestLine,file=file.path(bagDir, "tagmanifest-md5.txt"),append=TRUE)
}
# Returns the package's resource map
getResourceMap <- function(x, id=NA_character_, creator=NA_character_, resolveURI=NA_character_) {
# Get the relationships stored in this datapackage.
relations <- getRelationships(x)
# Create a ResourceMap object and serialize it to the specified file
#
# If a serialization id was not specified, then use the id assigned to the DataPackage when it
# was created. If a DataPackage id was not assigned, then create a unique id.
if(is.na(id)) {
if(is.na(x@sysmeta@identifier) || is.null(x@sysmeta@identifier)) {
id <- sprintf("urn:uuid:%s", uuid::UUIDgenerate())
} else {
id <- x@sysmeta@identifier
}
}
# Create a resource map from previously stored triples, for example, from the relationships in a DataPackage
resMap <- new("ResourceMap", id)
resMap <- createFromTriples(resMap, relations=relations, identifiers=getIdentifiers(x), resolveURI=resolveURI,
externalIdentifiers=x@externalIds, creator=creator)
return(resMap)
}
# Finds the identifiers of any science metadata documents in the package by querying
# a resource map.
getScienceMetadataUris <- function(resMap) {
queryResult <- tryCatch(
{
# Query that finds all subjects that document another object. If ?o is unused, a warning is raised;
# use o by performing a sanity check that the object being documented is also documentedBy the science metadata
# object.
queryString <- 'PREFIX cito: <http://purl.org/spar/cito/>
SELECT ?s WHERE {
?s cito:documents ?o .
?o cito:isDocumentedBy ?s .
}'
query <- new("Query", resMap@world, queryString, base_uri=NULL, query_language="sparql", query_uri=NULL)
queryResult <- redland::getResults(query, resMap@model, "csv")
freeQuery(query)
result <- utils::read.csv(text=queryResult, stringsAsFactors = FALSE)
# Extract the 'subject' column from the relationship csv and place into a list
# Make sure that there aren't any duplicate identifiers
scinceMetadataUris <- unique(result[,'s'])
},
error=function(cond) {
list()
})
return (queryResult)
}
getFormatFilename <- function(format) {
if (grepl("ecoinformatics", format, fixed = TRUE)) {
return('eml')
} else if (grepl("openarchives.org/OAI/2.0/oai_dc", format, fixed = TRUE)) {
return("dc.xml")
} else if (grepl("loc.gov/METS/", format, fixed = TRUE)) {
return("mets")
} else if (grepl("datacite.org/schema/kernel", format, fixed = TRUE)) {
return("datacite")
}
return("science-metadata")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.