R/DataPackage.R

Defines functions getFormatFilename getScienceMetadataUris getResourceMap writeToBag condenseStr setColumnWidth

#
#   This work was created by participants in the DataONE project, and is
#   jointly copyrighted by participating institutions in DataONE. For
#   more information on DataONE, see our web site at http://dataone.org.
#
#     Copyright 2011-2015
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
#

#' @title A class representing a data package
#' @description The DataPackage class provides methods for adding and extracting
#' data objects from a data package. The contents of a data package
#' can include arbitrary types of objects, including data files, program code,
#' visualizations and images, animations, and any other type of file. The DataPackage class
#' stores the individual members of the data package along with key system-level metadata
#' about each object, including its size, checksum, identifier, and other key information
#' needed to effectively archive the members of the package.  In addition, the
#' DataPackage class can include key provenance metadata about the relationships among
#' the objects in the data package.  For example, the data package can document that one object
#' provides documentation for another (\code{cito:documents}), and that one object was
#' derived from another (\code{prov:wasDerivedFrom}) by executing a program that 
#' used source data (\code{prov:used}) to create a derived data object 
#' {\code{prov:wasGeneratedBy}}.  These relationships are integral to the data package,
#' and can be visualized by programs that understand the ProvONE provenance 
#' model (see \url{https://purl.dataone.org/provone-v1-dev}). 
#' 
#' The DataPackage class is an R representation of an underlying Open Archives 
#' Initiative ORE model (Object Reuse and Exchange; 
#' see \url{https://www.openarchives.org/ore/}), and follows the DataONE Data
#' Packaging model
#' (see \url{https://releases.dataone.org/online/api-documentation-v2.0.1/design/DataPackage.html}).
#' @include dmsg.R
#' @include DataObject.R
#' @include SystemMetadata.R
#' @import uuid
#' @rdname DataPackage-class
#' @aliases DataPackage-class
#' @slot relations A list containing provenance relationships of package objects
#' @slot objects A list containing identifiers for objects in the DataPackage
#' @slot sysmeta A SystemMetadata class instance describing the package
#' @slot externalIds A list containing identifiers for objects associated with the DataPackage
#' @slot resmapId A character string specifying the identifier for the package resource map. 
#'               This is assigned after a package is uploaded or downloaded from a repository.
#' @section Methods:
#' \itemize{
#'  \item{\code{\link[=DataPackage-initialize]{initialize}}}{: Initialize a DataPackage object.}
#'  \item{\code{\link{addAccessRule}}}{: Add access rules to DataObjects in a DataPackage.}
#'  \item{\code{\link{addMember}}}{: Add a DataObject to a DataPackage.}
#'  \item{\code{\link{clearAccessPolicy}}}{: Clear access policies for DataObjects in a DataPackage.}
#'  \item{\code{\link{containsId}}}{: Returns true if the specified object is a member of the data package.}
#'  \item{\code{\link{describeWorkflow}}}{: Add data derivation information to a DataPackage.}
#'  \item{\code{\link{getData}}}{: Get the data content of a specified data object.}
#'  \item{\code{\link{getSize}}}{: Get the Count of Objects in the DataPackage.}
#'  \item{\code{\link{getIdentifiers}}}{: Get the Identifiers of DataPackage members.}
#'  \item{\code{\link{getMember}}}{: Return the DataPackage Member by Identifier.}
#'  \item{\code{\link{getRelationships}}}{: Retrieve relationships of data package objects.}
#'  \item{\code{\link{getValue}}}{: Get values for selected DataPackage members.}
#'  \item{\code{\link{hasAccessRule}}}{: Determine if access rules exists for DataObjects in a DataPackage.}
#'  \item{\code{\link{insertRelationship}}}{: Insert relationships between objects in a DataPackage.}
#'  \item{\code{\link{removeAccessRule}}}{: Remove an access rule from DataObject in a DataPackage.}
#'  \item{\code{\link{removeMember}}}{: Remove the specified DataObject from a DataPackage.}
#'  \item{\code{\link{removeRelationships}}}{: Remove relationships of objects in a DataPackage.}
#'  \item{\code{\link{replaceMember}}}{: Replace the raw data or file associated with a DataObject.}
#'  \item{\code{\link{selectMember}}}{: Select package members based on slot values.}
#'  \item{\code{\link{serializePackage}}}{: Create an OAI-ORE resource map from the DataPackage.}
#'  \item{\code{\link{serializeToBagIt}}}{: Serialize A DataPackage into a BagIt Archive File.}
#'  \item{\code{\link{setPublicAccess}}}{: Set the access policy to readable by anyone for DataObject in a DataPackage.}
#'  \item{\code{\link{setValue}}}{: Set values for selected DataPackage members}
#'  \item{\code{\link{show}}}{: Print DataPackage information in a formatted view.}
#'  \item{\code{\link{updateMetadata}}}{: Update selected elements of the XML content of a DataObject in a DataPackage}
#'  \item{\code{\link{updateRelationships}}}{: Update package relationships by replacing an old identifier with a new one.}
#' }
#' @seealso \code{\link{datapack}}
#' @export
setClass("DataPackage", slots = c(
    relations               = "list",
    objects                 = "list",          # key=identifier, value=DataObject
    sysmeta                 = "SystemMetadata", # system metadata about the package
    externalIds             = "list",
    resmapId                = "character" # resource map identifier(s)
    )
)

###########################
## DataPackage constructors
###########################

## Create a DataPackage object
## @description The DataPackage() method is a shortcut to creating a DataPackage object, as this method does
## not allow specifying any options that the \code{\link[=initialize-DataPackage]{initialize}} method allows (using new("DataPackage"")
## @param ... (Not yet used)
## @export
#setGeneric("DataPackage", function(...) { standardGeneric("DataPackage")} )

#' Initialize a DataPackage object.
#' @rdname DataPackage-initialize
#' @aliases DataPackage-initialize
#' @param .Object The object being initialized
#' @param packageId The package id to assign to the package
#' @examples
#' # Create a DataPackage with undefined package id (to be set manually later)
#' pkg <- new("DataPackage")
#' # Alternatively, manually assign the package id when the DataPackage object is created
#' pkg <- new("DataPackage", "urn:uuid:4f953288-f593-49a1-adc2-5881f815e946")
#' @seealso \code{\link{DataPackage-class}}
setMethod("initialize", "DataPackage", function(.Object, packageId) {
    dmsg("DataPackage-class.R initialize")

    .Object@sysmeta <- new("SystemMetadata")
    if (!missing("packageId")) {
        ## set the packageId and instantiate a new SystemMetadata
        .Object@sysmeta@identifier <- packageId
    }
    .Object@relations = list()
    .Object@relations[['updated']] <- FALSE
    .Object@objects = list()
    .Object@externalIds = list()
    .Object@resmapId <- NA_character_
   return(.Object)
})

#' @rdname getData
#' @param id Missing or character: if \code{'x'} is DataPackage, the identifier of the
#' package member to get data from
#' @examples 
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do1 <- new("DataObject", id="id1", data, format="text/csv", user="smith", mnNodeId="urn:node:KNB")
#' dp <- addMember(dp, do1)
#' bytes <- getData(dp, "id1")
#' @export
setMethod("getData", signature("DataPackage"), function(x, id) {
    databytes <- as.raw(NULL)
    if (containsId(x, id)) {
        do <- x@objects[[id]]
        databytes <- getData(do)
        return(databytes)
    } else {
        return(NULL)
    }
})

#' Get the Count of Objects in the Package
#' @param x A DataPackage instance
#' @param ... (not yet used)
#' @return The number of object in the Package
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("getSize", function(x, ...) { standardGeneric("getSize")} )

#' @rdname getSize
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' getSize(dp)
#' @export
setMethod("getSize", "DataPackage", function(x) {
  return(length(x@objects))
})

#setMethod("length", "DataPackage", function(x) {
#    return(length(x@objects))
#})

#' Get the Identifiers of Package Members
#' @description The identifiers of the objects in the package are retrieved and returned as a list.
#' @param x A DataPackage instance
#' @param ... (not yet used)
#' @return A list of identifiers
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("getIdentifiers", function(x, ...) { standardGeneric("getIdentifiers")} )

#' @rdname getIdentifiers
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' getIdentifiers(dp)
#' @export
setMethod("getIdentifiers", "DataPackage", function(x) {
    return(names(x@objects))
})

#' Add a DataObject to the DataPackage
#' @description The DataObject is added to the DataPackage.
#' @param x A DataPackage instance
#' @param do A DataObject instance
#' @param ... (Additional parameters)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("addData", function(x, do, ...) { 
    .Deprecated("addMember", "datapack")
    standardGeneric("addData")
})

#' @rdname addData
#' @details The DataObject \code{"do"} is added to the DataPackage. If the optional \code{"mo"} parameter is specified, then it is 
#' assumed that the DataObject \code{"mo"} is a metadata
#' object that describes the science object \code{"do"} that is being added. The \code{addData} function will add a relationship
#' to the DataPackage resource map that indicates that the metadata object describes the science object using the 
#' Citation Typing Ontology (CITO).
#' Note: this method updates the passed-in DataPackage object.
#' \code{documents} and \code{isDocumentedBy} relationship.
#' @param mo A DataObject (containing metadata describing \code{"do"} ) to associate with the science object.
#' @return the updated DataPackage object
#' @examples
#' dpkg <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' metadata <- charToRaw("EML or other metadata document text goes here\n")
#' md <- new("DataObject", id="md1", dataobj=metadata, format="text/xml", user="smith", 
#'   mnNodeId="urn:node:KNB")
#' do <- new("DataObject", id="id1", dataobj=data, format="text/csv", user="smith", 
#'   mnNodeId="urn:node:KNB")
#' # Associate the metadata object with the science object. The 'mo' object will be added 
#' # to the package  automatically, since it hasn't been added yet.
#' # This method is now deprecated, so suppress warnings if desired. 
#' suppressWarnings(dpkg <- addData(dpkg, do, md))
#' @export
setMethod("addData", signature("DataPackage", "DataObject"), function(x, do, mo=NA_character_) {
  x@objects[[do@sysmeta@identifier]] <- do
  # If a metadata object identifier is specified on the command line, then add the relationship to this package
  # that associates this science object with the metadata object.
  if (!missing(mo)) {
    # CHeck that the metadata object has already been added to the DataPackage. If it has not
    # been added, then add it now.
    if (!containsId(x, getIdentifier(mo))) {
      moId <- addMember(x, mo)
    }
    # Now add the CITO "documents" and "isDocumentedBy" relationships
    x <- insertRelationship(x, getIdentifier(mo), getIdentifier(do))
  }
  return(x)
})

#' Add a DataObject to the DataPackage
#' @description The DataObject is added to the DataPackage.
#' @param x A DataPackage instance
#' @param ... (Additional parameters)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("addMember", function(x, ...) { 
    standardGeneric("addMember")
})

#' @rdname addMember
#' @details The DataObject \code{"do"} is added to the DataPackage. If the optional \code{"mo"} parameter is specified, then it is 
#' assumed that the DataObject \code{"mo"} is a metadata
#' object that describes the science object \code{"do"} that is being added. The \code{addMember} function will add a relationship
#' to the DataPackage resource map that indicates that the metadata object describes the science object using the 
#' Citation Typing Ontology (CITO).
#' Note: this method updates the passed-in DataPackage object.
#' \code{documents} and \code{isDocumentedBy} relationship.
#' @param do The DataObject to add.
#' @param mo A DataObject (containing metadata describing \code{"do"} ) to associate with the science object. If this DataObject 
#' has already been added to the package, the argument can be a \code{"character"} containing the DataObject identifier.
#' @return the updated DataPackage object
#' @examples
#' dpkg <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' metadata <- charToRaw("EML or other metadata document text goes here\n")
#' md <- new("DataObject", id="md1", dataobj=metadata, format="text/xml", user="smith", 
#'   mnNodeId="urn:node:KNB")
#' do <- new("DataObject", id="id1", dataobj=data, format="text/csv", user="smith", 
#'   mnNodeId="urn:node:KNB")
#' # Associate the metadata object with the science object. The 'mo' object will be added 
#' # to the package  automatically, since it hasn't been added yet.
#' dpkg <- addMember(dpkg, do, md)
#' @export
setMethod("addMember", signature("DataPackage"), function(x, do, mo=NA_character_) {
    
    # If only one 'do' is specified, make in into a list. If a list is already specified
    # then it can be iterated over.
    doList <- list()
    # Special case, if the user passed in a single DataObject for sources or derivations,
    # convert it to a list to facilitate easier processing in tests below.
    if(inherits(do, "DataObject")) {
        doList<- list(do)
    } else {
        doList <- do
    }
    
    if(!missing(mo)) {
        if(inherits(mo, "character")) {
            if (!containsId(x, mo)) {
                msg <- sprintf("%s\nPlease specify argument \"mo\" as a \"DataObject\" or first add it to the package using \"addMember\"", 
                               mo)
                msg <- sprintf("%s\nThe metadata object with identifier %s has not been added to the package yet", mo)
                stop(msg)
            }
            # Convert 'mo' to a DataObject
            mo <- getMember(x, mo)
        } else if (!inherits(mo, "DataObject")) {
            stop(sprintf("Invalid type \"%s\" for argument \"mo\"\n", mo))
        }
    }
    
    for (iObj in doList) {
        x@objects[[getIdentifier(iObj)]] <- iObj
        # If a metadata object identifier is specified on the command line, then add the relationship to this package
        # that associates this science object with the metadata object.
        if (!missing(mo)) {
            # CHeck that the metadata object has already been added to the DataPackage. If it has not
            # been added, then add it now.
            if (!containsId(x, getIdentifier(mo))) {
                x <- addMember(x, mo)
            }
            # Now add the CITO "documents" and "isDocumentedBy" relationships
            x <- insertRelationship(x, getIdentifier(mo), getIdentifier(iObj))
        }
    }
    # If the object's path was documented, add it to the resource map
    if (!is.na(iObj@targetPath)){
        x <- insertRelationship(x, getIdentifier(iObj), iObj@targetPath, provAtLocation)
    }
    return(x)
})

#' Record relationships of objects in a DataPackage
#' @description Record a relationship of the form "subject -> predicate -> object", as defined by the Resource Description Framework (RDF), i.e.
#' an RDF triple. 
#' @details For use with DataONE, a best practice is to specify the subject and predicate as DataONE persistent identifiers 
#' (https://mule1.dataone.org/ArchitectureDocs-current/design/PIDs.html). If the objects are not known to DataONE, then local identifiers can be
#' used, and these local identifiers may be promoted to DataONE PIDs when the package is uploaded to a DataONE member node.
#' The predicate is typically an RDF property (as a IRI) from a schema supported by DataONE, i.e. "http://www.w3.org/ns/prov#wasGeneratedBy"
#' If multiple values are specified for argument objectIDS, a relationship is created for each value in the list "objectIDs". IF a value
#' is not specified for subjectType or objectType, then NA is assigned. Note that if these relationships are fetched via the getRelationships()
#' function, and passed to the createFromTriples() function to initialize a ResourceMap object, the underlying redland package will assign
#' appropriate values for subjects and objects.
#' Note: This method updates the passed-in DataPackage object.
#' @param x A DataPackage object
#' @param subjectID The identifier of the subject of the relationship
#' @param objectIDs A list of identifiers of the object of the relationships (a relationship is recorded for each objectID)
#' @param predicate The IRI of the predicate of the relationship
#' @param ... (Additional parameters)
#' @return the updated DataPackage object
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("insertRelationship", function(x, ...) {
  standardGeneric("insertRelationship")
})

#' @rdname insertRelationship
#' @param subjectType the type to assign the subject, values can be 'uri', 'blank'
#' @param objectTypes the types to assign the objects (cal be single value or list), each value can be 'uri', 'blank', or 'literal'
#' @param dataTypeURIs An RDF data type that specifies the type of the object
#' @export
#' @examples
#' dp <- new("DataPackage")
#' # Create a relationship
#' dp <- insertRelationship(dp, "/Users/smith/scripts/genFields.R",
#'     "https://knb.ecoinformatics.org/knb/d1/mn/v1/object/doi:1234/_030MXTI009R00_20030812.40.1",
#'     "http://www.w3.org/ns/prov#used")
#' # Create a relationshp with the subject as a blank node with an automatically assigned blank 
#' # node id
#' dp <- insertRelationship(dp, subjectID=NA_character_, objectIDs="thing6", 
#'     predicate="http://www.myns.org/wasThing")
#' # Create a relationshp with the subject as a blank node with a user assigned blank node id
#' dp <- insertRelationship(dp, subjectID="urn:uuid:bc9e160e-ca21-47d5-871b-4a4820fe4451", 
#'       objectIDs="thing7", predicate="http://www.myns.org/hadThing")
#' # Create multiple relationships with the same subject, predicate, but different objects
#' dp <- insertRelationship(dp, subjectID="urn:uuid:95055dc1-b2a0-4a00-bdc2-05c16d048ca2", 
#'       objectIDs=c("thing4", "thing5"), predicate="http://www.myns.org/hadThing")
#' # Create multiple relationships with subject and object types specified
#' dp <- insertRelationship(dp, subjectID="orcid.org/0000-0002-2192-403X", 
#'     objectIDs="http://www.example.com/home", predicate="http://www.example.com/hadHome",
#'                    subjectType="uri", objectType="literal")                
setMethod("insertRelationship", signature("DataPackage"),
          function(x, subjectID, objectIDs, predicate=NA_character_, 
                   subjectType=NA_character_, objectTypes=NA_character_, dataTypeURIs=NA_character_) {

  # Argument has to be character
  stopifnot(is.character(subjectID))
  stopifnot(is.character(objectIDs))
  
  # If a predicate wasn't provided, then insert the default relationship of 
  # subjectID -> documents -> objectID; objectID -> documentedBy -> subjectID
  if (is.na(predicate)) {
    x <- insertRelationship(x, subjectID, objectIDs, citoDocuments)
    
    for (obj in objectIDs) {
        x <- insertRelationship(x, obj, subjectID, citoIsDocumentedBy)
    }
  } else {
    # Append new relationships to previously stored ones.
    if (!(is.null(x@relations[["relations"]]))) {
      relations <- x@relations[["relations"]]
    } else {
      relations <- data.frame()
    }
    
    # If the subjectID or objectIDs were not specified then the user is requesting that these be "anonymous"
    # blank nodes, i.e. a blank node identifier is automatically assigned. Assign a uuid now vs having redland
    # RDF package assign a node, so that we don't have to query redland for the id that it generated. 
    # The format of "_<unique identifier>" (no colon included) is used passed W3C RDF 
    # Validationa (https://www.w3.org/RDF/Validator/). Node ids starting with numeric characters or containing 
    # colons do not pass validation (note that blank node identifiers such as '_:b1' do not pass validattion).
    if (is.na(subjectID)) {
      subjectID <- sprintf("_%s", uuid::UUIDgenerate())
      subjectType <- "blank"
    }
    
    # Add all triples to the data frame. If the length of objectTypes or dataTypeURIs is less
    # that the length of objectIDs, then values will be set to NA
    i <- 0
    for (obj in objectIDs) {
      i <- i + 1
      # Generate a blank node identifier if id is not specified
      if (is.na(obj)) {
        obj <- sprintf("_%s", uuid::UUIDgenerate())
        objectTypes[i] <- "blank"
      }
      
      # Check that the subjectType is a valid type for an RDF subject
      if (!is.element(subjectType[i], c("uri", "blank", NA_character_))) {
        stop(sprintf("Invalid subject type: %s\n", subjectType[i]))
      }
      # Check that the objectType is a valid type for an RDF object
      if(!is.element(objectTypes[i], c("uri", "literal", "blank", NA_character_))) {
        stop(sprintf("Invalid objct type: %s\n", objectTypes[i]))
      }
      newRels <- data.frame(subject=subjectID, predicate=predicate, object=obj, 
                            subjectType=subjectType, objectType=objectTypes[i], 
                            dataTypeURI=dataTypeURIs[i], row.names = NULL, stringsAsFactors = FALSE)
      
      # Has a relation been added previously?
      if (nrow(relations) == 0) {
        relations <- newRels
      } else {
        relations <- rbind(relations, newRels)
      }
    }
    
    x@relations[["relations"]] <- relations
  }
  
  # Remove duplicate relationships
  relations <- x@relations[["relations"]]
  x@relations[["relations"]] <- unique(relations)
  
  # Set the relationships (resource map) to updated status.
  x@relations[["updated"]] <- TRUE
  return(x)
})

#' Remove relationships of objects in a DataPackage
#' @description Use this function to remove all or a subset of the relationships that have previously been added in a data package. 
#' @details Remove a relationship of the form "subject -> predicate -> object", as defined by the Resource Description Framework (RDF), i.e.
#' an RDF triple. If neither subjectID nor predicate are provided, then all relationships are removed.  If one or both
#' are provided, they are used to select matching triples to be removed.
#' Note: This method updates the passed-in DataPackage object.
#' @param x A DataPackage object
#' @param subjectID The identifier of the subject of the relationships to be removed
#' @param predicate The identifier of the predicate of the relationships to be removed
#' @param ... (Additional parameters)
#' @return the updated DataPackage object
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("removeRelationships", function(x, ...) {
    standardGeneric("removeRelationships")
})

#' @rdname removeRelationships
#' @export
#' @examples
#' dp <- new("DataPackage")
#' # Create a relationship
#' dp <- insertRelationship(dp, "/Users/smith/scripts/genFields.R",
#'     "https://knb.org/data_20030812.40.1",
#'     "http://www.w3.org/ns/prov#used")
#' # Create a relationshp with the subject as a blank node with an automatically assigned blank 
#' # node id
#' dp <- insertRelationship(dp, subjectID=NA_character_, objectIDs="thing6", 
#'     predicate="http://myns.org/wasThing")
#' # Create a relationshp with the subject as a blank node with a user assigned blank node id
#' dp <- insertRelationship(dp, subjectID="urn:uuid:bc9e160e-ca21-47d5-871b-4a4820fe4451", 
#'       objectIDs="thing7", predicate="http://myns.org/hadThing")
#' # Create multiple relationships with the same subject, predicate, but different objects
#' dp <- insertRelationship(dp, subjectID="https://myns.org/subject1", 
#'       objectIDs=c("thing4", "thing5"), predicate="http://myns.org/hadThing")
#' # Create multiple relationships with subject and object types specified
#' dp <- insertRelationship(dp, subjectID="orcid.org/0000-0002-2192-403X", 
#'     objectIDs="http://www.example.com/home", predicate="http://myns.org/hadHome",
#'                    subjectType="uri", objectType="literal")
#' nrow(getRelationships(dp)) 
#' dp <- removeRelationships(dp, predicate='http://myns.org/wasThing')
#' nrow(getRelationships(dp)) 
#' dp <- removeRelationships(dp, subjectID='orcid.org/0000-0002-2192-403X')
#' nrow(getRelationships(dp)) 
#' dp <- removeRelationships(dp, subjectID='https://myns.org/subject1', 
#'     predicate='http://myns.org/hadThing')
#' nrow(getRelationships(dp)) 
#' dp <- removeRelationships(dp)
#' nrow(getRelationships(dp)) 
setMethod("removeRelationships", signature("DataPackage"), 
            function(x, subjectID=NA_character_, predicate=NA_character_) {
              
                # Argument has to be character
                stopifnot(is.character(subjectID))
                stopifnot(is.character(predicate))

                # Get access to the relations data frame
                if (!(is.null(x@relations[["relations"]]))) {
                    relations <- x@relations[["relations"]]
                } else {
                    # There are no relations, so nothing to be removed
                    warning("No relationships exist, so can not remove relationships as specified.")
                }
                
                # Delete some of the relationships, depending on whether subject and predicate are provided
                if (is.na(subjectID) && is.na(predicate)) {
                    # Delete all relationships because no subject or predicate was indicated
                    relations <- data.frame()
                } else if (is.na(subjectID)) {
                    # Delete only relationships matching predicate
                    relations <- relations[!relations$predicate==predicate,]
                } else if (is.na(predicate)) {
                    # Delete only relationships matching subjectID
                    relations <- relations[!relations$subject==subjectID,]
                } else {
                    # Delete relationships matching both subjectID and predicate
                    relations <- relations[!(relations$subject==subjectID & relations$predicate==predicate),]
                }
                
                # Assign the modified relations data frame to the class slot
                x@relations[["relations"]] <- relations 
                
                # Set the relationships (resource map) to updated status.
                x@relations[["updated"]] <- TRUE
                return(x)
            })

#' Record derivation relationships between objects in a DataPackage
#' @description Record a derivation relationship that expresses that a target object has been derived from a source object.
#' For use with DataONE, a best practice is to specify the subject and predicate as DataONE persistent identifiers 
#' (https://mule1.dataone.org/ArchitectureDocs-current/design/PIDs.html). If the objects are not known to DataONE, then local identifiers can be
#' used, and these local identifiers may be promoted to DataONE PIDs when the package is uploaded to a DataONE member node.
#' @details A derived relationship is created for each value in the list "objectIDs".  For each derivedId, one statement will be
#' added expressing that it was derived from the sourceId.  The predicate is will be an RDF property (as a IRI) from the W3C PROV
#' specification, namely, "http://www.w3.org/ns/prov#wasDerivedFrom"
#' @param x a DataPackage object
#' @param ... Additional parameters
#' @examples \dontrun{
#' dp <- new("DataPackage")
#' recordDerivation(dp, "doi:1234/_030MXTI009R00_20030812.40.1", 
#'                  "doi:1234/_030MXTI009R00_20030812.45.1")
#'                      }
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("recordDerivation", function(x, ...) {
    .Deprecated("describeWorkflow", "datapack")
    standardGeneric("recordDerivation")
})

#' @rdname recordDerivation
#' @param sourceID the identifier of the source object in the relationship
#' @param derivedIDs an identifier or list of identifiers of objects that were derived from the source 
setMethod("recordDerivation",  signature("DataPackage"), function(x, sourceID, derivedIDs, ...) {
    describeWorkflow(x, sources=sourceID, derivations=derivedIDs, ...)
})

#' Retrieve relationships of package objects
#' @description Relationships of objects in a package are defined using the \code{'insertRelationship'} call and retrieved
#' using \code{getRetaionships}. These relationships are returned in a data frame with \code{'subject'}, \code{'predicate'}, \code{'objects'}
#' as the columns, ordered by "subject"
#' @param x A DataPackage object
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("getRelationships", function(x, ...) {
  standardGeneric("getRelationships")
})

#' @rdname getRelationships
#' @param condense A logical value, if TRUE then a more easily viewed version of relationships are returned.
#' @examples
#' dp <- new("DataPackage")
#' insertRelationship(dp, "/Users/smith/scripts/genFields.R",
#'     "http://www.w3.org/ns/prov#used",
#'     "https://knb.ecoinformatics.org/knb/d1/mn/v1/object/doi:1234/_030MXTI009R00_20030812.40.1")
#' rels <- getRelationships(dp)
#' @export
setMethod("getRelationships", signature("DataPackage"), function(x, condense=F, ...) {
  # Get the relationships stored by insertRelationship
  if (!(is.null(x@relations[["relations"]]))) {
      relationships <- x@relations[["relations"]]
      # Reorder output data frame by "subject" column
      if (nrow(relationships) > 0) {
          relationships <- relationships[order(relationships$subject, relationships$predicate, relationships$object),]
      }
  } else {
      relationships <- data.frame()
  }
    
  # The user has requested that a 'condensed' version of the package relationships be created.
  # This version is intened for display purposes and is not suitable for creating a resource map.
    if(nrow(relationships) > 0 && condense) {
        consoleWidth <- getOption("width")
        if(is.na(consoleWidth)) consoleWidth <- 80
        paddingWidth <- 10
        nColumns <- 3
        # Set the max column width according to the current console width,
        # leave enough room for 3 columsn with padding, etc.
        # Note: this is only an approximation, as the columns may take less
        # width that this.
        maxColumnWidth <- as.integer((consoleWidth-paddingWidth)/nColumns)
        condensedRels <- apply(relationships, c(1,2), function(term) {
            if(is.na(term)) return(term)
            for(ins in seq_len(nrow(knownNamespaces))) {
                ns <- knownNamespaces[ins, 'namespace']
                prefix <- knownNamespaces[ins, 'prefix']
                # use namespace in term
                if(grepl(ns, term, fixed=TRUE)) {
                    return(condenseStr(sub(ns, paste0(prefix, ':'), term), maxColumnWidth))
                }
            }
            # Didn't match any known namespace, check if the item is a package member identifier,
            # and use the source filename if it exists.
            if(is.element(term, getIdentifiers(x))) {
                fn <- x@objects[[term]]@filename
                fnSysmeta <- x@objects[[term]]@sysmeta@fileName
                if(!is.na(fnSysmeta)) {
                   term <- fnSysmeta
                } else if (!is.na(fn)) {
                   term <- basename(fn)
                } 
            }
            return(condenseStr(term, maxColumnWidth))
        })
        rels <- as.data.frame(condensedRels[,1:3])
        return(rels[with(rels, order(subject)),])
    }
 return(relationships)
})



#' Plot derivation relationships obtained from getRelationships
#' @description Creates graph of dataPackage object generated from getRelationships
#' @param x a DataPackage object
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("plotRelationships", function(x, ...) {
  standardGeneric("plotRelationships")
})


#' @rdname plotRelationships
#' @param col vector of colors used for plotting 
#' @param ... other options passed to the igraph plot function
#' @importFrom graphics legend plot
#' @export
setMethod("plotRelationships", signature(x="DataPackage"), function(x, col=NULL, ...) {
	rels <- getRelationships(x, condense=TRUE)
	tmp <- rels[grepl("prov:", rels$predicate),]
	type <- rels[grepl("rdf:", rels$predicate),]
	if(is.null(col)){
		col <- c('dodgerblue', 'forestgreen', 'firebrick', 
			'orange', 'purple', 'brown', 'pink', 'yellow')
	}
	type$object <- as.character(type$object)
	g <- igraph::graph.edgelist(as.matrix(tmp[,c(1,3)]))
	g <- igraph::set_edge_attr(g, "label", value = as.character(tmp[,2]))
		colorLabels <- factor(sapply(type$object, 
			function(x){unlist(strsplit(x, ':'))[2]}))
	colorLabels <- colorLabels[match(igraph::V(g)$name, type$subject)]
	out <- plot(g, vertex.label.color='black', vertex.frame.color='black',
		edge.color='black', edge.arrow.size=0.5, 
		vertex.color=col[as.numeric(colorLabels)], ...)
	out
	legend(x=-1.5, y=-1.1, sort(unique(colorLabels)), pch=21,
		col="#777777", pt.bg=col[1:max(as.numeric(colorLabels))], 
		pt.cex=2, cex=0.8, bty="n", ncol=1)
})


#' Returns true if the specified object is a member of the package
#' @param x A DataPackage object
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("containsId", function(x, ...) {
    standardGeneric("containsId")
})

#' @rdname containsId
#' @param identifier The DataObject identifier to check for inclusion in the DataPackage
#' @return A logical - a value of TRUE indicates that the DataObject is in the DataPackage
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' id <- "myNewId"
#' do <- new("DataObject", id=id, dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' isInPackage <- containsId(dp, identifier="myNewId")
#' @export
setMethod("containsId", signature("DataPackage"), function(x, identifier) {
    obj <- x@objects[[identifier]]
    found <- !is.null(obj)
    return(found)
})

#' Remove the Specified Member from the Package
#' @description Given the identifier of a DataObject in a DataPackage, delete the DataObject
#' from the DataPackage.
#' @param x a DataPackage object
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export 
    setGeneric("removeMember", function(x, ...) {
  standardGeneric("removeMember")
})

#' @rdname removeMember
#' @param do The package member to remove, either as a \code{"DataObject"} or \code{"character"} (for the object identifier)
#' @param removeRelationships A \code{logical} value. If TRUE, package relationships for this package member are removed. Default is FALSE.
#' @details The \code{removeMember} method removes the specified DataObject from the DataPackage. In 
#' addition, any package relationships that included the DataObject are removed.
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", id="myNewId", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' # Remove the package member and any provenance relationships that reference it.
#' removeMember(dp, "myNewId", removeRelationships=TRUE)
#' @export
setMethod("removeMember", signature("DataPackage"), function(x, do, removeRelationships=FALSE) {
    
    # Note: the dataone 2.0.1 version did not have a removeRelationships parameter, so in order
    # to maintain the same behaviour as the 2.0.1 release, relationships are not deleted by default.
    identifier <- NA_character_
    if(inherits(do, "DataObject")) {
        identifier <- getIdentifier(do)
    } else if (inherits(do, "character")) {
        identifier <- do
    } else {
        stop(sprintf("Unknown type \"%s\"for parameter '\"do\""), class(do))
    }
    
    # To delete an entry, set it to NULL
    x@objects[[identifier]] <- NULL
    
    relations <- data.frame()
    # The DataObject is being removed, and the relationships that it appears in
    # will also be removed.
    if(removeRelationships) {
        # Get the current package relationships
        if (!(is.null(x@relations[["relations"]]))) {
            relations <- x@relations[["relations"]]
        } else {
            return(x)
        }
        
        newRels <- data.frame()
        # Remove all occurences of this identifier from the provenance relationships
        # when it appears in either the subject or object of a relationship
        if(nrow(relations) > 0) {
            for(irel in seq_len(nrow(relations))) {
                subject <- relations[irel, 'subject']        
                object <- relations[irel, 'object']        
                testSubject <- checkIdMatch(subject, pattern='.*%s$', identifier)
                if(!is.na(testSubject)) next
                testObject <- checkIdMatch(object, pattern='.*%s$', identifier)
                if(!is.na(testObject)) next
                # Didn't find the identifier, keep this relationship 
                newRels <- rbind(newRels, relations[irel,])
            }
        }
        x@relations[["relations"]] <- newRels
    }
    
    x@relations[["updated"]] <- TRUE
    return(x)
})

#' Replace the raw data or file associated with a DataObject
#' @description A DataObject is a container for data that can be either an R raw object or
#' a file on local disk. The \code{replaceMember} method can be used to update the
#' date that a DataObject contains, for a DataObject that is a member of a DataPackage, 
#' substituting a new file or raw object in the specified DataObject.
#' @param x A DataPackage instance
#' @param do A DataObject instance
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export 
setGeneric("replaceMember", function(x, do, ...) {
  standardGeneric("replaceMember")
})

#' @rdname replaceMember
#' @details The data that is replacing the existing DataObject data may be of a different
#' format or type than the existing data. Because the data type and format may change, the
#' system metadata that describes the data can be updated as well. The \code{replaceMember}
#' method will update the SystemMetadata \code{size}, \code{checksum} values automatically, 
#' but does not update the \code{formatId}, \code{mediaType}, \code{mediaTypeProperty}
#' unless requested, so these should be specified in the call to \code{replaceMember} if necessary. 
#' If the \code{newId} argument is used, the specified new identifier will be assigned to the 
#' object, otherwise one will be generated if necessary. This new identifier will be used
#' if the DataPackage is uploaded to DataONE, and this object is updating an existing object in DataONE.
#' @param replacement A \code{raw} object or \code{character} (for filename) that will replace the current value in the DataObject \code{do}.
#' @param formatId A value of type \code{"character"}, the DataONE object format for the object.
#' @param mediaType A value of type \code{"character"}, the IANA Media Type (aka MIME-Type) of the object, e.g. "text/csv".
#' @param mediaTypeProperty A value of type \code{"list"} of \code{"character"}, IANA Media Type properties for the \code{"mediaType"} argument.
#' @param newId A value of type \code{"character"} which will replace the identifier for this DataObject.
#' @examples
#' # Create a DataObject and add it to the DataPackage
#' dp <- new("DataPackage")
#' doIn <- new("DataObject", format="text/csv", 
#'             filename=system.file("./extdata/pkg-example/binary.csv", package="datapack"))
#' dp <- addMember(dp, doIn)
#' 
#' # Use the zipped version of the file instead by updating the DataObject
#' dp <- replaceMember(dp, doIn, 
#'           replacement=system.file("./extdata/pkg-example/binary.csv.zip", 
#'           package="datapack"),
#'                     formatId="application/zip")
#' @export
setMethod("replaceMember", signature("DataPackage"), function(x, do, replacement, formatId=NA_character_, mediaType=NA_character_, 
                                                              mediaTypeProperty=NA_character_,
                                                              newId=NA_character_, ...) {
    
    
    # The DataObject to change argument can be either a DataObject or identifier. Determine which one
    # and put the object out of the package so that we can modify it and replace it.
    id <- NA_character_
    if (inherits(do, "DataObject")) {
        id <- getIdentifier(do)
        if(! id  %in% getIdentifiers(x)) {
            stop(sprintf("DataObject for id \"%s\" was not found in the DataPackage", id))
        }
    } else if (inherits(do, "character")) {
        id <- do
        if(! id %in% getIdentifiers(x)) {
            stop(sprintf("DataObject for id \"%s\" was not found in the DataPackage\n", id))
        }
        do <- getMember(x, id)
    } else {
        stop(sprintf("Unknown type \"%s\"for parameter '\"do\"\n", class(do)))
    }
    
    # If replacement param is a DataObject, then use it as is, otherwise, use a copy of the
    # old DataObject, and revise it.
    if(inherits(replacement, "DataObject")) {
        newObj <- replacement
    } else {
        newObj <- getMember(x, id)
    }
    # Assign a new identifier the first time this routine is called. If this object
    # was downloaded from a repository, it will have an '@oldId' assigned.
    # If '@oldId' has been assigned, then a new id must be assigned to the sysmeta, as
    # this will be required during uploaded/updated to the repo.
    if(!is.na(newObj@oldId)) {
        # If this is the first time the object is modified, old and current will be the same,
        # so assign a new id if they are.
        if(newObj@oldId == getIdentifier(newObj)) {
            if(is.na(newId)) {
                newId <- sprintf("urn:uuid:%s", uuid::UUIDgenerate())
                newObj@sysmeta@identifier <- newId
            } else {
                newObj@sysmeta@identifier <- newId
            }
        } else {
            # This isn't the first time this object has been modified, so only assign a new
            # id if the user requested it.
            if(!is.na(newId)) {
                newObj@sysmeta@identifier <- newId
            }
        }
    } else {
        # This object must not have been downloaded from a repo, because @oldId is not assigned,
        # so only need to assign a new id if the user specified it.
        # If the user specified a new identifier to use ('newId' argument) the assign it now. 
        if(!is.na(newId)) {
            newObj@sysmeta@identifier <- newId
        }
    }
    
    # If replacement is a DataObject, then replace the existing DataObject 'do' with the
    # DataObject 'replacement'
    algorithm <- x@sysmeta@checksumAlgorithm
    if (is.raw(replacement)) {
        newObj@data <- replacement
        newObj@filename <- NA_character_
        newObj@sysmeta@size <- length(newObj@bytes)
        newObj@sysmeta@checksum <- calculateChecksum(newObj, checksumAlgorithm=algorithm)
        newObj@sysmeta@checksumAlgorithm <- algorithm
    } else if (inherits(replacement, "character")) {
        # If 'replacement' is a character string, then it is
        # assumed to be a filename that replaces the DataObjects existing filename
        if(!file.exists(replacement)) {
            stop(sprintf("File %s not found.", replacement))
        }
        fileinfo <- file.info(replacement)
        newObj@filename <- replacement
        newObj@sysmeta@fileName <- basename(replacement)
        newObj@data <- raw()
        newObj@sysmeta@size <- fileinfo$size
        newObj@sysmeta@checksum <- calculateChecksum(newObj, checksumAlgorithm=algorithm)
        newObj@sysmeta@checksumAlgorithm <- algorithm
    } else if (!inherits(replacement, "DataObject")) {
        stop(sprintf("Unknown replacement type: %s\n", class(replacement)))
    }
    
    # Update these selected sysmeta fields if they were specified in the call.
    if(!is.na(formatId)) {
        newObj@sysmeta@formatId <- formatId
    }
    if(!is.na(mediaType)) {
        newObj@sysmeta@mediaType <- mediaType
    }
    if(!is.na(mediaTypeProperty)) {
        newObj@sysmeta@mediaTypeProperty <- mediaTypeProperty
    }
    
    x <- removeMember(x, do, removeRelationships=FALSE)
    newObj@updated[['data']] <- TRUE
    newObj@updated[['sysmeta']] <- TRUE
    x <- addMember(x, newObj)
    
    if(!is.na(newId)) {
        # Update the identifier in the package relationships, replacing the old with the new.
        x <- updateRelationships(x, id=id, newId=newId)
    }
    
    return(x)
})

#' Update selected elements of the XML content of a DataObject in a DataPackage (aka package member).
#' @param x a DataPackage instance
#' @param do A DataObject instance object, or DataObject identifier
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export 
setGeneric("updateMetadata", function(x, do, ...) {
    standardGeneric("updateMetadata")
})

#' @rdname updateMetadata
#' @description A DataObject that contains an XML document can be edited by specifying a path
#' to the elements to edit (an XPath expression) and a value to replace the text node. 
#' @details This method requires some knowledge of the structure of the metadata document as well
#' as facility with the XPath language. If the \code{newId} argument is used, the specified new 
#' identifier will be assigned to the object, and the previous identifier will be stored in the \code{oldId} slot, 
#' for possible use when updating the DataObject to a repository. If \code{newId} is not used, a new
#' identifier will be generated for the DataObject only the first time that updateMetadata is called for
#' a particular object in a DataPackage.
#' @param xpath A \code{character} value specifying the location in the XML to update.
#' @param replacement A \code{character} value that will replace the elements found with the \code{xpath}.
#' @param newId A value of type \code{"character"} which will replace the identifier for this DataObject.
#' @examples
#' # Create a DataObject and add it to the DataPackage
#' dp <- new("DataPackage")
#' sampleMeta <- system.file("./extdata/sample-eml.xml", package="datapack")
#' id <- "1234"
#' metaObj <- new("DataObject", id="1234", format="eml://ecoinformatics.org/eml-2.1.1", 
#'                 file=sampleMeta)
#' dp <- addMember(dp, metaObj)
#' 
#' # In the metadata object, insert the newly assigned data 
#' xp <- sprintf("//dataTable/physical/distribution[../objectName/text()=\"%s\"]/online/url", 
#'               "sample-data.csv") 
#' newURL <- sprintf("https://cn.dataone.org/cn/v2/resolve/%s", "1234")
#' dp <- updateMetadata(dp, id, xpath=xp, replacement=newURL)
#' @export
setMethod("updateMetadata", signature("DataPackage"), function(x, do, xpath, replacement, 
                                                              newId=NA_character_, ...) {
    
    # The DataObject to change argument can be either a DataObject or identifier. Determine which one
    # and put the object out of the package so that we can modify it and replace it.
    id <- NA_character_
    if (inherits(do, "DataObject")) {
        id <- getIdentifier(do)
        metaObj <- do
        if(! id  %in% getIdentifiers(x)) {
            stop(sprintf("DataObject for id \"%s\" was not found in the DataPackage", id))
        }
    } else if (inherits(do, "character")) {
        id <- do
        metaObj <- getMember(x, id)
        if(! id %in% getIdentifiers(x)) {
            stop(sprintf("DataObject for id \"%s\" was not found in the DataPackage", id))
        }
    } else {
        stop(sprintf("Unknown type \"%s\"for parameter '\"do\""), class(do))
    }
    
    if(!inherits(replacement, "character")) {
        stop(sprintf("Invalid type \"%s\" for argument \"replacement\", it must be \"character\"",
                     class(replacement)))
    }
    
    if(! is.element(id, getIdentifiers(x))) {
        stop(sprintf("Invalid DataObject specified with \"do\", identifier \"%s\" is not a package member of \"x\"", id))
    }
    
    # Create a new DataObject with the modified XML
    newMetaObj <- updateXML(metaObj, xpath=xpath, replacement=replacement)
    x <- replaceMember(x, metaObj, replacement=newMetaObj, newId=newId, ...)
    
    return(x)
})

#' Return the Package Member by Identifier
#' @description Given the identifier of a member of the data package, return the DataObject
#' representation of the member.
#' @param x A DataPackage instance
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("getMember", function(x, ...) {
    standardGeneric("getMember")
})

#' @rdname getMember
#' @param identifier A DataObject identifier
#' @return A DataObject if the member is found, or NULL if not
#' @export
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", id="myNewId", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' do2 <- getMember(dp, "myNewId")
setMethod("getMember", signature("DataPackage"), function(x, identifier) {
    if (containsId(x, identifier)) {
        return(x@objects[[identifier]])
    } else {
        return(NULL)
    }
})

#' Return identifiers for objects that match search criteria
#' @description Return DataObjects or DataObject identifiers that match search terms.
#' @param x A DataPackage instance
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("selectMember", function(x, ...) {
    standardGeneric("selectMember")
})

#' @rdname selectMember
#' @details The \code{"selectMember"} method inspects the DataObject slot \code{"name"} for a match with \code{"value"}
#' for each DataObject in a DataPackage. Matching DataObjects are returned as a list containing either package member
#' identifiers (character) or the DataObjects themselves, depending on the value of the \code{as} parameter.
#' @param name The name of the DataObject slot to inspect, for example "sysmeta@formatId".
#' @param value A character or logical value to match. If specified as a character value, PERL style regular expressions can be used (see ?grepl).
#' @param as A character value to specify the return type, either "DataObject" or "character" (the default)
#' @return A list of matching DataObjects or DataObject identifiers. The default is to return a list of 
#' DataObject identifiers.
#' @export
#' @examples
#' #' library(datapack)
#' dp <- new("DataPackage")
#' # Add the script to the DataPackage
#' progFile <- system.file("./extdata/pkg-example/logit-regression-example.R", package="datapack")
#' # An 'id' parameter is not specified, so one will be generated automatically.
#' progObj <- new("DataObject", format="application/R", filename=progFile)
#' dp <- addMember(dp, progObj)
#' 
#' # Add a script input to the DataPackage
#' inFile <- system.file("./extdata/pkg-example/binary.csv", package="datapack") 
#' inObj <- new("DataObject", format="text/csv", filename=inFile)
#' dp <- addMember(dp, inObj)
#' 
#' # Add a script output to the DataPackage
#' outFile <- system.file("./extdata/pkg-example/gre-predicted.png", package="datapack")
#' outObj <- new("DataObject", format="image/png", file=outFile)
#' dp <- addMember(dp, outObj)
#' 
#' # Now determine the package member identifier for the R script
#' progIds  <- selectMember(dp, name="sysmeta@formatId", value="application/R", as="character")
#' inputId <- selectMember(dp, name="sysmeta@fileName", value="binary.csv")
setMethod("selectMember", signature("DataPackage"), function(x, name, value, as="character") {
    # First look at the top level slot names for a match with 'field'
    valid <- c("character", "DataObject")
    if(!as %in% valid) {
        stop(sprintf("The value for parameter \"as\" must be one of %s", paste0(valid, collapse=", ")))
    }
    matches <- list()
    if(length(names(x@objects)) > 0) {
        for(iKey in names(x@objects)) {
            slotStr <- sprintf("x@objects[[\'%s\']]@%s", iKey, as.character(name))
            testValue <- eval(parse(text=slotStr))
            if(identical(testValue, value) || grepl(as.character(value), testValue, perl=TRUE)) {
                if(as == "character") {
                    matches[[length(matches)+1]] <- iKey
                } else {
                    matches[[length(matches)+1]] <- getMember(x, iKey)
                }
            }
        }
    } else {
        stop("The specified package has no members")
    }
    if(length(matches) > 0) {
        if(length(matches) == 1) {
            return(matches[[1]])
        } else {
            return(unlist(matches))
        }
    } else {
        return(matches)
    }
})
#' Set values for selected DataPackage members.
#' @description The \code{'setValue'} method is used to modify values stored in DataPackage members.
#' Each member in a DataPackage is a DataObject which is an R S4 object that contains a set of values (slots).
#' The available slots are described at \code{help("DataObject-class")}.
#' @param x A DataPackage instance
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("setValue", function(x, ...) {
  standardGeneric("setValue")
})

#' @rdname setValue
#' @details If the parameter \code{identifiers} is provided, then DataPackage members that
#' have identifiers specified in the list will be updated. If this parameter is not provided
#' then no members will be updated. To update all members in a package, specify the
#' value of \code{identifiers=getIdentifiers(pkg)} where \code{pkg} is the variable name
#' of the DataPackage to update. Note that this method can be used to update the
#' \code{data} or \code{filenane} slots, but it is instead recommended to us the
#' \code{replaceMember} method to achieve this, as the \code{replaceMember} method assists 
#' in properly setting the related SystemMetadata values.
#' @param name A DataObject slot name.
#' @param value A new value to assign to the slot for selected DataPackage members.
#' @param identifiers A list of identifiers of DataPackage members to update.
#' @return A DataPackage with possibly updated DataObjects.
#' @export
#' @examples
#' # First create a package that we can modify. 
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' # The next statment sets the format type incorrectly as an example, so we can correct it later
#' do <- new("DataObject", id="myNewId", dataobj=data, format="image/jpg", user="jsmith")
#' dp <- addMember(dp, do)
#' data <- charToRaw("7,8.9\n4,10,11")
#' # This next statement also sets the format type incorrectly
#' do <- new("DataObject", id="myNewId2", dataobj=data, format="image/jpg", user="jsmith")
#' dp <- addMember(dp, do)
#' # Change format types to correct value for both package members
#' # Careful! Specifying 'identifiers=getIdentifiers(dp) will update all package members!
#' dp <- setValue(dp, name="sysmeta@formatId", value="text/csv", identifiers=getIdentifiers(dp))
setMethod("setValue", signature("DataPackage"), function(x, name, value, identifiers=NA_character_, ...) {
  # First look at the top level slot names for a match with 'field'
  matchingIds <- list()
  if(length(names(x@objects)) > 0) {
    for(iKey in names(x@objects)) {
      if(! iKey %in% identifiers) next
      tmpObj <- x@objects[[iKey]]
      # If a list is specified, we have to manualy convert it to a string that can be parsed,
      # as R strips the 'list' out and we no longer have a list. This technique only works
      # for a list of characters
      if (inherits(value, "list")) {
          saveQuotes <- getOption("useFancyQuotes")
          options(useFancyQuotes = FALSE)
          strList <- paste(dQuote(value), collapse=",")
          strList <- sprintf("list(%s)", strList)
          setStr <- sprintf("x@objects[[\'%s\']]@%s <- %s", iKey, as.character(name), strList)
          options(useFancyQuotes = saveQuotes)
      } else  if (inherits(value, "character")) {
          if(is.na(value)) {
              setStr <- sprintf("x@objects[[\'%s\']]@%s <- as.character(%s)", iKey, as.character(name), value)
          } else {
              setStr <- sprintf("x@objects[[\'%s\']]@%s <- \"%s\"", iKey, as.character(name), value)
          }
      } else {
          setStr <- sprintf("x@objects[[\'%s\']]@%s <- %s", iKey, as.character(name), value)
      }
      eval(parse(text=setStr))
      if (! identical(tmpObj@sysmeta, x@objects[[iKey]]@sysmeta)) x@objects[[iKey]]@updated[['sysmeta']] <- TRUE
      if (length(tmpObj@data) != length(x@objects[[iKey]]@data)) x@objects[[iKey]]@updated[['data']] <- TRUE
      if (identical(tmpObj@filename, x@objects[[iKey]]@filename)) x@objects[[iKey]]@updated[['data']] <- TRUE
      #if(identical(testValue, value)) matchingIds[[length(matchingIds)+1]] <- iKey
    }
  } else {
    stop("The specified package has no members")
  }
  return(x)
})

#' Get values for selected DataPackage members.
#' @description Given a slot name and set of package member identifiers, return slot values.
#' @param x A DataPackage instance
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("getValue", function(x, ...) {
    standardGeneric("getValue")
})

#' @rdname getValue
#' @details If the parameter \code{identifiers} is provided, then only the DataPackage
#' members that have identifiers in the provided list will have there values fetched.
#' If this parameter is not provided, then the values for all DataPackage members are returned.
#' @param name A name of a DataObject slot.
#' @param identifiers A list of DataPackage member identifiers
#' @return A list of values for matching slot names and included identifiers.
#' @export
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", id="myNewId", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' data <- charToRaw("7,8.9\n4,10,11")
#' do <- new("DataObject", id="myNewId2", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' formats <- getValue(dp, name="sysmeta@formatId")
setMethod("getValue", signature("DataPackage"), function(x, name, identifiers=NA_character_) {
    values <- list()
    if(is.na(identifiers)) identifiers <- getIdentifiers(x)
    if(length(names(x@objects)) > 0) {
        for(iKey in names(x@objects)) {
            if(! iKey %in% identifiers) next
            value <- NA_character_
            slotStr <- sprintf("value <- x@objects[[\'%s\']]@%s", iKey, as.character(name))
            eval(parse(text=slotStr))
            #values[[length(values)+1]] <- value
            values[[iKey]] <- value
        }
    } else {
        stop("The specified package has no members")
    }
    values
})

#' @rdname setPublicAccess
#' @aliases setPublicAccess
#' @param identifiers A list of \code{character} values containing package member identifiers that will be updated (default is all package members).
#' @return A DataPackage with modified access rules.
#' @seealso \code{\link{DataPackage-class}}
#' @examples
#' # First create a sample package with two DataObjects
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", id="id1", dataobj=data, format="text/csv")
#' dp <- addMember(dp, obj)
#' data2 <- charToRaw("7,8,9\n4,10,11\n")
#' obj2 <- new("DataObject", id="id2", dataobj=data2, format="text/csv")
#' dp <- addMember(dp, obj2)
#' # Now add public read to all package members ("id1", "id2")
#' dp <- setPublicAccess(dp)
setMethod("setPublicAccess", signature("DataPackage"), function(x, identifiers=list()) {
    if(length(identifiers) == 0) identifiers <- getIdentifiers(x)
    if(length(names(x@objects)) > 0) {
        for(iKey in names(x@objects)) {
            if(! iKey %in% identifiers) next
            obj <- getMember(x, identifier=iKey)
            obj <-  setPublicAccess(obj)
            obj@updated[['sysmeta']] <- TRUE
            x <- removeMember(x, obj, removeRelationships=FALSE)
            x <- addMember(x, obj)
        }
    } else {
        stop("The specified package has no members")
    }
    return(x)
})

#' @rdname addAccessRule
#' @details Note that when \code{addAccessRule} is called with a `DataPackage` argument, the 
#' additional parameter \code{identifiers} can be used:
#' \itemize{
#'   \item{identifiers A list of \code{character} values containing package member identifiers that the access rule will be applied to (all members is the default)}.
#' }
#' @return The DataPackage with updated DataObject access policies
#' @seealso \code{\link{DataPackage-class}}
#' @examples 
#' # Add an access rule to members of a DataPackage
#' # First create a sample DataPackage
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", id="id1", dataobj=data, format="text/csv")
#' dp <- addMember(dp, obj)
#' data2 <- charToRaw("7,8,9\n4,10,11\n")
#' obj2 <- new("DataObject", id="id2", dataobj=data2, format="text/csv")
#' dp <- addMember(dp, obj2)
#' # Add access rule to all package members
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write", getIdentifiers(dp))
setMethod("addAccessRule", signature("DataPackage"), function(x, y, ...) {
    argList <- list(...)
    argListLen <- length(argList)
    # Check for "identifiers" as named argument, i.e. 'identifiers=c("id1", "id2")'
    if (!"identifiers" %in% names(argList)) {
        # User has specified "permission=<blah>, identifiers=<blah>"
        if(argListLen >= 2) {
            identifiers <- argList[[2]]
        } else {
            identifiers <- list()
        }
    } 
   
    if(length(identifiers) == 0) identifiers <- getIdentifiers(x) 
    
    if(length(names(x@objects)) > 0) {
        for(iKey in names(x@objects)) {
            if(! iKey %in% identifiers) next
            obj <- getMember(x, identifier=iKey)
            obj <- addAccessRule(obj, y, ...)
            obj@updated[['sysmeta']] <- TRUE
            x <- removeMember(x, iKey, removeRelationships=FALSE)
            x <- addMember(x, obj)
        }
    }
    return(x)
})

#' @rdname clearAccessPolicy
#' @param identifiers A list of \code{character} values containing package member identifiers that the access rule will be applied to.
#' @return The SystemMetadata object with the cleared access policy.
#' @seealso \code{\link{DataPackage-class}}
#' @examples 
#' # Clear access policy for a DataPackage
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", dataobj=data, format="text/csv")
#' dp <- addMember(dp, obj)
#' data2 <- charToRaw("7,8,9\n4,10,11\n")
#' obj2 <- new("DataObject", dataobj=data2, format="text/csv")
#' dp <- addMember(dp, obj2)
#' 
#' # Add the access rule to all package members
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", 
#'     permission="write")
#' # Now clear the access policy for just the second object 
#' dp <- clearAccessPolicy(dp, getIdentifier(obj2))
#' 
#' @export
setMethod("clearAccessPolicy", signature("DataPackage"), function(x, identifiers=list(), ...) {
    if(length(identifiers)==0) identifiers <- getIdentifiers(x)
    if(length(names(x@objects)) > 0) {
        for(iKey in names(x@objects)) {
            if(! iKey %in% identifiers) next
            obj <- getMember(x, identifier=iKey)
            obj <- clearAccessPolicy(obj, ...)
            obj@updated[['sysmeta']] <- TRUE
            x <- removeMember(x, iKey, removeRelationships=FALSE)
            x <- addMember(x, obj)
        }
    }
    return(x)
})

#
#' @rdname hasAccessRule
#' @description If called for a DataPackage, then the SystemMetadata for DataObjects in the DataPackage are checked.
#' @param identifiers A list of \code{character} values containing package member identifiers for which the access rule will be checked.
#' @seealso \code{\link{DataPackage-class}}
#' @examples 
#' #
#' # Check access rules for member DataObjects of a DataPackage.
#' # First create an example DataPackage
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", id="id1", dataobj=data, format="text/csv")
#' dp <- addMember(dp, obj)
#' data2 <- charToRaw("7,8,9\n4,10,11\n")
#' obj2 <- new("DataObject", id="id2", dataobj=data2, format="text/csv")
#' dp <- addMember(dp, obj2)
#' # Add access rules to all package members
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write")
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission")
#' hasWrite <- hasAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write")
#' hasChange <- hasAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission")
#' @return When called for a DataPackage, boolean TRUE if the access rule exists in all specified package members already, FALSE otherwise
setMethod("hasAccessRule", signature("DataPackage"), function(x, subject, permission, identifiers=list(), ...) {
    found <- FALSE
    # Have to provide a list of member ids to check
    if(length(identifiers) == 0) identifiers <- getIdentifiers(x)
    if(length(getIdentifiers(x)) > 0) {
        for(id in getIdentifiers(x)) {
            if(! id %in% identifiers) next
            do <- getMember(x, id)
            # If even one specified member doesn't have the perms, then fail.
            if (!hasAccessRule(do, subject, permission)) return (FALSE)
        }
    }
    return(TRUE)
})

#' @rdname removeAccessRule
#' @return The Datapackage with members having updated access policies.
#' @param identifiers A list of \code{character} values containing package member identifiers that the access rule will be 
#' applied to (default is all package members).
#' @seealso \code{\link{DataPackage-class}}
#' @examples 
#' # 
#' # Remove access rules from a DataPackage.
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", id="id1", dataobj=data, format="text/csv")
#' dp <- addMember(dp, obj)
#' data2 <- charToRaw("7,8,9\n4,10,11\n")
#' obj2 <- new("DataObject", id="id2", dataobj=data2, format="text/csv")
#' dp <- addMember(dp, obj2)
#' # Add access rule to all package members
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write")
#' dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission" )
#' # Now take 'changePermission' away for user 'uid=smith...', specifying parameter 'y' 
#' # as a character string containing a 'subject'.
#' dp <- removeAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write")
#' dp <- removeAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission")
#' 
#' # Alternatively, parameter "y" can be a data.frame containing one or more access rules:
#' # Add write, changePermission for uid=jones,...
#' dp <- addAccessRule(dp, "uid=jones,ou=Account,dc=example,dc=com", "write")
#' dp <- addAccessRule(dp, "uid=jones,ou=Account,dc=example,dc=com", "changePermission")
#' # Now take privs for uid=jones,... away
#' accessRules <- data.frame(subject=c("uid=jones,ou=Account,dc=example,dc=com", 
#'                                      "uid=jones,ou=Account,dc=example,dc=com"), 
#'                                      permission=c("write", "changePermission"))
#' dp <- removeAccessRule(dp, accessRules)
setMethod("removeAccessRule", signature("DataPackage"), function(x, y, permission=NA_character_, 
                                                              identifiers=list(), ...) {
    if(length(identifiers) == 0) identifiers <- getIdentifiers(x)
    if(length(names(x@objects)) > 0) {
        for(iKey in names(x@objects)) {
            if(! iKey %in% identifiers) next
            obj <- getMember(x, identifier=iKey)
            obj <- removeAccessRule(obj, y, permission=permission, ...)
            x <- removeMember(x, iKey, removeRelationships=FALSE)
            x <- addMember(x, obj)
        }
    }
    return(x)
})

#' Create an OAI-ORE resource map from the package
#' @description The DataPackage is serialized as a OAI-ORE resource map to the specified file.
#' @param x A DataPackage object
#' @param ... Additional arguments
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("serializePackage", function(x, ...) {
  standardGeneric("serializePackage")
})

#' @rdname serializePackage
#' @details The resource map that is created is serialized by default as RDF/XML. Other serialization formats
#' can be specified using the \code{syntaxName} and \code{mimeType} parameters. Other available formats
#' include: 
#' \tabular{ll}{
#' \strong{syntaxName}\tab \strong{mimeType}\cr
#' json\tab application/json\cr
#' ntriples\tab application/n-triples\cr
#' turtle\tab text/turtle\cr
#' dot\tab text/x-graphviz\cr
#' } 
#' Note that the \code{syntaxName} and \code{mimeType} arguments together specify o serialization format.
#' 
#' Also, for packages that will be uploaded to the DataONE network, "rdfxml" is the only 
#' accepted format.  
#' 
#' The resolveURI string value is prepended to DataPackage member identifiers in the resulting resource map. 
#' If no resolveURI value is specified, then 'https://cn.dataone.org/cn/v1/resolve' is used.
#' @param file The file to which the ResourceMap will be serialized
#' @param id A unique identifier for the serialization. The default value is the id assigned
#' to the DataPackage when it was created.
#' @param syntaxName The name of the syntax to use for serialization - default is "rdfxml"
#' @param mimeType The mimetype of the serialized output - the default is "application/rdf+xml"
#' @param namespaces A data frame containing one or more namespaces and their associated prefix
#' @param syntaxURI URI of the serialization syntax
#' @param resolveURI A character string containing a URI to prepend to datapackage identifiers
#' @param creator A \code{character} string containing the creator of the package.
#' @export
#' @examples
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3\n4,5,6")
#' do <- new("DataObject", id="do1", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' data2 <- charToRaw("7,8,9\n10,11,12")
#' do2 <- new("DataObject", id="do2", dataobj=data2, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do2)
#' dp <- describeWorkflow(dp, sources=do, derivations=do2)
#' \dontrun{
#' td <- tempdir()
#' status <- serializePackage(dp, file=paste(td, "resmap.json", sep="/"), syntaxName="json",  
#'     mimeType="application/json")
#' status <- serializePackage(dp, file=paste(td, "resmap.xml", sep="/"), syntaxName="rdfxml", 
#'     mimeType="application/rdf+xml")
#' status <- serializePackage(dp, file=paste(td, "resmap.ttl", sep="/"), syntaxName="turtle", 
#'     mimeType="text/turtle")
#' }
setMethod("serializePackage", signature("DataPackage"), function(x, file, 
                                                                 id = NA_character_,
                                                                 syntaxName="rdfxml", 
                                                                 mimeType="application/rdf+xml", 
                                                                 namespaces=data.frame(namespace=character(), prefix=character(), stringsAsFactors=FALSE),
                                                                 syntaxURI=NA_character_, resolveURI=NA_character_,
                                                                 creator=NA_character_) {
  resMap <- getResourceMap(x, id, creator, resolveURI)
  status <- serializeRDF(resMap, file, syntaxName, mimeType, namespaces, syntaxURI)

  freeResourceMap(resMap)
  rm(resMap)
  return(status)
})

#' Serialize A DataPackage into a BagIt Archive File
#' @description The BagIt packaging format \url{https://tools.ietf.org/html/draft-kunze-bagit-08}
#'     is used to prepare an archive file that contains the contents of a DataPackage.
#' @details A BagIt Archive File is created by copying each member of a DataPackage, and preparing
#'     files that describe the files in the archive, including information about the size of the files
#'     and a checksum for each file. An OAI-ORE resource map is automatically created and added to the
#'     archive. These metadata files and the data files are then packaged into
#'     a single zip file. 
#' @param x A DataPackage object
#' @param ... Additional arguments
#' @seealso \code{\link{DataPackage-class}}
#' @examples
#' # Create the first data object
#' dp <- new("DataPackage")
#' data <- charToRaw("1,2,3,5,6")
#' do <- new("DataObject", id="do1", dataobj=data, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do)
#' # Create a second data object
#' data2 <- charToRaw("7,8,9,4,10,11")
#' do2 <- new("DataObject", id="do2", dataobj=data2, format="text/csv", user="jsmith")
#' dp <- addMember(dp, do2)
#' # Create a relationship between the two data objects
#' dp <- describeWorkflow(dp, sources="do2", derivations="do2")
#' # Write out the data package to a BagIt file
#' \dontrun{
#' bagitFile <- serializeToBagIt(dp, syntaxName="json", mimeType="application/json")
#' }
#' @export
setGeneric("serializeToBagIt", function(x, ...) {
  standardGeneric("serializeToBagIt")
})
#' @rdname serializeToBagIt
#' @importFrom zip zip
#' @import uuid
#' @import digest
#' @param mapId A unique identifier for the package resource map. If not specified, one will be automatically generated. 
#' @param syntaxName The name of the syntax to use for the resource map serialization, defaults to "rdfxml"
#' @param mimeType The mimetype for the resource map serialization, defaults to "application/rdf+xml".
#' @param namespaces An optional data frame containing one or more namespaces and their associated prefix for the resource map serialization.
#' @param syntaxURI An optional string specifying the URI for the resource map serialization.
#' @param resolveURI A character string containing a URI to prepend to datapackage identifiers for the resource map.
#' @param creator A \code{character} string containing the creator of the package.
#' @seealso For more information and examples regarding the parameters specifying the creation of the resource map, see \link{serializePackage}.
#' @return The file name that contains the BagIt zip archive.

setMethod("serializeToBagIt", signature("DataPackage"), function(x, mapId=NA_character_,
                                                                 syntaxName=NA_character_,
                                                                 namespaces=data.frame(),
                                                                 mimeType=NA_character_,
                                                                 syntaxURI=NA_character_,
                                                                 resolveURI=NA_character_, 
                                                                 creator=NA_character_, ...) {
    cwd <- getwd()
    on.exit(expr = setwd(cwd))
    
    # Create a temp working area where the BagIt directory structure will be created
    tmpDir <- tempdir()
    bagDir <- file.path(tmpDir, "bag")
    if(file.exists(bagDir)) {
        unlink(bagDir, recursive=TRUE)
    }
    # Create the bag directories
    dir.create(bagDir)
    payloadDir <- file.path(bagDir, "data")
    if(!file.exists(payloadDir)) dir.create(payloadDir)

    metadataDir <- file.path(bagDir, "metadata")
    if(!file.exists(metadataDir)) dir.create(metadataDir)
    sysMetaDir <- file.path(metadataDir, "sysmeta")
    if(!file.exists(sysMetaDir)) dir.create(sysMetaDir)

    # Create bagit.txt
    bagitFileText <- sprintf("BagIt-Version: 1.0\nTag-File-Character-Encoding: UTF-8")
    writeLines(bagitFileText, file.path(bagDir, "bagit.txt"))

    # Create a ResourceMap object and serialize it
    if(is.na(mapId)) {
        mapId <- sprintf("urn:uuid:%s", uuid::UUIDgenerate())
    }
    if(is.na(syntaxName)) {
        syntaxName="rdfxml"
    }
    if(is.na(mimeType)) {
        mimeType <- "application/rdf+xml"
    }
    if(is.na(resolveURI)) {
        resolveURI <- ""
    }
    tmpFile <- tempfile()
    serializePackage(x, file=tmpFile, id=mapId, syntaxName=syntaxName, namespaces=namespaces,
                     mimeType=mimeType, resolveURI=resolveURI, creator=creator)
    
    relFile <- file.path("oai-ore.xml")
    resMapFilepath <- file.path(metadataDir, relFile)
    file.copy(tmpFile, resMapFilepath)
    # Add resource map to the manifest
    resMapMd5 <- digest(resMapFilepath, algo="md5", file=TRUE)
    manifestFileTextLine <- sprintf("%s %s", resMapMd5, gsub(paste(".*bag", .Platform$file.sep, sep=""), "", resMapFilepath))
    writeLines(manifestFileTextLine, file.path(bagDir, "tagmanifest-md5.txt"))

    identifiers <- getIdentifiers(x)
    # Get a resource map so that it can be queried for science metadata
    resMap <- getResourceMap(x)
    # Check to see if the user supplied any science metadata
    scienceMetadataUris <- getScienceMetadataUris(resMap)
    # The number that's appended to the filename if multiple documents exist
    scienceMetadataCount <- 1
    # URLEncode each URI
    metadataIds <- lapply(scienceMetadataUris, utils::URLdecode)
    

    # Determines whether or not an object is in a list of URIs
    in_uri <- function(objectId, uriList) {
        for (uri in uriList) {
            if (grepl(objectId, uri, fixed=TRUE)) {
                return (TRUE)
            }
        }
        return (FALSE)
    }
    
    #' Recursively determines the name for a science metadata object.
    #' The base file name (eml, datacite, science-metadata, etc) should stay the same.
    #' Call the method with the base name and the number of existing files to start with.
    #' This is most likely 0.
    #' If there's a count defined, add it to the end of the file in ()
    #' Then call the method again with count += 1
    #' Eventually a free file name will be found, and then the function returns that name
    getMetadataName <- function(baseName, count) {
        if (count > 0) {
            scienceMetadataFilename <- sprintf('%s(%d).xml', baseName, count)
        } else {
            scienceMetadataFilename <- sprintf('%s.xml', baseName)
        }
        
        scienceMetadataPath <- file.path(metadataDir, scienceMetadataFilename)
        if (file.exists(scienceMetadataPath)) {
            # If one was already written, then append (1), (2), (3), etc to the file name
            # Add 1 to the count so that the next number is one higher
            scienceMetadataCount <- scienceMetadataCount+1
            return (getMetadataName(baseName, scienceMetadataCount))
        }
        return (scienceMetadataFilename)
    }

    # Get each member of the package and write it to disk
    for(idNum in seq_along(identifiers)) {
        dataObj <- getMember(x, identifiers[idNum])
        systemMetadata <- dataObj@sysmeta
        # Determine the file name and path of the data object.
        if (!is.na(dataObj@targetPath)) {
            dataObjectLocation <- file.path(payloadDir, dataObj@targetPath)
        } else if (!is.na(systemMetadata@fileName)) {
            # Otherwise, if they specified a file name use that
            # The file name slot is a full path, only get the file name portion of it
            dataObjectLocation <-file.path(payloadDir, basename(systemMetadata@fileName))
        } else {
            # If the file name wasn't specified, use the identifier
            dataObjectLocation <- file.path(payloadDir, getIdentifier(dataObj))
        }
        
        # Check if the object is a science metadata document by checking if its identifier is in the list of
        # science metadata identifiers
        objectId = getIdentifier(dataObj)
        if(in_uri(objectId=objectId, uriList=metadataIds)) {
            scienceMetadataFilename <- getFormatFilename(getFormatId(dataObj))
            scienceMetadataFilename <- getMetadataName(scienceMetadataFilename, 0)
            writeToBag(dataObj, file.path(metadataDir, scienceMetadataFilename), bagDir, sysMetaDir,
                       isScienceMetadata=TRUE)
        } else {
           # Otherwise it's a plain data object
           writeToBag(dataObj, dataObjectLocation, bagDir, sysMetaDir)
        }
    }

    dataInfo <- file.info(list.files(payloadDir, full.names=TRUE, recursive=TRUE))
    payloadBytes <- sum(dataInfo[['size']])
    tagInfo <- file.info(list.files(metadataDir, full.names=TRUE, recursive=TRUE))
    tagBytes <- sum(tagInfo[['size']])
    bagBytes = tagBytes + payloadBytes
    # Convert the value returned from file.info (bytes) into a more human readable form.
    # Use SI convention for defining size
    if(bagBytes < 1024) {
        bagSize <- bagBytes
        sizeUnits <- "B"
    } else if (bagBytes < 1048576) {
        # Size is displayed in Kilobytes
        bagSize <- bagBytes / 1024
        sizeUnits <- "KB"
    } else if (bagBytes < 1073741824) {
        # Size is displayed in megabytes
        bagSize <- bagBytes / 1048576
        sizeUnits <- "MB"
    } else {
        # Size is displayed in gigabytes
        bagSize <- bagBytes / 1073741824
        sizeUnits <- "GB"

    }

    bagInfo <- sprintf("Payload-Oxum: %d.%d\nBagging-Date: %s\nBag-Size: %f %s",
                       bagBytes, length(list.files(payloadDir)),
                       format(Sys.time(), format="%Y-%m-%d"), 
                       bagSize, sizeUnits)

    # Create bag-info.txt
    writeLines(bagInfo, file.path(bagDir, "bag-info.txt"))
    # Add the minimum required tag files
    tagFiles <- c("bag-info.txt", "bagit.txt")
    for (i in seq_along(tagFiles)) {
        thisFile <- tagFiles[i]
        thisMd5 <- digest(file.path(bagDir, thisFile), algo="md5", file=TRUE)
        write(sprintf("%s %s", thisMd5, thisFile),file=file.path(bagDir, "tagmanifest-md5.txt"),append=TRUE)
    }
    zipFile <- tempfile(fileext=".zip")
    # Now zip up the directory
    setwd(normalizePath(bagDir))
    if(normalizePath(getwd()) != normalizePath(bagDir)) {
        stop(sprintf("Unable to set working directory to the BagIt dir: %s", bagDir))
    }
    zip::zip(zipFile, files=list.files(recursive=TRUE))
    # Return the zip file name
    return(zipFile)
})

#' @title Add data derivation information to a DataPackage
#' @description Add information about the relationships among DataObject members 
#' in a DataPackage, retrospectively describing the way in which derived data were 
#' created from source data using a processing program such as an R script.  These provenance
#' relationships allow the derived data to be understood sufficiently for users
#' to be able to reproduce the computations that created the derived data, and to
#' trace lineage of the derived data objects. The method \code{describeWorkflow} 
#' will add provenance relationships between a script that was executed, the files 
#' that it used as sources, and the derived files that it generated.
#' @details This method operates on a DataPackage that has had DataObjects for 
#' the script, data sources (inputs), and data derivations (outputs) previously 
#' added to it, or can reference identifiers for objects that exist in other DataPackage
#' instances. This allows a user to create a standalone package that contains all of
#' its source, script, and derived data, or a set of data packages that are chained
#' together via a set of derivation relationships between the members of those packages.
#' 
#' Provenance relationships are described following the the ProvONE data model, which
#' can be viewed at \url{https://purl.dataone.org/provone-v1-dev}.  In particular, 
#' the following relationships are inserted (among others):
#' \itemize{
#'  \item{\code{prov:used}} {indicates which source data was used by a program execution}
#'  \item{\code{prov:generatedBy}} {indicates which derived data was created by a program execution}
#'  \item{\code{prov:wasDerivedFrom}} {indicates the source data from which derived data were created using the program}
#' }
#'   
#' @param x The \code{DataPackage} to add provenance relationships to.
#' @param ... Additional parameters
setGeneric("describeWorkflow", function(x, ...) {
    standardGeneric("describeWorkflow")
})

#' @rdname describeWorkflow
#' @param sources A list of DataObjects for files that were read by the program. Alternatively, a list 
#' of DataObject identifiers can be specified as a list of character strings.
#' @param program The DataObject created for the program such as an R script. Alternatively the DataObject identifier can
#' be specified. 
#' @param derivations A list of DataObjects for files that were generated by the program. Alternatively, a list 
#' of DataObject identifiers can be specified as a list of character strings.
#' @param insertDerivations A \code{logical} value. If TRUE then the provenance relationship 
#'     \code{prov:wasDerivedFrom} will be used to connect every source and derivation. The default value 
#'     is TRUE.
#' @seealso The R 'recordr' package for run-time recording of provenance relationships.
#' @import uuid
#' @export
#' @examples
#' library(datapack)
#' dp <- new("DataPackage")
#' # Add the script to the DataPackage
#' progFile <- system.file("./extdata/pkg-example/logit-regression-example.R", package="datapack")
#' progObj <- new("DataObject", format="application/R", filename=progFile)
#' dp <- addMember(dp, progObj)
#' 
#' # Add a script input to the DataPackage
#' inFile <- system.file("./extdata/pkg-example/binary.csv", package="datapack") 
#' inObj <- new("DataObject", format="text/csv", filename=inFile)
#' dp <- addMember(dp, inObj)
#' 
#' # Add a script output to the DataPackage
#' outFile <- system.file("./extdata/pkg-example/gre-predicted.png", package="datapack")
#' outObj <- new("DataObject", format="image/png", file=outFile)
#' dp <- addMember(dp, outObj)
#' 
#' # Add the provenenace relationshps, linking the input and output to the script execution
#' # Note: 'sources' and 'derivations' can also be lists of "DataObjects" or "DataObject' identifiers
#' dp <- describeWorkflow(dp, sources = inObj, program = progObj, derivations = outObj) 
#' # View the results
#' utils::head(getRelationships(dp))
setMethod("describeWorkflow", signature("DataPackage"), function(x, sources=list(), 
                                                                  program=NA_character_, 
                                                                  derivations=list(), insertDerivations=TRUE, ...) {
    
    # Check each "source" list member and check if it is the correct type, either
    # DataObject or character (for DataObject id). Build a list of member ids for
    # use later.
    inIds <- list()
    # Special case, if the user passed in a single DataObject for sources or derivations,
    # convert it to a list to facilitate easier processing in tests below.
    if(inherits(sources, "DataObject")) sources <- list(sources)
    if(inherits(derivations, "DataObject")) derivations <- list(derivations)
    
    # Warn user if they haven't provided enough info to insert any prov relationships
    if(missing(program)) {
        if(length(sources) == 0) {
            stop("Both arguments \"program\" and \"sources\" are missing.")
        }
        if(length(derivations) == 0) {
            stop("Both arguments \"program\" and \"derivations\" are missing.")
        }
    } else {
        # Program was specified, but no inputs, outputs
        if(length(sources) == 0 && length(derivations) == 0) {
            stop("Argument \"program\" is specified, but both \"sources\" and \"derivations\" are missing.")
        }
    }
    
    if(length(sources) > 0) {
        for (isrc in seq_along(sources)) {
            obj <- sources[[isrc]]
            if(inherits(obj, "DataObject")) {
                inIds[[length(inIds)+1]] <- getIdentifier(obj)
            } else if (inherits(obj, "character")) {
                inIds[[length(inIds)+1]] <- obj
            } else {
                stop(sprintf("Invalid type \'%s\' for source[[%s]]", class(obj), isrc))
            }
        }
    }
    # Check each "derivation" list member and check if it is the correct type, either
    # DataObject or character (for DataObject id). Build a list of member ids for use
    # later.
    # Special case, if the user passed in a single DataObject for inputs, stick it in 
    # a list.
    outIds <- list()
    if(length(derivations) > 0) {
        for (idst in seq_along(derivations)) {
            obj <- derivations[[idst]]
            if(inherits(obj, "DataObject")) {
                outIds[[length(outIds)+1]] <- getIdentifier(obj)
            } else if (inherits(obj, "character")) {
                outIds[[length(outIds)+1]] <- obj
            } else {
                stop(sprintf("Invalid type \'%s\' for derivations[[%s]]", class(obj), idst))
            }
        }
    }
    
    if(inherits(program, "DataObject")) {
        scriptId <- getIdentifier(program)
    } else if (inherits(program, "character")) {
        if(!is.na(program)) {
            scriptId <- program
        } else {
            scriptId <- NA_character_
        }
    } else {
        stop(sprintf("Invalid type \'%s\' for program", class(program)))
    }
    # Check that pids are in the data package
    pkgIds <- getIdentifiers(x)
    if(!is.na(scriptId)) {
        if(!is.element(scriptId, pkgIds)) {
            stop(sprintf("Argument \'program\'is not a package memmber."))
        }
    }
    # Process inputs and outputs separately from the program identifier, as there may not
    # have been a program specified.
    if(length(inIds) > 0) {
        for (iCnt in seq_along(inIds)) {
            pid <- inIds[[iCnt]]
            # This pid is not a package member, so add it to the list of external pids
            if (!is.element(pid, pkgIds)) {
                x@externalIds[[length(x@externalIds)+1]] <- pid
            }
            x <- insertRelationship(x, subjectID=pid, objectIDs=provONEdata, predicate=rdfType, objectTypes="uri")
        }
    }
    if(length(outIds) > 0) {
        for (iCnt in seq_along(outIds)) {
            pid <- outIds[[iCnt]]
            # This pid is not a package member, so add it to the list of external pids
            if (!is.element(pid, pkgIds)) {
                x@externalIds[[length(x@externalIds)+1]] <- pid
            }
            x <- insertRelationship(x, subjectID=pid, objectIDs=provONEdata, predicate=rdfType, objectTypes="uri")
        }
    }
    # If a program argument was not specified, then just record derivations from the inputs to the
    # outputs
    if(!is.na(scriptId)) {
        # The script identifier must be for the local package, it is not supported to specify
        # a script from another package using this method, primarily because it is the identifier of the Execution
        # object that is required to establish the relationships "executionPid -> prov:used -> dataPid" and 
        # "dataPid -> wasGeneratedBy -> executionPid". We do not know the Execution id for the scriptId, which
        # is needed to set the other relationships required for indexing.
        if(grepl("\\s*https?:.*", scriptId, perl=TRUE)) {
            stop(sprintf("The \"program\" parameter must specify an identifier that is a member of the current package.\nThe identifier %s is not valid", scriptId))
        }
            
        # Currently we have to have a prov:execution associated with each R script, so that metacatui will
        # render the used and gen files with the R script, via the qualified association and hadPlan, OK!
        executionId <- sprintf("urn:uuid:%s", uuid::UUIDgenerate())
        # Qualified association to link the execution and each of the program (plan)
        associationId <- sprintf("_%s", uuid::UUIDgenerate())
        
        planId <- scriptId
        # Qualified association
        # Subject id of NA will cause a random blank node identifier to be produced
        x <- insertRelationship(x, subjectID=executionId, objectIDs=associationId, predicate=provQualifiedAssociation, objectTypes="blank")
        # Execution rdf type
        x <- insertRelationship(x, subjectID=executionId, objectIDs=provONEexecution, predicate=rdfType, objectTypes="uri")
        # prov:hadPlan
        x <- insertRelationship(x, subjectID=associationId, objectIDs=planId, predicate=provHadPlan, subjectType="blank", objectTypes="uri")
        # prov rdf type declaration for association
        x <- insertRelationship(x, subjectID=associationId, objectIDs=provAssociation, predicate=rdfType, subjectType="blank", objectTypes="uri")
        # prov rdf type declaration for program
        x <- insertRelationship(x, subjectID=planId, objectIDs=provONEprogram, predicate=rdfType, objectType="uri") 
        x <- insertRelationship(x, subjectID=executionId, objectIDs=executionId, predicate=DCTERMSidentifier, objectTypes="literal", dataTypeURIs=xsdString)
        
        # Process files used by the script
        if(length(inIds) > 0) {
            for (iCnt in seq_along(inIds)) {
                thisPid <- inIds[[iCnt]]
                # Record prov:used relationship between the input dataset and the execution
                x <- insertRelationship(x, subjectID=executionId, objectIDs=thisPid, predicate=provUsed)
            }
        }
        
        # Process files generated by the script
        if(length(outIds) > 0) {
            for (iCnt in seq_along(outIds)) {
                thisPid <- outIds[[iCnt]]
                # Record prov:wasGeneratedBy relationship between the output dataset and the execution
                x <- insertRelationship(x, subjectID=thisPid, objectIDs=executionId, predicate=provWasGeneratedBy)
            }
        }
    }
    
    if(insertDerivations) {
        # Record the 'prov:wasDerivedFrom' relationships, directly linking the output files to the input files.
        # This section can be run even if a 'program' argument is not defined.
        if(length(outIds) > 0 && length(inIds) > 0) {
            for (iOut in seq_along(outIds)) {
                outputId <- outIds[[iOut]]
                for (iIn in seq_along(inIds)) {
                    inputId <- inIds[[iIn]]
                    x <- insertRelationship(x, subjectID=outputId, objectIDs=inputId, predicate=provWasDerivedFrom)
                }
            }
        }
    }
    return(x)
})

#' Update package relationships by replacing an old identifier with a new one.
#' @description When package members are updated, they receive a new identifier (replaceMember). It is therefor
#' necessary to update the package relationships to update occurrences of the old identifier
#' with the new one when the old identifier appears in the "subject" or "object" of a 
#' relationship.
#' @param x A DataPackage object
#' @param ... (Not yet used)
#' @seealso \code{\link{DataPackage-class}}
#' @export
setGeneric("updateRelationships", function(x, ...) {
    standardGeneric("updateRelationships")
})

#' @rdname updateRelationships
#' @param id A character value containing the identifier to be replaced.
#' @param newId A character value containing the identifier that will replace the old identifier.
#' @export
setMethod("updateRelationships", signature("DataPackage"), function(x, id, newId, ...) {
    
   relations <- getRelationships(x) 
   x@relations = list()
   
   if(nrow(relations) > 0) {
     for (irow in seq_len(nrow(relations))) {
         subject <- relations[irow, 'subject']
         predicate <- relations[irow, 'predicate']
         object <- relations[irow, 'object']
         objectType <- relations[irow, 'objectType']
         subjectType <- relations[irow, 'subjectType']
         dataTypeURI <- relations[irow, 'dataTypeURI']
         
         testSubject <- checkIdMatch(subject, pattern='.*%s$', id)
         if(!is.na(testSubject)) subject <- newId
         testObject <- checkIdMatch(object, pattern='.*%s$', id)
         if(!is.na(testObject)) object <- newId
         x <- insertRelationship(x, subjectID=subject, objectIDs=object, predicate=predicate, 
                                 subjectType=subjectType, objectTypes=objectType, dataTypeURIs=dataTypeURI)
     }
   }
   
   return(x)
})

setMethod("show", "DataPackage",
    #function(object)print(rbind(x = object@x, y=object@y))
    function(object) {
      
        ids <- getIdentifiers(object)
        if(length(ids) == 0) {
            cat(sprintf("This package does not contain any DataObjects.\n"))
            return()
        }
      
        # currentWidth starts as width of combined initial widths, with 80 as 
        # the start (80 - 6 spaces for padding)
        nfields <- 8
        nspaces <- nfields - 1
        maxWidth <- getOption("width") - nspaces
        currentWidth <- 59
        
        fileNameWidth <- 10
        formatIdWidth <- 8
        mediaTypeWidth <- 10
        sizeWidth <- 8
        identifierWidth <- 10
        # This column width is set for the title width and not the contents, as the title is always longer
        updatedWidth <- 8
        localWidth <- 5
        
        # Set the minimum field width for each field
        fileNameMinWidth     <- fileNameWidth
        formatIdMinWidth     <- formatIdWidth
        mediaTypeMinWidth    <- mediaTypeWidth
        sizeMinWidth         <- sizeWidth
        identifierMinWidth   <- identifierWidth
        updatedMinWidth      <- updatedWidth
        localMinWidth        <- localWidth
        
        # Set the max field width for each field based on all values in a column
        fileNameMaxWidth     <- max(unlist((lapply(ids, function(id) { nchar(as.character(object@objects[[id]]@sysmeta@fileName)) }))))
        formatIdMaxWidth     <- max(unlist((lapply(ids, function(id) { nchar(as.character(object@objects[[id]]@sysmeta@formatId)) }))))
        mediaTypeMaxWidth    <- max(unlist((lapply(ids, function(id) { nchar(as.character(object@objects[[id]]@sysmeta@mediaType)) }))))
        sizeMaxWidth         <- max(unlist((lapply(ids, function(id) { nchar(as.character(object@objects[[id]]@sysmeta@size)) }))))
        identifierMaxWidth   <- max(unlist((lapply(ids, function(id) { nchar(as.character(object@objects[[id]]@sysmeta@identifier)) }))))
        updatedMaxWidth      <- 8
        localMaxWidth        <- 5
              
        done <- FALSE
        # Continue until no fields can be increased in width.
        # Now that the width of each field and hence total width is known, iteratively adjust each field until
        # the max line width is reached.
        while(!done) {
          updated <- list()
          # fieldWidth, totlaWidth, done <- setColumnWidth(current, max, increment, totalWidth)
          values <- setColumnWidth(fileNameWidth, min=fileNameMinWidth, max=fileNameMaxWidth, increment=5, currentTotal=currentWidth, displayWidth=maxWidth)
          fileNameWidth <- values[[1]]
          currentWidth <- values[[2]]
          updated[[length(updated)+1]] <- values[[3]]
          
          values <- setColumnWidth(formatIdWidth, min=formatIdMinWidth, max=formatIdMaxWidth, increment=1, currentTotal=currentWidth, displayWidth=maxWidth)
          formatIdWidth <- values[[1]]
          currentWidth <- values[[2]]
          updated[[length(updated)+1]] <- values[[3]]
          
          values <- setColumnWidth(mediaTypeWidth, min=mediaTypeMinWidth, max=mediaTypeMaxWidth, increment=1, currentTotal=currentWidth, displayWidth=maxWidth)
          mediaTypeWidth <- values[[1]]
          currentWidth <- values[[2]]
          updated[[length(updated)+1]] <- values[[3]]
          
          values <- setColumnWidth(sizeWidth, min=sizeMinWidth, max=sizeMaxWidth, increment=1, currentTotal=currentWidth, displayWidth=maxWidth)
          sizeWidth <- values[[1]]
          currentWidth <- values[[2]]
          updated[[length(updated)+1]] <- values[[3]]
          
          values <- setColumnWidth(identifierWidth, min=identifierMinWidth, max=identifierMaxWidth, increment=10, currentTotal=currentWidth, displayWidth=maxWidth)
          identifierWidth <- values[[1]] 
          currentWidth <- values[[2]]
          updated[[length(updated)+1]] <- values[[3]]
          
          values <- setColumnWidth(updatedWidth, min=updatedMinWidth, max=updatedMaxWidth, increment=1, currentTotal=currentWidth, displayWidth=maxWidth)
          updatedWidth <- values[[1]] 
          currentWidth <- values[[2]]
          updated[[length(updated)+1]] <- values[[3]]
          
          values <- setColumnWidth(localWidth, min=localMinWidth, max=localMaxWidth, increment=1, currentTotal=currentWidth, displayWidth=maxWidth)
          localWidth <- values[[1]] 
          currentWidth <- values[[2]]
          updated[[length(updated)+1]] <- values[[3]]
        
          # Only test at the end of the line, as some fields have smaller increments that may have been allowed, but
          # larger injcremented fields would have stopped the loop.
          updated <- as.logical(updated)
          if(!any(updated)) break
        }
        fmt <- paste0(
            "%-", sprintf("%2d", fileNameWidth), "s ",
            "%-", sprintf("%2d", formatIdWidth), "s ",
            "%-", sprintf("%2d", mediaTypeWidth), "s ",
            "%-", sprintf("%2d", sizeWidth), "s ",
            "%-", sprintf("%2d", identifierWidth), "s ",
            "%-", sprintf("%2d", updatedWidth), "s ",
            "%-", sprintf("%2d", localWidth), "s ",
            "\n")
        cat(sprintf("Members:\n\n"))
        cat(sprintf(fmt, "filename", "format", "mediaType", "size", "identifier", "modified", "local"))
        lapply(ids, function(id) { 
            # The objects data has a size from sysmeta, but no data locally, so it must have been
            # lazy loaded from a repository. The sysmeta@size could be non-zero but no local data only
            # if it was incorrectly set manually or the object was lazyloaded.
            hasLocalData <- !is.na(object@objects[[id]]@filename) || (length(object@objects[[id]]@data) > 0)
            hasLocalDataStr <- if (isTRUE(hasLocalData)) 'y' else 'n'
            fn <- object@objects[[id]]@filename
            fnSysmeta <- object@objects[[id]]@sysmeta@fileName
            if(!is.na(fnSysmeta)) {
              filename <- fnSysmeta
            } else if (!is.na(fn)) {
              filename <- basename(fn)
            } else {
              filename <- NA_character_
            }
            cat(sprintf(fmt, 
                        condenseStr(filename, fileNameWidth),
                        condenseStr(object@objects[[id]]@sysmeta@formatId, formatIdWidth),
                        condenseStr(object@objects[[id]]@sysmeta@mediaType, mediaTypeWidth),
                        condenseStr(as.character(object@objects[[id]]@sysmeta@size), sizeWidth),
                        condenseStr(object@objects[[id]]@sysmeta@identifier, identifierWidth),
                        condenseStr(if (object@objects[[id]]@updated[['sysmeta']] || object@objects[[id]]@updated[['data']])  'y' else 'n', updatedWidth), 
                        condenseStr(hasLocalDataStr, localWidth)))
        })

        cat(sprintf("\nPackage identifier: %s\n", object@resmapId))
        rightsHolders <- unique(getValue(object, name="sysmeta@rightsHolder"))
        cat(sprintf("RightsHolder: %s\n\n", paste(rightsHolders, collapse=",")))
        relationships <- getRelationships(object, condense=TRUE)
        if(nrow(relationships) > 0) {
            if(object@relations[['updated']]) {
                cat(sprintf("\nRelationships (updated):\n\n"))
            } else {
                cat(sprintf("\nRelationships:\n"))
            }
          show(relationships)
        } else {
          cat(sprintf("\nThis package does not contain any provenance relationships."))  
        }
        rightsHolders <- unique(getValue(object, name="sysmeta@rightsHolder"))
    }
)

setColumnWidth <- function(fieldWidth, min, max, increment, currentTotal, displayWidth) {
  
  # Try to increate the field width by the increment amount. If this is too great,
  # decrease the amount by 1 and try again. Continue until the field can be incremented
  # the increment width can't be decreased.
  # Can't have 0 as increment, this could cause an endless loop
  if(increment == 0) increment <- 1
  for(inc in increment:1) {
    # Couldn't determine max field with because no data in any fields
    if(is.na(max)) {
      return(list(fieldWidth, currentTotal, FALSE))
    } else if (max < min) {
      # The max value for a field is less that the required min, so return the required min
      return(list(min, currentTotal, FALSE))
    } else if(fieldWidth >= max) {
      # Current field can't be incremented past it's max
      # return fieldWidth, current line width, was this width updated?
      return(list(max, currentTotal, FALSE))
    } else if ((inc + currentTotal) > displayWidth) {
      # Skip if inc will increase total width pass display max
      # Can we increment less?
      next
    } else if((fieldWidth + inc) > max) {
      # Skip if inc will increase field width pass it's max
      next
    } else {
      # Increment field
      fieldWidth <- fieldWidth + inc
      currentTotal <- currentTotal + inc
      # Return new field width, new current line total, and whether the width was modified (does it still need to be updated with another pass?)
      return(list(fieldWidth, currentTotal, TRUE))
    }
  }
  return(list(fieldWidth, currentTotal, FALSE))
}

# Return a shortened version of a string to the specified length. The
# beginning and end of the string is returned, with ellipses inbetween
# to denote the removed portion, e.g. 
#    condenseStr("/Users/smith/data/urn:uuid:a84c2234-d07f-41d6-8c53-61b570afc79f.csv", 30)
#    "/Users/smith...1b570afc79f.csv"
condenseStr <- function(inStr, newLength) {
    if(is.na(inStr)) return(inStr)
    strLen <- nchar(inStr)[[1]]
    if(newLength >= strLen) return(inStr)
    # Requested length too short, so return first part of string
    if(newLength < 5) return(substr(inStr, 1, newLength))
    # Substract space for ellipses
    charLen <- as.integer(newLength - 3)
    # Get str before ellipses
    len1 <- as.integer(charLen / 2)
    # Add additional char at end if desired length is odd
    len2 <- as.integer(charLen / 2) + charLen %% 2
    # Get str after ellipses
    str1 <- substr(inStr, 1, len1)
    str2 <- substr(inStr, strLen-(len2-1), strLen)
    newStr <- sprintf("%s...%s", str1, str2)
    return(newStr)
}

# Writes an object and its system metadata to a bag.
writeToBag <- function(objectToWrite, objectPath, bagDir, sysMetaDirectory, isScienceMetadata=FALSE) {
    # Make sure that the path works on the target system
    objectPath <- getPlatformPath(objectPath)
    # Create the directory if it doesn't exist
    if(!file.exists(dirname(objectPath))) {
        # Use recursive because objectPath can include intermediate paths
        # that might not exist. For example, ./data/non-existent-dir/measurements.
        dir.create(dirname(objectPath), recursive=TRUE)
    }

    writeSystemMetadata <- function(sysMetaObject, systemMetadataPath) {
        sysmetaXML <- serializeSystemMetadata(sysMetaObject, version="v2")
        writeLines(sysmetaXML, systemMetadataPath)
    }
    objectIdentifier <- NULL
    # Determine whether the bytes of the file are on disk or in memory. Each way is handled differently
    if(!is.na(objectToWrite@filename)) {
        if(file.exists(objectToWrite@filename)) {
            file.copy(objectToWrite@filename, objectPath)
            objectIdentifier <- getIdentifier(objectToWrite)
            } else {
                stop(sprintf("Error serializing to BagIt format, data object \"%s\", uses file %s but this file doesn't exist", objectToWrite, objectToWrite@filename))
            }
        } else {
            # Must be an in-memory data object
            tf <- tempfile()
            con <- file(tf, "wb")
            writeBin(getData(objectToWrite), con)
            close(con)
            file.copy(tf, objectPath)
            unlink(tf)
            rm(tf)
        }
    
    # Write the system metadata
    sysMetaToWrite <- objectToWrite@sysmeta
    systemMetadataPath <- file.path(sysMetaDirectory,
                                         paste('sysmeta-', sysMetaToWrite@identifier, '.xml', sep=""))
    systemMetadataPath<- getPlatformPath(systemMetadataPath)
    writeSystemMetadata(sysMetaToWrite, systemMetadataPath)

    # Get the relative bag path for each file so that absolute paths aren't included
    bagDir <- getPlatformPath(bagDir)
    bagDir <- paste(bagDir, .Platform$file.sep, sep="")

    # Add this data and metadata package member to the top level bag metadata files
    relativeDataPath <- gsub(bagDir, "", objectPath)
    # Write the non-system metadata documents to the appropriate bag files
    if (isScienceMetadata) {
        metaMd5 <- digest(objectPath, algo="md5", file=TRUE)
        metadataManifestLine <- sprintf("%s %s", as.character(metaMd5), relativeDataPath)
        write(metadataManifestLine,file=file.path(bagDir, "tagmanifest-md5.txt"),append=TRUE)
    } else {
        objectMd5 <- digest(objectPath, algo="md5", file=TRUE)
        relativeMetaPath <- gsub(bagDir, "", systemMetadataPath)
        objectManifestLine <- sprintf("%s %s", as.character(objectMd5), relativeDataPath)
        write(objectManifestLine,file=file.path(bagDir, "manifest-md5.txt"),append=TRUE)
    }
    # Write the system metadata associated with the data or science metadata obbject
    scienceMetadataMd5 <-  digest(systemMetadataPath, algo="md5", file=TRUE)
    relativeSysmetaPath <- gsub(bagDir, "", systemMetadataPath)
    sysmetatadataManifestLine <- sprintf("%s %s", as.character(scienceMetadataMd5), relativeSysmetaPath)
    write(sysmetatadataManifestLine,file=file.path(bagDir, "tagmanifest-md5.txt"),append=TRUE)
}

# Returns the package's resource map
getResourceMap <- function(x, id=NA_character_, creator=NA_character_, resolveURI=NA_character_) {
    # Get the relationships stored in this datapackage.
    relations <- getRelationships(x)
    # Create a ResourceMap object and serialize it to the specified file
    #
    # If a serialization id was not specified, then use the id assigned to the DataPackage when it
    # was created. If a DataPackage id was not assigned, then create a unique id.
    if(is.na(id)) {
        if(is.na(x@sysmeta@identifier) || is.null(x@sysmeta@identifier)) {
            id <- sprintf("urn:uuid:%s", uuid::UUIDgenerate())
        } else {
            id <- x@sysmeta@identifier
        }
    }

    # Create a resource map from previously stored triples, for example, from the relationships in a DataPackage
    resMap <- new("ResourceMap", id)
    resMap <- createFromTriples(resMap, relations=relations, identifiers=getIdentifiers(x), resolveURI=resolveURI,
                                externalIdentifiers=x@externalIds, creator=creator)
    return(resMap)
}

# Finds the identifiers of any science metadata documents in the package by querying
# a resource map.
getScienceMetadataUris <- function(resMap) {

    queryResult <- tryCatch(
    {
    # Query that finds all subjects that document another object. If ?o is unused, a warning is raised;
    # use o by performing a sanity check that the object being documented is also documentedBy the science metadata
    # object.
    queryString <- 'PREFIX  cito: <http://purl.org/spar/cito/> 
        SELECT ?s WHERE {
            ?s cito:documents ?o .
            ?o cito:isDocumentedBy ?s .
        }'
    query <- new("Query", resMap@world, queryString, base_uri=NULL, query_language="sparql", query_uri=NULL)
    queryResult <- redland::getResults(query, resMap@model, "csv")
    freeQuery(query)
    result <- utils::read.csv(text=queryResult, stringsAsFactors = FALSE)
    # Extract the 'subject' column from the relationship csv and place into a list
    # Make sure that there aren't any duplicate identifiers
    scinceMetadataUris <- unique(result[,'s'])
    },
    error=function(cond) {
        list()
    })
    return (queryResult)
}

getFormatFilename <- function(format) {
    if (grepl("ecoinformatics", format, fixed = TRUE)) {
        return('eml')
    } else if (grepl("openarchives.org/OAI/2.0/oai_dc", format, fixed = TRUE)) {
        return("dc.xml")
    } else if (grepl("loc.gov/METS/", format, fixed = TRUE)) {
        return("mets")
    } else if (grepl("datacite.org/schema/kernel", format, fixed = TRUE)) {
        return("datacite")
    }
    return("science-metadata")
}
ropensci/datapack documentation built on June 11, 2022, 11:20 a.m.