R/DataObject.R

Defines functions getChecksumAlgorithmAbbreviation sanitizePath pathToWindows pathToPOSIX getPlatformPath

#
#   This work was created by participants in the DataONE project, and is
#   jointly copyrighted by participating institutions in DataONE. For
#   more information on DataONE, see our web site at http://dataone.org.
#
#     Copyright 2011-2015
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
#

#' DataObject wraps raw data with system-level metadata
#' @description DataObject is a wrapper class that associates raw data or a data file with system-level metadata 
#' describing the data.  The system metadata includes attributes such as the object's identifier, 
#' type, size, checksum, owner, version relationship to other objects, access rules, and other critical metadata.
#' The SystemMetadata is compliant with the DataONE federated repository network's definition of SystemMetadata, and
#' is encapsulated as a separate object of type \code{\link{SystemMetadata}} that can be manipulated as needed. Additional science-level and
#' domain-specific metadata is out-of-scope for SystemMetadata, which is intended only for critical metadata for
#' managing objects in a repository system.
#' @details   
#' A DataObject can be constructed by passing the data and SystemMetadata to the new() method, or by passing
#' an identifier, data, format, user, and DataONE node identifier, in which case a SystemMetadata instance will
#' be generated with these fields and others that are calculated (such as size and checksum).
#' 
#' Data are associated with the DataObject either by passing it as a \code{'raw'} value to the \code{'dataobj'}
#' parameter in the constructor, which is then stored in memory, or by passing a fully qualified file path to the 
#' data in the \code{'filename'} parameter, which is then stored on disk.  One of dataobj or filename is required.
#' Use the \code{'filename'} approach when data are too large to be managed effectively in memory.  Callers can
#' access the \code{'filename'} slot to get direct access to the file, or can call \code{'getData()'} to retrieve the
#' contents of the data or file as a raw value (but this will read all of the data into memory).
#' @slot sysmeta A value of type \code{"SystemMetadata"}, containing the metadata about the object
#' @slot data A value of type \code{"raw"}, containing the data represented in this object
#' @slot filename A character value that contains the fully-qualified path to the object data on disk
#' @slot dataURL A character value for the URL used to load data into this DataObject
#' @slot updated A list containing logical values which indicate if system metadata or the data object have been updated since object creation.
#' @slot oldId A character string containing the previous identifier used, before a \code{"replaceMember"} call.
#' @slot targetPath An optional character string holding the path of where the file is placed in a downloaded package.
#' @rdname DataObject-class
#' @keywords classes
#' @import methods
#' @include dmsg.R
#' @include SystemMetadata.R
#' @aliases DataObject-class
#' @section Methods:
#' \itemize{
#'   \item{\code{\link[=DataObject-initialize]{initialize}}}{: Initialize a DataObject}
#'   \item{\code{\link{addAccessRule}}}{: Add a Rule to the AccessPolicy}
#'   \item{\code{\link{canRead}}}{: Test whether the provided subject can read an object.}
#'   \item{\code{\link{getData}}}{: Get the data content of a specified data object}
#'   \item{\code{\link{getFormatId}}}{: Get the FormatId of the DataObject}
#'   \item{\code{\link{getIdentifier}}}{: Get the Identifier of the DataObject}
#'   \item{\code{\link{hasAccessRule}}}{: Determine if an access rules exists for a DataObject.}
#'   \item{\code{\link{setPublicAccess}}}{: Add a Rule to the AccessPolicy to make the object publicly readable.}
#'   \item{\code{\link{updateXML}}}{: Update selected elements of the xml content of a DataObject}
#' }
#' @seealso \code{\link{datapack}}
#' @examples
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' targetPath <- "myData/time-trials/trial_data.csv"
#' do <- new("DataObject", "id1", dataobj=data, "text/csv", 
#'   "uid=jones,DC=example,DC=com", "urn:node:KNB", targetPath=targetPath)
#' getIdentifier(do)
#' getFormatId(do)
#' getData(do)
#' canRead(do, "uid=anybody,DC=example,DC=com")
#' do <- setPublicAccess(do)
#' canRead(do, "public")
#' canRead(do, "uid=anybody,DC=example,DC=com")
#' # Also can create using a file for storage, rather than memory
#' \dontrun{
#' tf <- tempfile()
#' con <- file(tf, "wb")
#' writeBin(data, con)
#' close(con)
#' targetPath <- "myData/time-trials/trial_data.csv"
#' do <- new("DataObject", "id1", format="text/csv", user="uid=jones,DC=example,DC=com", 
#'   mnNodeId="urn:node:KNB", filename=tf, targetPath=targetPath)
#' }
#' @export
setClass("DataObject", slots = c(
    sysmeta                 = "SystemMetadata",
    data                    = "raw",
    filename                = "character",
    dataURL                 = "character",
    updated                 = "list",
    oldId                   = "character",
    targetPath              = "character")
)

##########################
## DataObject constructors
##########################

#' Initialize a DataObject
#' @rdname DataObject-initialize
#' @aliases DataObject-initialize
#' @description When initializing a DataObject using passed in data, one can either pass 
#' in the \code{'id'} param as a \code{'SystemMetadata'} object, or as a \code{'character'} string 
#' representing the identifier for an object along with parameters for format, user,and associated member node.
#' If \code{'data'} is not missing, the \code{'data'} param holds the \code{'raw'} data.  Otherwise, the
#' \code{'filename'} parameter must be provided, and points at a file containing the bytes of the data.
#' @details If filesystem storage is used for the data associated with a DataObject, care must be
#' taken to not modify or remove that file in R or via other facilities while the DataObject exists in the R session.
#' Changes to the object are not detected and will result in unexpected results. Also, if the \code{'dataobj'} parameter
#' is used to specify the data source, then \code{'filename'} argument may also be specified, but in this case 
#' the value \code{'filename'} parameter is used to tell DataONE the filename to create when this file is
#' downloaded from a repository.
#' @param .Object the DataObject instance to be initialized
#' @param id the identifier for the DataObject, unique within its repository. Optionally this can be an existing SystemMetadata object
#' @param dataobj the bytes of the data for this object in \code{'raw'} format, optional if \code{'filename'} is provided
#' @param format the format identifier for the object, e.g."text/csv", "eml://ecoinformatics.org/eml-2.1.1"
#' @param user the identity of the user owning the package, typically in X.509 format
#' @param mnNodeId the node identifier for the repository to which this object belongs.
#' @param filename the filename for the fully qualified path to the data on disk, optional if \code{'data'} is provided
#' @param seriesId A unique string to identifier the latest of multiple revisions of the object.
#' @param mediaType The When specified, indicates the IANA Media Type (aka MIME-Type) of the object. The value should include the media type and subtype (e.g. text/csv).
#' @param mediaTypeProperty A list, indicates IANA Media Type properties to be associated with the parameter \code{"mediaType"}
#' @param dataURL A character string containing a URL to remote data (a repository) that this DataObject represents.
#' @param targetPath An optional string that denotes where the file should go in a downloaded package
#' @param checksumAlgorithm A character string specifying the checksum algorithm to use
#' @import digest
#' @import uuid
#' @examples
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' do <- new("DataObject", "id1", dataobj=data, "text/csv", 
#'   "uid=jones,DC=example,DC=com", "urn:node:KNB", targetPath="data/rasters/data.tiff")
#' @seealso \code{\link{DataObject-class}}
setMethod("initialize", "DataObject", function(.Object, id=NA_character_, dataobj=NA, format=NA_character_, user=NA_character_, 
                                               mnNodeId=NA_character_, filename=NA_character_, seriesId=NA_character_,
                                               mediaType=NA_character_, mediaTypeProperty=list(), dataURL=NA_character_,
                                               targetPath=NA_character_, checksumAlgorithm="SHA-256") {
  
    # If no value has been passed in for 'id', then create a UUID for it.
    if (!inherits(id, "SystemMetadata") && is.na(id)) {
        id <- paste0("urn:uuid:", UUIDgenerate())
    }
    
    # Params specified 
    # dataUrl filename dataobj comment
    # ------- -------- ------- -------
    # Y       N        N       used for lazy loaded DataObjects, 'dataUrl' is the data source
    # N       Y        Y       'dataobj' is the data source, 'filename' is sysmeta.filename (download filename)
    # N       Y        N       'filename' is the data source, 'filename' is sysmeta.filename
    # N       N        Y       Invalid, if 'dataobj' is specified, 'filename' must also be specified.
    # 
    hasDataUrl <- !is.na(dataURL)
    hasDataObj <- !is.na(dataobj[[1]])
    hasFilename <- !is.na(filename)
    
    if (!hasDataUrl && !hasDataObj && !hasFilename) {
        stop("Either the \"dataobj\" parameter containing raw data or the \"filename\" parameter with a file reference to the data\n or the \"xdataURL\" parameter must be provided.")
    }
    
    if (typeof(id) == "character") {
        smfile <- NA_character_
        size <- 0
        sha256 <- NA_character_
        dmsg("@@ DataObject-class:R initialize as character")
        if(hasDataUrl) {
            .Object@dataURL <- dataURL
            .Object@data <- as.raw(NULL)
            .Object@filename <- NA_character_
            smfile <- basename(dataURL) 
        } else {
            # Validate: dataobj must be raw if provided. Also, the filename argument must be provided, which will
            # be used as the sysmeta.fileName value.
            if (hasDataObj) {
                if(!is.raw(dataobj[[1]])) stop("The value of the \"dataobj\" parameter must be of type \"raw\"")
                smfile <- NA_character_
                # If dataobj is specified, then file at 'filename' location doesn't have to exist, as in this case 'filename'
                # specifies sysmeta.fileName and not the data source.
                if(!hasFilename) {
                #    warning("If the \"dataobj\" parameter is specified, the \"filename\" parameter must also be, to specify the download filename")
                    smfile <- basename(filename)
                }
                size <- length(dataobj)
                .Object@data <- dataobj
                .Object@filename <- NA_character_
                .Object@dataURL <- NA_character_
            } else {
                if(!file.exists(filename)) stop(sprintf("The \"filename\" argument value \"%s\" must be for file that exists", filename))
                fileinfo <- file.info(filename)
                if(!fileinfo$size > 0) stop(sprintf("The \"filename\" argument value \"%s\" must be for a non-empty file.", filename))
                size <- fileinfo$size
                .Object@data <- as.raw(NULL)
                .Object@dataURL <- dataURL
                .Object@filename <- normalizePath(filename)
                smfile <- basename(filename)
            }
        } 
        
        checksum <- calculateChecksum(.Object, checksumAlgorithm=checksumAlgorithm)
        # Build a SystemMetadata object describing the data
        # It's OK to set sysmeta v2 fields here, as they will only get serialized to v2 format if requested. The default is
        # to serialze to v1 format which does not include seriesId, mediaType, fileName.
        .Object@sysmeta <- new("SystemMetadata", identifier=id, formatId=format, size=size, submitter=user, rightsHolder=user, 
                               checksum=checksum, checksumAlgorithm=checksumAlgorithm, originMemberNode=mnNodeId, authoritativeMemberNode=mnNodeId, 
                               seriesId=seriesId, mediaType=mediaType, fileName=basename(smfile), 
                               mediaTypeProperty=mediaTypeProperty)
    } else if (typeof(id) == "S4" && inherits(id, "SystemMetadata")) {
        .Object@sysmeta <- id
        if(hasDataObj) {
            if(!is.raw(dataobj[[1]])) stop("The value of the \"dataobj\" parameter must be of type \"raw\"")
            .Object@data <- dataobj
            .Object@dataURL <- NA_character_
        } else {
            .Object@data <- as.raw(NULL)
            .Object@dataURL <- NA_character_
        }
        if(hasFilename && file.exists(filename)) {
            .Object@filename <- normalizePath(filename)
            .Object@dataURL <- NA_character_
        } else {
            .Object@filename <- NA_character_
            .Object@dataURL <- NA_character_
        }
        
        # Ensure that the checksum and algorithm of the passed in sysmeta matches the requested
        # values from the parameter list for the DataObject
        if(tolower(id@checksumAlgorithm) != tolower(checksumAlgorithm)) {
            checksum <- calculateChecksum(.Object, checksumAlgorithm=checksumAlgorithm)
            .Object@sysmeta@checksum <- checksum
            .Object@sysmeta@checksumAlgorithm <- checksumAlgorithm
        }
    } else {
        stop("Invalid value for \"identifier\" argument, it must be a character or SystemMetadata value\n")
    }

    # Test if this DataObject is brand new, or possibly created from an existing object, i.e.
    # downloaded from a data repository
    .Object@updated <- list("sysmeta" = FALSE, "data" = FALSE)
    .Object@oldId <- NA_character_
    if (!is.na(targetPath)) {
        targetPath <- pathToPOSIX(targetPath)
    }

    .Object@targetPath <- targetPath
    return(.Object)
})

#' Get the data content of a specified data object
#' 
#' @param x  DataObject or DataPackage: the data structure from where to get the data
#' @param ... Additional arguments
#' @aliases getData
#' @seealso \code{\link{DataObject-class}}
#' @export
setGeneric("getData", function(x, ...) {
    standardGeneric("getData")
})

#' @rdname getData
#' @return raw representation of the data
#' @aliases getData
#' @examples
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' do <- new("DataObject", "id1", dataobj=data, "text/csv", 
#'   "uid=jones,DC=example,DC=com", "urn:node:KNB")
#' bytes <- getData(do)
setMethod("getData", signature("DataObject"), function(x) {
  if (length(x@data) > 0) {
    return(x@data)
  } else if(!is.na(x@filename)) {
    # Read the file from disk and return the contents as raw
    stopifnot(!is.na(x@filename))
    fileinfo <- file.info(x@filename)
    con <- file(x@filename, "rb")
    temp <- readBin(con, raw(), x@sysmeta@size)
    close(con)
    return(temp)
  } else if (!is.na(x@dataURL)) {
    # This DataObject was created by downloading an object from
    # a repository, but the size of the object to downlaod was too
    # large, so downloading the data was deferred. Now the user is
    # trying to get the data, so we have to download the data, regardless
    # of size.
    # TODO: this request may fail if the data isn't publicly readable, as this isn't
    # request doesn't use the dataone authorized request, i.e. dataone::getObject
    if(requireNamespace("httr", quietly=TRUE)) {
      #if(!is.element("package:httr", search())) env <- attachNamespace("httr")
      response <- httr::GET(x@dataURL)
      if (response$status != "200") {
        errorMsg <- httr::http_status(response)$message
        stop(sprintf("getData() error: %s\n", errorMsg))
      }
      # Can't set a slot in the DataObject to hold the data, as we
      # are returning data and not the modified DataObject
      data <- httr::content(response, as = "raw")
      return(data)
    } else {
        msg <- sprintf("Unable to get package member data from remote location: %s", x@dataURL)
        msg <- sprintf("%s\nInstalling package \"httr\" is required for this operation", msg)
        stop(msg)
    }
  }
})

#' Get the Identifier of the DataObject
#' @param x DataObject
#' @param ... (not yet used)
#' @return the identifier
#' @aliases getIdentifier
#' @seealso \code{\link{DataObject-class}}
#' @export
setGeneric("getIdentifier", function(x, ...) {
    standardGeneric("getIdentifier")
})

#' @rdname getIdentifier
#' @aliases getIdentifier
#' @examples 
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' do <- new("DataObject", "id1", dataobj=data, "text/csv", 
#'   "uid=jones,DC=example,DC=com", "urn:node:KNB")
#' id <- getIdentifier(do)
setMethod("getIdentifier", signature("DataObject"), function(x) {
	return(x@sysmeta@identifier)
})

#' Get the FormatId of the DataObject
#' @param x DataObject
#' @param ... (not yet used)
#' @return the formatId
#' @aliases getFormatId
#' @seealso \code{\link{DataObject-class}}
#' @export
setGeneric("getFormatId", function(x, ...) {
			standardGeneric("getFormatId")
})

#' @rdname getFormatId
#' @aliases getFormatId
#' @examples
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' do <- new("DataObject", "id1", dataobj=data, "text/csv", 
#'   "uid=jones,DC=example,DC=com", "urn:node:KNB")
#' fmtId <- getFormatId(do)
setMethod("getFormatId", signature("DataObject"), function(x) {
    return(x@sysmeta@formatId)
})

#
#' @rdname hasAccessRule
#' @description If called for a DataObject, then the SystemMetadata for the DataObject is checked.
#' @seealso \code{\link{DataObject-class}}
#' @examples 
#' #
#' # Check access rules for a DataObject
#' data <- system.file("extdata/sample-data.csv", package="datapack")
#' do <- new("DataObject", file=system.file("./extdata/sample-data.csv", package="datapack"), 
#'                                          format="text/csv")
#' do <- setPublicAccess(do)
#' isPublic <- hasAccessRule(do, "public", "read")
#' accessRules <- data.frame(subject=c("uid=smith,ou=Account,dc=example,dc=com", 
#'                           "uid=wiggens,o=unaffiliated,dc=example,dc=org"), 
#'                           permission=c("write", "changePermission"), 
#'                           stringsAsFactors=FALSE)
#' do <- addAccessRule(do, accessRules)
#' SmithHasWrite <- hasAccessRule(do, "uid=smith,ou=Account,dc=example,dc=com", "write")
#' @return When called for a DataObject, boolean TRUE if the access rule exists already, FALSE otherwise
setMethod("hasAccessRule", signature("DataObject"), function(x, subject, permission) {
    found <- hasAccessRule(x@sysmeta, subject, permission)
    return(found)
})

#' @rdname removeAccessRule
#' @return The DataObject object with the updated access policy.
#' @seealso \code{\link{DataObject-class}}
#' @examples 
#' #
#' # Remove access rules form a DataObject.
#' library(datapack)
#' do <- new("DataObject", file=system.file("./extdata/sample-data.csv", package="datapack"), 
#'                         format="text/csv")
#' do <- setPublicAccess(do)
#' isPublic <- hasAccessRule(do, "public", "read")
#' accessRules <- data.frame(subject=c("uid=smith,ou=Account,dc=example,dc=com", 
#'                           "uid=wiggens,o=unaffiliated,dc=example,dc=org"), 
#'                           permission=c("write", "changePermission"), 
#'                           stringsAsFactors=FALSE)
#' do <- addAccessRule(do, accessRules)
#' do <- removeAccessRule(do, "uid=smith,ou=Account,dc=example,dc=com", "changePermission")
#' # hasAccessRule should return FALSE
#' hasWrite <- hasAccessRule(do, "smith", "write")
#' 
#' # Alternatively, parameter "y" can be a data.frame containing one or more access rules:
#' do <- addAccessRule(do, "uid=smith,ou=Account,dc=example,dc=com", "write")
#' accessRules <- data.frame(subject=c("uid=smith,ou=Account,dc=example,dc=com", 
#'   "uid=slaughter,o=unaffiliated,dc=example,dc=org"), 
#'   permission=c("write", "changePermission"))
#' sysmeta <- removeAccessRule(do, accessRules)
#' @export
setMethod("removeAccessRule", signature("DataObject"), function(x, y, ...) {
    x@sysmeta <- removeAccessRule(x@sysmeta, y, ...)
    return(x)
})

#' Add a Rule to the AccessPolicy to make the object publicly readable.
#' 
#' To be called prior to creating the object in DataONE.  When called before 
#' creating the object, adds a rule to the access policy that makes this object
#' publicly readable.  If called after creation, it will only change the system
#' metadata locally, and will not have any effect on remotely uploaded copies of
#' the DataObject. 
#' @param x DataObject
#' @param ... (not yet used)
#' @return A DataObject with modified access rules.
#' @aliases setPublicAccess
#' @seealso \code{\link{DataObject-class}}
#' @export
setGeneric("setPublicAccess", function(x, ...) {
  standardGeneric("setPublicAccess")
})

#' @rdname setPublicAccess
#' @aliases setPublicAccess
#' @seealso \code{\link{DataObject-class}}
#' @examples
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' do <- new("DataObject", "id1", dataobj=data, "text/csv", 
#'   "uid=jones,DC=example,DC=com", "urn:node:KNB")
#' do <- setPublicAccess(do)
setMethod("setPublicAccess", signature("DataObject"), function(x) {
    # Check if public: read is already set, and if not, set it
    if (!hasAccessRule(x@sysmeta, "public", "read")) {
        x@sysmeta <- addAccessRule(x@sysmeta, "public", "read")
    }
    return(x)
})

#' @rdname addAccessRule
#' @return The DataObject with the updated access policy
#' @seealso \code{\link{DataObject-class}}
#' @examples 
#' # Add an access rule to a DataObject
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", id="1234", dataobj=data, format="text/csv")
#' obj <- addAccessRule(obj, "uid=smith,ou=Account,dc=example,dc=com", "write")
setMethod("addAccessRule", signature("DataObject"), function(x, y, ...) {
    x@sysmeta <- addAccessRule(x@sysmeta, y, ...)
  return(x)
})

#' @rdname clearAccessPolicy
#' @return The DataObject with the cleared access policy.
#' @seealso \code{\link{DataObject-class}}
#' @examples 
#' # Clear access policy for a DataObject
#' do <- new("DataObject", format="text/csv", filename=system.file("extdata/sample-data.csv", 
#'           package="datapack"))
#' do <- addAccessRule(do, "uid=smith,ou=Account,dc=example,dc=com", "write")
#' do <- clearAccessPolicy(do)
#' @export
setMethod("clearAccessPolicy", signature("DataObject"), function(x, ...) {
        
    x@sysmeta <- clearAccessPolicy(x@sysmeta)
    
    return(x)
})

#' Test whether the provided subject can read an object.
#' 
#' Using the AccessPolicy, tests whether the subject has read permission
#' for the object.  This method is meant work prior to submission to a repository, 
#' and will show the permissions that would be enforced by the repository on submission.
#' Currently it only uses the AccessPolicy to determine who can read (and not the rightsHolder field,
#' which always can read an object).  If an object has been granted read access by the
#' special "public" subject, then all subjects have read access.
#' @details The subject name used in both the AccessPolicy and in the \code{'subject'}
#' argument to this method is a string value, but is generally formatted as an X.509
#' name formatted according to RFC 2253.
#' @param x DataObject
#' @param ... Additional arguments
#' @return boolean TRUE if the subject has read permission, or FALSE otherwise
#' @aliases canRead
#' @seealso \code{\link{DataObject-class}}
#' @export
setGeneric("canRead", function(x, ...) {
  standardGeneric("canRead")
})

#' @rdname canRead
#' @param subject : the subject name of the person/system to check for read permissions
#' @export
#' @examples 
#' data <- charToRaw("1,2,3\n4,5,6\n")
#' obj <- new("DataObject", id="1234", dataobj=data, format="text/csv")
#' obj <- addAccessRule(obj, "smith", "read")
#' access <- canRead(obj, "smith")
setMethod("canRead", signature("DataObject"), function(x, subject) {

    canRead <- hasAccessRule(x@sysmeta, "public", "read") | hasAccessRule(x@sysmeta, subject, "read")
	return(canRead)
})

#' Update selected elements of the XML content of a DataObject
#' @description The data content of the DataObject is updated by using the \code{xpath} 
#' argument to locate the elements to update with the character value specified in the 
#' \code{replacement} argument.
#' @param x A DataObject instance
#' @param ... Additional parameters (not yet used)
#' @return The modified DataObject
#' @rdname updateXML
#' @import XML
#' @export
#' @examples \dontrun{
#' library(datapack)
#' dataObj <- new("DataObject", format="text/csv", file=sampleData)
#' sampleEML <- system.file("extdata/sample-eml.xml", package="datapack")
#' dataObj <- updateMetadata(dataObj, xpath="", replacement=)
#' }
#' @seealso \code{\link{DataObject-class}}
setGeneric("updateXML", function(x, ...) {
    standardGeneric("updateXML")
})

#' @rdname updateXML
#' @param xpath A \code{character} value specifying the location in the XML to update.
#' @param replacement A \code{character} value that will replace the elements found with the \code{xpath}.
#' @export
#' @examples 
#' library(datapack)
#' # Create the metadata object with a sample EML file
#' sampleMeta <- system.file("./extdata/sample-eml.xml", package="datapack")
#' metaObj <- new("DataObject", format="eml://ecoinformatics.org/eml-2.1.1", file=sampleMeta)
#' # In the metadata object, replace "sample-data.csv" with 'sample-data.csv.zip'
#' xp <- sprintf("//dataTable/physical/objectName[text()=\"%s\"]", "sample-data.csv")
#' metaObj <- updateXML(metaObj, xpath=xp, replacement="sample-data.csv.zip")
setMethod("updateXML", signature("DataObject"), function(x, xpath=NA_character_, replacement=NA_character_, ...) {
    
    filename <- NA_character_
    filepath <- NA_character_
    metadataDoc <- NA_character_
    nodeSet <- list()
    
    # Use the existing checksum algorithm for the new (replaced) DataObject content
    checksumAlgorithm <- x@sysmeta@checksumAlgorithm
    
    # Get the xml content and update it if the xpath is found
    # Check that the parsing didn't generate an error
    result = tryCatch ({
        metadataDoc = xmlInternalTreeParse(rawToChar(getData(x)))
        nodeSet = xpathApply(metadataDoc,xpath)
    }, warning = function(warningCond) {
        cat(sprintf("Warning: %s\n", warningCond$message))
    }, error = function(errorCond) {
        cat(sprintf("Error: %s\n", errorCond$message))
    }, finally = {
        if(length(nodeSet) == 0) {
            stop(sprintf("No elements found in XML of DataObject with id: %s using xpath: %s", 
                         getIdentifier(x), xpath))
        }
    })
    
    # Substitute the new value(s) into the document
    sapply(nodeSet,function(node){
        xmlValue(node) = replacement
    })
    
    newfile <- tempfile(pattern="metadata", fileext=".xml")
    saveXML(metadataDoc, file=newfile)
    # xml2 version of updating XML
    #metadataDoc <- read_xml(getData(x), encoding = "", as_html = FALSE, options = "NOBLANKS")
    #node <- xml_find_first(metadataDoc,  xpath=xpath, ns = xml_ns(metadataDoc))
    #xml_text(node) <- replacement
    #write_xml(metadataDoc, filepath)
    
    # See how the data was stored in the previous version of the DataObject and
    # create the new, replacement DataObject using the same method (i.e. either internal data or external file)
    if (length(x@data) > 0) {
        metadata <- readChar(newfile, file.info(newfile)$size)
        x@data <- charToRaw(metadata)
        x@filename <- NA_character_
        x@sysmeta@size <- length(x@data)
        x@sysmeta@checksum <- calculateChecksum(x, checksumAlgorithm=checksumAlgorithm)
    } else {
        # Read the file from disk and return the contents as raw
        x@data <- raw()
        x@filename <- newfile
        fileinfo <- file.info(newfile)
        x@sysmeta@size <- fileinfo$size
        x@sysmeta@checksum <- calculateChecksum(x, checksumAlgorithm=checksumAlgorithm)
    }
    
    return(x)
})

setMethod("show", "DataObject",
          #function(object)print(rbind(x = object@x, y=object@y))
          function(object) {
              consoleWidth <- getOption("width")
              if(is.na(consoleWidth)) consoleWidth <- 80
              nameWidth <- 25
              valueWidth <- 30
              colWidth <- as.integer((consoleWidth - 5)/2)
              
              fmt <- paste0("%-", sprintf("%2d", nameWidth), "s ", ": ",
                           "%-", sprintf("%2d", valueWidth), "s ",
                           "\n")
              fmt2 <- paste0("%-", sprintf("%2d", colWidth), "s ",
                           "%-", sprintf("%2d", colWidth), "s ",
                           "\n")
              
              cat(sprintf("Access\n"))
              cat(sprintf(fmt, "  identifer", object@sysmeta@identifier))
              cat(sprintf(fmt, "  submitter", object@sysmeta@submitter))
              cat(sprintf(fmt, "  rightHolder", object@sysmeta@rightsHolder))
              cat(sprintf("  access policy:\n"))
              if(nrow(object@sysmeta@accessPolicy) > 0) {
                  cat(sprintf(fmt2, "    subject", "permission"))
                  for(irow in seq_len(nrow(object@sysmeta@accessPolicy))) {
                      subject <- object@sysmeta@accessPolicy[irow, 'subject']
                      permission <- object@sysmeta@accessPolicy[irow, 'permission']
                      cat(sprintf(fmt2, condenseStr(paste0("    ", subject), colWidth), permission))
                  }
              } else {
                 cat(sprintf("\t\tNo access policy defined\n")) 
              }
              cat(sprintf("Physical\n"))
              cat(sprintf(fmt, "  formatId", object@sysmeta@formatId))
              cat(sprintf(fmt, "  mediaType", object@sysmeta@mediaType))
              cat(sprintf(fmt, "  mediaTypeProperty", object@sysmeta@mediaTypeProperty))
              cat(sprintf(fmt, "  size", object@sysmeta@size))
              cat(sprintf("System\n"))
              cat(sprintf(fmt, "  seriesId", object@sysmeta@seriesId))
              cat(sprintf(fmt, "  serialVersion", object@sysmeta@serialVersion))
              cat(sprintf(fmt, "  obsoletes", object@sysmeta@obsoletes))
              cat(sprintf(fmt, "  obsoletedBy", object@sysmeta@obsoletedBy))
              cat(sprintf(fmt, "  archived", object@sysmeta@archived))
              cat(sprintf(fmt, "  dateUploaded", object@sysmeta@dateUploaded))
              cat(sprintf(fmt, "  dateSysMetadataModified", object@sysmeta@dateSysMetadataModified))
              cat(sprintf("Data\n"))
              if(!is.na(object@filename)) {
                cat(sprintf(fmt, "  filename", object@filename))
              } else {
                cat("  ", class(object@data), ": ", utils::head(object@data), " ...\n")
              }
          }
)

# Returns a path in the form of the native OS. When run
# on Windows, a Windows compliant path is returned. When run on
# PSOIX, a POSIX compliant path is returned.
getPlatformPath <- function(filePath) {
    if(.Platform$OS.type == "windows") {
        filePath <- pathToWindows(filePath)
    } else {
        filePath <-pathToPOSIX(filePath)
    }
    return(filePath)
}

# Turns a path into a POSIX compliant path
pathToPOSIX <- function(filePath) {
    filePath <- gsub('\\\\', '/', filePath)
    filterList <- list( '$', '?', '|', '"', '<', '>', '..')
    pathInformation <- sanitizePath(filePath, filterList)
    # Replace any windows-style paths
    return(file.path(pathInformation[1], pathInformation[2]))
}

# Turns a path into a Windows compliant path
pathToWindows<- function(filePath) {
    # List of things that shouldn't be in a path
    filterList <- list( '?', '*', '|', '"', '<', '>', '..')
    pathInformation <- sanitizePath(filePath, filterList)
    return(file.path(pathInformation[1], pathInformation[2]))
}

# Takes a path and a list of characters that should be removed
# and returns the path without the characters. It also sanitizes the
# file name .
sanitizePath <- function(filePath, filterList) {
    filename <- basename(filePath)
    path = dirname(filePath)
    
    filename <- fs::path_sanitize(filename, "")
    
    # List of things that shouldn't be in a path
    for (filterCharacter in filterList) {
        path <- gsub(filterCharacter, '_', path, fixed=TRUE)
    }
    return(c(path, filename))
}

# DataONE uses different abbreviations for checksum algorithms than the R 'digest' function.
# Given a DataONE checksum algorithm abbreviation, return the corresponding 'digest' abbreviation,
# which is needed in the 'digest' function call.
getChecksumAlgorithmAbbreviation <- function(checksumAlgorithm="SHA-256") {
    
    # DataONE and the R 'digest' package have different abbreviations for checksum algorithm designations.
    # Take a DataONE abbreviation and return the corresponding R 'digest' abbreviation
    
    if (tolower(checksumAlgorithm) == "md5") {
        abbr="md5"
    }
    else if (tolower(checksumAlgorithm) == "sha1") {
        abbr="sha1"
    }
    else if (tolower(checksumAlgorithm) == "sha-1") {
        abbr="sha1"
    }
    else if (tolower(checksumAlgorithm) == "sha256") {
        abbr="sha256"
    }
    else if (tolower(checksumAlgorithm) == "sha-256") {
        abbr="sha256"
    } else {
        warning(sprintf("Unknown checksum algorithm %s", checksumAlgorithm))
    }
    
    return(abbr) 
}

#' Calculate a checksum for the DataObject using the specified checksum algorithm
#' @description calculates a checksum
#' @param x A DataObject instance
#' @param ... Additional parameters (not yet used)
#' @note this method is intended for internal package use only.
#' @return The calculated checksum
setGeneric("calculateChecksum", function(x, ...) {
    standardGeneric("calculateChecksum")
})

#' @rdname calculateChecksum
#' @param checksumAlgorithm a \code{character} value specifying the checksum algorithm to use (i.e "MD5" or "SHA1" or "SHA256")
setMethod("calculateChecksum", signature("DataObject"), function(x, checksumAlgorithm="SHA256", ...) {
    abbr <- getChecksumAlgorithmAbbreviation(checksumAlgorithm)
    
    if(!is.na(x@dataURL)) {
        if (tolower(checksumAlgorithm) == x@sysmeta@checksumAlgorithm) {
            checksum <- x@sysmeta@checksum
        } else {
            warning("Unable to calculate checksum for DataObject without local content.")
        }
    } else if (length(x@data) > 0) {
        checksum <- digest(x@data, algo=abbr, serialize=FALSE, file=FALSE)
    } else if (!is.na(x@filename)) {
        checksum <- digest(x@filename, algo=abbr, serialize=FALSE, file=TRUE)
    } else {
        warning("DataObject does not contain data or a data URL.")
    }
    
    return(checksum)
    
})
          

Try the datapack package in your browser

Any scripts or data that you put into this service are public.

datapack documentation built on June 11, 2022, 1:05 a.m.