tests/testthat/test_DataPackage.R

test_that("datapack library loads", {
    expect_true(library(datapack, logical.return = TRUE))
})

test_that("datapack initialization works", {
  # Test that contructor bug is fix, i.e. internal structures being
  # reused in newly created objects: https://github.com/ropensci/datapack/issues/26)
  library(datapack)
  testdf <- data.frame(x=1:10,y=11:20)
  csvfile <- tempfile(pattern = "file", tmpdir = tempdir(), fileext = ".csv")
  write.csv(testdf, csvfile, row.names=FALSE)
  mnId <- "urn:node:mnSandboxUCSB2"
  dp <- new("DataPackage")
  sciObj <- new("DataObject", format="text/csv", user="uid=slaughter,ou=Account,dc=ecoinformatics,dc=org", mnNodeId=mnId, filename=csvfile)
  dp <- addMember(dp, sciObj)
  ids <- getIdentifiers(dp)
  expect_equal(length(ids), 1)
  rm(dp)
  newDP <- new("DataPackage")
  ids <- getIdentifiers(newDP)
  expect_equal(length(ids), 0)
})

test_that("datapack methods work", {
    library(datapack)
    id1 <- "id1"
    id2 <- "id2"
    user <- "matt"
    data <- charToRaw("1,2,3\n4,5,6")
    format <- "text/csv"
    node <- "urn:node:KNB"
    
    dpkg <- new("DataPackage")
    expect_equal(class(dpkg)[[1]], "DataPackage")
    expect_equal(getSize(dpkg), 0)
    do <- new("DataObject", id1, data, format, user, node)
    dpkg <- addMember(dpkg, do)
    expect_equal(getSize(dpkg), 1)
    expect_equal(getIdentifiers(dpkg)[[1]], id1)
    expect_true(containsId(dpkg, id1))
    expect_equal(containsId(dpkg, "bad_id"), FALSE)
    rdata <- getData(dpkg, id1)
    expect_equal(length(rdata), length(data))
    nodata <- getData(dpkg, "bad_id")
    expect_null(nodata)
    rdo <- getMember(dpkg, id1)
    expect_match(class(do), "DataObject")
    expect_match(getIdentifier(do), id1)
    do2 <- new("DataObject", id2, data, format, user, node)
    dpkg <- addMember(dpkg, do2)
    expect_equal(getSize(dpkg), 2)
    expect_equal(getIdentifiers(dpkg)[[1]], id1)
    expect_equal(getIdentifiers(dpkg)[[2]], id2)
    dpkg <- removeMember(dpkg, id1)
    expect_equal(getSize(dpkg), 1)
    expect_false(containsId(dpkg, id1))
    rm(do)
    rm(do2)
    rm(dpkg)
    rm(rdo)
    rm(rdata)
    
    # Test that a DataObject with a file path gets the file path placed in the resource map
    targetPath <- "./data/raster/data.tiff"
    newId <- "testID"
    do <- new("DataObject", newId, data, format, user, node, targetPath=targetPath)

    dpkg <- new("DataPackage")
    dpkg <- addMember(dpkg, do)
    relations <- getRelationships(dpkg)
    expect_match(relations[relations$subject == newId, 'predicate'], "atLocation")
    rm(do)
    rm(dpkg)
    
    # Test if we can associate a science object with a metadata object without calling
    # insertRelationships directly
    dpkg <- new("DataPackage")
    mdId <- "md1"
    md <- new("DataObject", mdId, data, format, user, node)
    # The 'mdId' parameter indicates that this is a metadata object that is
    # to be associated with the 'id1' parameter
    #dpkg <- addMember(dpkg, md)
    do <- new("DataObject", id1, data, format, user, node)
    dpkg <- addMember(dpkg, do, md)
    expect_equal(getSize(dpkg), 2)
    expect_true(containsId(dpkg, id1))
    expect_true(containsId(dpkg, mdId))
    
    # Test that the addMember() function adds the 'documents' relationship if a metadata object is 
    # specified
    relations <- getRelationships(dpkg)
    expect_match(relations[relations$subject == id1, 'predicate'], "isDocumentedBy")
    expect_equal(relations[relations$subject == id1, 'object'], mdId)
    expect_match(relations[relations$subject == mdId, 'predicate'], "documents")
    expect_equal(relations[relations$subject == mdId, 'object'], id1)
})

test_that("InsertRelationship methods work", {
  
  quietOn <- TRUE
  # Test the 'insertRelationships' method that uses the hardwired 'documents', 'isDocumentedBy' relationship
  dp <- new("DataPackage")
  doId1 <- "id1"
  doId2 <- "id2" 
  user <- "smith"
  data <- charToRaw("1,2,3\n4,5,6")
  format <- "text/csv"
  node <- "urn:node:KNB"
  do1 <- new("DataObject", id=doId1, data, format, user, node)
  do2 <- new("DataObject", id=doId2, data, format, user, node)
  mdId <- "md1"
  md1 <- new("DataObject", id=mdId, data, format="eml://ecoinformatics.org/eml-2.1.1", user, node)
  dp <- addMember(dp, do1)
  dp <- addMember(dp, do2)
  
  dp <- insertRelationship(dp, subjectID=mdId, objectIDs=c(doId1, doId2))
  relations <- getRelationships(dp, quiet=quietOn)
  # Test if the data frame with relationships was constructed correctly
  expect_equal(nrow(relations), 4)
  expect_equal(relations[relations$object == doId1, 'subject'], mdId)
  expect_equal(relations[relations$object == doId2, 'subject'], mdId)
  expect_match(relations[relations$subject == mdId, 'predicate'], 'documents')
  expect_match(relations[relations$subject == doId1, 'predicate'], 'isDocumentedBy')
  rm(dp)
  
  # Now test the second 'insertRelationships' that allows specifying the predicate of the relationship
  dp <- new("DataPackage")
  doId1 <- "id1"
  doId2 <- "id2" 
  user <- "smith"
  data <- charToRaw("1,2,3\n4,5,6")
  format <- "text/csv"
  do1 <- new("DataObject", id=doId1, data, format)
  do2 <- new("DataObject", id=doId2, data, format, user, node)
  dp <- addMember(dp, do1)
  dp <- addMember(dp, do2)
  
  # Test invalid argument values
  err <- try(dp <- insertRelationship(dp, subjectID=doId1, objectIDs=doId2, predicate="http://www.w3.org/ns/prov#wasDerivedFrom", subjectType='literal'), silent=TRUE)
  expect_match(class(err), ("try-error"))
  err <- try(dp <- insertRelationship(dp, subjectID=doId1, objectIDs=doId2, predicate="http://www.w3.org/ns/prov#wasDerivedFrom", objectType='foo'), silent=TRUE)
  expect_match(class(err), ("try-error"))
  
  nrel <- 1
  # Insert a typical provenance relationship
  dp <- insertRelationship(dp, subjectID=doId1, objectIDs=doId2, predicate="http://www.w3.org/ns/prov#wasDerivedFrom")
  # Test if the data frame with retrieved relationships was constructed correctly
  relations <- getRelationships(dp, quiet=quietOn)
  expect_equal(nrow(relations), nrel)
  expect_match(relations[relations$subject == doId1, 'predicate'], "wasDerivedFrom")
  expect_equal(relations[relations$subject == doId1, 'object'], doId2)
  expect_equal(relations[relations$subject == doId1, 'subjectType'], as.character(NA))
  expect_equal(relations[relations$subject == doId1, 'objectType'], as.character(NA))
  
  # Test assingment of subjectType, objectType
  dp <- insertRelationship(dp, subjectID="orcid.org/0000-0002-2192-403X", objectIDs="http://www.example.com/home", predicate="http://www.example.com/hadHome",
                     subjectType="uri", objectType="literal")  
  relations <- getRelationships(dp, quiet=quietOn)
  expect_equal(nrow(relations), nrel <- nrel + 1)
  expect_match(relations[relations$subject == "orcid.org/0000-0002-2192-403X", 'predicate'], "hadHome")
  expect_match(relations[relations$subject == "orcid.org/0000-0002-2192-403X", 'object'], "www.example.com/home")
  expect_equal(relations[relations$subject == "orcid.org/0000-0002-2192-403X", 'subjectType'], "uri")
  expect_equal(relations[relations$subject == "orcid.org/0000-0002-2192-403X", 'objectType'], "literal")
  
  # Test that an unspecified objectType (where length(objetIDs) > length(objectTypes)) is set to as.character(NA)
  dp <- insertRelationship(dp, subjectID="urn:uuid:6186b15c-0a4b-4338-a091-1ea68d0fb72d", objectIDs=c("thing1", "thing2", "thing3"), predicate="http://www.myns.org/hadThing", subjectType="blank",
                     objectTypes=c("literal", "literal"))
  relations <- getRelationships(dp, quiet=quietOn)
  expect_equal(nrow(relations), nrel<-nrel + 3)
  expect_match(relations[relations$object == "thing1", 'predicate'], "hadThing")
  expect_match(relations[relations$object == "thing1", 'subject'], "urn:uuid:6186b15c-0a4b-4338-a091-1ea68d0fb72d")
  expect_equal(relations[relations$object == "thing1", 'subjectType'], "blank")
  expect_equal(relations[relations$object == "thing1", 'objectType'], "literal")
  expect_equal(relations[relations$object == "thing2", 'subjectType'], "blank")
  expect_equal(relations[relations$object == "thing2", 'objectType'], "literal")
  expect_equal(relations[relations$object == "thing3", 'objectType'], as.character(NA))
  
  # Multiple objectTypes, first one 'NA'
  dp <- insertRelationship(dp, subjectID="urn:uuid:ea00e863-861b-4253-9ed5-1c0568ee2373", objectIDs=c("thing4", "thing5"), predicate="http://www.myns.org/hadThing", subjectType="blank",
                     objectTypes=c(NA, "literal"))
  relations <- getRelationships(dp, quiet=quietOn)
  expect_equal(nrow(relations), nrel<-nrel + 2)
  expect_equal(relations[relations$object == "thing4", 'objectType'], as.character(NA))
  expect_equal(relations[relations$object == "thing5", 'objectType'], "literal")
  
  # Subject passed in as NA, which means create an "anonymous" blank node for these (software assigns the blank node id, not user)
  dp <- insertRelationship(dp, subjectID=as.character(NA), objectIDs="thing6", predicate="http://www.myns.org/wasThing", objectTypes="literal")
  relations <- getRelationships(dp, quiet=quietOn)
  expect_equal(nrow(relations), nrel<-nrel + 1)
  expect_match(relations[relations$object == "thing6", 'predicate'], "wasThing")
  expect_equal(relations[relations$object == "thing6", 'subjectType'], "blank")
  
  # No objectID specified
  dp <- insertRelationship(dp, subjectID="urn:uuid:5743f16c-e038-4ef2-bcca-0418ff631a34", objectIDs=as.character(NA), predicate="http://www.myns.org/gaveThing")
  relations <- getRelationships(dp, quiet=quietOn)
  expect_equal(nrow(relations), nrel <- nrel + 1)
  expect_match(relations[relations$subject == "urn:uuid:5743f16c-e038-4ef2-bcca-0418ff631a34", 'predicate'], "gaveThing")
  expect_equal(relations[relations$subject == "urn:uuid:5743f16c-e038-4ef2-bcca-0418ff631a34", 'objectType'], "blank")
  
  # Specify dataTypeURIs
  dp <- insertRelationship(dp, subjectID="urn:uuid:abcd", objectIDs="Wed Mar 18 06:26:44 PDT 2015", 
                     predicate="http://www.w3.org/ns/prov#startedAt", subjectType="uri", 
                     objectTypes="literal",
                     dataTypeURIs="http://www.w3.org/2001/XMLSchema#string")
  relations <- getRelationships(dp, quiet=quietOn)
  expect_equal(nrow(relations), nrel <- nrel + 1)
  expect_match(relations[relations$subject == "urn:uuid:abcd", 'dataTypeURI'], "string")
  expect_equal(relations[relations$subject == "urn:uuid:abcd", 'subjectType'], "uri")
  expect_equal(relations[relations$subject == "urn:uuid:abcd", 'objectType'], "literal")
  expect_match(relations[relations$subject == "urn:uuid:abcd", 'predicate'], "startedAt")

  rm(dp)
  dp <- new("DataPackage")
  # Insert derivation relationships
  source <- "https://cn.dataone.org/cn/v1/object/doi:1234/_030MXTI009R00_20030812.40.1"
  derived <- "https://cn.dataone.org/cn/v1/object/doi:1234/_030MXTI009R00_20030812.45.1"
  # Keep recordDerivations in unit tests until v 1.3.0, when it may be marked as defunct
  suppressWarnings(dp <- recordDerivation(dp, sourceID=source, derivedIDs=derived))
  relations <- getRelationships(dp, quiet=quietOn)
  
  # Test if the data frame with retrieved relationships was constructed correctly
  expect_equal(nrow(relations), 3)
  expect_equal(nrow(relations[relations$object == datapack:::provONEdata,]), 2)
  expect_match(relations[relations$predicate == datapack:::provWasDerivedFrom, 'subject'], derived)
  expect_equal(relations[relations$predicate == datapack:::provWasDerivedFrom, 'object'], source)
})

test_that("removeRelationships method works", {
    
    dp <- new("DataPackage")
    # Create a relationship
    dp <- insertRelationship(dp, "/Users/smith/scripts/genFields.R",
                             "https://knb.org/data_20030812.40.1",
                             "http://www.w3.org/ns/prov#used")
    # Create a relationshp with the subject as a blank node with an automatically assigned blank 
    # node id
    dp <- insertRelationship(dp, subjectID=NA_character_, objectIDs="thing6", 
                             predicate="http://myns.org/wasThing")
    # Create a relationshp with the subject as a blank node with a user assigned blank node id
    dp <- insertRelationship(dp, subjectID="urn:uuid:bc9e160e-ca21-47d5-871b-4a4820fe4451", 
                             objectIDs="thing7", predicate="http://myns.org/hadThing")
    # Create multiple relationships with the same subject, predicate, but different objects
    dp <- insertRelationship(dp, subjectID="https://myns.org/subject1", 
                             objectIDs=c("thing4", "thing5"), predicate="http://myns.org/hadThing")
    # Create multiple relationships with subject and object types specified
    dp <- insertRelationship(dp, subjectID="orcid.org/0000-0002-2192-403X", 
                             objectIDs="http://www.example.com/home", predicate="http://myns.org/hadHome",
                             subjectType="uri", objectType="literal")
    
    # Now check if deleting various relationships results in the right number of rows
    expect_equal(nrow(getRelationships(dp)), 6)
    dp <- removeRelationships(dp, predicate='http://myns.org/wasThing')
    expect_equal(nrow(getRelationships(dp)), 5)
    dp <- removeRelationships(dp, subjectID='orcid.org/0000-0002-2192-403X')
    expect_equal(nrow(getRelationships(dp)), 4)
    dp <- removeRelationships(dp, subjectID='https://myns.org/subject1', predicate='http://myns.org/hadThing')
    expect_equal(nrow(getRelationships(dp)), 2)
    dp <- removeRelationships(dp)
    expect_equal(nrow(getRelationships(dp)), 0)
    
})

test_that("Package serialization works", {
  
  library(uuid)
  dp <- new("DataPackage")
  mdId <- "scimeta_id"
  doInId <- "scidataId"
  doOutId <- paste0("urn:uuid:", UUIDgenerate())
  executionId <- "execution1"
  
  user <- "smith"
  data <- charToRaw("1,2,3\n4,5,6")
  format <- "text/csv"
  node <- "urn:node:KNB"
  md1 <- new("DataObject", id=mdId, data, format="eml://ecoinformatics.org/eml-2.1.1", user, node)
  doIn <- new("DataObject", id=doInId, data, format, user, node)
  doOut <- new("DataObject", id=doOutId, data, format, user, node)
  dp <- addMember(dp, md1)
  expect_match(class(dp), "DataPackage")
  dp <- addMember(dp, doIn)
  dp <- addMember(dp, doOut)
  
  # Insert metadata document <-> relationships
  dp <- insertRelationship(dp, subjectID=mdId, objectIDs=c(doOutId))
  expect_match(class(dp), "DataPackage")
  
  # Insert a typical provenance relationship
  dp <- insertRelationship(dp, subjectID=doOutId, objectIDs=doInId, predicate="http://www.w3.org/ns/prov#wasDerivedFrom")
  dp <- insertRelationship(dp, subjectID=executionId, objectIDs=doInId, predicate="http://www.w3.org/ns/prov#used")
  dp <- insertRelationship(dp, subjectID=doOutId, objectIDs=executionId, predicate="http://www.w3.org/ns/prov#wasGeneratedBy")
  dp <- insertRelationship(dp, subjectID="urn:uuid:abcd", objectIDs="Wed Mar 18 06:26:44 PDT 2015", 
                     predicate="http://www.w3.org/ns/prov#startedAt", subjectType="uri", 
                     objectTypes="literal",
                     dataTypeURIs="http://www.w3.org/2001/XMLSchema#string")
  
  # Serialize the ResourceMap to a file.
  serializationId <- sprintf("%s%s", "resourceMap1", UUIDgenerate())
  filePath <- file.path(sprintf("%s/%s.rdf", tempdir(), serializationId))
  status <- serializePackage(dp, filePath, id=serializationId)
  expect_true(file.exists(filePath))
  found <- grep("wasDerivedFrom", readLines(filePath))
  expect_gt(found, 0)
  unlink(filePath)
  
  # Use R serialize/unserialize functions on entire datapackage
  dpFile <- tempfile(pattern = "file", tmpdir = tempdir(), fileext = ".rds")
  saveRDS(dp, file=dpFile)
  dpNew <-readRDS(file=dpFile)
  # Test that the deserialized object is the same as the original. For some
  # reason, comparing the entire datapackage objects fails, so test
  # the individual components
  # First compare ids
  dpIds <- getIdentifiers(dp)
  dpNewIds <- getIdentifiers(dpNew)
  expect_true(identical(dpIds, dpNewIds, ignore.environment = TRUE))
  # Compare relationships
  dpRelations <- getRelationships(dp)
  dpNewRelations <-getRelationships(dpNew)
  expect_true(identical(dpRelations, dpNewRelations, ignore.environment = TRUE))
  # Compare each data object
  for (id in getIdentifiers(dp)) {
    expect_true(identical(getData(dp, id), getData(dpNew, id)))
  }
  unlink(dpFile)
  
})

test_that("Package serialization works with minimal DataPackage", {
    
    # Test methods with a DataPackage that has only one member and no package relationships
    library(uuid)
    dp <- new("DataPackage")
    mdId <- paste("urn:uuid:", UUIDgenerate(), sep="")
    data <- charToRaw("1,2,3\n4,5,6")
    format <- "text/csv"
    md1 <- new("DataObject", id=mdId, data, format="eml://ecoinformatics.org/eml-2.1.1")
    dp <- addMember(dp, md1)
    expect_match(class(dp), "DataPackage")
    
    # Serialize the ResourceMap to a file.
    serializationId <- sprintf("%s%s", "resourceMap1", UUIDgenerate())
    filePath <- file.path(sprintf("%s/%s.rdf", tempdir(), serializationId))
    status <- serializePackage(dp, filePath, id=serializationId)
    expect_true(file.exists(filePath))
    found <- grep(mdId, readLines(filePath))
    expect_gt(found, 0)
    unlink(filePath)
    # Compare relationships
    rels <- getRelationships(dp)
    expect_equal(nrow(rels), 0)
})

test_that("BagIt serialization works", {
  
  library(uuid)
  library(datapack)
  # Test BagIt serializatin of a DataPackage
  # Test bagging with both in-memory DataObjects and file baseed
  testdf <- data.frame(x=1:10,y=11:20)
  csvfile <- tempfile(pattern = "file", tmpdir = tempdir(), fileext = ".csv")
  write.csv(testdf, csvfile, row.names=FALSE)
  someEML <-'"<eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:ds="eml://ecoinformatics.org/dataset-2.1.1" 
    xmlns:xs="http://www.w3.org/2001/XMLSchema" 
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
    xmlns:stmml="http://www.xml-cml.org/schema/stmml_1.1" 
    packageId="urn:uuid:439c8d9d-f0ed-4f47-8cac-275d0d64bafd" system="uuid">
    <dataset scope="document">
      <title>My package title</title>
      <creator>
        <individualName>
          <givenName>Smith</givenName>
          <surName>John</surName>
        </individualName>
        <electronicMailAddress>jsmith@example.com</electronicMailAddress>
      </creator>
      <pubDate>2015-10-14</pubDate>
    </dataset>
  </eml:eml>"'
  
  dp <- new("DataPackage")

  emlId <- paste0("urn:uuid:", UUIDgenerate())
  secondEmlId <- paste0("urn:uuid:", UUIDgenerate())
  thirdEmlId <- paste0("urn:uuid:", UUIDgenerate())
  firstDataFileId <- paste0("urn:uuid:", UUIDgenerate())
  extraScienceMetadataId <- paste0("urn:uuid:", UUIDgenerate())
  secondDataFileId <- paste0("urn:uuid:", UUIDgenerate())
  thirdDataFileId <- paste0("urn:uuid:", UUIDgenerate())
  fourthDataFileId <- paste0("urn:uuid:", UUIDgenerate())
  fifthDataFileId <- paste0("urn:uuid:", UUIDgenerate())

  packageFileIds = list(emlId, secondEmlId, thirdEmlId, firstDataFileId,
                        extraScienceMetadataId, secondDataFileId,
                        thirdDataFileId, fourthDataFileId, fifthDataFileId)
  user <- "smith"
  data <- charToRaw("1,2,3\n4,5,6")
  firstFilePath = "my_data/textFile1.csv"
  secondFilePath = "my_data/textFile2.csv"
  thirdFilePath = "more-data\\textFile3.csv"
  fifthFilePath = "misc_data/data.csv"

  
  node <- "urn:node:KNB"

  emlDataObject <- new("DataObject", id=emlId, dataobj=charToRaw(someEML), format="eml://ecoinformatics.org/eml-2.1.1", user=user, mnNodeId=node)
  secondEmlDataObject <- new("DataObject", id=secondEmlId, dataobj=charToRaw(someEML), format="eml://ecoinformatics.org/eml-2.1.1", user=user, mnNodeId=node)
  # An unused EML file, used to test that it gets put in ./data/
  thirdEmlDataObject <- new("DataObject", id=thirdEmlId, dataobj=charToRaw(someEML), format="eml://ecoinformatics.org/eml-2.1.1", user=user, mnNodeId=node)
  # Content Standard for Digital Geospatial Metadata, Biological Data Profile, version 001.1-1999 Science Metatdata file
  extraScienceMetadata <- new("DataObject", id=extraScienceMetadataId, dataobj=charToRaw(someEML), format="FGDC-STD-001.1-1999", user=user, mnNodeId=node)
  thirdEmlDataObject <- new("DataObject", id=thirdEmlId, dataobj=charToRaw(someEML), format="http://www.loc.gov/METS/", user=user, mnNodeId=node)
  firstDataFile <- new("DataObject", id=firstDataFileId, dataobj=data, format="text/csv", user=user, mnNodeId=node, targetPath=firstFilePath)
  secondDataFile <- new("DataObject", id=secondDataFileId, filename=csvfile, format="text/csv", user=user, mnNodeId=node, targetPath=secondFilePath)
  thirdDataFile <- new("DataObject", id=thirdDataFileId, filename=csvfile, format="text/csv", user=user, mnNodeId=node, targetPath=thirdFilePath)
  fourthDataFile <- new("DataObject", id=fourthDataFileId, filename=csvfile, format="text/csv", user=user, mnNodeId=node)
  # Is described by the FGDC metadata file
  fifthDataFile <- new("DataObject", id=fifthDataFileId, dataobj=data, format="text/csv", user=user, mnNodeId=node, targetPath=fifthFilePath)

  # Add all of the EML and data objects
  dp <- addMember(dp, emlDataObject)
  dp <- addMember(dp, secondEmlDataObject)
  dp <- addMember(dp, thirdEmlDataObject)
  dp <- addMember(dp, extraScienceMetadata)
  dp <- addMember(dp, firstDataFile, mo=emlDataObject)
  dp <- addMember(dp, secondDataFile, mo=secondEmlDataObject)
  dp <- addMember(dp, thirdDataFile)
  dp <- addMember(dp, fourthDataFile)
  dp <- addMember(dp, fifthDataFile, mo=extraScienceMetadata)

  # Make note of which data files have a path set
  dataFilesWithPath <- list(firstFilePath, secondFilePath, thirdFilePath, fifthFilePath)

  bagitFile <- suppressMessages(serializeToBagIt(dp))
  # Basic sanity checks
  expect_true(file.exists(bagitFile))
  expect_true(file.info(bagitFile)[['size']] > 0)

  zipFileNames <- zip::zip_list(bagitFile)
  zipFileNames <- as.list(zipFileNames[[1]])
  # Check that the required tag files are present
  requiredBagFiles <- list("bagit.txt", "bag-info.txt", "manifest-md5.txt", "tagmanifest-md5.txt")

  expect_true(all(requiredBagFiles %in% zipFileNames))
 

  # Check that the data files are in the manifest
  manifestData <- read.table(unz(bagitFile, "manifest-md5.txt"), nrows=10, header=F, quote="\"", sep=" ")
  # Get the second column (file names)
  manifestData <- manifestData[[2]]
  dataFilesWithPath <- lapply(dataFilesWithPath, getPlatformPath)
  
  expect_true(all(file.path("data",dataFilesWithPath) %in% zipFileNames))
  expect_true(all(file.path("data",dataFilesWithPath) %in% manifestData))
 

  # Check that the system metata are present
  # The Data Objects should have system metadata assigned to them.
  # We want the IDs of those for testing
  tagManifestData <- read.table(unz(bagitFile, "tagmanifest-md5.txt"), header=F, quote="\"", sep=" ")
  tagManifestData <- tagManifestData[[2]]
  
  for (packageFileId in packageFileIds) {
      updatedObject <- getMember(dp, packageFileId)
      # Remember to sanitize the file name
      sysmetaName <- getPlatformPath(updatedObject@sysmeta@identifier)
      sysmetaName <- gsub("./", "", sysmetaName)
      fullPath <- file.path("metadata", "sysmeta", paste("sysmeta-", sysmetaName, ".xml", sep=""))
      expect_true(fullPath %in% zipFileNames )
      expect_true(fullPath %in% tagManifestData )
  }


  # Check that the rdf is present
  expect_true(file.path("metadata", "oai-ore.xml") %in% zipFileNames)
  # Check that the EML documents are present
  expect_true(file.path("metadata", "eml.xml") %in% zipFileNames)
  expect_true(file.path("metadata", "science-metadata.xml") %in% zipFileNames)

 # Enable these tests when targetPath will be reflected into the BagIt structure (v1.5.0?)
 #zipFileNames = unzip(bagitFile, list=TRUE)$Name
 #expect_true(paste("data/",doInFilePath, sep="") %in% zipFileNames )
 #expect_true(paste("data/", doOutFilePath, sep="") %in% zipFileNames)o
 
 # Check that the 'atLocation' relationships were added correctly by 'serializeToBagIt()'
 relationships <- getRelationships(dp)
 expect_equal(subset(relationships, subject==firstDataFileId & predicate == provAtLocation, select=object)[1,'object'], firstFilePath)
 expect_equal(subset(relationships, subject==secondDataFileId & predicate == provAtLocation, select=object)[1,'object'], secondFilePath)
 
 
}) 


test_that("Adding provenance relationships to a DataPackage via describeWorkflow works", {
    
    library(datapack)
    # 
    # Test describeWorkflow using DataObject ids
    dp <- new("DataPackage")
    inIds <- list()
    outIds <- list()
    # Add the program object to the package
    doProg <- new("DataObject", format="application/R", 
                  filename=system.file("./extdata/pkg-example/logit-regression-example.R", package="datapack"))
    progId <- getIdentifier(doProg)
    dp <- addMember(dp, doProg)
    # Add the input object to the package
    doIn <- new("DataObject", format="text/csv", 
                filename=system.file("./extdata/pkg-example/binary.csv", package="datapack"))
    inIds[[length(inIds)+1]] <- getIdentifier(doIn)
    dp <- addMember(dp, doIn)
    # Add the output object to the package
    doOut <- new("DataObject", format="image/png", 
                 filename=system.file("./extdata/pkg-example/gre-predicted.png", package="datapack"))
    outIds[[length(outIds)+1]] <- getIdentifier(doOut)
    dp <- addMember(dp, doOut)
    # Add the provenenace relationships for the script execution, using ids for 'sources, program, derivation' arguments
    dp <- describeWorkflow(dp, sources=inIds, program=progId, derivations=outIds)
    # Test if the data frame with retrieved relationships was constructed correctly
    relations <- getRelationships(dp, quiet=quietOn)
    execId <- relations[relations$object == datapack:::provONEexecution,'subject']
    expect_match(relations[relations$predicate == datapack:::provWasGeneratedBy,'subject'], getIdentifier(doOut))
    expect_match(relations[relations$predicate == datapack:::provUsed,'object'], getIdentifier(doIn))
    expect_match(relations[relations$predicate == datapack:::provWasGeneratedBy,'object'], execId)
    expect_match(relations[relations$predicate == datapack:::provUsed,'subject'], execId)
    expect_match(relations[relations$predicate == datapack:::provHadPlan,'object'], progId)
    expect_match(relations[relations$predicate == datapack:::provWasDerivedFrom,'subject'], getIdentifier(doOut))
    expect_match(relations[relations$predicate == datapack:::provWasDerivedFrom,'object'], getIdentifier(doIn))
   
    # 
    # Now do the same test passing DataObjects to describeWorkflow
    rm(dp)
    dp <- new("DataPackage")
    inputs <- list()
    outputs <- list()
    doProg <- new("DataObject", format="application/R", 
                  filename=system.file("./extdata/pkg-example/logit-regression-example.R", package="datapack"))
    dp <- addMember(dp, doProg)
    progId <- getIdentifier(doProg)
    doIn <- new("DataObject", format="text/csv", 
                filename=system.file("./extdata/pkg-example/binary.csv", package="datapack"))
    dp <- addMember(dp, doIn)
    inputs[[length(inputs)+1]] <- doIn
    doOut <- new("DataObject", format="image/png", 
                 filename=system.file("./extdata/pkg-example/gre-predicted.png", package="datapack"))
    dp <- addMember(dp, doOut)
    outputs[[length(outputs)+1]] <- doOut
    dp <- describeWorkflow(dp, sources=inputs, program=doProg, derivations=outputs)
    relations <- getRelationships(dp, quiet=quietOn)
    execId <- relations[relations$object == datapack:::provONEexecution,'subject']
    expect_match(relations[relations$predicate == datapack:::provWasGeneratedBy,'subject'], getIdentifier(doOut))
    expect_match(relations[relations$predicate == datapack:::provUsed,'object'], getIdentifier(doIn))
    expect_match(relations[relations$predicate == datapack:::provWasGeneratedBy,'object'], execId)
    expect_match(relations[relations$predicate == datapack:::provUsed,'subject'], execId)
    expect_match(relations[relations$predicate == datapack:::provHadPlan,'object'], progId) 
    expect_match(relations[relations$predicate == datapack:::provWasDerivedFrom,'subject'], getIdentifier(doOut))
    expect_match(relations[relations$predicate == datapack:::provWasDerivedFrom,'object'], getIdentifier(doIn))
    
    #
    # Now do the same test without the R script, so that only 'prov:wasDerivedFrom' between inputs
    # and outputs will be inserted.
    rm(dp)
    dp <- new("DataPackage")
    inputs <- list()
    outputs <- list()
    doIn <- new("DataObject", format="text/csv", 
                filename=system.file("./extdata/pkg-example/binary.csv", package="datapack"))
    dp <- addMember(dp, doIn)
    inputs[[length(inputs)+1]] <- doIn
    doOut <- new("DataObject", format="image/png", 
                 filename=system.file("./extdata/pkg-example/gre-predicted.png", package="datapack"))
    dp <- addMember(dp, doOut)
    outputs[[length(outputs)+1]] <- doOut
    dp <- describeWorkflow(dp, sources=inputs, derivations=outputs)
    relations <- getRelationships(dp, quiet=quietOn)
    expect_match(relations[relations$predicate == datapack:::provWasDerivedFrom,'subject'], getIdentifier(doOut))
    expect_match(relations[relations$predicate == datapack:::provWasDerivedFrom,'object'], getIdentifier(doIn))
    
    #
    # Now do the same test without the inputs so that 'prov:used' are not inserted, but other
    # relationships are.
    rm(dp)
    dp <- new("DataPackage")
    inputs <- list()
    outputs <- list()
    doProg <- new("DataObject", format="application/R", 
                  filename=system.file("./extdata/pkg-example/logit-regression-example.R", package="datapack"))
    dp <- addMember(dp, doProg)
    progId <- getIdentifier(doProg)
    doOut <- new("DataObject", format="image/png", 
                 filename=system.file("./extdata/pkg-example/gre-predicted.png", package="datapack"))
    dp <- addMember(dp, doOut)
    outputs[[length(outputs)+1]] <- doOut
    dp <- describeWorkflow(dp, program=doProg, derivations=outputs)
    relations <- getRelationships(dp, quiet=quietOn)
    execId <- relations[relations$object == datapack:::provONEexecution,'subject']
    expect_match(relations[relations$predicate == datapack:::provWasGeneratedBy,'subject'], getIdentifier(doOut))
    expect_match(relations[relations$predicate == datapack:::provWasGeneratedBy,'object'], execId)
    expect_match(relations[relations$predicate == datapack:::provHadPlan,'object'], progId) 
    
    #
    # Now do the same test without the outputs so that 'prov:generqtedBy' are not inserted, but other
    # relationships are.
    rm(dp)
    dp <- new("DataPackage")
    inputs <- list()
    doProg <- new("DataObject", format="application/R", 
                  filename=system.file("./extdata/pkg-example/logit-regression-example.R", package="datapack"))
    dp <- addMember(dp, doProg)
    progId <- getIdentifier(doProg)

    doIn <- new("DataObject", format="text/csv", 
                filename=system.file("./extdata/pkg-example/binary.csv", package="datapack"))
    dp <- addMember(dp, doIn)
    inputs[[length(inputs)+1]] <- doIn
    dp <- describeWorkflow(dp, sources=inputs, program=doProg)
    relations <- getRelationships(dp, quiet=quietOn)
    expect_match(relations[relations$subject == getIdentifier(doIn),'object'], 'Data')
    execId <- relations[relations$object == datapack:::provONEexecution,'subject']
    expect_match(relations[relations$predicate == datapack:::provUsed,'object'], getIdentifier(doIn))
    expect_match(relations[relations$predicate == datapack:::provUsed,'subject'], execId)
    expect_match(relations[relations$predicate == datapack:::provHadPlan,'object'], progId) 
    
    # 
    # Test if passing only inputs fails
    err <- try(dp <- describeWorkflow(dp, sources=inputs), silent=TRUE)
    expect_match(class(err), "try-error")
    # Test if passing only outputs fails
    err <- try(dp <- describeWorkflow(dp, derivations=outputs), silent=TRUE)
    expect_match(class(err), "try-error")
    # Test if passing only program fails
    err <- try(dp <- describeWorkflow(dp, program=doProg), silent=TRUE)
    expect_match(class(err), "try-error")
    
    
    relations <- getRelationships(dp, quiet=quietOn)
    expect_match(relations[relations$subject == getIdentifier(doIn),'object'], 'Data')
    execId <- relations[relations$object == datapack:::provONEexecution,'subject']
    expect_match(relations[relations$predicate == datapack:::provUsed,'object'], getIdentifier(doIn))
    expect_match(relations[relations$predicate == datapack:::provUsed,'subject'], execId)
    expect_match(relations[relations$predicate == datapack:::provHadPlan,'object'], progId) 
    
    # Test removing a DataObject from the package - make sure that all the package relationships that had
    # the DataObject as a subject or object have been removed. This includes any relationship, i.e. cito:*
    # and prov:*.
    dp <- removeMember(dp, doIn, removeRelationships=TRUE)
    rels <- relations[relations$predicate == datapack:::provUsed,'object']
    
    
    # Test prov:wasDerivedFrom with pids that are not package members
    rm(dp)
    dp <- new("DataPackage")
    # Insert derivation relationships
    source <- "https://cn.dataone.org/cn/v1/object/doi:1234/_030MXTI009R00_20030812.40.1"
    derived <- "https://cn.dataone.org/cn/v1/object/doi:1234/_030MXTI009R00_20030812.45.1"
    # Keep recordDerivation in unit tests until v 1.3.0, when it may be marked as defunct
    dp <- suppressWarnings(recordDerivation(dp, sourceID=source, derivedIDs=derived))
    relations <- getRelationships(dp, quiet=quietOn)
    
    # Test if the data frame with retrieved relationships was constructed correctly
    expect_equal(nrow(relations), 3)
    expect_equal(nrow(relations[relations$object == datapack:::provONEdata,]), 2)
    expect_equal(relations[relations$predicate == datapack:::provWasDerivedFrom, 'subject'], derived)
    expect_equal(relations[relations$predicate == datapack:::provWasDerivedFrom, 'object'], source)
})

test_that("removeMember works", {
    library(datapack)
    #
    # Now do the same test without the outputs so that 'prov:generqtedBy' are not inserted, but other
    # relationships are.
    dp <- new("DataPackage")
    inputs <- list()
    regressionPath <- "./extdata/pkg-example/logit-regression-example.R"
    doProg <- new("DataObject", format="application/R", 
                  filename=system.file(regressionPath, package="datapack"),
                  targetPath=regressionPath)
    dp <- addMember(dp, doProg)
    progId <- getIdentifier(doProg)
    
    binaryPath <- "./extdata/pkg-example/binary.csv"
    doIn <- new("DataObject", format="text/csv", 
                filename=system.file(binaryPath, package="datapack"), targetPath=binaryPath)
    dp <- addMember(dp, doIn)
    inputs[[length(inputs)+1]] <- doIn
    relations <- getRelationships(dp, quiet=quietOn)

    dp <- describeWorkflow(dp, sources=inputs, program=doProg)
    relations <- getRelationships(dp, quiet=quietOn)
    
    objectRelations = relations[relations$subject == getIdentifier(doIn),'object']
    for(objectRelation in objectRelations) {
        expect_true((objectRelation == 'http://purl.dataone.org/provone/2015/01/15/ontology#Data') | 
                        (objectRelation == binaryPath))
    }
    
    execId <- relations[relations$object == datapack:::provONEexecution,'subject']
    expect_match(relations[relations$predicate == datapack:::provUsed,'object'], getIdentifier(doIn))
    expect_match(relations[relations$predicate == datapack:::provUsed,'subject'], execId)
    expect_match(relations[relations$predicate == datapack:::provHadPlan,'object'], progId) 
    
    # Test removing a DataObject from the package - make sure that all the package relationships that had
    # the DataObject as a subject or object have been removed. This includes any relationship, i.e. cito:*
    # and prov:*.
    dp <- removeMember(dp, doIn, r=T)
    relations <- getRelationships(dp, quiet=quietOn)
    rels <- relations[relations$subject == getIdentifier(doIn), ]
    expect_equal(nrow(rels), 0)
    rels <- relations[relations$object == getIdentifier(doIn), ]
    expect_equal(nrow(rels), 0)
})

test_that("replaceMember works", {
    library(datapack)
    #
    # Now do the same test without the outputs so that 'prov:generqtedBy' are not inserted, but other
    # relationships are.
    dp <- new("DataPackage")
    inputs <- list()
    doProg <- new("DataObject", format="application/R", 
                  filename=system.file("./extdata/pkg-example/logit-regression-example.R", package="datapack"))
    dp <- addMember(dp, doProg)
    progId <- getIdentifier(doProg)
    
    doIn <- new("DataObject", format="text/csv", 
                filename=system.file("./extdata/pkg-example/binary.csv", package="datapack"))
    dp <- addMember(dp, doIn)
    inputs[[length(inputs)+1]] <- doIn
    dp <- describeWorkflow(dp, sources=inputs, program=doProg)
    relations <- getRelationships(dp, quiet=quietOn)
    expect_match(relations[relations$subject == getIdentifier(doIn),'object'], 'Data')
    execId <- relations[relations$object == datapack:::provONEexecution,'subject']
    expect_match(relations[relations$predicate == datapack:::provUsed,'object'], getIdentifier(doIn))
    expect_match(relations[relations$predicate == datapack:::provUsed,'subject'], execId)
    expect_match(relations[relations$predicate == datapack:::provHadPlan,'object'], progId) 
    
    # Test removing a DataObject from the package - make sure that all the package relationships that had
    # the DataObject as a subject or object have been removed. This includes any relationship, i.e. cito:*
    # and prov:*.
    # Test replaceMember by substituting the zipped version of the file. 
    dp <- replaceMember(dp, doIn, replacement=system.file("./extdata/pkg-example/binary.csv.zip", package="datapack"),
                        format="application/octet-stream") 
    formatId <- getValue(dp, name="sysmeta@formatId", identifiers=getIdentifier(doIn))
    expect_match(formatId[[1]], "application/octet-stream")
})


test_that("updateMetadata works", {
    library(datapack)
    library(XML)
    # Create a DataObject and add it to the DataPackage
    dp <- new("DataPackage")
    sampleMeta <- system.file("./extdata/sample-eml.xml", package="datapack")
    metaObj <- new("DataObject", format="eml://ecoinformatics.org/eml-2.1.1", file=sampleMeta)
    metaId <- getIdentifier(metaObj)
    dp <- addMember(dp, metaObj)
    
    doIn <- new("DataObject", format="text/csv", 
                filename=system.file("./extdata/sample-data.csv", package="datapack"))
    doInId <- getIdentifier(doIn)
    dp <- addMember(dp, doIn, metaId)
    
    # In the metadata object, insert the newly assigned data 
    xp <- sprintf("//dataTable/physical/distribution[../objectName/text()=\"%s\"]/online/url", "sample-data.csv") 
    newURL <- sprintf("https://cn.dataone.org/cn/v2/resolve/%s", "abc1234xyz")
    dp <- updateMetadata(dp, metaObj, xpath=xp, replacement=newURL)
    
    # Now retrieve the new value from the metadata using an independent method (not using datapack) and see
    # if the metadata was actually updated.
    metadataDoc <- xmlInternalTreeParse(rawToChar(getData(metaObj)))
    nodeSet = xpathApply(metadataDoc, xp)
    URL <- xmlValue(nodeSet[[1]])
    
    expect_match(newURL, URL) 
    
    # Check that the package relationships have been updated correctly so that the new identifier
    # for the metadata object has replaced the old id.
    relations <- getRelationships(dp, quiet=TRUE)
    expect_equal(nrow(relations), 2)
    expect_match(relations[relations$subject == doInId,'object'], metaId)
    expect_match(relations[relations$subject == doInId,'predicate'], datapack:::citoIsDocumentedBy)
    expect_match(relations[relations$subject == metaId,'object'], doInId)
    expect_match(relations[relations$subject == metaId,'predicate'], datapack:::citoDocuments)
    
})

test_that("DataPackage accessPolicy methods", {
    library(datapack)
    library(digest)
   
    dp <- new("DataPackage")
    data <- charToRaw("1,2,3\n4,5,6\n")
    obj <- new("DataObject", id="id1", dataobj=data, format="text/csv")
    dp <- addMember(dp, obj)
    data2 <- charToRaw("7,8,9\n4,10,11\n")
    obj2 <- new("DataObject", id="id2", dataobj=data2, format="text/csv")
    dp <- addMember(dp, obj2)
    # Add access rule to all package members
    dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write")
    dp <- addAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission")
    
    expect_true(hasAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "write"))
    expect_true(hasAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission"))
    expect_false(hasAccessRule(dp, "uid=edwards,ou=Account,dc=labx,dc=com", "write"))
    
    # Now take 'changePermission' away for user 'uid=smith...'
    dp <- removeAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission")
    expect_false(hasAccessRule(dp, "uid=smith,ou=Account,dc=example,dc=com", "changePermission"))
    
    #' 
    #' # Alternatively, parameter "y" can be a data.frame containing one or more access rules:
    #' # Add write, changePermission for uid=jones,...
    dp <- addAccessRule(dp, "uid=jones,ou=Account,dc=example,dc=com", "write")
    dp <- addAccessRule(dp, "uid=jones,ou=Account,dc=example,dc=com", "changePermission")
    expect_true(hasAccessRule(dp, "uid=jones,ou=Account,dc=example,dc=com", "write"))
    expect_true(hasAccessRule(dp, "uid=jones,ou=Account,dc=example,dc=com", "changePermission"))
    #' # Now take privs for uid=jones,... away
    accessRules <- data.frame(subject=c("uid=jones,ou=Account,dc=example,dc=com", 
                                          "uid=jones,ou=Account,dc=example,dc=com"), 
                                          permission=c("write", "changePermission"))
    dp <- removeAccessRule(dp, accessRules)
    expect_false(hasAccessRule(dp, "uid=jones,ou=Account,dc=example,dc=com", "write"))
    expect_false(hasAccessRule(dp, "uid=jones,ou=Account,dc=example,dc=com", "changePermission"))
    
    # Test that setting public access works
    expect_false(hasAccessRule(dp, "public", "read"))
    
    # Test that custom access rules can be added to sysmeta of a DataObject
    accessRules <- data.frame(subject=c("uid=sanders,ou=Account,dc=example,dc=com", 
                                        "uid=wiggens,o=unaffiliated,dc=example,dc=org"), 
                              permission=c("write", "changePermission"), stringsAsFactors=FALSE)
    dp <- addAccessRule(dp, accessRules)
    expect_true(hasAccessRule(dp, "uid=sanders,ou=Account,dc=example,dc=com", "write"))
    expect_true(hasAccessRule(dp, "uid=wiggens,o=unaffiliated,dc=example,dc=org", "changePermission"))
    
    # Test that an access policy can be cleared, i.e. all access rules removed.
    dp <- clearAccessPolicy(dp)
    expect_false(hasAccessRule(dp, "uid=sanders,ou=Account,dc=example,dc=com", "write"))
    expect_false(hasAccessRule(dp, "uid=wiggens,o=unaffiliated,dc=example,dc=org", "changePermission"))
    
    # Now add public read access to all members
    dp <- setPublicAccess(dp)
    expect_true(hasAccessRule(dp, "public", "read"))
})

test_that("DataPackage member selection, set, get methods", {
    library(datapack)
    library(digest)
    
    dp <- new("DataPackage")
    inputs <- list()
    outputs <- list()
    doProg <- new("DataObject", format="application/R", 
                  filename=system.file("./extdata/pkg-example/logit-regression-example.R", package="datapack"))
    dp <- addMember(dp, doProg)
    progId <- getIdentifier(doProg)
    doIn <- new("DataObject", format="text/csv", 
                filename=system.file("./extdata/pkg-example/binary.csv", package="datapack"))
    dp <- addMember(dp, doIn)
    
    doIn2 <- new("DataObject", format="text/csv", 
                filename=system.file("./extdata/sample-data.csv", package="datapack"))
    dp <- addMember(dp, doIn2) 
    
    doOut <- new("DataObject", format="image/png", 
                 filename=system.file("./extdata/pkg-example/gre-predicted.png", package="datapack"))
    dp <- addMember(dp, doOut)
    
    pid <- selectMember(dp, name="sysmeta@formatId", value="application/R")
    expect_equal(pid, progId)
    
    pids <- selectMember(dp, name="sysmeta@formatId", value="text/csv")
    expect_true(getIdentifier(doIn) %in% pids)
    expect_true(getIdentifier(doIn2) %in% pids)
    expect_equal(length(pids), 2)
    
    dp <- setValue(dp, name="sysmeta@rightsHolder", value="orcid.org/0000-0002-2192-403X", identifiers=getIdentifiers(dp))
    userIds <- selectMember(dp, name="sysmeta@rightsHolder", value="orcid.org/0000-0002-2192-403X")
    expect_equal(length(userIds), 4)
    
    vals <- getValue(dp, name="sysmeta@rightsHolder")
    expect_true(all(vals == "orcid.org/0000-0002-2192-403X"))
})
ropensci/datapack documentation built on June 11, 2022, 11:20 a.m.