R/helpers.R

Defines functions get_coord_list recover_failed_submission read_zip_shapefile create_dummy_package_full create_dummy_enumeratedDomain_dataframe create_dummy_attributes_dataframe create_dummy_parent_package create_dummy_package create_dummy_object create_dummy_metadata

Documented in create_dummy_attributes_dataframe create_dummy_enumeratedDomain_dataframe create_dummy_metadata create_dummy_object create_dummy_package create_dummy_package_full create_dummy_parent_package get_coord_list read_zip_shapefile recover_failed_submission

# Various helper functions for things like testing a package


#' Create a test metadata object
#'
#' Create a test EML metadata object.
#'
#' @param mn (MNode) The Member Node.
#' @param data_pids (character) Optional. PIDs for data objects the metadata documents.
#'
#' @return (character) The PID of the published metadata document.
#'
#' @export
#'
#' @examples
#'\dontrun{
#' # Set environment
#' cn <- CNode("STAGING2")
#' mn <- getMNode(cn,"urn:node:mnTestKNB")
#' pid <- create_dummy_metadata(mn)
#' }
create_dummy_metadata <- function(mn, data_pids = NULL) {

  # Make sure the node is not a production node
  if (mn@env == "prod") {
    stop('Can not create dummy metadata on production node.')
  }

  pid <- paste0("urn:uuid:", uuid::UUIDgenerate())
  me <- get_token_subject()

  # Copy the original EML file to a temporary place
  original_file <- file.path(system.file(package = "arcticdatautils"),
                             "example-eml-220.xml")
  metadata_file <- tempfile()
  file.copy(original_file, metadata_file)

  sysmeta <- new("SystemMetadata",
                 id = pid,
                 formatId = "https://eml.ecoinformatics.org/eml-2.2.0",
                 size = file.size(metadata_file),
                 checksum = digest::digest(metadata_file, algo = "sha256", serialize = FALSE, file = TRUE),
                 checksumAlgorithm = "SHA-256",
                 submitter = me,
                 rightsHolder = me,
                 fileName = "dummy_science_metadata.xml")

  # Temporarily clear out the replication policy to work around NCEI not being
  # Tier 4 MN
  sysmeta <- clear_replication_policy(sysmeta)

  sysmeta <- add_admin_group_access(sysmeta)
  sysmeta <- datapack::addAccessRule(sysmeta, "public", "read")

  message(paste0("Creating metadata ", pid))
  pid <- dataone::createObject(mn, pid, metadata_file, sysmeta)

  # Remove the temporary EML File
  file.remove(metadata_file)

  pid
}


#' Create a test object
#'
#' Create a test data object. Make sure the member node you use is not a production node.
#'
#' @param mn (MNode) The Member Node.
#'
#' @return (character) The PID of the dummy object.
#'
#' @export
#'
#' @examples
#'\dontrun{
#' # Set environment
#' cn <- CNode("STAGING2")
#' mn <- getMNode(cn,"urn:node:mnTestKNB")
#'
#' pid <- create_dummy_object(mn)
#'}
create_dummy_object <- function(mn) {

  # Make sure the node is not a production node
  if (mn@env == "prod") {
    stop('Can not create dummy object on production node.')
  }

  pid <- paste0("urn:uuid:", uuid::UUIDgenerate())
  me <- get_token_subject()
  tmp <- tempfile()

  writeLines(paste0(sample(LETTERS, 26, replace = TRUE), collapse = ""), con = tmp)

  sysmeta <- new("SystemMetadata",
                 id = pid,
                 formatId = "application/octet-stream",
                 size = file.size(tmp),
                 checksum = digest::digest(tmp, algo = "sha256", serialize = FALSE, file = TRUE),
                 checksumAlgorithm = "SHA-256",
                 submitter = me,
                 rightsHolder = me,
                 fileName = "dummy_object")

  # Temporarily clear out the replication policy to work around NCEI not being
  # Tier 4 MN
  sysmeta <- clear_replication_policy(sysmeta)

  sysmeta <- add_admin_group_access(sysmeta)
  sysmeta <- datapack::addAccessRule(sysmeta, "public", "read")

  message(paste0("Creating object ", pid))
  create_response <- dataone::createObject(mn, pid, tmp, sysmeta)

  file.remove(tmp)

  create_response
}


#' Create a test package
#'
#' Create a full test data package with data objects and 1 metadata object. Size = the number of data objects you want in the dummy package + 1 metadata object.
#'
#' @param mn (MNode) The Member Node.
#' @param size (numeric) The number of files in the package, including the metadata file.
#'
#' @return (list) The PIDs for all elements in the data package.
#'
#' @export
#'
#' @examples
#'\dontrun{
#' # Set environment
#' cn <- CNode("STAGING2")
#' mn <- getMNode(cn,"urn:node:mnTestKNB")
#' #Create dummy package with 5 data objects and 1 metadata object
#' pids <- create_dummy_package(mn, 6)
#' }
create_dummy_package <- function(mn, size = 2) {

  # Make sure the node is not a production node
  if (mn@env == "prod") {
    stop('Can not create dummy package on production node.')
  }

  me <- get_token_subject()

  # Data objects
  if (size > 1) {
    data_pids <- sapply(seq_len(size - 1), function(i) {
      create_dummy_object(mn)
    })

    data_pids <- data_pids[!is.na(data_pids)]  # Filter NA pids (failed creates)

  } else {
    data_pids <- NULL
  }

  # Metadata objects
  meta_pid <- create_dummy_metadata(mn, data_pids = data_pids)

  pid <- paste0("urn:uuid:", uuid::UUIDgenerate())
  resmap_path <- generate_resource_map(meta_pid,
                                       data_pids,
                                       resource_map_pid = pid)

  sysmeta <- new("SystemMetadata",
                 identifier = pid,
                 formatId = "http://www.openarchives.org/ore/terms",
                 size = file.size(resmap_path),
                 checksum = digest::digest(resmap_path, algo = "sha256", serialize = FALSE, file = TRUE),
                 checksumAlgorithm = "SHA-256",
                 submitter = me,
                 rightsHolder = me,
                 fileName = "dummy_resource_map.xml")

  # Temporarily clear out the replication policy to work around NCEI not being
  # Tier 4 MN
  sysmeta <- clear_replication_policy(sysmeta)

  sysmeta <- add_admin_group_access(sysmeta)
  sysmeta <- datapack::addAccessRule(sysmeta, "public", "read")

  message(paste0("Creating resource map ", pid))
  resource_map_pid <- dataone::createObject(mn, pid, resmap_path, sysmeta)

  list(metadata = meta_pid,
       resource_map = resource_map_pid,
       data = data_pids)
}


#' Create a test parent package
#'
#' Create a test parent data package. Make sure the node is not a production node.
#'
#' @param mn (MNode) The Member Node.
#' @param children (character) Child package (resource maps) PIDs.
#'
#' @return (list) The resource map PIDs for both the parent and child packages.
#'
#' @export
#'
#' @examples
#'\dontrun{
#' # Set environment
# cn <- CNode("STAGING2")
# mn <- getMNode(cn,"urn:node:mnTestKNB")
#
# child_pid <- "urn:uuid:39a59f99-118b-4c81-9747-4b6c43308e00"
#
# create_dummy_parent_package(mn, child_pid)
#'}
create_dummy_parent_package <- function(mn, children) {

  # Make sure the node is not a production node
  if (mn@env == "prod") {
    stop('Can not create dummy parent package on production node.')
  }

  me <- get_token_subject()
  meta_pid <- create_dummy_metadata(mn)

  pid <- paste0("urn:uuid:", uuid::UUIDgenerate())
  resmap_path <- generate_resource_map(meta_pid,
                                       data_pids = c(),
                                       child_pids = children,
                                       resource_map_pid = pid)

  sysmeta <- new("SystemMetadata",
                 identifier = pid,
                 formatId = "http://www.openarchives.org/ore/terms",
                 size = file.size(resmap_path),
                 checksum = digest::digest(resmap_path, algo = "sha256", serialize = FALSE, file = TRUE),
                 checksumAlgorithm = "SHA-256",
                 submitter = me,
                 rightsHolder = me,
                 fileName = "dummy_resource_map.xml")

  # Temporarily clear out the replication policy to work around NCEI not being
  # Tier 4 MN
  sysmeta <- clear_replication_policy(sysmeta)

  sysmeta <- add_admin_group_access(sysmeta)
  sysmeta <- datapack::addAccessRule(sysmeta, "public", "read")

  message(paste0("Creating parent package map ", pid))
  create_response <- createObject(mn, pid, resmap_path, sysmeta)

  list(parent = create_response,
       children = children)
}


#' Create test attributes data.frame
#'
#' Create a test data.frame of attributes.
#'
#' @param numberAttributes (integer) Number of attributes to be created in the table.
#' @param factors (character) Optional vector of factor names to include.
#'
#' @return (data.frame) A data.frame of attributes.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Create dummy attribute dataframe with 6 attributes and 1 factor
#' attributes <- create_dummy_attributes_dataframe(6, c("Factor1", "Factor2"))
#' }
create_dummy_attributes_dataframe <- function(numberAttributes, factors = NULL) {
  names <- vapply(seq_len(numberAttributes), function(x) { paste0("Attribute ", x)}, "")
  domains <- rep("textDomain", numberAttributes)

  if(!is.null(factors)) {
    domains <- c(rep("textDomain", numberAttributes - length(factors)),
                 rep("enumeratedDomain", length(factors)))
    names[seq((numberAttributes - length(factors) + 1), numberAttributes)] <- factors
  }

  attributes <- data.frame(attributeName = names,
                           attributeDefinition = names,
                           measurementScale = rep("nominal", numberAttributes),
                           domain = domains,
                           formatString = rep(NA, numberAttributes),
                           definition = names,
                           unit = rep(NA, numberAttributes),
                           numberType = rep(NA, numberAttributes),
                           missingValueCode = rep(NA, numberAttributes),
                           missingValueCodeExplanation = rep(NA, numberAttributes),
                           stringsAsFactors = FALSE)

  attributes
}


#' Create test enumeratedDomain data.frame
#'
#' Create a test data.frame of enumeratedDomains.
#'
#' @param factors (character) Vector of factor names to include.
#'
#' @return (data.frame) A data.frame of factors.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Create dummy dataframe of 2 factors/enumerated domains
#' attributes <- create_dummy_enumeratedDomain_dataframe(c("Factor1", "Factor2"))
#' }
create_dummy_enumeratedDomain_dataframe <- function(factors) {
  names <- rep(factors, 4)
  enumeratedDomains <- data.frame(attributeName = names,
                                  code = paste0(names, seq_along(names)),
                                  definition = names)

  enumeratedDomains
}


#' Create dummy package with fuller metadata
#'
#' Creates a more complete package than [create_dummy_package()]
#' but is otherwise based on the same concept. This dummy
#' package includes multiple data objects, responsible parties,
#' geographic locations, method steps, etc.
#'
#' @param mn (MNode) The Member Node.
#' @param title (character) Optional. Title of package. Defaults to "A Dummy Package".
#'
#' @return (list) The PIDs for all elements in the data package.
#'
#' @import EML
#' @import dataone
#'
#' @export
create_dummy_package_full <- function(mn, title = "A Dummy Package") {
  stopifnot(is(mn, "MNode"))
  stopifnot(is.character(title), nchar(title) > 0)
  if (mn@env == "prod") {
    stop("Cannot create dummy package on production node.")
  }

  # Create objects
  file.create(c("dummy1.csv", "dummy2.csv", "dummy1.jpg", "dummy1.R"))
  # TODO: add actual data to dummy files

  pid_csv1 <- publish_object(mn,
                             path = "dummy1.csv",
                             format_id = "text/csv")

  pid_csv2 <- publish_object(mn,
                             path = "dummy2.csv",
                             format_id = "text/csv")

  pid_jpg1 <- publish_object(mn,
                             path = "dummy1.jpg",
                             format_id = "image/jpeg")

  pid_R1 <- publish_object(mn,
                           path = "dummy1.R",
                           format_id = "application/R")

  unlink(c("dummy1.csv", "dummy2.csv", "dummy1.jpg", "dummy1.R"))

  data_pids <- c(pid_csv1, pid_csv2, pid_jpg1, pid_R1)

  # Import EML
  eml_path_original <- file.path(system.file(package = "arcticdatautils"), "example-eml-full.xml")
  doc <- EML::read_eml(eml_path_original)

  # Add objects to EML
  doc$dataset$title <- title

  attr <- data.frame(
    attributeName = c("Date", "Location", "Salinity", "Temperature"),
    attributeDefinition = c("Date sample was taken on", "Location code representing location where sample was taken", "Salinity of sample in PSU", "Temperature of sample"),
    measurementScale = c("dateTime", "nominal","ratio", "interval"),
    domain = c("dateTimeDomain", "enumeratedDomain","numericDomain", "numericDomain"),
    formatString = c("MM-DD-YYYY", NA, NA, NA),
    definition = c(NA,NA, NA, NA),
    unit = c(NA, NA, "dimensionless", "celsius"),
    numberType = c(NA, NA, "real", "real"),
    missingValueCode = c(NA, NA, NA, NA),
    missingValueCodeExplanation = c(NA, NA, NA, NA),
    stringsAsFactors = FALSE)

  location <- c(CASC = "Cascade Lake", CHIK = "Chikumunik Lake", HEAR = "Heart Lake", NISH = "Nishlik Lake")
  fact <- data.frame(attributeName = "Location", code = names(location), definition = unname(location))

  attributeList <- EML::set_attributes(attributes = attr, factors = fact)

  dT1 <- pid_to_eml_entity(mn,
                           pid = pid_csv1,
                           entity_type = "dataTable")
  dT1$attributeList <- attributeList

  dT2 <- pid_to_eml_entity(mn,
                           pid = pid_csv2,
                           entity_type = "dataTable")
  dT2$attributeList <- attributeList

  doc$dataset$dataTable <- list(dT1, dT2)

  oE1 <- pid_to_eml_entity(mn,
                           pid = pid_jpg1,
                           entity_type = "otherEntity")

  oE2 <- pid_to_eml_entity(mn,
                           pid = pid_R1,
                           entity_type = "otherEntity")

  doc$dataset$otherEntity <- list(oE1, oE2)

  eml_path <- tempfile(fileext = ".xml")
  EML::write_eml(doc, eml_path)

  pid_eml <- publish_object(mn,
                            path = eml_path,
                            format_id = "https://eml.ecoinformatics.org/eml-2.2.0")

  # Create resource map
  resource_map_pid <- create_resource_map(mn,
                                          metadata_pid = pid_eml,
                                          data_pids = data_pids)

  file.remove(eml_path)

  return(list(resource_map = resource_map_pid,
              metadata = pid_eml,
              data = data_pids))
}

#' Read a shapefile from a pid
#'
#' Read a shapefile 'sf' from a pid that points to the zipped directory of the shapefile and associated files
#' on a given member node.
#'
#' @param mn (MNode) A DataOne Member Node
#' @param pid (character) An object identifier
#'
#' @return shapefile (sf) The shapefile as an `sf` object
#'
#' @export
#'
#' @author Jeanette Clark jclark@@nceas.ucsb.edu
#'
#' @examples
#' \dontrun{
#' cn <- dataone::CNode('PROD')
#' adc <- dataone::getMNode(cn,'urn:node:ARCTIC')
#' pid <- "urn:uuid:294a365f-c0d1-4cc3-a508-2e16260aa70c"
#'
#' shapefile <- read_zip_shapefile(adc, pid)
#' }
read_zip_shapefile <- function(mn, pid){

  stopifnot(methods::is(mn, 'MNode'))
  stopifnot(is.character(pid))

  if (!requireNamespace("sf")) {
    stop(call. = FALSE,
         "The package 'sf' must be installed to run this function. ",
         "Please install it and try again.")
  }

  temp <- tempfile()
  writeBin(dataone::getObject(mn, pid), temp)
  zip_contents <- utils::unzip(temp, exdir = tempfile())
  zip_contents_f <- grep("__MACOSX", zip_contents, value = TRUE, invert = TRUE)

  if (length(grep("shp", tools::file_ext(zip_contents_f))) !=
      1) {
    stop("Zipped directory must contain one and only one .shp file")
  }
  shapefile <- sf::st_read(zip_contents_f[grep("shp", tools::file_ext(zip_contents_f))],
                           quiet = T, stringsAsFactors = F)
  unlink(temp)

  return(shapefile)
}


#' Recovers failed submissions
#'
#' Recovers failed submissions and writes the new, valid EML to a given path
#'
#' @param node (MNode) The Member Node to publish the object to.
#' @param pid The PID of the EML metadata document to be recovered.
#' @param path path to write XML.
#'
#' @return recovers and write the valid EML to the indicated path
#'
#' @export
#'
#' @author Rachel Sun rachelsun@ucsb.edu
#'
#' @examples
#' \dontrun{
#' # Set environment
#' cn <- dataone::CNode("STAGING2")
#' mn <- dataone::getMNode(cn,"urn:node:mnTestKNB")
#' pid <- "urn:uuid:b1a234f0-eed5-4f58-b8d5-6334ce07c010"
#' path <- tempfile("file", fileext = ".xml")
#' recover_failed_submission(mn, pid, path)
#' eml <- EML::read_eml(path)
#'}



recover_failed_submission <- function(node, pid, path){
  stopifnot(is(node, "MNode"))
  stopifnot(is.character(pid), nchar(pid) > 0, arcticdatautils::object_exists(node, pid))

  convert_to_text <- dataone::getObject(node, pid) %>%
    rawToChar()
  remove_error_tag <- paste0(convert_to_text, collapse = "") %>%
    stringr::str_remove(".*</error>`") %>%
    stringr::str_remove("EML draft.*`") %>%
    stringr::str_remove_all("&nbsp;") %>%
    stringr::str_trim()

  doc <- EML::read_eml(remove_error_tag)
  EML::eml_validate(doc)
  EML::write_eml(doc, path)
}

#' Get list of Coordinate Reference Systems
#'
#' Get a data.frame of EML coordinate reference systems that can
#' be searched and filtered more easily than the raw XML file.
#'
#' @export
#'
get_coord_list <- function(){
  geo_list <- read_eml("https://raw.githubusercontent.com/NCEAS/eml/4417cbf6588fdca4e06bd67190a9d7a18a8e944f/eml-spatialReferenceDictionary.xml")

  coord_df <- data.frame(horizCoordSysDef = rep(NA, length(geo_list$horizCoordSysDef)),
                         geogCoordSys = rep(NA, length(geo_list$horizCoordSysDef)),
                         projection = rep(NA, length(geo_list$horizCoordSysDef)),
                         datum = rep(NA, length(geo_list$horizCoordSysDef)),
                         proj_unit = rep(NA, length(geo_list$horizCoordSysDef)))

  for (i in 1:length(geo_list$horizCoordSysDef)){
    coord_df$horizCoordSysDef[i] <- geo_list$horizCoordSysDef[[i]]$name

    if (!is.null(geo_list$horizCoordSysDef[[i]]$projCoordSys)){
      coord_df$geogCoordSys[i]  <- geo_list$horizCoordSysDef[[i]]$projCoordSys$geogCoordSys$name
      coord_df$datum[i]  <- geo_list$horizCoordSysDef[[i]]$projCoordSys$geogCoordSys$datum$name
      coord_df$projection[i] <- geo_list$horizCoordSysDef[[i]]$projCoordSys$projection$name
      coord_df$proj_unit[i] <- geo_list$horizCoordSysDef[[i]]$projCoordSys$projection$unit$name
    } else {
      coord_df$geogCoordSys[i]  <- geo_list$horizCoordSysDef[[i]]$geogCoordSys$name
      coord_df$datum[i]  <- geo_list$horizCoordSysDef[[i]]$geogCoordSys$datum$name
      coord_df$projection[i] <- NA
      coord_df$proj_unit[i] <- NA
    }
  }
  return(coord_df)
}
NCEAS/arcticdatautils documentation built on Aug. 28, 2023, 12:10 p.m.