arcticdatautils: Utilities for the Arctic Data Center

# Functions related to fixing invalid ISO metadata

# Some functions just test whether a validation issue is present. These are
# prefixed with the text "test". Exactly what they are testing should be
# described in the docstrings.
#
# Other functons fix the bad metadata in place (modifying the original file)
# and these functions are prefixed with "fix_". Exactly what they are fixing
# should be described in the docstrings.
#
# Example usage:
#
# Find and fix documents in 'mydir' that have extra whitespace in their
# topicCategory element(s)
#
# the_files <- dir(mydir)
# bad_enums <- the_files[which(sapply(the_files, test_has_bad_enum))]
# sapply(bad_enums, fix_bad_enums)


test_has_abstract <- function(path) {
  stopifnot(file.exists(path))

  doc <- try({
    XML::xmlParse(path)
  })

  if (inherits(doc, "try-error")) {
    warning(paste0("Document at path ", path, " failed to parse. Returning TRUE anyway."))
    return(TRUE)
  }

  stopifnot("XMLInternalDocument" %in% class(doc))
  # TODO

  TRUE
}


test_has_bad_topic <- function(path) {
  stopifnot(file.exists(path))

  doc <- try({
    XML::xmlParse(path)
  })

  if (inherits(doc, "try-error")) {
    warning(paste0("Document at path ", path, " failed to parse. Returning FALSE anyway."))
    return(FALSE)
  }

  stopifnot("XMLInternalDocument" %in% class(doc))

  # Try to get gmd:topicCategory nodes
  topic_category_nodes <- try({
    XML::getNodeSet(doc, "/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:topicCategory")
  }, silent = TRUE)

  # Return FALSE if executing the xPath failed
  if (inherits(topic_category_nodes, "try-error")) {
    return(FALSE)
  }

  # Return false if there were on gmd:topicCategory nodes
  if (length(topic_category_nodes) == 0) {
    return(FALSE)
  }

  # If we have gmd:topicCategory nodes, scan each one and return TRUE if
  # any of the nodes has more than one child
  for (i in 1:length(topic_category_nodes)) {
    children <- XML::xmlChildren(topic_category_nodes[[1]])

    if (length(children) > 1) {
      return(TRUE)
    }
  }

  # Return FALSE as a fall-through
  FALSE
}


test_has_bad_enum <- function(path) {
  stopifnot(file.exists(path))

  doc <- try({
    XML::xmlParse(path)
  })

  if (inherits(doc, "try-error")) {
    warning(paste0("Document at path ", path, " failed to parse. Returning FALSE anyway."))
    return(FALSE)
  }

  stopifnot("XMLInternalDocument" %in% class(doc))

  # Try to get gmd:topicCategory nodes
  topic_category_nodes <- try({
    XML::getNodeSet(doc, "/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:topicCategory/gmd:MD_TopicCategoryCode")
  }, silent = TRUE)

  # Return FALSE if executing the xPath failed
  if (inherits(topic_category_nodes, "try-error")) {
    return(FALSE)
  }

  # Return false if there were on gmd:topicCategory nodes
  if (length(topic_category_nodes) == 0) {
    return(FALSE)
  }

  # If we have gmd:topicCategory nodes, scan each one and return TRUE if
  # any of the nodes has more than one child
  for (i in 1:length(topic_category_nodes)) {
    children <- XML::xmlChildren(topic_category_nodes[[i]])

    for (i in 1:length(children)) {
      child <- children[[i]]
      child_value <- XML::xmlValue(child)

      if (nchar(child_value) != nchar(stringr::str_trim(child_value))) {
        return(TRUE)
      }
    }
  }

  # Return FALSE as a fall-through
  FALSE
}


#' Fix a metadata record with a bad topicCategory
#'
#' This is the case where the ISO schema says what's inside a
#' gmd:MD_TopicCategoryCode element should match items from a controlled
#' vocabulary. But in the ISO metadata we have, there are newlines and spaces
#' around that text which causes a check for string equality to fail. i.e.
#'
#' 'oceans' != '     oceans     '
#'
#' @param path (character) A file path.
#'
#' @noRd
fix_bad_enum <- function(path) {
  stopifnot(file.exists(path))

  doc <- XML::xmlParse(path)
  stopifnot("XMLInternalDocument" %in% class(doc))

  # Try to get gmd:topicCategory nodes
  topic_category_nodes <- try({
    XML::getNodeSet(doc, "/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:topicCategory/gmd:MD_TopicCategoryCode")
  }, silent = TRUE)

  # Return FALSE if executing the xPath failed
  if (inherits(topic_category_nodes, "try-error")) {
    return(doc)
  }

  for (i in 1:length(topic_category_nodes)) {
    topic_category_node <- topic_category_nodes[[i]]

    try({
      XML::xmlValue(topic_category_node[[1]]) <- stringr::str_trim(XML::xmlValue(topic_category_node[[1]]))
    }, silent = TRUE)
  }

  writeLines(XML::saveXML(doc), con = path)
}


#' Fix a metadata record with multiple MD_TopicCategory children elements
#' inside a single topicCategory element.
#'
#' Example:
#'
#' <gmd:topicCategory>
#'   <gmd:MD_TopicCategoryCode>imageryBaseMapsEarthCover</gmd:MD_TopicCategoryCode>
#'   <gmd:MD_TopicCategoryCode>oceans</gmd:MD_TopicCategoryCode>
#' </gmd:topicCategory>
#'
#' @param path (character) A file path.
#'
#' @noRd
fix_bad_topic <- function(path) {
  stopifnot(file.exists(path))

  doc <- XML::xmlParse(path)
  stopifnot("XMLInternalDocument" %in% class(doc))

  # Get the MD_DataIdentification element so we can search it for topicCategory
  # nodes
  data_ident_nodes <- try({
    XML::getNodeSet(doc, "/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification")
  }, silent = TRUE)

  # Return FALSE if executing the xPath failed
  if (inherits(data_ident_nodes, "try-error")) {
    return(-1)
  }

  for (i in 1:length(data_ident_nodes)) {
    data_ident_node <- data_ident_nodes[[i]]
    children <- XML::xmlChildren(data_ident_node)

    topic_category_indices <- which(names(children) %in% "topicCategory")

    if (length(topic_category_indices) == 0) {
      next
    }

    for (topic_category_index in topic_category_indices) {
      topic_category_node <- children[[topic_category_index]]

      # If it has more than one child, fix it
      # Extract its values
      # Remove it from the document
      # Insert a new topicCategory for each topic
      topic_category_node_children <- XML::xmlChildren(topic_category_node)

      # Skip this one if it doesn't have multiple children
      if (length(topic_category_node_children) <= 1) {
        next
      }

      # Accumulate topics
      topics <- unlist(lapply(topic_category_node_children, function(x) {
        stringr::str_trim(XML::xmlValue(x))
      }))

      # Due to the ISO schema, we have to respect the order of where
      # the topicCategory node goes in the DataIdentification node. To do this
      # we need to keep the origin topicCategory node, modifying it to only
      # have one topic category inside it, then add the other topic categories
      # as siblings (the siblings is imporrtant here).
      first_topic <- topics[1]
      print(first_topic)
      topic_nodes <- XML::xmlChildren(topic_category_node)

      XML::removeChildren(topic_category_node, kids = seq(1, length(topic_nodes)))

      XML::newXMLNode("gmd:MD_TopicCategoryCode",
                      first_topic,
                      parent = topic_category_node)

      for (topic in topics[2:length(topics)]) {
        print(topic)
        XML::newXMLNode("gmd:topicCategory", XML::newXMLNode("gmd:MD_TopicCategoryCode", topic), sibling = topic_category_node)
      }
    }
  }

  writeLines(XML::saveXML(doc), con = path)

  # Workaround
  #
  # For some reason the XML package adds a dummy namespace even when I add
  # a new node as a sibling to a node that has an ancestor with the namesspace
  # This is a REALLY quick and dirty way to fix that line of code and move
  # on

  lines <- readLines(path)
  pattern <- " xmlns:gmd=\"<dummy>\""
  matched_lines <- grep(pattern, lines)

  for (i in matched_lines) {
    lines[i] <- gsub(pattern, "", lines[i])
  }

  writeLines(lines, con = path)
}


#' Use XMLStarlet to pretty-print an XML document
#'
#' This command just runs `xmlstarlet path > path`, doing a simple
#' pretty-printing of the file located at `path`.
#'
#' Note that this function is doing an in-place pretty printing instead of
#' returning the pretty-printed text.
#'
#' Note that this command uses a temporary file as an intermediate step in the
#' pretty-printing process. For some reason, when running xmlstarlet from within
#' R, the same file can't be used as the input to `xmlstarlet format` and as the
#' shell redirection file (`> somefile.txt`). If you try to run `xmlstarlet
#' format` on the same file as you redirect to, you get a weird parse error from
#' xmlstarlet.
#'
#' @param path (character) A file path.
#'
#' @return The result of the `system` command (0 = success).
#'
#' @noRd
pretty_print <- function(path) {
  stopifnot(file.exists(path),
            file.info(path)$size > 0)

  # Detect xmlstarlet
  starlet <- system("which xmlstarlet",
                    ignore.stdout = TRUE)

  if (starlet != 0) {
    stop("Failed to run `xmlstarlet`. Is it in the $PATH?")
  }

  # Use a tempfile because this system command does weird stuff if you
  # try to format and redirect back to the file you're formatting
  tmp_file <- tempfile()
  copy_result <- file.copy(from = path,
                           to = tmp_file)

  if (copy_result != TRUE) {
    stop(paste0("Copy job failed for some reason between ", path, " and ", tmp_file, "."))
  }

  command <- paste0("xmlstarlet format ", tmp_file, " > ", path)

  system_result <- system(command)

  if (system_result != 0) {
    stop(paste0("Result of the command `", command, "` was not zero."))
  }

  system_result
}