R/XMLClasses.R

Defines functions xmlAncestors getXMLPath xpathApply.XMLInternalNode getRootNode xmlDoc xpathApply.XMLInternalDocument xpathApply.XMLNode xpathApply.XMLNode xpathSApply getNodeSet getDefaultNamespace print.XMLProcessingInstruction print.XMLCDataNode print.XMLEntityRef print.XMLTextNode print.XMLCommentNode xmlSize.XMLDocument length.XMLNode names.XMLNode xmlAttrs xmlChildren oldClass setOldClass setS3Method

Documented in getDefaultNamespace getNodeSet length.XMLNode names.XMLNode print.XMLCDataNode print.XMLEntityRef print.XMLProcessingInstruction print.XMLTextNode xmlAncestors xmlAttrs xmlChildren xmlDoc xmlSize.XMLDocument xpathSApply

#
# This file contains the definitions of methods
# for operating on the XMLNode objects to make
# the more user-friendly.  Specifically, these
# methods are
#       print   displays the contents of a node and children
#               as XML text rather than R/S list
#
#       size    returns the number of children
#
#       name    retrieves the tag name
#
#       attrs   retrieves the attributes element of the XML node
#
#    [ and [[   access the children
#                 (To get at the regular R/S fields in the object, use $
#                    e.g.  node$name, node$attributes, node$value)

#
# In S4/Splus5, we should use the new class mechanism.
#

setS3Method =
function(fun, class) {
  if(!useS4)
    return()

  cat("setting method for", fun, class, "\n")
  setMethod(fun, class, get(paste(fun, class, sep = ".")), where = topenv(parent.frame()))
}

if(FALSE)
setOldClass =
function(classes)
{
   ancestors = unique(sapply(classes[-1], oldClass))
   if(length(ancestors)) {
       classes = c(classes[1], ancestors)
       oldClassTable[[ classes[1] ]] <<- ancestors
   }
   methods::setOldClass(classes)
}

# For R 2.7.2 and older.  In 2.8.0, extends() for setOldClass() works
# better.

oldClassTable =  list(
  "XMLNode" =  c("RXMLAbstractNode", "XMLAbstractNode"),
  "XMLTextNode" = c("XMLNode", "RXMLAbstractNode", "XMLAbstractNode"),
  "XMLPINode" = c( "XMLNode", "RXMLAbstractNode", "XMLAbstractNode") ,
  "XMLProcessingInstruction" = c( "XMLNode", "RXMLAbstractNode", "XMLAbstractNode") ,
  "XMLCommentNode" = c("XMLNode", "XMLTextNode", "RXMLAbstractNode", "XMLAbstractNode"),
  "XMLCDataNode" = c("XMLNode", "RXMLAbstractNode", "XMLAbstractNode"),
  "XMLHashTree" = c("XMLAbstractDocument"),
  "XMLHashTreeNode" = c("RXMLAbstractNode"),
  "XMLDocumentContent" = c(),
  "XMLDocument" = c("XMLAbstractDocument"),
  "XMLHashTree" = c("XMLAbstractDocument"),
  "XMLInternalDocument" = c("XMLAbstractDocument"),
  "HTMLInternalDocument" = c("XMLInternalDocument", "XMLAbstractDocument"),
  "XMLTreeNode" = c("RXMLAbstractNode")
)



oldClass =
function(class)
{
  if(version$major == "2" && as.integer(version$minor) >= 8)
     return(unique(c(class, extends(class))))

   c(class, oldClassTable[[ class ]])
}

###############################
# These were in xmlNodes, but need to be defined earlier.

setOldClass("XMLAbstractDocument")
setOldClass(c("XMLInternalDocument", "XMLAbstractDocument"))
setOldClass(c("XMLHashTree", "XMLAbstractDocument"))
setOldClass(c("XMLDocument", "XMLAbstractDocument"))

#XXXsetOldClass(c("HTMLInternalDocument", "XMLInternalDocument")) # , "XMLAbstractDocument"))
setOldClass(c("HTMLInternalDocument", "XMLInternalDocument", "XMLAbstractDocument"))

setOldClass("XMLAbstractNode")
setOldClass(c("RXMLAbstractNode", "XMLAbstractNode"))


# Why do we have to repeat this class inheritance information?
# We don't!
# setOldClass(c("XMLHashTreeNode", "RXMLAbstractNode", "XMLAbstractNode"))
# setOldClass(c("XMLNode", "RXMLAbstractNode", "XMLAbstractNode"))
# setOldClass(c("XMLTextNode", "XMLNode", "RXMLAbstractNode", "XMLAbstractNode"))
# setOldClass(c("XMLPINode", "XMLNode", "RXMLAbstractNode", "XMLAbstractNode"))
# setOldClass(c("XMLCommentNode", "XMLNode", "XMLTextNode", "RXMLAbstractNode", "XMLAbstractNode"))
# setOldClass(c("XMLProcessingInstruction", "XMLNode", "RXMLAbstractNode", "XMLAbstractNode"))
# setOldClass(c("XMLCDataNode", "XMLNode", "RXMLAbstractNode", "XMLAbstractNode"))


setOldClass(c("XMLHashTreeNode", "RXMLAbstractNode"))
setOldClass(c("XMLNode", "RXMLAbstractNode"))
###setOldClass(c("XMLTextNode", "XMLNode"))
methods::setOldClass(c("XMLTextNode", "XMLTextNode", "XMLNode", "RXMLAbstractNode", "XMLAbstractNode" ))
#setOldClass(c("XMLEntitiesEscapedTextNode", "XMLTextNode", "XMLNode", "RXMLAbstractNode", "XMLAbstractNode"))
#setOldClass(c("XMLEntitiesEscapedTextNode", "XMLTextNode"))
setOldClass(c("XMLPINode", "XMLNode"))
setOldClass(c("XMLCommentNode", "XMLNode"))
setOldClass(c("XMLProcessingInstruction", "XMLNode"))
setOldClass(c("XMLCDataNode", "XMLNode"))


setOldClass(c("XMLTreeNode", "XMLNode", "RXMLAbstractNode", "XMLAbstractNode" ))




setOldClass(c("XMLInternalNode", "XMLAbstractNode"))

setOldClass(c("XMLInternalCDataNode", "XMLInternalNode"))
setOldClass(c("XMLInternalPINode", "XMLInternalNode"))
setOldClass(c("XMLInternalCommentNode", "XMLInternalNode"))

setOldClass(c("XMLInternalElementNode", "XMLInternalNode"))
setOldClass(c("XMLInternalTextNode", "XMLInternalNode"))

setOldClass(c("XMLXIncludeStartNode", "XMLInternalNode"))
setOldClass(c("XMLXIncludeEndNode", "XMLInternalNode"))
setOldClass(c("XMLEntityDeclNode", "XMLInternalNode"))
setOldClass(c("XMLAttributeDeclNode", "XMLInternalNode"))
setOldClass(c("XMLDocumentNode", "XMLInternalNode"))
setOldClass(c("XMLDocumentTypeNode", "XMLInternalNode"))
setOldClass(c("XMLDocumentFragNode", "XMLInternalNode"))
setOldClass(c("XMLNamespaceDeclNode", "XMLInternalNode"))

setOldClass(c("XMLAttributeNode", "XMLInternalNode"))


setOldClass(c("XMLDTDNode", "XMLInternalNode"))

setOldClass("XMLNamespace")
setOldClass("XMLNamespaceDefinition")
setOldClass("XMLNamespaceDefinitions")

#setOldClass("XMLInternalDOM")
setOldClass(c("SimplifiedXMLNamespaceDefinitions", "XMLNamespaceDefinitions"))



#setClass("XPathNodeSet", representation(ref = "externalptr"))


setAs("XMLDocument", "XMLInternalDocument",
       function(from) {
         xmlParse(saveXML(from$doc$children$doc))
       })


###################


############

#setMethod("[[", c("XMLInternalElementNode", "numeric") ,
"[[.XMLInternalElementNode" =
function(x, i, j, ..., exact = NA, namespaces = xmlNamespaceDefinitions(x, simplify = TRUE), addFinalizer = NA)
{
   if(is(i, "numeric"))
     .Call("R_getNodeChildByIndex", x, as.integer(i), addFinalizer, PACKAGE = "XML")
   else
       NextMethod()
}


xmlChildren <-
function(x, addNames = TRUE, ...)
{
 UseMethod("xmlChildren")
}

setGeneric("xmlParent",
            function(x, ...)
                standardGeneric("xmlParent"))




xmlChildren.XMLNode <-
#
# Retrieve the list of children (sub-nodes) within
# an XMLNode object.
#
function(x, addNames = TRUE, ...)
{
  structure(if(length(x$children)) x$children else list(), class = "XMLNodeList")
}


if(useS4) {
setGeneric("xmlChildren", function(x, addNames = TRUE, ...) standardGeneric("xmlChildren"))
setMethod("xmlChildren", "XMLNode", xmlChildren.XMLNode)
}

if(useS4) {
setGeneric("xmlName", function(node, full = FALSE) standardGeneric("xmlName"))
setMethod("xmlName", "XMLCommentNode", function(node, full = FALSE) "comment")

setMethod("xmlName", "XMLNode",
function(node, full = FALSE)
{
  ns  = unclass(node)$namespace
  if(!full || is.null(ns) || ns == "")
    return(node$name)

  #
  if(!is.character(ns)) {
    tmp = ns$id
  } else if(inherits(ns, "XMLNamespace"))
    tmp = names(ns)
  else
    tmp = ns

  if(length(tmp))
     paste(tmp, unclass(node)$name, sep=":")
  else
     unclass(node)$name
})
} else {

xmlName <-
function(node, full = FALSE)
{
  UseMethod("xmlName", node)
}

xmlName.XMLComment <-
function(node, full = FALSE) {
 return("comment")
}

xmlName.XMLNode <-
#
# Get the XML tag name of an XMLNode object
#
function(node, full = FALSE)
{
  ns  = unclass(node)$namespace
  if(!full || is.null(ns) || ns == "")
    return(node$name)

  #
  if(!is.character(ns)) {
    tmp = ns$id
  } else if(inherits(ns, "XMLNamespace"))
    tmp = names(ns)
  else
    tmp = ns

  if(length(tmp))
     paste(tmp, unclass(node)$name, sep=":")
  else
     unclass(node)$name
}

}

xmlAttrs <-
function(node, ...)
{
  UseMethod("xmlAttrs", node)
}


xmlAttrs.XMLNode <-
#
# Get the named list of attributes
# for an XMLNode object.
#
function(node, ...)
{
 node$attributes
}

if(useS4)
setMethod("xmlAttrs", "XMLNode", xmlAttrs.XMLNode)


"[.XMLNode" <-
#
# Extract the  children (sub-nodes) within
# the specified object identified by ...
# and return these as a list
#
function(x, ..., all = FALSE)
{
 obj <- xmlChildren(x) # x$children

 if(all) # "all" %in% names(list(...)) && list(...)[["all"]] == TRUE)
   structure(obj[ names(obj) %in% list(...)[[1]] ], class = "XMLNodeList")
 else
   structure(obj[...], class = "XMLNodeList") # NextMethod("[")
}


"[[.XMLDocumentContent" <-
function(x, ...)
{
  x$children[[...]]
}

"[[.XMLNode" <-
#
# Extract the  children (sub-nodes) within
# the specified object identified by ...
#
function(x, ...)
{
 xmlChildren(x)[[...]]
}

names.XMLNode <-
function(x)
{
# names(xmlChildren(x))
 xmlSApply(x, xmlName)
}

"names<-.XMLNode" <-
function(x, value)
{
 names(x$children) <- value
 x
}



length.XMLNode <-
function(x)
{
  xmlSize(x)
}

xmlSize <-
#
# The number of elements within (or length of) a collection
#
function(obj)
{
 UseMethod("xmlSize", obj)
}

xmlSize.XMLDocument <-
function(obj)
{
 return(length(obj$doc$children))
}

xmlSize.default <-
#
# The number of elements within (or length of) a collection
#
function(obj)
{
  length(obj)
}

xmlSize.XMLNode <-
#
# Determine the number of children (or sub-nodes) within an XML node.
#
function(obj)
{
  length(obj$children)
}


print.XMLComment <- print.XMLCommentNode <-
function(x, ..., indent = "", tagSeparator = "\n")
{
  if(is.logical(indent) && !indent)
    indent <- ""

  cat(indent, "<!--", xmlValue(x), "-->", tagSeparator, sep="")
}

print.XMLTextNode <-
function(x, ..., indent = "", tagSeparator = "\n")
{
  if(is.logical(indent) && !indent)
    indent <- ""

  if(inherits(x, "EntitiesEscaped"))
    txt = xmlValue(x)
  else
    txt = insertEntities( xmlValue(x) )

  cat(indent, txt, tagSeparator, sep="")
}

setAs("XMLNamespaceDefinitions", "character",
       function(from) {

         if(length(from) == 0)
            return(character())

         ans = structure(sapply(from, function(x) x$uri), names = sapply(from, function(x) x$id))
         if(length(names(ans)) == 0)
           names(ans) = ""
         ans
       })

setAs("character", "XMLNamespaceDefinitions",
       function(from) {

         ids = names(from)
         if(is.null(ids))
           ids = rep("", length(from))

         structure(lapply(seq(along = from),
                           function(i)
                              structure(list(id = ids[[i]], uri = from[i], local = TRUE), class = "XMLNamespace")),
                   class = "XMLNamespaceDefinitions",
                   names = ids)
       })

print.XMLNode <-
#
# displays a node and attributes (and its children)
# in its XML format.
#
function(x, ..., indent = "", tagSeparator = "\n")
{
 if(length(xmlAttrs(x))) {
   tmp <- paste(names(xmlAttrs(x)),paste("\"", insertEntities(xmlAttrs(x)), "\"", sep=""), sep="=", collapse=" ")
 } else
   tmp <- ""

 if(length(x$namespaceDefinitions) > 0) {
    k = as(x$namespaceDefinitions, "character")
    ns = paste("xmlns", ifelse(nchar(names(k)), ":", ""), names(k), "=", ddQuote(k), sep = "", collapse = " ")
#   ns <- paste(sapply(x$namespaceDefinitions,
#                       function(x) {
#                            paste("xmlns", if(nchar(x$id) > 0) ":" else "", x$id, "=", "\"", x$uri, "\"", sep="")
#                       }), collapse=" ")

 } else
   ns <- ""


   # Add one space to the indentation level for the children.
   # This will accumulate across successive levels of recursion.
  subIndent <- paste(indent, " ", sep="")
  if(is.logical(indent) && !indent) {
    indent <- ""
    subIndent <- FALSE
  }


    if (length(xmlChildren(x)) == 0) {
      ## Empty Node - so difference is <nodename />
      cat(indent, paste("<", xmlName(x, TRUE), ifelse(tmp != "",
          " ", ""), tmp, ifelse(ns != "", " ", ""), ns, "/>", tagSeparator,
          sep = ""), sep = "")
    } else if (length(xmlChildren(x))==1 &&
               inherits(xmlChildren(x)[[1]],"XMLTextNode")) {
      ## Sole child is text node, print without extra white space.
      cat(indent, paste("<", xmlName(x, TRUE), ifelse(tmp != "",
          " ", ""), tmp, ifelse(ns != "", " ", ""), ns, ">",
          sep = ""), sep = "")
      kid = xmlChildren(x)[[1]]
      if(inherits(kid, "EntitiesEscaped"))
        txt = xmlValue(kid)
      else
        txt = insertEntities( xmlValue(kid) )

      cat(txt,sep="")
      cat(paste("</", xmlName(x, TRUE), ">", tagSeparator,
          sep = ""), sep = "")
    } else {
      cat(indent, paste("<", xmlName(x, TRUE), ifelse(tmp != "",
          " ", ""), tmp, ifelse(ns != "", " ", ""), ns, ">", tagSeparator,
          sep = ""), sep = "")
      for (i in xmlChildren(x))
        print(i, indent = subIndent, tagSeparator = tagSeparator)
      cat(indent, paste("</", xmlName(x, TRUE), ">", tagSeparator,
          sep = ""), sep = "")
    }
}

print.XMLEntityRef <-
function(x, ..., indent="", tagSeparator = "\n")
{
  if(is.logical(indent) && !indent)
    indent <- ""

  cat(indent, x$value)
}



print.XMLCDataNode <-
function(x, ..., indent="", tagSeparator = "\n")
{
  if(is.logical(indent) && !indent)
    indent <- ""

 cat(indent, "<![CDATA[", tagSeparator, sep = "")
   # Want new lines in value to be replaced by paste("\n", indent, sep="")
 cat(indent, x$value, sep = "")
 cat(indent, "]]>", tagSeparator, sep = "")
}


print.XMLProcessingInstruction <-
function(x, ..., indent="", tagSeparator = "\n")
{
  if(is.logical(indent) && !indent)
    indent <- ""

 cat(indent, paste("<?", x$name," ", x$value, "?>", tagSeparator, sep=""), sep = "")
}


xmlElementsByTagName <-
#
# Extract all the sub-nodes within an XML node
# with the tag name `name'.
#
function(el, name, recursive = FALSE)
{
    kids = xmlChildren(el)
    idx  =  (names(kids) == name)
    els  = kids[idx]
#    idx <-  (names(el$children) == name)
#    els = el$children[idx]

    if(!recursive  || xmlSize(el) == 0)
      return(els)

    subs = xmlApply(el, xmlElementsByTagName, name, TRUE)
    subs = unlist(subs, recursive = FALSE)

    append.xmlNode(els, subs[!sapply(subs, is.null)])
  }


getDefaultNamespace =
function(doc, ns = xmlNamespaceDefinitions(doc, simplify = simplify), simplify = FALSE)
{
  if(length(ns) == 0)
      return(character())
  i = which(names(ns) == "")
  if(length(i))
     ns[i]
  else
     character()

#  val = unlist(sapply(ns, function(x) if(x$id == "") x$uri))
#  if(length(val))
#     val[1]
#  else
#     character()
}

matchNamespaces =
  # d = xmlTreeParse("data/namespaces.xml", useInternal = TRUE)
  #   "omg"
  #   c("ns", "omegahat", "r")
  #   c("ns", omegahat = "http://www.omegahat.net", "r")
  #   c("ns" = "http://www.omegahat.net", "omg" = "http://www.omegahat.net/XML", "r")
  #
  #  Error because z and rs are not defined in the document.
  #  matchNamespaces(d, c("omg", "z", "rs"))
  #
  #
function(doc, namespaces,
          nsDefs = xmlNamespaceDefinitions(doc, recursive = TRUE, simplify = FALSE),
          defaultNs = getDefaultNamespace(doc, simplify = TRUE)
        )
{

    # 3 cases:
    #  i) we are given a single string (e.g. "r") which we use as a prefix for the default namespace
    # ii) we are given a vector of namespaces, but one has no name and we want to use that as the
    #  prefix for the default namespace
    #    e.g.  sum(names(namespaces) == "") == 1)
    # iii) given several elements with no name and we match these to those in the document
    #    if the first doesn't have a match, we use it as the default one.
    # iv) mixture of prefix = uri values and strings with no names.
    #

    # if it is a single "prefix" and we have a default namespace, then map the prefix to the default URI
    # and return.
  if(is.character(namespaces) && length(namespaces) == 1 &&
        is.null(names(namespaces)) && length(defaultNs) > 0) {
       tmp = defaultNs
       names(tmp)[names(tmp) == ""] = namespaces
         # make certain to convert to c(id = url) form from an XMLNamespaceDefinition
       tmp = as(tmp[[1]], "character")
         # if no name, so default namespace, then use the one in namespaces.
       if(length(names(tmp)) == 0 || names(tmp) == "")
         names(tmp) = namespaces
       return(tmp)
   }

    # fix the names so that we have empty ones if we have none at all.
  if(is.null(names(namespaces)))
    names(namespaces) = rep("", length(namespaces))

   # which need to be fixed up.
  i = (names(namespaces) == "")

  if(any(i)) {

     # from parameters now: nsDefs = xmlNamespaceDefinitions(xmlRoot(doc), recursive = TRUE)

      # deal with the first one as a special case. If this has no match,
      # we will map it to the default namespace's URI.
    if(i[1] && length(defaultNs) && is.na(match(namespaces[1], names(nsDefs)))) {
      names(namespaces)[1] = namespaces[1]
      namespaces[1] = defaultNs
      msg = paste("using", names(namespaces)[1], "as prefix for default namespace", defaultNs)
      e = simpleWarning(msg)
      class(e) = c("XPathDefaultNamespace", class(e))
      warning(e)
      i[1] = FALSE
    }

    if(sum(i) > 0) {
           # So there is at least one namespace without a name.

          # See if there are duplicates
        dups = names(nsDefs)[duplicated(names(nsDefs))]
        tmp = match(namespaces[i], dups)
        if(length(dups) > 0 && any(is.na(tmp)))
          stop("duplicate namespaces, so cannot match namespace prefix(es) ",
                paste(namespaces[i][is.na(tmp)], collapse = ", "),
                " in ", paste(unique(names(nsDefs)), collapse= ", "))

        idx = match(namespaces[i], names(nsDefs))
        if(any(is.na(idx)))
           stop("cannot find defined namespace(s) with prefix(es) ", paste(namespaces[i][is.na(idx)], collapse = ", "))
        names(namespaces)[i] = namespaces[i]
        namespaces[i] = sapply(nsDefs[idx], function(x) x$uri)

      #  warning("namespaces without a name/prefix are not handled as you might expect in XPath. Use a prefix")
    } else if(length(defaultNs) == 0)
        stop("There is no default namespace on the target XML document")
  }

  if(!is.character(namespaces) || ( length(namespaces) > 1 && length(names(namespaces)) == 0))
     stop("Namespaces must be a named character vector")

  if(length(namespaces) && (length(names(namespaces)) == 0 || any(names(namespaces) == "")))
     warning("namespaces without a name/prefix are not handled as you might expect in XPath. Use a prefix")

  namespaces
}



getNodeSet =
function(doc, path, namespaces = xmlNamespaceDefinitions(doc, simplify = TRUE), fun = NULL, sessionEncoding = CE_NATIVE,
          addFinalizer = NA,  ...)
{
  xpathApply(doc, path, fun, ...,  namespaces = namespaces, sessionEncoding = sessionEncoding, addFinalizer = addFinalizer)
}


xpathSApply =
function(doc, path, fun = NULL, ... , namespaces = xmlNamespaceDefinitions(doc, simplify = TRUE),
          resolveNamespaces = TRUE, simplify = TRUE, addFinalizer = NA)
{
  answer = xpathApply(doc, path, fun, ..., namespaces = namespaces, resolveNamespaces = resolveNamespaces,
                         addFinalizer = addFinalizer)

    # Taken from sapply
    if (simplify && length(answer) && length(common.len <- unique(unlist(lapply(answer,
        length)))) == 1) {
        if (common.len == 1)
            unlist(answer, recursive = FALSE)
        else if (common.len > 1)
            array(unlist(answer, recursive = FALSE), dim = c(common.len,
                length(answer)), dimnames = if (!(is.null(n1 <- names(answer[[1]])) &
                is.null(n2 <- names(answer))))
                list(n1, n2))
        else answer
    }
    else answer
}

xpathApply =
  #
  # the caller can give the same prefixes of the namespaces defined in
  # the target document as simple names.
  #
  #   xpathApply(d, "/o:a//c:c", fun = NULL, namespaces = c("o", "c"))
  #
  #
function(doc, path, fun = NULL, ... , namespaces = xmlNamespaceDefinitions(doc, simplify = TRUE),
          resolveNamespaces = TRUE, addFinalizer = NA, xpathFuns = list())
{
  UseMethod("xpathApply")
}


xpathApply.XMLNode =
function(doc, path, fun = NULL, ... , namespaces = xmlNamespaceDefinitions(doc, simplify = TRUE),
          resolveNamespaces = TRUE, addFinalizer = NA, xpathFuns = list())
{
   stop("xpathApply/xpathSApply/getNodeSet require an XML/HTML internal document or node. Use xmlParse() or htmlParse()")
}



toXMLNode =
  #
  # For taking an internal node and converting it to an R-level node
  #
function(x, ...)
{
  txt = saveXML(x)
  xmlRoot(xmlTreeParse(txt, asText = TRUE))
}

xpathApply.XMLNode =
function(doc, path, fun = NULL, ... , namespaces = xmlNamespaceDefinitions(doc, simplify = TRUE),
          resolveNamespaces = TRUE, addFinalizer = NA, xpathFuns = list(), .node = NULL, noMatchOkay = FALSE)
{
  idoc = xmlParse(saveXML(doc), asText = TRUE)
  ans = xpathApply(idoc, path, fun, ..., namespaces = namespaces, resolveNamespaces = resolveNamespaces,
                        .node = .node, noMatchOkay = noMatchOkay, xpathFuns = xpathFuns)

  # Now convert the results
  if(length(ans))
     ans = lapply(ans, toXMLNode)

  ans
}


xpathApply.XMLInternalDocument =
function(doc, path, fun = NULL, ... , namespaces = xmlNamespaceDefinitions(doc, simplify = TRUE),
          resolveNamespaces = TRUE, addFinalizer = NA, xpathFuns = list(), .node = NULL, noMatchOkay = FALSE,
           sessionEncoding = CE_NATIVE, noResultOk = FALSE) # native
{
  path = paste(path, collapse = " | ")

  if(is(namespaces, "list") && all(sapply(namespaces, is, "XMLNamespaceDefinition"))) {
     namespaces = structure(sapply(namespaces, `[[`, "uri"), names = names(namespaces))
  }

  if(resolveNamespaces && !inherits( namespaces, "XMLNamespaceDefinitions"))
    namespaces = matchNamespaces(doc, namespaces)

  if(!is.null(fun) && !is.call(fun))
    fun = match.fun(fun)

  # create an expression of the form fun(x, ...) and the C code will insert x for each node.
  args = list(...)
  if(length(args))
    fun = as.call(c(fun, append(1, args)))


      #XXX Match the session encoding c("native" = 0, utf8 = 1, latin1 = 2)
  encoding = if(is.integer(sessionEncoding))
                sessionEncoding
             else
                getEncodingREnum(sessionEncoding)

  anonFuns = NULL
  if(is.character(xpathFuns))
      xpathFuns = as.list(xpathFuns)
  else
      anonFuns = xpathFuns[ vapply(xpathFuns, is.function, FALSE) ]


  ans = .Call("RS_XML_xpathEval", doc, .node, as.character(path), namespaces, fun, encoding, addFinalizer, xpathFuns, anonFuns, PACKAGE = "XML")

  if(!noMatchOkay && length(ans) == 0 && length(getDefaultNamespace(xmlRoot(doc))) > 0) {
    tmp = strsplit(path, "/")[[1]]
       # if they have a function call, ignore.
    tmp = tmp[ - grep("\\(", path) ]
    if(length(grep(":", tmp)) != length(tmp) && !noResultOk)
        warning("the XPath query has no namespace, but the target document has a default namespace. This is often an error and may explain why you obtained no results")
  }

  ans
}

xmlDoc =
function(node, addFinalizer = TRUE)
{
  if(!is(node, "XMLInternalElementNode"))
    stop("xmlDoc must be passed an internal XML node")

  doc = .Call("RS_XML_createDocFromNode", node, PACKAGE = "XML")
  addDocFinalizer(doc, addFinalizer)

  doc
}


# Used to use
#   getDefaultNamespace(doc)


if(FALSE)  {
xpathApply.XMLInternalNode =
  #
  #  There are several situations here.
  #  We/libxml2 needs a document to search.
  #  We have a node. If we use its document, all is fine, but the
  #  search will be over the entire document.  We may get nodes from other sub-trees
  #  that do not pertain to our starting point (doc).
  #  Alternatively, we can create a new doc with this node as the top-level node
  #  and do the search. But then we end up with new nodes. So if you want to find
  #  nodes in the original document in order to change them rather than just read information
  #  from them, then you will be sorely mistaken when you think your changes have been applied
  #  to the original document.

  #
  # Regardless of what we do, we still have to figure out the adding of the doc attribute.
  #
function(doc, path, fun = NULL, ... , namespaces = xmlNamespaceDefinitions(doc, simplify = TRUE),
          resolveNamespaces = TRUE)
{
  addDocAttribute = FALSE

    # This is a wholesale copy.
  addDocAttribute = TRUE
  doc = xmlDoc(doc, TRUE)

     #
     # If the doc is already there, can't we just use that without copying it? Yes.
     # XXX maybe not. Looks like libxml2 starts at the top of the doc again.
     # But if there is no doc for this node, then we create a new doc and
     # put a finalizer on it. But we attach the document as an attribute to each of the
     # the resulting nodes. Then it will be protected from gc() and so will the nodes
     # until each of the nodes are released.
     #XXX???  What if we do a subsequent search on another of these nodes.
     # Then need to add it to the results.
if(FALSE) {
  tmp = as(doc, "XMLInternalDocument")
  addDocAttribute = is.null(tmp)
  if(is.null(tmp)) {
     if(!is.null(attr(doc, "document")))
       doc = attr(doc, "document")
     else
       doc =  newXMLDoc(node = doc) # if we used  xmlDoc(doc), no finalizer.
  } else
     doc = tmp
}

  ans = xpathApply(doc, path, fun, ..., namespaces = namespaces, resolveNamespaces = resolveNamespaces)

  if(addDocAttribute && length(ans))
    ans = lapply(ans, function(x) { attr(x, "document") = doc; x})

  ans
}
} # end if if(FALSE)



getRootNode =
function(node)
{
  p = node
  while(!is.null(xmlParent(p)))
    p = xmlParent(p)

  p
}

xpathApply.XMLInternalNode =
xpathSubNodeApply =
  #
  # This allows us to use XPath to search within a sub-tree of the document, i.e.
  # from a particular node.
  # This is slightly tricky because the libxml2 routines require a document to search.
  # We could copy the nodes to a new document, e.g.
  #    xmlDoc(node)
  # but then the results would be for new nodes, not the original ones.
  # So we would not be able to find nodes and then modify them as we would be modifying the
  # copies.
  #
  #  If this what is desired, use
  #      doc = xmlDoc(node)
  #      xpathApply(doc, xpath, ...)
  #  and then that is what you will get.

  #
  #  In the case that you want the original nodes in the result,
  #  then we have to do a little bit of work. We create a new document
  #  and set the source node as its root node.  We arrange to
  #   a) put the node back where it came from and b) free the document.
  #  So there is no memory leak.
  #
  #  The other thing we must do is to find the
  #
  #
  # This version avoids doing any copying of nodes when there is a document already
  # associated with the nodes.
  #
function(doc, path, fun = NULL, ...,
          namespaces = xmlNamespaceDefinitions(doc, simplify = TRUE),
           resolveNamespaces = TRUE, addFinalizer = NA, xpathFuns = list())
{
  path = paste(path, collapse = " | ")

  node = doc

  addDocAttribute = FALSE
  createdNewDocument = FALSE
  tmp = as(doc, "XMLInternalDocument")

  putBack =
    function(node, info)  {
      if(!is.null(info$left))
        addSibling(info$left, node)
      else if(!is.null(info$right))
        addSibling(info$right, node, after = FALSE)
      else if(!is.null(info$parent))
        addChildren(info$parent, node)
      else if(!is.null(tmp))
        addChildren(tmp, node)
    }
  info = list(parent = xmlParent(node),
              left = getSibling(node, after = FALSE),
              right = getSibling(node, after = TRUE))


       # The approaches here are to create a new empty document and then set the node
       # to be its root. We don't set the document for each of the sub-nodes but just this
       # top-level node. Then we arrange that when we end this function, we discard the
       # newly created document and put the node back into the original tree in its
       # original position.
       # This involves knowing the parent and the position at which to put the node back into the tree.
       # If it is the top most node, i.e. no parent, then it is simple - just set the parent back to NULL.
       # If it has a parent, but no siblings, just set the parent.
       # And if it has a sibling, put it back next to that sibling.
       #     If it is the first child, put to the left of the sibling.
       #     If it is not, put to the right.
       # Need to make certain the resulting nodes have the original document
       # Use xmlSetTreeDoc rather than node->doc = node as this is recursive.
       # And so this is now all done in the on.exit() via the call to RS_XML_setDocEl()
#    doc = newXMLDoc(node = node, addFinalizer = getNativeSymbolInfo("R_xmlFreeDocLeaveChildren")$address)

    doc = newXMLDoc(addFinalizer = FALSE)
    parent = xmlParent(node)
    .Call("RS_XML_setRootNode", doc, node, PACKAGE = "XML")
    on.exit({ .Call("RS_XML_unsetDoc", node, unlink = TRUE, parent, TRUE, PACKAGE = "XML")
              .Call("RS_XML_freeDoc", doc, PACKAGE = "XML")
              if(!is.null(tmp)) {
                                        # Need to create a new document with the current node as the root.
                                        # When we are finished, we have to ensure that we put the node back into the original document
                                        # We can use the same mechanism as when we have to create the document from scratch.
               .Call("RS_XML_setDocEl", node, tmp, PACKAGE = "XML")
              }
              putBack(node, info)
            })

    docName(doc) = paste("created for xpathApply for", path, "in node", xmlName(node))


  ans = xpathApply(doc, path, NULL, namespaces = namespaces, resolveNamespaces = resolveNamespaces, addFinalizer = addFinalizer, xpathFuns = xpathFuns)

  if(length(ans) == 0)
    return(ans)

    # now check if the result was actually a descendant of our top-level node for this
    # query. It is possible that it arose from a different sub-tree.

  w = sapply(ans, function(el) .Call("RS_XML_isDescendantOf", el, node, strict = FALSE, PACKAGE = "XML"))

  ans = ans[w]

#  if(FALSE && addDocAttribute && length(ans))
#     ans = lapply(ans, function(x) { attr(x, "document") = doc; x})

#  if(createdNewDocument)
#       # Need to remove the links  from these nodes to the parent.
#    lapply(ans, function(x) .Call("RS_XML_unsetDoc", x, unlink = FALSE, TRUE))


  if(!is.null(fun))
     lapply(ans, fun, ...)
  else
     ans
}



if(TRUE)
xpathApply.XMLInternalNode =
function(doc, path, fun = NULL, ...,
          namespaces = xmlNamespaceDefinitions(doc, simplify = TRUE),
           resolveNamespaces = TRUE, addFinalizer = NA, xpathFuns = list())
{
  ndoc = as(doc, "XMLInternalDocument")
  if(is.null(ndoc))
    xpathSubNodeApply(doc, path, fun, ..., namespaces = namespaces, resolveNamespaces = resolveNamespaces, addFinalizer = addFinalizer, xpathFuns = xpathFuns)
  else
    xpathApply.XMLInternalDocument(ndoc, path, fun, ...,
                                 namespaces = namespaces, resolveNamespaces = resolveNamespaces,
                                 .node = doc, addFinalizer = addFinalizer, xpathFuns = xpathFuns)
}


xpathApply.XMLDocument =
#xpathApply.XMLNode =
function(doc, path, fun = NULL, ... , namespaces = xmlNamespaceDefinitions(doc, simplify = TRUE),
          resolveNamespaces = TRUE, .node = NULL, addFinalizer = NA, xpathFuns = list())
{
  txt = saveXML(doc)
  doc = xmlParse(txt, asText = TRUE)
  ans = xpathApply(doc, path, fun, ..., namespaces = namespaces, resolveNamespaces = resolveNamespaces, .node = .node,
                     addFinalizer = addFinalizer, xpathFuns = xpathFuns)

  if(length(ans))
     ans = lapply(ans, toXMLNode)

  ans
 # stop("XPath expressions cannot be applied to R-level nodes. Use xmlParse() to process the document and then use xpathApply()")
}


# d = xmlTreeParse("data/book.xml", useInternal = TRUE)
# ch = getNodeSet(d, "//chapter")
# xpathApply(ch[[1]], "//section/title", xmlValue)


# d = xmlTreeParse("data/mtcars.xml", useIntern = TRUE); z = getNodeSet(d, "/dataset/variables")
#  xpathApply(z[[1]], "variable[@unit]", NULL, namespaces = character())

getXMLPath =
function(node, defaultPrefix = "ns")
{
  paste(unlist(c("", xmlAncestors(node, xmlName, defaultPrefix))), collapse = "/")
}

xmlAncestors =
function(x, fun = NULL, ..., addFinalizer = NA, count = -1L)
{
  ans = list()
  tmp = x
  while(!is.null(tmp)) {
    if(!is.null(fun))
      ans = c(fun(tmp, ...), ans)
    else
      ans = c(tmp, ans)

    if(count > 0 && length(ans) == count)
      break

    tmp = xmlParent(tmp, addFinalizer = addFinalizer)
  }
  ans
}

Try the XML package in your browser

Any scripts or data that you put into this service are public.

XML documentation built on Nov. 3, 2023, 1:14 a.m.