#
# This is an experiment to see if a simple hash of the values
# is faster.
#
# Basically, we keep the parents, children and nodes
# each as hash tables not a list. Otherwise, this resembles
# a flat tree
#
# The notion is that we have a collection of nodes
# and a collection of .parents and .children
#
# Each element in .parents is assigned to the name of the node
# whose parent is being stored. The value is the identifier for the
# parent node. The top node has no entry in this collection.
#
# The .children environment maintains a collection of entries
# indexed by the identifier of the relevant node.
# The value is a character vector containing the identifiers of the
# nodes which are children of this node.
#
xmlHashTree =
#
# Currently ignore the nodes, parents and children.
#
function(nodes = list(), parents = character(), children = list(),
env = new.env(TRUE, parent = emptyenv()))
{
# function to generate a new node identifier. Can be given the
# proposed name and will then make one up if that conflicts with another
# identifier.
.count = 0
# ability to be able to refer to this tree itself.
# Not used here since the functions don't make use of the tt environment implicitly.
# assign(".this", env, env)
# We will store the children and parents as regular entries in these hash tables.
env$.children = .children = new.env(TRUE)
env$.parents = .parents = new.env(TRUE)
#XXX we can do without this and make it a regular function
# but we need to deal with XMLFlatListTree slightly different.
f = function(suggestion = "") {
# the check to see if suggestion is a name in env is very expensive.
if(suggestion == "" || exists(suggestion, env, inherits = FALSE))
as.character(.count + 1) # can use length(tt)
else
suggestion
}
assign(".nodeIdGenerator", f, env)
addNode =
# This adds each new node.
function(node, parent = character(), ..., attrs = NULL, namespace = NULL,
namespaceDefinitions = character(),
.children = list(...),
cdata = FALSE,
suppressNamespaceWarning = getOption('suppressXMLNamespaceWarning', FALSE))
{
if(is.character(node))
node = xmlNode(node, attrs = attrs, namespace = namespace, namespaceDefinitions = namespaceDefinitions)
.kids = .children
.children = .this$.children
node = asXMLTreeNode(node, .this, className = "XMLHashTreeNode")
id = node$id
assign(id, node, env)
.count <<- .count + 1
# if no parent,
if(!inherits(parent, "XMLNode") && (!is.environment(parent) && length(parent) == 0) || identical(parent, ""))
return(node)
if(inherits(parent, "XMLHashTreeNode"))
parent = parent$id
if(length(parent)) {
assign(id, parent, envir = .parents)
if(exists(parent, .children, inherits = FALSE))
tmp = c(get(parent, .children), id)
else
tmp = id
assign(parent, tmp, .children)
}
return(node)
}
env$.addNode <- addNode
# Create a .nodes vector with the names
# of the node. And then makes a
#
.tidy = function() {
idx <- idx - 1
length(nodeSet) <- idx
length(nodeNames) <- idx
names(nodeSet) <- nodeNames
.nodes <<- nodeSet
idx
}
# environment(env$.tidy) <- env
.this = structure(env, class = oldClass("XMLHashTree"))
.this
}
# Example of looping over all elements
# table(unlist(eapply(tree, xmlName)))
getDescendants =
#
# This is trying to avoid recursion and use iteration.
#
function(id, tree, kids = tree$.children)
{
# if no kids, then return empty list
if(!exists(id, kids))
return(character())
ans = character()
tmp = get(id, kids)
hasKids = objects(kids)
while( length( tmp ) > 0) {
ans = c(ans, tmp)
tmp = tmp[ tmp %in% hasKids ]
k = get(tmp[1], kids)
tmp = c(tmp[-1], k)
}
ans
}
getDescendants =
# Simple mechanism, for xmlHashTree trees.
# This is recursive.
function(id, tree, kids = tree$.children)
{
if(inherits(id, "XMLHashTreeNode")) {
if(missing(tree))
tree = id$env
id = id$id
}
if(!exists(id, kids))
return(character())
ans = get(id, kids)
c(ans, unlist(lapply(ans, getDescendants, tree, kids)))
#Debugging
# names(ans) = sapply(ans, function(i) xmlName(get(i, tree)))
}
subtree = copyXMLHashSubTree =
function(node)
{
# find all the nodes below this node, i.e. in the subtree
tree = node$env
ids = getDescendants(node$id, tree, tree$.children)
newTree = xmlHashTree()
# Now copy the parent & children information to the new tree
# and also the modified nodes. The only modification necessary
# is to set the env field of the original node to the new tree.
sapply(c(node$id, ids), function(id) {
n = get(id, tree)
n$env = newTree
assign(id, n, newTree)
if(exists(id, tree$.children))
assign(id, get(id, tree$.children), newTree$.children)
if(exists(id, tree$.parents))
assign(id, get(id, tree$.parents), newTree$.parents)
})
remove(list = node$id, envir = newTree$.parents)
newTree
}
xmlNamespaceDefinitions.XMLAbstractDocument =
function(x, addNames = TRUE, recursive = FALSE, simplify = FALSE, ...)
{
xmlNamespaces(as(x, "XMLAbstractNode"))
}
setAs("XMLAbstractDocument", "XMLAbstractNode",
function(from)
xmlRoot(from))
setAs("XMLHashTreeNode", "XMLHashTree",
function(from)
from$env
)
"$.XMLHashTree" =
function(x, name)
get(name, x, inherits = FALSE)
setMethod("xmlParent", "XMLHashTreeNode",
# To get the parent of the node 'obj', we have to look in the .parents object
# for the variable with obj's node identifier and then get the corresponding
# value which is the identifier of the parent.
function(x, ...)
{
p = get(".parents", x$env)
idx = exists(x$id, p, inherits = FALSE)
if(!idx)
return(NULL)
get(get(x$id, p), x$env)
} )
xmlChildren.XMLHashTreeNode =
#
# For a given node 'obj', we have to use its id to find the entry
# in the .children hash table and then the resulting entry is a character
# vector giving the ids of the child nodes of obj. So we have to resolve those
# children id's back in the hash table for the actual nodes.
function(x, addNames = TRUE, ...)
{
e = x$env
kids = get(".children", e)
if(exists(x$id, kids, inherits = FALSE)) {
ids = get(x$id, kids, inherits = FALSE)
nodes = lapply(ids, get, e, inherits = FALSE)
names(nodes) = sapply(nodes, xmlName)
nodes
} else
list()
}
if(useS4)
setMethod("xmlChildren", "XMLHashTreeNode", xmlChildren.XMLHashTreeNode)
xmlSize.XMLHashTreeNode =
function(obj)
{
length(xmlChildren(obj))
}
xmlSize.XMLHashTree =
function(obj)
{
# 3 is the number of entries with a . prefix that we put there
# for our own implementation purposes
# We could calculate this as
# length(grep("^.", objects(obj, all = TRUE))
length(obj) - 3
}
#??? Currently overridden below
xmlRoot.XMLHashTree =
function(x, skip = TRUE, ...)
{
id = setdiff(objects(x), objects(x[[".parents"]]))
get(id, x)
}
"[[.XMLHashTreeNode" =
function(x, ..., copy = FALSE, exact = TRUE)
{
# ans = NextMethod("[[")
ans = xmlChildren(x)[[...]]
if(copy)
xmlRoot(subtree(ans))
else
ans
}
addNode =
function(node, parent, to, ...)
{
UseMethod("addNode", to)
}
addNode.XMLHashTree =
function(node, parent = character(), to, ...)
{
to[[".addNode"]](node, parent, ...)
}
xmlRoot.XMLHashTree =
#
# This can return multiple roots
#
# Find all the identities of the nodes for which there is no
# corresponding entry n the .parents
#
#
# If skip is TRUE, discard comment nodes. Leave PI nodes, etc.
#
# If all is TRUE, return a list() with all the top-level nodes.
#
function(x, skip = TRUE, all = FALSE, ...)
{
parents = get(".parents", x, inherits = FALSE)
tops = objects(x)[ is.na(match(objects(x), objects(parents)))]
if(length(tops) == 0)
return(NULL)
ans = mget(tops, x)
if(skip)
ans = ans[!sapply(ans, inherits, c("XMLCommentNode"))] #XXX names of XML hash tree nodes for comment, processing instruction, text node, etc.
if(all)
return(ans)
ans[[1]]
}
getSibling =
# Access the next field in the xmlNodePtr object.
# not exported.
function(node, after = TRUE, ...)
UseMethod("getSibling")
getSibling.XMLHashTreeNode =
function(node, after = TRUE, ...)
{
.this = node$env
parent = xmlParent(node)
if(!is.null(parent)) {
kids = xmlChildren(parent)
} else
kids = xmlRoot(.this, skip = FALSE, all = TRUE)
i = match(node$id, sapply(kids, function(x) x$id))
if(is.na(i))
stop("shouldn't happen")
if(after) {
if(i < length(kids))
kids[[i+1]]
else
NULL
} else {
if(i > 1)
kids[[i-1]]
else
NULL
}
}
print.XMLHashTree =
function(x, ...)
{
print(xmlRoot(x), ...)
}
xmlElementsByTagName.XMLHashTree =
#
# non-recursive version only at present
#
function(el, name, recursive = FALSE)
{
kids = xmlChildren(el)
if(!recursive)
return(kids [ sapply(kids, xmlName) == name ])
}
convertToHashTree =
function(from)
{
xx = xmlHashTree()
ans = .Call("R_convertDOMToHashTree", from, xx, xx$.children, xx$.parents, PACKAGE = "XML")
docName(xx) = docName(from)
xx
}
setAs("XMLInternalDocument", "XMLHashTree",
function(from) {
convertToHashTree(xmlRoot(from, skip = FALSE))
})
setAs("XMLInternalNode", "XMLHashTree",
function(from) {
ans = convertToHashTree(from)
docName(ans) <- docName(from)
ans
})
docName.XMLHashTree =
function(doc)
{
if(exists(".doc", doc))
doc$.doc
else
as.character(NA)
}
setMethod("docName", "XMLHashTree", docName.XMLHashTree)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.