R/htmlParse.R
In XML: Tools for Parsing and Generating XML Within R and S-Plus

Documented in parseURI

isURL = 
function(file)
{
  is.character(file) && grepl("^(http|ftp)", file)
}


############
#XXXXXXXXX
# This is now replaced by copying xmlTreeParse.
htmlTreeParse <- 
#
# HTML parser that reads the entire `document' tree into memory
# and then converts it to an R/S object. 
# Uses the libxml from Daniel Veillard at W3.org. 
#
# asText  treat the value of file as XML text, not the name of a file containing
#       the XML text, and parse that.
# See also xml
#
function(file, ignoreBlanks = TRUE, handlers = NULL,
           replaceEntities = FALSE, asText = inherits(file, "AsIs") || !isURL && grepl("^<", file), # could have a BOM
            trim = TRUE, 
            isURL = is.character(file) && grepl("^(http|ftp)", file),
            asTree = FALSE, useInternalNodes = FALSE,
            encoding = character(),
            useDotNames = length(grep("^\\.", names(handlers))) > 0,
            xinclude = FALSE, addFinalizer = TRUE, error = function(...){},
            options = integer(), parentFirst = FALSE)
{
if(TRUE) 
  {
     doc = xmlTreeParse(file, ignoreBlanks, handlers, replaceEntities, asText, trim, validate = FALSE,
                      getDTD = FALSE, isURL, asTree, addAttributeNamespaces = FALSE, 
                       useInternalNodes, isSchema = FALSE, fullNamespaceInfo = FALSE,
                        encoding, useDotNames, xinclude, addFinalizer, error, isHTML = TRUE, options = options)
     class(doc) = c("HTMLInternalDocument", class(doc)[1])
     return(doc)
  }
 

  if(length(file) > 1) {
   file = paste(file, collapse = "\n")
    if(!missing(asText) && !asText) 
      stop("multiple URIs passed to xmlTreeParse. If this is the content of the file,  specify asText = TRUE")   
   asText = TRUE
 }


  if(missing(asText) && substring(file, 1, 1) == "<")
    asText = TRUE
  
  if(!asText && missing(isURL)) {
     isURL <- length(grep("^(http|ftp)://", file, useBytes = TRUE, perl = TRUE)) 
  }

    # check whether we are treating the file name as
    # a) the XML text itself, or b) as a URL.
    # Otherwise, check if the file exists and report an error.
 if(asText == FALSE && isURL == FALSE) {
  if(file.exists(file) == FALSE)
     stop(paste("File", file, "does not exist "))
 }

 if(!asText && !isURL)
   file = path.expand(file)

 if (replaceEntities) options <- union(options, NOENT)

 if(!is.logical(xinclude)) {
   if(inherits(xinclude, "numeric"))
    xinclude = bitlist(xinclude)
   else
     xinclude = as.logical(xinclude)   
 }

 .oldErrorHandler = setXMLErrorHandler(error)
 on.exit(.Call("RS_XML_setStructuredErrorHandler", .oldErrorHandler, PACKAGE = "XML"), add = TRUE)
  
 ans <- .Call("RS_XML_ParseTree", as.character(file), handlers, 
         as.logical(ignoreBlanks), as.logical(replaceEntities),
          as.logical(asText), as.logical(trim), 
           FALSE, FALSE, 
           as.logical(isURL), FALSE, 
           as.logical(useInternalNodes), TRUE, FALSE, FALSE, as.character(encoding),
           as.logical(useDotNames), xinclude, error, addFinalizer, options, as.logical(parentFirst), PACKAGE = "XML")

 if(!missing(handlers) & !as.logical(asTree))
   return(handlers)

  if(inherits(ans, "XMLInternalDocument")) {
    addDocFinalizer(ans, addFinalizer)
    class(ans) = c("HTMLInternalDocument", class(ans))
  }

 ans
}

#XXXXXX
# This is another version that doesn't seem to release the document. Weird. I can't seem to find
# out who is holding onto it.
myHTMLParse = 
function(file, ignoreBlanks = TRUE, handlers = NULL,
           replaceEntities = FALSE, asText = inherits(file, "AsIs") || !isURL && grepl("^<", file), # could have a BOM
            trim = TRUE, 
            isURL = is.character(file) && grepl("^(http|ftp)", file),
            asTree = FALSE, useInternalNodes = FALSE,
            encoding = character(),
            useDotNames = length(grep("^\\.", names(handlers))) > 0,
            xinclude = FALSE, addFinalizer = TRUE, error = function(...){})
{
     doc = xmlTreeParse(file, ignoreBlanks, handlers, replaceEntities, asText, trim, validate = FALSE,
                         getDTD = FALSE, isURL, asTree, addAttributeNamespaces = FALSE, 
                           useInternalNodes, isSchema = FALSE, fullNamespaceInfo = FALSE,
                            encoding, useDotNames, xinclude, addFinalizer, error, isHTML = TRUE)
     class(doc) = c("HTMLInternalDocument", class(doc)[2])
     return(doc)
}
 

hideParseErrors = function (...) NULL


htmlTreeParse = xmlTreeParse


formals(htmlTreeParse)$error = as.name("htmlErrorHandler") # as.name("hideParseErrors")
formals(htmlTreeParse)$isHTML = TRUE

htmlParse = htmlTreeParse
formals(htmlParse)$useInternalNodes = TRUE



parseURI =
function(uri)
{
  if(is.na(uri))
    return(structure(as.character(uri), class = "URI"))
  
  u = .Call("R_parseURI", as.character(uri), PACKAGE = "XML")
  if(u$port == 0)
    u$port = as.integer(NA)

  class(u) = "URI"
  
  u
}  

setOldClass("URI")
setOldClass("URL")

setAs("URI", "character",
      function(from) {
          if(from$scheme == "")
              sprintf("%s%s%s",
                      from["path"],
                      if(from[["query"]] != "") sprintf("?%s", from[["query"]]) else "",
                      if(from[["fragment"]] != "") sprintf("#%s", from[["fragment"]]) else "" )
          else
           sprintf("%s://%s%s%s%s%s%s%s",
                                    from[["scheme"]],
                                    from[["user"]],
                                    if(from[["user"]] != "") "@" else "",
                                    from[["server"]],
                                    if(!is.na(from[["port"]])) sprintf(":%d", as.integer(from[["port"]])) else "",
                                    from["path"],
                                    if(from[["query"]] != "") sprintf("?%s", from[["query"]]) else "",
                                    if(from[["fragment"]] != "") sprintf("#%s", from[["fragment"]]) else ""                   
                   )
      })