R/readHTMLTable.R
In XML: Tools for Parsing and Generating XML Within R and S-Plus

trim =
function(x)
  gsub("(^[[:space:]]+|[[:space:]]+$)", "", x)


textNodesOnly =
  # Only process the top-level text nodes, not recursively.
  # Could be done as simply as
  #     xmlValue(x, recursive = FALSE)
function(x)
    paste(xmlSApply(x, function(n) if(is(n, "XMLInternalTextNode")) xmlValue(n) else ""), collapse = "")


toNumber =
function(x)
{
  as.numeric(gsub("[%,]", "", x))
}


if(FALSE) {
  doc = htmlParse("http://elections.nytimes.com/2008/results/states/president/california.html")
  tbls = getNodeSet(doc, "//table[not(./tbody)]|//table/tbody")
  o = readHTMLTable(tbls[[1]], skip.rows = c(1, Inf), header = FALSE, colClasses = c("character", replicate(5, toNumber)), elFun = textOnly)


  o = readHTMLTable("http://elections.nytimes.com/2008/results/states/president/california.html")

  x = readHTMLTable("http://www.usatoday.com/news/politicselections/vote2004/CA.htm", as.data.frame = FALSE)
}

setGeneric("readHTMLTable",
          function(doc, header = NA,
                    colClasses = NULL, skip.rows = integer(), trim = TRUE, elFun = xmlValue,
                     as.data.frame = TRUE, which = integer(), ...)
             standardGeneric("readHTMLTable"))

setMethod("readHTMLTable", "character",
          function(doc, header = NA,
                    colClasses = NULL, skip.rows = integer(), trim = TRUE, elFun = xmlValue,
                     as.data.frame = TRUE, which = integer(), encoding = character(), ...) {
              pdoc = htmlParse(doc, encoding = encoding)
              readHTMLTable(pdoc, header, colClasses, skip.rows, trim, elFun, as.data.frame, which, ...)
          })


 # XXX Should vectorize in header, colClasses, i.e. allow different values for different tables.
setMethod("readHTMLTable", "HTMLInternalDocument",
          function(doc, header = NA,
                    colClasses = NULL, skip.rows = integer(), trim = TRUE, elFun = xmlValue,
                     as.data.frame = TRUE, which = integer(), ...)
{
     #  tbls = getNodeSet(doc, "//table[not(./tbody)]|//table/tbody")
   tbls = getNodeSet(doc, "//table")  # XXX probably want something related to nested tables
                                      # "//table[not(ancestor::table)]" -> outer ones
      # if header is missing, compute it each time.
   if(length(which))
       tbls = tbls[which]

#   ans = lapply(tbls, readHTMLTable,  header, colClasses, skip.rows, trim, elFun, as.data.frame, ...)
   header = rep(header, length = length(tbls))
   ans = mapply(readHTMLTable,
                 tbls, header,
                 MoreArgs = list(colClasses = colClasses, skip.rows = skip.rows, trim = trim, elFun = elFun, as.data.frame = as.data.frame, ...),
                 SIMPLIFY = FALSE)
   names(ans) = sapply(tbls, getHTMLTableName)

   if(length(which) && length(tbls) == 1)
      ans[[1]]
   else
      ans
})

getHTMLTableName =
function(node)
{
  id = xmlGetAttr(node, "id")
  if(!is.null(id))
    return(id)

  cap = getNodeSet(node, "./caption")
  if(length(cap))
    return(xmlValue(cap[[1]]))
}

setClass("FormattedNumber", contains = "numeric")
setClass("FormattedInteger", contains = "integer")

setAs('character', 'FormattedNumber', function(from) as.numeric(gsub(",", "", from)))
setAs('character', 'FormattedInteger', function(from) as.integer(gsub(",", "", from)))

setClass("Currency", contains = "numeric")
setAs("character", "Currency",
       function(from)
          as.numeric(gsub("[$,]", "", from)))

setClass("Percent", contains = "numeric")
setAs('character', 'Percent', function(from) as.numeric(gsub("%", "", from)))

setMethod("readHTMLTable", "XMLInternalElementNode",
#readHTMLTable.XMLInternalElementNode  =
  #
  #
  # header is computed based on whether we have a table node and it has a thead.
  #  (We don't currently bother with the col spans.)
  #
  #  colClasses can be a character vector giving the name of the type for a column,
  #  an NULL to drop the corresponding column, or a function in which case it will
  #  be passed the contents of the column and can transform it as it wants.
  #  This allows us to clean text before converting it.
  #
  # skip.rows - an integer vector indicating which rows to ignore.
  #
  #  trim - a logical indicating whether to trim white space from the start and end of text.
  #
  #  elFun - a function which is called to process each th or td node to extract the content.
  #      This is typically xmlValue, but one can supply others (e.g. textNodesOnly)

  #  as.data.frame
  #

function(doc, header = NA ,
          colClasses = NULL, skip.rows = integer(), trim = TRUE, elFun = xmlValue,
            as.data.frame = TRUE, encoding = 0L, ...)
{

  node = doc
  headerFromTable = FALSE
  dropFirstRow = FALSE


     # check if we have a header
  if(length(header) == 1 && is.na(header))                                     # this node was doc
      header = (xmlName(doc) %in% c("table", "tbody") &&
                      ("thead" %in% names(doc) || length(getNodeSet(node, "./tr[1]/th | ./tr[1]/td")) > 0))

  if(is.logical(header) && (is.na(header) || header) &&  xmlName(node) == "table") {
    if("thead" %in% names(node))
       header = node[["thead"]]
    else {
       if("tr" %in% names(node))
          tmp = node[["tr"]]
       else
          tmp = node[["tbody"]][["tr"]]

       if(!is.null(tmp) && all(names(tmp) %in% c('text', 'th'))) {
          header = xpathSApply(tmp, "./th | ./td", xmlValue, encoding = encoding)
          dropFirstRow = TRUE
        }
    }
  }

     # Moved this from before the check for header as we set node here and that seems
     # premature. Checked on
     #     readHTMLTable("http://www.google.com/finance?q=NASDAQ:MSFT&fstype=ii", header = TRUE, which = 1)
  tbody = getNodeSet(node, "./tbody")
  if(length(tbody))
     node = tbody[[1]]

  if(is(header, "XMLInternalElementNode"))   {
      # get the last tr in the thead
     if(xmlName(header) == "thead") {
        i = which(names(header) == "tr")
        header = header[[ i [ length(i) ] ]]
        xpath = "./th | ./td"
     } else
        xpath = "./*/th | ./*/td"

     header = as.character(xpathSApply(header, xpath, elFun, encoding = encoding))
     headerFromTable = TRUE

     if(xmlName(node) == "table" && "tbody" %in% names(node))
        node = node[["tbody"]]
   }

     # Process each row, by getting the content of each "cell" (th/td)
  rows = getNodeSet(node, ".//tr")
  if(dropFirstRow)
     rows = rows[-1]
  els =  lapply(rows, function(row) {
                           tmp = xpathSApply(row, "./th|./td", elFun)
                           if(trim)
                              trim(tmp)
                           else
                              tmp
                        })


#  spans = getNodeSet(node, ".//td[@rowspan] | .//th[@rowspan]")


  if(length(skip.rows)) {
    infs = (skip.rows == Inf)
    if(any(infs))
          # want Inf - 2, Inf - 1, Inf,  to indicate drop last 3, but that won't work
          # take sequence of Inf to identify Inf - 2, Inf - 1, Inf
       skip.rows[skip.rows == Inf] = length(els)  - seq(0, length = sum(infs))
    els = els[ - skip.rows ]
  }

  if(length(els) == 0)
    return(NULL)

   numEls = sapply(els, length)
                                                           # els[[1]] should be a scalar
   if(is.logical(header) && !is.na(header) && header && any(nchar(els[[1]]) < 999)) {
     header = els[[1]]
     els = els[-1]
     numEls = numEls[ - 1]
   }

  if(length(els) == 0)
    return(NULL)  #XXX we should have a header here so return a data frame with 0 rows.

   ans = lapply(seq(length = max(numEls)),
                  function(col) {
                    sapply(els, `[`, col)
                  })

   if(is.character(header) && length(header) == length(ans))
      names(ans) = header

   if(length(colClasses)) {

      colClasses = rep(colClasses, length = length(ans))

      n = sapply(colClasses, function(x) is.null(x) || x == "NULL")
      if(any(n)) {
         ans = ans[ ! n ]
         colClasses = colClasses[ ! n ]
      }

      ans = lapply(seq(along = ans) ,
                      function(i)
                         if(is.function(colClasses[[i]]))
                            colClasses[[i]](ans[[i]])
                         else if(colClasses[[i]] == "factor")
                             factor(ans[[i]])
                         else if(colClasses[[i]] == "ordered")
                             ordered(ans[[i]])
                         else
                            as(ans[[i]], colClasses[[i]])
                   )

   }

   if(as.data.frame)  {
     ans = as.data.frame(ans, ...)
     if(is.character(header) && length(header) == length(ans))
        names(ans) = header
     else if(nrow(ans) > 0)
       names(ans) = paste("V", seq(along = ans), sep = "")
   }

  ans
})



getTableWithRowSpan =
function(node, r = xmlSize(node),
          c = max(xmlSApply(node, function(x) length(getNodeSet(x, "./td | ./th")))),
           encoding = 0L)
{
  ans = matrix(NA_character_, r, c)
  for(i in seq(length = r)) {
     col = 1
     kids = getNodeSet(node[[i]], "./th | ./td")
     for(k in seq(along = kids)) {
       sp = xmlGetAttr(k, "rowspan", 1)
       ans[seq(i, length = sp)] = xmlValue(k, encoding = encoding)
     }
  }
}