R/xpath.R

Defines functions mclm_xml_text find_xpath

Documented in find_xpath mclm_xml_text

#' Run XPath query
#' 
#' This function finds matches for an XPath query in a corpus.
#'
#' @param x A corpus: an [`fnames`] object, a character vector of an XML source,
#'   or a document parsed with [xml2::read_xml()].
#' @param pattern An XPath query.
#' @param fun Function to be applied to the individual nodes prior
#'   to returning the result.
#' @param final_fun Function to be applied to the complete list
#'  of matches prior to returning the result.
#' @param namespaces A namespace as generated by [xml2::xml_ns()].
#' @param ... Additional arguments.
#'
#' @return A nodeset or the output of applying `fun` to a nodeset.
#' @export
#' 
#' @examples 
#' test_xml <- '
#' <p>
#'   <w pos="at">The</w>
#'   <w pos="nn">example</w>
#'   <punct>.</punct>
#' </p>'
#' 
#' find_xpath(test_xml, "//w")
#' find_xpath(test_xml, "//@pos")
#' find_xpath(test_xml, "//w[@pos='nn']")
#' 
#' find_xpath(test_xml, "//w", fun = xml2::xml_text)
#' find_xpath(test_xml, "//w", fun = xml2::xml_attr, attr = "pos")
find_xpath <- function(x,
                       pattern,
                       fun = NULL,
                       final_fun = NULL,
                       namespaces = NULL,
                       ...) {
  # TODO write tutorial
  if ("xml_document" %in% class(x)) {
    x <- list(x)
  }
  res <- vector("list", length(x))
  for (i in seq_along(x)) {
    cur <- x[[i]]
    if (is.character(cur)) { 
      cur <- xml2::read_xml(cur)
    }
    if (!("xml_document" %in% class(cur) || "xml_node" %in% class(cur))) {
      stop("x must be a character vector or an xml document")
    }
    #res[[i]] <- cur[pattern]
    if (is.null(namespaces)) {
      namespaces = xml2::xml_ns(cur)
    }
    res[[i]] <- xml2::xml_find_all(cur, pattern,
                                   ns = namespaces, ...)
  }
  res <- structure(do.call(c, res), class = "xml_nodeset")
  if (!is.null(fun)) {
    res <- fun(res, ...)
  }
  if (!is.null(final_fun)) {
    res <- do.call(final_fun, list(res))
  }    
  res
}

#' Get text from xml node
#'
#' @param node XML node as read with [xml2::read_xml()].
#' @inheritParams xml2::xml_text
#'
#' @return Character vector: The text value of the (elements of the) node,
#'  concatenated with spaces in between.
#' @export
#'
#' @examples
#' test_xml <- '
#' <p>
#'   <w pos="at">The</w>
#'   <w pos="nn">example</w>
#'   <punct>.</punct>
#' </p>'
#' 
#' test_xml_parsed <- xml2::read_xml(test_xml)
#' 
#' # xml2 output
#' xml2::xml_text(test_xml_parsed)
#' 
#' # mclm version
#' mclm_xml_text(test_xml_parsed)
mclm_xml_text <- function(node, trim = FALSE) {
  if (xml2::xml_length(node) == 0) {
    xml2::xml_text(node, trim = trim)
  } else {
    paste(vapply(xml2::xml_contents(node),
                 mclm_xml_text, trim = trim, FUN.VALUE = character(1)),
          collapse = " ")
  }
}

Try the mclm package in your browser

Any scripts or data that you put into this service are public.

mclm documentation built on Oct. 3, 2022, 9:07 a.m.