R/reader.R
In tm: Text Mining Package

Documented in FunctionGenerator getReaders readDataframe readDOC readPDF readPlain readTagged readXML

## Author: Ingo Feinerer
## Readers

FunctionGenerator <-
function(x)
{
    class(x) <- c("FunctionGenerator", "function")
    x
}

getReaders <-
function()
    c("readDataframe", "readDOC", "readPDF", "readPlain", "readRCV1",
      "readRCV1asPlain", "readReut21578XML", "readReut21578XMLasPlain",
      "readTagged", "readXML")

prepareReader <-
function(readerControl, reader = NULL, ...)
{
    if (is.null(readerControl$reader))
        readerControl$reader <- reader
    if (inherits(readerControl$reader, "FunctionGenerator"))
        readerControl$reader <- readerControl$reader(...)
    if (is.null(readerControl$language))
        readerControl$language <- "en"
    readerControl
}

processURI <-
function(uri)
{
    uri <- as.character(uri)
    if (identical(substr(uri, 1, 7), "file://"))
        uri <- substr(uri, 8, nchar(uri))
    uri
}

readDataframe <-
function(elem, language, id) {
    PlainTextDocument(elem$content[, "text"],
                      id = elem$content[, "doc_id"],
                      language = language)
}

# readDOC needs antiword installed to be able to extract the text
readDOC <-
function(engine = c("antiword", "executable"),
         AntiwordOptions = "")
{
    stopifnot(is.character(engine), is.character(AntiwordOptions))

    engine <- match.arg(engine)

    antiword <-
        switch(engine,
               antiword = antiword::antiword,
               executable = function(x)
                   system2("antiword",
                           c(AntiwordOptions, shQuote(normalizePath(x))),
                           stdout = TRUE))

    if (!is.function(antiword))
        stop("invalid function for DOC extraction")

    function(elem, language, id) {
        uri <- processURI(elem$uri)
        content <- antiword(uri)
        PlainTextDocument(content, id = basename(elem$uri), language = language)
    }
}
class(readDOC) <- c("FunctionGenerator", "function")

readPDF <-
function(engine = c("pdftools", "xpdf", "Rpoppler",
                    "ghostscript", "Rcampdf", "custom"),
         control = list(info = NULL, text = NULL))
{
    stopifnot(is.character(engine), is.list(control))

    engine <- match.arg(engine)

    pdf_info <-
        switch(engine,
               pdftools = function(x) {
                   i <- pdftools::pdf_info(x)
                   c(i$keys, list(CreationDate = i$created))
               },
               xpdf = function(x) pdf_info_via_xpdf(x, control$info),
               Rpoppler = Rpoppler::PDF_info,
               ghostscript = pdf_info_via_gs,
               Rcampdf = Rcampdf::pdf_info,
               custom = control$info)

    pdf_text <-
        switch(engine,
               pdftools = pdftools::pdf_text,
               xpdf = function(x) system2("pdftotext",
                                          c(control$text, shQuote(x), "-"),
                                          stdout = TRUE),
               Rpoppler = Rpoppler::PDF_text,
               ghostscript = pdf_text_via_gs,
               Rcampdf = Rcampdf::pdf_text,
               custom = control$text)

    if (!is.function(pdf_info) || !is.function(pdf_text))
        stop("invalid function for PDF extraction")

    function(elem, language, id) {
        uri <- processURI(elem$uri)
        meta <- pdf_info(uri)
        content <- pdf_text(uri)
        PlainTextDocument(content, meta$Author, meta$CreationDate, meta$Subject,
                          meta$Title, basename(elem$uri), language,
                          meta$Creator)
     }
}
class(readPDF) <- c("FunctionGenerator", "function")

readPlain <-
function(elem, language, id) {
    if (!is.null(elem$uri))
        id <- basename(elem$uri)
    PlainTextDocument(elem$content, id = id, language = language)
}

readXML <-
function(spec, doc)
{
    stopifnot(is.list(spec), inherits(doc, "TextDocument"))

    function(elem, language, id) {
        content <- elem$content
        node <- if(inherits(content, "xml_node"))
            content
        else if(is.character(content)) 
            read_xml(paste(elem$content, collapse = "\n"))
        else
            read_xml(content)
        content(doc) <- if ("content" %in% names(spec))
            .xml_content(node, spec[["content"]])
        else
            node
        for (n in setdiff(names(spec), "content"))
            meta(doc, n) <- .xml_content(node, spec[[n]])
        if (!is.null(elem$uri))
            id <- basename(elem$uri)
        if (!length(meta(doc, "id")))
            meta(doc, "id") <- as.character(id)
        if (!length(meta(doc, "language")))
            meta(doc, "language") <- as.character(language)
        doc
    }
}
class(readXML) <- c("FunctionGenerator", "function")

RCV1Spec <-
    list(author = list("unevaluated", ""),
         datetimestamp = list("function", function(node)
           as.POSIXlt(xml_text(xml_find_all(node, "@date")),
                      tz = "GMT")),
         description = list("unevaluated", ""),
         heading = list("node", "title"),
         id = list("node", "@itemid"),
         origin = list("unevaluated", "Reuters Corpus Volume 1"),
         publisher = list("node",
           "metadata/dc[@element='dc.publisher']/@value"),
         topics = list("node",
           "metadata/codes[@class='bip:topics:1.0']/code/@code"),
         industries = list("node",
           "metadata/codes[@class='bip:industries:1.0']/code/@code"),
         countries = list("node",
           "metadata/codes[@class='bip:countries:1.0']/code/@code"))
readRCV1 <-
    readXML(spec = RCV1Spec,
            doc = XMLTextDocument())
readRCV1asPlain <-
    readXML(spec = c(RCV1Spec,
                     list(content = list("node", "text"))),
            doc = PlainTextDocument())

Reut21578XMLSpec <-
    list(author = list("node", "TEXT/AUTHOR"),
         datetimestamp = list("function", function(node)
           strptime(xml_text(xml_find_all(node, "DATE")),
                    format = "%d-%B-%Y %H:%M:%S",
                    tz = "GMT")),
         description = list("unevaluated", ""),
         heading = list("node", "TEXT/TITLE"),
         id = list("node", "@NEWID"),
         topics = list("node", "@TOPICS"),
         lewissplit = list("node", "@LEWISSPLIT"),
         cgisplit = list("node", "@CGISPLIT"),
         oldid = list("node", "@OLDID"),
         origin = list("unevaluated", "Reuters-21578 XML"),
         topics_cat = list("node", "TOPICS/D"),
         places = list("node", "PLACES/D"),
         people = list("node", "PEOPLE/D"),
         orgs = list("node", "ORGS/D"),
         exchanges = list("node", "EXCHANGES/D"))
readReut21578XML <-
    readXML(spec = Reut21578XMLSpec,
            doc = XMLTextDocument())
readReut21578XMLasPlain <-
    readXML(spec = c(Reut21578XMLSpec,
                     list(content = list("node", "TEXT/BODY"))),
            doc = PlainTextDocument())

readTagged <-
function(...)
{
    args <- list(...)
    function(elem, language, id) {
        if (!is.null(elem$content)) {
            con <- textConnection(elem$content)
            on.exit(close(con))
        } else
            con <- elem$uri

        if (!is.null(elem$uri))
            id <- basename(elem$uri)

        a <- c(list(con = con, meta = list(id = id, language = language)), args)

        do.call(TaggedTextDocument, a)
    }
}
class(readTagged) <- c("FunctionGenerator", "function")

Any scripts or data that you put into this service are public.

tm documentation built on Sept. 11, 2024, 6:47 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

tm
Text Mining Package

R/reader.R
In tm: Text Mining Package

Defines functions readTagged readXML readPlain readPDF readDOC readDataframe processURI prepareReader getReaders FunctionGenerator

Documented in FunctionGenerator getReaders readDataframe readDOC readPDF readPlain readTagged readXML

Try the tm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tm Text Mining Package

R/reader.R In tm: Text Mining Package

Defines functions readTagged readXML readPlain readPDF readDOC readDataframe processURI prepareReader getReaders FunctionGenerator

Documented in FunctionGenerator getReaders readDataframe readDOC readPDF readPlain readTagged readXML

Try the tm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tm
Text Mining Package

R/reader.R
In tm: Text Mining Package