# R/readFactivaXML.R

# Reader for Factiva XML articles, built by passing a field specification to
# readXML() together with an empty PlainTextDocument() as the document template.
#
# Each entry of `spec` names a field and gives its extraction rule:
#   * list("node", xpath)     - an XPath expression evaluated against the article
#                               node (handling is done by readXML(); presumably
#                               the text of the matching nodes - confirm against
#                               the tm version in use).
#   * list("function", fn)    - `fn` is called with the article node and its
#                               return value is used for the field.
#
# The extractor functions rely on xml_find_all() / xml_text() being in scope
# (presumably from xml2, imported by the package - verify in NAMESPACE).
readFactivaXML <- readXML(spec = list(
    # Byline: newlines/whitespace runs collapsed to a single space, trimmed, upper-cased.
    author = list("function", function(node)
                  toupper(gsub("^\\s+|\\s+$", "",
                               gsub("\n|\\s+", " ",
                                    xml_text(xml_find_all(node, ".//byline")))))),
    # Document text: headline, then lead paragraph, then each tail paragraph,
    # as one character vector.
    content = list("function", function(node)
                   c(xml_text(xml_find_all(node, ".//headline")),
                     xml_text(xml_find_all(node, ".//leadParagraph")),
                     xml_text(xml_find_all(node, ".//tailParagraphs/paragraph")))),
    # Publication date parsed as YYYY-MM-DD.
    # NOTE(review): no `tz` is passed, so the result uses the session time zone;
    # confirm whether a fixed zone (e.g. "GMT") is wanted.
    datetimestamp = list("function", function(node)
                         strptime(xml_text(xml_find_all(node, ".//publicationDate/date")),
                                  format="%Y-%m-%d")),
    heading = list("node", ".//headline"),
    id = list("function", function(node) {
              str <- gsub("^distdoc:archive/ArchiveDoc::Article/", "",
                          xml_text(xml_find_all(node, ".//reference")))
              # Several <reference> nodes may match: keep only usable values and
              # return a single one, so the condition below is always scalar
              # (`&&` on a longer vector is an error in R >= 4.3) and the
              # identifier is always a length-one string.
              str <- str[!is.na(str) & nzchar(str)]
              # If extraction failed for some reason, make sure we return a unique identifier
              if(length(str) > 0)
                  str[[1]]
              else
                  paste(sample(LETTERS, 10), collapse="")
    }),
    origin = list("node", ".//sourceName"),
    # Base language, lower-cased.
    language = list("function", function(node)
                    tolower(xml_text(xml_find_all(node, ".//baseLanguage")))),
    edition = list("node", ".//edition"),
    section = list("node", ".//sectionName"),
    subject = list("node", ".//newsSubject/name"),
    coverage = list("node", ".//region/name"),
    company = list("node", ".//company/name"),
    industry = list("node", ".//industry/name"),
    # All descField nodes whose `code` attribute is not 'ipd'.
    infocode = list("node", ".//descField[@code!='ipd']"),
    # The 'ipd' descField, split into its parts on the separators matched by the
    # regex below (pipes or dash runs surrounded by spaces, or "._").
    # Only the first matching node is split; character(0) when there is none.
    infodesc = list("function", function(node) {
                    str <- xml_text(xml_find_all(node, ".//descField[@code='ipd']"))
                    if(length(str) > 0)
                        strsplit(str, "( +\\| +| +-+ +| +--+|--+ +|\\._)")[[1]]
                    else
                        character(0)
    }),
    # Page text as found, or NA when the article has no page node.
    page = list("function", function(node) {
                str <- xml_text(xml_find_all(node, ".//page"))
                if(length(str) > 0)
                    str
                else
                    NA
    }),
    # Word count converted to numeric (numeric(0) when the node is absent).
    wordcount = list("function", function(node)
                     as.numeric(xml_text(xml_find_all(node, ".//wordCount")))),
    publisher = list("node", ".//publisherName"),
    # Copyright notice: newlines/whitespace runs collapsed to a single space, trimmed.
    rights = list("function", function(node)
                  gsub("^\\s+|\\s+$", "",
                       gsub("\n|\\s+", " ",
                            xml_text(xml_find_all(node, ".//copyright")))))),
    doc = PlainTextDocument())

# Try the tm.plugin.factiva package in your browser
#
# Any scripts or data that you put into this service are public.
#
# tm.plugin.factiva documentation built on Oct. 30, 2019, 11:23 a.m.