R/readFactivaHTML.R
In tm.plugin.factiva: Import Articles from 'Factiva' Using the 'tm' Text Mining Framework

readFactivaHTML <- FunctionGenerator(function(elem, language, id) {
    function(elem, language, id) {
        if(is.na(language)) {
            cl <- xml_attr(elem$content, "class")
            language <- regmatches(cl, regexec("^article ([[:alpha:]]{2})Article$", cl))[[1]][2]
        }

        table <- as.data.frame(html_table(xml_children(elem$content)[[1]]))

        # Remove line breaks as paragraphs are used for this
        # (else, line breaks in the source are propagated to the contents)
        text <- gsub("[\n\r]", "",
                     xml_text(xml_find_all(elem$content, ".//p[starts-with(@class, 'articleParagraph')]")))

        vars <- c("AN", "BY", "CO", "CY", "ED", "HD", "IN", "IPC", "IPD",
                  "LA", "LP", "NS", "PD", "PG", "PUB", "RE", "SE", "SN", "TD", "WC")

        # Remove trailing spaces when matching
        data <- as.character(table[match(vars, gsub("[^[A-Z]", "", table[,1])), 2])
        names(data) <- vars

        date <- strptime(data[["PD"]], "%d %B %Y")
        if(is.na(date) && isTRUE(data[["PD"]] != "")) {
            # Try C locale, just in case
            old.locale <- Sys.getlocale("LC_TIME")
            Sys.setlocale("LC_TIME", "C")
            date <- strptime(data[["PD"]], "%d %B %Y")
            Sys.setlocale("LC_TIME", old.locale)

            # A bug in Mac OS gives NA when start of month name matches an abbreviated name:
            # http://www.freebsd.org/cgi/query-pr.cgi?pr=141939
            # https://stat.ethz.ch/pipermail/r-sig-mac/2012-June/009296.html
            # Add a workaround for French
            if (Sys.info()["sysname"] == "Darwin")
                date <- strptime(sub("[jJ]uillet", "07", data[["PD"]]), "%d %m %Y")

            if(any(is.na(date)))
                warning(sprintf("Could not parse document date \"%s\". You may need to change the system locale to match that of the corpus. See LC_TIME in ?Sys.setlocale.", data[["PD"]]))
        }

        data[["AN"]] <- gsub("Document ", "", data[["AN"]])
        id <- if(!is.na(data[["AN"]])) data[["AN"]] else paste(sample(LETTERS, 10), collapse="")

        wc <- as.integer(regmatches(data[["WC"]], regexpr("^[[:digit:]]+", data[["WC"]])))[[1]]

        subject <- if(!is.na(data[["NS"]])) strsplit(data[["NS"]], "( \\| )")[[1]]
                   else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        subject <- gsub("[^[:print:]]", "", subject)
        subject <- gsub(".* : ", "", subject)

        coverage <- if(!is.na(data[["RE"]])) strsplit(data[["RE"]], "( \\| )")[[1]]
                    else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        coverage <- gsub("[^[:print:]]", "", coverage)
        coverage <- gsub(".* : ", "", coverage)

        company <- if(!is.na(data[["CO"]])) strsplit(data[["CO"]], "( \\| )")[[1]]
                   else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        company <- gsub("[^[:print:]]", "", company)
        company <- gsub(".* : ", "", company)

        industry <- if(!is.na(data[["IN"]])) strsplit(data[["IN"]], "( \\| )")[[1]]
                    else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        industry <- gsub("[^[:print:]]", "", industry)
        industry <- gsub(".* : ", "", industry)

        infocode <- if(!is.na(data[["IPC"]])) strsplit(data[["IPC"]], "( \\| )")[[1]]
                    else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        infocode <- gsub("[^[:print:]]", "", infocode)
        infocode <- gsub(".* : ", "", infocode)

        infodesc <- if(!is.na(data[["IPD"]])) strsplit(data[["IPD"]], "( +\\| +| +-+ +| +--+|--+ +|\\._)")[[1]]
                    else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        infodesc <- gsub("[^[:print:]]", "", infodesc)
        infodesc <- gsub(".* : ", "", infodesc)

        # XMLSource uses character(0) rather than NA, do the same
        doc <- PlainTextDocument(x = text,
                                 author = if(!is.na(data[["BY"]])) data[["BY"]] else character(0),
                                 datetimestamp = date,
                                 heading = if(!is.na(data[["HD"]])) data[["HD"]] else character(0),
                                 id = id,
                                 origin = if(!is.na(data[["SN"]])) data[["SN"]] else character(0),
                                 language = language)
        meta(doc, "edition") <- if(!is.na(data[["ED"]])) data[["ED"]] else character(0)
        meta(doc, "section") <- if(!is.na(data[["SE"]])) data[["SE"]] else character(0)
        meta(doc, "subject") <- subject
        meta(doc, "coverage") <- coverage
        meta(doc, "company") <- company
        meta(doc, "industry") <- industry
        meta(doc, "infocode") <- infocode
        meta(doc, "infodesc") <- infodesc
        meta(doc, "page") <- if(!is.na(data[["PG"]])) data[["PG"]] else character(0)
        meta(doc, "wordcount") <- wc
        meta(doc, "publisher") <- if(!is.na(data[["PUB"]])) data[["PUB"]] else character(0)
        meta(doc, "rights") <- if(!is.na(data[["CY"]])) data[["CY"]] else character(0)
        doc
    }
})

Any scripts or data that you put into this service are public.

tm.plugin.factiva documentation built on Oct. 30, 2019, 11:23 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

tm.plugin.factiva
Import Articles from 'Factiva' Using the 'tm' Text Mining Framework

R/readFactivaHTML.R
In tm.plugin.factiva: Import Articles from 'Factiva' Using the 'tm' Text Mining Framework

Try the tm.plugin.factiva package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tm.plugin.factiva Import Articles from 'Factiva' Using the 'tm' Text Mining Framework

R/readFactivaHTML.R In tm.plugin.factiva: Import Articles from 'Factiva' Using the 'tm' Text Mining Framework

Try the tm.plugin.factiva package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tm.plugin.factiva
Import Articles from 'Factiva' Using the 'tm' Text Mining Framework

R/readFactivaHTML.R
In tm.plugin.factiva: Import Articles from 'Factiva' Using the 'tm' Text Mining Framework