R/readFactivaHTML.R

readFactivaHTML <- FunctionGenerator(function(elem, language, id) {
    function(elem, language, id) {
        if(is.na(language)) {
            cl <- xml_attr(elem$content, "class")
            language <- regmatches(cl, regexec("^article ([[:alpha:]]{2})Article$", cl))[[1]][2]
        }

        table <- as.data.frame(html_table(xml_children(elem$content)[[1]]))

        # Remove line breaks as paragraphs are used for this
        # (else, line breaks in the source are propagated to the contents)
        text <- gsub("[\n\r]", "",
                     xml_text(xml_find_all(elem$content, ".//p[starts-with(@class, 'articleParagraph')]")))

        vars <- c("AN", "BY", "CO", "CY", "ED", "HD", "IN", "IPC", "IPD",
                  "LA", "LP", "NS", "PD", "PG", "PUB", "RE", "SE", "SN", "TD", "WC")

        # Remove trailing spaces when matching
        data <- as.character(table[match(vars, gsub("[^[A-Z]", "", table[,1])), 2])
        names(data) <- vars

        date <- strptime(data[["PD"]], "%d %B %Y")
        if(is.na(date) && isTRUE(data[["PD"]] != "")) {
            # Try C locale, just in case
            old.locale <- Sys.getlocale("LC_TIME")
            Sys.setlocale("LC_TIME", "C")
            date <- strptime(data[["PD"]], "%d %B %Y")
            Sys.setlocale("LC_TIME", old.locale)

            # A bug in Mac OS gives NA when start of month name matches an abbreviated name:
            # http://www.freebsd.org/cgi/query-pr.cgi?pr=141939
            # https://stat.ethz.ch/pipermail/r-sig-mac/2012-June/009296.html
            # Add a workaround for French
            if (Sys.info()["sysname"] == "Darwin")
                date <- strptime(sub("[jJ]uillet", "07", data[["PD"]]), "%d %m %Y")

            if(any(is.na(date)))
                warning(sprintf("Could not parse document date \"%s\". You may need to change the system locale to match that of the corpus. See LC_TIME in ?Sys.setlocale.", data[["PD"]]))
        }

        data[["AN"]] <- gsub("Document ", "", data[["AN"]])
        id <- if(!is.na(data[["AN"]])) data[["AN"]] else paste(sample(LETTERS, 10), collapse="")

        wc <- as.integer(regmatches(data[["WC"]], regexpr("^[[:digit:]]+", data[["WC"]])))[[1]]

        subject <- if(!is.na(data[["NS"]])) strsplit(data[["NS"]], "( \\| )")[[1]]
                   else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        subject <- gsub("[^[:print:]]", "", subject)
        subject <- gsub(".* : ", "", subject)

        coverage <- if(!is.na(data[["RE"]])) strsplit(data[["RE"]], "( \\| )")[[1]]
                    else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        coverage <- gsub("[^[:print:]]", "", coverage)
        coverage <- gsub(".* : ", "", coverage)

        company <- if(!is.na(data[["CO"]])) strsplit(data[["CO"]], "( \\| )")[[1]]
                   else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        company <- gsub("[^[:print:]]", "", company)
        company <- gsub(".* : ", "", company)

        industry <- if(!is.na(data[["IN"]])) strsplit(data[["IN"]], "( \\| )")[[1]]
                    else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        industry <- gsub("[^[:print:]]", "", industry)
        industry <- gsub(".* : ", "", industry)

        infocode <- if(!is.na(data[["IPC"]])) strsplit(data[["IPC"]], "( \\| )")[[1]]
                    else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        infocode <- gsub("[^[:print:]]", "", infocode)
        infocode <- gsub(".* : ", "", infocode)

        infodesc <- if(!is.na(data[["IPD"]])) strsplit(data[["IPD"]], "( +\\| +| +-+ +| +--+|--+ +|\\._)")[[1]]
                    else character(0)
        # Remove leading code and invisible characters, esp. \n, before matching the pattern
        infodesc <- gsub("[^[:print:]]", "", infodesc)
        infodesc <- gsub(".* : ", "", infodesc)

        # XMLSource uses character(0) rather than NA, do the same
        doc <- PlainTextDocument(x = text,
                                 author = if(!is.na(data[["BY"]])) data[["BY"]] else character(0),
                                 datetimestamp = date,
                                 heading = if(!is.na(data[["HD"]])) data[["HD"]] else character(0),
                                 id = id,
                                 origin = if(!is.na(data[["SN"]])) data[["SN"]] else character(0),
                                 language = language)
        meta(doc, "edition") <- if(!is.na(data[["ED"]])) data[["ED"]] else character(0)
        meta(doc, "section") <- if(!is.na(data[["SE"]])) data[["SE"]] else character(0)
        meta(doc, "subject") <- subject
        meta(doc, "coverage") <- coverage
        meta(doc, "company") <- company
        meta(doc, "industry") <- industry
        meta(doc, "infocode") <- infocode
        meta(doc, "infodesc") <- infodesc
        meta(doc, "page") <- if(!is.na(data[["PG"]])) data[["PG"]] else character(0)
        meta(doc, "wordcount") <- wc
        meta(doc, "publisher") <- if(!is.na(data[["PUB"]])) data[["PUB"]] else character(0)
        meta(doc, "rights") <- if(!is.na(data[["CY"]])) data[["CY"]] else character(0)
        doc
    }
})

Try the tm.plugin.factiva package in your browser

Any scripts or data that you put into this service are public.

tm.plugin.factiva documentation built on Oct. 30, 2019, 11:23 a.m.