R/readEuropresseHTML.R

readEuropresseHTML2 <- FunctionGenerator(function(elem, language, id) {
    function(elem, language, id) {
        # Remove annoying line breaks, as <p> tags already separate paragraphs
        elem$content <- gsub("\r|&#13;|\n", "", elem$content)
        # In HTML any run of spaces is rendered as a single space
        # (runs of spaces are used by Internet Explorer for indentation)
        elem$content <- gsub("  +", " ", elem$content)
        # These tags are used by some sources and create line breaks in the middle
        # of sentences, e.g. with quotations in italics
        elem$content <- gsub("</?(font|b|i).*?>", "", elem$content)
        # This markup is used to highlight search terms in red and creates the same problem
        # Warning: </span> can also close a <span class="DocPublicationName"> tag,
        # within which a <span style=...> tag is sometimes nested:
        # this is why we need such a convoluted regexp
        elem$content <- gsub("<span (style=[^>]*|class=\"occurr?ence\")>(.*?)</span>", "\\2", elem$content)

        encoding <- if(Encoding(elem$content) == "unknown") character(0)
                    else Encoding(elem$content)
        tree <- htmlParse(elem$content, asText=TRUE, encoding=encoding)

        origin <- gsub("^ | $", "",
                       xmlValue(getNodeSet(tree, "//span[@class = 'DocPublicationName']")[[1]]))
        # Issue number is sometimes included in the publication name
        origin <- gsub(", no\\..*$", "", origin)

        dat <- strsplit(sapply(getNodeSet(tree, "//span[@class = 'DocHeader']"), xmlValue), ", ")[[1]]

        # We do not know the order of the fields, so try to find the date first
        # and fall back to the third field in case of failure
        # Remove the week day name at the start
        strdate <- gsub("^[[:alpha:]]+ ", "", dat)
        date <- strptime(strdate, "%d %B %Y")
        datepos <- which(!is.na(date))[1]

        if(is.na(datepos)) {
            # Try C locale: the English case requires a special date format
            old.locale <- Sys.getlocale("LC_TIME")
            Sys.setlocale("LC_TIME", "C")
            date <- strptime(strdate, "%B %d, %Y")
            datepos <- which(!is.na(date))[1]
            Sys.setlocale("LC_TIME", old.locale)

            # A bug in Mac OS returns NA when the start of a month name matches an abbreviated name:
            # http://www.freebsd.org/cgi/query-pr.cgi?pr=141939
            # https://stat.ethz.ch/pipermail/r-sig-mac/2012-June/009296.html
            # Add a workaround for French
            if(is.na(datepos) && Sys.info()["sysname"] == "Darwin") {
                date <- strptime(sub("[jJ]uillet", "07", strdate), "%d %m %Y")
                datepos <- which(!is.na(date))[1]
            }

            if(is.na(datepos))
                warning(sprintf("Could not parse document date from \"%s\". You may need to change the system locale to match that of the corpus. See LC_TIME in ?Sys.setlocale.", paste(dat, collapse='", "')))
        }

        if(is.na(datepos)) datepos <- 3
        date <- date[datepos]

        if(datepos >= 2)
            section <- dat[1]
        else
            section <- character(0)

        # Pages are not always present, and sometimes a time stamp replaces this field
        pages <- if(is.na(dat[datepos + 1]) || grepl("UTC", dat[datepos + 1])) character(0) else gsub("p. ", "", dat[datepos + 1])

        heading <- xmlValue(getNodeSet(tree, "//p[@class = 'titreArticleVisu']")[[1]])

        author <- sapply(getNodeSet(tree, "//p[@class = 'titreArticleVisu']/following-sibling::div[1]"), xmlValue)
        if(length(author) == 0) author <- character(0)

        footer <- sapply(getNodeSet(tree, "/html/body/article/footer/*"), xmlValue)
        copyright <- gsub("^ | $", "", footer[[2]])
        id <- gsub("[^[:alnum:]]|Num\ue9ro de document : news", "", footer[[3]])

        # If extraction failed for some reason, make sure we return a unique identifier
        if(is.na(id))
            id <- paste(if(!is.na(date)) strftime(date, format="%Y%m%d") else "",
                        paste(sample(LETTERS, 10), collapse=""), sep="")

        content <- gsub("^ | $", "",
                        sapply(getNodeSet(tree, "//div[@class = 'docOcurrContainer']/p"), xmlValue))

        free(tree)

        # XMLSource uses character(0) rather than NA, do the same
        doc <- PlainTextDocument(x = content,
                                 author = author,
                                 datetimestamp = date,
                                 heading = heading,
                                 id = id,
                                 origin = origin,
                                 language = language)
        meta(doc, "section") <- section
        meta(doc, "pages") <- pages
        meta(doc, "rights") <- copyright
        doc
    }
})
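
# Standalone sketch (not part of the package code above) of the locale-fallback
# date parsing used by both readers: try "%d %B %Y" in the current locale, then
# retry with the English "%B %d, %Y" format in the C locale. The helper name
# and the example fields below are made up for illustration.
parseDocDateSketch <- function(fields) {
    old.locale <- Sys.getlocale("LC_TIME")
    on.exit(Sys.setlocale("LC_TIME", old.locale))
    # Strip a leading week day name, e.g. "mardi 4 mars 2014" -> "4 mars 2014"
    date <- strptime(gsub("^[[:alpha:]]+ ", "", fields), "%d %B %Y")
    if(all(is.na(date))) {
        # English dates need the C locale and a different field order
        Sys.setlocale("LC_TIME", "C")
        date <- strptime(fields, "%B %d, %Y")
    }
    # Keep the first field that parsed as a date, like the readers in this file
    date[which(!is.na(date))[1]]
}
# parseDocDateSketch(c("National", "March 4, 2014"))  # -> "2014-03-04"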

readEuropresseHTML1 <- FunctionGenerator(function(elem, language, id) {
    function(elem, language, id) {
        # Remove annoying line breaks, as <p> tags already separate paragraphs
        elem$content <- gsub("\r|&#13;|\n", "", elem$content)
        # In HTML any run of spaces is rendered as a single space
        # (runs of spaces are used by Internet Explorer for indentation)
        elem$content <- gsub("  +", " ", elem$content)
        # These tags are used by some sources and create line breaks in the middle
        # of sentences, e.g. with quotations in italics
        elem$content <- gsub("</?(font|b|i).*?>", "", elem$content)
        # This markup is used to highlight search terms in red and creates the same problem
        # Warning: </span> can also close a <span class="DocPublicationName"> tag,
        # within which a <span style=...> tag is sometimes nested:
        # this is why we need such a convoluted regexp
        elem$content <- gsub("<span (style=[^>]*|class=\"occurr?ence\")>(.*?)</span>", "\\2", elem$content)

        encoding <- if(Encoding(elem$content) == "unknown") character(0)
                    else Encoding(elem$content)
        tree <- htmlParse(elem$content, asText=TRUE, encoding=encoding)

        origin <- xmlValue(getNodeSet(tree, "//span[@class = 'DocPublicationName']")[[1]])
        # Issue number is sometimes included in the publication name
        origin <- gsub(", no\\..*$", "", origin)

        dat <- sapply(getNodeSet(tree, "//span[@class = 'DocHeader']"), xmlValue)
        dat <- gsub("^, p. |, $|", "", dat)

        # We do not know the order of the fields, so try to find the date first
        # and fall back to the third field in case of failure
        date <- strptime(dat, "%d %B %Y")
        datepos <- which(!is.na(date))[1]

        if(is.na(datepos)) {
            # Try C locale: the English case requires a special date format
            old.locale <- Sys.getlocale("LC_TIME")
            Sys.setlocale("LC_TIME", "C")
            date <- strptime(dat, "%B %d, %Y")
            datepos <- which(!is.na(date))[1]
            Sys.setlocale("LC_TIME", old.locale)

            # A bug in Mac OS returns NA when the start of a month name matches an abbreviated name:
            # http://www.freebsd.org/cgi/query-pr.cgi?pr=141939
            # https://stat.ethz.ch/pipermail/r-sig-mac/2012-June/009296.html
            # Add a workaround for French
            if(is.na(datepos) && Sys.info()["sysname"] == "Darwin") {
                date <- strptime(sub("[jJ]uillet", "07", dat), "%d %m %Y")
                datepos <- which(!is.na(date))[1]
            }

            if(is.na(datepos))
                warning(sprintf("Could not parse document date from \"%s\". You may need to change the system locale to match that of the corpus. See LC_TIME in ?Sys.setlocale.", paste(dat, collapse='", "')))
        }

        if(is.na(datepos)) datepos <- 3
        date <- date[datepos]

        if(datepos > 2)
            section <- dat[1]
        else
            section <- character(0)

        # Not needed
        # weekday <- dat[datepos - 1]
        # Pages are not always present, and sometimes a time stamp replaces this field
        pages <- if(is.na(dat[datepos + 1]) || grepl("UTC", dat[datepos + 1])) character(0) else dat[datepos + 1]

        heading <- xmlValue(getNodeSet(tree, "//span[@class = 'TitreArticleVisu']")[[1]])

        author <- xmlValue(getNodeSet(tree, "//span[@class = 'TitreArticleVisu']/following::text()[1]")[[1]])
        # Sometimes the author is missing or moved to the end of the article:
        # assume that more than ten words cannot be an author name
        if(length(gregexpr(" ", author)[[1]]) > 10)
            author <- character(0)

        # If author is present, skip the corresponding line
        if(length(author) > 0)
            content <- sapply(getNodeSet(tree, "//span[@class = 'TitreArticleVisu']/following-sibling::text()[position() > 1]"),
                              xmlValue)
        else
            content <- sapply(getNodeSet(tree, "//span[@class = 'TitreArticleVisu']/following-sibling::text()"),
                              xmlValue)

        copyright.lines <- which(grepl("^([Cc]opyright \uA9|[Cc]opyright|\uA9) [[:digit:]]{4}", content))
        # To prevent false detections, only keep matches in the last five paragraphs...
        copyright.lines <- copyright.lines[copyright.lines >= length(content) - 4]
        copyright <- content[copyright.lines]
        # ... and short enough to be a good candidate
        copyright <- copyright[nchar(copyright) < 200]

        if(length(copyright) > 0)
            content <- content[-copyright.lines]

        content <- head(content[nchar(gsub(" ", "", content, fixed=TRUE)) > 0], -1)

        id <- gsub("[^[:alnum:]]|news", "",
                   xmlValue(getNodeSet(tree, "//tr/td[@align = 'center']/text()")[[1]]))
        # If extraction failed for some reason, make sure we return a unique identifier
        if(is.na(id))
            id <- paste(if(!is.na(date)) strftime(date, format="%Y%m%d") else "",
                        paste(sample(LETTERS, 10), collapse=""), sep="")

        free(tree)

        # XMLSource uses character(0) rather than NA, do the same
        doc <- PlainTextDocument(x = content,
                                 author = author,
                                 datetimestamp = date,
                                 heading = heading,
                                 id = id,
                                 origin = origin,
                                 language = language)
        meta(doc, "section") <- section
        meta(doc, "pages") <- pages
        meta(doc, "rights") <- copyright
        doc
    }
})
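
# Usage sketch: readers created with FunctionGenerator are meant to be used
# through a tm source rather than called directly. EuropresseSource() and its
# arguments are assumed here by analogy with similar plugins such as
# tm.plugin.factiva and may differ; "export.html" is a hypothetical file saved
# from Europresse.
#
# library(tm)
# library(tm.plugin.europresse)
# source <- EuropresseSource("export.html")
# corpus <- Corpus(source, readerControl = list(language = NA))
# meta(corpus[[1]])  # origin, section, pages and rights set by the readers above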
