Nothing
#' import_corpus
#'
#' Import a corpus from a file.
#'
#' @param paths Path to one of more files, or to a directory (if `format="txt"`) to import.
#' @param format File format: can be `"csv"`, `"txt"`, `"factiva"`, `"europresse"`,
#' `"lexisnexis"` or `"alceste"`.
#' @param language The language name or code (preferably as IETF language tags,
#' see [`language`][NLP::language]) to be used in particular for stopwords and stemming.
#'
#' @param textcolumn When `format="csv"`, the column containing the text, either
#' as a string or as a position
#' @param encoding The character encoding of the file, or `NULL` to attempt automatic detection.
#'
#' @return A `Corpus` object.
#'
#' @examples
#'
#' file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
#' import_corpus(file, "factiva", language="en")
#'
#' @import NLP
#' @import graphics
#' @import slam
#' @import tm
#' @import stats
#' @import utils
#' @import stringi
#' @import FactoMineR
#' @import explor
#'
#' @export
import_corpus <- function(paths, format, language, textcolumn=1, encoding=NULL) {
format <- match.arg(format, c("txt", "csv", "factiva", "europresse", "lexisnexis", "alceste"))
if(is.null(encoding)) {
if(format == "txt")
files <- list.files(paths, full.names=TRUE)
else
files <- paths
encs <- table(sapply(files,
function(f) stri_enc_detect(readBin(f, "raw", 50000))[[1]]$Encoding[1]))
encoding <- names(encs)[order(encs, decreasing=TRUE)][1]
if(is.null(encoding))
""
else
encoding
}
if(format == "txt") {
corpus <- VCorpus(DirSource(paths, encoding=encoding),
readerControl=list(language=language))
}
else if(format == "csv") {
# Try to guess the separator from the most common character of ; and ,
# This should work in all cases where text is not too long
excerpt <- readLines(con <- file(paths, encoding=encoding), 50)
close(con)
n1 <- sum(sapply(gregexpr(",", excerpt), length))
n2 <- sum(sapply(gregexpr(";", excerpt), length))
if(n1 > n2)
corpusDataset <- read.csv(paths, fileEncoding=encoding, stringsAsFactors=FALSE)
else
corpusDataset <- read.csv2(paths, fileEncoding=encoding, stringsAsFactors=FALSE)
corpus <- VCorpus(DataframeSource(data.frame(doc_id=rownames(corpusDataset),
text=corpusDataset[[textcolumn]])),
readerControl=list(language=language))
for(i in names(corpusDataset)[-textcolumn])
meta(corpus,i) <- corpusDataset[[i]]
}
else if(format == "factiva") {
corpus <- import_files(tm.plugin.factiva::FactivaSource, paths, language, encoding)
}
else if(format == "europresse") {
corpus <- import_files(tm.plugin.europresse::EuropresseSource, paths, language, encoding)
}
else if(format == "lexisnexis") {
corpus <- import_files(tm.plugin.lexisnexis::LexisNexisSource, paths, language, encoding)
}
else if(format == "alceste") {
corpus <- import_files(tm.plugin.alceste::AlcesteSource, paths, language, encoding)
}
# make.names ensures that names are unique and
# that they will not be modified by corpus_ca (to work around a bug in explor)
names(corpus) <- make.names(names(corpus))
meta(corpus, type="corpus", tag="language") <- language
corpus
}
import_files <- function(f, paths, language, encoding) {
if(length(paths) == 1) {
corpus <- VCorpus(f(paths, encoding=encoding), readerControl=list(language=language))
}
else {
corpusList <- lapply(paths, function(path)
VCorpus(f(path, encoding=encoding),
readerControl=list(language=language)))
corpus <- do.call(c, c(corpusList, list(recursive=TRUE)))
}
corpus <- set_corpus_variables(corpus, extract_metadata(corpus))
corpus
}
#' split_documents
#'
#' Split documents in a corpus into documents of one of more paragraphs.
#'
#' @param corpus A `Corpus` object.
#' @param chunksize The number of paragraphs each new document should contain at most.
#' @param preserveMetadata Whether to preserve the meta-data of original documents.
#'
#' @return A `Corpus` object with split documents.
#'
#' @examples
#'
#' file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
#' corpus <- import_corpus(file, "factiva", language="en")
#' split_documents(corpus, 3)
#'
#' @export
split_documents <- function(corpus, chunksize, preserveMetadata=TRUE)
{
chunks <- list(length(corpus))
origins <- list(length(corpus))
seqnum <- list(length(names))
for (k in seq_along(corpus)) {
paragraphs <- unlist(strsplit(content(corpus[[k]]), "\r\n|\n|\r"))
chunks_k <- tapply(paragraphs,
rep(seq_along(paragraphs),
each=chunksize, length.out=length(paragraphs)), c)
# Skip empty chunks
keep <- nchar(gsub("[\n[:space:][:punct:]]+", "", sapply(chunks_k, paste, collapse=""))) > 0
chunks[[k]] <- chunks_k[keep]
origins[[k]] <- rep(k, sum(keep))
seqnum[[k]] <- seq(sum(keep))
}
# Merge only the per-document lists of chunks at the end to reduce the number of copies
chunks <- do.call(c, chunks)
origins <- do.call(c, origins)
seqnum <- do.call(c, seqnum)
ids <- names(corpus)
newCorpus <- VCorpus(VectorSource(chunks))
# Note: does not work for SimpleCorpus
names(newCorpus) <- paste(ids[origins], seqnum, sep=".")
# Copy meta data from old documents
if(preserveMetadata) {
newCorpus$dmeta <- meta(corpus)[origins,, drop=FALSE]
for(i in seq_along(corpus)) {
attrs <- meta(corpus[[i]])
for(j in which(origins == i)) {
doc <- newCorpus[[j]]
doc$meta <- attrs
# Note: does not work for SimpleCorpus
meta(doc, "id") <- paste(ids[i], seqnum[j], sep=".")
meta(doc, "document") <- ids[i]
newCorpus[[j]] <- doc
}
}
}
meta(newCorpus, "doc_id") <- ids[origins]
meta(newCorpus, "doc_n") <- origins
meta(newCorpus, type="corpus", tag="split") <- TRUE
meta(newCorpus, type="corpus", tag="language") <- meta(corpus, type="corpus", tag="language")
newCorpus
}
#' set_corpus_variables
#'
#' Set corpus meta-data variables from a data frame.
#'
#' @param corpus A `Corpus` object.
#' @param dset A `data.frame` containing meta-data variables, with one row per document in `corpus.
#'
#' @return A `Corpus` object with meta-data added.
#'
#' @examples
#'
#' file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
#' corpus <- import_corpus(file, "factiva", language="en")
#' dtm <- build_dtm(corpus)
#' dset <- data.frame(x=1:length(corpus))
#' corpus <- set_corpus_variables(corpus, dset)
#'
#' @export
set_corpus_variables <- function(corpus, dset) {
# If corpus was split, we need to replicate variables
split <- isTRUE(meta(corpus, type="corpus", tag="split"))
len <- if (split) length(unique(meta(corpus, "doc_n")[[1]])) else length(corpus)
# Remove dropped and empty variables
for(var in colnames(meta(corpus))[!colnames(meta(corpus)) %in%
c(colnames(dset), "doc_n", "doc_id", "cluster",
sapply(dset, function(x) all(is.na(x) | x == "")))])
meta(corpus, var) <- NULL
# Add new variables
indices <- which(sapply(dset, function(x) !all(is.na(x) | x == "", na.rm=TRUE)))
# Drop empty levels, which notably happen when changing values manually
if(length(indices) > 0) {
if(split) {
for(i in indices)
meta(corpus, colnames(dset)[i]) <- droplevels(factor(dset[meta(corpus, "doc_n")[[1]], i]))
}
else {
for(i in indices)
meta(corpus, colnames(dset)[i]) <- droplevels(factor(dset[[i]]))
}
}
# Update names only if they changed
oldDocNames <- if(split) unique(meta(corpus, "doc_id")[[1]]) else names(corpus)
corpusNames <- names(corpus)
if(!identical(oldDocNames, row.names(dset))) {
if(split) {
# Note: does not work for SimpleCorpus
names(corpus) <- make.unique(make.names(row.names(dset)[meta(corpus, "doc_n")[[1]]]))
meta(corpus, "Doc ID") <- row.names(dset)[meta(corpus, "doc_n")[[1]]]
}
else {
# Note: does not work for SimpleCorpus
names(corpus) <- make.unique(make.names(row.names(dset)))
}
}
return(corpus)
}
extract_metadata <- function(corpus, date=TRUE) {
if(date) {
dates <- lapply(corpus, meta, "datetimestamp")
dates <- sapply(dates, function(x) if(length(x) > 0) as.character(x) else NA)
vars <- data.frame(origin=rep(NA, length(corpus)),
date=dates,
author=rep(NA, length(corpus)),
section=rep(NA, length(corpus)))
}
else {
vars <- data.frame(origin=rep(NA, length(corpus)),
author=rep(NA, length(corpus)),
section=rep(NA, length(corpus)))
}
specialTags <- c("subject", "coverage", "company", "stocksymbol", "industry", "infocode", "infodesc")
tags <- setdiff(unique(unlist(lapply(corpus, function(x) names(meta(x))))),
c("datetimestamp", "heading", "id", "language", specialTags))
for(tag in tags) {
var <- lapply(corpus, meta, tag)
# paste() is here to prevent an error in case x contains more than one elemen
# This typically happens with Rights
var <- lapply(var, function(x) if(length(x) > 0) paste(x, collapse=" ") else NA)
vars[[tag]] <- unlist(var)
}
# Keep in sync with importCorpusFromTwitter()
colnames(vars)[colnames(vars) == "origin"] <- .gettext("Origin")
colnames(vars)[colnames(vars) == "date"] <- .gettext("Date")
colnames(vars)[colnames(vars) == "author"] <- .gettext("Author")
colnames(vars)[colnames(vars) == "section"] <- .gettext("Section")
colnames(vars)[colnames(vars) == "type"] <- .gettext("Type")
colnames(vars)[colnames(vars) == "edition"] <- .gettext("Edition")
colnames(vars)[colnames(vars) == "wordcount"] <- .gettext("Word.Count")
colnames(vars)[colnames(vars) == "pages"] <- .gettext("Pages")
colnames(vars)[colnames(vars) == "publisher"] <- .gettext("Publisher")
colnames(vars)[colnames(vars) == "rights"] <- .gettext("Rights")
# Drop variables with only NAs, which can appear with sources that do not support them
vars <- vars[sapply(vars, function(x) sum(!is.na(x))) > 0]
# Tags that contain several values and have to be represented using dummies
meta <- sapply(corpus, function(x) meta(x)[specialTags])
# Tags missing from all documents
meta <- meta[!is.na(rownames(meta)),]
# Tags missing from some documents
meta[] <- sapply(meta, function(x) if(is.null(x)) NA else x)
for(tag in rownames(meta)) {
var <- meta[tag,]
levs <- unique(unlist(var))
levs <- levs[!is.na(levs)]
if(length(levs) == 0)
next
# We remove the identifier before ":" and abbreviate the names since they can get out of control
for(lev in levs)
vars[[make.names(substr(gsub("^[[:alnum:]]+ : ", "", lev), 1, 20))]] <- sapply(var, function(x) lev %in% x)
}
rownames(vars) <- names(corpus)
vars
}
#' concordances
#'
#' Print documents which contain one or more terms and return
#' a sub-corpus with these documents.
#'
#' Occurrences of the specified terms are highlighted. If stemming
#' or other transformations have been applied to original words using
#' [`combine_terms`], all original words which have been transformed
#' to the specified terms are highlighted.
#'
#' @param corpus A `Corpus` object.
#' @param dtm A `DocumentTermMatrix` object corresponding to `corpus`.
#' @param terms One of more terms appearing in `dtm`.
#' @param all Whether only documents containing all terms should be printed.
#' By default, documents need to contain at least one of the terms.
#'
#' @return `Corpus` object (invisibly).
#'
#' @examples
#'
#' file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
#' corpus <- import_corpus(file, "factiva", language="en")
#' dtm <- build_dtm(corpus)
#' concordances(corpus, dtm, "oil")
#' concordances(corpus, dtm, c("oil", "opec"))
#' concordances(corpus, dtm, c("oil", "opec"), all=TRUE)
#'
#' # Also works when terms have been combined
#' dict <- dictionary(dtm)
#' dtm2 <- combine_terms(dtm, dict)
#' concordances(corpus, dtm2, "product")
#'
#' @export
concordances <- function(corpus, dtm, terms, all=FALSE) {
if(!isTRUE(all.equal(names(corpus), rownames(dtm))))
stop(.gettext("`dtm` must have one row per document in `corpus`, with the same names and in the same order."))
if(!all(terms %in% colnames(dtm)))
stop(.gettext(sprintf("term \"%s\" is not in `dtm`", setdiff(terms, colnames(dtm))[1])))
if(all)
corpus <- corpus[row_sums(dtm[, terms] == 0) == 0]
else
corpus <- corpus[row_sums(dtm[, terms]) > 0]
dict <- attr(dtm, "dict")
if(is.null(dict))
words <- terms
else
words <- rownames(dict)[dict[[.gettext("Term")]] %in% terms]
# Used mostly to benefit from detection of terminal support
color <- crayon::red(crayon::bold("X"))
for(i in seq_along(corpus)) {
cat(crayon::bold(names(corpus)[i]), "\n")
cat(gsub(sprintf("\\b(%s)\\b",
paste(words, collapse="|")),
sub("X", "\\\\1", color),
as.character(corpus[[i]]),
ignore.case=TRUE))
cat("\n")
}
invisible(corpus)
}
#' characteristic_docs
#'
#' Print documents which are the most characteristic of each level of a variable,
#' i.e. those with the lowest Chi-squared distance to the average vocabulary
#' of documents belonging to that level.
#'
#' Occurrences of the `nterms` most specific terms for each level are highlighted.
#' If stemming or other transformations have been applied to original words
#' using [`combine_terms`], all original words which have been transformed
#' to the specified terms are highlighted.
#'
#' @param corpus A `Corpus` object.
#' @param dtm A `DocumentTermMatrix` object corresponding to `corpus`.
#' @param variable A vector of values giving the groups for which most frequent
#' terms should be reported.
#' @param ndocs The number of (most characteristic) documents to print.
#' @param nterms The number of terms to highlight in documents.
#' @param p The maximum p-value up to which specific terms should be hightlighted.
#'
#' @return A list with one `Corpus` object for each level (invisibly).
#'
#' @examples
#'
#' file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
#' corpus <- import_corpus(file, "factiva", language="en")
#' dtm <- build_dtm(corpus)
#' characteristic_docs(corpus, dtm, meta(corpus)$Date)
#'
#' # Also works when terms have been combined
#' dict <- dictionary(dtm)
#' dtm2 <- combine_terms(dtm, dict)
#' characteristic_docs(corpus, dtm2, meta(corpus)$Date)
#'
#' @export
characteristic_docs <- function(corpus, dtm, variable, ndocs=10, nterms=25, p=0.1) {
if(!inherits(corpus, "Corpus"))
stop(.gettext("`corpus` must be a `Corpus` object"))
if(!isTRUE(all.equal(names(corpus), rownames(dtm))))
stop(.gettext("`dtm` must have one row per document in `corpus`, with the same names and in the same order."))
levs <- levels(factor(variable))
corpora <- vector("list", length(levs))
names(corpora) <- levs
dists <- c()
for(i in seq_along(levs)) {
subdtm <- dtm[variable %in% levs[i],]
# Remove terms that do not appear in the group
counts <- col_sums(subdtm)
keep <- as.matrix(counts > 0)
subdtm <- subdtm[, keep]
dev <- sweep(as.matrix(subdtm)/row_sums(subdtm), 2,
prop.table(counts[keep]), "-")
chisq <- rowSums(sweep(dev^2, 2, col_sums(dtm[,keep])/sum(dtm), "/"))
chisq <- head(sort(chisq), ndocs)
corpora[[i]] <- corpus[match(names(chisq), names(corpus))]
dists <- c(dists, chisq)
}
specTerms <- specific_terms(dtm, variable, n=nterms, p=p)
# Used mostly to benefit from detection of terminal support
color <- crayon::red(crayon::bold("X"))
for(i in seq_along(corpora)) {
terms <- rownames(specTerms[[i]])
dict <- attr(dtm, "dict")
if(is.null(dict))
words <- terms
else
words <- rownames(dict)[dict[[.gettext("Term")]] %in% terms]
cat(crayon::underline(crayon::bold(paste(.gettext("Documents characteristic of:"), levs[i]))), "\n")
subCorpus <- corpora[[i]]
for(j in seq_along(subCorpus)) {
name <- names(subCorpus)[j]
cat(sprintf(.gettext("%s: distance %s"), crayon::bold(name), formatC(dists[name])), "\n")
cat(gsub(sprintf("\\b(%s)\\b",
paste(words, collapse="|")),
sub("X", "\\\\1", color),
as.character(subCorpus[[j]]),
ignore.case=TRUE))
cat("\n\n")
}
}
invisible(corpora)
}
#' subset_corpus
#'
#' Select documents containing (or not containing) one or more terms.
#'
#' @param corpus A `Corpus` object.
#' @param dtm A `DocumentTermMatrix` object corresponding to `corpus`.
#' @param terms One of more terms appearing in `dtm`.
#' @param exclude Whether documents containing the terms should be excluded rather than retained.
#' @param all Whether only documents containing all terms should be retained or excluded.
#' By default, documents need to contain at least one of the terms.
#'
#' @return `Corpus` object.
#'
#' @examples
#'
#' file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
#' corpus <- import_corpus(file, "factiva", language="en")
#' dtm <- build_dtm(corpus)
#' subset_corpus(corpus, dtm, "barrel")
#' subset_corpus(corpus, dtm, c("barrel", "opec"))
#' subset_corpus(corpus, dtm, c("barrel", "opec"), exclude=TRUE)
#' subset_corpus(corpus, dtm, c("barrel", "opec"), all=TRUE)
#'
#' @export
subset_corpus <- function(corpus, dtm, terms, exclude=FALSE, all=FALSE) {
if(exclude) {
if(all) {
corpus[row_sums(dtm[, terms]) == 0]
}
else {
corpus[row_sums(dtm[, terms] > 0) == 0]
}
}
else {
if(all) {
corpus[row_sums(dtm[, terms] == 0) == 0]
}
else {
corpus[row_sums(dtm[, terms]) > 0]
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.