Nothing
#' dictionary
#'
#' Create a dictionary with information on all words in a corpus.
#'
#' @param dtm A `DocumentTermMatrix` object.
#' @param remove_stopwords Whether stopwords should be removed from the dictionary.
#'
#' @return A `data.frame` with row names indicating the terms, and columns giving the stem,
#' the number of occurrences, and whether the term is a stopword.
#'
#' @examples
#'
#' file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
#' corpus <- import_corpus(file, "factiva", language="en")
#' dtm <- build_dtm(corpus)
#' dictionary(dtm)
#'
#' @export
dictionary <- function(dtm, remove_stopwords=FALSE) {
language <- attr(dtm, "language")
stopwords <- colnames(dtm) %in% stopwords(language)
if(remove_stopwords) {
dtm <- dtm[, !stopwords]
stopwords <- FALSE
}
dict <- data.frame(row.names=colnames(dtm),
SnowballC::wordStem(colnames(dtm), language),
col_sums(dtm),
ifelse(stopwords, .gettext("Stopword"), ""),
stringsAsFactors=FALSE)
names(dict) <- c(.gettext("Term"), .gettext("Occurrences"), .gettext("Stopword"))
dict
}
#' combine_terms
#'
#' Aggregate terms in a document-term matrix to according to groupings specified
#' by a dictionary.
#'
#' If several terms use the same transformation, they will be aggregated together.
#' Terms missing from `dict` will be dropped.
#'
#' @param dtm A `DocumentTermMatrix` object.
#' @param dict A `data.frame` with one row per term in `dtm` that should be retained.
#' The row names must match names of rows in `dtm`, and the first column must give the
#' term into which it should be transformed.
#' @return An aggregated `DocumentTermMatrix` object.
#'
#' @examples
#'
#' file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
#' corpus <- import_corpus(file, "factiva", language="en")
#' dtm <- build_dtm(corpus)
#' dict <- dictionary(dtm)
#' combine_terms(dtm, dict)
#'
#' @export
combine_terms <- function(dtm, dict) {
if(!all(rownames(dict) %in% colnames(dtm)))
stop(sprintf(.gettext("some rows of `dict` do not correspond to terms in `dtm`: %s"),
paste(setdiff(rownames(dict), colnames(dtm)), collapse=", ")))
else if(!.gettext("Term") %in% colnames(dict))
stop(.gettext("`dict` must contain a \"Term\" column"))
adtm <- rollup(dtm[, rownames(dict)], 2, dict[[.gettext("Term")]])
attr(adtm, "weighting") <- attr(dtm, "weighting")
attr(adtm, "language") <- attr(dtm, "language")
attr(adtm, "dict") <- dict
adtm
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.