#' Stem the terms in an object
#'
#' Apply a stemmer to words. This is a wrapper to [wordStem][SnowballC::wordStem]
#' designed to allow this function to be called without loading the entire
#' \pkg{SnowballC} package. [wordStem][SnowballC::wordStem] uses Martin Porter's
#' stemming algorithm and the C libstemmer library generated by Snowball.
#' @param x a character, tokens, or dfm object whose word stems are to be
#' removed. If tokenized texts, the tokenization must be word-based.
#' @param language the name of a recognized language, as returned by
#' [getStemLanguages][SnowballC::getStemLanguages], or a two- or three-letter ISO-639 code
#' corresponding to one of these languages (see references for the list of
#' codes)
#' @seealso [wordStem][SnowballC::wordStem]
#'
#' @references <http://snowball.tartarus.org/>
#'
#' <http://www.iso.org/iso/home/standards/language_codes.htm> for the
#' ISO-639 language codes
#' @export
#' @return `tokens_wordstem` returns a [tokens] object whose word
#' types have been stemmed.
#' @examples
#' # example applied to tokens
#' txt <- c(one = "eating eater eaters eats ate",
#' two = "taxing taxes taxed my tax return")
#' th <- tokens(txt)
#' tokens_wordstem(th)
#'
tokens_wordstem <- function(x, language = quanteda_options("language_stemmer")) {
UseMethod("tokens_wordstem")
}
#' @export
tokens_wordstem.default <- function(x, language = quanteda_options("language_stemmer")) {
stop(friendly_class_undefined_message(class(x), "tokens_wordstem"))
}
#' @importFrom stringi stri_split_fixed stri_paste_list
#' @export
tokens_wordstem.tokens <- function(x, language = quanteda_options("language_stemmer")) {
x <- as.tokens(x)
attrs <- attributes(x)
if (identical(field_object(attrs, "ngrams"), 1L)) {
types(x) <- char_wordstem(types(x), language = language)
} else {
types(x) <- wordstem_ngrams(
types(x),
concatenator = field_object(attrs, "concatenator"),
language = language
)
}
tokens_recompile(x)
}
#' @rdname tokens_wordstem
#' @export
#' @return `char_wordstem` returns a [character] object whose word
#' types have been stemmed.
#' @examples
#' # simple example
#' char_wordstem(c("win", "winning", "wins", "won", "winner"))
#'
char_wordstem <- function(x, language = quanteda_options("language_stemmer")) {
UseMethod("char_wordstem")
}
#' @export
char_wordstem.default <- function(x, language = quanteda_options("language_stemmer")) {
stop(friendly_class_undefined_message(class(x), "char_wordstem"))
}
#' @importFrom stringi stri_detect_regex
#' @export
char_wordstem.character <- function(x, language = quanteda_options("language_stemmer")) {
if (any(stri_detect_regex(x, "^\\P{Z}+\\p{Z}+") & !is.na(x)))
stop("whitespace detected: you can only stem tokenized texts")
result <- SnowballC::wordStem(x, language)
result[which(is.na(x))] <- NA
result
}
#' @rdname tokens_wordstem
#' @return `dfm_wordstem` returns a [dfm] object whose word
#' types (features) have been stemmed, and recombined to consolidate features made
#' equivalent because of stemming.
#' @examples
#' # example applied to a dfm
#' (origdfm <- dfm(txt))
#' dfm_wordstem(origdfm)
#'
#' @export
dfm_wordstem <- function(x, language = quanteda_options("language_stemmer")) {
UseMethod("dfm_wordstem")
}
#' @export
dfm_wordstem.default <- function(x, language = quanteda_options("language_stemmer")) {
stop(friendly_class_undefined_message(class(x), "dfm_wordstem"))
}
#' @noRd
#' @export
dfm_wordstem.dfm <- function(x, language = quanteda_options("language_stemmer")) {
x <- as.dfm(x)
attrs <- attributes(x)
if (identical(field_object(attrs, "ngrams"), 1L)) {
set_dfm_featnames(x) <- char_wordstem(featnames(x), language = language)
} else {
set_dfm_featnames(x) <- wordstem_ngrams(
featnames(x),
field_object(attrs, "concatenator"),
language
)
}
dfm_compress(x, margin = "features")
}
# internal functions -----------
# stemming for ngrams, internal function
wordstem_ngrams <- function(x, concatenator, language) {
temp <- lapply(stri_split_fixed(x, concatenator),
SnowballC::wordStem, language = language)
temp <- stri_paste_list(temp, sep = concatenator)
unlist(temp, use.names = FALSE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.