# R/wordstem.R
# In quanteda: Quantitative Analysis of Textual Data
#
# Documented in char_wordstem dfm_wordstem tokens_wordstem

#' Stem the terms in an object
#'
#' Apply a stemmer to words.  This is a wrapper to [wordStem][SnowballC::wordStem]
#' designed to allow this function to be called without loading the entire
#' \pkg{SnowballC} package.  [wordStem][SnowballC::wordStem]  uses Martin Porter's
#' stemming algorithm and the C libstemmer library generated by Snowball.
#' @param x a character, tokens, or dfm object whose words are to be
#'   stemmed.  If tokenized texts, the tokenization must be word-based.
#' @param language the name of a recognized language, as returned by
#'   [getStemLanguages][SnowballC::getStemLanguages], or a two- or three-letter ISO-639 code
#'   corresponding to one of these languages (see references for the list of
#'   codes)
#' @inheritParams messages
#' @seealso [wordStem][SnowballC::wordStem]
#'
#' @references <https://snowballstem.org/>
#'
#'   <https://www.iso.org/iso-639-language-code> for the
#'   ISO-639 language codes
#' @export
#' @return [tokens_wordstem()] returns a [tokens] object whose word
#'   types have been stemmed.
#' @examples
#' # example applied to tokens
#' txt <- c(one = "eating eater eaters eats ate",
#'          two = "taxing taxes taxed my tax return")
#' th <- tokens(txt)
#' tokens_wordstem(th)
#'
tokens_wordstem <- function(x,
                            language = quanteda_options("language_stemmer"),
                            verbose = quanteda_options("verbose")) {
    # S3 generic: pick the method from the class of `x`
    UseMethod("tokens_wordstem", x)
}

#' @export
tokens_wordstem.default <- function(x,
                                    language = quanteda_options("language_stemmer"),
                                    verbose = quanteda_options("verbose")) {
    # Fallback method, reached when `x` has no class-specific method.
    # The class vector is handed to check_class() together with the name of
    # the generic (presumably to raise an informative error -- confirm against
    # check_class()).
    cls <- class(x)
    check_class(cls, "tokens_wordstem")
}

#' @importFrom stringi stri_split_fixed stri_paste_list
#' @export
tokens_wordstem.tokens_xptr <- function(x,
                                        language = quanteda_options("language_stemmer"),
                                        verbose = quanteda_options("verbose")) {

    verbose <- check_logical(verbose)
    # keep the original attributes so the object can be rebuilt afterwards
    attrs <- attributes(x)
    if (verbose) {
        before <- stats_tokens(x)
    }

    # unigram types are stemmed directly; anything else is treated as ngrams
    # and stemmed word by word around the concatenator
    is_unigram <- identical(field_object(attrs, "ngram"), 1L)
    if (is_unigram) {
        stemmed <- char_wordstem(get_types(x), language = language,
                                 check_whitespace = FALSE)
    } else {
        stemmed <- wordstem_ngrams(get_types(x),
                                   concatenator = field_object(attrs, "concatenator"),
                                   language = language)
    }
    set_types(x) <- stemmed

    result <- rebuild_tokens(x, attrs)
    if (verbose) {
        message_tokens("tokens_wordstem()", before, stats_tokens(result))
    }
    result
}

#' @export
tokens_wordstem.tokens <- function(x, ...) {
    # convert to tokens_xptr, stem via the generic, then convert back
    xptr <- as.tokens_xptr(x)
    stemmed <- tokens_wordstem(xptr, ...)
    as.tokens(stemmed)
}

#' @rdname tokens_wordstem
#' @param check_whitespace logical; if `TRUE`, stop with an error when trying
#'   to stem inputs containing whitespace
#' @export
#' @return [char_wordstem()] returns a [character] object whose word
#'   types have been stemmed.
#' @examples
#' # simple example
#' char_wordstem(c("win", "winning", "wins", "won", "winner"))
#'
char_wordstem <- function(x,
                          language = quanteda_options("language_stemmer"),
                          check_whitespace = TRUE) {
    # S3 generic: pick the method from the class of `x`
    UseMethod("char_wordstem", x)
}

#' @export
char_wordstem.default <- function(x,
                                  language = quanteda_options("language_stemmer"),
                                  check_whitespace = TRUE) {
    # Fallback method, reached when `x` has no class-specific method.
    # The class vector is handed to check_class() together with the name of
    # the generic (presumably to raise an informative error -- confirm against
    # check_class()).
    cls <- class(x)
    check_class(cls, "char_wordstem")
}

#' @importFrom stringi stri_detect_regex
#' @export
char_wordstem.character <- function(x,
                                    language = quanteda_options("language_stemmer"),
                                    check_whitespace = TRUE) {
    # TRUE when a non-missing element starts with a run of non-separator
    # characters followed by at least one separator character
    contains_whitespace <- function() {
        any(!is.na(x) & stri_detect_regex(x, "^\\P{Z}+\\p{Z}+"))
    }
    # the pattern is only evaluated when the check was requested
    if (check_whitespace && contains_whitespace()) {
        stop("whitespace detected: you can only stem tokenized texts")
    }

    stemmed <- SnowballC::wordStem(x, language)
    # positions that were missing in the input are missing in the output
    na_pos <- which(is.na(x))
    stemmed[na_pos] <- NA
    stemmed
}


#' @rdname tokens_wordstem
#' @return [dfm_wordstem()] returns a [dfm] object whose word
#'   types (features) have been stemmed, and recombined to consolidate features made
#'   equivalent because of stemming.
#' @examples
#' # example applied to a dfm
#' (origdfm <- dfm(tokens(txt)))
#' dfm_wordstem(origdfm)
#'
#' @export
dfm_wordstem <- function(x,
                         language = quanteda_options("language_stemmer"),
                         verbose = quanteda_options("verbose")) {
    # S3 generic: pick the method from the class of `x`
    UseMethod("dfm_wordstem", x)
}

#' @export
dfm_wordstem.default <- function(x,
                                 language = quanteda_options("language_stemmer"),
                                 verbose = quanteda_options("verbose")) {
    # Fallback method, reached when `x` has no class-specific method.
    # The class vector is handed to check_class() together with the name of
    # the generic (presumably to raise an informative error -- confirm against
    # check_class()).
    cls <- class(x)
    check_class(cls, "dfm_wordstem")
}

#' @noRd
#' @export
dfm_wordstem.dfm <- function(x, language = quanteda_options("language_stemmer"),
                             verbose = quanteda_options("verbose")) {
    # Stem the feature names of a dfm and merge the columns that become
    # identical as a result.
    #
    # x        a dfm
    # language stemmer language, passed on to the stemming helpers
    # verbose  if TRUE, report the before/after statistics via message_dfm()
    #
    # Returns the dfm after dfm_compress() over the "features" margin.
    x <- as.dfm(x)
    # validate `verbose` the same way tokens_wordstem() does
    verbose <- check_logical(verbose)
    attrs <- attributes(x)

    # capture the "before" statistics while the features are still untouched
    if (verbose) {
        before <- stats_dfm(x)
    }

    # unigram features are stemmed directly; anything else is treated as
    # ngrams and stemmed word by word around the concatenator
    if (identical(field_object(attrs, "ngram"), 1L)) {
        set_dfm_featnames(x) <- char_wordstem(featnames(x), language = language,
                                              check_whitespace = FALSE)
    } else {
        set_dfm_featnames(x) <- wordstem_ngrams(
            featnames(x),
            concatenator = field_object(attrs, "concatenator"),
            language = language
        )
    }

    # stemming can leave duplicated feature names; combine those columns
    x <- dfm_compress(x, margin = "features")
    if (verbose) {
        message_dfm("dfm_wordstem()", before, stats_dfm(x))
    }
    x
}


# internal functions -----------

# stemming for ngrams, internal function
wordstem_ngrams <- function(x, concatenator, language) {
    # split every ngram into its component words at the concatenator
    pieces <- stri_split_fixed(x, concatenator)
    # stem the words of each ngram separately
    stemmed <- lapply(pieces, function(words) {
        SnowballC::wordStem(words, language = language)
    })
    # join the stemmed words back together with the same concatenator
    joined <- stri_paste_list(stemmed, sep = concatenator)
    unlist(joined, use.names = FALSE)
}