R/wordstem.R

Defines functions wordstem_ngrams dfm_wordstem.dfm dfm_wordstem.default dfm_wordstem char_wordstem.character char_wordstem.default char_wordstem tokens_wordstem.tokens tokens_wordstem.default tokens_wordstem

Documented in char_wordstem dfm_wordstem tokens_wordstem

#' Stem the terms in an object
#'
#' Apply a stemmer to words.  This is a wrapper to [wordStem][SnowballC::wordStem]
#' designed to allow this function to be called without loading the entire
#' \pkg{SnowballC} package.  [wordStem][SnowballC::wordStem]  uses Martin Porter's
#' stemming algorithm and the C libstemmer library generated by Snowball.
#' @param x a character, tokens, or dfm object whose word stems are to be
#'   removed.  If tokenized texts, the tokenization must be word-based.
#' @param language the name of a recognized language, as returned by
#'   [getStemLanguages][SnowballC::getStemLanguages], or a two- or three-letter ISO-639 code
#'   corresponding to one of these languages (see references for the list of
#'   codes)
#' @seealso [wordStem][SnowballC::wordStem]
#'
#' @references <http://snowball.tartarus.org/>
#'
#'   <http://www.iso.org/iso/home/standards/language_codes.htm> for the
#'   ISO-639 language codes
#' @export
#' @return `tokens_wordstem` returns a [tokens] object whose word
#'   types have been stemmed.
#' @examples
#' # example applied to tokens
#' txt <- c(one = "eating eater eaters eats ate",
#'          two = "taxing taxes taxed my tax return")
#' th <- tokens(txt)
#' tokens_wordstem(th)
#'
tokens_wordstem <- function(x, language = quanteda_options("language_stemmer")) {
    UseMethod("tokens_wordstem")
}

#' @export
tokens_wordstem.default <- function(x, language = quanteda_options("language_stemmer")) {
    check_class(class(x), "tokens_wordstem")
}

#' @importFrom stringi stri_split_fixed stri_paste_list
#' @export
tokens_wordstem.tokens <- function(x, language = quanteda_options("language_stemmer")) {
    x <- as.tokens(x)
    attrs <- attributes(x)
    if (identical(field_object(attrs, "ngram"), 1L)) {
        types(x) <- char_wordstem(types(x), language = language, check_whitespace = FALSE)
    } else {
        types(x) <- wordstem_ngrams(
            types(x),
            concatenator = field_object(attrs, "concatenator"),
            language = language
            )
    }
    tokens_recompile(x)
}


#' @rdname tokens_wordstem
#' @param check_whitespace logical; if `TRUE`, stop with a warning when trying
#'   to stem inputs containing whitespace
#' @export
#' @return `char_wordstem` returns a [character] object whose word
#'   types have been stemmed.
#' @examples
#' # simple example
#' char_wordstem(c("win", "winning", "wins", "won", "winner"))
#'
char_wordstem <- function(x, language = quanteda_options("language_stemmer"),
                          check_whitespace = TRUE) {
    UseMethod("char_wordstem")
}

#' @export
char_wordstem.default <- function(x, language = quanteda_options("language_stemmer"),
                                  check_whitespace = TRUE) {
    check_class(class(x), "char_wordstem")
}

#' @importFrom stringi stri_detect_regex
#' @export
char_wordstem.character <- function(x, language = quanteda_options("language_stemmer"),
                                    check_whitespace = TRUE) {
    if (check_whitespace && any(stri_detect_regex(x, "^\\P{Z}+\\p{Z}+") & !is.na(x))) {
        stop("whitespace detected: you can only stem tokenized texts")
    }
    result <- SnowballC::wordStem(x, language)
    result[which(is.na(x))] <- NA
    result
}


#' @rdname tokens_wordstem
#' @return `dfm_wordstem` returns a [dfm] object whose word
#'   types (features) have been stemmed, and recombined to consolidate features made
#'   equivalent because of stemming.
#' @examples
#' # example applied to a dfm
#' (origdfm <- dfm(tokens(txt)))
#' dfm_wordstem(origdfm)
#'
#' @export
dfm_wordstem <- function(x, language = quanteda_options("language_stemmer")) {
    UseMethod("dfm_wordstem")
}

#' @export
dfm_wordstem.default <- function(x, language = quanteda_options("language_stemmer")) {
    check_class(class(x), "dfm_wordstem")
}

#' @noRd
#' @export
dfm_wordstem.dfm <- function(x, language = quanteda_options("language_stemmer")) {
    x <- as.dfm(x)
    attrs <- attributes(x)
    if (identical(field_object(attrs, "ngram"), 1L)) {
        set_dfm_featnames(x) <- char_wordstem(featnames(x), language = language, check_whitespace = FALSE)
    } else {
        set_dfm_featnames(x) <- wordstem_ngrams(
            featnames(x),
            field_object(attrs, "concatenator"),
            language
        )
    }
    dfm_compress(x, margin = "features")
}


# internal functions -----------

# stemming for ngrams, internal function
wordstem_ngrams <- function(x, concatenator, language) {
    temp <- lapply(stri_split_fixed(x, concatenator),
                   SnowballC::wordStem, language = language)
    temp <- stri_paste_list(temp, sep = concatenator)
    unlist(temp, use.names = FALSE)
}

Try the quanteda package in your browser

Any scripts or data that you put into this service are public.

quanteda documentation built on May 31, 2023, 8:28 p.m.