R/stem.R

Defines functions stem_words stem_strings stem

Documented in stem_strings stem_words

#' Stem a Vector of Words
#'
#' Stem a vector of words.
#'
#' @param x A vector of words.
#' @param language The name of a recognized language (see
#' \code{\link[SnowballC]{wordStem}}).
#' @param \ldots ignored.
#' @return Returns a vector of stemmed words.
#' @export
#' @seealso \code{\link[textstem]{stem_strings}}
#' @examples
#' x <- c("the", 'doggies', ',', 'well', 'they', "aren\'t", 'Joyfully', 'running', '.')
#' stem_words(x)
stem_words <- function(x, language = "porter", ...) {
    out <- stem(x, language = language)
    out[is.na(x)] <- NA
    out
}


#' Stem a Vector of Strings
#'
#' Stem a vector of strings.
#'
#' @param x A vector of strings.
#' @param language The name of a recognized language (see
#' \code{\link[SnowballC]{wordStem}}).
#' @param \ldots Other arguments passed to \code{\link[textshape]{split_token}}.
#' @return Returns a vector of stemmed strings.
#' @note The stemmer requires splitting the string apart into tokens.  After the
#' stemming occurs the strings are pasted back together.  The strings are not
#' guaranteed to retain exact spacing of the original.
#' @export
#' @seealso \code{\link[textstem]{stem_words}}
#' @examples
#' x <- c(
#'     'the dirtier dog has eaten the pies',
#'     'that shameful pooch is tricky and sneaky',
#'     "He opened and then reopened the food bag",
#'     'There are skies of blue and red roses too!',
#'     NA,
#'     "The doggies, well they aren't joyfully running.",
#'     "The daddies are coming over...",
#'     "This is 34.546 above"
#' )
#' stem_strings(x)
stem_strings <- function(x, language = "porter", ...) {

    na_locs <- is.na(x)

    numbs <- stats::na.omit(unique(unlist(stringi::stri_extract_all_regex(x, numreg))))
    x2 <- textclean::sub_holder(x, numbs)
    tokens <- textshape::split_token(x2[['output']], lower = FALSE, ...)

    locs <- textshape::starts(sapply(tokens, length))[-1]

    stemmed <- textshape::split_index(stem_words(unlist(tokens), language = language), locs)
    stemmed[na_locs] <- x[na_locs]
    stemmed[!na_locs] <- gsub("(\\s+)([.!?,;:])", "\\2",
        unlist(lapply(stemmed[!na_locs], paste, collapse = " ")), perl = TRUE)

    x2$unhold(unlist(stemmed))
}

stem <- function(x, language = "porter") SnowballC::wordStem(x, language)

Try the textstem package in your browser

Any scripts or data that you put into this service are public.

textstem documentation built on May 2, 2019, 6:42 a.m.