# R/stopwords.R

#' Remove Stopwords
#' 
#' Transcript apply: removes stopwords from the supplied text.
#' 
#' @param textString A character string of text or a vector of character strings.
#' @param stopwords A character vector of words to remove from the text.  qdap 
#' has a number of data sets that can be used as stopwords including: 
#' Top200Words, Top100Words, Top25Words.  For the tm package's traditional 
#' English stop words use \code{tm::stopwords("english")}
#' @param unlist logical.  If TRUE unlists into one vector.  General use 
#' intended for when separate is FALSE.
#' @param separate logical.  If TRUE separates sentences into words. If FALSE 
#' retains sentences.
#' @param strip logical.  If TRUE strips the text of all punctuation except 
#' apostrophes.
#' @param unique logical.  If TRUE keeps only unique words (if unlist is TRUE) 
#' or sentences (if unlist is FALSE).  General use intended for when unlist is 
#' TRUE.
#' @param char.keep If strip is TRUE this argument provides a means of retaining 
#' supplied character(s).
#' @param names logical.  If TRUE will name the elements of the vector or list 
#' with the original textString.
#' @param ignore.case logical.  If TRUE stop words will be removed regardless of 
#' case.  Additionally, case will be stripped from the text.  If FALSE stopwords 
#' removal is contingent upon case.  Additionally, case is not stripped.
#' @param apostrophe.remove logical.  If TRUE removes apostrophes from the 
#' output.
#' @param \ldots further arguments passed to strip function
#' @return Returns a vector of sentences, vector of words, or (default) a list 
#' of vectors of words with stop words removed.  Output depends on supplied 
#' arguments.
#' @seealso \code{\link[qdap]{strip}}, 
#' \code{\link[qdap]{bag.o.words}},
#' \code{\link[tm]{stopwords}}
#' @keywords stopwords
#' @export
#' @examples
#' stopwords(DATA$state)
#' stopwords(DATA$state, tm::stopwords("english"))
#' stopwords(DATA$state, Top200Words)
#' stopwords(DATA$state, Top200Words, strip = TRUE)
#' stopwords(DATA$state, Top200Words, separate = FALSE)
#' stopwords(DATA$state, Top200Words, separate = FALSE, ignore.case = FALSE)
#' stopwords(DATA$state, Top200Words, unlist = TRUE)
#' stopwords(DATA$state, Top200Words, unlist = TRUE, strip=TRUE)
#' stopwords(DATA$state, Top200Words, unlist = TRUE, unique = TRUE)
stopwords <-
function (textString, stopwords = Top25Words, unlist = FALSE, separate = TRUE, 
    strip = FALSE, unique = FALSE, char.keep = NULL, names = FALSE, 
    ignore.case = TRUE, apostrophe.remove = FALSE, ...) {
    # Removes the words in `stopwords` from each element of `textString`.
    # Returns a list of word vectors by default; a flat word vector when
    # `unlist` is TRUE; re-pasted sentences when `separate` is FALSE.

    # Keep the untouched input: `names = TRUE` must label the output with the
    # text the caller supplied, not with the version altered by `strip`.
    labels <- as.character(textString)

    # A NULL stop list falls back to a single blank string.  The stop list is
    # normalised once here rather than once per element of `textString`.
    stop.set <- Trim(if (is.null(stopwords)) " " else stopwords)
    if (ignore.case) {
        stop.set <- tolower(stop.set)
    }

    # Splits one text element on whitespace and in front of the listed
    # punctuation marks, drops the stop words, then drops blank tokens.
    tokenize <- function(txt) {
        txt <- Trim(txt)
        if (ignore.case) {
            txt <- tolower(txt)
        }
        tokens <- base::unlist(strsplit(txt, 
            "[[:space:]]|(?=[!#$%&,-./:;?@_])", perl = TRUE))
        unblanker(tokens[!tokens %in% stop.set])
    }

    if (strip) {
        textString <- qdap::strip(textString, char.keep = char.keep, 
            apostrophe.remove = apostrophe.remove, ...)
    }
    x <- lapply(textString, tokenize)

    # Defensive: should stripping have changed the number of elements, fall
    # back to labelling with the text that was actually tokenized.
    if (length(labels) != length(x)) {
        labels <- as.character(textString)
    }

    if (unlist) {
        # Each word keeps the label of the text element it came from, so the
        # labels stay the same length as the flattened output.
        labels <- rep(labels, vapply(x, length, integer(1)))
        x <- base::unlist(x, use.names = FALSE)
    }
    if (unique) {
        # Same result as unique(x), but the labels are filtered in step.
        keep <- !duplicated(x)
        labels <- labels[keep]
        x <- x[keep]
    }
    if (!separate) {
        # vapply keeps the result a character vector even for empty input.
        x <- vapply(x, paste, character(1), collapse = " ", USE.NAMES = FALSE)
        # Re-attach punctuation that tokenizing split away from its word.
        x <- mgsub(c(" .", " ?", " ,", " !"), c(".", "?", ",", "!"), x)
    }

    # Names are set last so no earlier step can drop or misalign them.
    if (names && length(x) > 0) {
        names(x) <- labels
    } else {
        names(x) <- NULL
    }
    x
}
# trinker/qdap2 documentation built on May 31, 2019, 9:47 p.m.