#' N Most Frequent Terms
#'
#' \code{frequent_terms} - Find a list of the n most frequent terms.
#'
#' @param text.var A vector of character strings.
#' @param n The number of rows to print. If integer selects the frequency at
#' the nth row and prints all rows >= that value. If proportional (less than 0)
#' the frequency value for the nth\% row is selected and prints all rows >= that
#' value.
#' @param stopwords A vector of stopwords to exclude.
#' @param min.freq The minimum frequency to print. Note that this argument
#' overides the \code{n} argument.
#' @param min.char The minimum number of characters a word must be (including
#' apostrophes) for inclusion.
#' @param max.char The maximum number of characters a word must be (including
#' apostrophes) for inclusion.
#' @param stem logical. If \code{TRUE} the \code{\link[SnowballC]{wordStem}}
#' is used with \code{language = "porter"} as the default. Note that stopwords
#' will be stemmed as well.
#' @param language The stem language to use (see \code{\link[SnowballC]{wordStem}}).
#' @param strip logical. If \code{TRUE} all values that are not alpha, apostrophe,
#' or spaces are stripped. This regex can be changed via the \code{strip.regex}
#' argument.
#' @param strip.regex A regular expression used for stripping undesired characters.
#' @param alphabetical logical. Should rows be arranged alphabetically by term
#' or frequency.
#' @param \ldots ignored.
#' @return Returns a \code{\link[base]{data.frame}} of terms and frequencies.
#' @importFrom stopwords stopwords
#' @keywords term word frequency
#' @rdname frequent_terms
#' @export
#' @examples
#' \dontrun{
#' x <- presidential_debates_2012[["dialogue"]]
#'
#' frequent_terms(x)
#' frequent_terms(x, min.char = 1)
#' frequent_terms(x, n = 50)
#' frequent_terms(x, n = .02)
#' frequent_terms(x, stem = TRUE)
#' frequent_terms(x, n = 50, stopwords = c(stopwords::stopwords("english"), "said", "well"))
#'
#' plot(frequent_terms(x))
#' plot(frequent_terms(x, n = .02))
#' plot(frequent_terms(x, n = 40))
#' plot(frequent_terms(x, n = 40), as.cloud = TRUE)
#'
#' ## Note `n` can be used in print to change how many rows are returned.
#' ## This output can be reassigned when wrapped in print. This is useful
#' ## reduce computational time on larger data sets.
#' y <- frequent_terms(x, n=10)
#' nrow(y)
#' z <- print(frequent_terms(x, n=100))
#' nrow(z)
#'
#' ## Cumulative Percent Plot
#' plot_cum_percent(frequent_terms(presidential_debates_2012[["dialogue"]]))
#' }
frequent_terms <- function(text.var, n = 20, stopwords = stopwords::stopwords("english"), min.freq = NULL,
min.char = 4, max.char = Inf, stem = FALSE, language = "porter", strip = TRUE,
strip.regex = "[^a-z' ]", alphabetical = FALSE, ...) {
if (is.data.frame(text.var)) stop("`text.var` is a `data.frame`; please pass a vector")
text.var <- stringi::stri_trans_tolower(text.var)
## remove nonascii characters
text.var <- iconv(text.var, "latin1", "ASCII", sub = "")
## regex strip of non-word/space characters
if (isTRUE(strip)) text.var <- gsub(strip.regex, " ", text.var)
y <- unlist(stringi::stri_extract_all_words(text.var))
n.words <- sum(stringi::stri_count_words(text.var), na.rm = TRUE)
## stemming
if (isTRUE(stem)) {
y <- SnowballC::wordStem(y, language = language)
if (! is.null(stopwords)) stopwords <- SnowballC::wordStem(stopwords, language = language)
}
## exclude less than the min character cut-off
y <- y[nchar(y) > min.char - 1]
## exclude more than the max character cut-off
y <- y[nchar(y) < max.char + 1]
## data frame of counts
y <- sort(table(y), TRUE)
## stopword removal
if (!is.null(stopwords)){
y <- y[!names(y) %in% stopwords]
}
out <- data.frame(term = names(y), frequency = c(unlist(y, use.names=FALSE)),
stringsAsFactors = FALSE, row.names=NULL)
if (isTRUE(alphabetical)){
out <- out[order(out[["term"]]), ]
}
if (n < 1) {
n <- round(n * nrow(out), 0)
}
if (n > nrow(out)) {
n <- nrow(out)
}
if (is.null(min.freq)) {
out2 <- out[out[["frequency"]] >= out[["frequency"]][n], ]
} else {
out2 <- out[out[["frequency"]] >= min.freq, ]
n <- nrow(out2)
}
class(out2) <- c('frequent_terms', class(out))
attributes(out2)[["n"]] <- n
attributes(out2)[["full"]] <- out
attributes(out2)[["n.words"]] <- n.words
out2
}
#' N Most Frequent Terms
#'
#' \code{all_words} - Find a list of all terms used.
#' @rdname frequent_terms
#' @export
all_words <- function(text.var, stopwords = NULL, min.char = 0, ...) {
frequent_terms(text.var, stopwords = stopwords, min.char = min.char, min.freq = 1)
}
#' Prints a frequent_terms Object
#'
#' Prints a frequent_terms object.
#'
#' @param x The \code{frequent_terms} object.
#' @param n The number of rows to print. If integer selects the frequency at
#' the nth row and prints all rows >= that value. If proportional (less than 0)
#' the frequency value for the nth\% row is selected and prints all rows >= that
#' value.
#' @param \ldots ignored.
#' @method print frequent_terms
#' @export
print.frequent_terms <- function(x, n, ...){
if (missing(n)) n <- attributes(x)[["n"]]
full <- x <- attributes(x)[["full"]]
if (n < 1) {
n <- round(n * nrow(x), 0)
}
out2 <- x <- x[x[["frequency"]] >= x[["frequency"]][n], ]
#x <- as.data.frame(x, stringsAsFactors = FALSE)
rownames(x) <- 1:nrow(x)
print(as.matrix(x), justify = "left", quote=FALSE)
class(out2) <- c('frequent_terms', class(out2))
attributes(out2)[["n"]] <- n
attributes(out2)[["full"]] <- full
out2
return(invisible(out2))
}
#' Plots a frequent_terms Object
#'
#' Plots a frequent_terms object.
#'
#' @param x The \code{frequent_terms} object.
#' @param n The number of rows to plot. If integer selects the frequency at
#' the nth row and plots all rows >= that value. If proportional (less than 0)
#' the frequency value for the nth\% row is selected and plots all rows >= that
#' value.
#' @param as.cloud logical. If \code{TRUE} a wordcloud will be plotted rather
#' than a bar plot.
#' @param random.order logical. Should the words be place randomly around the
#' cloud or if \code{FALSE} the more frequent words are in the center of the cloud.
#' @param rot.per The precentage of rotated words.
#' @param \ldots Other arguments passed to \code{\link[wordcloud]{wordcloud}}.
#' @method plot frequent_terms
#' @export
plot.frequent_terms <- function(x, n, as.cloud = FALSE, random.order = FALSE,
rot.per = 0, ...){
if (missing(n)) n <- attributes(x)[["n"]]
if (n < 1) {
n <- round(n * nrow(x), 0)
}
x <- attributes(x)[["full"]]
x <- x[x[["frequency"]] >= x[["frequency"]][n], ]
if (isTRUE(as.cloud)) {
wordcloud::wordcloud(x[[1]], x[[2]], random.order = random.order,
rot.per = rot.per, ...)
} else {
x[["term"]] <- factor(x[["term"]], levels=rev(x[["term"]]))
ggplot2::ggplot(x, ggplot2::aes_string(x='term', weight='frequency')) +
ggplot2::geom_bar() +
ggplot2::coord_flip() +
ggplot2::ylab("Count") +
ggplot2::xlab("Terms") +
ggplot2::scale_y_continuous(expand = c(0, 0),
labels = function(x) format(x, big.mark = ",", scientific = FALSE, trim = TRUE),
limits = c(0, 1.01 * x[1, "frequency"])) +
ggplot2::theme_bw() +
ggplot2::theme(
panel.grid.major.y = ggplot2::element_blank(),
#legend.position="bottom",
legend.title = ggplot2::element_blank(),
panel.border = ggplot2::element_blank(),
axis.line = ggplot2::element_line(color="grey70")
)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.