#' Produce a frequency list of ngrams.
#'
#' Produces a frequency list of ngrams and returns a data frame. Users specify
#' the number of words in the ngrams, the order of the frequency list as either
#' alphabetical or by frequency, and whether the list be in ascending or
#' descending order, among other options.
#'
#' @param text The text with the ngrams whose frequencies are to be determined,
#' as either a character vector or something coercible to it, such as a list
#' of character vectors.
#' @param num_wd Specifies the number of words in the ngrams, whether single words
#' (\code{num_wd = 1}, the default), bigrams (\code{num_wd = 2}), trigrams
#' (\code{num_wd = 3}), or even larger ngrams.
#' @param ignore_case Specifies whether the frequency list be
#' case-insensitive (\code{ignore_case = TRUE}, the default) or case-sensitive
#' (\code{ignore_case = FALSE}). If case-insensitive, the ngrams are converted
#' to upper-case.
#' @param order_by Specifies whether the frequency list be ordered alphabetically
#' (\code{order_by = "alpha"}) or by frequency (\code{order_by = "freq"}).
#' @param descending Specifies whether the frequency list be ordered in
#' ascending order (\code{descending = FALSE}, the default) or descending
#' order (\code{descending = TRUE}).
#' @param min_freq Specifies the minimum frequency that an ngram must have
#' in order to be included in the frequency list. With
#' \code{min_freq = 1} (the default), all ngrams are included.
#' @param word_char If \code{word_char = NULL} (the default), the user's locale
#' is used to distinguish word characters (e.g., "abc") from non-word
#' characters (e.g., ".?!"). A user's locale can be determined with
#' \code{sessionInfo()}. If words are split that shouldn't be, users can
#' give a character class to specify word characters. For example,
#' \code{word_char = "[-'a-z]+"} specifies that a combination of one or
#' more contiguous dashes, apostrophes or letters "a" to "z" be considered
#' as words, and as a result, the sentence "It's a hard-knock life,
#' for us!" has six words rather than more.
#'
#' @return A two-column local data frame, the first column with the
#' ngrams and the second column with the frequencies.
#'
#' @seealso For more info about local data frames, see
#' \url{https://cran.r-project.org/web/packages/dplyr/vignettes/data_frames.html}.
#'
#' @examples
#' text <- c("First sentence here. Short, but sweet.")
#' text <- c(text, "Second one here, maybe?")
#' text <- c(text, "Third short paragraph here!")
#' text <- c(text, "Here too; with another thought.", "Here too.")
#'
#' ngram_freq(text)
#' ngram_freq(text, ignore_case = FALSE)
#' ngram_freq(text, order_by = "freq", descending = TRUE)
#' ngram_freq(text, order_by = "freq", descending = TRUE, min_freq = 2)
#'
#' # view difference (if any, given your locale)
#' ngram_freq("It's a hard-knock life, for us!")
#' ngram_freq("It's a hard-knock life, for us!", word_char = "[-'a-z]+")
#'
#' # gets bigram frequencies
#' ngram_freq(text, num_wd = 2)
#' ngram_freq(text, num_wd = 2, order_by = "freq", descending = TRUE)
#'
#' # gets trigram frequencies
#' ngram_freq(text, num_wd = 3)
#'
#' @export
ngram_freq <- function(text,
                       num_wd = 1,
                       ignore_case = TRUE,
                       order_by = "alpha",
                       descending = FALSE,
                       min_freq = 1,
                       word_char = NULL) {
  # S3 generic: dispatch on the class of `text`. The methods defined in this
  # file are ngram_freq.list and ngram_freq.default.
  UseMethod("ngram_freq", text)
}
#' @export
ngram_freq.list <- function(text,
                            num_wd = 1,
                            ignore_case = TRUE,
                            order_by = "alpha",
                            descending = FALSE,
                            min_freq = 1,
                            word_char = NULL) {
  # Flatten the list into a single vector, then hand over to the next method
  # in the dispatch chain. NextMethod() passes the arguments on as promises
  # evaluated in this frame, so the next method sees the flattened `text`.
  text <- unlist(text, recursive = TRUE, use.names = TRUE)
  NextMethod(generic = "ngram_freq")
}
#' Method used in ngram_freq.default
#'
#' Shifts a vector forward by \code{n} positions, so that element \code{i} of
#' the result is element \code{i + n} of the input. Positions that would run
#' past the end of the input are filled with \code{NA}, and the result always
#' has the same length as \code{x}.
#'
#' @param x input character vector
#' @param n the number of words to lead by; a single non-negative whole number
#'
#' @return A vector of the same length as \code{x}, shifted forward by
#' \code{n} and padded at the end with \code{NA}.
#'
#' @keywords internal
#' @export
move_ahead <- function(x, n) {
  n <- as.integer(n)
  stopifnot(length(n) == 1L, !is.na(n), n >= 0L)
  # Pad and then index, rather than dropping with x[-seq_len(n)]: a negative
  # index of length zero selects nothing, so n = 0 would return an empty
  # vector, and n > length(x) would return n NAs instead of length(x) NAs.
  padded <- c(x, rep(NA, n))
  padded[seq_along(x) + n]
}
#' @export
ngram_freq.default <- function(text,
                               num_wd = 1,
                               ignore_case = TRUE,
                               order_by = "alpha",
                               descending = FALSE,
                               min_freq = 1,
                               word_char = NULL) {
  # Default method: extracts the words of each element of `text`, joins every
  # run of `num_wd` consecutive words (within one element) into an ngram,
  # counts the ngrams, and returns a data frame with columns `ngram` and `n`,
  # ordered as requested.

  # Validate arguments before doing any work, so a bad value fails fast with
  # a clear message rather than midway through.
  if (length(order_by) != 1L || !order_by %in% c("alpha", "freq")) {
    stop("The argument 'order_by' must be either 'alpha' or 'freq'.\n")
  }
  if (length(descending) != 1L || !descending %in% c(TRUE, FALSE)) {
    stop("The argument 'descending' must be either 'TRUE' or 'FALSE'.\n")
  }
  if (!is.numeric(num_wd) || length(num_wd) != 1L || is.na(num_wd) ||
      num_wd < 1) {
    stop("The argument 'num_wd' must be a whole number of at least 1.\n")
  }
  num_wd <- as.integer(num_wd)
  sort_desc <- isTRUE(as.logical(descending))

  # optionally, makes freq list case insensitive
  if (ignore_case) {
    text <- stringr::str_to_upper(text)
  }

  # gets words; "\\w+" is the fallback when no word pattern is supplied
  pattern <- if (is.null(word_char)) "\\w+" else word_char
  wds <- stringr::str_extract_all(
    text,
    stringr::regex(pattern, ignore_case = ignore_case)
  )

  # gets the ngrams of one text element: the words themselves pasted with
  # the words 1, 2, ..., num_wd - 1 positions ahead of them
  build_ngrams <- function(x) {
    x <- x[!is.na(x)]
    # A text with fewer words than num_wd contains no complete ngram.
    if (length(x) < num_wd) {
      return(character(0))
    }
    shifted <- lapply(seq_len(num_wd - 1L), function(i) move_ahead(x, i))
    joined <- do.call(stringr::str_c, c(list(x), shifted, list(sep = " ")))
    # Trailing positions have NA partners and so are not complete ngrams.
    joined[!is.na(joined)]
  }
  ngrams <- unlist(lapply(wds, build_ngrams))
  if (is.null(ngrams)) {
    # unlist() of an empty list is NULL; keep an (empty) ngram column so the
    # counting step below still works.
    ngrams <- character(0)
  }

  df <- data.frame(ngram = ngrams)
  df <- dplyr::count(df, ngram)

  # optionally, applies minimum frequency
  if (min_freq > 1) {
    df <- dplyr::filter(df, n >= min_freq)
  }

  # orders output as specified by user
  if (order_by == "alpha") {
    if (sort_desc) {
      dplyr::arrange(df, dplyr::desc(ngram))
    } else {
      dplyr::arrange(df, ngram)
    }
  } else {
    if (sort_desc) {
      dplyr::arrange(df, dplyr::desc(n))
    } else {
      dplyr::arrange(df, n)
    }
  }
} # end function
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.